feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules - Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection - Add layout_detector service for PicoDet-S_layout_3cls integration - Add exceptions module with ConflictError and NotFoundError - Improve admin dashboard with better statistics and task management - Add design review document with system optimization suggestions - Add new tests for crawler, pdf_downloader, pipeline, and summary_utils - Update dependencies and configuration - Clean up dead code and improve error handling
This commit is contained in:
+2
-99
@@ -4,7 +4,6 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import date, timedelta
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, Request
|
||||
@@ -15,6 +14,7 @@ from sqlalchemy.orm import Session, joinedload
|
||||
from app.config import settings
|
||||
from app.database import get_db
|
||||
from app.models import PAPER_FULL_LOAD, Paper
|
||||
from app.services.pdf_image_extractor import link_figures_with_images
|
||||
from app.utils import (
|
||||
PAPERS_DIR,
|
||||
safe_json_loads,
|
||||
@@ -120,7 +120,7 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
|
||||
paper.summary.figures_json if paper.summary else None, default=[]
|
||||
)
|
||||
|
||||
linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
|
||||
linked_figures = link_figures_with_images(figures_raw, images, arxiv_id)
|
||||
|
||||
# 拆分图片到对应展示区域:
|
||||
# table_figures → 实验结果区域(Table 截图,不变)
|
||||
@@ -279,100 +279,3 @@ def _get_paper_images(arxiv_id: str) -> list[dict]:
|
||||
}
|
||||
)
|
||||
return images
|
||||
|
||||
|
||||
def _link_figures_with_images(
|
||||
figures: list[dict], images: list[dict], arxiv_id: str
|
||||
) -> list[dict]:
|
||||
"""将 summary figures 元数据与提取的图片文件关联。
|
||||
|
||||
策略:
|
||||
1. 优先用 manifest.json 的 label 做 ID 精确匹配
|
||||
2. 未匹配的 figure 用序号兜底:第 N 个 Figure → 第 N 张提取图
|
||||
"""
|
||||
if not figures or not images:
|
||||
return figures
|
||||
|
||||
manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
|
||||
|
||||
# ── 策略 1:manifest ID 精确匹配 ──
|
||||
id_to_url: dict[str, str] = {}
|
||||
if manifest_path.exists():
|
||||
try:
|
||||
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
|
||||
except (ValueError, TypeError):
|
||||
manifest = {}
|
||||
for filename, info in manifest.items():
|
||||
url = f"/papers/{arxiv_id}/images/{filename}"
|
||||
# 优先用 label 字段(新格式)
|
||||
label = info.get("label", "")
|
||||
if label:
|
||||
id_to_url[label] = url
|
||||
# 也兼容 figures/tables 列表(旧格式)
|
||||
for fig_id in info.get("figures", []) + info.get("tables", []):
|
||||
if fig_id not in id_to_url:
|
||||
id_to_url[fig_id] = url
|
||||
|
||||
for fig in figures:
|
||||
raw_id = fig.get("id", "")
|
||||
normalized = _normalize_figure_id(raw_id)
|
||||
if normalized in id_to_url:
|
||||
fig["image_url"] = id_to_url[normalized]
|
||||
|
||||
# ── 策略 2:序号兜底(manifest 匹配不到时) ──
|
||||
unmatched = [f for f in figures if not f.get("image_url")]
|
||||
if not unmatched:
|
||||
return figures
|
||||
|
||||
# 按类型分流:Figure vs Table
|
||||
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
|
||||
table_type_unmatched = [
|
||||
f for f in unmatched if not _is_figure_type(f.get("id", ""))
|
||||
]
|
||||
|
||||
# 提取的图片按类型分流,按文件名中的编号排序
|
||||
def _sort_key(name: str) -> tuple[int, int]:
|
||||
# 新格式:figure_1.jpg, table_1.jpg
|
||||
m = re.search(r"(?:figure|table)_(\d+)", name)
|
||||
if m:
|
||||
return (0, int(m.group(1)))
|
||||
# 旧格式:page2_img1.png, page5_table1.png, figure_1.png
|
||||
m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
|
||||
if m2:
|
||||
return (int(m2.group(1)), int(m2.group(2)))
|
||||
return (0, 0)
|
||||
|
||||
fig_images = sorted(
|
||||
[img for img in images if "table" not in img["name"].lower()],
|
||||
key=lambda img: _sort_key(img["name"]),
|
||||
)
|
||||
table_images = sorted(
|
||||
[img for img in images if "table" in img["name"].lower()],
|
||||
key=lambda img: _sort_key(img["name"]),
|
||||
)
|
||||
|
||||
for i, fig in enumerate(fig_type_unmatched):
|
||||
if i < len(fig_images):
|
||||
fig["image_url"] = fig_images[i]["url"]
|
||||
|
||||
for i, fig in enumerate(table_type_unmatched):
|
||||
if i < len(table_images):
|
||||
fig["image_url"] = table_images[i]["url"]
|
||||
|
||||
return figures
|
||||
|
||||
|
||||
def _normalize_figure_id(raw_id: str) -> str:
|
||||
"""归一化 Figure/Table ID:'Figure 1'/'Fig.1' → 'Figure 1'。"""
|
||||
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
|
||||
if m:
|
||||
return f"Figure {m.group(1)}"
|
||||
m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
|
||||
if m2:
|
||||
return f"Table {m2.group(1)}"
|
||||
return raw_id
|
||||
|
||||
|
||||
def _is_figure_type(fig_id: str) -> bool:
|
||||
"""判断是否为 Figure 类型(非 Table)。"""
|
||||
return not re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
|
||||
|
||||
Reference in New Issue
Block a user