feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -4,7 +4,6 @@ from __future__ import annotations

 import json
 import logging
-import re
 from datetime import date, timedelta

 from fastapi import APIRouter, Depends, HTTPException, Query, Request
@@ -15,6 +14,7 @@ from sqlalchemy.orm import Session, joinedload
 from app.config import settings
 from app.database import get_db
 from app.models import PAPER_FULL_LOAD, Paper
+from app.services.pdf_image_extractor import link_figures_with_images
 from app.utils import (
    PAPERS_DIR,
    safe_json_loads,
@@ -120,7 +120,7 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
        paper.summary.figures_json if paper.summary else None, default=[]
    )

-    linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
+    linked_figures = link_figures_with_images(figures_raw, images, arxiv_id)

    # 拆分图片到对应展示区域：
    #   table_figures   → 实验结果区域（Table 截图，不变）
@@ -279,100 +279,3 @@ def _get_paper_images(arxiv_id: str) -> list[dict]:
                }
            )
    return images
-
-
-def _link_figures_with_images(
-    figures: list[dict], images: list[dict], arxiv_id: str
-) -> list[dict]:
-    """将 summary figures 元数据与提取的图片文件关联。
-
-    策略：
-    1. 优先用 manifest.json 的 label 做 ID 精确匹配
-    2. 未匹配的 figure 用序号兜底：第 N 个 Figure → 第 N 张提取图
-    """
-    if not figures or not images:
-        return figures
-
-    manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
-
-    # ── 策略 1：manifest ID 精确匹配 ──
-    id_to_url: dict[str, str] = {}
-    if manifest_path.exists():
-        try:
-            manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
-        except (ValueError, TypeError):
-            manifest = {}
-        for filename, info in manifest.items():
-            url = f"/papers/{arxiv_id}/images/{filename}"
-            # 优先用 label 字段（新格式）
-            label = info.get("label", "")
-            if label:
-                id_to_url[label] = url
-            # 也兼容 figures/tables 列表（旧格式）
-            for fig_id in info.get("figures", []) + info.get("tables", []):
-                if fig_id not in id_to_url:
-                    id_to_url[fig_id] = url
-
-    for fig in figures:
-        raw_id = fig.get("id", "")
-        normalized = _normalize_figure_id(raw_id)
-        if normalized in id_to_url:
-            fig["image_url"] = id_to_url[normalized]
-
-    # ── 策略 2：序号兜底（manifest 匹配不到时） ──
-    unmatched = [f for f in figures if not f.get("image_url")]
-    if not unmatched:
-        return figures
-
-    # 按类型分流：Figure vs Table
-    fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
-    table_type_unmatched = [
-        f for f in unmatched if not _is_figure_type(f.get("id", ""))
-    ]
-
-    # 提取的图片按类型分流，按文件名中的编号排序
-    def _sort_key(name: str) -> tuple[int, int]:
-        # 新格式：figure_1.jpg, table_1.jpg
-        m = re.search(r"(?:figure|table)_(\d+)", name)
-        if m:
-            return (0, int(m.group(1)))
-        # 旧格式：page2_img1.png, page5_table1.png, figure_1.png
-        m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
-        if m2:
-            return (int(m2.group(1)), int(m2.group(2)))
-        return (0, 0)
-
-    fig_images = sorted(
-        [img for img in images if "table" not in img["name"].lower()],
-        key=lambda img: _sort_key(img["name"]),
-    )
-    table_images = sorted(
-        [img for img in images if "table" in img["name"].lower()],
-        key=lambda img: _sort_key(img["name"]),
-    )
-
-    for i, fig in enumerate(fig_type_unmatched):
-        if i < len(fig_images):
-            fig["image_url"] = fig_images[i]["url"]
-
-    for i, fig in enumerate(table_type_unmatched):
-        if i < len(table_images):
-            fig["image_url"] = table_images[i]["url"]
-
-    return figures
-
-
-def _normalize_figure_id(raw_id: str) -> str:
-    """归一化 Figure/Table ID：'Figure 1'/'Fig.1' → 'Figure 1'。"""
-    m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
-    if m:
-        return f"Figure {m.group(1)}"
-    m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
-    if m2:
-        return f"Table {m2.group(1)}"
-    return raw_id
-
-
-def _is_figure_type(fig_id: str) -> bool:
-    """判断是否为 Figure 类型（非 Table）。"""
-    return not re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)