refactor: replace Phase 2 label matching with PDF text-stream caption pairing

- Extract captions from PDF text dict instead of DocLayout caption boxes
- Use _CaptionBlock dataclass to carry authoritative ID, kind, text, bbox
- Pair captions to content boxes with directional preference (figure below, table above)
- Filter out uncaptioned boxes (Algorithm pseudo-code, unnumbered appendix tables, false positives)
- Remove label_images_by_summary and Phase 2 rename pipeline entirely
- Update tests to cover text-based caption pairing and filtering
2026-06-15 01:09:29 +08:00
parent 29fb20828e
commit 1ccac1f29a
3 changed files with 236 additions and 401 deletions
@@ -201,20 +201,15 @@ def _cleanup_old_images(db: Session, paper: Paper) -> None:
 def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
    """从 PDF 提取图片和表格（失败不影响总结）。

-    两阶段流水线：
-    1. DocLayout-YOLO 检测 + 渲染截图（通用标签）
-    2. 用 summary 的 figures ID 在 PDF 中搜索定位 → 重命名
+    DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本定位 caption → 只渲染
+    配到 Figure/Table 标题的（Algorithm、无编号附录表、误检碎片一律过滤）。
+    标题来源已切换为 PDF 文本，schema.figures 不再参与命名，参数保留备用。
    """
    try:
-        from app.services.pdf_image_extractor import (
-            extract_images_from_pdf,
-            label_images_by_summary,
-        )
+        from app.services.pdf_image_extractor import extract_images_from_pdf

        pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
        extract_images_from_pdf(arxiv_id, pdf_path)
-        if schema.figures:
-            label_images_by_summary(arxiv_id, schema.figures, pdf_path)
    except Exception:
        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)