refactor: replace Phase 2 label matching with PDF text-stream caption pairing

- Extract captions from PDF text dict instead of DocLayout caption boxes
- Use _CaptionBlock dataclass to carry authoritative ID, kind, text, bbox
- Pair captions to content boxes with directional preference (figure below, table above)
- Filter out uncaptioned boxes (Algorithm pseudo-code, unnumbered appendix tables, false positives)
- Remove label_images_by_summary and Phase 2 rename pipeline entirely
- Update tests to cover text-based caption pairing and filtering
2026-06-15 01:09:29 +08:00
parent 29fb20828e
commit 1ccac1f29a
3 changed files with 236 additions and 401 deletions
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import json
 from unittest.mock import MagicMock

 import pymupdf
@@ -9,7 +8,17 @@ from app.services import pdf_image_extractor as mod
 from app.services.layout_detector import LayoutBox


-def test_process_page_extracts_doclayout_caption(tmp_path):
+def _caption_block(bbox, text):
+    """Build a text block in the style of page.get_text("dict") output."""
+    return {
+        "type": 0,
+        "bbox": list(bbox),
+        "lines": [{"spans": [{"text": text}]}],
+    }
+
+
+def test_process_page_pairs_caption_from_text(tmp_path):
+    """caption 来自 PDF 文本流（figure 标题在内容下方），用其 ID 直接命名。"""
    images_dest = tmp_path / "images"
    images_dest.mkdir()
    manifest: dict[str, dict] = {}
@@ -19,16 +28,18 @@ def test_process_page_extracts_doclayout_caption(tmp_path):

    page = MagicMock()
    page.rect.width = 600
+    page.rect.height = 800
    page.get_pixmap.return_value = pix
-    page.get_text.return_value = "Figure 1: Overall architecture.\n"
+    page.get_text.return_value = {
+        "blocks": [
+            _caption_block((95, 310, 320, 325), "Figure 1: Overall architecture.")
+        ]
+    }

    doc = MagicMock()
    doc.__getitem__.return_value = page

-    boxes = [
-        LayoutBox(100, 100, 300, 300, "picture"),
-        LayoutBox(95, 310, 320, 325, "figure_caption"),
-    ]
+    boxes = [LayoutBox(100, 100, 300, 300, "picture")]

    extracted = mod._process_page(
        doc,
@@ -41,14 +52,15 @@ def test_process_page_extracts_doclayout_caption(tmp_path):
    )

    assert extracted == 1
-    info = manifest["figure_(p1-1).jpg"]
+    # caption 自带 ID → 直接命名 figure_1.jpg
+    info = manifest["figure_1.jpg"]
+    assert info["label"] == "Figure 1"
    assert info["caption_text"] == "Figure 1: Overall architecture."
-    assert info["caption_source"] == "doclayout"
-    assert info["caption_box"] == [95.0, 310.0, 320.0, 325.0]
+    assert info["caption_source"] == "text"


 def test_process_page_includes_caption_in_render(tmp_path):
-    """渲染时把 caption 区域合并进同一张截图。"""
+    """渲染时把 caption 文本块区域合并进同一张截图。"""
    images_dest = tmp_path / "images"
    images_dest.mkdir()
    manifest: dict[str, dict] = {}
@@ -58,16 +70,16 @@ def test_process_page_includes_caption_in_render(tmp_path):

    page = MagicMock()
    page.rect.width = 600
+    page.rect.height = 800
    page.get_pixmap.return_value = pix
-    page.get_text.return_value = "Figure 1: Caption text.\n"
+    page.get_text.return_value = {
+        "blocks": [_caption_block((95, 310, 320, 325), "Figure 1: Caption text.")]
+    }

    doc = MagicMock()
    doc.__getitem__.return_value = page

-    boxes = [
-        LayoutBox(100, 100, 300, 300, "picture"),
-        LayoutBox(95, 310, 320, 325, "figure_caption"),
-    ]
+    boxes = [LayoutBox(100, 100, 300, 300, "picture")]

    mod._process_page(
        doc,
@@ -85,50 +97,74 @@ def test_process_page_includes_caption_in_render(tmp_path):
    assert clip == pymupdf.Rect(90, 95, 325, 330)


-def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch):
-    arxiv_id = "2401.00001"
-    paper_root = tmp_path / arxiv_id
-    images_dest = paper_root / "images"
-    images_dest.mkdir(parents=True)
-    (images_dest / "figure_(p1-1).jpg").write_bytes(b"jpeg")
-    (images_dest / "manifest.json").write_text(
-        json.dumps(
-            {
-                "figure_(p1-1).jpg": {
-                    "page": 1,
-                    "type": "figure",
-                    "label": "Figure (p1-1)",
-                    "box": [100, 100, 300, 300],
-                    "caption_text": "Figure 1: PDF original caption.",
-                    "caption_source": "doclayout",
-                }
-            }
-        )
-    )
+def test_process_page_table_caption_above(tmp_path):
+    """table 标题惯例在内容上方，配对后命名 table_N.jpg。"""
+    images_dest = tmp_path / "images"
+    images_dest.mkdir()
+    manifest: dict[str, dict] = {}

-    pdf_path = tmp_path / "paper.pdf"
-    pdf_path.write_bytes(b"%PDF")
-    monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)
+    pix = MagicMock()
+    pix.tobytes.return_value = b"jpeg"

    page = MagicMock()
-    page.search_for.return_value = [pymupdf.Rect(120, 305, 180, 320)]
+    page.rect.width = 600
+    page.rect.height = 800
+    page.get_pixmap.return_value = pix
+    # caption 在内容上方 [80, 90, 320, 105]，内容表格 [80, 120, 320, 280]
+    page.get_text.return_value = {
+        "blocks": [_caption_block((80, 90, 320, 105), "Table 2 | Results summary.")]
+    }

-    fake_doc = MagicMock()
-    fake_doc.page_count = 1
-    fake_doc.__getitem__.return_value = page
-    fake_doc.__enter__.return_value = fake_doc
-    fake_doc.__exit__.return_value = False
-    monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)
+    doc = MagicMock()
+    doc.__getitem__.return_value = page

-    labeled = mod.label_images_by_summary(
-        arxiv_id,
-        [{"id": "Figure 1", "caption": "Summary caption."}],
-        pdf_path=pdf_path,
+    boxes = [LayoutBox(80, 120, 320, 280, "table")]
+
+    extracted = mod._process_page(
+        doc,
+        0,
+        boxes,
+        images_dest=images_dest,
+        manifest=manifest,
+        seen_labels=set(),
+        arxiv_id="2401.00001",
    )

-    assert labeled == 1
-    manifest = json.loads((images_dest / "manifest.json").read_text())
-    info = manifest["figure_1.jpg"]
-    assert info["caption_text"] == "Figure 1: PDF original caption."
-    assert info["caption_source"] == "doclayout"
-    assert info["summary_caption_text"] == "Summary caption."
+    assert extracted == 1
+    info = manifest["table_2.jpg"]
+    assert info["label"] == "Table 2"
+    assert info["caption_source"] == "text"
+
+
+def test_process_page_filters_uncaptioned(tmp_path):
+    """Boxes with no paired Figure/Table caption (Algorithm, unnumbered tables, etc.) are filtered out and not emitted."""
+    images_dest = tmp_path / "images"
+    images_dest.mkdir()
+    manifest: dict[str, dict] = {}
+
+    pix = MagicMock()
+    pix.tobytes.return_value = b"jpeg"
+
+    page = MagicMock()
+    page.rect.width = 600
+    page.rect.height = 800
+    page.get_pixmap.return_value = pix
+    page.get_text.return_value = {"blocks": []}  # no caption text blocks at all
+
+    doc = MagicMock()
+    doc.__getitem__.return_value = page
+
+    boxes = [LayoutBox(100, 100, 300, 300, "picture")]
+
+    extracted = mod._process_page(
+        doc,
+        0,
+        boxes,
+        images_dest=images_dest,
+        manifest=manifest,
+        seen_labels=set(),
+        arxiv_id="2401.00001",
+    )
+
+    assert extracted == 0
+    assert manifest == {}