feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock
+
+import pymupdf
+
+from app.services import pdf_image_extractor as mod
+from app.services.layout_detector import LayoutBox
+
+
+def test_process_page_extracts_doclayout_caption(tmp_path):
+    """Check that `_process_page` stores the caption found next to a picture.
+
+    A mocked page carries one "picture" box and one "figure_caption" box,
+    and its text lookup returns the caption with a trailing newline. The
+    manifest entry is expected to hold the caption without that newline,
+    the source tag "doclayout", and the caption box as floats.
+    """
+    out_dir = tmp_path / "images"
+    out_dir.mkdir()
+
+    fake_pixmap = MagicMock()
+    fake_pixmap.tobytes.return_value = b"jpeg"
+
+    fake_page = MagicMock()
+    fake_page.rect.width = 600
+    fake_page.get_pixmap.return_value = fake_pixmap
+    fake_page.get_text.return_value = "Figure 1: Overall architecture.\n"
+
+    fake_doc = MagicMock()
+    fake_doc.__getitem__.return_value = fake_page
+
+    picture_box = LayoutBox(100, 100, 300, 300, "picture")
+    caption_box = LayoutBox(95, 310, 320, 325, "figure_caption")
+    entries: dict[str, dict] = {}
+
+    count = mod._process_page(
+        fake_doc,
+        0,
+        [picture_box, caption_box],
+        images_dest=out_dir,
+        manifest=entries,
+        seen_labels=set(),
+        arxiv_id="2401.00001",
+    )
+
+    assert count == 1
+    record = entries["figure_(p1-1).jpg"]
+    expected_fields = {
+        "caption_text": "Figure 1: Overall architecture.",
+        "caption_source": "doclayout",
+        "caption_box": [95.0, 310.0, 320.0, 325.0],
+    }
+    # Checked in the same order as the fields are listed above.
+    for field, wanted in expected_fields.items():
+        assert record[field] == wanted
+
+
+def test_process_page_includes_caption_in_render(tmp_path):
+    """The caption region is merged into the same rendered screenshot."""
+    out_dir = tmp_path / "images"
+    out_dir.mkdir()
+
+    fake_pixmap = MagicMock()
+    fake_pixmap.tobytes.return_value = b"jpeg"
+
+    fake_page = MagicMock()
+    fake_page.rect.width = 600
+    fake_page.get_pixmap.return_value = fake_pixmap
+    fake_page.get_text.return_value = "Figure 1: Caption text.\n"
+
+    fake_doc = MagicMock()
+    fake_doc.__getitem__.return_value = fake_page
+
+    picture_box = LayoutBox(100, 100, 300, 300, "picture")
+    caption_box = LayoutBox(95, 310, 320, 325, "figure_caption")
+    entries: dict[str, dict] = {}
+
+    mod._process_page(
+        fake_doc,
+        0,
+        [picture_box, caption_box],
+        images_dest=out_dir,
+        manifest=entries,
+        seen_labels=set(),
+        arxiv_id="2401.00001",
+    )
+
+    # Content [100,100,300,300] ∪ caption [95,310,320,325], padded on every
+    # side by _REGION_PADDING=5
+    # → Rect(90, 95, 325, 330)
+    render_kwargs = fake_page.get_pixmap.call_args.kwargs
+    assert render_kwargs["clip"] == pymupdf.Rect(90, 95, 325, 330)
+
+
+def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch):
+    """Check that relabeling keeps an existing "doclayout" caption intact.
+
+    The manifest is seeded with one image whose entry already carries
+    `caption_text` and `caption_source`. `paper_dir` and `pymupdf.open` are
+    replaced so the call works against the temp directory and a mocked
+    document. After `label_images_by_summary` runs with a single
+    "Figure 1" summary, the entry is expected under "figure_1.jpg" with the
+    original caption fields unchanged and the summary caption stored
+    separately as `summary_caption_text`.
+    """
+    arxiv_id = "2401.00001"
+    paper_root = tmp_path / arxiv_id
+    images_dir = paper_root / "images"
+    images_dir.mkdir(parents=True)
+    manifest_path = images_dir / "manifest.json"
+
+    (images_dir / "figure_(p1-1).jpg").write_bytes(b"jpeg")
+    seed_entry = {
+        "page": 1,
+        "type": "figure",
+        "label": "Figure (p1-1)",
+        "box": [100, 100, 300, 300],
+        "caption_text": "Figure 1: PDF original caption.",
+        "caption_source": "doclayout",
+    }
+    manifest_path.write_text(json.dumps({"figure_(p1-1).jpg": seed_entry}))
+
+    pdf_path = tmp_path / "paper.pdf"
+    pdf_path.write_bytes(b"%PDF")
+    monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)
+
+    fake_page = MagicMock()
+    fake_page.search_for.return_value = [pymupdf.Rect(120, 305, 180, 320)]
+
+    # The mocked document also acts as a context manager that yields itself.
+    fake_doc = MagicMock(page_count=1)
+    fake_doc.__getitem__.return_value = fake_page
+    fake_doc.__enter__.return_value = fake_doc
+    fake_doc.__exit__.return_value = False
+    monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)
+
+    summary_figures = [{"id": "Figure 1", "caption": "Summary caption."}]
+    labeled = mod.label_images_by_summary(
+        arxiv_id,
+        summary_figures,
+        pdf_path=pdf_path,
+    )
+
+    assert labeled == 1
+    stored = json.loads(manifest_path.read_text())
+    relabeled = stored["figure_1.jpg"]
+    assert relabeled["caption_text"] == "Figure 1: PDF original caption."
+    assert relabeled["caption_source"] == "doclayout"
+    assert relabeled["summary_caption_text"] == "Summary caption."