from __future__ import annotations

import json
from unittest.mock import MagicMock

import pymupdf

from app.services import pdf_image_extractor as mod
from app.services.layout_detector import LayoutBox


def _process_mock_figure_page(tmp_path, caption_line):
    """Run ``mod._process_page`` on a mocked page with one picture and one caption.

    The page is a ``MagicMock`` whose ``get_text`` returns ``caption_line`` and
    whose ``get_pixmap`` returns a pixmap mock yielding ``b"jpeg"`` bytes. The
    layout holds a "picture" box and a "figure_caption" box.

    Returns:
        A ``(extracted, manifest, page)`` tuple: the value returned by
        ``_process_page``, the manifest dict passed to it, and the page mock
        (so callers can inspect how it was rendered).
    """
    images_dest = tmp_path / "images"
    images_dest.mkdir()
    manifest: dict[str, dict] = {}

    pixmap = MagicMock()
    pixmap.tobytes.return_value = b"jpeg"

    page = MagicMock()
    page.rect.width = 600
    page.get_pixmap.return_value = pixmap
    page.get_text.return_value = caption_line

    # Any page index looked up on the document resolves to the same mocked page.
    doc = MagicMock()
    doc.__getitem__.return_value = page

    layout = [
        LayoutBox(100, 100, 300, 300, "picture"),
        LayoutBox(95, 310, 320, 325, "figure_caption"),
    ]
    extracted = mod._process_page(
        doc,
        0,
        layout,
        images_dest=images_dest,
        manifest=manifest,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )
    return extracted, manifest, page


def test_process_page_extracts_doclayout_caption(tmp_path):
    """The manifest entry records the caption text, its source and its box."""
    extracted, manifest, _page = _process_mock_figure_page(
        tmp_path, "Figure 1: Overall architecture.\n"
    )

    assert extracted == 1

    entry = manifest["figure_(p1-1).jpg"]
    expected = {
        "caption_text": "Figure 1: Overall architecture.",
        "caption_source": "doclayout",
        "caption_box": [95.0, 310.0, 320.0, 325.0],
    }
    assert {key: entry[key] for key in expected} == expected


def test_process_page_includes_caption_in_render(tmp_path):
    """The caption region is merged into the same rendered screenshot."""
    _extracted, _manifest, page = _process_mock_figure_page(
        tmp_path, "Figure 1: Caption text.\n"
    )

    # Content [100,100,300,300] ∪ caption [95,310,320,325], padded by
    # _REGION_PADDING=5 on every side → Rect(90, 95, 325, 330).
    render_kwargs = page.get_pixmap.call_args.kwargs
    assert render_kwargs["clip"] == pymupdf.Rect(90, 95, 325, 330)


def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch):
    """Relabeling by summary keeps the doclayout caption already in the manifest.

    The summary caption is expected under ``summary_caption_text`` while
    ``caption_text`` / ``caption_source`` keep their original values.
    """
    arxiv_id = "2401.00001"
    paper_root = tmp_path / arxiv_id
    images_dest = paper_root / "images"
    images_dest.mkdir(parents=True)
    manifest_path = images_dest / "manifest.json"

    # Seed the paper directory with one extracted figure and its manifest entry.
    (images_dest / "figure_(p1-1).jpg").write_bytes(b"jpeg")
    seed_manifest = {
        "figure_(p1-1).jpg": {
            "page": 1,
            "type": "figure",
            "label": "Figure (p1-1)",
            "box": [100, 100, 300, 300],
            "caption_text": "Figure 1: PDF original caption.",
            "caption_source": "doclayout",
        }
    }
    manifest_path.write_text(json.dumps(seed_manifest))

    pdf_path = tmp_path / "paper.pdf"
    pdf_path.write_bytes(b"%PDF")

    monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)

    # Mocked single-page document; text search on the page returns one hit
    # rectangle.
    page = MagicMock()
    page.search_for.return_value = [pymupdf.Rect(120, 305, 180, 320)]

    fake_doc = MagicMock()
    fake_doc.page_count = 1
    fake_doc.__getitem__.return_value = page
    # Make the mock usable as ``with pymupdf.open(...) as doc``.
    fake_doc.__enter__.return_value = fake_doc
    fake_doc.__exit__.return_value = False
    monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)

    summary_figures = [{"id": "Figure 1", "caption": "Summary caption."}]
    labeled = mod.label_images_by_summary(
        arxiv_id,
        summary_figures,
        pdf_path=pdf_path,
    )

    assert labeled == 1

    updated = json.loads(manifest_path.read_text())
    entry = updated["figure_1.jpg"]
    assert entry["caption_text"] == "Figure 1: PDF original caption."
    assert entry["caption_source"] == "doclayout"
    assert entry["summary_caption_text"] == "Summary caption."