from __future__ import annotations

from unittest.mock import MagicMock

import pymupdf

from app.services import pdf_image_extractor as mod
from app.services.layout_detector import LayoutBox


def _caption_block(bbox, text):
    """Build a text block shaped like an entry of ``page.get_text("dict")``."""
    span = {"text": text}
    line = {"spans": [span]}
    return {"type": 0, "bbox": list(bbox), "lines": [line]}


def _fake_document(blocks):
    """Return ``(doc, page)`` mocks for a 600x800 page exposing *blocks*.

    ``page.get_text()`` yields ``{"blocks": blocks}`` and ``page.get_pixmap()``
    yields a pixmap mock whose ``tobytes()`` returns ``b"jpeg"``. Indexing
    ``doc`` with any key returns ``page``.
    """
    pixmap = MagicMock()
    pixmap.tobytes.return_value = b"jpeg"

    page = MagicMock()
    page.rect.width = 600
    page.rect.height = 800
    page.get_pixmap.return_value = pixmap
    page.get_text.return_value = {"blocks": blocks}

    doc = MagicMock()
    doc.__getitem__.return_value = page
    return doc, page


def _extract_first_page(tmp_path, doc, layout_boxes):
    """Run ``mod._process_page`` on page 0 and return ``(count, manifest)``.

    A fresh ``images`` directory under *tmp_path*, an empty manifest and an
    empty ``seen_labels`` set are created for every call.
    """
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    manifest: dict[str, dict] = {}
    count = mod._process_page(
        doc,
        0,
        layout_boxes,
        images_dest=out_dir,
        manifest=manifest,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )
    return count, manifest


def test_process_page_pairs_caption_from_text(tmp_path):
    """Caption comes from the PDF text stream (figure caption below the
    content); its ID is used directly to name the output file."""
    caption = _caption_block(
        (95, 310, 320, 325), "Figure 1: Overall architecture."
    )
    doc, _page = _fake_document([caption])
    picture = LayoutBox(100, 100, 300, 300, "picture")

    extracted, manifest = _extract_first_page(tmp_path, doc, [picture])

    assert extracted == 1
    # The caption carries its own ID, so the file is named figure_1.jpg.
    entry = manifest["figure_1.jpg"]
    assert entry["label"] == "Figure 1"
    assert entry["caption_text"] == "Figure 1: Overall architecture."
    assert entry["caption_source"] == "text"


def test_process_page_includes_caption_in_render(tmp_path):
    """The caption text block's area is merged into the same rendered clip."""
    caption = _caption_block((95, 310, 320, 325), "Figure 1: Caption text.")
    doc, page = _fake_document([caption])
    picture = LayoutBox(100, 100, 300, 300, "picture")

    _extract_first_page(tmp_path, doc, [picture])

    # Content [100,100,300,300] ∪ caption [95,310,320,325], padded by
    # _REGION_PADDING=5 on every side → Rect(90, 95, 325, 330).
    rendered_clip = page.get_pixmap.call_args.kwargs["clip"]
    assert rendered_clip == pymupdf.Rect(90, 95, 325, 330)


def test_process_page_table_caption_above(tmp_path):
    """Table captions conventionally sit above the content; once paired the
    output is named table_N.jpg."""
    # Caption above the content at [80, 90, 320, 105]; table content at
    # [80, 120, 320, 280].
    caption = _caption_block((80, 90, 320, 105), "Table 2 | Results summary.")
    doc, _page = _fake_document([caption])
    table = LayoutBox(80, 120, 320, 280, "table")

    extracted, manifest = _extract_first_page(tmp_path, doc, [table])

    assert extracted == 1
    entry = manifest["table_2.jpg"]
    assert entry["label"] == "Table 2"
    assert entry["caption_source"] == "text"


def test_process_page_filters_uncaptioned(tmp_path):
    """Boxes with no paired Figure/Table caption (algorithms, unnumbered
    tables, etc.) are filtered out and produce no output."""
    doc, _page = _fake_document([])  # no caption text blocks at all
    picture = LayoutBox(100, 100, 300, 300, "picture")

    extracted, manifest = _extract_first_page(tmp_path, doc, [picture])

    assert extracted == 0
    assert manifest == {}