Files
daily-paper/tests/test_pdf_image_extractor.py
T
Rain-Bus 1ccac1f29a refactor: replace Phase 2 label matching with PDF text-stream caption pairing
- Extract captions from PDF text dict instead of DocLayout caption boxes
- Use _CaptionBlock dataclass to carry authoritative ID, kind, text, bbox
- Pair captions to content boxes with directional preference (figure below, table above)
- Filter out uncaptioned boxes (Algorithm pseudo-code, unnumbered appendix tables, false positives)
- Remove label_images_by_summary and Phase 2 rename pipeline entirely
- Update tests to cover text-based caption pairing and filtering
2026-06-15 01:09:29 +08:00

171 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
from unittest.mock import MagicMock
import pymupdf
from app.services import pdf_image_extractor as mod
from app.services.layout_detector import LayoutBox
def _caption_block(bbox, text):
"""构造一个 page.get_text("dict") 风格的文本块。"""
return {
"type": 0,
"bbox": list(bbox),
"lines": [{"spans": [{"text": text}]}],
}
def test_process_page_pairs_caption_from_text(tmp_path):
    """A caption taken from the PDF text stream names the extracted image.

    The figure caption sits below the content box; its own ID ("Figure 1")
    is expected to be used directly for the output file name.
    """
    caption = "Figure 1: Overall architecture."

    # Fake 600x800 page whose text dict holds one caption block located
    # just under the picture box used below.
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})
    fake_page = MagicMock()
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.get_text.return_value = {
        "blocks": [_caption_block((95, 310, 320, 325), caption)]
    }
    fake_page.rect.width = 600
    fake_page.rect.height = 800

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}
    picture_boxes = [LayoutBox(100, 100, 300, 300, "picture")]

    extracted = mod._process_page(
        fake_doc,
        0,
        picture_boxes,
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert extracted == 1
    # The caption carries its own ID, so the file is named figure_1.jpg.
    entry = records["figure_1.jpg"]
    expected_fields = {
        "label": "Figure 1",
        "caption_text": caption,
        "caption_source": "text",
    }
    for field, expected in expected_fields.items():
        assert entry[field] == expected
def test_process_page_includes_caption_in_render(tmp_path):
    """The rendered clip covers the content box and its caption block together."""
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})
    fake_page = MagicMock()
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.get_text.return_value = {
        "blocks": [_caption_block((95, 310, 320, 325), "Figure 1: Caption text.")]
    }
    fake_page.rect.width = 600
    fake_page.rect.height = 800

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    mod._process_page(
        fake_doc,
        0,
        [LayoutBox(100, 100, 300, 300, "picture")],
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    # Content [100, 100, 300, 300] merged with caption [95, 310, 320, 325]
    # gives [95, 100, 320, 325]; padding every side by 5 (_REGION_PADDING,
    # per the original comment) yields Rect(90, 95, 325, 330).
    render_kwargs = fake_page.get_pixmap.call_args.kwargs
    clip = render_kwargs["clip"]
    assert clip == pymupdf.Rect(90, 95, 325, 330)
def test_process_page_table_caption_above(tmp_path):
    """A table caption placed above the content pairs up and names table_N.jpg."""
    # Caption sits above the content: caption [80, 90, 320, 105],
    # table content [80, 120, 320, 280].
    caption_bbox = (80, 90, 320, 105)
    table_box = LayoutBox(80, 120, 320, 280, "table")

    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})
    fake_page = MagicMock()
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.get_text.return_value = {
        "blocks": [_caption_block(caption_bbox, "Table 2 | Results summary.")]
    }
    fake_page.rect.width = 600
    fake_page.rect.height = 800

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    extracted = mod._process_page(
        fake_doc,
        0,
        [table_box],
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert extracted == 1
    entry = records["table_2.jpg"]
    assert entry["label"] == "Table 2"
    assert entry["caption_source"] == "text"
def test_process_page_filters_uncaptioned(tmp_path):
    """Boxes with no paired Figure/Table caption are dropped from the output.

    This covers cases such as Algorithm listings and unnumbered tables:
    nothing is counted as extracted and the manifest stays empty.
    """
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})
    fake_page = MagicMock()
    fake_page.get_pixmap.return_value = fake_pixmap
    # The page text dict contains no blocks at all, hence no captions.
    fake_page.get_text.return_value = {"blocks": []}
    fake_page.rect.width = 600
    fake_page.rect.height = 800

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}
    picture_boxes = [LayoutBox(100, 100, 300, 300, "picture")]

    extracted = mod._process_page(
        fake_doc,
        0,
        picture_boxes,
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert extracted == 0
    assert records == {}