1ccac1f29a
- Extract captions from PDF text dict instead of DocLayout caption boxes - Use _CaptionBlock dataclass to carry authoritative ID, kind, text, bbox - Pair captions to content boxes with directional preference (figure below, table above) - Filter out uncaptioned boxes (Algorithm pseudo-code, unnumbered appendix tables, false positives) - Remove label_images_by_summary and Phase 2 rename pipeline entirely - Update tests to cover text-based caption pairing and filtering
171 lines
4.5 KiB
Python
171 lines
4.5 KiB
Python
from __future__ import annotations
|
||
|
||
from unittest.mock import MagicMock
|
||
|
||
import pymupdf
|
||
|
||
from app.services import pdf_image_extractor as mod
|
||
from app.services.layout_detector import LayoutBox
|
||
|
||
|
||
def _caption_block(
    bbox: tuple[float, float, float, float], text: str
) -> dict[str, object]:
    """Build a text block shaped like an entry of ``page.get_text("dict")``.

    Args:
        bbox: Block rectangle as ``(x0, y0, x1, y1)``; copied into a new list.
        text: Caption text stored in the block's only span.

    Returns:
        A dict with ``"type"`` 0, the ``"bbox"`` list, and one line holding
        one span that carries ``text``.
    """
    return {
        "type": 0,
        "bbox": list(bbox),
        "lines": [{"spans": [{"text": text}]}],
    }
|
||
|
||
|
||
def test_process_page_pairs_caption_from_text(tmp_path):
    """A caption read from the PDF text stream names the output by its own ID.

    The figure caption sits below the content box, and the manifest entry is
    expected to be keyed directly by the caption's ID.
    """
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    manifest: dict[str, dict] = {}

    caption = _caption_block((95, 310, 320, 325), "Figure 1: Overall architecture.")

    fake_pixmap = MagicMock()
    fake_pixmap.tobytes.return_value = b"jpeg"

    fake_page = MagicMock()
    fake_page.rect.width = 600
    fake_page.rect.height = 800
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.get_text.return_value = {"blocks": [caption]}

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    extracted = mod._process_page(
        fake_doc,
        0,
        [LayoutBox(100, 100, 300, 300, "picture")],
        images_dest=out_dir,
        manifest=manifest,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert extracted == 1

    # The caption carries its own ID, so the file is named figure_1.jpg.
    entry = manifest["figure_1.jpg"]
    expected = {
        "label": "Figure 1",
        "caption_text": "Figure 1: Overall architecture.",
        "caption_source": "text",
    }
    assert {key: entry[key] for key in expected} == expected
|
||
|
||
|
||
def test_process_page_includes_caption_in_render(tmp_path):
    """The rendered clip merges the caption text block with the content box."""
    images_dest = tmp_path / "images"
    images_dest.mkdir()
    manifest: dict[str, dict] = {}

    pix = MagicMock()
    pix.tobytes.return_value = b"jpeg"

    page = MagicMock()
    page.rect.width = 600
    page.rect.height = 800
    page.get_pixmap.return_value = pix
    page.get_text.return_value = {
        "blocks": [_caption_block((95, 310, 320, 325), "Figure 1: Caption text.")]
    }

    doc = MagicMock()
    doc.__getitem__.return_value = page

    boxes = [LayoutBox(100, 100, 300, 300, "picture")]

    mod._process_page(
        doc,
        0,
        boxes,
        images_dest=images_dest,
        manifest=manifest,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    # Fail with a clear message if nothing was rendered; otherwise call_args
    # is None and the lookup below raises an opaque AttributeError.
    page.get_pixmap.assert_called()

    # content [100,100,300,300] ∪ caption [95,310,320,325], padded on every
    # side by _REGION_PADDING=5 → Rect(90, 95, 325, 330)
    clip = page.get_pixmap.call_args.kwargs["clip"]
    assert clip == pymupdf.Rect(90, 95, 325, 330)
|
||
|
||
|
||
def test_process_page_table_caption_above(tmp_path):
    """A table caption conventionally sits above the content; the pair is named table_N.jpg."""
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    manifest: dict[str, dict] = {}

    # Caption above the content at [80, 90, 320, 105]; table body at
    # [80, 120, 320, 280].
    caption = _caption_block((80, 90, 320, 105), "Table 2 | Results summary.")
    table_box = LayoutBox(80, 120, 320, 280, "table")

    fake_pixmap = MagicMock()
    fake_pixmap.tobytes.return_value = b"jpeg"

    fake_page = MagicMock()
    fake_page.rect.width = 600
    fake_page.rect.height = 800
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.get_text.return_value = {"blocks": [caption]}

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    extracted = mod._process_page(
        fake_doc,
        0,
        [table_box],
        images_dest=out_dir,
        manifest=manifest,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert extracted == 1

    entry = manifest["table_2.jpg"]
    assert entry["label"] == "Table 2"
    assert entry["caption_source"] == "text"
|
||
|
||
|
||
def test_process_page_filters_uncaptioned(tmp_path):
    """A box with no paired Figure/Table caption is filtered out and yields no output.

    Covers cases such as Algorithm blocks and unnumbered tables.
    """
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    manifest: dict[str, dict] = {}

    fake_pixmap = MagicMock()
    fake_pixmap.tobytes.return_value = b"jpeg"

    fake_page = MagicMock()
    fake_page.rect.width = 600
    fake_page.rect.height = 800
    fake_page.get_pixmap.return_value = fake_pixmap
    # The page text contains no caption blocks at all.
    fake_page.get_text.return_value = {"blocks": []}

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    extracted = mod._process_page(
        fake_doc,
        0,
        [LayoutBox(100, 100, 300, 300, "picture")],
        images_dest=out_dir,
        manifest=manifest,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert extracted == 0
    assert manifest == {}
|