Files
daily-paper/tests/test_pdf_image_extractor.py
T

135 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import json
from unittest.mock import MagicMock
import pymupdf
from app.services import pdf_image_extractor as mod
from app.services.layout_detector import LayoutBox
def test_process_page_extracts_doclayout_caption(tmp_path):
    """Check that a detected caption box is recorded for the extracted figure.

    Feeds ``_process_page`` one ``picture`` box and one ``figure_caption``
    box on a mocked page, then verifies the manifest entry carries the
    caption text (without the trailing newline returned by ``get_text``),
    the ``doclayout`` source marker, and the caption box as floats.
    """
    # Output directory plus the manifest dict the call is expected to fill.
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    # Mocked page: rendering yields dummy JPEG bytes, text lookup yields
    # the caption line terminated by a newline.
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})
    fake_page = MagicMock()
    fake_page.get_text.return_value = "Figure 1: Overall architecture.\n"
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.rect.width = 600

    # Any page index on the mocked document resolves to the page above.
    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    picture_box = LayoutBox(100, 100, 300, 300, "picture")
    caption_box = LayoutBox(95, 310, 320, 325, "figure_caption")

    count = mod._process_page(
        fake_doc,
        0,
        [picture_box, caption_box],
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert count == 1
    entry = records["figure_(p1-1).jpg"]
    assert entry["caption_text"] == "Figure 1: Overall architecture."
    assert entry["caption_source"] == "doclayout"
    assert entry["caption_box"] == [95.0, 310.0, 320.0, 325.0]
def test_process_page_includes_caption_in_render(tmp_path):
    """The caption region is merged into the same rendered screenshot."""
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    # Mocked page whose render call is inspected after the run.
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})
    fake_page = MagicMock()
    fake_page.get_text.return_value = "Figure 1: Caption text.\n"
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.rect.width = 600

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    picture_box = LayoutBox(100, 100, 300, 300, "picture")
    caption_box = LayoutBox(95, 310, 320, 325, "figure_caption")

    mod._process_page(
        fake_doc,
        0,
        [picture_box, caption_box],
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    # Content box [100,100,300,300] together with caption box
    # [95,310,320,325], padded by _REGION_PADDING=5 in every direction
    # -> Rect(90, 95, 325, 330)
    expected_clip = pymupdf.Rect(90, 95, 325, 330)
    clip = fake_page.get_pixmap.call_args.kwargs["clip"]
    assert clip == expected_clip
def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch):
    """Check that relabelling from a summary keeps the PDF-derived caption.

    Seeds a manifest whose single entry already has a ``doclayout`` caption,
    runs ``label_images_by_summary`` with a summary entry for "Figure 1",
    and verifies the entry is found under ``figure_1.jpg`` with the original
    caption fields unchanged and the summary caption stored separately under
    ``summary_caption_text``.
    """
    arxiv_id = "2401.00001"

    # On-disk layout: <tmp>/<arxiv_id>/images/ with one image and a manifest.
    paper_root = tmp_path / arxiv_id
    images_dest = paper_root / "images"
    images_dest.mkdir(parents=True)
    (images_dest / "figure_(p1-1).jpg").write_bytes(b"jpeg")

    manifest_path = images_dest / "manifest.json"
    seed = {
        "figure_(p1-1).jpg": {
            "page": 1,
            "type": "figure",
            "label": "Figure (p1-1)",
            "box": [100, 100, 300, 300],
            "caption_text": "Figure 1: PDF original caption.",
            "caption_source": "doclayout",
        }
    }
    manifest_path.write_text(json.dumps(seed))

    # Placeholder PDF; the real open call is replaced below.
    pdf_path = tmp_path / "paper.pdf"
    pdf_path.write_bytes(b"%PDF")

    # Mocked one-page document. The text search reports a single hit at
    # Rect(120, 305, 180, 320), which lies just under the seeded figure box.
    # NOTE(review): presumably this proximity is what ties the label to the
    # image; confirm against label_images_by_summary.
    page = MagicMock()
    page.search_for.return_value = [pymupdf.Rect(120, 305, 180, 320)]
    fake_doc = MagicMock(page_count=1)
    fake_doc.__getitem__.return_value = page
    # Make the mock usable as a context manager that yields itself.
    fake_doc.__enter__.return_value = fake_doc
    fake_doc.__exit__.return_value = False

    # Point the module at the temp paper directory and the mocked document.
    monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)
    monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)

    summary_figures = [{"id": "Figure 1", "caption": "Summary caption."}]
    labeled = mod.label_images_by_summary(
        arxiv_id,
        summary_figures,
        pdf_path=pdf_path,
    )

    assert labeled == 1
    stored = json.loads(manifest_path.read_text())
    entry = stored["figure_1.jpg"]
    assert entry["caption_text"] == "Figure 1: PDF original caption."
    assert entry["caption_source"] == "doclayout"
    assert entry["summary_caption_text"] == "Summary caption."