feat: add concurrency safety, caption detection, admin enhancements, and performance improvements
This commit is contained in:
@@ -0,0 +1,134 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pymupdf
|
||||
|
||||
from app.services import pdf_image_extractor as mod
|
||||
from app.services.layout_detector import LayoutBox
|
||||
|
||||
|
||||
def test_process_page_extracts_doclayout_caption(tmp_path):
    """Check that a ``figure_caption`` layout box ends up in the manifest.

    A mocked page carrying one ``picture`` box and one ``figure_caption``
    box is passed to ``_process_page``.  The test expects exactly one
    extracted image whose manifest entry holds the caption text (without
    the trailing newline returned by ``get_text``), the ``"doclayout"``
    source tag and the caption box coordinates as floats.
    """
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    # Stand-ins for the PyMuPDF pixmap / page / document objects.
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})
    fake_page = MagicMock(
        **{
            "rect.width": 600,
            "get_pixmap.return_value": fake_pixmap,
            "get_text.return_value": "Figure 1: Overall architecture.\n",
        }
    )
    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    picture_box = LayoutBox(100, 100, 300, 300, "picture")
    caption_box = LayoutBox(95, 310, 320, 325, "figure_caption")

    extracted = mod._process_page(
        fake_doc,
        0,
        [picture_box, caption_box],
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert extracted == 1

    info = records["figure_(p1-1).jpg"]
    expected_fields = {
        "caption_text": "Figure 1: Overall architecture.",
        "caption_source": "doclayout",
        "caption_box": [95.0, 310.0, 320.0, 325.0],
    }
    for field, wanted in expected_fields.items():
        assert info[field] == wanted
|
||||
|
||||
|
||||
def test_process_page_includes_caption_in_render(tmp_path):
    """When rendering, the caption region is merged into the same screenshot."""
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    # Stand-ins for the PyMuPDF pixmap / page / document objects.
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})
    fake_page = MagicMock(
        **{
            "rect.width": 600,
            "get_pixmap.return_value": fake_pixmap,
            "get_text.return_value": "Figure 1: Caption text.\n",
        }
    )
    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    layout = [
        LayoutBox(100, 100, 300, 300, "picture"),
        LayoutBox(95, 310, 320, 325, "figure_caption"),
    ]

    mod._process_page(
        fake_doc,
        0,
        layout,
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    # Content [100,100,300,300] ∪ caption [95,310,320,325], then padded by
    # _REGION_PADDING=5 on every side
    # → Rect(90, 95, 325, 330)
    render_kwargs = fake_page.get_pixmap.call_args.kwargs
    clip = render_kwargs["clip"]
    assert clip == pymupdf.Rect(90, 95, 325, 330)
|
||||
|
||||
|
||||
def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch):
    """Check that summary labelling keeps a caption sourced from doclayout.

    The paper directory is seeded with one image and a manifest entry that
    already carries a ``"doclayout"`` caption.  ``paper_dir`` and
    ``pymupdf.open`` are patched so ``label_images_by_summary`` works on
    that directory and on a mocked single-page document.  Afterwards the
    manifest is expected to contain ``figure_1.jpg`` with the original
    caption fields unchanged and the summary caption stored under
    ``summary_caption_text``.
    """
    arxiv_id = "2401.00001"
    paper_root = tmp_path / arxiv_id
    image_dir = paper_root / "images"
    image_dir.mkdir(parents=True)

    manifest_path = image_dir / "manifest.json"
    seed_manifest = {
        "figure_(p1-1).jpg": {
            "page": 1,
            "type": "figure",
            "label": "Figure (p1-1)",
            "box": [100, 100, 300, 300],
            "caption_text": "Figure 1: PDF original caption.",
            "caption_source": "doclayout",
        }
    }
    (image_dir / "figure_(p1-1).jpg").write_bytes(b"jpeg")
    manifest_path.write_text(json.dumps(seed_manifest))

    pdf_path = tmp_path / "paper.pdf"
    pdf_path.write_bytes(b"%PDF")
    monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)

    # Mocked page whose text search reports a single hit rectangle.
    fake_page = MagicMock(
        **{"search_for.return_value": [pymupdf.Rect(120, 305, 180, 320)]}
    )

    # Mocked document usable both by indexing and as a context manager.
    fake_doc = MagicMock(page_count=1)
    fake_doc.__getitem__.return_value = fake_page
    fake_doc.__enter__.return_value = fake_doc
    fake_doc.__exit__.return_value = False
    monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)

    summary_figures = [{"id": "Figure 1", "caption": "Summary caption."}]
    labeled = mod.label_images_by_summary(
        arxiv_id,
        summary_figures,
        pdf_path=pdf_path,
    )

    assert labeled == 1

    manifest = json.loads(manifest_path.read_text())
    info = manifest["figure_1.jpg"]
    expected_fields = {
        "caption_text": "Figure 1: PDF original caption.",
        "caption_source": "doclayout",
        "summary_caption_text": "Summary caption.",
    }
    for field, wanted in expected_fields.items():
        assert info[field] == wanted
|
||||
Reference in New Issue
Block a user