# daily-paper/tests/test_pdf_image_extractor.py

from __future__ import annotations

import json
from unittest.mock import MagicMock

import pymupdf

from app.services import pdf_image_extractor as mod
from app.services.layout_detector import LayoutBox


def test_process_page_extracts_doclayout_caption(tmp_path):
    """A ``figure_caption`` box below a ``picture`` box ends up in the manifest.

    ``_process_page`` is driven with a mocked document whose page text is the
    caption line; the resulting manifest entry must carry the caption text
    (without its trailing newline), the ``doclayout`` source tag and the
    caption box coordinates as floats.
    """
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    # Fake pixmap: only the encoded bytes matter to the test.
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})

    fake_page = MagicMock()
    fake_page.get_text.return_value = "Figure 1: Overall architecture.\n"
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.rect.width = 600

    # Any page index resolves to the single fake page.
    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    picture_box = LayoutBox(100, 100, 300, 300, "picture")
    caption_box = LayoutBox(95, 310, 320, 325, "figure_caption")

    count = mod._process_page(
        fake_doc,
        0,
        [picture_box, caption_box],
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert count == 1

    entry = records["figure_(p1-1).jpg"]
    expected_fields = {
        "caption_text": "Figure 1: Overall architecture.",
        "caption_source": "doclayout",
        "caption_box": [95.0, 310.0, 320.0, 325.0],
    }
    assert {key: entry[key] for key in expected_fields} == expected_fields


def test_process_page_includes_caption_in_render(tmp_path):
    """The caption region is merged into the same rendered screenshot."""
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    fake_pixmap = MagicMock()
    fake_pixmap.tobytes.return_value = b"jpeg"

    fake_page = MagicMock()
    fake_page.get_text.return_value = "Figure 1: Caption text.\n"
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.rect.width = 600

    # Any page index resolves to the single fake page.
    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    layout = [
        LayoutBox(100, 100, 300, 300, "picture"),
        LayoutBox(95, 310, 320, 325, "figure_caption"),
    ]

    mod._process_page(
        fake_doc,
        0,
        layout,
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    # Content [100,100,300,300] united with caption [95,310,320,325], then
    # padded by _REGION_PADDING=5 on every side -> Rect(90, 95, 325, 330).
    render_kwargs = fake_page.get_pixmap.call_args.kwargs
    expected_clip = pymupdf.Rect(90, 95, 325, 330)
    assert render_kwargs["clip"] == expected_clip


def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch):
    """Relabeling from a summary keeps the caption already stored in the manifest.

    The manifest is seeded with a ``doclayout`` caption for a provisional
    figure; after ``label_images_by_summary`` runs, the entry keyed
    ``figure_1.jpg`` must still hold that caption and source, with the
    summary's caption stored separately under ``summary_caption_text``.
    """
    arxiv_id = "2401.00001"
    paper_root = tmp_path / arxiv_id
    images_dir = paper_root / "images"
    images_dir.mkdir(parents=True)
    manifest_file = images_dir / "manifest.json"

    # Seed the on-disk state: one extracted image plus its manifest entry.
    seed_manifest = {
        "figure_(p1-1).jpg": {
            "page": 1,
            "type": "figure",
            "label": "Figure (p1-1)",
            "box": [100, 100, 300, 300],
            "caption_text": "Figure 1: PDF original caption.",
            "caption_source": "doclayout",
        }
    }
    (images_dir / "figure_(p1-1).jpg").write_bytes(b"jpeg")
    manifest_file.write_text(json.dumps(seed_manifest))

    pdf_path = tmp_path / "paper.pdf"
    pdf_path.write_bytes(b"%PDF")

    # Fake single-page document, usable both directly and as a context manager.
    fake_page = MagicMock()
    fake_page.search_for.return_value = [pymupdf.Rect(120, 305, 180, 320)]

    fake_doc = MagicMock()
    fake_doc.page_count = 1
    fake_doc.__getitem__.return_value = fake_page
    fake_doc.__enter__.return_value = fake_doc
    fake_doc.__exit__.return_value = False

    # Redirect the module's paper directory lookup and PDF opening to the fakes.
    monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)
    monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)

    summary_figures = [{"id": "Figure 1", "caption": "Summary caption."}]
    labeled = mod.label_images_by_summary(
        arxiv_id,
        summary_figures,
        pdf_path=pdf_path,
    )

    assert labeled == 1

    updated = json.loads(manifest_file.read_text())
    entry = updated["figure_1.jpg"]
    assert entry["caption_text"] == "Figure 1: PDF original caption."
    assert entry["caption_source"] == "doclayout"
    assert entry["summary_caption_text"] == "Summary caption."