# daily-paper/tests/test_pdf_image_extractor.py

from __future__ import annotations

from unittest.mock import MagicMock

import pymupdf

from app.services import pdf_image_extractor as mod
from app.services.layout_detector import LayoutBox


def _caption_block(bbox, text):
    """构造一个 page.get_text("dict") 风格的文本块。"""
    return {
        "type": 0,
        "bbox": list(bbox),
        "lines": [{"spans": [{"text": text}]}],
    }


def test_process_page_pairs_caption_from_text(tmp_path):
    """Pair a picture box with a caption taken from the PDF text stream.

    The figure caption block sits just below the picture box, and the manifest
    entry is expected to be named after the ID carried by the caption text.
    """
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    # Rendering the fake page yields a pixmap whose encoded bytes are b"jpeg".
    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})

    # 600x800 fake page whose text dict holds a single caption block.
    fake_page = MagicMock()
    fake_page.rect.width, fake_page.rect.height = 600, 800
    fake_page.get_pixmap.return_value = fake_pixmap
    caption = _caption_block((95, 310, 320, 325), "Figure 1: Overall architecture.")
    fake_page.get_text.return_value = {"blocks": [caption]}

    # Any index into the fake document resolves to the fake page.
    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    count = mod._process_page(
        fake_doc,
        0,
        [LayoutBox(100, 100, 300, 300, "picture")],
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert count == 1
    # The caption carries its own ID, so the file is named figure_1.jpg directly.
    entry = records["figure_1.jpg"]
    expected_fields = {
        "label": "Figure 1",
        "caption_text": "Figure 1: Overall architecture.",
        "caption_source": "text",
    }
    for field, wanted in expected_fields.items():
        assert entry[field] == wanted


def test_process_page_includes_caption_in_render(tmp_path):
    """Check that the rendered clip covers both the content box and its caption."""
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    fake_pixmap = MagicMock()
    fake_pixmap.tobytes.return_value = b"jpeg"

    # 600x800 fake page with one caption block below the picture box.
    caption = _caption_block((95, 310, 320, 325), "Figure 1: Caption text.")
    fake_page = MagicMock()
    fake_page.rect.width = 600
    fake_page.rect.height = 800
    fake_page.get_text.return_value = {"blocks": [caption]}
    fake_page.get_pixmap.return_value = fake_pixmap

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    picture_boxes = [LayoutBox(100, 100, 300, 300, "picture")]

    mod._process_page(
        fake_doc,
        0,
        picture_boxes,
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    # Content [100,100,300,300] united with caption [95,310,320,325], then
    # padded on every side (expected value assumes _REGION_PADDING=5),
    # gives Rect(90, 95, 325, 330).
    render_kwargs = fake_page.get_pixmap.call_args.kwargs
    assert render_kwargs["clip"] == pymupdf.Rect(90, 95, 325, 330)


def test_process_page_table_caption_above(tmp_path):
    """Pair a table box with a caption placed above it; expect ``table_N.jpg``."""
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    fake_pixmap = MagicMock(**{"tobytes.return_value": b"jpeg"})

    # Caption at [80, 90, 320, 105] sits above the table content at
    # [80, 120, 320, 280].
    caption = _caption_block((80, 90, 320, 105), "Table 2 | Results summary.")
    table_boxes = [LayoutBox(80, 120, 320, 280, "table")]

    fake_page = MagicMock()
    fake_page.rect.width, fake_page.rect.height = 600, 800
    fake_page.get_pixmap.return_value = fake_pixmap
    fake_page.get_text.return_value = {"blocks": [caption]}

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    count = mod._process_page(
        fake_doc,
        0,
        table_boxes,
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert count == 1
    entry = records["table_2.jpg"]
    assert entry["label"] == "Table 2"
    assert entry["caption_source"] == "text"


def test_process_page_filters_uncaptioned(tmp_path):
    """Drop a box that has no Figure/Table caption to pair with.

    The page text here contains no blocks at all, so nothing should be
    extracted and the manifest should stay empty. (The original note names
    Algorithm blocks and unnumbered tables as examples of filtered boxes.)
    """
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    records: dict[str, dict] = {}

    fake_pixmap = MagicMock()
    fake_pixmap.tobytes.return_value = b"jpeg"

    fake_page = MagicMock()
    fake_page.rect.width, fake_page.rect.height = 600, 800
    fake_page.get_pixmap.return_value = fake_pixmap
    # No caption text blocks on this page.
    fake_page.get_text.return_value = {"blocks": []}

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    count = mod._process_page(
        fake_doc,
        0,
        [LayoutBox(100, 100, 300, 300, "picture")],
        images_dest=out_dir,
        manifest=records,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert count == 0
    assert records == {}