daily-paper/app/services/pdf_image_extractor.py

"""PDF 图片与表格提取。

DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本流定位 caption → 只渲染配到
Figure/Table 标题的，用 caption 自带权威 ID 命名。没配到标题的（Algorithm 伪代码、
无编号附录表、DocLayout 误检碎片）一律过滤，不输出。

caption 定位用 PDF 文本而非 DocLayout 的 caption box —— 后者检测不稳（多行标题只
框一行→截断、漏检→无标题、配对错误→串台）。page.get_text("dict") 找以
"Figure N"/"Table N" 开头的文本块：文本块天然含完整多行标题，且其 ID 即论文实际
编号，直接命名规避串台。figure 标题优先在下方、table 标题优先在上方配对。
"""

from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass
from pathlib import Path

import pymupdf

from app.services.layout_detector import LayoutBox, detect_page_layout
from app.services.pdf_downloader import paper_dir
from app.utils import PAPERS_DIR, TMP_DIR

logger = logging.getLogger(__name__)

# Outer margin added around each screenshot region (unit: pt)
_REGION_PADDING = 5
# Render zoom factor (3x keeps the screenshots sharp)
_RENDER_ZOOM = 3
# Clustering gap between adjacent boxes (unit: pt) — fragments of the same
# figure/table are usually less than 15pt apart
_CLUSTER_GAP = 15
# Minimum bbox area (unit: pt²) — filters tiny false detections such as icons/logos
_MIN_BOX_AREA = 2000
# Maximum vertical distance between a caption text block and its figure/table
# content block (unit: pt)
_CAPTION_MATCH_DISTANCE = 120
# Score penalty for a pairing on the unexpected side (figure caption above /
# table caption below); still allowed, as a fallback for unusual layouts
_CAPTION_WRONG_SIDE_PENALTY = 300
# Caption head marker: Figure 3 / Fig. 3 / Table C1 / Figure 3.5, etc. (any case)
# ID = starts with a digit, or letter + digit (appendix "C1"); anchored at the
# start of the line so in-text references such as "see Table 3" do not match.
# NOTE: the ID character class includes ".", so a caption written as
# "Figure 3. Overview" yields the ID group "3." (with the trailing period).
_CAPTION_HEAD_RE = re.compile(
    r"^\s*(Figure|Fig\.?|Table)\b\.?\s+([0-9][0-9A-Za-z.]*|[A-Z]\d[0-9A-Za-z.]*)",
    re.IGNORECASE,
)


# ── Box 聚类 ─────────────────────────────────────────────────────────


class _BoxCluster:
    """Merged layout region built from one or more adjacent layout boxes.

    The rectangle is the union of every member's bounds; ``boxclass`` is
    copied from the first member.
    """

    __slots__ = ("x0", "y0", "x1", "y1", "boxclass")

    def __init__(self, boxes: list):
        lefts = [member.x0 for member in boxes]
        tops = [member.y0 for member in boxes]
        rights = [member.x1 for member in boxes]
        bottoms = [member.y1 for member in boxes]

        # Union rectangle: smallest top-left corner, largest bottom-right corner.
        self.x0, self.y0 = min(lefts), min(tops)
        self.x1, self.y1 = max(rights), max(bottoms)
        self.boxclass = boxes[0].boxclass


@dataclass(frozen=True)
class _CaptionBlock:
    """Caption block extracted from the PDF text stream.

    Carries the caption's own ID, its complete multi-line text and its bbox.
    """

    id: str  # e.g. "Figure 3" / "Table C1"
    kind: str  # "figure" | "table"
    text: str  # complete (possibly multi-line) caption text
    bbox: list[float]  # [x0, y0, x1, y1]


def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
    """Return the cluster bounds as ``[x0, y0, x1, y1]``, rounded to 0.1."""
    corners = (cluster.x0, cluster.y0, cluster.x1, cluster.y1)
    return [round(float(value), 1) for value in corners]


def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
    """Merge neighbouring boxes of the same class into clusters.

    Two boxes are linked when they share a ``boxclass`` and pass the
    proximity test in ``_near`` on both axes; links are transitive
    (union-find). Clusters come back ordered by their lowest-indexed member.
    """
    if not boxes:
        return []

    count = len(boxes)
    root_of = list(range(count))

    def _root(node: int) -> int:
        # Walk up to the representative, shortening the path on the way.
        while root_of[node] != node:
            root_of[node] = root_of[root_of[node]]
            node = root_of[node]
        return node

    def _near(a, b) -> bool:
        # Clamped separation on each axis (0 when the projections overlap).
        h_gap = max(0.0, max(a.x0, b.x0) - min(a.x1, b.x1))
        v_gap = max(0.0, max(a.y0, b.y0) - min(a.y1, b.y1))
        # "Close" = projections overlap once each box is widened by `gap`.
        h_close = a.x1 > b.x0 - gap and b.x1 > a.x0 - gap
        v_close = a.y1 > b.y0 - gap and b.y1 > a.y0 - gap
        return (h_gap <= gap and v_close) or (v_gap <= gap and h_close)

    for i, first in enumerate(boxes):
        for j in range(i + 1, count):
            second = boxes[j]
            if first.boxclass != second.boxclass or not _near(first, second):
                continue
            root_i, root_j = _root(i), _root(j)
            if root_i != root_j:
                root_of[root_i] = root_j

    members_by_root: dict[int, list] = {}
    for idx, box in enumerate(boxes):
        members_by_root.setdefault(_root(idx), []).append(box)

    return [_BoxCluster(group) for group in members_by_root.values()]


def _find_caption_blocks(page) -> list[_CaptionBlock]:
    """Extract caption blocks that start with "Figure N" / "Fig. N" / "Table N".

    Captions are located in the PDF text rather than via DocLayout caption
    boxes: a text block naturally holds the complete multi-line caption, and
    its ID is the paper's own numbering (e.g. "Table C1"), independent of
    model detection.

    Args:
        page: Page object; only ``page.get_text("dict")`` is called on it.

    Returns:
        One ``_CaptionBlock`` per matching text block, in the order the
        blocks appear in the text dict. Empty when text extraction fails.
    """
    try:
        text_dict = page.get_text("dict")
    except Exception:
        # Best effort: without text there are no captions, so every figure on
        # this page will be filtered out — leave a trace instead of failing
        # silently.
        logger.warning(
            "Failed to read page text for caption detection", exc_info=True
        )
        return []

    results: list[_CaptionBlock] = []
    for block in text_dict.get("blocks", []):
        if block.get("type") != 0:  # text blocks only
            continue
        lines = block.get("lines", [])
        if not lines:
            continue
        line_texts = [
            "".join(span.get("text", "") for span in line.get("spans", []))
            for line in lines
        ]
        # The caption marker must open the first non-blank line of the block.
        first_line = next((t for t in line_texts if t.strip()), "")
        m = _CAPTION_HEAD_RE.match(first_line)
        if not m:
            continue
        bbox = block.get("bbox")
        if not bbox or len(bbox) != 4:
            continue
        is_table = m.group(1).lower().startswith("table")
        # The ID pattern allows "." (for "3.5"), so "Figure 3. Overview"
        # captures "3."; drop the trailing period so the ID is "Figure 3"
        # and the derived filename is "figure_3.jpg", not "figure_3..jpg".
        num = m.group(2).rstrip(".")
        full_text = " ".join(t.strip() for t in line_texts if t.strip())
        results.append(
            _CaptionBlock(
                id=f"{'Table' if is_table else 'Figure'} {num}",
                kind="table" if is_table else "figure",
                text=full_text,
                bbox=[float(v) for v in bbox],
            )
        )
    return results


def _pair_caption_blocks(
    content_clusters: list[_BoxCluster],
    caption_blocks: list[_CaptionBlock],
) -> dict[int, _CaptionBlock]:
    """Pair each content cluster with the nearest caption of the same kind.

    Figure captions are expected below the content and table captions above
    it. A caption on the expected side wins; one on the other side is still
    accepted but carries a penalty. Candidates are consumed greedily in
    ascending (distance + penalty) order, so each cluster and each caption
    is used at most once.

    Returns:
        Mapping from cluster index to its matched caption block.
    """
    scored: list[tuple[float, int, int]] = []
    for cluster_pos, region in enumerate(content_clusters):
        expects_below = region.boxclass == "picture"  # figure caption sits below
        expected_kind = "figure" if expects_below else "table"
        for caption_pos, caption in enumerate(caption_blocks):
            if caption.kind != expected_kind:
                continue

            left, top, right, bottom = caption.bbox
            # Require horizontal overlap of at least 25% of the narrower box.
            shared = min(region.x1, right) - max(region.x0, left)
            narrower = min(region.x1 - region.x0, right - left)
            if narrower <= 0 or shared < narrower * 0.25:
                continue

            if bottom <= region.y0:  # caption lies above the content
                is_below = False
                distance = region.y0 - bottom
            elif top >= region.y1:  # caption lies below the content
                is_below = True
                distance = top - region.y1
            else:  # vertically overlapping — not a caption for this region
                continue

            if distance > _CAPTION_MATCH_DISTANCE:
                continue

            penalty = 0.0 if is_below == expects_below else _CAPTION_WRONG_SIDE_PENALTY
            scored.append((distance + penalty, cluster_pos, caption_pos))

    scored.sort()

    paired: dict[int, _CaptionBlock] = {}
    taken: set[int] = set()
    for _, cluster_pos, caption_pos in scored:
        if cluster_pos not in paired and caption_pos not in taken:
            paired[cluster_pos] = caption_blocks[caption_pos]
            taken.add(caption_pos)
    return paired


# ── Phase 1: 检测 + 渲染 ──────────────────────────────────────────────


def _render_box(
    page,
    box: _BoxCluster,
    images_dest: Path,
    filename: str,
    cap_type: str,
    page_num: int,
    caption_bbox: list[float] | None = None,
) -> bool:
    """Render one box region and save it as a JPEG.

    When ``caption_bbox`` is given, the content and caption rectangles are
    merged so that a single screenshot holds the figure/table together with
    its full caption. The region is padded by ``_REGION_PADDING`` and
    clamped to the page rectangle.

    Args:
        page: Page to render from.
        box: Content region to capture.
        images_dest: Directory the JPEG is written into.
        filename: Output file name inside ``images_dest``.
        cap_type: Caption kind; used only in the failure log message.
        page_num: 1-based page number; used only in the failure log message.
        caption_bbox: Optional ``[x0, y0, x1, y1]`` of the caption to include.

    Returns:
        True when the image was written, False when rendering failed.
        Errors from encoding or writing the file are not caught here.
    """
    page_width = page.rect.width
    page_height = page.rect.height
    x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
    if caption_bbox is not None:
        cx0, cy0, cx1, cy1 = caption_bbox
        x0 = min(x0, cx0)
        y0 = min(y0, cy0)
        x1 = max(x1, cx1)
        y1 = max(y1, cy1)
    clip = pymupdf.Rect(
        max(0, x0 - _REGION_PADDING),
        max(0, y0 - _REGION_PADDING),
        min(page_width, x1 + _REGION_PADDING),
        min(page_height, y1 + _REGION_PADDING),
    )
    mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
    try:
        pix = page.get_pixmap(matrix=mat, clip=clip)
    except Exception:
        # Best effort per region, but record which one was dropped and why.
        logger.warning(
            "Failed to render %s region on page %d (%s)",
            cap_type,
            page_num,
            filename,
            exc_info=True,
        )
        return False

    (images_dest / filename).write_bytes(pix.tobytes("jpeg", jpg_quality=92))
    return True


def _process_page(
    doc,
    page_idx: int,
    page_boxes: list[LayoutBox],
    images_dest: Path,
    manifest: dict,
    seen_labels: set,
    arxiv_id: str,
) -> int:
    """Process one page: content boxes → text-located captions → render matches.

    A box paired with a Figure/Table caption is named after the caption's own
    ID (e.g. ``figure_3.jpg``). Boxes without a caption (algorithm
    pseudo-code, unnumbered appendix tables, spurious detections) are
    dropped and produce no output.

    Args:
        doc: Opened document; indexed with ``page_idx`` to get the page.
        page_idx: 0-based page index.
        page_boxes: Layout boxes detected on this page.
        images_dest: Directory screenshots are written into.
        manifest: Updated in place with one entry per written file.
        seen_labels: Caption IDs already rendered; updated in place.
        arxiv_id: Paper ID (not used inside this function).

    Returns:
        Number of screenshots written for this page.
    """
    page = doc[page_idx]
    page_num = page_idx + 1

    # Collect figure/table content boxes, skipping tiny regions. Captions are
    # located from text, so caption boxes are not collected here.
    raw_boxes = []
    for box in page_boxes:
        if box.boxclass not in ("table", "picture"):
            continue
        w = box.x1 - box.x0
        h = box.y1 - box.y0
        if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
            continue
        raw_boxes.append(box)

    if not raw_boxes:
        return 0

    # Merge fragments of the same figure/table, then locate captions via text.
    clusters = _cluster_boxes(raw_boxes)
    caption_blocks = _find_caption_blocks(page)
    caption_matches = _pair_caption_blocks(clusters, caption_blocks)

    extracted = 0
    for cluster_idx, cluster in enumerate(clusters):
        cap_match = caption_matches.get(cluster_idx)
        if cap_match is None:
            continue  # no Figure/Table caption → filtered out
        if cap_match.id in seen_labels:
            continue  # same figure/table already rendered from another detection

        filename = f"{cap_match.id.replace(' ', '_').lower()}.jpg"
        if not _render_box(
            page,
            cluster,
            images_dest,
            filename,
            cap_match.kind,
            page_num,
            caption_bbox=cap_match.bbox,
        ):
            continue

        # Mark the label only after a successful render, so a failed attempt
        # does not block a later detection carrying the same caption ID.
        seen_labels.add(cap_match.id)

        manifest[filename] = {
            "page": page_num,
            "type": cap_match.kind,
            "label": cap_match.id,
            "box": _cluster_to_box(cluster),
            "caption_text": cap_match.text[:500],
            "caption_box": cap_match.bbox,
            "caption_source": "text",
        }
        extracted += 1

    return extracted


# ── Phase 1 核心入口 ───────────────────────────────────────────────────


def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    """Phase 1: extract Figure/Table screenshots from a PDF and write a manifest.

    Previously extracted images (.png/.jpg/.jpeg) and the old manifest in the
    paper's ``images`` directory are removed first. Each page is processed
    independently; a page that fails is logged and skipped. The manifest is
    written as UTF-8 JSON even when nothing was extracted.

    Args:
        arxiv_id: Paper ID.
        pdf_path: Path to the PDF; defaults to ``TMP_DIR / arxiv_id / "paper.pdf"``.

    Returns:
        Number of images extracted (0 when the PDF does not exist).
    """
    if pdf_path is None:
        pdf_path = TMP_DIR / arxiv_id / "paper.pdf"

    if not pdf_path.exists():
        logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path)
        return 0

    images_dest = paper_dir(arxiv_id) / "images"
    images_dest.mkdir(parents=True, exist_ok=True)
    manifest_path = images_dest / "manifest.json"

    # Remove images left over from a previous extraction. Snapshot the
    # listing first so entries are not deleted while the directory is being
    # iterated.
    for old_file in list(images_dest.iterdir()):
        if old_file.suffix.lower() in (".png", ".jpg", ".jpeg"):
            old_file.unlink()
    if manifest_path.exists():
        manifest_path.unlink()

    extracted = 0
    manifest: dict[str, dict] = {}
    seen_labels: set[str] = set()

    with pymupdf.open(str(pdf_path)) as doc:
        for page_idx in range(doc.page_count):
            try:
                page_boxes = detect_page_layout(doc[page_idx])
                extracted += _process_page(
                    doc,
                    page_idx,
                    page_boxes,
                    images_dest=images_dest,
                    manifest=manifest,
                    seen_labels=seen_labels,
                    arxiv_id=arxiv_id,
                )
            except Exception:
                # One bad page must not abort the whole document.
                logger.warning(
                    "Failed to process page %d for %s",
                    page_idx + 1,
                    arxiv_id,
                    exc_info=True,
                )
                continue

    # Save the manifest. ensure_ascii=False emits raw non-ASCII caption text,
    # so the encoding must be explicit rather than the platform default.
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    if extracted > 0:
        logger.info(
            "Extracted %d figure/table screenshots from PDF for %s",
            extracted,
            arxiv_id,
        )

    return extracted


# ── Figure ↔ Image 关联 ────────────────────────────────────────────────


def _normalize_figure_id(raw_id: str) -> str:
    """Normalise a Figure/Table ID to the "Figure N" / "Table N" form.

    Examples: "Figure 1" / "Fig.1" → "Figure 1"; "Fig. C1" → "Figure C1";
    "Figure 3.5" → "Figure 3.5"; "Table C1: Results" → "Table C1".

    The number is a run of digits with optional dotted parts, optionally
    preceded by one uppercase letter (appendix numbering). Anything after it
    (sub-figure letters, punctuation, caption text) is dropped. Input that
    does not look like a Figure/Table ID is returned unchanged.
    """
    m = re.match(
        # Keyword is case-insensitive; the appendix letter stays
        # case-sensitive so plain words after the keyword are not mistaken
        # for an ID.
        r"\s*(?i:(Figure|Fig\.?|Table))\s*([A-Z]?\d+(?:\.\d+)*)",
        raw_id,
    )
    if not m:
        return raw_id
    prefix = "Table" if m.group(1).lower() == "table" else "Figure"
    return f"{prefix} {m.group(2)}"


def _is_figure_type(fig_id: str) -> bool:
    """Return True when ``fig_id`` is a Figure-type ID, i.e. not a Table ID.

    A Table ID is "Table" (any case) followed by a number, optionally with a
    single leading letter for appendix numbering ("Table 2", "Table C1").
    Everything else, including an empty string, counts as Figure type.
    """
    return re.match(r"\s*Table\s*[A-Z]?\d", fig_id, re.IGNORECASE) is None


def _image_sort_key(name: str) -> tuple[int, int]:
    """Sort key ordering extracted images by the number in their file name.

    Names such as ``figure_1.jpg`` / ``table_12.jpg`` sort by that number.
    Names without a plain number after the prefix (e.g. ``figure_c1.jpg``)
    sort after all numbered ones, so they do not displace ``figure_1.jpg``
    from the front of the list.
    """
    m = re.search(r"(?:figure|table)_(\d+)", name, re.IGNORECASE)
    if m:
        return (0, int(m.group(1)))
    return (1, 0)


def link_figures_with_images(
    figures: list[dict], images: list[dict], arxiv_id: str
) -> list[dict]:
    """Attach extracted image URLs to the summary's figure metadata.

    Strategy:
    1. Exact ID match against the ``label`` of each manifest.json entry
       (legacy ``figures`` / ``tables`` lists are honoured as well).
    2. Figures still without an image fall back to ordinal assignment,
       separately for Figure and Table types: the N-th unmatched figure gets
       the N-th image not already linked to another figure.

    Args:
        figures: Figure dicts; ``id`` is read and ``image_url`` is set.
            The dicts are modified in place.
        images: Extracted image dicts; ``name`` and ``url`` are read.
        arxiv_id: Paper ID, used to locate the manifest and build URLs.

    Returns:
        The same ``figures`` list that was passed in.
    """
    if not figures or not images:
        return figures

    manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"

    # ── Strategy 1: exact ID match via the manifest ──
    id_to_url: dict[str, str] = {}
    if manifest_path.exists():
        try:
            manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
        except (OSError, ValueError, TypeError):
            manifest = {}
        if not isinstance(manifest, dict):
            # Unexpected top-level JSON shape — treat as having no manifest.
            manifest = {}
        for filename, info in manifest.items():
            if not isinstance(info, dict):
                continue
            url = f"/papers/{arxiv_id}/images/{filename}"
            # Prefer the label field (new format).
            label = info.get("label", "")
            if label:
                id_to_url[label] = url
            # Also accept figures/tables lists (old format); never override
            # an ID that is already mapped.
            legacy_ids = [
                *(info.get("figures") or []),
                *(info.get("tables") or []),
            ]
            for fig_id in legacy_ids:
                id_to_url.setdefault(fig_id, url)

    for fig in figures:
        normalized = _normalize_figure_id(str(fig.get("id") or ""))
        if normalized in id_to_url:
            fig["image_url"] = id_to_url[normalized]

    # ── Strategy 2: ordinal fallback for figures the manifest did not cover ──
    unmatched = [f for f in figures if not f.get("image_url")]
    if not unmatched:
        return figures

    # Images already linked to a figure must not be handed out a second time.
    taken_urls = {f["image_url"] for f in figures if f.get("image_url")}
    taken_names = {str(url).rsplit("/", 1)[-1] for url in taken_urls}
    available = [
        img
        for img in images
        if img.get("url") not in taken_urls and img["name"] not in taken_names
    ]

    # Split by type: Figure vs Table.
    fig_type_unmatched = [
        f for f in unmatched if _is_figure_type(str(f.get("id") or ""))
    ]
    table_type_unmatched = [
        f for f in unmatched if not _is_figure_type(str(f.get("id") or ""))
    ]

    # Split the remaining images by type, ordered by the number in the name.
    fig_images = sorted(
        (img for img in available if "table" not in img["name"].lower()),
        key=lambda img: _image_sort_key(img["name"]),
    )
    table_images = sorted(
        (img for img in available if "table" in img["name"].lower()),
        key=lambda img: _image_sort_key(img["name"]),
    )

    # zip stops at the shorter side, so surplus figures simply stay unlinked.
    for fig, img in zip(fig_type_unmatched, fig_images):
        fig["image_url"] = img["url"]

    for fig, img in zip(table_type_unmatched, table_images):
        fig["image_url"] = img["url"]

    return figures