daily-paper/app/services/pdf_image_extractor.py

"""PDF 图片与表格提取 — 基于 pymupdf4llm layout analysis。

用 pymupdf4llm 的 layout analysis 检测 table / picture 区域，
再通过 caption 文字匹配确定 Figure/Table 编号，渲染为 JPEG。

相比旧方案（caption 正则 + pdfplumber/find_tables/文本块扫描三套策略）：
- layout analysis 直接给出区域 bbox，不存在相邻表格互相侵入的问题
- 无需手动调参（最大高度、间隙阈值等）
- 页面级 caption 匹配：每个 caption 只分配给最近的 box，避免上下相邻表格抢夺同一个 caption
"""

from __future__ import annotations

import json
import logging
import re
from pathlib import Path

import pymupdf
import pymupdf4llm.helpers.document_layout as dl

from app.services.pdf_downloader import paper_dir
from app.utils import TMP_DIR

logger = logging.getLogger(__name__)

# ── Caption 正则 ───────────────────────────────────────────────────────

# 用于从 caption 文字中提取 Figure/Table 编号
_FIGURE_CAPTION_RE = re.compile(
    r"^(?:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
    re.IGNORECASE,
)
_TABLE_CAPTION_RE = re.compile(
    r"^Table\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
    re.IGNORECASE,
)

# caption 与 table/picture 的最大匹配距离（点）
_CAPTION_MATCH_DISTANCE = 100
# 截图区域的外边距
_REGION_PADDING = 5
# 3x 渲染，保证清晰度
_RENDER_ZOOM = 3
# 相邻 box 聚类间距（点）— 同一 figure/table 的碎片间距通常 < 15pt
_CLUSTER_GAP = 15


# ── Box 聚类 ─────────────────────────────────────────────────────────


class _BoxCluster:
    """合并后的布局区域（由一个或多个相邻 LayoutBox 组成）。

    pymupdf4llm 有时将一个大图拆成多个小 picture box（如视频帧网格），
    聚类后用整体 bbox 作为渲染区域。
    """

    __slots__ = ("x0", "y0", "x1", "y1", "boxclass")

    def __init__(self, boxes: list):
        self.x0 = min(b.x0 for b in boxes)
        self.y0 = min(b.y0 for b in boxes)
        self.x1 = max(b.x1 for b in boxes)
        self.y1 = max(b.y1 for b in boxes)
        # table-fallback 归一化为 table（layout model 检测到表格但无法提取结构）
        raw = boxes[0].boxclass
        self.boxclass = "table" if raw == "table-fallback" else raw


def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
    """将相邻的同类型 box 合并为聚类。

    用 union-find 将间距 ≤ gap 的同类型 box 归为一组，
    每组生成一个 _BoxCluster（整体 bbox）。
    """
    if not boxes:
        return []

    n = len(boxes)
    parent = list(range(n))

    def find(x: int) -> int:
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a: int, b: int) -> None:
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb

    for i in range(n):
        bi = boxes[i]
        for j in range(i + 1, n):
            bj = boxes[j]
            if bi.boxclass != bj.boxclass:
                continue
            h_gap = max(0.0, max(bi.x0, bj.x0) - min(bi.x1, bj.x1))
            v_gap = max(0.0, max(bi.y0, bj.y0) - min(bi.y1, bj.y1))
            h_overlap = bi.x1 > bj.x0 - gap and bj.x1 > bi.x0 - gap
            v_overlap = bi.y1 > bj.y0 - gap and bj.y1 > bi.y0 - gap
            if (h_gap <= gap and v_overlap) or (v_gap <= gap and h_overlap):
                union(i, j)

    groups: dict[int, list] = {}
    for i in range(n):
        groups.setdefault(find(i), []).append(boxes[i])

    return [_BoxCluster(members) for members in groups.values()]


# ── 页面级 Caption 查找与匹配 ──────────────────────────────────────────


def _find_page_captions(page) -> list[dict]:
    """查找页面上所有 Figure/Table caption 文字块。"""
    blocks = page.get_text("blocks")
    captions = []
    for b in blocks:
        if len(b) < 5:
            continue
        bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
        text = str(b[4]).strip()
        first_line = text.split("\n")[0].strip()

        cap_type = None
        m = _TABLE_CAPTION_RE.match(first_line)
        if m:
            cap_type = "table"
        else:
            m = _FIGURE_CAPTION_RE.match(first_line)
            if m:
                cap_type = "figure"
        if m is None:
            continue

        captions.append(
            {
                "label": f"{'Table' if cap_type == 'table' else 'Figure'} {m.group(1)}",
                "type": cap_type,
                "caption_text": text,
                "caption_y0": by0,
                "caption_y1": by1,
                "caption_x0": bx0,
                "caption_x1": bx1,
            }
        )
    return captions


def _vertical_distance(cap_y0, cap_y1, box_y0, box_y1) -> float | None:
    """计算 caption 到 box 的垂直距离。不邻接时返回 None。

    三种情况：caption 完全在 box 上方、完全在下方、与 box 有垂直重叠。
    重叠（含部分溢出）视为 distance=0，确保 caption 延伸到 box 边界外时不会丢失。
    """
    # Caption 完全在 box 上方
    if cap_y1 <= box_y0:
        dist = box_y0 - cap_y1
        return dist if dist <= _CAPTION_MATCH_DISTANCE else None
    # Caption 完全在 box 下方
    if cap_y0 >= box_y1:
        dist = cap_y0 - box_y1
        return dist if dist <= _CAPTION_MATCH_DISTANCE else None
    # Caption 与 box 有垂直重叠（内部、部分溢出都算）→ 距离 0
    return 0


def _same_column(cap: dict, box, page_width: float) -> bool:
    """判断 caption 和 box 是否在同一列。

    双栏论文中左右栏间距有限，简单的水平重叠检查会跨列匹配。
    策略：用中心 X 坐标判断各自在哪半边，只有同半边才算同列。
    跨栏图表（caption 或 box 宽度 >65% 页宽）不受此限制。
    """
    cap_w = cap["caption_x1"] - cap["caption_x0"]
    box_w = box.x1 - box.x0

    # 跨栏元素：宽度超过页面的 65%
    if cap_w > page_width * 0.65 or box_w > page_width * 0.65:
        return True

    cap_cx = (cap["caption_x0"] + cap["caption_x1"]) / 2
    box_cx = (box.x0 + box.x1) / 2
    mid = page_width / 2

    # 同在左半边或同在右半边
    return (cap_cx < mid) == (box_cx < mid)


def _match_captions_to_boxes(
    page_boxes: list, captions: list[dict], page_width: float
) -> list[tuple[list[int], list[dict]]]:
    """将 caption 分配给 box，允许一个 caption 匹配多个同类型 box。

    典型场景：
    - Figure 由左右两个 picture box 组成，caption 同时靠近两者
    - Table 的视觉内容被 layout analysis 误分类为 picture，需要跨类型匹配

    Returns:
        [(box_indices, captions), ...] 每组是一个独立的渲染任务
    """
    # 每个 caption 找到所有距离在阈值内的 box
    # 优先匹配同类型；如果找不到，再匹配任意 table/picture box
    cap_to_boxes: dict[int, list[tuple[int, float]]] = {}

    for ci, cap in enumerate(captions):
        same_type: list[tuple[int, float]] = []
        any_type: list[tuple[int, float]] = []
        expected = "table" if cap["type"] == "table" else "picture"

        for bi, box in enumerate(page_boxes):
            # 列感知：双栏论文中只匹配同栏的 box
            if not _same_column(cap, box, page_width):
                continue
            # 水平重叠检查（同列内仍需有重叠）
            if not (
                cap["caption_x1"] > box.x0 - 5 and cap["caption_x0"] < box.x1 + 5
            ):
                continue
            dist = _vertical_distance(
                cap["caption_y0"], cap["caption_y1"], box.y0, box.y1
            )
            if dist is None:
                continue
            entry = (bi, dist)
            any_type.append(entry)
            if box.boxclass == expected:
                same_type.append(entry)

        # 优先用同类型匹配；没有时回退到任意类型；都没有则跳过
        if same_type:
            cap_to_boxes[ci] = same_type
        elif any_type:
            cap_to_boxes[ci] = any_type
        # else: 该 caption 无匹配 box，不加入 cap_to_boxes

    # 每个 caption → 最近的 box（用于分组），但记录所有匹配的 box
    cap_primary: dict[int, int] = {}  # caption → primary box index
    cap_all_boxes: dict[int, list[int]] = {}  # caption → all matched box indices
    for ci, matches in cap_to_boxes.items():
        matches.sort(key=lambda x: x[1])
        cap_primary[ci] = matches[0][0]
        # 所有距离最近的同组 box（距离差 < 20pt 视为同一组）
        best_dist = matches[0][1]
        cap_all_boxes[ci] = [bi for bi, d in matches if d <= best_dist + 20]

    # 按 primary box 分组
    box_to_caps: dict[int, list[int]] = {}
    for ci, bi in cap_primary.items():
        box_to_caps.setdefault(bi, []).append(ci)

    # 构建渲染组：每个 caption 独立成组（共享 box 但各自渲染）
    # 同类型同 label 的 caption 会合并；不同类型则分开
    used_captions: set[int] = set()
    groups: list[tuple[list[int], list[dict]]] = []

    for bi in sorted(box_to_caps.keys()):
        cis = box_to_caps[bi]
        for ci in cis:
            if ci in used_captions:
                continue
            used_captions.add(ci)

            all_box_indices = set(cap_all_boxes.get(ci, [bi]))
            # 只合并同 label 的 caption（同 figure/table 的重复 caption）
            merged_captions = [captions[ci]]
            for other_bi in all_box_indices:
                if other_bi in box_to_caps:
                    for other_ci in box_to_caps[other_bi]:
                        if other_ci not in used_captions:
                            other_cap = captions[other_ci]
                            if other_cap["label"] == captions[ci]["label"]:
                                used_captions.add(other_ci)
                                merged_captions.append(other_cap)
            groups.append((sorted(all_box_indices), merged_captions))

    return groups


# ── 单页处理 ─────────────────────────────────────────────────────────


def _render_and_save(
    page,
    clip: pymupdf.Rect,
    images_dest: Path,
    manifest: dict,
    label: str,
    cap_type: str,
    caption_text: str,
    page_num_1based: int,
    arxiv_id: str,
) -> bool:
    """渲染页面区域并保存 JPEG，写入 manifest。成功返回 True。"""
    mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
    try:
        pix = page.get_pixmap(matrix=mat, clip=clip)
    except Exception:
        logger.debug("Failed to render %s for %s", label, arxiv_id)
        return False

    filename = f"{label.replace(' ', '_').lower()}.jpg"
    (images_dest / filename).write_bytes(pix.tobytes("jpeg"))

    manifest[filename] = {
        "page": page_num_1based,
        "type": cap_type,
        "label": label,
        "caption_text": caption_text[:200] if caption_text else "",
        "figures" if cap_type == "figure" else "tables": [label],
    }
    logger.debug(
        "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) → %s",
        label,
        page_num_1based,
        clip.x0,
        clip.y0,
        clip.x1,
        clip.y1,
        filename,
    )
    return True


def _process_page(
    doc,
    page_idx: int,
    page_layout,
    images_dest: Path,
    manifest: dict,
    seen_labels: set,
    arxiv_id: str,
) -> int:
    """处理单页：caption 匹配 + orphan 兜底，返回本页提取数量。"""
    page = doc[page_idx]
    page_width = page.rect.width
    page_num = page_idx + 1
    orphan_fig_counter = 0
    orphan_tbl_counter = 0

    # 收集本页的 table/picture box（跳过极小区域）
    raw_boxes = []
    for box in page_layout.boxes:
        if box.boxclass not in ("table", "table-fallback", "picture"):
            continue
        if (box.x1 - box.x0) < 20 or (box.y1 - box.y0) < 20:
            continue
        raw_boxes.append(box)

    if not raw_boxes:
        return 0

    # 聚类：将同一 figure/table 的碎片 box 合并
    page_boxes = _cluster_boxes(raw_boxes)

    # 页面级匹配：查找所有 caption，分配给 box
    captions = _find_page_captions(page)
    groups = _match_captions_to_boxes(page_boxes, captions, page_width)

    # 只合并同 label 的 group（同一个 figure/table 的重复 caption）
    # 不同 label 的 group 即使共享 box 也不合并（如 Figure 7 和 Figure 8），
    # 渲染时用 caption 位置切割区域
    _merged_groups: set[int] = set()
    merged_groups: list[tuple[list[int], list[dict]]] = []
    for gi, (box_indices, caps) in enumerate(groups):
        if gi in _merged_groups:
            continue
        this_labels = {c["label"] for c in caps}
        all_box_set = set(box_indices)
        merge_targets = {gi}
        for other_gi, (other_bi, other_caps) in enumerate(groups):
            if other_gi <= gi or other_gi in _merged_groups:
                continue
            other_labels = {c["label"] for c in other_caps}
            # 只在 label 有交集时合并（同一个 figure/table）
            if this_labels & other_labels and all_box_set & set(other_bi):
                merge_targets.add(other_gi)
                all_box_set |= set(other_bi)
        all_caps = []
        for mgi in sorted(merge_targets):
            _merged_groups.add(mgi)
            all_caps.extend(groups[mgi][1])
        merged_groups.append((sorted(all_box_set), all_caps))
    groups = merged_groups

    # ── 阶段 1：渲染有 caption 匹配的图/表 ──
    matched_box_indices: set[int] = set()
    extracted = 0

    for box_indices, caps in groups:
        matched_box_indices.update(box_indices)

        # 去重同一 label，跳过已处理的
        unique_caps = []
        for cap in caps:
            if cap["label"] not in seen_labels:
                seen_labels.add(cap["label"])
                unique_caps.append(cap)
        if not unique_caps:
            continue

        # 合并所有关联 box 的 bbox
        bx0 = min(page_boxes[i].x0 for i in box_indices)
        by0 = min(page_boxes[i].y0 for i in box_indices)
        bx1 = max(page_boxes[i].x1 for i in box_indices)
        by1 = max(page_boxes[i].y1 for i in box_indices)

        # 渲染区域：box + caption
        all_cap_y0 = min(c["caption_y0"] for c in unique_caps)
        all_cap_y1 = max(c["caption_y1"] for c in unique_caps)
        all_cap_x0 = min(c["caption_x0"] for c in unique_caps)
        all_cap_x1 = max(c["caption_x1"] for c in unique_caps)

        top = max(0, min(by0, all_cap_y0) - _REGION_PADDING)
        bottom = max(by1, all_cap_y1) + _REGION_PADDING
        rx0 = max(0, min(bx0, all_cap_x0) - _REGION_PADDING)
        rx1 = min(page_width, max(bx1, all_cap_x1) + _REGION_PADDING)

        clip = pymupdf.Rect(rx0, top, rx1, bottom)
        # 多个 caption 可能共享同一区域（如 subfigure），只需渲染一次
        jpeg_bytes = None
        for cap in unique_caps:
            if jpeg_bytes is None:
                if not _render_and_save(
                    page,
                    clip,
                    images_dest,
                    manifest,
                    cap["label"],
                    cap["type"],
                    cap["caption_text"],
                    page_num,
                    arxiv_id,
                ):
                    break
                # 读取刚写入的 bytes 供后续同名 caption 复用
                filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
                jpeg_bytes = (images_dest / filename).read_bytes()
                extracted += 1
            else:
                # 同区域的不同 caption（如 subfigure），复用图片
                filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
                (images_dest / filename).write_bytes(jpeg_bytes)
                cap_preview = cap["caption_text"][:200]
                manifest[filename] = {
                    "page": page_num,
                    "type": cap["type"],
                    "label": cap["label"],
                    "caption_text": cap_preview,
                    "figures" if cap["type"] == "figure" else "tables": [cap["label"]],
                }
                extracted += 1

    # ── 阶段 2：渲染无 caption 匹配的图/表（orphan boxes） ──
    orphan_indices = set(range(len(page_boxes))) - matched_box_indices
    for bi in sorted(orphan_indices):
        box = page_boxes[bi]
        cap_type = "figure" if box.boxclass == "picture" else "table"

        if cap_type == "figure":
            orphan_fig_counter += 1
            label = f"Figure (p{page_num}-{orphan_fig_counter})"
        else:
            orphan_tbl_counter += 1
            label = f"Table (p{page_num}-{orphan_tbl_counter})"

        if label in seen_labels:
            continue
        seen_labels.add(label)

        clip = pymupdf.Rect(
            max(0, box.x0 - _REGION_PADDING),
            max(0, box.y0 - _REGION_PADDING),
            min(page_width, box.x1 + _REGION_PADDING),
            box.y1 + _REGION_PADDING,
        )
        if _render_and_save(
            page,
            clip,
            images_dest,
            manifest,
            label,
            cap_type,
            "",
            page_num,
            arxiv_id,
        ):
            extracted += 1

    return extracted


# ── 核心提取 ───────────────────────────────────────────────────────────


def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    """从 PDF 提取 Figure/Table 截图，生成 manifest。

    用 pymupdf4llm layout analysis 检测 table/picture 区域，
    再通过 caption 文字确定编号，渲染为 JPEG。

    Args:
        arxiv_id: 论文 ID
        pdf_path: PDF 路径，默认 data/tmp/{arxiv_id}/paper.pdf

    Returns:
        提取的图片数量
    """
    if pdf_path is None:
        pdf_path = TMP_DIR / arxiv_id / "paper.pdf"

    if not pdf_path.exists():
        logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path)
        return 0

    images_dest = paper_dir(arxiv_id) / "images"
    images_dest.mkdir(parents=True, exist_ok=True)

    # 清理上次提取的旧图片
    for old_file in images_dest.iterdir():
        if old_file.suffix.lower() in (".png", ".jpg", ".jpeg"):
            old_file.unlink()
    if (images_dest / "manifest.json").exists():
        (images_dest / "manifest.json").unlink()

    doc = pymupdf.open(str(pdf_path))

    # layout analysis
    try:
        parsed = dl.parse_document(
            doc, filename=str(pdf_path), use_ocr=dl.OCRMode.NEVER
        )
    except Exception:
        logger.warning(
            "pymupdf4llm layout analysis failed for %s", arxiv_id, exc_info=True
        )
        doc.close()
        return 0

    extracted = 0
    manifest: dict[str, dict] = {}
    seen_labels: set[str] = set()

    for page_idx, page_layout in enumerate(parsed.pages):
        try:
            extracted += _process_page(
                doc,
                page_idx,
                page_layout,
                images_dest=images_dest,
                manifest=manifest,
                seen_labels=seen_labels,
                arxiv_id=arxiv_id,
            )
        except Exception:
            logger.warning(
                "Failed to process page %d for %s",
                page_idx + 1,
                arxiv_id,
                exc_info=True,
            )
            continue

    doc.close()

    # 保存 manifest
    manifest_path = images_dest / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2))

    if extracted > 0:
        logger.info(
            "Extracted %d figure/table screenshots from PDF for %s",
            extracted,
            arxiv_id,
        )

    return extracted


# ── 按 summary 过滤 ────────────────────────────────────────────────────


def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
    """根据 summary 中的 figures 字段过滤提取的图片/表格。

    用 manifest.json 中的 label 匹配，保留被 AI 总结引用的图片。
    """
    if not figures:
        return 0

    images_dir = paper_dir(arxiv_id) / "images"
    manifest_path = images_dir / "manifest.json"

    if not images_dir.exists() or not manifest_path.exists():
        return 0

    all_files = [
        f for f in images_dir.iterdir() if f.suffix.lower() in (".png", ".jpg", ".jpeg")
    ]
    if not all_files:
        return 0

    manifest: dict = json.loads(manifest_path.read_text(encoding="utf-8"))

    # 收集 summary 中引用的所有 Figure/Table ID（归一化）
    referenced_ids: set[str] = set()
    for fig in figures:
        fig_id = fig.get("id", "")
        m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", fig_id, re.IGNORECASE)
        if m:
            referenced_ids.add(f"Figure {m.group(1)}")
        m2 = re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
        if m2:
            referenced_ids.add(f"Table {m2.group(1)}")

    if not referenced_ids:
        logger.warning("No valid figure/table IDs in summary for %s", arxiv_id)
        return len(all_files)

    # 根据 manifest 的 label 字段匹配
    keep_filenames: set[str] = set()
    for filename, info in manifest.items():
        label = info.get("label", "")
        if label in referenced_ids:
            keep_filenames.add(filename)
            continue
        for ref in info.get("figures", []) + info.get("tables", []):
            if ref in referenced_ids:
                keep_filenames.add(filename)
                break

    if not keep_filenames:
        logger.warning(
            "No manifest matches for %s (refs=%s), keeping all",
            arxiv_id,
            referenced_ids,
        )
        return len(all_files)

    removed = 0
    for f in all_files:
        if f.name not in keep_filenames:
            f.unlink()
            removed += 1

    kept = len(all_files) - removed
    logger.info(
        "Filtered images for %s: kept %d, removed %d (refs=%s)",
        arxiv_id,
        kept,
        removed,
        referenced_ids,
    )
    return kept