feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor into a two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -1,12 +1,12 @@
-"""PDF 图片与表格提取 — 基于 pymupdf4llm layout analysis。
+"""PDF 图片与表格提取 — 两阶段流水线。

-用 pymupdf4llm 的 layout analysis 检测 table / picture 区域，
-再通过 caption 文字匹配确定 Figure/Table 编号，渲染为 JPEG。
+Phase 1: PicoDet-S_layout_3cls 检测 figure/table 区域 → 渲染为 JPEG（通用标签）
+Phase 2: 用 LLM summary 的 figures[].id 在 PDF 中搜索定位 → 匹配到 box → 重命名

-相比旧方案（caption 正则 + pdfplumber/find_tables/文本块扫描三套策略）：
- layout analysis 直接给出区域 bbox，不存在相邻表格互相侵入的问题
- 无需手动调参（最大高度、间隙阈值等）
- 页面级 caption 匹配：每个 caption 只分配给最近的 box，避免上下相邻表格抢夺同一个 caption
+相比旧方案（正则匹配 caption）：
+- 不再依赖正则，用 LLM 输出的 ID 直接搜索 PDF 文本
+- page.search_for() 精确搜索 + 空间距离过滤，避免正文引用误匹配
+- 通用标签兜底，LLM 没提到的图表不会被丢弃
 """

 from __future__ import annotations
@@ -17,44 +17,30 @@ import re
 from pathlib import Path

 import pymupdf
-import pymupdf4llm.helpers.document_layout as dl

+from app.services.layout_detector import LayoutBox, detect_page_layout
 from app.services.pdf_downloader import paper_dir
-from app.utils import TMP_DIR
+from app.utils import PAPERS_DIR, TMP_DIR

 logger = logging.getLogger(__name__)

-# ── Caption 正则 ───────────────────────────────────────────────────────
-
-# 用于从 caption 文字中提取 Figure/Table 编号
-_FIGURE_CAPTION_RE = re.compile(
-    r"^(?:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
-    re.IGNORECASE,
-)
-_TABLE_CAPTION_RE = re.compile(
-    r"^Table\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
-    re.IGNORECASE,
-)
-
-# caption 与 table/picture 的最大匹配距离（点）
-_CAPTION_MATCH_DISTANCE = 100
-# 截图区域的外边距
+# 截图区域的外边距（单位: pt）
 _REGION_PADDING = 5
-# 3x 渲染，保证清晰度
+# 渲染倍率（3x 保证清晰度）
 _RENDER_ZOOM = 3
-# 相邻 box 聚类间距（点）— 同一 figure/table 的碎片间距通常 < 15pt
+# 相邻 box 聚类间距（单位: pt）— 同一 figure/table 的碎片间距通常 < 15pt
 _CLUSTER_GAP = 15
+# 最小 bbox 面积（单位: pt²）— 过滤 icon/logo 等微小误检
+_MIN_BOX_AREA = 2000
+# Phase 2: 搜索文本到 box 的最大匹配距离（单位: pt）
+_LABEL_MATCH_DISTANCE = 100


 # ── Box 聚类 ─────────────────────────────────────────────────────────


 class _BoxCluster:
-    """合并后的布局区域（由一个或多个相邻 LayoutBox 组成）。
-
-    pymupdf4llm 有时将一个大图拆成多个小 picture box（如视频帧网格），
-    聚类后用整体 bbox 作为渲染区域。
-    """
+    """合并后的布局区域（由一个或多个相邻 LayoutBox 组成）。"""

    __slots__ = ("x0", "y0", "x1", "y1", "boxclass")

@@ -63,17 +49,12 @@ class _BoxCluster:
        self.y0 = min(b.y0 for b in boxes)
        self.x1 = max(b.x1 for b in boxes)
        self.y1 = max(b.y1 for b in boxes)
-        # table-fallback 归一化为 table（layout model 检测到表格但无法提取结构）
        raw = boxes[0].boxclass
        self.boxclass = "table" if raw == "table-fallback" else raw


 def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
-    """将相邻的同类型 box 合并为聚类。
-
-    用 union-find 将间距 ≤ gap 的同类型 box 归为一组，
-    每组生成一个 _BoxCluster（整体 bbox）。
-    """
+    """将相邻的同类型 box 合并为聚类。"""
    if not boxes:
        return []

@@ -111,242 +92,58 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
    return [_BoxCluster(members) for members in groups.values()]


-# ── 页面级 Caption 查找与匹配 ──────────────────────────────────────────
+# ── Phase 1: 检测 + 渲染 ──────────────────────────────────────────────


-def _find_page_captions(page) -> list[dict]:
-    """查找页面上所有 Figure/Table caption 文字块。"""
-    blocks = page.get_text("blocks")
-    captions = []
-    for b in blocks:
-        if len(b) < 5:
-            continue
-        bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
-        text = str(b[4]).strip()
-        first_line = text.split("\n")[0].strip()
-
-        cap_type = None
-        m = _TABLE_CAPTION_RE.match(first_line)
-        if m:
-            cap_type = "table"
-        else:
-            m = _FIGURE_CAPTION_RE.match(first_line)
-            if m:
-                cap_type = "figure"
-        if m is None:
-            continue
-
-        captions.append(
-            {
-                "label": f"{'Table' if cap_type == 'table' else 'Figure'} {m.group(1)}",
-                "type": cap_type,
-                "caption_text": text,
-                "caption_y0": by0,
-                "caption_y1": by1,
-                "caption_x0": bx0,
-                "caption_x1": bx1,
-            }
-        )
-    return captions
-
-
-def _vertical_distance(cap_y0, cap_y1, box_y0, box_y1) -> float | None:
-    """计算 caption 到 box 的垂直距离。不邻接时返回 None。
-
-    三种情况：caption 完全在 box 上方、完全在下方、与 box 有垂直重叠。
-    重叠（含部分溢出）视为 distance=0，确保 caption 延伸到 box 边界外时不会丢失。
-    """
-    # Caption 完全在 box 上方
-    if cap_y1 <= box_y0:
-        dist = box_y0 - cap_y1
-        return dist if dist <= _CAPTION_MATCH_DISTANCE else None
-    # Caption 完全在 box 下方
-    if cap_y0 >= box_y1:
-        dist = cap_y0 - box_y1
-        return dist if dist <= _CAPTION_MATCH_DISTANCE else None
-    # Caption 与 box 有垂直重叠（内部、部分溢出都算）→ 距离 0
-    return 0
-
-
-def _same_column(cap: dict, box, page_width: float) -> bool:
-    """判断 caption 和 box 是否在同一列。
-
-    双栏论文中左右栏间距有限，简单的水平重叠检查会跨列匹配。
-    策略：用中心 X 坐标判断各自在哪半边，只有同半边才算同列。
-    跨栏图表（caption 或 box 宽度 >65% 页宽）不受此限制。
-    """
-    cap_w = cap["caption_x1"] - cap["caption_x0"]
-    box_w = box.x1 - box.x0
-
-    # 跨栏元素：宽度超过页面的 65%
-    if cap_w > page_width * 0.65 or box_w > page_width * 0.65:
-        return True
-
-    cap_cx = (cap["caption_x0"] + cap["caption_x1"]) / 2
-    box_cx = (box.x0 + box.x1) / 2
-    mid = page_width / 2
-
-    # 同在左半边或同在右半边
-    return (cap_cx < mid) == (box_cx < mid)
-
-
-def _match_captions_to_boxes(
-    page_boxes: list, captions: list[dict], page_width: float
-) -> list[tuple[list[int], list[dict]]]:
-    """将 caption 分配给 box，允许一个 caption 匹配多个同类型 box。
-
-    典型场景：
-    - Figure 由左右两个 picture box 组成，caption 同时靠近两者
-    - Table 的视觉内容被 layout analysis 误分类为 picture，需要跨类型匹配
-
-    Returns:
-        [(box_indices, captions), ...] 每组是一个独立的渲染任务
-    """
-    # 每个 caption 找到所有距离在阈值内的 box
-    # 优先匹配同类型；如果找不到，再匹配任意 table/picture box
-    cap_to_boxes: dict[int, list[tuple[int, float]]] = {}
-
-    for ci, cap in enumerate(captions):
-        same_type: list[tuple[int, float]] = []
-        any_type: list[tuple[int, float]] = []
-        expected = "table" if cap["type"] == "table" else "picture"
-
-        for bi, box in enumerate(page_boxes):
-            # 列感知：双栏论文中只匹配同栏的 box
-            if not _same_column(cap, box, page_width):
-                continue
-            # 水平重叠检查（同列内仍需有重叠）
-            if not (
-                cap["caption_x1"] > box.x0 - 5 and cap["caption_x0"] < box.x1 + 5
-            ):
-                continue
-            dist = _vertical_distance(
-                cap["caption_y0"], cap["caption_y1"], box.y0, box.y1
-            )
-            if dist is None:
-                continue
-            entry = (bi, dist)
-            any_type.append(entry)
-            if box.boxclass == expected:
-                same_type.append(entry)
-
-        # 优先用同类型匹配；没有时回退到任意类型；都没有则跳过
-        if same_type:
-            cap_to_boxes[ci] = same_type
-        elif any_type:
-            cap_to_boxes[ci] = any_type
-        # else: 该 caption 无匹配 box，不加入 cap_to_boxes
-
-    # 每个 caption → 最近的 box（用于分组），但记录所有匹配的 box
-    cap_primary: dict[int, int] = {}  # caption → primary box index
-    cap_all_boxes: dict[int, list[int]] = {}  # caption → all matched box indices
-    for ci, matches in cap_to_boxes.items():
-        matches.sort(key=lambda x: x[1])
-        cap_primary[ci] = matches[0][0]
-        # 所有距离最近的同组 box（距离差 < 20pt 视为同一组）
-        best_dist = matches[0][1]
-        cap_all_boxes[ci] = [bi for bi, d in matches if d <= best_dist + 20]
-
-    # 按 primary box 分组
-    box_to_caps: dict[int, list[int]] = {}
-    for ci, bi in cap_primary.items():
-        box_to_caps.setdefault(bi, []).append(ci)
-
-    # 构建渲染组：每个 caption 独立成组（共享 box 但各自渲染）
-    # 同类型同 label 的 caption 会合并；不同类型则分开
-    used_captions: set[int] = set()
-    groups: list[tuple[list[int], list[dict]]] = []
-
-    for bi in sorted(box_to_caps.keys()):
-        cis = box_to_caps[bi]
-        for ci in cis:
-            if ci in used_captions:
-                continue
-            used_captions.add(ci)
-
-            all_box_indices = set(cap_all_boxes.get(ci, [bi]))
-            # 只合并同 label 的 caption（同 figure/table 的重复 caption）
-            merged_captions = [captions[ci]]
-            for other_bi in all_box_indices:
-                if other_bi in box_to_caps:
-                    for other_ci in box_to_caps[other_bi]:
-                        if other_ci not in used_captions:
-                            other_cap = captions[other_ci]
-                            if other_cap["label"] == captions[ci]["label"]:
-                                used_captions.add(other_ci)
-                                merged_captions.append(other_cap)
-            groups.append((sorted(all_box_indices), merged_captions))
-
-    return groups
-
-
-# ── 单页处理 ─────────────────────────────────────────────────────────
-
-
-def _render_and_save(
+def _render_box(
    page,
-    clip: pymupdf.Rect,
+    box: _BoxCluster,
    images_dest: Path,
-    manifest: dict,
-    label: str,
+    filename: str,
    cap_type: str,
-    caption_text: str,
-    page_num_1based: int,
-    arxiv_id: str,
+    page_num: int,
 ) -> bool:
-    """渲染页面区域并保存 JPEG，写入 manifest。成功返回 True。"""
+    """渲染单个 box 区域并保存 JPEG，成功返回 True。"""
+    page_width = page.rect.width
+    clip = pymupdf.Rect(
+        max(0, box.x0 - _REGION_PADDING),
+        max(0, box.y0 - _REGION_PADDING),
+        min(page_width, box.x1 + _REGION_PADDING),
+        box.y1 + _REGION_PADDING,
+    )
    mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
    try:
        pix = page.get_pixmap(matrix=mat, clip=clip)
    except Exception:
-        logger.debug("Failed to render %s for %s", label, arxiv_id)
        return False

-    filename = f"{label.replace(' ', '_').lower()}.jpg"
    (images_dest / filename).write_bytes(pix.tobytes("jpeg"))
-
-    manifest[filename] = {
-        "page": page_num_1based,
-        "type": cap_type,
-        "label": label,
-        "caption_text": caption_text[:200] if caption_text else "",
-        "figures" if cap_type == "figure" else "tables": [label],
-    }
-    logger.debug(
-        "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) → %s",
-        label,
-        page_num_1based,
-        clip.x0,
-        clip.y0,
-        clip.x1,
-        clip.y1,
-        filename,
-    )
    return True


 def _process_page(
    doc,
    page_idx: int,
-    page_layout,
+    page_boxes: list[LayoutBox],
    images_dest: Path,
    manifest: dict,
    seen_labels: set,
    arxiv_id: str,
 ) -> int:
-    """处理单页：caption 匹配 + orphan 兜底，返回本页提取数量。"""
+    """处理单页：检测 → 聚类 → 渲染，全部用通用标签。"""
    page = doc[page_idx]
-    page_width = page.rect.width
    page_num = page_idx + 1
-    orphan_fig_counter = 0
-    orphan_tbl_counter = 0
+    fig_counter = 0
+    tbl_counter = 0

    # 收集本页的 table/picture box（跳过极小区域）
    raw_boxes = []
-    for box in page_layout.boxes:
+    for box in page_boxes:
        if box.boxclass not in ("table", "table-fallback", "picture"):
            continue
-        if (box.x1 - box.x0) < 20 or (box.y1 - box.y0) < 20:
+        w = box.x1 - box.x0
+        h = box.y1 - box.y0
+        if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
            continue
        raw_boxes.append(box)

@@ -354,153 +151,48 @@ def _process_page(
        return 0

    # 聚类：将同一 figure/table 的碎片 box 合并
-    page_boxes = _cluster_boxes(raw_boxes)
+    clusters = _cluster_boxes(raw_boxes)

-    # 页面级匹配：查找所有 caption，分配给 box
-    captions = _find_page_captions(page)
-    groups = _match_captions_to_boxes(page_boxes, captions, page_width)
-
-    # 只合并同 label 的 group（同一个 figure/table 的重复 caption）
-    # 不同 label 的 group 即使共享 box 也不合并（如 Figure 7 和 Figure 8），
-    # 渲染时用 caption 位置切割区域
-    _merged_groups: set[int] = set()
-    merged_groups: list[tuple[list[int], list[dict]]] = []
-    for gi, (box_indices, caps) in enumerate(groups):
-        if gi in _merged_groups:
-            continue
-        this_labels = {c["label"] for c in caps}
-        all_box_set = set(box_indices)
-        merge_targets = {gi}
-        for other_gi, (other_bi, other_caps) in enumerate(groups):
-            if other_gi <= gi or other_gi in _merged_groups:
-                continue
-            other_labels = {c["label"] for c in other_caps}
-            # 只在 label 有交集时合并（同一个 figure/table）
-            if this_labels & other_labels and all_box_set & set(other_bi):
-                merge_targets.add(other_gi)
-                all_box_set |= set(other_bi)
-        all_caps = []
-        for mgi in sorted(merge_targets):
-            _merged_groups.add(mgi)
-            all_caps.extend(groups[mgi][1])
-        merged_groups.append((sorted(all_box_set), all_caps))
-    groups = merged_groups
-
-    # ── 阶段 1：渲染有 caption 匹配的图/表 ──
-    matched_box_indices: set[int] = set()
    extracted = 0
-
-    for box_indices, caps in groups:
-        matched_box_indices.update(box_indices)
-
-        # 去重同一 label，跳过已处理的
-        unique_caps = []
-        for cap in caps:
-            if cap["label"] not in seen_labels:
-                seen_labels.add(cap["label"])
-                unique_caps.append(cap)
-        if not unique_caps:
-            continue
-
-        # 合并所有关联 box 的 bbox
-        bx0 = min(page_boxes[i].x0 for i in box_indices)
-        by0 = min(page_boxes[i].y0 for i in box_indices)
-        bx1 = max(page_boxes[i].x1 for i in box_indices)
-        by1 = max(page_boxes[i].y1 for i in box_indices)
-
-        # 渲染区域：box + caption
-        all_cap_y0 = min(c["caption_y0"] for c in unique_caps)
-        all_cap_y1 = max(c["caption_y1"] for c in unique_caps)
-        all_cap_x0 = min(c["caption_x0"] for c in unique_caps)
-        all_cap_x1 = max(c["caption_x1"] for c in unique_caps)
-
-        top = max(0, min(by0, all_cap_y0) - _REGION_PADDING)
-        bottom = max(by1, all_cap_y1) + _REGION_PADDING
-        rx0 = max(0, min(bx0, all_cap_x0) - _REGION_PADDING)
-        rx1 = min(page_width, max(bx1, all_cap_x1) + _REGION_PADDING)
-
-        clip = pymupdf.Rect(rx0, top, rx1, bottom)
-        # 多个 caption 可能共享同一区域（如 subfigure），只需渲染一次
-        jpeg_bytes = None
-        for cap in unique_caps:
-            if jpeg_bytes is None:
-                if not _render_and_save(
-                    page,
-                    clip,
-                    images_dest,
-                    manifest,
-                    cap["label"],
-                    cap["type"],
-                    cap["caption_text"],
-                    page_num,
-                    arxiv_id,
-                ):
-                    break
-                # 读取刚写入的 bytes 供后续同名 caption 复用
-                filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
-                jpeg_bytes = (images_dest / filename).read_bytes()
-                extracted += 1
-            else:
-                # 同区域的不同 caption（如 subfigure），复用图片
-                filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
-                (images_dest / filename).write_bytes(jpeg_bytes)
-                cap_preview = cap["caption_text"][:200]
-                manifest[filename] = {
-                    "page": page_num,
-                    "type": cap["type"],
-                    "label": cap["label"],
-                    "caption_text": cap_preview,
-                    "figures" if cap["type"] == "figure" else "tables": [cap["label"]],
-                }
-                extracted += 1
-
-    # ── 阶段 2：渲染无 caption 匹配的图/表（orphan boxes） ──
-    orphan_indices = set(range(len(page_boxes))) - matched_box_indices
-    for bi in sorted(orphan_indices):
-        box = page_boxes[bi]
-        cap_type = "figure" if box.boxclass == "picture" else "table"
+    for cluster in clusters:
+        cap_type = "figure" if cluster.boxclass == "picture" else "table"

        if cap_type == "figure":
-            orphan_fig_counter += 1
-            label = f"Figure (p{page_num}-{orphan_fig_counter})"
+            fig_counter += 1
+            label = f"Figure (p{page_num}-{fig_counter})"
        else:
-            orphan_tbl_counter += 1
-            label = f"Table (p{page_num}-{orphan_tbl_counter})"
+            tbl_counter += 1
+            label = f"Table (p{page_num}-{tbl_counter})"

        if label in seen_labels:
            continue
        seen_labels.add(label)

-        clip = pymupdf.Rect(
-            max(0, box.x0 - _REGION_PADDING),
-            max(0, box.y0 - _REGION_PADDING),
-            min(page_width, box.x1 + _REGION_PADDING),
-            box.y1 + _REGION_PADDING,
-        )
-        if _render_and_save(
-            page,
-            clip,
-            images_dest,
-            manifest,
-            label,
-            cap_type,
-            "",
-            page_num,
-            arxiv_id,
-        ):
-            extracted += 1
+        filename = f"{label.replace(' ', '_').lower()}.jpg"
+        if not _render_box(page, cluster, images_dest, filename, cap_type, page_num):
+            continue
+
+        manifest[filename] = {
+            "page": page_num,
+            "type": cap_type,
+            "label": label,
+            "box": [
+                round(float(cluster.x0), 1),
+                round(float(cluster.y0), 1),
+                round(float(cluster.x1), 1),
+                round(float(cluster.y1), 1),
+            ],
+        }
+        extracted += 1

    return extracted


-# ── 核心提取 ───────────────────────────────────────────────────────────
+# ── Phase 1 核心入口 ───────────────────────────────────────────────────


 def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
-    """从 PDF 提取 Figure/Table 截图，生成 manifest。
-
-    用 pymupdf4llm layout analysis 检测 table/picture 区域，
-    再通过 caption 文字确定编号，渲染为 JPEG。
+    """Phase 1: 从 PDF 提取 Figure/Table 截图，生成通用标签的 manifest。

    Args:
        arxiv_id: 论文 ID
@@ -526,45 +218,31 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    if (images_dest / "manifest.json").exists():
        (images_dest / "manifest.json").unlink()

-    doc = pymupdf.open(str(pdf_path))
+    with pymupdf.open(str(pdf_path)) as doc:
+        extracted = 0
+        manifest: dict[str, dict] = {}
+        seen_labels: set[str] = set()

-    # layout analysis
-    try:
-        parsed = dl.parse_document(
-            doc, filename=str(pdf_path), use_ocr=dl.OCRMode.NEVER
-        )
-    except Exception:
-        logger.warning(
-            "pymupdf4llm layout analysis failed for %s", arxiv_id, exc_info=True
-        )
-        doc.close()
-        return 0
-
-    extracted = 0
-    manifest: dict[str, dict] = {}
-    seen_labels: set[str] = set()
-
-    for page_idx, page_layout in enumerate(parsed.pages):
-        try:
-            extracted += _process_page(
-                doc,
-                page_idx,
-                page_layout,
-                images_dest=images_dest,
-                manifest=manifest,
-                seen_labels=seen_labels,
-                arxiv_id=arxiv_id,
-            )
-        except Exception:
-            logger.warning(
-                "Failed to process page %d for %s",
-                page_idx + 1,
-                arxiv_id,
-                exc_info=True,
-            )
-            continue
-
-    doc.close()
+        for page_idx in range(doc.page_count):
+            try:
+                page_boxes = detect_page_layout(doc[page_idx])
+                extracted += _process_page(
+                    doc,
+                    page_idx,
+                    page_boxes,
+                    images_dest=images_dest,
+                    manifest=manifest,
+                    seen_labels=seen_labels,
+                    arxiv_id=arxiv_id,
+                )
+            except Exception:
+                logger.warning(
+                    "Failed to process page %d for %s",
+                    page_idx + 1,
+                    arxiv_id,
+                    exc_info=True,
+                )
+                continue

    # 保存 manifest
    manifest_path = images_dest / "manifest.json"
@@ -580,78 +258,321 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    return extracted


-# ── 按 summary 过滤 ────────────────────────────────────────────────────
+# ── Phase 2: 用 summary 的 figures ID 定位并重命名 ─────────────────────


-def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
-    """根据 summary 中的 figures 字段过滤提取的图片/表格。
+def _distance_text_to_box(rect: pymupdf.Rect, box: list[float]) -> float | None:
+    """计算搜索到的文本 rect 到 box 的距离。超出阈值返回 None。

-    用 manifest.json 中的 label 匹配，保留被 AI 总结引用的图片。
+    判断逻辑：rect 中心与 box 的垂直距离 + 水平重叠检查。
+    """
+    rect_cx = (rect.x0 + rect.x1) / 2
+    rect_cy = (rect.y0 + rect.y1) / 2
+    bx0, by0, bx1, by1 = box
+
+    # 水平重叠：rect 中心在 box 水平范围内（或接近）
+    if not (bx0 - 20 <= rect_cx <= bx1 + 20):
+        return None
+
+    # 垂直距离
+    if rect_cy < by0:
+        dist = by0 - rect_cy
+    elif rect_cy > by1:
+        dist = rect_cy - by1
+    else:
+        dist = 0
+
+    return dist if dist <= _LABEL_MATCH_DISTANCE else None
+
+
+def _search_variants(fig_id: str) -> list[str]:
+    """Generate the text variants to search for a figure/table ID.
+
+    The ID itself always comes first. IDs that start with "Fig", "Fig." or
+    "Figure" (case-insensitive) followed by a number also get the three
+    common spellings of that prefix, so a summary ID "Fig. 1" still finds a
+    caption printed as "Figure 1". All other IDs are returned unchanged.
+
+    "Figure 1" → ["Figure 1", "Fig. 1", "Fig 1"]
+    "Fig. 1"   → ["Fig. 1", "Figure 1", "Fig 1"]
+    "Fig.1"    → ["Fig.1", "Figure 1", "Fig. 1", "Fig 1"]
+    "Table A1" → ["Table A1"]
+
+    Args:
+        fig_id: Figure/table identifier as written in the summary.
+
+    Returns:
+        De-duplicated variants, original ID first.
+    """
+    variants = [fig_id]
+
+    # `\s*` rather than `\s+`: compact spellings such as "Fig.1" must be
+    # expanded too, otherwise they can only ever match themselves.
+    m = re.match(r"(?:Fig\.?|Figure)\s*(\d+.*)", fig_id, re.IGNORECASE)
+    if m:
+        num_part = m.group(1)
+        variants += [
+            f"Figure {num_part}",
+            f"Fig. {num_part}",
+            f"Fig {num_part}",
+        ]
+
+    # dict.fromkeys drops duplicates while keeping first-seen order.
+    return list(dict.fromkeys(variants))
+
+
+def label_images_by_summary(
+    arxiv_id: str,
+    figures: list[dict],
+    pdf_path: Path | None = None,
+) -> int:
+    """Phase 2: 用 summary 的 figures ID 在 PDF 中搜索定位，重命名图片。
+
+    对 summary 中的每个 figure/table ID：
+    1. page.search_for(id) 在所有页面搜索文本位置
+    2. 计算搜索位置与 manifest 中 box 坐标的距离
+    3. 最近匹配 → 重命名文件、更新 manifest
+
+    Args:
+        arxiv_id: 论文 ID
+        figures: summary 的 figures 列表，每项含 id/caption/description 等
+        pdf_path: PDF 路径
+
+    Returns:
+        成功重命名的图片数量
    """
    if not figures:
        return 0

-    images_dir = paper_dir(arxiv_id) / "images"
-    manifest_path = images_dir / "manifest.json"
-
-    if not images_dir.exists() or not manifest_path.exists():
+    if pdf_path is None:
+        pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
+    if not pdf_path.exists():
        return 0

-    all_files = [
-        f for f in images_dir.iterdir() if f.suffix.lower() in (".png", ".jpg", ".jpeg")
-    ]
-    if not all_files:
+    images_dest = paper_dir(arxiv_id) / "images"
+    manifest_path = images_dest / "manifest.json"
+    if not manifest_path.exists():
        return 0

-    manifest: dict = json.loads(manifest_path.read_text(encoding="utf-8"))
+    manifest: dict[str, dict] = json.loads(manifest_path.read_text(encoding="utf-8"))
+    if not manifest:
+        return 0

-    # 收集 summary 中引用的所有 Figure/Table ID（归一化）
-    referenced_ids: set[str] = set()
-    for fig in figures:
-        fig_id = fig.get("id", "")
-        m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", fig_id, re.IGNORECASE)
-        if m:
-            referenced_ids.add(f"Figure {m.group(1)}")
-        m2 = re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
-        if m2:
-            referenced_ids.add(f"Table {m2.group(1)}")
+    # 构建候选列表：只对通用标签的条目做匹配
+    candidates: dict[str, dict] = {}  # filename → {page, box, ...}
+    for fname, info in manifest.items():
+        if "(p" in info.get("label", ""):
+            candidates[fname] = info

-    if not referenced_ids:
-        logger.warning("No valid figure/table IDs in summary for %s", arxiv_id)
-        return len(all_files)
+    if not candidates:
+        return 0

-    # 根据 manifest 的 label 字段匹配
-    keep_filenames: set[str] = set()
-    for filename, info in manifest.items():
-        label = info.get("label", "")
-        if label in referenced_ids:
-            keep_filenames.add(filename)
+    with pymupdf.open(str(pdf_path)) as doc:
+        # 收集所有匹配候选：(fig_id, fig_index, filename, distance)
+        matches: list[tuple[str, int, str, float]] = []
+
+        for fig_idx, fig in enumerate(figures):
+            fig_id = fig.get("id", "")
+            if not fig_id:
+                continue
+
+            # 生成搜索变体：Figure 1 / Fig. 1 / Fig 1 等
+            search_terms = _search_variants(fig_id)
+
+            # 在所有页面搜索该文本（含变体）
+            search_hits: list[tuple[int, pymupdf.Rect]] = []  # (page_num_1based, Rect)
+            for page_idx in range(doc.page_count):
+                page = doc[page_idx]
+                seen_rects: set[tuple[float, float]] = set()
+                for term in search_terms:
+                    for r in page.search_for(term):
+                        key = (round(r.x0, 1), round(r.y0, 1))
+                        if key not in seen_rects:
+                            seen_rects.add(key)
+                            search_hits.append((page_idx + 1, r))
+
+            if not search_hits:
+                continue
+
+            # 对每个候选 manifest 条目，找最近的搜索命中
+            for fname, info in candidates.items():
+                box = info.get("box")
+                if not box:
+                    continue
+                manifest_page = info.get("page", 0)
+
+                best_dist: float | None = None
+                for hit_page, rect in search_hits:
+                    # 只匹配同页面
+                    if hit_page != manifest_page:
+                        continue
+                    dist = _distance_text_to_box(rect, box)
+                    if dist is not None and (best_dist is None or dist < best_dist):
+                        best_dist = dist
+
+                if best_dist is not None:
+                    matches.append((fig_id, fig_idx, fname, best_dist))
+
+    if not matches:
+        logger.info("No label matches for %s", arxiv_id)
+        return 0
+
+    # 去冲突：按距离排序，每个 fig_id 和每个 filename 只匹配一次
+    matches.sort(key=lambda x: x[3])
+    used_fig_ids: set[int] = set()
+    used_filenames: set[str] = set()
+    renames: list[tuple[str, str, str]] = []  # (old_fname, new_fname, fig_id)
+
+    for fig_id, fig_idx, fname, dist in matches:
+        if fig_idx in used_fig_ids or fname in used_filenames:
            continue
-        for ref in info.get("figures", []) + info.get("tables", []):
-            if ref in referenced_ids:
-                keep_filenames.add(filename)
+        used_fig_ids.add(fig_idx)
+        used_filenames.add(fname)
+        new_fname = f"{fig_id.replace(' ', '_').lower()}.jpg"
+        renames.append((fname, new_fname, fig_id))
+
+    # 执行重命名
+    labeled = 0
+    new_manifest: dict[str, dict] = {}
+
+    for fname, info in manifest.items():
+        if fname in used_filenames:
+            continue
+        # 未匹配的保持原样
+        new_manifest[fname] = info
+
+    for old_fname, new_fname, fig_id in renames:
+        old_path = images_dest / old_fname
+        new_path = images_dest / new_fname
+        if not old_path.exists():
+            continue
+
+        # 搬运 manifest 信息
+        info = manifest[old_fname].copy()
+        cap_type = info.get("type", "figure")
+
+        # 读取 caption 文本（从 figures 列表）
+        caption_text = ""
+        for fig in figures:
+            if fig.get("id") == fig_id:
+                caption_text = fig.get("caption", "")
                break

-    if not keep_filenames:
-        logger.warning(
-            "No manifest matches for %s (refs=%s), keeping all",
-            arxiv_id,
-            referenced_ids,
+        info["label"] = fig_id
+        info["caption_text"] = caption_text[:200] if caption_text else ""
+        info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
+            fig_id
        )
-        return len(all_files)

-    removed = 0
-    for f in all_files:
-        if f.name not in keep_filenames:
-            f.unlink()
-            removed += 1
+        # 重命名文件
+        if new_fname != old_fname:
+            old_path.rename(new_path)
+        new_manifest[new_fname] = info
+        labeled += 1
+
+    # 写回 manifest
+    manifest_path.write_text(json.dumps(new_manifest, ensure_ascii=False, indent=2))

-    kept = len(all_files) - removed
    logger.info(
-        "Filtered images for %s: kept %d, removed %d (refs=%s)",
+        "Labeled %d/%d images for %s using summary figures",
+        labeled,
+        len(manifest),
        arxiv_id,
-        kept,
-        removed,
-        referenced_ids,
    )
-    return kept
+    return labeled
+
+
+# ── Figure ↔ Image 关联 ────────────────────────────────────────────────
+
+
+def _normalize_figure_id(raw_id: str) -> str:
+    """Canonicalise a figure/table ID, e.g. 'Fig.1' / 'figure 1' → 'Figure 1'.
+
+    Only a leading "Fig", "Fig.", "Figure" or "Table" (case-insensitive)
+    followed by digits is rewritten; anything after the digits is dropped.
+    IDs that fit neither pattern are returned unchanged.
+    """
+    for pattern, prefix in (
+        (r"(?:Fig\.?|Figure)\s*(\d+)", "Figure"),
+        (r"Table\s*(\d+)", "Table"),
+    ):
+        found = re.match(pattern, raw_id, re.IGNORECASE)
+        if found is not None:
+            return f"{prefix} {found.group(1)}"
+    return raw_id
+
+
+def _is_figure_type(fig_id: str) -> bool:
+    """Return True unless *fig_id* starts with "Table" followed by digits.
+
+    Matching is case-insensitive. Every ID that is not recognised as a
+    numbered table, including unknown formats, counts as a figure.
+    """
+    table_match = re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
+    return table_match is None
+
+
+def _image_sort_key(name: str) -> tuple[int, int]:
+    """Build a sort key for an extracted image from the numbers in its name.
+
+    Recognised file-name shapes, checked in this order:
+    - labelled:  figure_1.jpg, table_1.jpg       → (0, number)
+    - legacy:    page2_img1.png, page5_table1.png → (page, index)
+    - generic:   figure_(p3-1).jpg, table_(p3-2).jpg → (page, index)
+
+    Names matching none of these get (0, 0).
+
+    Args:
+        name: Image file name.
+
+    Returns:
+        A (major, minor) tuple suitable as a ``sorted`` key.
+    """
+    # Labelled files: figure_1.jpg, table_1.jpg (also legacy figure_1.png)
+    m = re.search(r"(?:figure|table)_(\d+)", name)
+    if m:
+        return (0, int(m.group(1)))
+    # Legacy files: page2_img1.png, page5_table1.png
+    m = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
+    if m:
+        return (int(m.group(1)), int(m.group(2)))
+    # Generic per-page files such as figure_(p3-1).jpg; without this branch
+    # they would all collapse to (0, 0) and never be ordered by page.
+    m = re.search(r"(?:figure|table)_\(p(\d+)-(\d+)\)", name)
+    if m:
+        return (int(m.group(1)), int(m.group(2)))
+    return (0, 0)
+
+
+def link_figures_with_images(
+    figures: list[dict], images: list[dict], arxiv_id: str
+) -> list[dict]:
+    """Attach an ``image_url`` to summary figures using the extracted images.
+
+    Strategy:
+    1. Look each figure ID up among the manifest.json labels (and the legacy
+       ``figures`` / ``tables`` lists). IDs are compared verbatim first and
+       then in normalized form, so a manifest label "Fig. 1" satisfies a
+       summary ID "Figure 1" and vice versa.
+    2. Figures still without an image fall back to position: the N-th
+       unmatched figure gets the N-th figure image, and likewise for tables.
+
+    Args:
+        figures: Summary figure dicts; each is read via its "id" key and
+            updated in place.
+        images: Extracted image dicts with "name" and "url" keys.
+        arxiv_id: Paper ID used to locate the manifest and to build URLs.
+
+    Returns:
+        The same ``figures`` list that was passed in.
+    """
+    if not figures or not images:
+        return figures
+
+    manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
+
+    # ── Strategy 1: match by manifest ID ──
+    id_to_url: dict[str, str] = {}
+    if manifest_path.exists():
+        try:
+            manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+        except (OSError, ValueError, TypeError):
+            manifest = {}
+        # A manifest that is valid JSON but not an object cannot be used.
+        if not isinstance(manifest, dict):
+            manifest = {}
+        for filename, info in manifest.items():
+            if not isinstance(info, dict):
+                continue
+            url = f"/papers/{arxiv_id}/images/{filename}"
+            # Prefer the label field (new format)
+            label = info.get("label", "")
+            if label:
+                id_to_url[label] = url
+            # Also accept the figures/tables lists (old format)
+            for fig_id in info.get("figures", []) + info.get("tables", []):
+                if fig_id not in id_to_url:
+                    id_to_url[fig_id] = url
+
+        # Register normalized aliases last so they never shadow an exact ID.
+        # Labels are stored as written in the summary (e.g. "Fig. 1"), while
+        # the lookup below may use the normalized form ("Figure 1").
+        for known_id, url in list(id_to_url.items()):
+            id_to_url.setdefault(_normalize_figure_id(known_id), url)
+
+    for fig in figures:
+        raw_id = fig.get("id") or ""
+        url = id_to_url.get(raw_id) or id_to_url.get(_normalize_figure_id(raw_id))
+        if url:
+            fig["image_url"] = url
+
+    # ── Strategy 2: positional fallback for figures the manifest missed ──
+    unmatched = [f for f in figures if not f.get("image_url")]
+    if not unmatched:
+        return figures
+
+    # Split the unmatched entries by kind: Figure vs Table
+    fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id") or "")]
+    table_type_unmatched = [
+        f for f in unmatched if not _is_figure_type(f.get("id") or "")
+    ]
+
+    # Split the extracted images the same way, ordered by the numbers in
+    # their file names
+    fig_images = sorted(
+        [img for img in images if "table" not in img["name"].lower()],
+        key=lambda img: _image_sort_key(img["name"]),
+    )
+    table_images = sorted(
+        [img for img in images if "table" in img["name"].lower()],
+        key=lambda img: _image_sort_key(img["name"]),
+    )
+
+    # zip stops at the shorter sequence, so surplus figures stay unlinked.
+    for fig, img in zip(fig_type_unmatched, fig_images):
+        fig["image_url"] = img["url"]
+
+    for fig, img in zip(table_type_unmatched, table_images):
+        fig["image_url"] = img["url"]
+
+    return figures