refactor: replace Phase 2 label matching with PDF text-stream caption pairing

- Extract captions from the PDF text dict instead of DocLayout caption boxes
- Use a _CaptionBlock dataclass to carry the authoritative ID, kind, text, and bbox
- Pair captions to content boxes with a directional preference (figure caption below, table caption above)
- Filter out uncaptioned boxes (Algorithm pseudo-code, unnumbered appendix tables, false positives)
- Remove label_images_by_summary and the Phase 2 rename pipeline entirely
- Update tests to cover text-based caption pairing and filtering
2026-06-15 01:09:29 +08:00
parent 29fb20828e
commit 1ccac1f29a
3 changed files with 236 additions and 401 deletions
@@ -1,12 +1,13 @@
-"""PDF 图片与表格提取 — 两阶段流水线。
+"""PDF 图片与表格提取。
-Phase 1: DocLayout-YOLO 检测 figure/table 区域 → 渲染为 JPEG（通用标签）
+DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本流定位 caption → 只渲染配到
-Phase 2: 用 LLM summary 的 figures[].id 在 PDF 中搜索定位 → 匹配到 box → 重命名
+Figure/Table 标题的，用 caption 自带权威 ID 命名。没配到标题的（Algorithm 伪代码、
 无编号附录表、DocLayout 误检碎片）一律过滤，不输出。
-相比旧方案（正则匹配 caption）：
+caption 定位用 PDF 文本而非 DocLayout 的 caption box —— 后者检测不稳（多行标题只
- 不再依赖正则，用 LLM 输出的 ID 直接搜索 PDF 文本
+框一行→截断、漏检→无标题、配对错误→串台）。page.get_text("dict") 找以
- page.search_for() 精确搜索 + 空间距离过滤，避免正文引用误匹配
+"Figure N"/"Table N" 开头的文本块：文本块天然含完整多行标题，且其 ID 即论文实际
- 通用标签兜底，LLM 没提到的图表不会被丢弃
+编号，直接命名规避串台。figure 标题优先在下方、table 标题优先在上方配对。
 """
 from __future__ import annotations
@@ -14,6 +15,7 @@ from __future__ import annotations
 import json
 import logging
 import re
 from dataclasses import dataclass
 from pathlib import Path
 import pymupdf
@@ -32,10 +34,16 @@ _RENDER_ZOOM = 3
 _CLUSTER_GAP = 15
 # 最小 bbox 面积（单位: pt²）— 过滤 icon/logo 等微小误检
 _MIN_BOX_AREA = 2000
-# Phase 2: 搜索文本到 box 的最大匹配距离（单位: pt）
+# caption 文本块与 figure/table 内容块的最大垂直距离（单位: pt）
 _LABEL_MATCH_DISTANCE = 100
 # DocLayout caption 与 figure/table 匹配的最大距离（单位: pt）
 _CAPTION_MATCH_DISTANCE = 120
 # 方向不符（figure 标题在上 / table 标题在下）的配对惩罚分（仍允许，兜底异常排版）
 _CAPTION_WRONG_SIDE_PENALTY = 300
 # caption 开头标记：Figure 3 / Fig. 3 / Table C1 / Figure 3.5 等（大小写均可）
 # 编号 = 数字开头 或 字母+数字（附录 C1）；行首匹配，规避正文 "see Table 3" 引用
 _CAPTION_HEAD_RE = re.compile(
    r"^\s*(Figure|Fig\.?|Table)\b\.?\s+([0-9][0-9A-Za-z.]*|[A-Z]\d[0-9A-Za-z.]*)",
    re.IGNORECASE,
 )
 # ── Box 聚类 ─────────────────────────────────────────────────────────
@@ -51,8 +59,17 @@ class _BoxCluster:
        self.y0 = min(b.y0 for b in boxes)
        self.x1 = max(b.x1 for b in boxes)
        self.y1 = max(b.y1 for b in boxes)
-        raw = boxes[0].boxclass
+        self.boxclass = boxes[0].boxclass
-        self.boxclass = "table" if raw == "table-fallback" else raw
+
@dataclass(frozen=True)
 class _CaptionBlock:
    """从 PDF 文本流提取的标题块：自带权威 ID、完整多行文本、精确 bbox。"""
    id: str  # "Figure 3" / "Table C1"
    kind: str  # "figure" | "table"
    text: str  # 完整多行标题文本
    bbox: list[float]  # [x0, y0, x1, y1]
 def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
@@ -103,64 +120,88 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
    return [_BoxCluster(members) for members in groups.values()]
-def _caption_class_for_content(boxclass: str) -> str:
+def _find_caption_blocks(page) -> list[_CaptionBlock]:
-    return "figure_caption" if boxclass == "picture" else "table_caption"
+    """从页面文本流提取以 "Figure N"/"Table N"/"Fig. N" 开头的标题块。
-
+    用 PDF 文本而非 DocLayout caption box：文本块天然含完整多行标题，
-def _caption_distance(content: _BoxCluster, caption: _BoxCluster) -> float | None:
+    且其 ID 即论文实际编号（如 "Table C1"），权威且不依赖模型检测。
-    """Return a spatial score for pairing a caption with a content box."""
+    """
    h_overlap = min(content.x1, caption.x1) - max(content.x0, caption.x0)
    min_width = min(content.x1 - content.x0, caption.x1 - caption.x0)
    if min_width <= 0 or h_overlap < min_width * 0.25:
        return None
    if caption.y1 < content.y0:
        v_gap = content.y0 - caption.y1
    elif caption.y0 > content.y1:
        v_gap = caption.y0 - content.y1
    else:
        v_gap = 0.0
    return v_gap if v_gap <= _CAPTION_MATCH_DISTANCE else None
 def _extract_caption_text(page, caption: _BoxCluster) -> str:
    rect = pymupdf.Rect(caption.x0, caption.y0, caption.x1, caption.y1)
    try:
-        text = page.get_text("text", clip=rect)
+        d = page.get_text("dict")
    except Exception:
-        return ""
+        return []
-    return " ".join(text.split())
+
    results: list[_CaptionBlock] = []
    for block in d.get("blocks", []):
        if block.get("type") != 0:  # 仅文本块
            continue
        lines = block.get("lines", [])
        if not lines:
            continue
        line_texts = [
            "".join(span.get("text", "") for span in line.get("spans", []))
            for line in lines
        ]
        first_line = next((t for t in line_texts if t.strip()), "")
        m = _CAPTION_HEAD_RE.match(first_line)
        if not m:
            continue
        kind_word, num = m.group(1), m.group(2)
        is_table = kind_word.lower().startswith("table")
        bbox = block.get("bbox")
        if not bbox or len(bbox) != 4:
            continue
        full_text = " ".join(t.strip() for t in line_texts if t.strip())
        results.append(
            _CaptionBlock(
                id=f"{'Table' if is_table else 'Figure'} {num}",
                kind="table" if is_table else "figure",
                text=full_text,
                bbox=[float(v) for v in bbox],
            )
        )
    return results
-def _match_captions(
+def _pair_caption_blocks(
    page,
    content_clusters: list[_BoxCluster],
-    caption_clusters: list[_BoxCluster],
+    caption_blocks: list[_CaptionBlock],
-) -> dict[int, tuple[_BoxCluster, str]]:
+) -> dict[int, _CaptionBlock]:
-    """Match each content cluster to its nearest same-type DocLayout caption."""
+    """每个内容块配方向上最近的同类型标题块。
-    matches: dict[int, tuple[_BoxCluster, str]] = {}
+
-    used_captions: set[int] = set()
+    figure 标题惯例在下方、table 标题在上方；方向相符优先，不符加惩罚兜底
    （跨页 / 异常排版）。按 (距离+惩罚) 升序贪心匹配，每个内容块与标题块唯一配对。
    """
    candidates: list[tuple[float, int, int]] = []
-
+    for c_idx, content in enumerate(content_clusters):
-    for content_idx, content in enumerate(content_clusters):
+        want_below = content.boxclass == "picture"  # figure 标题在下
-        wanted_caption_class = _caption_class_for_content(content.boxclass)
+        want_kind = "figure" if want_below else "table"
-        for caption_idx, caption in enumerate(caption_clusters):
+        for b_idx, cap in enumerate(caption_blocks):
-            if caption.boxclass != wanted_caption_class:
+            if cap.kind != want_kind:
                continue
-            dist = _caption_distance(content, caption)
+            cx0, cy0, cx1, cy1 = cap.bbox
-            if dist is not None:
+            h_overlap = min(content.x1, cx1) - max(content.x0, cx0)
-                candidates.append((dist, content_idx, caption_idx))
+            min_width = min(content.x1 - content.x0, cx1 - cx0)
            if min_width <= 0 or h_overlap < min_width * 0.25:
                continue
            if cy1 <= content.y0:  # 标题在内容上方
                side_below, v_gap = False, content.y0 - cy1
            elif cy0 >= content.y1:  # 标题在内容下方
                side_below, v_gap = True, cy0 - content.y1
            else:
                continue  # 重叠，跳过
            if v_gap > _CAPTION_MATCH_DISTANCE:
                continue
            penalty = 0.0 if side_below == want_below else _CAPTION_WRONG_SIDE_PENALTY
            candidates.append((v_gap + penalty, c_idx, b_idx))
-    for _dist, content_idx, caption_idx in sorted(candidates):
+    matches: dict[int, _CaptionBlock] = {}
-        if content_idx in matches or caption_idx in used_captions:
+    used: set[int] = set()
    for _score, c_idx, b_idx in sorted(candidates):
        if c_idx in matches or b_idx in used:
            continue
-        text = _extract_caption_text(page, caption_clusters[caption_idx])
+        matches[c_idx] = caption_blocks[b_idx]
-        if not text:
+        used.add(b_idx)
            continue
        matches[content_idx] = (caption_clusters[caption_idx], text)
        used_captions.add(caption_idx)
    return matches
@@ -174,25 +215,27 @@ def _render_box(
    filename: str,
    cap_type: str,
    page_num: int,
-    caption: _BoxCluster | None = None,
+    caption_bbox: list[float] | None = None,
 ) -> bool:
    """渲染单个 box 区域并保存 JPEG，成功返回 True。
-    若提供 caption，则将内容与 caption 区域合并后一起截取，
+    若提供 caption_bbox，则将内容与标题区域合并后一起截取，
-    使同一张截图同时包含图/表及其标题文字。
+    使同一张截图同时包含图/表及其完整标题。
    """
    page_width = page.rect.width
    page_height = page.rect.height
    x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
-    if caption is not None:
+    if caption_bbox is not None:
-        x0 = min(x0, caption.x0)
+        cx0, cy0, cx1, cy1 = caption_bbox
-        y0 = min(y0, caption.y0)
+        x0 = min(x0, cx0)
-        x1 = max(x1, caption.x1)
+        y0 = min(y0, cy0)
-        y1 = max(y1, caption.y1)
+        x1 = max(x1, cx1)
        y1 = max(y1, cy1)
    clip = pymupdf.Rect(
        max(0, x0 - _REGION_PADDING),
        max(0, y0 - _REGION_PADDING),
        min(page_width, x1 + _REGION_PADDING),
-        y1 + _REGION_PADDING,
+        min(page_height, y1 + _REGION_PADDING),
    )
    mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
    try:
@@ -200,7 +243,7 @@ def _render_box(
    except Exception:
        return False
-    (images_dest / filename).write_bytes(pix.tobytes("jpeg"))
+    (images_dest / filename).write_bytes(pix.tobytes("jpeg", jpg_quality=92))
    return True
@@ -213,77 +256,62 @@ def _process_page(
    seen_labels: set,
    arxiv_id: str,
 ) -> int:
-    """处理单页：检测 → 聚类 → 渲染，全部用通用标签。"""
+    """处理单页：检测内容 box → 文本定位 caption → 只渲染配到标题的。
    配到 Figure/Table caption 的 box 用 caption 自带 ID 命名（figure_3.jpg）；
    没配到标题的（Algorithm 伪代码、无编号附录表、误检碎片）一律过滤，不输出。
    """
    page = doc[page_idx]
    page_num = page_idx + 1
    fig_counter = 0
    tbl_counter = 0
-    # 收集本页的 table/picture box 与 caption box（跳过极小区域）
+    # 收集本页 figure/table 内容 box（跳过极小区域；caption 改由文本定位，不收 box）
    raw_boxes = []
    raw_caption_boxes = []
    for box in page_boxes:
-        w = box.x1 - box.x0
+        if box.boxclass in ("table", "picture"):
-        h = box.y1 - box.y0
+            w = box.x1 - box.x0
-        if box.boxclass in ("table", "table-fallback", "picture"):
+            h = box.y1 - box.y0
            if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
                continue
            raw_boxes.append(box)
        elif box.boxclass in ("figure_caption", "table_caption"):
            if w < 30 or h < 6:
                continue
            raw_caption_boxes.append(box)
    if not raw_boxes:
        return 0
-    # 聚类：将同一 figure/table 的碎片 box 合并
+    # 聚类：将同一 figure/table 的碎片 box 合并；用 PDF 文本定位 caption
    clusters = _cluster_boxes(raw_boxes)
-    caption_clusters = _cluster_boxes(raw_caption_boxes)
+    caption_blocks = _find_caption_blocks(page)
-    caption_matches = _match_captions(page, clusters, caption_clusters)
+    caption_matches = _pair_caption_blocks(clusters, caption_blocks)
    extracted = 0
    for cluster_idx, cluster in enumerate(clusters):
-        cap_type = "figure" if cluster.boxclass == "picture" else "table"
+        cap_match = caption_matches.get(cluster_idx)
        if cap_match is None:
            continue  # 无 Figure/Table 标题 → 过滤（Algorithm、无编号表、误检碎片）
        if cap_match.id in seen_labels:
            continue  # 同一图表被 DocLayout 切成多块重复检测，跳过后续
        seen_labels.add(cap_match.id)
-        if cap_type == "figure":
+        filename = f"{cap_match.id.replace(' ', '_').lower()}.jpg"
            fig_counter += 1
            label = f"Figure (p{page_num}-{fig_counter})"
        else:
            tbl_counter += 1
            label = f"Table (p{page_num}-{tbl_counter})"
        if label in seen_labels:
            continue
        seen_labels.add(label)
        caption_match = caption_matches.get(cluster_idx)
        caption_cluster = caption_match[0] if caption_match else None
        filename = f"{label.replace(' ', '_').lower()}.jpg"
        if not _render_box(
            page,
            cluster,
            images_dest,
            filename,
-            cap_type,
+            cap_match.kind,
            page_num,
-            caption=caption_cluster,
+            caption_bbox=cap_match.bbox,
        ):
            continue
-        info = {
+        manifest[filename] = {
            "page": page_num,
-            "type": cap_type,
+            "type": cap_match.kind,
-            "label": label,
+            "label": cap_match.id,
            "box": _cluster_to_box(cluster),
            "caption_text": cap_match.text[:500],
            "caption_box": cap_match.bbox,
            "caption_source": "text",
        }
        if caption_match:
            info["caption_text"] = caption_match[1][:500]
            info["caption_box"] = _cluster_to_box(caption_cluster)
            info["caption_source"] = "doclayout"
        manifest[filename] = info
        extracted += 1
    return extracted
@@ -359,230 +387,6 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    return extracted
 # ── Phase 2: 用 summary 的 figures ID 定位并重命名 ─────────────────────
 def _distance_text_to_box(rect: pymupdf.Rect, box: list[float]) -> float | None:
     """Score how far a text search hit lies from a content box.

     The hit is reduced to its centre point. It must fall within the box's
     horizontal extent widened by 20 pt on each side; the score is then the
     vertical gap between that centre and the box (0 when the centre lies
     within the box's vertical range).

     Args:
         rect: Rectangle of the text hit (reads ``x0``, ``y0``, ``x1``, ``y1``).
         box: Content box as ``[x0, y0, x1, y1]``.

     Returns:
         The vertical gap, or ``None`` when the hit is horizontally out of
         range or the gap exceeds ``_LABEL_MATCH_DISTANCE``.
     """
     center_x = (rect.x0 + rect.x1) / 2
     center_y = (rect.y0 + rect.y1) / 2
     left, top, right, bottom = box
     # Horizontal gate: the hit centre must sit over the box (20 pt slack).
     horizontally_close = left - 20 <= center_x <= right + 20
     if not horizontally_close:
         return None
     # Vertical gap from the hit centre to the nearest horizontal box edge.
     if center_y < top:
         gap = top - center_y
     elif center_y > bottom:
         gap = center_y - bottom
     else:
         gap = 0
     if gap <= _LABEL_MATCH_DISTANCE:
         return gap
     return None
 def _search_variants(fig_id: str) -> list[str]:
    """为 figure/table ID 生成搜索变体。
    "Figure 1" → ["Figure 1", "Fig. 1", "Fig 1"]
    "Fig. 1"   → ["Fig. 1", "Figure 1", "Fig 1"]
    "Table A1" → ["Table A1"]
    """
    variants = [fig_id]
    m = re.match(r"(Fig\.?|Figure)\s+(\d+.*)", fig_id, re.IGNORECASE)
    if m:
        num_part = m.group(2)
        variants.extend(
            [
                f"Figure {num_part}",
                f"Fig. {num_part}",
                f"Fig {num_part}",
            ]
        )
    # 去重保序
    seen = set()
    result = []
    for v in variants:
        if v not in seen:
            seen.add(v)
            result.append(v)
    return result
 def label_images_by_summary(
     arxiv_id: str,
     figures: list[dict],
     pdf_path: Path | None = None,
 ) -> int:
     """Phase 2: rename generically-labelled images using the summary's figure IDs.

     For every figure/table ID in ``figures``:

     1. search every page for the ID text (and its spelling variants);
     2. for each manifest entry that still has a generic "(pN-M)" label, take
        the smallest distance between a same-page search hit and the entry's
        box;
     3. assign pairs greedily, closest first, so each ID and each file is used
        at most once; then rename the file and move its manifest entry.

     A caption already stored in the manifest is kept as ``caption_text``; the
     summary's caption, when present, is stored next to it as
     ``summary_caption_text``. The summary caption only becomes
     ``caption_text`` when the entry had none.

     Args:
         arxiv_id: Paper ID; locates the images directory via ``paper_dir``.
         figures: The summary's figure list; each item is read for ``id`` and
             ``caption``.
         pdf_path: PDF to search. Defaults to ``TMP_DIR / arxiv_id / "paper.pdf"``.

     Returns:
         The number of images relabelled. 0 when there is nothing to do
         (no figures, missing PDF or manifest, no candidates, no matches).
     """
     if not figures:
         return 0
     if pdf_path is None:
         pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
     if not pdf_path.exists():
         return 0
     images_dest = paper_dir(arxiv_id) / "images"
     manifest_path = images_dest / "manifest.json"
     if not manifest_path.exists():
         return 0
     manifest: dict[str, dict] = json.loads(manifest_path.read_text(encoding="utf-8"))
     if not manifest:
         return 0
     # Only entries still carrying a generic "(pN-M)" label are rename candidates.
     candidates: dict[str, dict] = {
         fname: info
         for fname, info in manifest.items()
         if "(p" in info.get("label", "")
     }
     if not candidates:
         return 0
     with pymupdf.open(str(pdf_path)) as doc:
         # Every feasible pairing: (fig_id, fig_index, filename, distance).
         matches: list[tuple[str, int, str, float]] = []
         for fig_idx, fig in enumerate(figures):
             fig_id = fig.get("id", "")
             if not fig_id:
                 continue
             # Spelling variants: Figure 1 / Fig. 1 / Fig 1, etc.
             search_terms = _search_variants(fig_id)
             # Hits across all pages as (1-based page number, Rect); hits of
             # different variants at the same position are counted once.
             search_hits: list[tuple[int, pymupdf.Rect]] = []
             for page_idx in range(doc.page_count):
                 page = doc[page_idx]
                 seen_rects: set[tuple[float, float]] = set()
                 for term in search_terms:
                     for r in page.search_for(term):
                         key = (round(r.x0, 1), round(r.y0, 1))
                         if key not in seen_rects:
                             seen_rects.add(key)
                             search_hits.append((page_idx + 1, r))
             if not search_hits:
                 continue
             # For each candidate entry keep the closest hit on its own page.
             for fname, info in candidates.items():
                 box = info.get("box")
                 if not box:
                     continue
                 manifest_page = info.get("page", 0)
                 best_dist: float | None = None
                 for hit_page, rect in search_hits:
                     if hit_page != manifest_page:
                         continue
                     dist = _distance_text_to_box(rect, box)
                     if dist is not None and (best_dist is None or dist < best_dist):
                         best_dist = dist
                 if best_dist is not None:
                     matches.append((fig_id, fig_idx, fname, best_dist))
     if not matches:
         logger.info("No label matches for %s", arxiv_id)
         return 0
     # Resolve conflicts: closest first; each figure and each file matches once.
     matches.sort(key=lambda x: x[3])
     used_fig_ids: set[int] = set()
     used_filenames: set[str] = set()
     renames: list[tuple[str, str, str]] = []  # (old_fname, new_fname, fig_id)
     for fig_id, fig_idx, fname, _dist in matches:
         if fig_idx in used_fig_ids or fname in used_filenames:
             continue
         used_fig_ids.add(fig_idx)
         used_filenames.add(fname)
         new_fname = f"{fig_id.replace(' ', '_').lower()}.jpg"
         renames.append((fname, new_fname, fig_id))
     # Unmatched entries are carried over unchanged.
     labeled = 0
     new_manifest: dict[str, dict] = {
         fname: info for fname, info in manifest.items() if fname not in used_filenames
     }
     for old_fname, new_fname, fig_id in renames:
         old_path = images_dest / old_fname
         new_path = images_dest / new_fname
         if not old_path.exists():
             continue
         # Move the manifest entry over to the new filename.
         info = manifest[old_fname].copy()
         cap_type = info.get("type", "figure")
         # Caption as written in the summary (first entry with this ID).
         summary_caption_text = next(
             (fig.get("caption", "") for fig in figures if fig.get("id") == fig_id),
             "",
         ) or ""
         info["label"] = fig_id
         existing_caption_text = info.get("caption_text", "")
         if existing_caption_text:
             # Keep the caption extracted from the PDF. Previously it was wiped
             # to "" whenever the summary entry had no caption of its own.
             if summary_caption_text:
                 info["summary_caption_text"] = summary_caption_text[:500]
         else:
             info["caption_text"] = summary_caption_text[:500]
         info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
             fig_id
         )
         if new_fname != old_fname:
             old_path.rename(new_path)
         new_manifest[new_fname] = info
         labeled += 1
     # Write with the same encoding the manifest is read with: ensure_ascii=False
     # emits raw non-ASCII text, which the platform default may not encode.
     manifest_path.write_text(
         json.dumps(new_manifest, ensure_ascii=False, indent=2),
         encoding="utf-8",
     )
     logger.info(
         "Labeled %d/%d images for %s using summary figures",
         labeled,
         len(manifest),
         arxiv_id,
     )
     return labeled
 # ── Figure ↔ Image 关联 ────────────────────────────────────────────────
@@ -201,20 +201,15 @@ def _cleanup_old_images(db: Session, paper: Paper) -> None:
 def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
    """从 PDF 提取图片和表格（失败不影响总结）。
-    两阶段流水线：
+    DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本定位 caption → 只渲染
-    1. DocLayout-YOLO 检测 + 渲染截图（通用标签）
+    配到 Figure/Table 标题的（Algorithm、无编号附录表、误检碎片一律过滤）。
-    2. 用 summary 的 figures ID 在 PDF 中搜索定位 → 重命名
+    标题来源已切换为 PDF 文本，schema.figures 不再参与命名，参数保留备用。
    """
    try:
-        from app.services.pdf_image_extractor import (
+        from app.services.pdf_image_extractor import extract_images_from_pdf
            extract_images_from_pdf,
            label_images_by_summary,
        )
        pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
        extract_images_from_pdf(arxiv_id, pdf_path)
        if schema.figures:
            label_images_by_summary(arxiv_id, schema.figures, pdf_path)
    except Exception:
        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
@@ -1,6 +1,5 @@
 from __future__ import annotations
 import json
 from unittest.mock import MagicMock
 import pymupdf
@@ -9,7 +8,17 @@ from app.services import pdf_image_extractor as mod
 from app.services.layout_detector import LayoutBox
-def test_process_page_extracts_doclayout_caption(tmp_path):
+def _caption_block(bbox, text):
    """构造一个 page.get_text("dict") 风格的文本块。"""
    return {
        "type": 0,
        "bbox": list(bbox),
        "lines": [{"spans": [{"text": text}]}],
    }
 def test_process_page_pairs_caption_from_text(tmp_path):
    """caption 来自 PDF 文本流（figure 标题在内容下方），用其 ID 直接命名。"""
    images_dest = tmp_path / "images"
    images_dest.mkdir()
    manifest: dict[str, dict] = {}
@@ -19,16 +28,18 @@ def test_process_page_extracts_doclayout_caption(tmp_path):
    page = MagicMock()
    page.rect.width = 600
    page.rect.height = 800
    page.get_pixmap.return_value = pix
-    page.get_text.return_value = "Figure 1: Overall architecture.\n"
+    page.get_text.return_value = {
        "blocks": [
            _caption_block((95, 310, 320, 325), "Figure 1: Overall architecture.")
        ]
    }
    doc = MagicMock()
    doc.__getitem__.return_value = page
-    boxes = [
+    boxes = [LayoutBox(100, 100, 300, 300, "picture")]
        LayoutBox(100, 100, 300, 300, "picture"),
        LayoutBox(95, 310, 320, 325, "figure_caption"),
    ]
    extracted = mod._process_page(
        doc,
@@ -41,14 +52,15 @@ def test_process_page_extracts_doclayout_caption(tmp_path):
    )
    assert extracted == 1
-    info = manifest["figure_(p1-1).jpg"]
+    # caption 自带 ID → 直接命名 figure_1.jpg
    info = manifest["figure_1.jpg"]
    assert info["label"] == "Figure 1"
    assert info["caption_text"] == "Figure 1: Overall architecture."
-    assert info["caption_source"] == "doclayout"
+    assert info["caption_source"] == "text"
    assert info["caption_box"] == [95.0, 310.0, 320.0, 325.0]
 def test_process_page_includes_caption_in_render(tmp_path):
-    """渲染时把 caption 区域合并进同一张截图。"""
+    """渲染时把 caption 文本块区域合并进同一张截图。"""
    images_dest = tmp_path / "images"
    images_dest.mkdir()
    manifest: dict[str, dict] = {}
@@ -58,16 +70,16 @@ def test_process_page_includes_caption_in_render(tmp_path):
    page = MagicMock()
    page.rect.width = 600
    page.rect.height = 800
    page.get_pixmap.return_value = pix
-    page.get_text.return_value = "Figure 1: Caption text.\n"
+    page.get_text.return_value = {
        "blocks": [_caption_block((95, 310, 320, 325), "Figure 1: Caption text.")]
    }
    doc = MagicMock()
    doc.__getitem__.return_value = page
-    boxes = [
+    boxes = [LayoutBox(100, 100, 300, 300, "picture")]
        LayoutBox(100, 100, 300, 300, "picture"),
        LayoutBox(95, 310, 320, 325, "figure_caption"),
    ]
    mod._process_page(
        doc,
@@ -85,50 +97,74 @@ def test_process_page_includes_caption_in_render(tmp_path):
    assert clip == pymupdf.Rect(90, 95, 325, 330)
-def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch):
+def test_process_page_table_caption_above(tmp_path):
-    arxiv_id = "2401.00001"
+    """table 标题惯例在内容上方，配对后命名 table_N.jpg。"""
-    paper_root = tmp_path / arxiv_id
+    images_dest = tmp_path / "images"
-    images_dest = paper_root / "images"
+    images_dest.mkdir()
-    images_dest.mkdir(parents=True)
+    manifest: dict[str, dict] = {}
    (images_dest / "figure_(p1-1).jpg").write_bytes(b"jpeg")
    (images_dest / "manifest.json").write_text(
        json.dumps(
            {
                "figure_(p1-1).jpg": {
                    "page": 1,
                    "type": "figure",
                    "label": "Figure (p1-1)",
                    "box": [100, 100, 300, 300],
                    "caption_text": "Figure 1: PDF original caption.",
                    "caption_source": "doclayout",
                }
            }
        )
    )
-    pdf_path = tmp_path / "paper.pdf"
+    pix = MagicMock()
-    pdf_path.write_bytes(b"%PDF")
+    pix.tobytes.return_value = b"jpeg"
    monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)
    page = MagicMock()
-    page.search_for.return_value = [pymupdf.Rect(120, 305, 180, 320)]
+    page.rect.width = 600
    page.rect.height = 800
    page.get_pixmap.return_value = pix
    # caption 在内容上方 [80, 90, 320, 105]，内容表格 [80, 120, 320, 280]
    page.get_text.return_value = {
        "blocks": [_caption_block((80, 90, 320, 105), "Table 2 | Results summary.")]
    }
-    fake_doc = MagicMock()
+    doc = MagicMock()
-    fake_doc.page_count = 1
+    doc.__getitem__.return_value = page
    fake_doc.__getitem__.return_value = page
    fake_doc.__enter__.return_value = fake_doc
    fake_doc.__exit__.return_value = False
    monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)
-    labeled = mod.label_images_by_summary(
+    boxes = [LayoutBox(80, 120, 320, 280, "table")]
-        arxiv_id,
+
-        [{"id": "Figure 1", "caption": "Summary caption."}],
+    extracted = mod._process_page(
-        pdf_path=pdf_path,
+        doc,
        0,
        boxes,
        images_dest=images_dest,
        manifest=manifest,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )
-    assert labeled == 1
+    assert extracted == 1
-    manifest = json.loads((images_dest / "manifest.json").read_text())
+    info = manifest["table_2.jpg"]
-    info = manifest["figure_1.jpg"]
+    assert info["label"] == "Table 2"
-    assert info["caption_text"] == "Figure 1: PDF original caption."
+    assert info["caption_source"] == "text"
-    assert info["caption_source"] == "doclayout"
+
-    assert info["summary_caption_text"] == "Summary caption."
+
 def test_process_page_filters_uncaptioned(tmp_path):
     """A box with no Figure/Table caption paired to it (Algorithm, unnumbered table, etc.) is filtered out."""
     out_dir = tmp_path / "images"
     out_dir.mkdir()
     rendered = MagicMock()
     rendered.tobytes.return_value = b"jpeg"
     # Page mock: rendering would succeed, but the text dict has no caption blocks.
     fake_page = MagicMock()
     fake_page.rect.width = 600
     fake_page.rect.height = 800
     fake_page.get_pixmap.return_value = rendered
     fake_page.get_text.return_value = {"blocks": []}
     fake_doc = MagicMock()
     fake_doc.__getitem__.return_value = fake_page
     entries: dict[str, dict] = {}
     count = mod._process_page(
         fake_doc,
         0,
         [LayoutBox(100, 100, 300, 300, "picture")],
         images_dest=out_dir,
         manifest=entries,
         seen_labels=set(),
         arxiv_id="2401.00001",
     )
     # Nothing is extracted and the manifest stays empty.
     assert count == 0
     assert entries == {}