refactor: replace Phase 2 label matching with PDF text-stream caption pairing

- Extract captions from PDF text dict instead of DocLayout caption boxes
- Use _CaptionBlock dataclass to carry authoritative ID, kind, text, bbox
- Pair captions to content boxes with directional preference (figure below, table above)
- Filter out uncaptioned boxes (Algorithm pseudo-code, unnumbered appendix tables, false positives)
- Remove label_images_by_summary and Phase 2 rename pipeline entirely
- Update tests to cover text-based caption pairing and filtering
This commit is contained in:
2026-06-15 01:09:29 +08:00
parent 29fb20828e
commit 1ccac1f29a
3 changed files with 236 additions and 401 deletions
+140 -336
View File
@@ -1,12 +1,13 @@
"""PDF 图片与表格提取 — 两阶段流水线 """PDF 图片与表格提取。
Phase 1: DocLayout-YOLO 检测 figure/table 区域 → 渲染为 JPEG(通用标签) DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本流定位 caption → 只渲染配到
Phase 2: 用 LLM summary 的 figures[].id 在 PDF 中搜索定位 → 匹配到 box → 重命名 Figure/Table 标题的,用 caption 自带权威 ID 命名。没配到标题的(Algorithm 伪代码、
无编号附录表、DocLayout 误检碎片)一律过滤,不输出。
相比旧方案(正则匹配 caption): caption 定位用 PDF 文本而非 DocLayout 的 caption box —— 后者检测不稳(多行标题只
- 不再依赖正则,用 LLM 输出的 ID 直接搜索 PDF 文本 框一行→截断、漏检→无标题、配对错误→串台)。page.get_text("dict") 找以
- page.search_for() 精确搜索 + 空间距离过滤,避免正文引用误匹配 "Figure N"/"Table N" 开头的文本块:文本块天然含完整多行标题,且其 ID 即论文实际
- 通用标签兜底,LLM 没提到的图表不会被丢弃 编号,直接命名规避串台。figure 标题优先在下方、table 标题优先在上方配对。
""" """
from __future__ import annotations from __future__ import annotations
@@ -14,6 +15,7 @@ from __future__ import annotations
import json import json
import logging import logging
import re import re
from dataclasses import dataclass
from pathlib import Path from pathlib import Path
import pymupdf import pymupdf
@@ -32,10 +34,16 @@ _RENDER_ZOOM = 3
_CLUSTER_GAP = 15 _CLUSTER_GAP = 15
# 最小 bbox 面积(单位: pt²)— 过滤 icon/logo 等微小误检 # 最小 bbox 面积(单位: pt²)— 过滤 icon/logo 等微小误检
_MIN_BOX_AREA = 2000 _MIN_BOX_AREA = 2000
# Phase 2: 搜索文本到 box 的最大匹配距离(单位: pt # caption 文本块与 figure/table 内容块的最大垂直距离(单位: pt
_LABEL_MATCH_DISTANCE = 100
# DocLayout caption 与 figure/table 匹配的最大距离(单位: pt)
_CAPTION_MATCH_DISTANCE = 120 _CAPTION_MATCH_DISTANCE = 120
# 方向不符(figure 标题在上 / table 标题在下)的配对惩罚分(仍允许,兜底异常排版)
_CAPTION_WRONG_SIDE_PENALTY = 300
# caption 开头标记:Figure 3 / Fig. 3 / Table C1 / Figure 3.5 等(大小写均可)
# 编号 = 数字开头 或 字母+数字(附录 C1);行首匹配,规避正文 "see Table 3" 引用
_CAPTION_HEAD_RE = re.compile(
r"^\s*(Figure|Fig\.?|Table)\b\.?\s+([0-9][0-9A-Za-z.]*|[A-Z]\d[0-9A-Za-z.]*)",
re.IGNORECASE,
)
# ── Box 聚类 ───────────────────────────────────────────────────────── # ── Box 聚类 ─────────────────────────────────────────────────────────
@@ -51,8 +59,17 @@ class _BoxCluster:
self.y0 = min(b.y0 for b in boxes) self.y0 = min(b.y0 for b in boxes)
self.x1 = max(b.x1 for b in boxes) self.x1 = max(b.x1 for b in boxes)
self.y1 = max(b.y1 for b in boxes) self.y1 = max(b.y1 for b in boxes)
raw = boxes[0].boxclass self.boxclass = boxes[0].boxclass
self.boxclass = "table" if raw == "table-fallback" else raw
@dataclass(frozen=True)
class _CaptionBlock:
    """Caption block extracted from the PDF text stream.

    Carries its own authoritative ID, the complete multi-line caption text,
    and an exact bounding box.
    """

    id: str  # e.g. "Figure 3" / "Table C1"
    kind: str  # "figure" | "table"
    text: str  # complete multi-line caption text
    bbox: list[float]  # [x0, y0, x1, y1]
def _cluster_to_box(cluster: _BoxCluster) -> list[float]: def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
@@ -103,64 +120,88 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
return [_BoxCluster(members) for members in groups.values()] return [_BoxCluster(members) for members in groups.values()]
def _caption_class_for_content(boxclass: str) -> str: def _find_caption_blocks(page) -> list[_CaptionBlock]:
return "figure_caption" if boxclass == "picture" else "table_caption" """从页面文本流提取以 "Figure N"/"Table N"/"Fig. N" 开头的标题块。
用 PDF 文本而非 DocLayout caption box:文本块天然含完整多行标题,
def _caption_distance(content: _BoxCluster, caption: _BoxCluster) -> float | None: 且其 ID 即论文实际编号(如 "Table C1"),权威且不依赖模型检测。
"""Return a spatial score for pairing a caption with a content box.""" """
h_overlap = min(content.x1, caption.x1) - max(content.x0, caption.x0)
min_width = min(content.x1 - content.x0, caption.x1 - caption.x0)
if min_width <= 0 or h_overlap < min_width * 0.25:
return None
if caption.y1 < content.y0:
v_gap = content.y0 - caption.y1
elif caption.y0 > content.y1:
v_gap = caption.y0 - content.y1
else:
v_gap = 0.0
return v_gap if v_gap <= _CAPTION_MATCH_DISTANCE else None
def _extract_caption_text(page, caption: _BoxCluster) -> str:
rect = pymupdf.Rect(caption.x0, caption.y0, caption.x1, caption.y1)
try: try:
text = page.get_text("text", clip=rect) d = page.get_text("dict")
except Exception: except Exception:
return "" return []
return " ".join(text.split())
results: list[_CaptionBlock] = []
for block in d.get("blocks", []):
if block.get("type") != 0: # 仅文本块
continue
lines = block.get("lines", [])
if not lines:
continue
line_texts = [
"".join(span.get("text", "") for span in line.get("spans", []))
for line in lines
]
first_line = next((t for t in line_texts if t.strip()), "")
m = _CAPTION_HEAD_RE.match(first_line)
if not m:
continue
kind_word, num = m.group(1), m.group(2)
is_table = kind_word.lower().startswith("table")
bbox = block.get("bbox")
if not bbox or len(bbox) != 4:
continue
full_text = " ".join(t.strip() for t in line_texts if t.strip())
results.append(
_CaptionBlock(
id=f"{'Table' if is_table else 'Figure'} {num}",
kind="table" if is_table else "figure",
text=full_text,
bbox=[float(v) for v in bbox],
)
)
return results
def _match_captions( def _pair_caption_blocks(
page,
content_clusters: list[_BoxCluster], content_clusters: list[_BoxCluster],
caption_clusters: list[_BoxCluster], caption_blocks: list[_CaptionBlock],
) -> dict[int, tuple[_BoxCluster, str]]: ) -> dict[int, _CaptionBlock]:
"""Match each content cluster to its nearest same-type DocLayout caption.""" """每个内容块配方向上最近的同类型标题块。
matches: dict[int, tuple[_BoxCluster, str]] = {}
used_captions: set[int] = set() figure 标题惯例在下方、table 标题在上方;方向相符优先,不符加惩罚兜底
(跨页 / 异常排版)。按 (距离+惩罚) 升序贪心匹配,每个内容块与标题块唯一配对。
"""
candidates: list[tuple[float, int, int]] = [] candidates: list[tuple[float, int, int]] = []
for c_idx, content in enumerate(content_clusters):
for content_idx, content in enumerate(content_clusters): want_below = content.boxclass == "picture" # figure 标题在下
wanted_caption_class = _caption_class_for_content(content.boxclass) want_kind = "figure" if want_below else "table"
for caption_idx, caption in enumerate(caption_clusters): for b_idx, cap in enumerate(caption_blocks):
if caption.boxclass != wanted_caption_class: if cap.kind != want_kind:
continue continue
dist = _caption_distance(content, caption) cx0, cy0, cx1, cy1 = cap.bbox
if dist is not None: h_overlap = min(content.x1, cx1) - max(content.x0, cx0)
candidates.append((dist, content_idx, caption_idx)) min_width = min(content.x1 - content.x0, cx1 - cx0)
if min_width <= 0 or h_overlap < min_width * 0.25:
continue
if cy1 <= content.y0: # 标题在内容上方
side_below, v_gap = False, content.y0 - cy1
elif cy0 >= content.y1: # 标题在内容下方
side_below, v_gap = True, cy0 - content.y1
else:
continue # 重叠,跳过
if v_gap > _CAPTION_MATCH_DISTANCE:
continue
penalty = 0.0 if side_below == want_below else _CAPTION_WRONG_SIDE_PENALTY
candidates.append((v_gap + penalty, c_idx, b_idx))
for _dist, content_idx, caption_idx in sorted(candidates): matches: dict[int, _CaptionBlock] = {}
if content_idx in matches or caption_idx in used_captions: used: set[int] = set()
for _score, c_idx, b_idx in sorted(candidates):
if c_idx in matches or b_idx in used:
continue continue
text = _extract_caption_text(page, caption_clusters[caption_idx]) matches[c_idx] = caption_blocks[b_idx]
if not text: used.add(b_idx)
continue
matches[content_idx] = (caption_clusters[caption_idx], text)
used_captions.add(caption_idx)
return matches return matches
@@ -174,25 +215,27 @@ def _render_box(
filename: str, filename: str,
cap_type: str, cap_type: str,
page_num: int, page_num: int,
caption: _BoxCluster | None = None, caption_bbox: list[float] | None = None,
) -> bool: ) -> bool:
"""渲染单个 box 区域并保存 JPEG,成功返回 True。 """渲染单个 box 区域并保存 JPEG,成功返回 True。
若提供 caption,则将内容与 caption 区域合并后一起截取, 若提供 caption_bbox,则将内容与标题区域合并后一起截取,
使同一张截图同时包含图/表及其标题文字 使同一张截图同时包含图/表及其完整标题。
""" """
page_width = page.rect.width page_width = page.rect.width
page_height = page.rect.height
x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1 x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
if caption is not None: if caption_bbox is not None:
x0 = min(x0, caption.x0) cx0, cy0, cx1, cy1 = caption_bbox
y0 = min(y0, caption.y0) x0 = min(x0, cx0)
x1 = max(x1, caption.x1) y0 = min(y0, cy0)
y1 = max(y1, caption.y1) x1 = max(x1, cx1)
y1 = max(y1, cy1)
clip = pymupdf.Rect( clip = pymupdf.Rect(
max(0, x0 - _REGION_PADDING), max(0, x0 - _REGION_PADDING),
max(0, y0 - _REGION_PADDING), max(0, y0 - _REGION_PADDING),
min(page_width, x1 + _REGION_PADDING), min(page_width, x1 + _REGION_PADDING),
y1 + _REGION_PADDING, min(page_height, y1 + _REGION_PADDING),
) )
mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM) mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
try: try:
@@ -200,7 +243,7 @@ def _render_box(
except Exception: except Exception:
return False return False
(images_dest / filename).write_bytes(pix.tobytes("jpeg")) (images_dest / filename).write_bytes(pix.tobytes("jpeg", jpg_quality=92))
return True return True
@@ -213,77 +256,62 @@ def _process_page(
seen_labels: set, seen_labels: set,
arxiv_id: str, arxiv_id: str,
) -> int: ) -> int:
"""处理单页:检测 → 聚类 → 渲染,全部用通用标签。""" """处理单页:检测内容 box → 文本定位 caption渲染配到标题的。
配到 Figure/Table caption 的 box 用 caption 自带 ID 命名(figure_3.jpg);
没配到标题的(Algorithm 伪代码、无编号附录表、误检碎片)一律过滤,不输出。
"""
page = doc[page_idx] page = doc[page_idx]
page_num = page_idx + 1 page_num = page_idx + 1
fig_counter = 0
tbl_counter = 0
# 收集本页的 table/picture box 与 caption box(跳过极小区域 # 收集本页 figure/table 内容 box(跳过极小区域;caption 改由文本定位,不收 box
raw_boxes = [] raw_boxes = []
raw_caption_boxes = []
for box in page_boxes: for box in page_boxes:
w = box.x1 - box.x0 if box.boxclass in ("table", "picture"):
h = box.y1 - box.y0 w = box.x1 - box.x0
if box.boxclass in ("table", "table-fallback", "picture"): h = box.y1 - box.y0
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA: if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
continue continue
raw_boxes.append(box) raw_boxes.append(box)
elif box.boxclass in ("figure_caption", "table_caption"):
if w < 30 or h < 6:
continue
raw_caption_boxes.append(box)
if not raw_boxes: if not raw_boxes:
return 0 return 0
# 聚类:将同一 figure/table 的碎片 box 合并 # 聚类:将同一 figure/table 的碎片 box 合并;用 PDF 文本定位 caption
clusters = _cluster_boxes(raw_boxes) clusters = _cluster_boxes(raw_boxes)
caption_clusters = _cluster_boxes(raw_caption_boxes) caption_blocks = _find_caption_blocks(page)
caption_matches = _match_captions(page, clusters, caption_clusters) caption_matches = _pair_caption_blocks(clusters, caption_blocks)
extracted = 0 extracted = 0
for cluster_idx, cluster in enumerate(clusters): for cluster_idx, cluster in enumerate(clusters):
cap_type = "figure" if cluster.boxclass == "picture" else "table" cap_match = caption_matches.get(cluster_idx)
if cap_match is None:
continue # 无 Figure/Table 标题 → 过滤(Algorithm、无编号表、误检碎片)
if cap_match.id in seen_labels:
continue # 同一图表被 DocLayout 切成多块重复检测,跳过后续
seen_labels.add(cap_match.id)
if cap_type == "figure": filename = f"{cap_match.id.replace(' ', '_').lower()}.jpg"
fig_counter += 1
label = f"Figure (p{page_num}-{fig_counter})"
else:
tbl_counter += 1
label = f"Table (p{page_num}-{tbl_counter})"
if label in seen_labels:
continue
seen_labels.add(label)
caption_match = caption_matches.get(cluster_idx)
caption_cluster = caption_match[0] if caption_match else None
filename = f"{label.replace(' ', '_').lower()}.jpg"
if not _render_box( if not _render_box(
page, page,
cluster, cluster,
images_dest, images_dest,
filename, filename,
cap_type, cap_match.kind,
page_num, page_num,
caption=caption_cluster, caption_bbox=cap_match.bbox,
): ):
continue continue
info = { manifest[filename] = {
"page": page_num, "page": page_num,
"type": cap_type, "type": cap_match.kind,
"label": label, "label": cap_match.id,
"box": _cluster_to_box(cluster), "box": _cluster_to_box(cluster),
"caption_text": cap_match.text[:500],
"caption_box": cap_match.bbox,
"caption_source": "text",
} }
if caption_match:
info["caption_text"] = caption_match[1][:500]
info["caption_box"] = _cluster_to_box(caption_cluster)
info["caption_source"] = "doclayout"
manifest[filename] = info
extracted += 1 extracted += 1
return extracted return extracted
@@ -359,230 +387,6 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
return extracted return extracted
# ── Phase 2: 用 summary 的 figures ID 定位并重命名 ─────────────────────
def _distance_text_to_box(rect: pymupdf.Rect, box: list[float]) -> float | None:
    """Return the vertical distance from a text-search hit to a box.

    The centre of ``rect`` must lie within the box's horizontal span (widened
    by 20 on each side); otherwise the pair is rejected. The distance is how far
    the centre sits above or below the box, or 0 when it is level with it.

    Args:
        rect: rectangle of the text hit.
        box: ``[x0, y0, x1, y1]`` of the candidate box.

    Returns:
        The distance, or ``None`` when the horizontal check fails or the
        distance exceeds ``_LABEL_MATCH_DISTANCE``.
    """
    center_x = (rect.x0 + rect.x1) / 2
    center_y = (rect.y0 + rect.y1) / 2
    left, top, right, bottom = box

    # Horizontal gate: the hit's centre must be inside (or close to) the box.
    horizontally_close = left - 20 <= center_x <= right + 20
    if not horizontally_close:
        return None

    # Vertical gap between the hit's centre and the nearest box edge.
    if center_y < top:
        gap = top - center_y
    elif center_y > bottom:
        gap = center_y - bottom
    else:
        gap = 0

    if gap <= _LABEL_MATCH_DISTANCE:
        return gap
    return None
def _search_variants(fig_id: str) -> list[str]:
"""为 figure/table ID 生成搜索变体。
"Figure 1" → ["Figure 1", "Fig. 1", "Fig 1"]
"Fig. 1" → ["Fig. 1", "Figure 1", "Fig 1"]
"Table A1" → ["Table A1"]
"""
variants = [fig_id]
m = re.match(r"(Fig\.?|Figure)\s+(\d+.*)", fig_id, re.IGNORECASE)
if m:
num_part = m.group(2)
variants.extend(
[
f"Figure {num_part}",
f"Fig. {num_part}",
f"Fig {num_part}",
]
)
# 去重保序
seen = set()
result = []
for v in variants:
if v not in seen:
seen.add(v)
result.append(v)
return result
def label_images_by_summary(
arxiv_id: str,
figures: list[dict],
pdf_path: Path | None = None,
) -> int:
"""Phase 2: 用 summary 的 figures ID 在 PDF 中搜索定位,重命名图片。
对 summary 中的每个 figure/table ID
1. page.search_for(id) 在所有页面搜索文本位置
2. 计算搜索位置与 manifest 中 box 坐标的距离
3. 最近匹配 → 重命名文件、更新 manifest
Args:
arxiv_id: 论文 ID
figures: summary 的 figures 列表,每项含 id/caption/description 等
pdf_path: PDF 路径
Returns:
成功重命名的图片数量
"""
if not figures:
return 0
if pdf_path is None:
pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
if not pdf_path.exists():
return 0
images_dest = paper_dir(arxiv_id) / "images"
manifest_path = images_dest / "manifest.json"
if not manifest_path.exists():
return 0
manifest: dict[str, dict] = json.loads(manifest_path.read_text(encoding="utf-8"))
if not manifest:
return 0
# 构建候选列表:只对通用标签的条目做匹配
candidates: dict[str, dict] = {} # filename → {page, box, ...}
for fname, info in manifest.items():
if "(p" in info.get("label", ""):
candidates[fname] = info
if not candidates:
return 0
with pymupdf.open(str(pdf_path)) as doc:
# 收集所有匹配候选:(fig_id, fig_index, filename, distance)
matches: list[tuple[str, int, str, float]] = []
for fig_idx, fig in enumerate(figures):
fig_id = fig.get("id", "")
if not fig_id:
continue
# 生成搜索变体:Figure 1 / Fig. 1 / Fig 1 等
search_terms = _search_variants(fig_id)
# 在所有页面搜索该文本(含变体)
search_hits: list[tuple[int, pymupdf.Rect]] = [] # (page_num_1based, Rect)
for page_idx in range(doc.page_count):
page = doc[page_idx]
seen_rects: set[tuple[float, float]] = set()
for term in search_terms:
for r in page.search_for(term):
key = (round(r.x0, 1), round(r.y0, 1))
if key not in seen_rects:
seen_rects.add(key)
search_hits.append((page_idx + 1, r))
if not search_hits:
continue
# 对每个候选 manifest 条目,找最近的搜索命中
for fname, info in candidates.items():
box = info.get("box")
if not box:
continue
manifest_page = info.get("page", 0)
best_dist: float | None = None
for hit_page, rect in search_hits:
# 只匹配同页面
if hit_page != manifest_page:
continue
dist = _distance_text_to_box(rect, box)
if dist is not None and (best_dist is None or dist < best_dist):
best_dist = dist
if best_dist is not None:
matches.append((fig_id, fig_idx, fname, best_dist))
if not matches:
logger.info("No label matches for %s", arxiv_id)
return 0
# 去冲突:按距离排序,每个 fig_id 和每个 filename 只匹配一次
matches.sort(key=lambda x: x[3])
used_fig_ids: set[int] = set()
used_filenames: set[str] = set()
renames: list[tuple[str, str, str]] = [] # (old_fname, new_fname, fig_id)
for fig_id, fig_idx, fname, dist in matches:
if fig_idx in used_fig_ids or fname in used_filenames:
continue
used_fig_ids.add(fig_idx)
used_filenames.add(fname)
new_fname = f"{fig_id.replace(' ', '_').lower()}.jpg"
renames.append((fname, new_fname, fig_id))
# 执行重命名
labeled = 0
new_manifest: dict[str, dict] = {}
for fname, info in manifest.items():
if fname in used_filenames:
continue
# 未匹配的保持原样
new_manifest[fname] = info
for old_fname, new_fname, fig_id in renames:
old_path = images_dest / old_fname
new_path = images_dest / new_fname
if not old_path.exists():
continue
# 搬运 manifest 信息
info = manifest[old_fname].copy()
cap_type = info.get("type", "figure")
# 读取 caption 文本(从 figures 列表)
summary_caption_text = ""
for fig in figures:
if fig.get("id") == fig_id:
summary_caption_text = fig.get("caption", "")
break
info["label"] = fig_id
existing_caption_text = info.get("caption_text", "")
if existing_caption_text and summary_caption_text:
info["summary_caption_text"] = summary_caption_text[:500]
else:
info["caption_text"] = (
summary_caption_text[:500] if summary_caption_text else ""
)
info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
fig_id
)
# 重命名文件
if new_fname != old_fname:
old_path.rename(new_path)
new_manifest[new_fname] = info
labeled += 1
# 写回 manifest
manifest_path.write_text(json.dumps(new_manifest, ensure_ascii=False, indent=2))
logger.info(
"Labeled %d/%d images for %s using summary figures",
labeled,
len(manifest),
arxiv_id,
)
return labeled
# ── Figure ↔ Image 关联 ──────────────────────────────────────────────── # ── Figure ↔ Image 关联 ────────────────────────────────────────────────
+4 -9
View File
@@ -201,20 +201,15 @@ def _cleanup_old_images(db: Session, paper: Paper) -> None:
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None: def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
"""从 PDF 提取图片和表格(失败不影响总结)。 """从 PDF 提取图片和表格(失败不影响总结)。
两阶段流水线: DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本定位 caption → 只渲染
1. DocLayout-YOLO 检测 + 渲染截图(通用标签) 配到 Figure/Table 标题的(Algorithm、无编号附录表、误检碎片一律过滤)。
2. 用 summary 的 figures ID 在 PDF 中搜索定位 → 重命名 标题来源已切换为 PDF 文本,schema.figures 不再参与命名,参数保留备用。
""" """
try: try:
from app.services.pdf_image_extractor import ( from app.services.pdf_image_extractor import extract_images_from_pdf
extract_images_from_pdf,
label_images_by_summary,
)
pdf_path = TMP_DIR / arxiv_id / "paper.pdf" pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
extract_images_from_pdf(arxiv_id, pdf_path) extract_images_from_pdf(arxiv_id, pdf_path)
if schema.figures:
label_images_by_summary(arxiv_id, schema.figures, pdf_path)
except Exception: except Exception:
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True) logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
+92 -56
View File
@@ -1,6 +1,5 @@
from __future__ import annotations from __future__ import annotations
import json
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pymupdf import pymupdf
@@ -9,7 +8,17 @@ from app.services import pdf_image_extractor as mod
from app.services.layout_detector import LayoutBox from app.services.layout_detector import LayoutBox
def test_process_page_extracts_doclayout_caption(tmp_path): def _caption_block(bbox, text):
"""构造一个 page.get_text("dict") 风格的文本块。"""
return {
"type": 0,
"bbox": list(bbox),
"lines": [{"spans": [{"text": text}]}],
}
def test_process_page_pairs_caption_from_text(tmp_path):
"""caption 来自 PDF 文本流(figure 标题在内容下方),用其 ID 直接命名。"""
images_dest = tmp_path / "images" images_dest = tmp_path / "images"
images_dest.mkdir() images_dest.mkdir()
manifest: dict[str, dict] = {} manifest: dict[str, dict] = {}
@@ -19,16 +28,18 @@ def test_process_page_extracts_doclayout_caption(tmp_path):
page = MagicMock() page = MagicMock()
page.rect.width = 600 page.rect.width = 600
page.rect.height = 800
page.get_pixmap.return_value = pix page.get_pixmap.return_value = pix
page.get_text.return_value = "Figure 1: Overall architecture.\n" page.get_text.return_value = {
"blocks": [
_caption_block((95, 310, 320, 325), "Figure 1: Overall architecture.")
]
}
doc = MagicMock() doc = MagicMock()
doc.__getitem__.return_value = page doc.__getitem__.return_value = page
boxes = [ boxes = [LayoutBox(100, 100, 300, 300, "picture")]
LayoutBox(100, 100, 300, 300, "picture"),
LayoutBox(95, 310, 320, 325, "figure_caption"),
]
extracted = mod._process_page( extracted = mod._process_page(
doc, doc,
@@ -41,14 +52,15 @@ def test_process_page_extracts_doclayout_caption(tmp_path):
) )
assert extracted == 1 assert extracted == 1
info = manifest["figure_(p1-1).jpg"] # caption 自带 ID → 直接命名 figure_1.jpg
info = manifest["figure_1.jpg"]
assert info["label"] == "Figure 1"
assert info["caption_text"] == "Figure 1: Overall architecture." assert info["caption_text"] == "Figure 1: Overall architecture."
assert info["caption_source"] == "doclayout" assert info["caption_source"] == "text"
assert info["caption_box"] == [95.0, 310.0, 320.0, 325.0]
def test_process_page_includes_caption_in_render(tmp_path): def test_process_page_includes_caption_in_render(tmp_path):
"""渲染时把 caption 区域合并进同一张截图。""" """渲染时把 caption 文本块区域合并进同一张截图。"""
images_dest = tmp_path / "images" images_dest = tmp_path / "images"
images_dest.mkdir() images_dest.mkdir()
manifest: dict[str, dict] = {} manifest: dict[str, dict] = {}
@@ -58,16 +70,16 @@ def test_process_page_includes_caption_in_render(tmp_path):
page = MagicMock() page = MagicMock()
page.rect.width = 600 page.rect.width = 600
page.rect.height = 800
page.get_pixmap.return_value = pix page.get_pixmap.return_value = pix
page.get_text.return_value = "Figure 1: Caption text.\n" page.get_text.return_value = {
"blocks": [_caption_block((95, 310, 320, 325), "Figure 1: Caption text.")]
}
doc = MagicMock() doc = MagicMock()
doc.__getitem__.return_value = page doc.__getitem__.return_value = page
boxes = [ boxes = [LayoutBox(100, 100, 300, 300, "picture")]
LayoutBox(100, 100, 300, 300, "picture"),
LayoutBox(95, 310, 320, 325, "figure_caption"),
]
mod._process_page( mod._process_page(
doc, doc,
@@ -85,50 +97,74 @@ def test_process_page_includes_caption_in_render(tmp_path):
assert clip == pymupdf.Rect(90, 95, 325, 330) assert clip == pymupdf.Rect(90, 95, 325, 330)
def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch): def test_process_page_table_caption_above(tmp_path):
arxiv_id = "2401.00001" """table 标题惯例在内容上方,配对后命名 table_N.jpg。"""
paper_root = tmp_path / arxiv_id images_dest = tmp_path / "images"
images_dest = paper_root / "images" images_dest.mkdir()
images_dest.mkdir(parents=True) manifest: dict[str, dict] = {}
(images_dest / "figure_(p1-1).jpg").write_bytes(b"jpeg")
(images_dest / "manifest.json").write_text(
json.dumps(
{
"figure_(p1-1).jpg": {
"page": 1,
"type": "figure",
"label": "Figure (p1-1)",
"box": [100, 100, 300, 300],
"caption_text": "Figure 1: PDF original caption.",
"caption_source": "doclayout",
}
}
)
)
pdf_path = tmp_path / "paper.pdf" pix = MagicMock()
pdf_path.write_bytes(b"%PDF") pix.tobytes.return_value = b"jpeg"
monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)
page = MagicMock() page = MagicMock()
page.search_for.return_value = [pymupdf.Rect(120, 305, 180, 320)] page.rect.width = 600
page.rect.height = 800
page.get_pixmap.return_value = pix
# caption 在内容上方 [80, 90, 320, 105],内容表格 [80, 120, 320, 280]
page.get_text.return_value = {
"blocks": [_caption_block((80, 90, 320, 105), "Table 2 | Results summary.")]
}
fake_doc = MagicMock() doc = MagicMock()
fake_doc.page_count = 1 doc.__getitem__.return_value = page
fake_doc.__getitem__.return_value = page
fake_doc.__enter__.return_value = fake_doc
fake_doc.__exit__.return_value = False
monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)
labeled = mod.label_images_by_summary( boxes = [LayoutBox(80, 120, 320, 280, "table")]
arxiv_id,
[{"id": "Figure 1", "caption": "Summary caption."}], extracted = mod._process_page(
pdf_path=pdf_path, doc,
0,
boxes,
images_dest=images_dest,
manifest=manifest,
seen_labels=set(),
arxiv_id="2401.00001",
) )
assert labeled == 1 assert extracted == 1
manifest = json.loads((images_dest / "manifest.json").read_text()) info = manifest["table_2.jpg"]
info = manifest["figure_1.jpg"] assert info["label"] == "Table 2"
assert info["caption_text"] == "Figure 1: PDF original caption." assert info["caption_source"] == "text"
assert info["caption_source"] == "doclayout"
assert info["summary_caption_text"] == "Summary caption."
def test_process_page_filters_uncaptioned(tmp_path):
    """A box with no paired Figure/Table caption (Algorithm listings, unnumbered tables, ...) is filtered out."""
    out_dir = tmp_path / "images"
    out_dir.mkdir()
    collected: dict[str, dict] = {}

    fake_pixmap = MagicMock()
    fake_pixmap.tobytes.return_value = b"jpeg"

    fake_page = MagicMock()
    fake_page.rect.width = 600
    fake_page.rect.height = 800
    fake_page.get_pixmap.return_value = fake_pixmap
    # The page text contains no caption blocks at all.
    fake_page.get_text.return_value = {"blocks": []}

    fake_doc = MagicMock()
    fake_doc.__getitem__.return_value = fake_page

    layout_boxes = [LayoutBox(100, 100, 300, 300, "picture")]

    count = mod._process_page(
        fake_doc,
        0,
        layout_boxes,
        images_dest=out_dir,
        manifest=collected,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert count == 0
    assert collected == {}