refactor: replace Phase 2 label matching with PDF text-stream caption pairing
- Extract captions from PDF text dict instead of DocLayout caption boxes
- Use _CaptionBlock dataclass to carry authoritative ID, kind, text, bbox
- Pair captions to content boxes with directional preference (figure below, table above)
- Filter out uncaptioned boxes (Algorithm pseudo-code, unnumbered appendix tables, false positives)
- Remove label_images_by_summary and Phase 2 rename pipeline entirely
- Update tests to cover text-based caption pairing and filtering
This commit is contained in:
+140
-336
@@ -1,12 +1,13 @@
|
|||||||
"""PDF 图片与表格提取 — 两阶段流水线。
|
"""PDF 图片与表格提取。
|
||||||
|
|
||||||
Phase 1: DocLayout-YOLO 检测 figure/table 区域 → 渲染为 JPEG(通用标签)
|
DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本流定位 caption → 只渲染配到
|
||||||
Phase 2: 用 LLM summary 的 figures[].id 在 PDF 中搜索定位 → 匹配到 box → 重命名
|
Figure/Table 标题的,用 caption 自带权威 ID 命名。没配到标题的(Algorithm 伪代码、
|
||||||
|
无编号附录表、DocLayout 误检碎片)一律过滤,不输出。
|
||||||
|
|
||||||
相比旧方案(正则匹配 caption):
|
caption 定位用 PDF 文本而非 DocLayout 的 caption box —— 后者检测不稳(多行标题只
|
||||||
- 不再依赖正则,用 LLM 输出的 ID 直接搜索 PDF 文本
|
框一行→截断、漏检→无标题、配对错误→串台)。page.get_text("dict") 找以
|
||||||
- page.search_for() 精确搜索 + 空间距离过滤,避免正文引用误匹配
|
"Figure N"/"Table N" 开头的文本块:文本块天然含完整多行标题,且其 ID 即论文实际
|
||||||
- 通用标签兜底,LLM 没提到的图表不会被丢弃
|
编号,直接命名规避串台。figure 标题优先在下方、table 标题优先在上方配对。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -14,6 +15,7 @@ from __future__ import annotations
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pymupdf
|
import pymupdf
|
||||||
@@ -32,10 +34,16 @@ _RENDER_ZOOM = 3
|
|||||||
_CLUSTER_GAP = 15
|
_CLUSTER_GAP = 15
|
||||||
# 最小 bbox 面积(单位: pt²)— 过滤 icon/logo 等微小误检
|
# 最小 bbox 面积(单位: pt²)— 过滤 icon/logo 等微小误检
|
||||||
_MIN_BOX_AREA = 2000
|
_MIN_BOX_AREA = 2000
|
||||||
# Phase 2: 搜索文本到 box 的最大匹配距离(单位: pt)
|
# caption 文本块与 figure/table 内容块的最大垂直距离(单位: pt)
|
||||||
_LABEL_MATCH_DISTANCE = 100
|
|
||||||
# DocLayout caption 与 figure/table 匹配的最大距离(单位: pt)
|
|
||||||
_CAPTION_MATCH_DISTANCE = 120
|
_CAPTION_MATCH_DISTANCE = 120
|
||||||
|
# 方向不符(figure 标题在上 / table 标题在下)的配对惩罚分(仍允许,兜底异常排版)
|
||||||
|
_CAPTION_WRONG_SIDE_PENALTY = 300
|
||||||
|
# caption 开头标记:Figure 3 / Fig. 3 / Table C1 / Figure 3.5 等(大小写均可)
|
||||||
|
# 编号 = 数字开头 或 字母+数字(附录 C1);行首匹配,规避正文 "see Table 3" 引用
|
||||||
|
_CAPTION_HEAD_RE = re.compile(
|
||||||
|
r"^\s*(Figure|Fig\.?|Table)\b\.?\s+([0-9][0-9A-Za-z.]*|[A-Z]\d[0-9A-Za-z.]*)",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ── Box 聚类 ─────────────────────────────────────────────────────────
|
# ── Box 聚类 ─────────────────────────────────────────────────────────
|
||||||
@@ -51,8 +59,17 @@ class _BoxCluster:
|
|||||||
self.y0 = min(b.y0 for b in boxes)
|
self.y0 = min(b.y0 for b in boxes)
|
||||||
self.x1 = max(b.x1 for b in boxes)
|
self.x1 = max(b.x1 for b in boxes)
|
||||||
self.y1 = max(b.y1 for b in boxes)
|
self.y1 = max(b.y1 for b in boxes)
|
||||||
raw = boxes[0].boxclass
|
self.boxclass = boxes[0].boxclass
|
||||||
self.boxclass = "table" if raw == "table-fallback" else raw
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class _CaptionBlock:
|
||||||
|
"""从 PDF 文本流提取的标题块:自带权威 ID、完整多行文本、精确 bbox。"""
|
||||||
|
|
||||||
|
id: str # "Figure 3" / "Table C1"
|
||||||
|
kind: str # "figure" | "table"
|
||||||
|
text: str # 完整多行标题文本
|
||||||
|
bbox: list[float] # [x0, y0, x1, y1]
|
||||||
|
|
||||||
|
|
||||||
def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
|
def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
|
||||||
@@ -103,64 +120,88 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
|
|||||||
return [_BoxCluster(members) for members in groups.values()]
|
return [_BoxCluster(members) for members in groups.values()]
|
||||||
|
|
||||||
|
|
||||||
def _caption_class_for_content(boxclass: str) -> str:
|
def _find_caption_blocks(page) -> list[_CaptionBlock]:
|
||||||
return "figure_caption" if boxclass == "picture" else "table_caption"
|
"""从页面文本流提取以 "Figure N"/"Table N"/"Fig. N" 开头的标题块。
|
||||||
|
|
||||||
|
用 PDF 文本而非 DocLayout caption box:文本块天然含完整多行标题,
|
||||||
def _caption_distance(content: _BoxCluster, caption: _BoxCluster) -> float | None:
|
且其 ID 即论文实际编号(如 "Table C1"),权威且不依赖模型检测。
|
||||||
"""Return a spatial score for pairing a caption with a content box."""
|
"""
|
||||||
h_overlap = min(content.x1, caption.x1) - max(content.x0, caption.x0)
|
|
||||||
min_width = min(content.x1 - content.x0, caption.x1 - caption.x0)
|
|
||||||
if min_width <= 0 or h_overlap < min_width * 0.25:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if caption.y1 < content.y0:
|
|
||||||
v_gap = content.y0 - caption.y1
|
|
||||||
elif caption.y0 > content.y1:
|
|
||||||
v_gap = caption.y0 - content.y1
|
|
||||||
else:
|
|
||||||
v_gap = 0.0
|
|
||||||
|
|
||||||
return v_gap if v_gap <= _CAPTION_MATCH_DISTANCE else None
|
|
||||||
|
|
||||||
|
|
||||||
def _extract_caption_text(page, caption: _BoxCluster) -> str:
|
|
||||||
rect = pymupdf.Rect(caption.x0, caption.y0, caption.x1, caption.y1)
|
|
||||||
try:
|
try:
|
||||||
text = page.get_text("text", clip=rect)
|
d = page.get_text("dict")
|
||||||
except Exception:
|
except Exception:
|
||||||
return ""
|
return []
|
||||||
return " ".join(text.split())
|
|
||||||
|
results: list[_CaptionBlock] = []
|
||||||
|
for block in d.get("blocks", []):
|
||||||
|
if block.get("type") != 0: # 仅文本块
|
||||||
|
continue
|
||||||
|
lines = block.get("lines", [])
|
||||||
|
if not lines:
|
||||||
|
continue
|
||||||
|
line_texts = [
|
||||||
|
"".join(span.get("text", "") for span in line.get("spans", []))
|
||||||
|
for line in lines
|
||||||
|
]
|
||||||
|
first_line = next((t for t in line_texts if t.strip()), "")
|
||||||
|
m = _CAPTION_HEAD_RE.match(first_line)
|
||||||
|
if not m:
|
||||||
|
continue
|
||||||
|
kind_word, num = m.group(1), m.group(2)
|
||||||
|
is_table = kind_word.lower().startswith("table")
|
||||||
|
bbox = block.get("bbox")
|
||||||
|
if not bbox or len(bbox) != 4:
|
||||||
|
continue
|
||||||
|
full_text = " ".join(t.strip() for t in line_texts if t.strip())
|
||||||
|
results.append(
|
||||||
|
_CaptionBlock(
|
||||||
|
id=f"{'Table' if is_table else 'Figure'} {num}",
|
||||||
|
kind="table" if is_table else "figure",
|
||||||
|
text=full_text,
|
||||||
|
bbox=[float(v) for v in bbox],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
def _match_captions(
|
def _pair_caption_blocks(
|
||||||
page,
|
|
||||||
content_clusters: list[_BoxCluster],
|
content_clusters: list[_BoxCluster],
|
||||||
caption_clusters: list[_BoxCluster],
|
caption_blocks: list[_CaptionBlock],
|
||||||
) -> dict[int, tuple[_BoxCluster, str]]:
|
) -> dict[int, _CaptionBlock]:
|
||||||
"""Match each content cluster to its nearest same-type DocLayout caption."""
|
"""每个内容块配方向上最近的同类型标题块。
|
||||||
matches: dict[int, tuple[_BoxCluster, str]] = {}
|
|
||||||
used_captions: set[int] = set()
|
figure 标题惯例在下方、table 标题在上方;方向相符优先,不符加惩罚兜底
|
||||||
|
(跨页 / 异常排版)。按 (距离+惩罚) 升序贪心匹配,每个内容块与标题块唯一配对。
|
||||||
|
"""
|
||||||
candidates: list[tuple[float, int, int]] = []
|
candidates: list[tuple[float, int, int]] = []
|
||||||
|
for c_idx, content in enumerate(content_clusters):
|
||||||
for content_idx, content in enumerate(content_clusters):
|
want_below = content.boxclass == "picture" # figure 标题在下
|
||||||
wanted_caption_class = _caption_class_for_content(content.boxclass)
|
want_kind = "figure" if want_below else "table"
|
||||||
for caption_idx, caption in enumerate(caption_clusters):
|
for b_idx, cap in enumerate(caption_blocks):
|
||||||
if caption.boxclass != wanted_caption_class:
|
if cap.kind != want_kind:
|
||||||
continue
|
continue
|
||||||
dist = _caption_distance(content, caption)
|
cx0, cy0, cx1, cy1 = cap.bbox
|
||||||
if dist is not None:
|
h_overlap = min(content.x1, cx1) - max(content.x0, cx0)
|
||||||
candidates.append((dist, content_idx, caption_idx))
|
min_width = min(content.x1 - content.x0, cx1 - cx0)
|
||||||
|
if min_width <= 0 or h_overlap < min_width * 0.25:
|
||||||
|
continue
|
||||||
|
if cy1 <= content.y0: # 标题在内容上方
|
||||||
|
side_below, v_gap = False, content.y0 - cy1
|
||||||
|
elif cy0 >= content.y1: # 标题在内容下方
|
||||||
|
side_below, v_gap = True, cy0 - content.y1
|
||||||
|
else:
|
||||||
|
continue # 重叠,跳过
|
||||||
|
if v_gap > _CAPTION_MATCH_DISTANCE:
|
||||||
|
continue
|
||||||
|
penalty = 0.0 if side_below == want_below else _CAPTION_WRONG_SIDE_PENALTY
|
||||||
|
candidates.append((v_gap + penalty, c_idx, b_idx))
|
||||||
|
|
||||||
for _dist, content_idx, caption_idx in sorted(candidates):
|
matches: dict[int, _CaptionBlock] = {}
|
||||||
if content_idx in matches or caption_idx in used_captions:
|
used: set[int] = set()
|
||||||
|
for _score, c_idx, b_idx in sorted(candidates):
|
||||||
|
if c_idx in matches or b_idx in used:
|
||||||
continue
|
continue
|
||||||
text = _extract_caption_text(page, caption_clusters[caption_idx])
|
matches[c_idx] = caption_blocks[b_idx]
|
||||||
if not text:
|
used.add(b_idx)
|
||||||
continue
|
|
||||||
matches[content_idx] = (caption_clusters[caption_idx], text)
|
|
||||||
used_captions.add(caption_idx)
|
|
||||||
|
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|
||||||
@@ -174,25 +215,27 @@ def _render_box(
|
|||||||
filename: str,
|
filename: str,
|
||||||
cap_type: str,
|
cap_type: str,
|
||||||
page_num: int,
|
page_num: int,
|
||||||
caption: _BoxCluster | None = None,
|
caption_bbox: list[float] | None = None,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""渲染单个 box 区域并保存 JPEG,成功返回 True。
|
"""渲染单个 box 区域并保存 JPEG,成功返回 True。
|
||||||
|
|
||||||
若提供 caption,则将内容与 caption 区域合并后一起截取,
|
若提供 caption_bbox,则将内容与标题区域合并后一起截取,
|
||||||
使同一张截图同时包含图/表及其标题文字。
|
使同一张截图同时包含图/表及其完整标题。
|
||||||
"""
|
"""
|
||||||
page_width = page.rect.width
|
page_width = page.rect.width
|
||||||
|
page_height = page.rect.height
|
||||||
x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
|
x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
|
||||||
if caption is not None:
|
if caption_bbox is not None:
|
||||||
x0 = min(x0, caption.x0)
|
cx0, cy0, cx1, cy1 = caption_bbox
|
||||||
y0 = min(y0, caption.y0)
|
x0 = min(x0, cx0)
|
||||||
x1 = max(x1, caption.x1)
|
y0 = min(y0, cy0)
|
||||||
y1 = max(y1, caption.y1)
|
x1 = max(x1, cx1)
|
||||||
|
y1 = max(y1, cy1)
|
||||||
clip = pymupdf.Rect(
|
clip = pymupdf.Rect(
|
||||||
max(0, x0 - _REGION_PADDING),
|
max(0, x0 - _REGION_PADDING),
|
||||||
max(0, y0 - _REGION_PADDING),
|
max(0, y0 - _REGION_PADDING),
|
||||||
min(page_width, x1 + _REGION_PADDING),
|
min(page_width, x1 + _REGION_PADDING),
|
||||||
y1 + _REGION_PADDING,
|
min(page_height, y1 + _REGION_PADDING),
|
||||||
)
|
)
|
||||||
mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
|
mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
|
||||||
try:
|
try:
|
||||||
@@ -200,7 +243,7 @@ def _render_box(
|
|||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
(images_dest / filename).write_bytes(pix.tobytes("jpeg"))
|
(images_dest / filename).write_bytes(pix.tobytes("jpeg", jpg_quality=92))
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
@@ -213,77 +256,62 @@ def _process_page(
|
|||||||
seen_labels: set,
|
seen_labels: set,
|
||||||
arxiv_id: str,
|
arxiv_id: str,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""处理单页:检测 → 聚类 → 渲染,全部用通用标签。"""
|
"""处理单页:检测内容 box → 文本定位 caption → 只渲染配到标题的。
|
||||||
|
|
||||||
|
配到 Figure/Table caption 的 box 用 caption 自带 ID 命名(figure_3.jpg);
|
||||||
|
没配到标题的(Algorithm 伪代码、无编号附录表、误检碎片)一律过滤,不输出。
|
||||||
|
"""
|
||||||
page = doc[page_idx]
|
page = doc[page_idx]
|
||||||
page_num = page_idx + 1
|
page_num = page_idx + 1
|
||||||
fig_counter = 0
|
|
||||||
tbl_counter = 0
|
|
||||||
|
|
||||||
# 收集本页的 table/picture box 与 caption box(跳过极小区域)
|
# 收集本页 figure/table 内容 box(跳过极小区域;caption 改由文本定位,不收 box)
|
||||||
raw_boxes = []
|
raw_boxes = []
|
||||||
raw_caption_boxes = []
|
|
||||||
for box in page_boxes:
|
for box in page_boxes:
|
||||||
w = box.x1 - box.x0
|
if box.boxclass in ("table", "picture"):
|
||||||
h = box.y1 - box.y0
|
w = box.x1 - box.x0
|
||||||
if box.boxclass in ("table", "table-fallback", "picture"):
|
h = box.y1 - box.y0
|
||||||
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
|
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
|
||||||
continue
|
continue
|
||||||
raw_boxes.append(box)
|
raw_boxes.append(box)
|
||||||
elif box.boxclass in ("figure_caption", "table_caption"):
|
|
||||||
if w < 30 or h < 6:
|
|
||||||
continue
|
|
||||||
raw_caption_boxes.append(box)
|
|
||||||
|
|
||||||
if not raw_boxes:
|
if not raw_boxes:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# 聚类:将同一 figure/table 的碎片 box 合并
|
# 聚类:将同一 figure/table 的碎片 box 合并;用 PDF 文本定位 caption
|
||||||
clusters = _cluster_boxes(raw_boxes)
|
clusters = _cluster_boxes(raw_boxes)
|
||||||
caption_clusters = _cluster_boxes(raw_caption_boxes)
|
caption_blocks = _find_caption_blocks(page)
|
||||||
caption_matches = _match_captions(page, clusters, caption_clusters)
|
caption_matches = _pair_caption_blocks(clusters, caption_blocks)
|
||||||
|
|
||||||
extracted = 0
|
extracted = 0
|
||||||
for cluster_idx, cluster in enumerate(clusters):
|
for cluster_idx, cluster in enumerate(clusters):
|
||||||
cap_type = "figure" if cluster.boxclass == "picture" else "table"
|
cap_match = caption_matches.get(cluster_idx)
|
||||||
|
if cap_match is None:
|
||||||
|
continue # 无 Figure/Table 标题 → 过滤(Algorithm、无编号表、误检碎片)
|
||||||
|
if cap_match.id in seen_labels:
|
||||||
|
continue # 同一图表被 DocLayout 切成多块重复检测,跳过后续
|
||||||
|
seen_labels.add(cap_match.id)
|
||||||
|
|
||||||
if cap_type == "figure":
|
filename = f"{cap_match.id.replace(' ', '_').lower()}.jpg"
|
||||||
fig_counter += 1
|
|
||||||
label = f"Figure (p{page_num}-{fig_counter})"
|
|
||||||
else:
|
|
||||||
tbl_counter += 1
|
|
||||||
label = f"Table (p{page_num}-{tbl_counter})"
|
|
||||||
|
|
||||||
if label in seen_labels:
|
|
||||||
continue
|
|
||||||
seen_labels.add(label)
|
|
||||||
|
|
||||||
caption_match = caption_matches.get(cluster_idx)
|
|
||||||
caption_cluster = caption_match[0] if caption_match else None
|
|
||||||
|
|
||||||
filename = f"{label.replace(' ', '_').lower()}.jpg"
|
|
||||||
if not _render_box(
|
if not _render_box(
|
||||||
page,
|
page,
|
||||||
cluster,
|
cluster,
|
||||||
images_dest,
|
images_dest,
|
||||||
filename,
|
filename,
|
||||||
cap_type,
|
cap_match.kind,
|
||||||
page_num,
|
page_num,
|
||||||
caption=caption_cluster,
|
caption_bbox=cap_match.bbox,
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
info = {
|
manifest[filename] = {
|
||||||
"page": page_num,
|
"page": page_num,
|
||||||
"type": cap_type,
|
"type": cap_match.kind,
|
||||||
"label": label,
|
"label": cap_match.id,
|
||||||
"box": _cluster_to_box(cluster),
|
"box": _cluster_to_box(cluster),
|
||||||
|
"caption_text": cap_match.text[:500],
|
||||||
|
"caption_box": cap_match.bbox,
|
||||||
|
"caption_source": "text",
|
||||||
}
|
}
|
||||||
if caption_match:
|
|
||||||
info["caption_text"] = caption_match[1][:500]
|
|
||||||
info["caption_box"] = _cluster_to_box(caption_cluster)
|
|
||||||
info["caption_source"] = "doclayout"
|
|
||||||
|
|
||||||
manifest[filename] = info
|
|
||||||
extracted += 1
|
extracted += 1
|
||||||
|
|
||||||
return extracted
|
return extracted
|
||||||
@@ -359,230 +387,6 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
|||||||
return extracted
|
return extracted
|
||||||
|
|
||||||
|
|
||||||
# ── Phase 2: 用 summary 的 figures ID 定位并重命名 ─────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def _distance_text_to_box(rect: pymupdf.Rect, box: list[float]) -> float | None:
|
|
||||||
"""计算搜索到的文本 rect 到 box 的距离。超出阈值返回 None。
|
|
||||||
|
|
||||||
判断逻辑:rect 中心与 box 的垂直距离 + 水平重叠检查。
|
|
||||||
"""
|
|
||||||
rect_cx = (rect.x0 + rect.x1) / 2
|
|
||||||
rect_cy = (rect.y0 + rect.y1) / 2
|
|
||||||
bx0, by0, bx1, by1 = box
|
|
||||||
|
|
||||||
# 水平重叠:rect 中心在 box 水平范围内(或接近)
|
|
||||||
if not (bx0 - 20 <= rect_cx <= bx1 + 20):
|
|
||||||
return None
|
|
||||||
|
|
||||||
# 垂直距离
|
|
||||||
if rect_cy < by0:
|
|
||||||
dist = by0 - rect_cy
|
|
||||||
elif rect_cy > by1:
|
|
||||||
dist = rect_cy - by1
|
|
||||||
else:
|
|
||||||
dist = 0
|
|
||||||
|
|
||||||
return dist if dist <= _LABEL_MATCH_DISTANCE else None
|
|
||||||
|
|
||||||
|
|
||||||
def _search_variants(fig_id: str) -> list[str]:
|
|
||||||
"""为 figure/table ID 生成搜索变体。
|
|
||||||
|
|
||||||
"Figure 1" → ["Figure 1", "Fig. 1", "Fig 1"]
|
|
||||||
"Fig. 1" → ["Fig. 1", "Figure 1", "Fig 1"]
|
|
||||||
"Table A1" → ["Table A1"]
|
|
||||||
"""
|
|
||||||
variants = [fig_id]
|
|
||||||
|
|
||||||
m = re.match(r"(Fig\.?|Figure)\s+(\d+.*)", fig_id, re.IGNORECASE)
|
|
||||||
if m:
|
|
||||||
num_part = m.group(2)
|
|
||||||
variants.extend(
|
|
||||||
[
|
|
||||||
f"Figure {num_part}",
|
|
||||||
f"Fig. {num_part}",
|
|
||||||
f"Fig {num_part}",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# 去重保序
|
|
||||||
seen = set()
|
|
||||||
result = []
|
|
||||||
for v in variants:
|
|
||||||
if v not in seen:
|
|
||||||
seen.add(v)
|
|
||||||
result.append(v)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def label_images_by_summary(
|
|
||||||
arxiv_id: str,
|
|
||||||
figures: list[dict],
|
|
||||||
pdf_path: Path | None = None,
|
|
||||||
) -> int:
|
|
||||||
"""Phase 2: 用 summary 的 figures ID 在 PDF 中搜索定位,重命名图片。
|
|
||||||
|
|
||||||
对 summary 中的每个 figure/table ID:
|
|
||||||
1. page.search_for(id) 在所有页面搜索文本位置
|
|
||||||
2. 计算搜索位置与 manifest 中 box 坐标的距离
|
|
||||||
3. 最近匹配 → 重命名文件、更新 manifest
|
|
||||||
|
|
||||||
Args:
|
|
||||||
arxiv_id: 论文 ID
|
|
||||||
figures: summary 的 figures 列表,每项含 id/caption/description 等
|
|
||||||
pdf_path: PDF 路径
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
成功重命名的图片数量
|
|
||||||
"""
|
|
||||||
if not figures:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if pdf_path is None:
|
|
||||||
pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
|
|
||||||
if not pdf_path.exists():
|
|
||||||
return 0
|
|
||||||
|
|
||||||
images_dest = paper_dir(arxiv_id) / "images"
|
|
||||||
manifest_path = images_dest / "manifest.json"
|
|
||||||
if not manifest_path.exists():
|
|
||||||
return 0
|
|
||||||
|
|
||||||
manifest: dict[str, dict] = json.loads(manifest_path.read_text(encoding="utf-8"))
|
|
||||||
if not manifest:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
# 构建候选列表:只对通用标签的条目做匹配
|
|
||||||
candidates: dict[str, dict] = {} # filename → {page, box, ...}
|
|
||||||
for fname, info in manifest.items():
|
|
||||||
if "(p" in info.get("label", ""):
|
|
||||||
candidates[fname] = info
|
|
||||||
|
|
||||||
if not candidates:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
with pymupdf.open(str(pdf_path)) as doc:
|
|
||||||
# 收集所有匹配候选:(fig_id, fig_index, filename, distance)
|
|
||||||
matches: list[tuple[str, int, str, float]] = []
|
|
||||||
|
|
||||||
for fig_idx, fig in enumerate(figures):
|
|
||||||
fig_id = fig.get("id", "")
|
|
||||||
if not fig_id:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 生成搜索变体:Figure 1 / Fig. 1 / Fig 1 等
|
|
||||||
search_terms = _search_variants(fig_id)
|
|
||||||
|
|
||||||
# 在所有页面搜索该文本(含变体)
|
|
||||||
search_hits: list[tuple[int, pymupdf.Rect]] = [] # (page_num_1based, Rect)
|
|
||||||
for page_idx in range(doc.page_count):
|
|
||||||
page = doc[page_idx]
|
|
||||||
seen_rects: set[tuple[float, float]] = set()
|
|
||||||
for term in search_terms:
|
|
||||||
for r in page.search_for(term):
|
|
||||||
key = (round(r.x0, 1), round(r.y0, 1))
|
|
||||||
if key not in seen_rects:
|
|
||||||
seen_rects.add(key)
|
|
||||||
search_hits.append((page_idx + 1, r))
|
|
||||||
|
|
||||||
if not search_hits:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 对每个候选 manifest 条目,找最近的搜索命中
|
|
||||||
for fname, info in candidates.items():
|
|
||||||
box = info.get("box")
|
|
||||||
if not box:
|
|
||||||
continue
|
|
||||||
manifest_page = info.get("page", 0)
|
|
||||||
|
|
||||||
best_dist: float | None = None
|
|
||||||
for hit_page, rect in search_hits:
|
|
||||||
# 只匹配同页面
|
|
||||||
if hit_page != manifest_page:
|
|
||||||
continue
|
|
||||||
dist = _distance_text_to_box(rect, box)
|
|
||||||
if dist is not None and (best_dist is None or dist < best_dist):
|
|
||||||
best_dist = dist
|
|
||||||
|
|
||||||
if best_dist is not None:
|
|
||||||
matches.append((fig_id, fig_idx, fname, best_dist))
|
|
||||||
|
|
||||||
if not matches:
|
|
||||||
logger.info("No label matches for %s", arxiv_id)
|
|
||||||
return 0
|
|
||||||
|
|
||||||
# 去冲突:按距离排序,每个 fig_id 和每个 filename 只匹配一次
|
|
||||||
matches.sort(key=lambda x: x[3])
|
|
||||||
used_fig_ids: set[int] = set()
|
|
||||||
used_filenames: set[str] = set()
|
|
||||||
renames: list[tuple[str, str, str]] = [] # (old_fname, new_fname, fig_id)
|
|
||||||
|
|
||||||
for fig_id, fig_idx, fname, dist in matches:
|
|
||||||
if fig_idx in used_fig_ids or fname in used_filenames:
|
|
||||||
continue
|
|
||||||
used_fig_ids.add(fig_idx)
|
|
||||||
used_filenames.add(fname)
|
|
||||||
new_fname = f"{fig_id.replace(' ', '_').lower()}.jpg"
|
|
||||||
renames.append((fname, new_fname, fig_id))
|
|
||||||
|
|
||||||
# 执行重命名
|
|
||||||
labeled = 0
|
|
||||||
new_manifest: dict[str, dict] = {}
|
|
||||||
|
|
||||||
for fname, info in manifest.items():
|
|
||||||
if fname in used_filenames:
|
|
||||||
continue
|
|
||||||
# 未匹配的保持原样
|
|
||||||
new_manifest[fname] = info
|
|
||||||
|
|
||||||
for old_fname, new_fname, fig_id in renames:
|
|
||||||
old_path = images_dest / old_fname
|
|
||||||
new_path = images_dest / new_fname
|
|
||||||
if not old_path.exists():
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 搬运 manifest 信息
|
|
||||||
info = manifest[old_fname].copy()
|
|
||||||
cap_type = info.get("type", "figure")
|
|
||||||
|
|
||||||
# 读取 caption 文本(从 figures 列表)
|
|
||||||
summary_caption_text = ""
|
|
||||||
for fig in figures:
|
|
||||||
if fig.get("id") == fig_id:
|
|
||||||
summary_caption_text = fig.get("caption", "")
|
|
||||||
break
|
|
||||||
|
|
||||||
info["label"] = fig_id
|
|
||||||
existing_caption_text = info.get("caption_text", "")
|
|
||||||
if existing_caption_text and summary_caption_text:
|
|
||||||
info["summary_caption_text"] = summary_caption_text[:500]
|
|
||||||
else:
|
|
||||||
info["caption_text"] = (
|
|
||||||
summary_caption_text[:500] if summary_caption_text else ""
|
|
||||||
)
|
|
||||||
info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
|
|
||||||
fig_id
|
|
||||||
)
|
|
||||||
|
|
||||||
# 重命名文件
|
|
||||||
if new_fname != old_fname:
|
|
||||||
old_path.rename(new_path)
|
|
||||||
new_manifest[new_fname] = info
|
|
||||||
labeled += 1
|
|
||||||
|
|
||||||
# 写回 manifest
|
|
||||||
manifest_path.write_text(json.dumps(new_manifest, ensure_ascii=False, indent=2))
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"Labeled %d/%d images for %s using summary figures",
|
|
||||||
labeled,
|
|
||||||
len(manifest),
|
|
||||||
arxiv_id,
|
|
||||||
)
|
|
||||||
return labeled
|
|
||||||
|
|
||||||
|
|
||||||
# ── Figure ↔ Image 关联 ────────────────────────────────────────────────
|
# ── Figure ↔ Image 关联 ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -201,20 +201,15 @@ def _cleanup_old_images(db: Session, paper: Paper) -> None:
|
|||||||
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
|
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
|
||||||
"""从 PDF 提取图片和表格(失败不影响总结)。
|
"""从 PDF 提取图片和表格(失败不影响总结)。
|
||||||
|
|
||||||
两阶段流水线:
|
DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本定位 caption → 只渲染
|
||||||
1. DocLayout-YOLO 检测 + 渲染截图(通用标签)
|
配到 Figure/Table 标题的(Algorithm、无编号附录表、误检碎片一律过滤)。
|
||||||
2. 用 summary 的 figures ID 在 PDF 中搜索定位 → 重命名
|
标题来源已切换为 PDF 文本,schema.figures 不再参与命名,参数保留备用。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
from app.services.pdf_image_extractor import (
|
from app.services.pdf_image_extractor import extract_images_from_pdf
|
||||||
extract_images_from_pdf,
|
|
||||||
label_images_by_summary,
|
|
||||||
)
|
|
||||||
|
|
||||||
pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
|
pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
|
||||||
extract_images_from_pdf(arxiv_id, pdf_path)
|
extract_images_from_pdf(arxiv_id, pdf_path)
|
||||||
if schema.figures:
|
|
||||||
label_images_by_summary(arxiv_id, schema.figures, pdf_path)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
|
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
import pymupdf
|
import pymupdf
|
||||||
@@ -9,7 +8,17 @@ from app.services import pdf_image_extractor as mod
|
|||||||
from app.services.layout_detector import LayoutBox
|
from app.services.layout_detector import LayoutBox
|
||||||
|
|
||||||
|
|
||||||
def test_process_page_extracts_doclayout_caption(tmp_path):
|
def _caption_block(bbox, text):
|
||||||
|
"""构造一个 page.get_text("dict") 风格的文本块。"""
|
||||||
|
return {
|
||||||
|
"type": 0,
|
||||||
|
"bbox": list(bbox),
|
||||||
|
"lines": [{"spans": [{"text": text}]}],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_process_page_pairs_caption_from_text(tmp_path):
|
||||||
|
"""caption 来自 PDF 文本流(figure 标题在内容下方),用其 ID 直接命名。"""
|
||||||
images_dest = tmp_path / "images"
|
images_dest = tmp_path / "images"
|
||||||
images_dest.mkdir()
|
images_dest.mkdir()
|
||||||
manifest: dict[str, dict] = {}
|
manifest: dict[str, dict] = {}
|
||||||
@@ -19,16 +28,18 @@ def test_process_page_extracts_doclayout_caption(tmp_path):
|
|||||||
|
|
||||||
page = MagicMock()
|
page = MagicMock()
|
||||||
page.rect.width = 600
|
page.rect.width = 600
|
||||||
|
page.rect.height = 800
|
||||||
page.get_pixmap.return_value = pix
|
page.get_pixmap.return_value = pix
|
||||||
page.get_text.return_value = "Figure 1: Overall architecture.\n"
|
page.get_text.return_value = {
|
||||||
|
"blocks": [
|
||||||
|
_caption_block((95, 310, 320, 325), "Figure 1: Overall architecture.")
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
doc = MagicMock()
|
doc = MagicMock()
|
||||||
doc.__getitem__.return_value = page
|
doc.__getitem__.return_value = page
|
||||||
|
|
||||||
boxes = [
|
boxes = [LayoutBox(100, 100, 300, 300, "picture")]
|
||||||
LayoutBox(100, 100, 300, 300, "picture"),
|
|
||||||
LayoutBox(95, 310, 320, 325, "figure_caption"),
|
|
||||||
]
|
|
||||||
|
|
||||||
extracted = mod._process_page(
|
extracted = mod._process_page(
|
||||||
doc,
|
doc,
|
||||||
@@ -41,14 +52,15 @@ def test_process_page_extracts_doclayout_caption(tmp_path):
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert extracted == 1
|
assert extracted == 1
|
||||||
info = manifest["figure_(p1-1).jpg"]
|
# caption 自带 ID → 直接命名 figure_1.jpg
|
||||||
|
info = manifest["figure_1.jpg"]
|
||||||
|
assert info["label"] == "Figure 1"
|
||||||
assert info["caption_text"] == "Figure 1: Overall architecture."
|
assert info["caption_text"] == "Figure 1: Overall architecture."
|
||||||
assert info["caption_source"] == "doclayout"
|
assert info["caption_source"] == "text"
|
||||||
assert info["caption_box"] == [95.0, 310.0, 320.0, 325.0]
|
|
||||||
|
|
||||||
|
|
||||||
def test_process_page_includes_caption_in_render(tmp_path):
|
def test_process_page_includes_caption_in_render(tmp_path):
|
||||||
"""渲染时把 caption 区域合并进同一张截图。"""
|
"""渲染时把 caption 文本块区域合并进同一张截图。"""
|
||||||
images_dest = tmp_path / "images"
|
images_dest = tmp_path / "images"
|
||||||
images_dest.mkdir()
|
images_dest.mkdir()
|
||||||
manifest: dict[str, dict] = {}
|
manifest: dict[str, dict] = {}
|
||||||
@@ -58,16 +70,16 @@ def test_process_page_includes_caption_in_render(tmp_path):
|
|||||||
|
|
||||||
page = MagicMock()
|
page = MagicMock()
|
||||||
page.rect.width = 600
|
page.rect.width = 600
|
||||||
|
page.rect.height = 800
|
||||||
page.get_pixmap.return_value = pix
|
page.get_pixmap.return_value = pix
|
||||||
page.get_text.return_value = "Figure 1: Caption text.\n"
|
page.get_text.return_value = {
|
||||||
|
"blocks": [_caption_block((95, 310, 320, 325), "Figure 1: Caption text.")]
|
||||||
|
}
|
||||||
|
|
||||||
doc = MagicMock()
|
doc = MagicMock()
|
||||||
doc.__getitem__.return_value = page
|
doc.__getitem__.return_value = page
|
||||||
|
|
||||||
boxes = [
|
boxes = [LayoutBox(100, 100, 300, 300, "picture")]
|
||||||
LayoutBox(100, 100, 300, 300, "picture"),
|
|
||||||
LayoutBox(95, 310, 320, 325, "figure_caption"),
|
|
||||||
]
|
|
||||||
|
|
||||||
mod._process_page(
|
mod._process_page(
|
||||||
doc,
|
doc,
|
||||||
@@ -85,50 +97,74 @@ def test_process_page_includes_caption_in_render(tmp_path):
|
|||||||
assert clip == pymupdf.Rect(90, 95, 325, 330)
|
assert clip == pymupdf.Rect(90, 95, 325, 330)
|
||||||
|
|
||||||
|
|
||||||
def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch):
|
def test_process_page_table_caption_above(tmp_path):
|
||||||
arxiv_id = "2401.00001"
|
"""table 标题惯例在内容上方,配对后命名 table_N.jpg。"""
|
||||||
paper_root = tmp_path / arxiv_id
|
images_dest = tmp_path / "images"
|
||||||
images_dest = paper_root / "images"
|
images_dest.mkdir()
|
||||||
images_dest.mkdir(parents=True)
|
manifest: dict[str, dict] = {}
|
||||||
(images_dest / "figure_(p1-1).jpg").write_bytes(b"jpeg")
|
|
||||||
(images_dest / "manifest.json").write_text(
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"figure_(p1-1).jpg": {
|
|
||||||
"page": 1,
|
|
||||||
"type": "figure",
|
|
||||||
"label": "Figure (p1-1)",
|
|
||||||
"box": [100, 100, 300, 300],
|
|
||||||
"caption_text": "Figure 1: PDF original caption.",
|
|
||||||
"caption_source": "doclayout",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
pdf_path = tmp_path / "paper.pdf"
|
pix = MagicMock()
|
||||||
pdf_path.write_bytes(b"%PDF")
|
pix.tobytes.return_value = b"jpeg"
|
||||||
monkeypatch.setattr(mod, "paper_dir", lambda _arxiv_id: paper_root)
|
|
||||||
|
|
||||||
page = MagicMock()
|
page = MagicMock()
|
||||||
page.search_for.return_value = [pymupdf.Rect(120, 305, 180, 320)]
|
page.rect.width = 600
|
||||||
|
page.rect.height = 800
|
||||||
|
page.get_pixmap.return_value = pix
|
||||||
|
# caption 在内容上方 [80, 90, 320, 105],内容表格 [80, 120, 320, 280]
|
||||||
|
page.get_text.return_value = {
|
||||||
|
"blocks": [_caption_block((80, 90, 320, 105), "Table 2 | Results summary.")]
|
||||||
|
}
|
||||||
|
|
||||||
fake_doc = MagicMock()
|
doc = MagicMock()
|
||||||
fake_doc.page_count = 1
|
doc.__getitem__.return_value = page
|
||||||
fake_doc.__getitem__.return_value = page
|
|
||||||
fake_doc.__enter__.return_value = fake_doc
|
|
||||||
fake_doc.__exit__.return_value = False
|
|
||||||
monkeypatch.setattr(mod.pymupdf, "open", lambda _path: fake_doc)
|
|
||||||
|
|
||||||
labeled = mod.label_images_by_summary(
|
boxes = [LayoutBox(80, 120, 320, 280, "table")]
|
||||||
arxiv_id,
|
|
||||||
[{"id": "Figure 1", "caption": "Summary caption."}],
|
extracted = mod._process_page(
|
||||||
pdf_path=pdf_path,
|
doc,
|
||||||
|
0,
|
||||||
|
boxes,
|
||||||
|
images_dest=images_dest,
|
||||||
|
manifest=manifest,
|
||||||
|
seen_labels=set(),
|
||||||
|
arxiv_id="2401.00001",
|
||||||
)
|
)
|
||||||
|
|
||||||
assert labeled == 1
|
assert extracted == 1
|
||||||
manifest = json.loads((images_dest / "manifest.json").read_text())
|
info = manifest["table_2.jpg"]
|
||||||
info = manifest["figure_1.jpg"]
|
assert info["label"] == "Table 2"
|
||||||
assert info["caption_text"] == "Figure 1: PDF original caption."
|
assert info["caption_source"] == "text"
|
||||||
assert info["caption_source"] == "doclayout"
|
|
||||||
assert info["summary_caption_text"] == "Summary caption."
|
|
||||||
|
def test_process_page_filters_uncaptioned(tmp_path):
|
||||||
|
"""没有 Figure/Table caption 配对的 box(Algorithm、无编号表等)被过滤,不输出。"""
|
||||||
|
images_dest = tmp_path / "images"
|
||||||
|
images_dest.mkdir()
|
||||||
|
manifest: dict[str, dict] = {}
|
||||||
|
|
||||||
|
pix = MagicMock()
|
||||||
|
pix.tobytes.return_value = b"jpeg"
|
||||||
|
|
||||||
|
page = MagicMock()
|
||||||
|
page.rect.width = 600
|
||||||
|
page.rect.height = 800
|
||||||
|
page.get_pixmap.return_value = pix
|
||||||
|
page.get_text.return_value = {"blocks": []} # 无任何 caption 文本块
|
||||||
|
|
||||||
|
doc = MagicMock()
|
||||||
|
doc.__getitem__.return_value = page
|
||||||
|
|
||||||
|
boxes = [LayoutBox(100, 100, 300, 300, "picture")]
|
||||||
|
|
||||||
|
extracted = mod._process_page(
|
||||||
|
doc,
|
||||||
|
0,
|
||||||
|
boxes,
|
||||||
|
images_dest=images_dest,
|
||||||
|
manifest=manifest,
|
||||||
|
seen_labels=set(),
|
||||||
|
arxiv_id="2401.00001",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert extracted == 0
|
||||||
|
assert manifest == {}
|
||||||
|
|||||||
Reference in New Issue
Block a user