refactor: replace Phase 2 label matching with PDF text-stream caption pairing
- Extract captions from the PDF text dict instead of DocLayout caption boxes
- Use the _CaptionBlock dataclass to carry the authoritative ID, kind, text, and bbox
- Pair captions to content boxes with a directional preference (figure caption below, table caption above)
- Filter out uncaptioned boxes (Algorithm pseudo-code, unnumbered appendix tables, false positives)
- Remove label_images_by_summary and the Phase 2 rename pipeline entirely
- Update tests to cover text-based caption pairing and filtering
This commit is contained in:
+140
-336
@@ -1,12 +1,13 @@
|
||||
"""PDF 图片与表格提取 — 两阶段流水线。
|
||||
"""PDF 图片与表格提取。
|
||||
|
||||
Phase 1: DocLayout-YOLO 检测 figure/table 区域 → 渲染为 JPEG(通用标签)
|
||||
Phase 2: 用 LLM summary 的 figures[].id 在 PDF 中搜索定位 → 匹配到 box → 重命名
|
||||
DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本流定位 caption → 只渲染配到
|
||||
Figure/Table 标题的,用 caption 自带权威 ID 命名。没配到标题的(Algorithm 伪代码、
|
||||
无编号附录表、DocLayout 误检碎片)一律过滤,不输出。
|
||||
|
||||
相比旧方案(正则匹配 caption):
|
||||
- 不再依赖正则,用 LLM 输出的 ID 直接搜索 PDF 文本
|
||||
- page.search_for() 精确搜索 + 空间距离过滤,避免正文引用误匹配
|
||||
- 通用标签兜底,LLM 没提到的图表不会被丢弃
|
||||
caption 定位用 PDF 文本而非 DocLayout 的 caption box —— 后者检测不稳(多行标题只
|
||||
框一行→截断、漏检→无标题、配对错误→串台)。page.get_text("dict") 找以
|
||||
"Figure N"/"Table N" 开头的文本块:文本块天然含完整多行标题,且其 ID 即论文实际
|
||||
编号,直接命名规避串台。figure 标题优先在下方、table 标题优先在上方配对。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -14,6 +15,7 @@ from __future__ import annotations
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import pymupdf
|
||||
@@ -32,10 +34,16 @@ _RENDER_ZOOM = 3
|
||||
_CLUSTER_GAP = 15
|
||||
# 最小 bbox 面积(单位: pt²)— 过滤 icon/logo 等微小误检
|
||||
_MIN_BOX_AREA = 2000
|
||||
# Phase 2: 搜索文本到 box 的最大匹配距离(单位: pt)
|
||||
_LABEL_MATCH_DISTANCE = 100
|
||||
# DocLayout caption 与 figure/table 匹配的最大距离(单位: pt)
|
||||
# caption 文本块与 figure/table 内容块的最大垂直距离(单位: pt)
|
||||
_CAPTION_MATCH_DISTANCE = 120
|
||||
# 方向不符(figure 标题在上 / table 标题在下)的配对惩罚分(仍允许,兜底异常排版)
|
||||
_CAPTION_WRONG_SIDE_PENALTY = 300
|
||||
# caption 开头标记:Figure 3 / Fig. 3 / Table C1 / Figure 3.5 等(大小写均可)
|
||||
# 编号 = 数字开头 或 字母+数字(附录 C1);行首匹配,规避正文 "see Table 3" 引用
|
||||
_CAPTION_HEAD_RE = re.compile(
|
||||
r"^\s*(Figure|Fig\.?|Table)\b\.?\s+([0-9][0-9A-Za-z.]*|[A-Z]\d[0-9A-Za-z.]*)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
# ── Box 聚类 ─────────────────────────────────────────────────────────
|
||||
@@ -51,8 +59,17 @@ class _BoxCluster:
|
||||
self.y0 = min(b.y0 for b in boxes)
|
||||
self.x1 = max(b.x1 for b in boxes)
|
||||
self.y1 = max(b.y1 for b in boxes)
|
||||
raw = boxes[0].boxclass
|
||||
self.boxclass = "table" if raw == "table-fallback" else raw
|
||||
self.boxclass = boxes[0].boxclass
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class _CaptionBlock:
    """Caption block extracted from the PDF text stream.

    Carries its own authoritative ID, the complete multi-line caption text,
    and an exact bounding box.
    """

    id: str  # e.g. "Figure 3" / "Table C1"
    kind: str  # "figure" | "table"
    text: str  # complete multi-line caption text
    bbox: list[float]  # [x0, y0, x1, y1]
|
||||
|
||||
|
||||
def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
|
||||
@@ -103,64 +120,88 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
|
||||
return [_BoxCluster(members) for members in groups.values()]
|
||||
|
||||
|
||||
def _caption_class_for_content(boxclass: str) -> str:
|
||||
return "figure_caption" if boxclass == "picture" else "table_caption"
|
||||
def _find_caption_blocks(page) -> list[_CaptionBlock]:
|
||||
"""从页面文本流提取以 "Figure N"/"Table N"/"Fig. N" 开头的标题块。
|
||||
|
||||
|
||||
def _caption_distance(content: _BoxCluster, caption: _BoxCluster) -> float | None:
    """Score the spatial fit between a content box and a caption box.

    Returns the vertical edge-to-edge gap between the two boxes (``0.0`` when
    they overlap vertically), or ``None`` when the pair is rejected. A pair is
    rejected when the narrower box has non-positive width, when the horizontal
    overlap is less than 25% of the narrower box's width, or when the gap
    exceeds ``_CAPTION_MATCH_DISTANCE``.
    """
    narrower_width = min(content.x1 - content.x0, caption.x1 - caption.x0)
    shared_width = min(content.x1, caption.x1) - max(content.x0, caption.x0)
    if narrower_width <= 0 or shared_width < narrower_width * 0.25:
        return None

    if caption.y1 < content.y0:
        # Caption lies entirely above the content box.
        gap = content.y0 - caption.y1
    elif caption.y0 > content.y1:
        # Caption lies entirely below the content box.
        gap = caption.y0 - content.y1
    else:
        # Vertical ranges touch or overlap.
        gap = 0.0

    if gap <= _CAPTION_MATCH_DISTANCE:
        return gap
    return None
|
||||
|
||||
|
||||
def _extract_caption_text(page, caption: _BoxCluster) -> str:
|
||||
rect = pymupdf.Rect(caption.x0, caption.y0, caption.x1, caption.y1)
|
||||
用 PDF 文本而非 DocLayout caption box:文本块天然含完整多行标题,
|
||||
且其 ID 即论文实际编号(如 "Table C1"),权威且不依赖模型检测。
|
||||
"""
|
||||
try:
|
||||
text = page.get_text("text", clip=rect)
|
||||
d = page.get_text("dict")
|
||||
except Exception:
|
||||
return ""
|
||||
return " ".join(text.split())
|
||||
return []
|
||||
|
||||
results: list[_CaptionBlock] = []
|
||||
for block in d.get("blocks", []):
|
||||
if block.get("type") != 0: # 仅文本块
|
||||
continue
|
||||
lines = block.get("lines", [])
|
||||
if not lines:
|
||||
continue
|
||||
line_texts = [
|
||||
"".join(span.get("text", "") for span in line.get("spans", []))
|
||||
for line in lines
|
||||
]
|
||||
first_line = next((t for t in line_texts if t.strip()), "")
|
||||
m = _CAPTION_HEAD_RE.match(first_line)
|
||||
if not m:
|
||||
continue
|
||||
kind_word, num = m.group(1), m.group(2)
|
||||
is_table = kind_word.lower().startswith("table")
|
||||
bbox = block.get("bbox")
|
||||
if not bbox or len(bbox) != 4:
|
||||
continue
|
||||
full_text = " ".join(t.strip() for t in line_texts if t.strip())
|
||||
results.append(
|
||||
_CaptionBlock(
|
||||
id=f"{'Table' if is_table else 'Figure'} {num}",
|
||||
kind="table" if is_table else "figure",
|
||||
text=full_text,
|
||||
bbox=[float(v) for v in bbox],
|
||||
)
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def _match_captions(
|
||||
page,
|
||||
def _pair_caption_blocks(
|
||||
content_clusters: list[_BoxCluster],
|
||||
caption_clusters: list[_BoxCluster],
|
||||
) -> dict[int, tuple[_BoxCluster, str]]:
|
||||
"""Match each content cluster to its nearest same-type DocLayout caption."""
|
||||
matches: dict[int, tuple[_BoxCluster, str]] = {}
|
||||
used_captions: set[int] = set()
|
||||
caption_blocks: list[_CaptionBlock],
|
||||
) -> dict[int, _CaptionBlock]:
|
||||
"""每个内容块配方向上最近的同类型标题块。
|
||||
|
||||
figure 标题惯例在下方、table 标题在上方;方向相符优先,不符加惩罚兜底
|
||||
(跨页 / 异常排版)。按 (距离+惩罚) 升序贪心匹配,每个内容块与标题块唯一配对。
|
||||
"""
|
||||
candidates: list[tuple[float, int, int]] = []
|
||||
|
||||
for content_idx, content in enumerate(content_clusters):
|
||||
wanted_caption_class = _caption_class_for_content(content.boxclass)
|
||||
for caption_idx, caption in enumerate(caption_clusters):
|
||||
if caption.boxclass != wanted_caption_class:
|
||||
for c_idx, content in enumerate(content_clusters):
|
||||
want_below = content.boxclass == "picture" # figure 标题在下
|
||||
want_kind = "figure" if want_below else "table"
|
||||
for b_idx, cap in enumerate(caption_blocks):
|
||||
if cap.kind != want_kind:
|
||||
continue
|
||||
dist = _caption_distance(content, caption)
|
||||
if dist is not None:
|
||||
candidates.append((dist, content_idx, caption_idx))
|
||||
cx0, cy0, cx1, cy1 = cap.bbox
|
||||
h_overlap = min(content.x1, cx1) - max(content.x0, cx0)
|
||||
min_width = min(content.x1 - content.x0, cx1 - cx0)
|
||||
if min_width <= 0 or h_overlap < min_width * 0.25:
|
||||
continue
|
||||
if cy1 <= content.y0: # 标题在内容上方
|
||||
side_below, v_gap = False, content.y0 - cy1
|
||||
elif cy0 >= content.y1: # 标题在内容下方
|
||||
side_below, v_gap = True, cy0 - content.y1
|
||||
else:
|
||||
continue # 重叠,跳过
|
||||
if v_gap > _CAPTION_MATCH_DISTANCE:
|
||||
continue
|
||||
penalty = 0.0 if side_below == want_below else _CAPTION_WRONG_SIDE_PENALTY
|
||||
candidates.append((v_gap + penalty, c_idx, b_idx))
|
||||
|
||||
for _dist, content_idx, caption_idx in sorted(candidates):
|
||||
if content_idx in matches or caption_idx in used_captions:
|
||||
matches: dict[int, _CaptionBlock] = {}
|
||||
used: set[int] = set()
|
||||
for _score, c_idx, b_idx in sorted(candidates):
|
||||
if c_idx in matches or b_idx in used:
|
||||
continue
|
||||
text = _extract_caption_text(page, caption_clusters[caption_idx])
|
||||
if not text:
|
||||
continue
|
||||
matches[content_idx] = (caption_clusters[caption_idx], text)
|
||||
used_captions.add(caption_idx)
|
||||
|
||||
matches[c_idx] = caption_blocks[b_idx]
|
||||
used.add(b_idx)
|
||||
return matches
|
||||
|
||||
|
||||
@@ -174,25 +215,27 @@ def _render_box(
|
||||
filename: str,
|
||||
cap_type: str,
|
||||
page_num: int,
|
||||
caption: _BoxCluster | None = None,
|
||||
caption_bbox: list[float] | None = None,
|
||||
) -> bool:
|
||||
"""渲染单个 box 区域并保存 JPEG,成功返回 True。
|
||||
|
||||
若提供 caption,则将内容与 caption 区域合并后一起截取,
|
||||
使同一张截图同时包含图/表及其标题文字。
|
||||
若提供 caption_bbox,则将内容与标题区域合并后一起截取,
|
||||
使同一张截图同时包含图/表及其完整标题。
|
||||
"""
|
||||
page_width = page.rect.width
|
||||
page_height = page.rect.height
|
||||
x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
|
||||
if caption is not None:
|
||||
x0 = min(x0, caption.x0)
|
||||
y0 = min(y0, caption.y0)
|
||||
x1 = max(x1, caption.x1)
|
||||
y1 = max(y1, caption.y1)
|
||||
if caption_bbox is not None:
|
||||
cx0, cy0, cx1, cy1 = caption_bbox
|
||||
x0 = min(x0, cx0)
|
||||
y0 = min(y0, cy0)
|
||||
x1 = max(x1, cx1)
|
||||
y1 = max(y1, cy1)
|
||||
clip = pymupdf.Rect(
|
||||
max(0, x0 - _REGION_PADDING),
|
||||
max(0, y0 - _REGION_PADDING),
|
||||
min(page_width, x1 + _REGION_PADDING),
|
||||
y1 + _REGION_PADDING,
|
||||
min(page_height, y1 + _REGION_PADDING),
|
||||
)
|
||||
mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
|
||||
try:
|
||||
@@ -200,7 +243,7 @@ def _render_box(
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
(images_dest / filename).write_bytes(pix.tobytes("jpeg"))
|
||||
(images_dest / filename).write_bytes(pix.tobytes("jpeg", jpg_quality=92))
|
||||
return True
|
||||
|
||||
|
||||
@@ -213,77 +256,62 @@ def _process_page(
|
||||
seen_labels: set,
|
||||
arxiv_id: str,
|
||||
) -> int:
|
||||
"""处理单页:检测 → 聚类 → 渲染,全部用通用标签。"""
|
||||
"""处理单页:检测内容 box → 文本定位 caption → 只渲染配到标题的。
|
||||
|
||||
配到 Figure/Table caption 的 box 用 caption 自带 ID 命名(figure_3.jpg);
|
||||
没配到标题的(Algorithm 伪代码、无编号附录表、误检碎片)一律过滤,不输出。
|
||||
"""
|
||||
page = doc[page_idx]
|
||||
page_num = page_idx + 1
|
||||
fig_counter = 0
|
||||
tbl_counter = 0
|
||||
|
||||
# 收集本页的 table/picture box 与 caption box(跳过极小区域)
|
||||
# 收集本页 figure/table 内容 box(跳过极小区域;caption 改由文本定位,不收 box)
|
||||
raw_boxes = []
|
||||
raw_caption_boxes = []
|
||||
for box in page_boxes:
|
||||
w = box.x1 - box.x0
|
||||
h = box.y1 - box.y0
|
||||
if box.boxclass in ("table", "table-fallback", "picture"):
|
||||
if box.boxclass in ("table", "picture"):
|
||||
w = box.x1 - box.x0
|
||||
h = box.y1 - box.y0
|
||||
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
|
||||
continue
|
||||
raw_boxes.append(box)
|
||||
elif box.boxclass in ("figure_caption", "table_caption"):
|
||||
if w < 30 or h < 6:
|
||||
continue
|
||||
raw_caption_boxes.append(box)
|
||||
|
||||
if not raw_boxes:
|
||||
return 0
|
||||
|
||||
# 聚类:将同一 figure/table 的碎片 box 合并
|
||||
# 聚类:将同一 figure/table 的碎片 box 合并;用 PDF 文本定位 caption
|
||||
clusters = _cluster_boxes(raw_boxes)
|
||||
caption_clusters = _cluster_boxes(raw_caption_boxes)
|
||||
caption_matches = _match_captions(page, clusters, caption_clusters)
|
||||
caption_blocks = _find_caption_blocks(page)
|
||||
caption_matches = _pair_caption_blocks(clusters, caption_blocks)
|
||||
|
||||
extracted = 0
|
||||
for cluster_idx, cluster in enumerate(clusters):
|
||||
cap_type = "figure" if cluster.boxclass == "picture" else "table"
|
||||
cap_match = caption_matches.get(cluster_idx)
|
||||
if cap_match is None:
|
||||
continue # 无 Figure/Table 标题 → 过滤(Algorithm、无编号表、误检碎片)
|
||||
if cap_match.id in seen_labels:
|
||||
continue # 同一图表被 DocLayout 切成多块重复检测,跳过后续
|
||||
seen_labels.add(cap_match.id)
|
||||
|
||||
if cap_type == "figure":
|
||||
fig_counter += 1
|
||||
label = f"Figure (p{page_num}-{fig_counter})"
|
||||
else:
|
||||
tbl_counter += 1
|
||||
label = f"Table (p{page_num}-{tbl_counter})"
|
||||
|
||||
if label in seen_labels:
|
||||
continue
|
||||
seen_labels.add(label)
|
||||
|
||||
caption_match = caption_matches.get(cluster_idx)
|
||||
caption_cluster = caption_match[0] if caption_match else None
|
||||
|
||||
filename = f"{label.replace(' ', '_').lower()}.jpg"
|
||||
filename = f"{cap_match.id.replace(' ', '_').lower()}.jpg"
|
||||
if not _render_box(
|
||||
page,
|
||||
cluster,
|
||||
images_dest,
|
||||
filename,
|
||||
cap_type,
|
||||
cap_match.kind,
|
||||
page_num,
|
||||
caption=caption_cluster,
|
||||
caption_bbox=cap_match.bbox,
|
||||
):
|
||||
continue
|
||||
|
||||
info = {
|
||||
manifest[filename] = {
|
||||
"page": page_num,
|
||||
"type": cap_type,
|
||||
"label": label,
|
||||
"type": cap_match.kind,
|
||||
"label": cap_match.id,
|
||||
"box": _cluster_to_box(cluster),
|
||||
"caption_text": cap_match.text[:500],
|
||||
"caption_box": cap_match.bbox,
|
||||
"caption_source": "text",
|
||||
}
|
||||
if caption_match:
|
||||
info["caption_text"] = caption_match[1][:500]
|
||||
info["caption_box"] = _cluster_to_box(caption_cluster)
|
||||
info["caption_source"] = "doclayout"
|
||||
|
||||
manifest[filename] = info
|
||||
extracted += 1
|
||||
|
||||
return extracted
|
||||
@@ -359,230 +387,6 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
||||
return extracted
|
||||
|
||||
|
||||
# ── Phase 2: 用 summary 的 figures ID 定位并重命名 ─────────────────────
|
||||
|
||||
|
||||
def _distance_text_to_box(rect: pymupdf.Rect, box: list[float]) -> float | None:
    """Measure how far a search-hit rectangle is from a box.

    The comparison uses the centre of ``rect``. The centre must fall within
    the box's horizontal extent, widened by 20 on each side; otherwise the hit
    is rejected. The distance is the vertical gap between the centre and the
    nearest horizontal edge of the box, or 0 when the centre is inside the
    box's vertical extent.

    Returns:
        The vertical distance, or ``None`` when the horizontal check fails or
        the distance exceeds ``_LABEL_MATCH_DISTANCE``.
    """
    center_x = (rect.x0 + rect.x1) / 2
    center_y = (rect.y0 + rect.y1) / 2
    left, top, right, bottom = box

    # Horizontal gate: the hit centre must sit over (or close to) the box.
    if not (left - 20 <= center_x <= right + 20):
        return None

    # Vertical gap from the hit centre to the box.
    if center_y < top:
        gap = top - center_y
    elif center_y > bottom:
        gap = center_y - bottom
    else:
        gap = 0

    if gap <= _LABEL_MATCH_DISTANCE:
        return gap
    return None
|
||||
|
||||
|
||||
def _search_variants(fig_id: str) -> list[str]:
|
||||
"""为 figure/table ID 生成搜索变体。
|
||||
|
||||
"Figure 1" → ["Figure 1", "Fig. 1", "Fig 1"]
|
||||
"Fig. 1" → ["Fig. 1", "Figure 1", "Fig 1"]
|
||||
"Table A1" → ["Table A1"]
|
||||
"""
|
||||
variants = [fig_id]
|
||||
|
||||
m = re.match(r"(Fig\.?|Figure)\s+(\d+.*)", fig_id, re.IGNORECASE)
|
||||
if m:
|
||||
num_part = m.group(2)
|
||||
variants.extend(
|
||||
[
|
||||
f"Figure {num_part}",
|
||||
f"Fig. {num_part}",
|
||||
f"Fig {num_part}",
|
||||
]
|
||||
)
|
||||
|
||||
# 去重保序
|
||||
seen = set()
|
||||
result = []
|
||||
for v in variants:
|
||||
if v not in seen:
|
||||
seen.add(v)
|
||||
result.append(v)
|
||||
return result
|
||||
|
||||
|
||||
def label_images_by_summary(
    arxiv_id: str,
    figures: list[dict],
    pdf_path: Path | None = None,
) -> int:
    """Phase 2: locate each summary figure/table ID in the PDF and rename images.

    For every figure/table ID in the summary:
      1. search all pages for the ID text (and its spelling variants) with
         ``page.search_for``;
      2. measure the distance from each same-page hit to the box recorded in
         the manifest;
      3. assign IDs to images greedily by ascending distance, then rename the
         files and update the manifest.

    Only manifest entries that still carry a generic label (one containing
    ``"(p"``) are considered for matching.

    Args:
        arxiv_id: Paper ID.
        figures: The summary's ``figures`` list; the ``id`` and ``caption``
            keys of each item are read here.
        pdf_path: Path to the PDF. Defaults to ``TMP_DIR / arxiv_id / "paper.pdf"``.

    Returns:
        The number of images that were labelled.
    """
    if not figures:
        return 0

    if pdf_path is None:
        pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
    if not pdf_path.exists():
        return 0

    images_dest = paper_dir(arxiv_id) / "images"
    manifest_path = images_dest / "manifest.json"
    if not manifest_path.exists():
        return 0

    manifest: dict[str, dict] = json.loads(manifest_path.read_text(encoding="utf-8"))
    if not manifest:
        return 0

    # Candidates: only entries that still have a generic label.
    candidates: dict[str, dict] = {
        fname: info for fname, info in manifest.items() if "(p" in info.get("label", "")
    }
    if not candidates:
        return 0

    # Every plausible pairing: (fig_id, fig_index, filename, distance).
    matches: list[tuple[str, int, str, float]] = []

    with pymupdf.open(str(pdf_path)) as doc:
        for fig_idx, fig in enumerate(figures):
            fig_id = fig.get("id", "")
            if not fig_id:
                continue

            # Spelling variants: Figure 1 / Fig. 1 / Fig 1, etc.
            search_terms = _search_variants(fig_id)

            # Search every page for the text; hits are (1-based page number, Rect).
            search_hits: list[tuple[int, pymupdf.Rect]] = []
            for page_idx in range(doc.page_count):
                page = doc[page_idx]
                # Variants can hit the same spot; de-duplicate per page by the
                # rounded top-left corner.
                seen_rects: set[tuple[float, float]] = set()
                for term in search_terms:
                    for r in page.search_for(term):
                        key = (round(r.x0, 1), round(r.y0, 1))
                        if key not in seen_rects:
                            seen_rects.add(key)
                            search_hits.append((page_idx + 1, r))

            if not search_hits:
                continue

            # For each candidate manifest entry, keep its nearest same-page hit.
            for fname, info in candidates.items():
                box = info.get("box")
                if not box:
                    continue
                manifest_page = info.get("page", 0)

                best_dist: float | None = None
                for hit_page, rect in search_hits:
                    if hit_page != manifest_page:
                        continue
                    dist = _distance_text_to_box(rect, box)
                    if dist is not None and (best_dist is None or dist < best_dist):
                        best_dist = dist

                if best_dist is not None:
                    matches.append((fig_id, fig_idx, fname, best_dist))

    if not matches:
        logger.info("No label matches for %s", arxiv_id)
        return 0

    # Resolve conflicts: closest first; each figure and each file is used once.
    matches.sort(key=lambda x: x[3])
    used_fig_ids: set[int] = set()
    used_filenames: set[str] = set()
    renames: list[tuple[str, str, str]] = []  # (old_fname, new_fname, fig_id)

    for fig_id, fig_idx, fname, _dist in matches:
        if fig_idx in used_fig_ids or fname in used_filenames:
            continue
        used_fig_ids.add(fig_idx)
        used_filenames.add(fname)
        new_fname = f"{fig_id.replace(' ', '_').lower()}.jpg"
        renames.append((fname, new_fname, fig_id))

    # Entries that were not matched are carried over unchanged.
    new_manifest: dict[str, dict] = {
        fname: info for fname, info in manifest.items() if fname not in used_filenames
    }

    # Caption text per figure ID; the first item with a given ID wins. Built
    # once instead of rescanning ``figures`` for every rename.
    caption_by_id: dict[str, str] = {}
    for fig in figures:
        known_id = fig.get("id")
        if known_id:
            caption_by_id.setdefault(known_id, fig.get("caption", ""))

    labeled = 0
    for old_fname, new_fname, fig_id in renames:
        old_path = images_dest / old_fname
        new_path = images_dest / new_fname
        if not old_path.exists():
            # NOTE(review): a matched entry whose image file is missing is
            # dropped from the rewritten manifest here — confirm this is intended.
            continue

        # Carry the manifest info over to the new name.
        info = manifest[old_fname].copy()
        cap_type = info.get("type", "figure")
        summary_caption_text = caption_by_id.get(fig_id, "")

        info["label"] = fig_id
        existing_caption_text = info.get("caption_text", "")
        if existing_caption_text and summary_caption_text:
            # Keep the caption already in the manifest; store the summary's
            # caption alongside it.
            info["summary_caption_text"] = summary_caption_text[:500]
        else:
            info["caption_text"] = (
                summary_caption_text[:500] if summary_caption_text else ""
            )
        info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
            fig_id
        )

        if new_fname != old_fname:
            old_path.rename(new_path)
        new_manifest[new_fname] = info
        labeled += 1

    # Write with the same encoding used for reading: ensure_ascii=False emits
    # raw non-ASCII text, which a non-UTF-8 default locale cannot round-trip.
    manifest_path.write_text(
        json.dumps(new_manifest, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    logger.info(
        "Labeled %d/%d images for %s using summary figures",
        labeled,
        len(manifest),
        arxiv_id,
    )
    return labeled
|
||||
|
||||
|
||||
# ── Figure ↔ Image 关联 ────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
@@ -201,20 +201,15 @@ def _cleanup_old_images(db: Session, paper: Paper) -> None:
|
||||
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
|
||||
"""从 PDF 提取图片和表格(失败不影响总结)。
|
||||
|
||||
两阶段流水线:
|
||||
1. DocLayout-YOLO 检测 + 渲染截图(通用标签)
|
||||
2. 用 summary 的 figures ID 在 PDF 中搜索定位 → 重命名
|
||||
DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本定位 caption → 只渲染
|
||||
配到 Figure/Table 标题的(Algorithm、无编号附录表、误检碎片一律过滤)。
|
||||
标题来源已切换为 PDF 文本,schema.figures 不再参与命名,参数保留备用。
|
||||
"""
|
||||
try:
|
||||
from app.services.pdf_image_extractor import (
|
||||
extract_images_from_pdf,
|
||||
label_images_by_summary,
|
||||
)
|
||||
from app.services.pdf_image_extractor import extract_images_from_pdf
|
||||
|
||||
pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
|
||||
extract_images_from_pdf(arxiv_id, pdf_path)
|
||||
if schema.figures:
|
||||
label_images_by_summary(arxiv_id, schema.figures, pdf_path)
|
||||
except Exception:
|
||||
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user