feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
+383
-462
@@ -1,12 +1,12 @@
|
||||
"""PDF 图片与表格提取 — 基于 pymupdf4llm layout analysis。
|
||||
"""PDF 图片与表格提取 — 两阶段流水线。
|
||||
|
||||
用 pymupdf4llm 的 layout analysis 检测 table / picture 区域,
|
||||
再通过 caption 文字匹配确定 Figure/Table 编号,渲染为 JPEG。
|
||||
Phase 1: PicoDet-S_layout_3cls 检测 figure/table 区域 → 渲染为 JPEG(通用标签)
|
||||
Phase 2: 用 LLM summary 的 figures[].id 在 PDF 中搜索定位 → 匹配到 box → 重命名
|
||||
|
||||
相比旧方案(caption 正则 + pdfplumber/find_tables/文本块扫描三套策略):
|
||||
- layout analysis 直接给出区域 bbox,不存在相邻表格互相侵入的问题
|
||||
- 无需手动调参(最大高度、间隙阈值等)
|
||||
- 页面级 caption 匹配:每个 caption 只分配给最近的 box,避免上下相邻表格抢夺同一个 caption
|
||||
相比旧方案(正则匹配 caption):
|
||||
- 不再依赖正则,用 LLM 输出的 ID 直接搜索 PDF 文本
|
||||
- page.search_for() 精确搜索 + 空间距离过滤,避免正文引用误匹配
|
||||
- 通用标签兜底,LLM 没提到的图表不会被丢弃
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -17,44 +17,30 @@ import re
|
||||
from pathlib import Path
|
||||
|
||||
import pymupdf
|
||||
import pymupdf4llm.helpers.document_layout as dl
|
||||
|
||||
from app.services.layout_detector import LayoutBox, detect_page_layout
|
||||
from app.services.pdf_downloader import paper_dir
|
||||
from app.utils import TMP_DIR
|
||||
from app.utils import PAPERS_DIR, TMP_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Caption 正则 ───────────────────────────────────────────────────────
|
||||
|
||||
# 用于从 caption 文字中提取 Figure/Table 编号
|
||||
_FIGURE_CAPTION_RE = re.compile(
|
||||
r"^(?:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_TABLE_CAPTION_RE = re.compile(
|
||||
r"^Table\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# caption 与 table/picture 的最大匹配距离(点)
|
||||
_CAPTION_MATCH_DISTANCE = 100
|
||||
# 截图区域的外边距
|
||||
# 截图区域的外边距(单位: pt)
|
||||
_REGION_PADDING = 5
|
||||
# 3x 渲染,保证清晰度
|
||||
# 渲染倍率(3x 保证清晰度)
|
||||
_RENDER_ZOOM = 3
|
||||
# 相邻 box 聚类间距(点)— 同一 figure/table 的碎片间距通常 < 15pt
|
||||
# 相邻 box 聚类间距(单位: pt)— 同一 figure/table 的碎片间距通常 < 15pt
|
||||
_CLUSTER_GAP = 15
|
||||
# 最小 bbox 面积(单位: pt²)— 过滤 icon/logo 等微小误检
|
||||
_MIN_BOX_AREA = 2000
|
||||
# Phase 2: 搜索文本到 box 的最大匹配距离(单位: pt)
|
||||
_LABEL_MATCH_DISTANCE = 100
|
||||
|
||||
|
||||
# ── Box 聚类 ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _BoxCluster:
|
||||
"""合并后的布局区域(由一个或多个相邻 LayoutBox 组成)。
|
||||
|
||||
pymupdf4llm 有时将一个大图拆成多个小 picture box(如视频帧网格),
|
||||
聚类后用整体 bbox 作为渲染区域。
|
||||
"""
|
||||
"""合并后的布局区域(由一个或多个相邻 LayoutBox 组成)。"""
|
||||
|
||||
__slots__ = ("x0", "y0", "x1", "y1", "boxclass")
|
||||
|
||||
@@ -63,17 +49,12 @@ class _BoxCluster:
|
||||
self.y0 = min(b.y0 for b in boxes)
|
||||
self.x1 = max(b.x1 for b in boxes)
|
||||
self.y1 = max(b.y1 for b in boxes)
|
||||
# table-fallback 归一化为 table(layout model 检测到表格但无法提取结构)
|
||||
raw = boxes[0].boxclass
|
||||
self.boxclass = "table" if raw == "table-fallback" else raw
|
||||
|
||||
|
||||
def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
|
||||
"""将相邻的同类型 box 合并为聚类。
|
||||
|
||||
用 union-find 将间距 ≤ gap 的同类型 box 归为一组,
|
||||
每组生成一个 _BoxCluster(整体 bbox)。
|
||||
"""
|
||||
"""将相邻的同类型 box 合并为聚类。"""
|
||||
if not boxes:
|
||||
return []
|
||||
|
||||
@@ -111,242 +92,58 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
|
||||
return [_BoxCluster(members) for members in groups.values()]
|
||||
|
||||
|
||||
# ── 页面级 Caption 查找与匹配 ──────────────────────────────────────────
|
||||
# ── Phase 1: 检测 + 渲染 ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def _find_page_captions(page) -> list[dict]:
|
||||
"""查找页面上所有 Figure/Table caption 文字块。"""
|
||||
blocks = page.get_text("blocks")
|
||||
captions = []
|
||||
for b in blocks:
|
||||
if len(b) < 5:
|
||||
continue
|
||||
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
|
||||
text = str(b[4]).strip()
|
||||
first_line = text.split("\n")[0].strip()
|
||||
|
||||
cap_type = None
|
||||
m = _TABLE_CAPTION_RE.match(first_line)
|
||||
if m:
|
||||
cap_type = "table"
|
||||
else:
|
||||
m = _FIGURE_CAPTION_RE.match(first_line)
|
||||
if m:
|
||||
cap_type = "figure"
|
||||
if m is None:
|
||||
continue
|
||||
|
||||
captions.append(
|
||||
{
|
||||
"label": f"{'Table' if cap_type == 'table' else 'Figure'} {m.group(1)}",
|
||||
"type": cap_type,
|
||||
"caption_text": text,
|
||||
"caption_y0": by0,
|
||||
"caption_y1": by1,
|
||||
"caption_x0": bx0,
|
||||
"caption_x1": bx1,
|
||||
}
|
||||
)
|
||||
return captions
|
||||
|
||||
|
||||
def _vertical_distance(cap_y0, cap_y1, box_y0, box_y1) -> float | None:
|
||||
"""计算 caption 到 box 的垂直距离。不邻接时返回 None。
|
||||
|
||||
三种情况:caption 完全在 box 上方、完全在下方、与 box 有垂直重叠。
|
||||
重叠(含部分溢出)视为 distance=0,确保 caption 延伸到 box 边界外时不会丢失。
|
||||
"""
|
||||
# Caption 完全在 box 上方
|
||||
if cap_y1 <= box_y0:
|
||||
dist = box_y0 - cap_y1
|
||||
return dist if dist <= _CAPTION_MATCH_DISTANCE else None
|
||||
# Caption 完全在 box 下方
|
||||
if cap_y0 >= box_y1:
|
||||
dist = cap_y0 - box_y1
|
||||
return dist if dist <= _CAPTION_MATCH_DISTANCE else None
|
||||
# Caption 与 box 有垂直重叠(内部、部分溢出都算)→ 距离 0
|
||||
return 0
|
||||
|
||||
|
||||
def _same_column(cap: dict, box, page_width: float) -> bool:
|
||||
"""判断 caption 和 box 是否在同一列。
|
||||
|
||||
双栏论文中左右栏间距有限,简单的水平重叠检查会跨列匹配。
|
||||
策略:用中心 X 坐标判断各自在哪半边,只有同半边才算同列。
|
||||
跨栏图表(caption 或 box 宽度 >65% 页宽)不受此限制。
|
||||
"""
|
||||
cap_w = cap["caption_x1"] - cap["caption_x0"]
|
||||
box_w = box.x1 - box.x0
|
||||
|
||||
# 跨栏元素:宽度超过页面的 65%
|
||||
if cap_w > page_width * 0.65 or box_w > page_width * 0.65:
|
||||
return True
|
||||
|
||||
cap_cx = (cap["caption_x0"] + cap["caption_x1"]) / 2
|
||||
box_cx = (box.x0 + box.x1) / 2
|
||||
mid = page_width / 2
|
||||
|
||||
# 同在左半边或同在右半边
|
||||
return (cap_cx < mid) == (box_cx < mid)
|
||||
|
||||
|
||||
def _match_captions_to_boxes(
    page_boxes: list, captions: list[dict], page_width: float
) -> list[tuple[list[int], list[dict]]]:
    """Assign captions to boxes; one caption may match several boxes.

    Typical situations:
    - A figure made of a left and a right picture box, with the caption close
      to both of them.
    - A table whose visual content was classified as a picture by the layout
      analysis, which requires matching across box types.

    Args:
        page_boxes: boxes on the page; each is read through ``x0``/``y0``/
            ``x1``/``y1`` and ``boxclass``.
        captions: caption dicts providing ``type``, ``label`` and the
            ``caption_x0``/``caption_x1``/``caption_y0``/``caption_y1`` bounds.
        page_width: page width, forwarded to ``_same_column``.

    Returns:
        ``[(box_indices, captions), ...]`` — each entry is one independent
        render task. Captions without any candidate box produce no entry.
    """
    # For every caption, collect all boxes whose distance is within the threshold.
    # Same-type boxes are preferred; any table/picture box is the fallback.
    cap_to_boxes: dict[int, list[tuple[int, float]]] = {}

    for ci, cap in enumerate(captions):
        same_type: list[tuple[int, float]] = []
        any_type: list[tuple[int, float]] = []
        expected = "table" if cap["type"] == "table" else "picture"

        for bi, box in enumerate(page_boxes):
            # Column awareness: in two-column papers only match boxes in the same column.
            if not _same_column(cap, box, page_width):
                continue
            # Horizontal overlap check (still required inside one column),
            # with a tolerance of 5 on each side.
            if not (
                cap["caption_x1"] > box.x0 - 5 and cap["caption_x0"] < box.x1 + 5
            ):
                continue
            dist = _vertical_distance(
                cap["caption_y0"], cap["caption_y1"], box.y0, box.y1
            )
            # None means the caption is too far from this box vertically.
            if dist is None:
                continue
            entry = (bi, dist)
            any_type.append(entry)
            if box.boxclass == expected:
                same_type.append(entry)

        # Prefer same-type matches; fall back to any type; skip when there is none.
        if same_type:
            cap_to_boxes[ci] = same_type
        elif any_type:
            cap_to_boxes[ci] = any_type
        # else: this caption has no matching box and is not added to cap_to_boxes

    # Each caption → its nearest box (used for grouping), while all matched boxes are recorded.
    cap_primary: dict[int, int] = {}  # caption → primary box index
    cap_all_boxes: dict[int, list[int]] = {}  # caption → all matched box indices
    for ci, matches in cap_to_boxes.items():
        # Sort candidates by distance so the first entry is the nearest box.
        matches.sort(key=lambda x: x[1])
        cap_primary[ci] = matches[0][0]
        # All boxes nearly as close as the nearest one belong to the same group
        # (distance difference up to 20).
        best_dist = matches[0][1]
        cap_all_boxes[ci] = [bi for bi, d in matches if d <= best_dist + 20]

    # Group captions by their primary box.
    box_to_caps: dict[int, list[int]] = {}
    for ci, bi in cap_primary.items():
        box_to_caps.setdefault(bi, []).append(ci)

    # Build render groups: each caption forms its own group (boxes may be
    # shared, but every group is rendered separately).
    # Captions carrying the same label are merged; different labels stay apart.
    used_captions: set[int] = set()
    groups: list[tuple[list[int], list[dict]]] = []

    for bi in sorted(box_to_caps.keys()):
        cis = box_to_caps[bi]
        for ci in cis:
            # A caption already merged into an earlier group is not emitted again.
            if ci in used_captions:
                continue
            used_captions.add(ci)

            all_box_indices = set(cap_all_boxes.get(ci, [bi]))
            # Only merge captions with the same label (duplicate captions of one figure/table).
            merged_captions = [captions[ci]]
            for other_bi in all_box_indices:
                if other_bi in box_to_caps:
                    for other_ci in box_to_caps[other_bi]:
                        if other_ci not in used_captions:
                            other_cap = captions[other_ci]
                            if other_cap["label"] == captions[ci]["label"]:
                                used_captions.add(other_ci)
                                merged_captions.append(other_cap)
            groups.append((sorted(all_box_indices), merged_captions))

    return groups
|
||||
|
||||
|
||||
# ── 单页处理 ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _render_and_save(
|
||||
def _render_box(
|
||||
page,
|
||||
clip: pymupdf.Rect,
|
||||
box: _BoxCluster,
|
||||
images_dest: Path,
|
||||
manifest: dict,
|
||||
label: str,
|
||||
filename: str,
|
||||
cap_type: str,
|
||||
caption_text: str,
|
||||
page_num_1based: int,
|
||||
arxiv_id: str,
|
||||
page_num: int,
|
||||
) -> bool:
|
||||
"""渲染页面区域并保存 JPEG,写入 manifest。成功返回 True。"""
|
||||
"""渲染单个 box 区域并保存 JPEG,成功返回 True。"""
|
||||
page_width = page.rect.width
|
||||
clip = pymupdf.Rect(
|
||||
max(0, box.x0 - _REGION_PADDING),
|
||||
max(0, box.y0 - _REGION_PADDING),
|
||||
min(page_width, box.x1 + _REGION_PADDING),
|
||||
box.y1 + _REGION_PADDING,
|
||||
)
|
||||
mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
|
||||
try:
|
||||
pix = page.get_pixmap(matrix=mat, clip=clip)
|
||||
except Exception:
|
||||
logger.debug("Failed to render %s for %s", label, arxiv_id)
|
||||
return False
|
||||
|
||||
filename = f"{label.replace(' ', '_').lower()}.jpg"
|
||||
(images_dest / filename).write_bytes(pix.tobytes("jpeg"))
|
||||
|
||||
manifest[filename] = {
|
||||
"page": page_num_1based,
|
||||
"type": cap_type,
|
||||
"label": label,
|
||||
"caption_text": caption_text[:200] if caption_text else "",
|
||||
"figures" if cap_type == "figure" else "tables": [label],
|
||||
}
|
||||
logger.debug(
|
||||
"Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) → %s",
|
||||
label,
|
||||
page_num_1based,
|
||||
clip.x0,
|
||||
clip.y0,
|
||||
clip.x1,
|
||||
clip.y1,
|
||||
filename,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
def _process_page(
|
||||
doc,
|
||||
page_idx: int,
|
||||
page_layout,
|
||||
page_boxes: list[LayoutBox],
|
||||
images_dest: Path,
|
||||
manifest: dict,
|
||||
seen_labels: set,
|
||||
arxiv_id: str,
|
||||
) -> int:
|
||||
"""处理单页:caption 匹配 + orphan 兜底,返回本页提取数量。"""
|
||||
"""处理单页:检测 → 聚类 → 渲染,全部用通用标签。"""
|
||||
page = doc[page_idx]
|
||||
page_width = page.rect.width
|
||||
page_num = page_idx + 1
|
||||
orphan_fig_counter = 0
|
||||
orphan_tbl_counter = 0
|
||||
fig_counter = 0
|
||||
tbl_counter = 0
|
||||
|
||||
# 收集本页的 table/picture box(跳过极小区域)
|
||||
raw_boxes = []
|
||||
for box in page_layout.boxes:
|
||||
for box in page_boxes:
|
||||
if box.boxclass not in ("table", "table-fallback", "picture"):
|
||||
continue
|
||||
if (box.x1 - box.x0) < 20 or (box.y1 - box.y0) < 20:
|
||||
w = box.x1 - box.x0
|
||||
h = box.y1 - box.y0
|
||||
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
|
||||
continue
|
||||
raw_boxes.append(box)
|
||||
|
||||
@@ -354,153 +151,48 @@ def _process_page(
|
||||
return 0
|
||||
|
||||
# 聚类:将同一 figure/table 的碎片 box 合并
|
||||
page_boxes = _cluster_boxes(raw_boxes)
|
||||
clusters = _cluster_boxes(raw_boxes)
|
||||
|
||||
# 页面级匹配:查找所有 caption,分配给 box
|
||||
captions = _find_page_captions(page)
|
||||
groups = _match_captions_to_boxes(page_boxes, captions, page_width)
|
||||
|
||||
# 只合并同 label 的 group(同一个 figure/table 的重复 caption)
|
||||
# 不同 label 的 group 即使共享 box 也不合并(如 Figure 7 和 Figure 8),
|
||||
# 渲染时用 caption 位置切割区域
|
||||
_merged_groups: set[int] = set()
|
||||
merged_groups: list[tuple[list[int], list[dict]]] = []
|
||||
for gi, (box_indices, caps) in enumerate(groups):
|
||||
if gi in _merged_groups:
|
||||
continue
|
||||
this_labels = {c["label"] for c in caps}
|
||||
all_box_set = set(box_indices)
|
||||
merge_targets = {gi}
|
||||
for other_gi, (other_bi, other_caps) in enumerate(groups):
|
||||
if other_gi <= gi or other_gi in _merged_groups:
|
||||
continue
|
||||
other_labels = {c["label"] for c in other_caps}
|
||||
# 只在 label 有交集时合并(同一个 figure/table)
|
||||
if this_labels & other_labels and all_box_set & set(other_bi):
|
||||
merge_targets.add(other_gi)
|
||||
all_box_set |= set(other_bi)
|
||||
all_caps = []
|
||||
for mgi in sorted(merge_targets):
|
||||
_merged_groups.add(mgi)
|
||||
all_caps.extend(groups[mgi][1])
|
||||
merged_groups.append((sorted(all_box_set), all_caps))
|
||||
groups = merged_groups
|
||||
|
||||
# ── 阶段 1:渲染有 caption 匹配的图/表 ──
|
||||
matched_box_indices: set[int] = set()
|
||||
extracted = 0
|
||||
|
||||
for box_indices, caps in groups:
|
||||
matched_box_indices.update(box_indices)
|
||||
|
||||
# 去重同一 label,跳过已处理的
|
||||
unique_caps = []
|
||||
for cap in caps:
|
||||
if cap["label"] not in seen_labels:
|
||||
seen_labels.add(cap["label"])
|
||||
unique_caps.append(cap)
|
||||
if not unique_caps:
|
||||
continue
|
||||
|
||||
# 合并所有关联 box 的 bbox
|
||||
bx0 = min(page_boxes[i].x0 for i in box_indices)
|
||||
by0 = min(page_boxes[i].y0 for i in box_indices)
|
||||
bx1 = max(page_boxes[i].x1 for i in box_indices)
|
||||
by1 = max(page_boxes[i].y1 for i in box_indices)
|
||||
|
||||
# 渲染区域:box + caption
|
||||
all_cap_y0 = min(c["caption_y0"] for c in unique_caps)
|
||||
all_cap_y1 = max(c["caption_y1"] for c in unique_caps)
|
||||
all_cap_x0 = min(c["caption_x0"] for c in unique_caps)
|
||||
all_cap_x1 = max(c["caption_x1"] for c in unique_caps)
|
||||
|
||||
top = max(0, min(by0, all_cap_y0) - _REGION_PADDING)
|
||||
bottom = max(by1, all_cap_y1) + _REGION_PADDING
|
||||
rx0 = max(0, min(bx0, all_cap_x0) - _REGION_PADDING)
|
||||
rx1 = min(page_width, max(bx1, all_cap_x1) + _REGION_PADDING)
|
||||
|
||||
clip = pymupdf.Rect(rx0, top, rx1, bottom)
|
||||
# 多个 caption 可能共享同一区域(如 subfigure),只需渲染一次
|
||||
jpeg_bytes = None
|
||||
for cap in unique_caps:
|
||||
if jpeg_bytes is None:
|
||||
if not _render_and_save(
|
||||
page,
|
||||
clip,
|
||||
images_dest,
|
||||
manifest,
|
||||
cap["label"],
|
||||
cap["type"],
|
||||
cap["caption_text"],
|
||||
page_num,
|
||||
arxiv_id,
|
||||
):
|
||||
break
|
||||
# 读取刚写入的 bytes 供后续同名 caption 复用
|
||||
filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
|
||||
jpeg_bytes = (images_dest / filename).read_bytes()
|
||||
extracted += 1
|
||||
else:
|
||||
# 同区域的不同 caption(如 subfigure),复用图片
|
||||
filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
|
||||
(images_dest / filename).write_bytes(jpeg_bytes)
|
||||
cap_preview = cap["caption_text"][:200]
|
||||
manifest[filename] = {
|
||||
"page": page_num,
|
||||
"type": cap["type"],
|
||||
"label": cap["label"],
|
||||
"caption_text": cap_preview,
|
||||
"figures" if cap["type"] == "figure" else "tables": [cap["label"]],
|
||||
}
|
||||
extracted += 1
|
||||
|
||||
# ── 阶段 2:渲染无 caption 匹配的图/表(orphan boxes) ──
|
||||
orphan_indices = set(range(len(page_boxes))) - matched_box_indices
|
||||
for bi in sorted(orphan_indices):
|
||||
box = page_boxes[bi]
|
||||
cap_type = "figure" if box.boxclass == "picture" else "table"
|
||||
for cluster in clusters:
|
||||
cap_type = "figure" if cluster.boxclass == "picture" else "table"
|
||||
|
||||
if cap_type == "figure":
|
||||
orphan_fig_counter += 1
|
||||
label = f"Figure (p{page_num}-{orphan_fig_counter})"
|
||||
fig_counter += 1
|
||||
label = f"Figure (p{page_num}-{fig_counter})"
|
||||
else:
|
||||
orphan_tbl_counter += 1
|
||||
label = f"Table (p{page_num}-{orphan_tbl_counter})"
|
||||
tbl_counter += 1
|
||||
label = f"Table (p{page_num}-{tbl_counter})"
|
||||
|
||||
if label in seen_labels:
|
||||
continue
|
||||
seen_labels.add(label)
|
||||
|
||||
clip = pymupdf.Rect(
|
||||
max(0, box.x0 - _REGION_PADDING),
|
||||
max(0, box.y0 - _REGION_PADDING),
|
||||
min(page_width, box.x1 + _REGION_PADDING),
|
||||
box.y1 + _REGION_PADDING,
|
||||
)
|
||||
if _render_and_save(
|
||||
page,
|
||||
clip,
|
||||
images_dest,
|
||||
manifest,
|
||||
label,
|
||||
cap_type,
|
||||
"",
|
||||
page_num,
|
||||
arxiv_id,
|
||||
):
|
||||
extracted += 1
|
||||
filename = f"{label.replace(' ', '_').lower()}.jpg"
|
||||
if not _render_box(page, cluster, images_dest, filename, cap_type, page_num):
|
||||
continue
|
||||
|
||||
manifest[filename] = {
|
||||
"page": page_num,
|
||||
"type": cap_type,
|
||||
"label": label,
|
||||
"box": [
|
||||
round(float(cluster.x0), 1),
|
||||
round(float(cluster.y0), 1),
|
||||
round(float(cluster.x1), 1),
|
||||
round(float(cluster.y1), 1),
|
||||
],
|
||||
}
|
||||
extracted += 1
|
||||
|
||||
return extracted
|
||||
|
||||
|
||||
# ── 核心提取 ───────────────────────────────────────────────────────────
|
||||
# ── Phase 1 核心入口 ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
||||
"""从 PDF 提取 Figure/Table 截图,生成 manifest。
|
||||
|
||||
用 pymupdf4llm layout analysis 检测 table/picture 区域,
|
||||
再通过 caption 文字确定编号,渲染为 JPEG。
|
||||
"""Phase 1: 从 PDF 提取 Figure/Table 截图,生成通用标签的 manifest。
|
||||
|
||||
Args:
|
||||
arxiv_id: 论文 ID
|
||||
@@ -526,45 +218,31 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
||||
if (images_dest / "manifest.json").exists():
|
||||
(images_dest / "manifest.json").unlink()
|
||||
|
||||
doc = pymupdf.open(str(pdf_path))
|
||||
with pymupdf.open(str(pdf_path)) as doc:
|
||||
extracted = 0
|
||||
manifest: dict[str, dict] = {}
|
||||
seen_labels: set[str] = set()
|
||||
|
||||
# layout analysis
|
||||
try:
|
||||
parsed = dl.parse_document(
|
||||
doc, filename=str(pdf_path), use_ocr=dl.OCRMode.NEVER
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"pymupdf4llm layout analysis failed for %s", arxiv_id, exc_info=True
|
||||
)
|
||||
doc.close()
|
||||
return 0
|
||||
|
||||
extracted = 0
|
||||
manifest: dict[str, dict] = {}
|
||||
seen_labels: set[str] = set()
|
||||
|
||||
for page_idx, page_layout in enumerate(parsed.pages):
|
||||
try:
|
||||
extracted += _process_page(
|
||||
doc,
|
||||
page_idx,
|
||||
page_layout,
|
||||
images_dest=images_dest,
|
||||
manifest=manifest,
|
||||
seen_labels=seen_labels,
|
||||
arxiv_id=arxiv_id,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Failed to process page %d for %s",
|
||||
page_idx + 1,
|
||||
arxiv_id,
|
||||
exc_info=True,
|
||||
)
|
||||
continue
|
||||
|
||||
doc.close()
|
||||
for page_idx in range(doc.page_count):
|
||||
try:
|
||||
page_boxes = detect_page_layout(doc[page_idx])
|
||||
extracted += _process_page(
|
||||
doc,
|
||||
page_idx,
|
||||
page_boxes,
|
||||
images_dest=images_dest,
|
||||
manifest=manifest,
|
||||
seen_labels=seen_labels,
|
||||
arxiv_id=arxiv_id,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Failed to process page %d for %s",
|
||||
page_idx + 1,
|
||||
arxiv_id,
|
||||
exc_info=True,
|
||||
)
|
||||
continue
|
||||
|
||||
# 保存 manifest
|
||||
manifest_path = images_dest / "manifest.json"
|
||||
@@ -580,78 +258,321 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
||||
return extracted
|
||||
|
||||
|
||||
# ── 按 summary 过滤 ────────────────────────────────────────────────────
|
||||
# ── Phase 2: 用 summary 的 figures ID 定位并重命名 ─────────────────────
|
||||
|
||||
|
||||
def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
|
||||
"""根据 summary 中的 figures 字段过滤提取的图片/表格。
|
||||
def _distance_text_to_box(rect: pymupdf.Rect, box: list[float]) -> float | None:
|
||||
"""计算搜索到的文本 rect 到 box 的距离。超出阈值返回 None。
|
||||
|
||||
用 manifest.json 中的 label 匹配,保留被 AI 总结引用的图片。
|
||||
判断逻辑:rect 中心与 box 的垂直距离 + 水平重叠检查。
|
||||
"""
|
||||
rect_cx = (rect.x0 + rect.x1) / 2
|
||||
rect_cy = (rect.y0 + rect.y1) / 2
|
||||
bx0, by0, bx1, by1 = box
|
||||
|
||||
# 水平重叠:rect 中心在 box 水平范围内(或接近)
|
||||
if not (bx0 - 20 <= rect_cx <= bx1 + 20):
|
||||
return None
|
||||
|
||||
# 垂直距离
|
||||
if rect_cy < by0:
|
||||
dist = by0 - rect_cy
|
||||
elif rect_cy > by1:
|
||||
dist = rect_cy - by1
|
||||
else:
|
||||
dist = 0
|
||||
|
||||
return dist if dist <= _LABEL_MATCH_DISTANCE else None
|
||||
|
||||
|
||||
def _search_variants(fig_id: str) -> list[str]:
|
||||
"""为 figure/table ID 生成搜索变体。
|
||||
|
||||
"Figure 1" → ["Figure 1", "Fig. 1", "Fig 1"]
|
||||
"Fig. 1" → ["Fig. 1", "Figure 1", "Fig 1"]
|
||||
"Table A1" → ["Table A1"]
|
||||
"""
|
||||
variants = [fig_id]
|
||||
|
||||
m = re.match(r"(Fig\.?|Figure)\s+(\d+.*)", fig_id, re.IGNORECASE)
|
||||
if m:
|
||||
num_part = m.group(2)
|
||||
variants.extend(
|
||||
[
|
||||
f"Figure {num_part}",
|
||||
f"Fig. {num_part}",
|
||||
f"Fig {num_part}",
|
||||
]
|
||||
)
|
||||
|
||||
# 去重保序
|
||||
seen = set()
|
||||
result = []
|
||||
for v in variants:
|
||||
if v not in seen:
|
||||
seen.add(v)
|
||||
result.append(v)
|
||||
return result
|
||||
|
||||
|
||||
def label_images_by_summary(
|
||||
arxiv_id: str,
|
||||
figures: list[dict],
|
||||
pdf_path: Path | None = None,
|
||||
) -> int:
|
||||
"""Phase 2: 用 summary 的 figures ID 在 PDF 中搜索定位,重命名图片。
|
||||
|
||||
对 summary 中的每个 figure/table ID:
|
||||
1. page.search_for(id) 在所有页面搜索文本位置
|
||||
2. 计算搜索位置与 manifest 中 box 坐标的距离
|
||||
3. 最近匹配 → 重命名文件、更新 manifest
|
||||
|
||||
Args:
|
||||
arxiv_id: 论文 ID
|
||||
figures: summary 的 figures 列表,每项含 id/caption/description 等
|
||||
pdf_path: PDF 路径
|
||||
|
||||
Returns:
|
||||
成功重命名的图片数量
|
||||
"""
|
||||
if not figures:
|
||||
return 0
|
||||
|
||||
images_dir = paper_dir(arxiv_id) / "images"
|
||||
manifest_path = images_dir / "manifest.json"
|
||||
|
||||
if not images_dir.exists() or not manifest_path.exists():
|
||||
if pdf_path is None:
|
||||
pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
|
||||
if not pdf_path.exists():
|
||||
return 0
|
||||
|
||||
all_files = [
|
||||
f for f in images_dir.iterdir() if f.suffix.lower() in (".png", ".jpg", ".jpeg")
|
||||
]
|
||||
if not all_files:
|
||||
images_dest = paper_dir(arxiv_id) / "images"
|
||||
manifest_path = images_dest / "manifest.json"
|
||||
if not manifest_path.exists():
|
||||
return 0
|
||||
|
||||
manifest: dict = json.loads(manifest_path.read_text(encoding="utf-8"))
|
||||
manifest: dict[str, dict] = json.loads(manifest_path.read_text(encoding="utf-8"))
|
||||
if not manifest:
|
||||
return 0
|
||||
|
||||
# 收集 summary 中引用的所有 Figure/Table ID(归一化)
|
||||
referenced_ids: set[str] = set()
|
||||
for fig in figures:
|
||||
fig_id = fig.get("id", "")
|
||||
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", fig_id, re.IGNORECASE)
|
||||
if m:
|
||||
referenced_ids.add(f"Figure {m.group(1)}")
|
||||
m2 = re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
|
||||
if m2:
|
||||
referenced_ids.add(f"Table {m2.group(1)}")
|
||||
# 构建候选列表:只对通用标签的条目做匹配
|
||||
candidates: dict[str, dict] = {} # filename → {page, box, ...}
|
||||
for fname, info in manifest.items():
|
||||
if "(p" in info.get("label", ""):
|
||||
candidates[fname] = info
|
||||
|
||||
if not referenced_ids:
|
||||
logger.warning("No valid figure/table IDs in summary for %s", arxiv_id)
|
||||
return len(all_files)
|
||||
if not candidates:
|
||||
return 0
|
||||
|
||||
# 根据 manifest 的 label 字段匹配
|
||||
keep_filenames: set[str] = set()
|
||||
for filename, info in manifest.items():
|
||||
label = info.get("label", "")
|
||||
if label in referenced_ids:
|
||||
keep_filenames.add(filename)
|
||||
with pymupdf.open(str(pdf_path)) as doc:
|
||||
# 收集所有匹配候选:(fig_id, fig_index, filename, distance)
|
||||
matches: list[tuple[str, int, str, float]] = []
|
||||
|
||||
for fig_idx, fig in enumerate(figures):
|
||||
fig_id = fig.get("id", "")
|
||||
if not fig_id:
|
||||
continue
|
||||
|
||||
# 生成搜索变体:Figure 1 / Fig. 1 / Fig 1 等
|
||||
search_terms = _search_variants(fig_id)
|
||||
|
||||
# 在所有页面搜索该文本(含变体)
|
||||
search_hits: list[tuple[int, pymupdf.Rect]] = [] # (page_num_1based, Rect)
|
||||
for page_idx in range(doc.page_count):
|
||||
page = doc[page_idx]
|
||||
seen_rects: set[tuple[float, float]] = set()
|
||||
for term in search_terms:
|
||||
for r in page.search_for(term):
|
||||
key = (round(r.x0, 1), round(r.y0, 1))
|
||||
if key not in seen_rects:
|
||||
seen_rects.add(key)
|
||||
search_hits.append((page_idx + 1, r))
|
||||
|
||||
if not search_hits:
|
||||
continue
|
||||
|
||||
# 对每个候选 manifest 条目,找最近的搜索命中
|
||||
for fname, info in candidates.items():
|
||||
box = info.get("box")
|
||||
if not box:
|
||||
continue
|
||||
manifest_page = info.get("page", 0)
|
||||
|
||||
best_dist: float | None = None
|
||||
for hit_page, rect in search_hits:
|
||||
# 只匹配同页面
|
||||
if hit_page != manifest_page:
|
||||
continue
|
||||
dist = _distance_text_to_box(rect, box)
|
||||
if dist is not None and (best_dist is None or dist < best_dist):
|
||||
best_dist = dist
|
||||
|
||||
if best_dist is not None:
|
||||
matches.append((fig_id, fig_idx, fname, best_dist))
|
||||
|
||||
if not matches:
|
||||
logger.info("No label matches for %s", arxiv_id)
|
||||
return 0
|
||||
|
||||
# 去冲突:按距离排序,每个 fig_id 和每个 filename 只匹配一次
|
||||
matches.sort(key=lambda x: x[3])
|
||||
used_fig_ids: set[int] = set()
|
||||
used_filenames: set[str] = set()
|
||||
renames: list[tuple[str, str, str]] = [] # (old_fname, new_fname, fig_id)
|
||||
|
||||
for fig_id, fig_idx, fname, dist in matches:
|
||||
if fig_idx in used_fig_ids or fname in used_filenames:
|
||||
continue
|
||||
for ref in info.get("figures", []) + info.get("tables", []):
|
||||
if ref in referenced_ids:
|
||||
keep_filenames.add(filename)
|
||||
used_fig_ids.add(fig_idx)
|
||||
used_filenames.add(fname)
|
||||
new_fname = f"{fig_id.replace(' ', '_').lower()}.jpg"
|
||||
renames.append((fname, new_fname, fig_id))
|
||||
|
||||
# 执行重命名
|
||||
labeled = 0
|
||||
new_manifest: dict[str, dict] = {}
|
||||
|
||||
for fname, info in manifest.items():
|
||||
if fname in used_filenames:
|
||||
continue
|
||||
# 未匹配的保持原样
|
||||
new_manifest[fname] = info
|
||||
|
||||
for old_fname, new_fname, fig_id in renames:
|
||||
old_path = images_dest / old_fname
|
||||
new_path = images_dest / new_fname
|
||||
if not old_path.exists():
|
||||
continue
|
||||
|
||||
# 搬运 manifest 信息
|
||||
info = manifest[old_fname].copy()
|
||||
cap_type = info.get("type", "figure")
|
||||
|
||||
# 读取 caption 文本(从 figures 列表)
|
||||
caption_text = ""
|
||||
for fig in figures:
|
||||
if fig.get("id") == fig_id:
|
||||
caption_text = fig.get("caption", "")
|
||||
break
|
||||
|
||||
if not keep_filenames:
|
||||
logger.warning(
|
||||
"No manifest matches for %s (refs=%s), keeping all",
|
||||
arxiv_id,
|
||||
referenced_ids,
|
||||
info["label"] = fig_id
|
||||
info["caption_text"] = caption_text[:200] if caption_text else ""
|
||||
info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
|
||||
fig_id
|
||||
)
|
||||
return len(all_files)
|
||||
|
||||
removed = 0
|
||||
for f in all_files:
|
||||
if f.name not in keep_filenames:
|
||||
f.unlink()
|
||||
removed += 1
|
||||
# 重命名文件
|
||||
if new_fname != old_fname:
|
||||
old_path.rename(new_path)
|
||||
new_manifest[new_fname] = info
|
||||
labeled += 1
|
||||
|
||||
# 写回 manifest
|
||||
manifest_path.write_text(json.dumps(new_manifest, ensure_ascii=False, indent=2))
|
||||
|
||||
kept = len(all_files) - removed
|
||||
logger.info(
|
||||
"Filtered images for %s: kept %d, removed %d (refs=%s)",
|
||||
"Labeled %d/%d images for %s using summary figures",
|
||||
labeled,
|
||||
len(manifest),
|
||||
arxiv_id,
|
||||
kept,
|
||||
removed,
|
||||
referenced_ids,
|
||||
)
|
||||
return kept
|
||||
return labeled
|
||||
|
||||
|
||||
# ── Figure ↔ Image 关联 ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _normalize_figure_id(raw_id: str) -> str:
|
||||
"""归一化 Figure/Table ID:'Figure 1'/'Fig.1' → 'Figure 1'。"""
|
||||
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
|
||||
if m:
|
||||
return f"Figure {m.group(1)}"
|
||||
m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
|
||||
if m2:
|
||||
return f"Table {m2.group(1)}"
|
||||
return raw_id
|
||||
|
||||
|
||||
def _is_figure_type(fig_id: str) -> bool:
|
||||
"""判断是否为 Figure 类型(非 Table)。"""
|
||||
return not re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
|
||||
|
||||
|
||||
def _image_sort_key(name: str) -> tuple[int, int]:
|
||||
"""按文件名中的编号排序提取的图片。"""
|
||||
# 新格式:figure_1.jpg, table_1.jpg
|
||||
m = re.search(r"(?:figure|table)_(\d+)", name)
|
||||
if m:
|
||||
return (0, int(m.group(1)))
|
||||
# 旧格式:page2_img1.png, page5_table1.png, figure_1.png
|
||||
m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
|
||||
if m2:
|
||||
return (int(m2.group(1)), int(m2.group(2)))
|
||||
return (0, 0)
|
||||
|
||||
|
||||
def link_figures_with_images(
    figures: list[dict], images: list[dict], arxiv_id: str
) -> list[dict]:
    """Attach an ``image_url`` to the summary figure entries.

    Strategy:
    1. Match by ID against ``manifest.json``. A figure's ID is looked up
       exactly as written first (manifest labels may hold the raw summary ID,
       e.g. "Fig. 1" or "Figure 2a"), then in normalized form, then against
       normalized aliases of plain "prefix + number" manifest IDs.
    2. Figures still unmatched fall back to ordinal assignment: the N-th
       unmatched figure gets the N-th extracted figure image, and likewise for
       tables.

    Args:
        figures: summary figure dicts; ``id`` is read and ``image_url`` is set
            in place on matched entries.
        images: extracted image dicts providing ``name`` and ``url``.
        arxiv_id: paper ID, used to locate the manifest and build URLs.

    Returns:
        The same ``figures`` list (mutated in place). It is returned untouched
        when either ``figures`` or ``images`` is empty.
    """
    if not figures or not images:
        return figures

    manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"

    # ── Strategy 1: ID match through the manifest ──
    id_to_url: dict[str, str] = {}
    if manifest_path.exists():
        try:
            manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
        except (OSError, ValueError, TypeError):
            # Unreadable or malformed manifest: fall through to strategy 2.
            manifest = {}
        if not isinstance(manifest, dict):
            manifest = {}
        for filename, info in manifest.items():
            if not isinstance(info, dict):
                continue
            url = f"/papers/{arxiv_id}/images/{filename}"
            # The label field takes priority.
            label = info.get("label", "")
            if label:
                id_to_url[label] = url
            # The figures/tables lists are honoured too, without overriding labels.
            for fig_id in info.get("figures", []) + info.get("tables", []):
                if fig_id not in id_to_url:
                    id_to_url[fig_id] = url

    # Normalized aliases, e.g. manifest "Fig. 1" is reachable as "Figure 1".
    # Restricted to plain "prefix + number" IDs so that "Figure 1a" does not
    # masquerade as "Figure 1". Kept separate so exact IDs always win.
    canonical_to_url: dict[str, str] = {}
    for known_id, url in id_to_url.items():
        if isinstance(known_id, str) and re.fullmatch(
            r"(?:Fig\.?|Figure|Table)\s*\d+", known_id.strip(), re.IGNORECASE
        ):
            canonical_to_url.setdefault(_normalize_figure_id(known_id.strip()), url)

    for fig in figures:
        raw_id = fig.get("id") or ""
        normalized = _normalize_figure_id(raw_id)
        url = (
            id_to_url.get(raw_id)
            or id_to_url.get(normalized)
            or canonical_to_url.get(normalized)
        )
        if url:
            fig["image_url"] = url

    # ── Strategy 2: ordinal fallback (when the manifest gave no match) ──
    unmatched = [f for f in figures if not f.get("image_url")]
    if not unmatched:
        return figures

    # Split the unmatched entries by type: Figure vs Table.
    fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
    table_type_unmatched = [
        f for f in unmatched if not _is_figure_type(f.get("id", ""))
    ]

    # Split the extracted images by type and order them by the numbers in their names.
    fig_images = sorted(
        [img for img in images if "table" not in img["name"].lower()],
        key=lambda img: _image_sort_key(img["name"]),
    )
    table_images = sorted(
        [img for img in images if "table" in img["name"].lower()],
        key=lambda img: _image_sort_key(img["name"]),
    )

    # zip stops at the shorter sequence, so surplus figures simply stay unlinked.
    for fig, img in zip(fig_type_unmatched, fig_images):
        fig["image_url"] = img["url"]

    for fig, img in zip(table_type_unmatched, table_images):
        fig["image_url"] = img["url"]

    return figures
|
||||
|
||||
Reference in New Issue
Block a user