658 lines
22 KiB
Python
658 lines
22 KiB
Python
"""PDF 图片与表格提取 — 基于 pymupdf4llm layout analysis。
|
||
|
||
用 pymupdf4llm 的 layout analysis 检测 table / picture 区域,
|
||
再通过 caption 文字匹配确定 Figure/Table 编号,渲染为 JPEG。
|
||
|
||
相比旧方案(caption 正则 + pdfplumber/find_tables/文本块扫描三套策略):
|
||
- layout analysis 直接给出区域 bbox,不存在相邻表格互相侵入的问题
|
||
- 无需手动调参(最大高度、间隙阈值等)
|
||
- 页面级 caption 匹配:每个 caption 只分配给最近的 box,避免上下相邻表格抢夺同一个 caption
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import logging
|
||
import re
|
||
from pathlib import Path
|
||
|
||
import pymupdf
|
||
import pymupdf4llm.helpers.document_layout as dl
|
||
|
||
from app.services.pdf_downloader import paper_dir
|
||
from app.utils import TMP_DIR
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# ── Caption regexes ───────────────────────────────────────────────────

# Used to extract the Figure/Table number from caption text.
# After the number, the pattern requires either ":" / "." or whitespace
# followed by an uppercase letter; the (?-i:...) group keeps that uppercase
# check case-sensitive even though the pattern as a whole ignores case.
_FIGURE_CAPTION_RE = re.compile(
    r"^(?:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
    re.IGNORECASE,
)
_TABLE_CAPTION_RE = re.compile(
    r"^Table\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
    re.IGNORECASE,
)

# Maximum distance (in points) at which a caption is matched to a table/picture box
_CAPTION_MATCH_DISTANCE = 100
# Outer margin added around a rendered region
_REGION_PADDING = 5
# 3x rendering, to keep the screenshots sharp
_RENDER_ZOOM = 3
# Clustering gap (in points) for adjacent boxes — fragments of the same
# figure/table are usually less than 15pt apart
_CLUSTER_GAP = 15
|
||
|
||
|
||
# ── Box 聚类 ─────────────────────────────────────────────────────────
|
||
|
||
|
||
class _BoxCluster:
|
||
"""合并后的布局区域(由一个或多个相邻 LayoutBox 组成)。
|
||
|
||
pymupdf4llm 有时将一个大图拆成多个小 picture box(如视频帧网格),
|
||
聚类后用整体 bbox 作为渲染区域。
|
||
"""
|
||
|
||
__slots__ = ("x0", "y0", "x1", "y1", "boxclass")
|
||
|
||
def __init__(self, boxes: list):
|
||
self.x0 = min(b.x0 for b in boxes)
|
||
self.y0 = min(b.y0 for b in boxes)
|
||
self.x1 = max(b.x1 for b in boxes)
|
||
self.y1 = max(b.y1 for b in boxes)
|
||
# table-fallback 归一化为 table(layout model 检测到表格但无法提取结构)
|
||
raw = boxes[0].boxclass
|
||
self.boxclass = "table" if raw == "table-fallback" else raw
|
||
|
||
|
||
def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
    """Merge neighbouring boxes of the same class into clusters.

    Boxes that share a ``boxclass`` and lie within ``gap`` of each other are
    joined with a union-find structure; each resulting set becomes one
    ``_BoxCluster`` carrying the bounding box of its members.
    """
    if not boxes:
        return []

    total = len(boxes)
    root_of = list(range(total))

    def root(node: int) -> int:
        # Walk up to the representative, shortening the path on the way.
        while root_of[node] != node:
            root_of[node] = root_of[root_of[node]]
            node = root_of[node]
        return node

    def adjacent(first, second) -> bool:
        # Axis gaps are clamped to 0 when the boxes overlap on that axis.
        gap_x = max(0.0, max(first.x0, second.x0) - min(first.x1, second.x1))
        gap_y = max(0.0, max(first.y0, second.y0) - min(first.y1, second.y1))
        near_x = first.x1 > second.x0 - gap and second.x1 > first.x0 - gap
        near_y = first.y1 > second.y0 - gap and second.y1 > first.y0 - gap
        # Close on one axis while (nearly) overlapping on the other.
        return (gap_x <= gap and near_y) or (gap_y <= gap and near_x)

    for idx_a, box_a in enumerate(boxes):
        for idx_b in range(idx_a + 1, total):
            box_b = boxes[idx_b]
            if box_a.boxclass != box_b.boxclass:
                continue
            if adjacent(box_a, box_b):
                root_a, root_b = root(idx_a), root(idx_b)
                if root_a != root_b:
                    root_of[root_a] = root_b

    # Bucket the boxes by representative, preserving input order.
    members_by_root: dict[int, list] = {}
    for idx, box in enumerate(boxes):
        members_by_root.setdefault(root(idx), []).append(box)

    return [_BoxCluster(members) for members in members_by_root.values()]
|
||
|
||
|
||
# ── 页面级 Caption 查找与匹配 ──────────────────────────────────────────
|
||
|
||
|
||
def _find_page_captions(page) -> list[dict]:
    """Collect every text block on the page whose first line is a Figure/Table caption.

    Each result is a dict with the normalised label ("Figure N" / "Table N"),
    the caption type ("figure" / "table"), the full block text and the block's
    bounding coordinates.
    """
    found = []
    for block in page.get_text("blocks"):
        if len(block) < 5:
            continue
        left, top, right, bottom = block[:4]
        body = str(block[4]).strip()
        heading = body.split("\n")[0].strip()

        # Table captions are tried first, figure captions second.
        kind = "table"
        hit = _TABLE_CAPTION_RE.match(heading)
        if hit is None:
            kind = "figure"
            hit = _FIGURE_CAPTION_RE.match(heading)
        if hit is None:
            continue

        prefix = "Table" if kind == "table" else "Figure"
        entry = {
            "label": f"{prefix} {hit.group(1)}",
            "type": kind,
            "caption_text": body,
            "caption_y0": top,
            "caption_y1": bottom,
            "caption_x0": left,
            "caption_x1": right,
        }
        found.append(entry)
    return found
|
||
|
||
|
||
def _vertical_distance(cap_y0, cap_y1, box_y0, box_y1) -> float | None:
|
||
"""计算 caption 到 box 的垂直距离。不邻接时返回 None。
|
||
|
||
三种情况:caption 完全在 box 上方、完全在下方、与 box 有垂直重叠。
|
||
重叠(含部分溢出)视为 distance=0,确保 caption 延伸到 box 边界外时不会丢失。
|
||
"""
|
||
# Caption 完全在 box 上方
|
||
if cap_y1 <= box_y0:
|
||
dist = box_y0 - cap_y1
|
||
return dist if dist <= _CAPTION_MATCH_DISTANCE else None
|
||
# Caption 完全在 box 下方
|
||
if cap_y0 >= box_y1:
|
||
dist = cap_y0 - box_y1
|
||
return dist if dist <= _CAPTION_MATCH_DISTANCE else None
|
||
# Caption 与 box 有垂直重叠(内部、部分溢出都算)→ 距离 0
|
||
return 0
|
||
|
||
|
||
def _same_column(cap: dict, box, page_width: float) -> bool:
|
||
"""判断 caption 和 box 是否在同一列。
|
||
|
||
双栏论文中左右栏间距有限,简单的水平重叠检查会跨列匹配。
|
||
策略:用中心 X 坐标判断各自在哪半边,只有同半边才算同列。
|
||
跨栏图表(caption 或 box 宽度 >65% 页宽)不受此限制。
|
||
"""
|
||
cap_w = cap["caption_x1"] - cap["caption_x0"]
|
||
box_w = box.x1 - box.x0
|
||
|
||
# 跨栏元素:宽度超过页面的 65%
|
||
if cap_w > page_width * 0.65 or box_w > page_width * 0.65:
|
||
return True
|
||
|
||
cap_cx = (cap["caption_x0"] + cap["caption_x1"]) / 2
|
||
box_cx = (box.x0 + box.x1) / 2
|
||
mid = page_width / 2
|
||
|
||
# 同在左半边或同在右半边
|
||
return (cap_cx < mid) == (box_cx < mid)
|
||
|
||
|
||
def _match_captions_to_boxes(
    page_boxes: list, captions: list[dict], page_width: float
) -> list[tuple[list[int], list[dict]]]:
    """Assign captions to boxes; one caption may match several boxes.

    Typical scenarios:
    - A figure consists of a left and a right picture box and the caption is
      close to both.
    - The visual content of a table was classified as a picture by the layout
      analysis, so a cross-type match is needed.

    Args:
        page_boxes: Candidate boxes; each must expose x0/y0/x1/y1 and boxclass.
        captions: Caption dicts with "type", "label" and "caption_x0/x1/y0/y1" keys.
        page_width: Page width, forwarded to the same-column check.

    Returns:
        [(box_indices, captions), ...] where each entry is one independent
        render task. Captions without any matching box are omitted.
    """
    # For each caption, find every box within the distance threshold.
    # Same-type boxes are preferred; if none exist, any table/picture box is used.
    cap_to_boxes: dict[int, list[tuple[int, float]]] = {}

    for ci, cap in enumerate(captions):
        same_type: list[tuple[int, float]] = []
        any_type: list[tuple[int, float]] = []
        expected = "table" if cap["type"] == "table" else "picture"

        for bi, box in enumerate(page_boxes):
            # Column awareness: in two-column papers only match boxes in the same column
            if not _same_column(cap, box, page_width):
                continue
            # Horizontal overlap check (still required within a column), with 5pt tolerance
            if not (
                cap["caption_x1"] > box.x0 - 5 and cap["caption_x0"] < box.x1 + 5
            ):
                continue
            dist = _vertical_distance(
                cap["caption_y0"], cap["caption_y1"], box.y0, box.y1
            )
            if dist is None:
                continue
            entry = (bi, dist)
            any_type.append(entry)
            if box.boxclass == expected:
                same_type.append(entry)

        # Prefer same-type matches; fall back to any type; skip if there are none
        if same_type:
            cap_to_boxes[ci] = same_type
        elif any_type:
            cap_to_boxes[ci] = any_type
        # else: this caption has no matching box and is not added to cap_to_boxes

    # Each caption → its nearest box (used for grouping), while all matched boxes are recorded
    cap_primary: dict[int, int] = {}  # caption → primary box index
    cap_all_boxes: dict[int, list[int]] = {}  # caption → all matched box indices
    for ci, matches in cap_to_boxes.items():
        matches.sort(key=lambda x: x[1])
        cap_primary[ci] = matches[0][0]
        # Boxes almost as close as the nearest one (within 20pt of the best
        # distance) are treated as part of the same group
        best_dist = matches[0][1]
        cap_all_boxes[ci] = [bi for bi, d in matches if d <= best_dist + 20]

    # Group captions by their primary box
    box_to_caps: dict[int, list[int]] = {}
    for ci, bi in cap_primary.items():
        box_to_caps.setdefault(bi, []).append(ci)

    # Build render groups: each caption forms its own group (boxes may be
    # shared, but each group is rendered separately).
    # Captions with an identical label are merged; all others stay separate.
    used_captions: set[int] = set()
    groups: list[tuple[list[int], list[dict]]] = []

    for bi in sorted(box_to_caps.keys()):
        cis = box_to_caps[bi]
        for ci in cis:
            if ci in used_captions:
                continue
            used_captions.add(ci)

            all_box_indices = set(cap_all_boxes.get(ci, [bi]))
            # Only merge captions with the same label (duplicate captions of the same figure/table)
            merged_captions = [captions[ci]]
            for other_bi in all_box_indices:
                if other_bi in box_to_caps:
                    for other_ci in box_to_caps[other_bi]:
                        if other_ci not in used_captions:
                            other_cap = captions[other_ci]
                            if other_cap["label"] == captions[ci]["label"]:
                                used_captions.add(other_ci)
                                merged_captions.append(other_cap)
            groups.append((sorted(all_box_indices), merged_captions))

    return groups
|
||
|
||
|
||
# ── 单页处理 ─────────────────────────────────────────────────────────
|
||
|
||
|
||
def _render_and_save(
    page,
    clip: pymupdf.Rect,
    images_dest: Path,
    manifest: dict,
    label: str,
    cap_type: str,
    caption_text: str,
    page_num_1based: int,
    arxiv_id: str,
) -> bool:
    """Render a page region, save it as a JPEG and record it in the manifest.

    Args:
        page: Page object exposing ``get_pixmap``.
        clip: Region of the page to render.
        images_dest: Directory the JPEG is written to.
        manifest: Mapping of filename → metadata; updated in place on success.
        label: Label such as "Figure 3"; also determines the file name
            (spaces become underscores, lower-cased, ``.jpg`` suffix).
        cap_type: "figure" or anything else (treated as a table) — selects
            whether the manifest entry gets a "figures" or a "tables" list.
        caption_text: Caption text; at most the first 200 characters are stored.
        page_num_1based: 1-based page number stored in the manifest.
        arxiv_id: Paper ID, used in log messages only.

    Returns:
        True when the image was written and the manifest updated, False when
        rendering the region failed.
    """
    mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
    try:
        pix = page.get_pixmap(matrix=mat, clip=clip)
    except Exception:
        # Best-effort: a region that cannot be rendered is skipped, but the
        # traceback is kept in the log so the failure can be diagnosed.
        logger.debug(
            "Failed to render %s for %s", label, arxiv_id, exc_info=True
        )
        return False

    filename = f"{label.replace(' ', '_').lower()}.jpg"
    (images_dest / filename).write_bytes(pix.tobytes("jpeg"))

    manifest[filename] = {
        "page": page_num_1based,
        "type": cap_type,
        "label": label,
        "caption_text": caption_text[:200] if caption_text else "",
        "figures" if cap_type == "figure" else "tables": [label],
    }
    logger.debug(
        "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) → %s",
        label,
        page_num_1based,
        clip.x0,
        clip.y0,
        clip.x1,
        clip.y1,
        filename,
    )
    return True
|
||
|
||
|
||
def _process_page(
    doc,
    page_idx: int,
    page_layout,
    images_dest: Path,
    manifest: dict,
    seen_labels: set,
    arxiv_id: str,
) -> int:
    """Process one page: caption matching plus an orphan fallback.

    Args:
        doc: Opened document; indexed with ``page_idx`` to get the page.
        page_idx: 0-based page index.
        page_layout: Layout result for this page; its ``boxes`` are scanned.
        images_dest: Directory the JPEGs are written to.
        manifest: Mapping of filename → metadata; updated in place.
        seen_labels: Labels already handled (shared across pages by the
            caller); updated in place so each label is rendered only once.
        arxiv_id: Paper ID, forwarded for logging.

    Returns:
        Number of images extracted from this page.
    """
    page = doc[page_idx]
    page_width = page.rect.width
    page_num = page_idx + 1
    orphan_fig_counter = 0
    orphan_tbl_counter = 0

    # Collect this page's table/picture boxes (skipping tiny regions under 20pt in either dimension)
    raw_boxes = []
    for box in page_layout.boxes:
        if box.boxclass not in ("table", "table-fallback", "picture"):
            continue
        if (box.x1 - box.x0) < 20 or (box.y1 - box.y0) < 20:
            continue
        raw_boxes.append(box)

    if not raw_boxes:
        return 0

    # Clustering: merge the fragment boxes of one figure/table
    page_boxes = _cluster_boxes(raw_boxes)

    # Page-level matching: find all captions and assign them to boxes
    captions = _find_page_captions(page)
    groups = _match_captions_to_boxes(page_boxes, captions, page_width)

    # Only merge groups that share a label (duplicate captions of the same
    # figure/table). Groups with different labels are not merged even if they
    # share a box (e.g. Figure 7 and Figure 8).
    # NOTE(review): the original comment says the region is split by caption
    # position at render time, but no such splitting is visible in this
    # function — confirm whether that is still intended.
    _merged_groups: set[int] = set()
    merged_groups: list[tuple[list[int], list[dict]]] = []
    for gi, (box_indices, caps) in enumerate(groups):
        if gi in _merged_groups:
            continue
        this_labels = {c["label"] for c in caps}
        all_box_set = set(box_indices)
        merge_targets = {gi}
        for other_gi, (other_bi, other_caps) in enumerate(groups):
            if other_gi <= gi or other_gi in _merged_groups:
                continue
            other_labels = {c["label"] for c in other_caps}
            # Merge only when the labels intersect (same figure/table) and a box is shared
            if this_labels & other_labels and all_box_set & set(other_bi):
                merge_targets.add(other_gi)
                all_box_set |= set(other_bi)
        all_caps = []
        for mgi in sorted(merge_targets):
            _merged_groups.add(mgi)
            all_caps.extend(groups[mgi][1])
        merged_groups.append((sorted(all_box_set), all_caps))
    groups = merged_groups

    # ── Phase 1: render figures/tables that have a caption match ──
    matched_box_indices: set[int] = set()
    extracted = 0

    for box_indices, caps in groups:
        # Boxes count as matched even if every caption below is skipped as already seen
        matched_box_indices.update(box_indices)

        # De-duplicate by label and skip labels that were already processed
        unique_caps = []
        for cap in caps:
            if cap["label"] not in seen_labels:
                seen_labels.add(cap["label"])
                unique_caps.append(cap)
        if not unique_caps:
            continue

        # Union of the bboxes of all associated boxes
        bx0 = min(page_boxes[i].x0 for i in box_indices)
        by0 = min(page_boxes[i].y0 for i in box_indices)
        bx1 = max(page_boxes[i].x1 for i in box_indices)
        by1 = max(page_boxes[i].y1 for i in box_indices)

        # Render region: boxes + captions
        all_cap_y0 = min(c["caption_y0"] for c in unique_caps)
        all_cap_y1 = max(c["caption_y1"] for c in unique_caps)
        all_cap_x0 = min(c["caption_x0"] for c in unique_caps)
        all_cap_x1 = max(c["caption_x1"] for c in unique_caps)

        # Left, top and right are clamped to the page; the bottom edge is not clamped here
        top = max(0, min(by0, all_cap_y0) - _REGION_PADDING)
        bottom = max(by1, all_cap_y1) + _REGION_PADDING
        rx0 = max(0, min(bx0, all_cap_x0) - _REGION_PADDING)
        rx1 = min(page_width, max(bx1, all_cap_x1) + _REGION_PADDING)

        clip = pymupdf.Rect(rx0, top, rx1, bottom)
        # Several captions may share the same region (e.g. subfigures); render only once
        jpeg_bytes = None
        for cap in unique_caps:
            if jpeg_bytes is None:
                if not _render_and_save(
                    page,
                    clip,
                    images_dest,
                    manifest,
                    cap["label"],
                    cap["type"],
                    cap["caption_text"],
                    page_num,
                    arxiv_id,
                ):
                    # Rendering failed: give up on the remaining captions of this group
                    break
                # Read back the bytes just written so later captions of this group can reuse them
                filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
                jpeg_bytes = (images_dest / filename).read_bytes()
                extracted += 1
            else:
                # A different caption for the same region (e.g. subfigure): reuse the image
                filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
                (images_dest / filename).write_bytes(jpeg_bytes)
                cap_preview = cap["caption_text"][:200]
                manifest[filename] = {
                    "page": page_num,
                    "type": cap["type"],
                    "label": cap["label"],
                    "caption_text": cap_preview,
                    "figures" if cap["type"] == "figure" else "tables": [cap["label"]],
                }
                extracted += 1

    # ── Phase 2: render figures/tables without a caption match (orphan boxes) ──
    orphan_indices = set(range(len(page_boxes))) - matched_box_indices
    for bi in sorted(orphan_indices):
        box = page_boxes[bi]
        cap_type = "figure" if box.boxclass == "picture" else "table"

        # Orphans get a synthetic label built from the page number and a per-page counter
        if cap_type == "figure":
            orphan_fig_counter += 1
            label = f"Figure (p{page_num}-{orphan_fig_counter})"
        else:
            orphan_tbl_counter += 1
            label = f"Table (p{page_num}-{orphan_tbl_counter})"

        if label in seen_labels:
            continue
        seen_labels.add(label)

        clip = pymupdf.Rect(
            max(0, box.x0 - _REGION_PADDING),
            max(0, box.y0 - _REGION_PADDING),
            min(page_width, box.x1 + _REGION_PADDING),
            box.y1 + _REGION_PADDING,
        )
        if _render_and_save(
            page,
            clip,
            images_dest,
            manifest,
            label,
            cap_type,
            "",
            page_num,
            arxiv_id,
        ):
            extracted += 1

    return extracted
|
||
|
||
|
||
# ── 核心提取 ───────────────────────────────────────────────────────────
|
||
|
||
|
||
def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    """Extract Figure/Table screenshots from a PDF and write a manifest.

    pymupdf4llm layout analysis detects the table/picture regions; caption
    text then determines the numbering, and each region is rendered as a JPEG.
    Image files and the manifest left by a previous extraction are removed
    before the PDF is processed.

    Args:
        arxiv_id: Paper ID.
        pdf_path: Path of the PDF; defaults to ``TMP_DIR / arxiv_id / "paper.pdf"``.

    Returns:
        Number of images extracted; 0 when the PDF is missing or the layout
        analysis fails.
    """
    if pdf_path is None:
        pdf_path = TMP_DIR / arxiv_id / "paper.pdf"

    if not pdf_path.exists():
        logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path)
        return 0

    images_dest = paper_dir(arxiv_id) / "images"
    images_dest.mkdir(parents=True, exist_ok=True)
    manifest_path = images_dest / "manifest.json"

    # Remove the images and manifest of a previous extraction
    for old_file in images_dest.iterdir():
        if old_file.suffix.lower() in (".png", ".jpg", ".jpeg"):
            old_file.unlink()
    if manifest_path.exists():
        manifest_path.unlink()

    extracted = 0
    manifest: dict[str, dict] = {}
    seen_labels: set[str] = set()

    doc = pymupdf.open(str(pdf_path))
    try:
        # Layout analysis
        try:
            parsed = dl.parse_document(
                doc, filename=str(pdf_path), use_ocr=dl.OCRMode.NEVER
            )
        except Exception:
            logger.warning(
                "pymupdf4llm layout analysis failed for %s", arxiv_id, exc_info=True
            )
            return 0

        for page_idx, page_layout in enumerate(parsed.pages):
            try:
                extracted += _process_page(
                    doc,
                    page_idx,
                    page_layout,
                    images_dest=images_dest,
                    manifest=manifest,
                    seen_labels=seen_labels,
                    arxiv_id=arxiv_id,
                )
            except Exception:
                # One bad page must not abort the whole paper
                logger.warning(
                    "Failed to process page %d for %s",
                    page_idx + 1,
                    arxiv_id,
                    exc_info=True,
                )
                continue
    finally:
        # Always release the document, whichever path leaves the block
        doc.close()

    # Save the manifest. The JSON keeps non-ASCII characters (ensure_ascii=False),
    # so the encoding is pinned to UTF-8 to match how the manifest is read back
    # instead of depending on the platform's default encoding.
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    if extracted > 0:
        logger.info(
            "Extracted %d figure/table screenshots from PDF for %s",
            extracted,
            arxiv_id,
        )

    return extracted
|
||
|
||
|
||
# ── 按 summary 过滤 ────────────────────────────────────────────────────
|
||
|
||
|
||
def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
    """Filter the extracted images/tables by the ``figures`` field of a summary.

    Entries of ``manifest.json`` are matched by label, and only images that
    the summary references are kept; all other image files are deleted.

    Args:
        arxiv_id: Paper ID.
        figures: Summary entries; each entry's "id" (e.g. "Figure 3",
            "Fig. 2", "Table 1") names a referenced figure or table. Entries
            that are not dicts, or whose "id" is missing or not a string,
            are ignored.

    Returns:
        Number of image files remaining. 0 when ``figures`` is empty or when
        the images directory, the manifest or the image files are missing.
        When no usable ID is found, or nothing in the manifest matches,
        nothing is deleted and the count of all image files is returned.
    """
    if not figures:
        return 0

    images_dir = paper_dir(arxiv_id) / "images"
    manifest_path = images_dir / "manifest.json"

    if not images_dir.exists() or not manifest_path.exists():
        return 0

    all_files = [
        f for f in images_dir.iterdir() if f.suffix.lower() in (".png", ".jpg", ".jpeg")
    ]
    if not all_files:
        return 0

    manifest: dict = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Collect every Figure/Table ID referenced by the summary (normalised
    # to "Figure N" / "Table N")
    figure_id_re = re.compile(r"(?:Fig\.?|Figure)\s*(\d+)", re.IGNORECASE)
    table_id_re = re.compile(r"Table\s*(\d+)", re.IGNORECASE)
    referenced_ids: set[str] = set()
    for fig in figures:
        raw_id = fig.get("id") if isinstance(fig, dict) else None
        if not isinstance(raw_id, str):
            # A null or non-string id would make re.match raise TypeError;
            # skip the malformed entry instead of failing the whole filter.
            continue
        fig_id = raw_id.strip()
        m = figure_id_re.match(fig_id)
        if m:
            referenced_ids.add(f"Figure {m.group(1)}")
        m2 = table_id_re.match(fig_id)
        if m2:
            referenced_ids.add(f"Table {m2.group(1)}")

    if not referenced_ids:
        logger.warning("No valid figure/table IDs in summary for %s", arxiv_id)
        return len(all_files)

    # Match on the manifest's label field, then on its figures/tables lists
    keep_filenames: set[str] = set()
    for filename, info in manifest.items():
        label = info.get("label", "")
        if label in referenced_ids:
            keep_filenames.add(filename)
            continue
        for ref in info.get("figures", []) + info.get("tables", []):
            if ref in referenced_ids:
                keep_filenames.add(filename)
                break

    if not keep_filenames:
        logger.warning(
            "No manifest matches for %s (refs=%s), keeping all",
            arxiv_id,
            referenced_ids,
        )
        return len(all_files)

    # NOTE(review): manifest.json is not rewritten here, so it keeps entries
    # for the files deleted below — confirm that consumers tolerate this.
    removed = 0
    for f in all_files:
        if f.name not in keep_filenames:
            f.unlink()
            removed += 1

    kept = len(all_files) - removed
    logger.info(
        "Filtered images for %s: kept %d, removed %d (refs=%s)",
        arxiv_id,
        kept,
        removed,
        referenced_ids,
    )
    return kept
|