feat: improve PDF image extraction with caption-based labeling and fallback matching

- Enhance pdf_image_extractor with caption text extraction near images/tables
- Add figure/table type correction based on caption content
- Implement sequential numbering fallback for unmatched items
- Improve figure linking in pages with manifest ID matching and fallback strategies
- Remove docling dependency, add dev dependency group
This commit is contained in:
2026-06-09 14:07:21 +08:00
parent 32978b3fc5
commit 18f44ac244
4 changed files with 343 additions and 1593 deletions
+69 -21
View File
@@ -273,38 +273,86 @@ def _link_figures_with_images(
) -> list[dict]:
"""将 summary figures 元数据与提取的图片文件关联。
通过 manifest.json 中的 figure ID 匹配,给每个 figure 加上 image_url。
策略:
1. 优先用 manifest.json 的 label 做 ID 精确匹配
2. 未匹配的 figure 用序号兜底:第 N 个 Figure → 第 N 张提取图
"""
if not figures or not images:
return figures
manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
if not manifest_path.exists():
return figures
try:
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
except (ValueError, TypeError):
return figures
# 构建 figure_id -> image_url 的映射
# ── 策略 1:manifest ID 精确匹配 ──
id_to_url: dict[str, str] = {}
for filename, info in manifest.items():
url = f"/papers/{arxiv_id}/images/{filename}"
for fig_id in info.get("figures", []) + info.get("tables", []):
id_to_url[fig_id] = url
if manifest_path.exists():
try:
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
except (ValueError, TypeError):
manifest = {}
for filename, info in manifest.items():
url = f"/papers/{arxiv_id}/images/{filename}"
# 优先用 label 字段(新格式)
label = info.get("label", "")
if label:
id_to_url[label] = url
# 也兼容 figures/tables 列表(旧格式)
for fig_id in info.get("figures", []) + info.get("tables", []):
if fig_id not in id_to_url:
id_to_url[fig_id] = url
# 归一化 summary figures 的 ID
for fig in figures:
raw_id = fig.get("id", "")
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
if m:
normalized = f"Figure {m.group(1)}"
else:
m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
normalized = f"Table {m2.group(1)}" if m2 else raw_id
normalized = _normalize_figure_id(raw_id)
if normalized in id_to_url:
fig["image_url"] = id_to_url[normalized]
# ── 策略 2:序号兜底(manifest 匹配不到时) ──
unmatched = [f for f in figures if not f.get("image_url")]
if not unmatched:
return figures
# 按类型分流:Figure vs Table
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
# 提取的图片也按类型分流,按文件名排序
def _sort_key(name: str) -> tuple[int, int]:
m = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
if m:
return (int(m.group(1)), int(m.group(2)))
return (0, 0)
fig_images = sorted(
[img for img in images if "table" not in img["name"].lower()],
key=lambda img: _sort_key(img["name"]),
)
table_images = sorted(
[img for img in images if "table" in img["name"].lower()],
key=lambda img: _sort_key(img["name"]),
)
for i, fig in enumerate(fig_type_unmatched):
if i < len(fig_images):
fig["image_url"] = fig_images[i]["url"]
for i, fig in enumerate(table_type_unmatched):
if i < len(table_images):
fig["image_url"] = table_images[i]["url"]
return figures
def _normalize_figure_id(raw_id: str) -> str:
"""归一化 Figure/Table ID'Figure 1'/'Fig.1''Figure 1'"""
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
if m:
return f"Figure {m.group(1)}"
m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
if m2:
return f"Table {m2.group(1)}"
return raw_id
def _is_figure_type(fig_id: str) -> bool:
"""判断是否为 Figure 类型(非 Table)。"""
return not re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)