feat: improve PDF image extraction with caption-based labeling and fallback matching
- Enhance pdf_image_extractor with caption text extraction near images/tables
- Add figure/table type correction based on caption content
- Implement sequential numbering fallback for unmatched items
- Improve figure linking in pages with manifest ID matching and fallback strategies
- Remove docling dependency, add dev dependency group
This commit is contained in:
+69
-21
@@ -273,38 +273,86 @@ def _link_figures_with_images(
|
||||
) -> list[dict]:
|
||||
"""将 summary figures 元数据与提取的图片文件关联。
|
||||
|
||||
通过 manifest.json 中的 figure ID 匹配,给每个 figure 加上 image_url。
|
||||
策略:
|
||||
1. 优先用 manifest.json 的 label 做 ID 精确匹配
|
||||
2. 未匹配的 figure 用序号兜底:第 N 个 Figure → 第 N 张提取图
|
||||
"""
|
||||
if not figures or not images:
|
||||
return figures
|
||||
|
||||
manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
|
||||
if not manifest_path.exists():
|
||||
return figures
|
||||
|
||||
try:
|
||||
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
|
||||
except (ValueError, TypeError):
|
||||
return figures
|
||||
|
||||
# 构建 figure_id -> image_url 的映射
|
||||
# ── 策略 1:manifest ID 精确匹配 ──
|
||||
id_to_url: dict[str, str] = {}
|
||||
for filename, info in manifest.items():
|
||||
url = f"/papers/{arxiv_id}/images/{filename}"
|
||||
for fig_id in info.get("figures", []) + info.get("tables", []):
|
||||
id_to_url[fig_id] = url
|
||||
if manifest_path.exists():
|
||||
try:
|
||||
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
|
||||
except (ValueError, TypeError):
|
||||
manifest = {}
|
||||
for filename, info in manifest.items():
|
||||
url = f"/papers/{arxiv_id}/images/{filename}"
|
||||
# 优先用 label 字段(新格式)
|
||||
label = info.get("label", "")
|
||||
if label:
|
||||
id_to_url[label] = url
|
||||
# 也兼容 figures/tables 列表(旧格式)
|
||||
for fig_id in info.get("figures", []) + info.get("tables", []):
|
||||
if fig_id not in id_to_url:
|
||||
id_to_url[fig_id] = url
|
||||
|
||||
# 归一化 summary figures 的 ID
|
||||
for fig in figures:
|
||||
raw_id = fig.get("id", "")
|
||||
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
|
||||
if m:
|
||||
normalized = f"Figure {m.group(1)}"
|
||||
else:
|
||||
m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
|
||||
normalized = f"Table {m2.group(1)}" if m2 else raw_id
|
||||
|
||||
normalized = _normalize_figure_id(raw_id)
|
||||
if normalized in id_to_url:
|
||||
fig["image_url"] = id_to_url[normalized]
|
||||
|
||||
# ── 策略 2:序号兜底(manifest 匹配不到时) ──
|
||||
unmatched = [f for f in figures if not f.get("image_url")]
|
||||
if not unmatched:
|
||||
return figures
|
||||
|
||||
# 按类型分流:Figure vs Table
|
||||
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
|
||||
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
|
||||
|
||||
# 提取的图片也按类型分流,按文件名排序
|
||||
def _sort_key(name: str) -> tuple[int, int]:
|
||||
m = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
|
||||
if m:
|
||||
return (int(m.group(1)), int(m.group(2)))
|
||||
return (0, 0)
|
||||
|
||||
fig_images = sorted(
|
||||
[img for img in images if "table" not in img["name"].lower()],
|
||||
key=lambda img: _sort_key(img["name"]),
|
||||
)
|
||||
table_images = sorted(
|
||||
[img for img in images if "table" in img["name"].lower()],
|
||||
key=lambda img: _sort_key(img["name"]),
|
||||
)
|
||||
|
||||
for i, fig in enumerate(fig_type_unmatched):
|
||||
if i < len(fig_images):
|
||||
fig["image_url"] = fig_images[i]["url"]
|
||||
|
||||
for i, fig in enumerate(table_type_unmatched):
|
||||
if i < len(table_images):
|
||||
fig["image_url"] = table_images[i]["url"]
|
||||
|
||||
return figures
|
||||
|
||||
|
||||
def _normalize_figure_id(raw_id: str) -> str:
|
||||
"""归一化 Figure/Table ID:'Figure 1'/'Fig.1' → 'Figure 1'。"""
|
||||
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
|
||||
if m:
|
||||
return f"Figure {m.group(1)}"
|
||||
m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
|
||||
if m2:
|
||||
return f"Table {m2.group(1)}"
|
||||
return raw_id
|
||||
|
||||
|
||||
def _is_figure_type(fig_id: str) -> bool:
|
||||
"""判断是否为 Figure 类型(非 Table)。"""
|
||||
return not re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
|
||||
|
||||
Reference in New Issue
Block a user