feat: improve PDF image extraction with caption-based labeling and fallback matching

- Enhance pdf_image_extractor with caption text extraction near images/tables
- Add figure/table type correction based on caption content
- Implement sequential numbering fallback for unmatched items
- Improve figure linking in pages with manifest ID matching and fallback strategies
- Remove docling dependency, add dev dependency group
2026-06-09 14:07:21 +08:00
parent 32978b3fc5
commit 18f44ac244
4 changed files with 343 additions and 1593 deletions
@@ -273,38 +273,86 @@ def _link_figures_with_images(
 ) -> list[dict]:
    """将 summary figures 元数据与提取的图片文件关联。

-    通过 manifest.json 中的 figure ID 匹配，给每个 figure 加上 image_url。
+    策略：
+    1. 优先用 manifest.json 的 label 做 ID 精确匹配
+    2. 未匹配的 figure 用序号兜底：第 N 个 Figure → 第 N 张提取图
    """
    if not figures or not images:
        return figures

    manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
-    if not manifest_path.exists():
-        return figures

-    try:
-        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
-    except (ValueError, TypeError):
-        return figures
-
-    # 构建 figure_id -> image_url 的映射
+    # ── 策略 1：manifest ID 精确匹配 ──
    id_to_url: dict[str, str] = {}
-    for filename, info in manifest.items():
-        url = f"/papers/{arxiv_id}/images/{filename}"
-        for fig_id in info.get("figures", []) + info.get("tables", []):
-            id_to_url[fig_id] = url
+    if manifest_path.exists():
+        try:
+            manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+        except (ValueError, TypeError):
+            manifest = {}
+        for filename, info in manifest.items():
+            url = f"/papers/{arxiv_id}/images/{filename}"
+            # 优先用 label 字段（新格式）
+            label = info.get("label", "")
+            if label:
+                id_to_url[label] = url
+            # 也兼容 figures/tables 列表（旧格式）
+            for fig_id in info.get("figures", []) + info.get("tables", []):
+                if fig_id not in id_to_url:
+                    id_to_url[fig_id] = url

-    # 归一化 summary figures 的 ID
    for fig in figures:
        raw_id = fig.get("id", "")
-        m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
-        if m:
-            normalized = f"Figure {m.group(1)}"
-        else:
-            m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
-            normalized = f"Table {m2.group(1)}" if m2 else raw_id
-
+        normalized = _normalize_figure_id(raw_id)
        if normalized in id_to_url:
            fig["image_url"] = id_to_url[normalized]

+    # ── 策略 2：序号兜底（manifest 匹配不到时） ──
+    unmatched = [f for f in figures if not f.get("image_url")]
+    if not unmatched:
+        return figures
+
+    # 按类型分流：Figure vs Table
+    fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
+    table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
+
+    # 提取的图片也按类型分流，按文件名排序
+    def _sort_key(name: str) -> tuple[int, int]:
+        m = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
+        if m:
+            return (int(m.group(1)), int(m.group(2)))
+        return (0, 0)
+
+    fig_images = sorted(
+        [img for img in images if "table" not in img["name"].lower()],
+        key=lambda img: _sort_key(img["name"]),
+    )
+    table_images = sorted(
+        [img for img in images if "table" in img["name"].lower()],
+        key=lambda img: _sort_key(img["name"]),
+    )
+
+    for i, fig in enumerate(fig_type_unmatched):
+        if i < len(fig_images):
+            fig["image_url"] = fig_images[i]["url"]
+
+    for i, fig in enumerate(table_type_unmatched):
+        if i < len(table_images):
+            fig["image_url"] = table_images[i]["url"]
+
    return figures
+
+
+def _normalize_figure_id(raw_id: str) -> str:
+    """归一化 Figure/Table ID：'Figure 1'/'Fig.1' → 'Figure 1'。"""
+    m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
+    if m:
+        return f"Figure {m.group(1)}"
+    m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
+    if m2:
+        return f"Table {m2.group(1)}"
+    return raw_id
+
+
+def _is_figure_type(fig_id: str) -> bool:
+    """判断是否为 Figure 类型（非 Table）。"""
+    return not re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)