feat: enhance PDF extraction with section-based figure routing and improved caption detection

2026-06-10 02:05:30 +08:00
parent c94ff48254
commit a1e0962820
7 changed files with 253 additions and 116 deletions
@@ -122,17 +122,32 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))

    linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)

-    # 拆分：table_figures（有截图的 Table 类型）→ 实验结果区域展示截图
-    #       figures（其余）→ 论文图表画廊
-    table_figures = []
-    figures = []
+    # 拆分图片到对应展示区域：
+    #   table_figures   → 实验结果区域（Table 截图，不变）
+    #   method_figures  → 核心方法区域（section=="method"）
+    #   results_figures → 实验结果区域（section=="results" 的 Figure）
+    #   gallery_figures → 底部画廊（其余：motivation/limitations/无 section/无图）
+    table_figures: list[dict] = []
+    method_figures: list[dict] = []
+    results_figures: list[dict] = []
+    gallery_figures: list[dict] = []
    for fig in linked_figures:
        fig_id = fig.get("id", "")
+        section = fig.get("section", "")
        is_table = fig_id.lower().startswith("table")
+
        if is_table and fig.get("image_url"):
            table_figures.append(fig)
+        elif not is_table and section == "method" and fig.get("image_url"):
+            method_figures.append(fig)
+        elif (
+            not is_table
+            and section == "results"
+            and fig.get("image_url")
+        ):
+            results_figures.append(fig)
        else:
-            figures.append(fig)
+            gallery_figures.append(fig)

    return templates.TemplateResponse(
        request,
@@ -144,8 +159,10 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
            "paper_images": images,
            "prereqs": prereqs,
            "benchmarks": benchmarks,
-            "figures": figures,
+            "figures": gallery_figures,
            "table_figures": table_figures,
+            "method_figures": method_figures,
+            "results_figures": results_figures,
            "chroma_enabled": settings.CHROMA_ENABLED,
            "page_title": paper.title_zh or paper.title_en,
        },