feat: enhance PDF extraction with section-based figure routing and improved caption detection

This commit is contained in:
2026-06-10 02:05:30 +08:00
parent c94ff48254
commit a1e0962820
7 changed files with 253 additions and 116 deletions
+23 -6
View File
@@ -122,17 +122,32 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
# 拆分table_figures(有截图的 Table 类型)→ 实验结果区域展示截图
# figures(其余)→ 论文图表画廊
table_figures = []
figures = []
# 拆分图片到对应展示区域:
# table_figures → 实验结果区域(Table 截图,不变)
# method_figures → 核心方法区域(section=="method")
# results_figures → 实验结果区域(section=="results" 的 Figure)
# gallery_figures → 底部画廊(其余:motivation/limitations/无 section/无图)
table_figures: list[dict] = []
method_figures: list[dict] = []
results_figures: list[dict] = []
gallery_figures: list[dict] = []
for fig in linked_figures:
fig_id = fig.get("id", "")
section = fig.get("section", "")
is_table = fig_id.lower().startswith("table")
if is_table and fig.get("image_url"):
table_figures.append(fig)
elif not is_table and section == "method" and fig.get("image_url"):
method_figures.append(fig)
elif (
not is_table
and section == "results"
and fig.get("image_url")
):
results_figures.append(fig)
else:
figures.append(fig)
gallery_figures.append(fig)
return templates.TemplateResponse(
request,
@@ -144,8 +159,10 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
"paper_images": images,
"prereqs": prereqs,
"benchmarks": benchmarks,
"figures": figures,
"figures": gallery_figures,
"table_figures": table_figures,
"method_figures": method_figures,
"results_figures": results_figures,
"chroma_enabled": settings.CHROMA_ENABLED,
"page_title": paper.title_zh or paper.title_en,
},