feat: enhance PDF extraction with section-based figure routing and improved caption detection

2026-06-10 02:05:30 +08:00
parent c94ff48254
commit a1e0962820
7 changed files with 253 additions and 116 deletions
@@ -391,6 +391,20 @@ def _handle_summary_failure(
    }


+def _cleanup_old_images(db: Session, paper: Paper) -> None:
+    """Remove stale image files and the stored figures_json for a paper.
+
+    Intended to run before a paper is summarized again, so that files and
+    figure metadata left by an earlier run do not linger.
+
+    Args:
+        db: Session used to commit the cleared ``figures_json``.
+        paper: Paper whose ``images`` directory and summary are cleaned.
+    """
+    arxiv_id = paper.arxiv_id
+    images_dir = paper_dir(arxiv_id) / "images"
+    if images_dir.exists():
+        # Only direct entries of the images directory are inspected.
+        for old_file in images_dir.iterdir():
+            # Delete image files (suffix compared case-insensitively) and
+            # manifest.json; any other entry is left in place.
+            if old_file.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif", ".svg") or old_file.name == "manifest.json":
+                # missing_ok=True: a file that is already gone is not an error.
+                old_file.unlink(missing_ok=True)
+    # Clear figures_json in the database; commit only if there was a value to clear.
+    if paper.summary and paper.summary.figures_json:
+        paper.summary.figures_json = None
+        db.commit()
+
+
 def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
    """从 PDF 提取图片和表格（失败不影响总结）。"""
    try:
@@ -437,6 +451,9 @@ async def _do_summarize_one(
    paper.summary_status.started_at = utc_now()
    db.commit()

+    # 清理旧的图片文件和 figures_json，避免重新总结时残留
+    _cleanup_old_images(db, paper)
+
    raw_output = ""
    try:
        meta_path = write_meta_json(paper)