feat: enhance PDF extraction with section-based figure routing and improved caption detection

This commit is contained in:
2026-06-10 02:05:30 +08:00
parent c94ff48254
commit a1e0962820
7 changed files with 253 additions and 116 deletions
+17
View File
@@ -391,6 +391,20 @@ def _handle_summary_failure(
}
def _cleanup_old_images(db: Session, paper: Paper) -> None:
    """Remove stale image artifacts and clear ``figures_json`` before re-summarizing.

    Deletes image files (``.png``, ``.jpg``, ``.jpeg``, ``.gif``, ``.svg``;
    suffix compared case-insensitively) and ``manifest.json`` found directly
    inside the paper's ``images`` directory. If the paper has a summary with a
    non-empty ``figures_json``, that field is reset to ``None`` and the change
    is committed.

    Args:
        db: Session used to commit the cleared ``figures_json``.
        paper: Paper whose image files and figure metadata are reset.
    """
    stale_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".svg")
    images_dir = paper_dir(paper.arxiv_id) / "images"

    # is_dir() rather than exists(): iterdir() raises NotADirectoryError when
    # the path exists but is a regular file.
    if images_dir.is_dir():
        for old_file in images_dir.iterdir():
            is_stale = (
                old_file.suffix.lower() in stale_suffixes
                or old_file.name == "manifest.json"
            )
            if not is_stale:
                continue
            # unlink() cannot remove a real directory; skip such entries so one
            # oddly named sub-directory does not abort the whole cleanup.
            # Symlinks are still unlinked, as before.
            if old_file.is_dir() and not old_file.is_symlink():
                continue
            old_file.unlink(missing_ok=True)

    # Clear figures_json stored in the database.
    summary = paper.summary
    if summary and summary.figures_json:
        summary.figures_json = None
        db.commit()
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
"""从 PDF 提取图片和表格(失败不影响总结)。"""
try:
@@ -437,6 +451,9 @@ async def _do_summarize_one(
paper.summary_status.started_at = utc_now()
db.commit()
# 清理旧的图片文件和 figures_json,避免重新总结时残留
_cleanup_old_images(db, paper)
raw_output = ""
try:
meta_path = write_meta_json(paper)