feat: enhance PDF extraction with section-based figure routing and improved caption detection
This commit is contained in:
@@ -391,6 +391,20 @@ def _handle_summary_failure(
|
||||
}
|
||||
|
||||
|
||||
def _cleanup_old_images(db: Session, paper: Paper) -> None:
    """Remove stale image files and the stored ``figures_json`` for a paper.

    Prevents leftovers from an earlier run surviving a re-summarize.

    Files in the paper's ``images`` directory are deleted when their suffix
    (case-insensitive) is one of png/jpg/jpeg/gif/svg, or when the file is
    named ``manifest.json``. Afterwards, if the paper has a summary with a
    truthy ``figures_json``, that field is set to ``None`` and the session
    is committed.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".svg")
    images_dir = paper_dir(paper.arxiv_id) / "images"

    if images_dir.exists():
        for entry in images_dir.iterdir():
            is_image = entry.suffix.lower() in image_suffixes
            if is_image or entry.name == "manifest.json":
                entry.unlink(missing_ok=True)

    # Reset figures_json in the database; skip the commit when there is
    # nothing to clear.
    summary = paper.summary
    if not summary or not summary.figures_json:
        return
    summary.figures_json = None
    db.commit()
|
||||
|
||||
|
||||
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
|
||||
"""从 PDF 提取图片和表格(失败不影响总结)。"""
|
||||
try:
|
||||
@@ -437,6 +451,9 @@ async def _do_summarize_one(
|
||||
paper.summary_status.started_at = utc_now()
|
||||
db.commit()
|
||||
|
||||
# 清理旧的图片文件和 figures_json,避免重新总结时残留
|
||||
_cleanup_old_images(db, paper)
|
||||
|
||||
raw_output = ""
|
||||
try:
|
||||
meta_path = write_meta_json(paper)
|
||||
|
||||
Reference in New Issue
Block a user