feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

This commit is contained in:
2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
+22 -2
View File
@@ -31,6 +31,7 @@ from app.services.summary_persister import (
_cleanup_old_images,
_handle_summary_failure,
_persist_summary,
_run_post_processing,
)
from app.utils import TMP_DIR, release_lock, truncate_error, utc_now
@@ -115,12 +116,31 @@ async def _do_summarize_one(db: Session, paper: Paper, pdf_mode: str = "auto") -
_t3 = _time.monotonic()
logger.info(" [%s] pi生成: %.2fs", arxiv_id, _t3 - _t2)
quality = _persist_summary(db, paper, json_data, raw_output)
quality, schema = _persist_summary(db, paper, json_data, raw_output)
_t4 = _time.monotonic()
logger.info(" [%s] 持久化: %.2fs", arxiv_id, _t4 - _t3)
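# Post-processing (image extraction + ChromaDB indexing) is moved onto the thread
# pool so that CPU-heavy inference does not freeze the event loop. The paper fields
# are extracted into plain values here, on the event-loop thread, before being
# passed in; this avoids a DetachedInstanceError caused by a worker thread touching
# the ORM object across threads. DocLayout inference is serialized by the
# singleton's threading.Lock, so concurrent workers never hit the model at once.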
paper_meta = {
"title_en": paper.title_en or "",
"tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
"paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
}
_t5 = _time.monotonic()
try:
await asyncio.to_thread(_run_post_processing, arxiv_id, schema, paper_meta)
except Exception:
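# Belt and braces: _run_post_processing already has its own try/except; this outer
# handler is a safety net so that a post-processing failure can never affect a
# summary that is already DONE.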
logger.warning("Post-processing error for %s", arxiv_id, exc_info=True)
_t6 = _time.monotonic()
logger.info(" [%s] 后处理(线程池): %.2fs", arxiv_id, _t6 - _t5)
logger.info(
"✅ [%s] 完成: quality=%s 总耗时: %.2fs", arxiv_id, quality, _t4 - _t0
"✅ [%s] 完成: quality=%s 总耗时: %.2fs", arxiv_id, quality, _t6 - _t0
)
return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}