feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

This commit is contained in:
2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
+38 -22
View File
@@ -131,8 +131,12 @@ def _handle_summary_failure(
def _persist_summary(
db: Session, paper: Paper, json_data: dict, raw_output: str
) -> str:
"""Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality"""
) -> tuple[str, SummarySchema]:
"""Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 (quality, schema)。
后处理(图片提取/ChromaDB)不再在此函数内执行,由调用方搬到线程池,
以免阻塞事件循环。返回 schema 供调用方在线程池里跑后处理。
"""
import time as _time
arxiv_id = paper.arxiv_id
@@ -165,21 +169,10 @@ def _persist_summary(
_t4 - _t3,
)
# 触发性增强(失败不影响总结)
_t5 = _time.monotonic()
_maybe_extract_images(arxiv_id, schema)
_t6 = _time.monotonic()
_maybe_index_chroma(arxiv_id, paper, schema)
_t7 = _time.monotonic()
logger.info(
" [%s] 后处理: 图片提取=%.2fs ChromaDB=%.2fs",
arxiv_id,
_t6 - _t5,
_t7 - _t6,
)
return quality
# 后处理(图片提取 + ChromaDB 索引)已上移到调用方 _do_summarize_one
# 经 asyncio.to_thread 在线程池跑——DB session 必须留在事件循环线程,
# 而 CPU/IO 密集的后处理搬走才不冻结事件循环。
return quality, schema
# ── 清理 ────────────────────────────────────────────────────────────────
@@ -226,21 +219,44 @@ def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
def _maybe_index_chroma(arxiv_id: str, schema: SummarySchema, paper_meta: dict) -> None:
    """Write the paper into the ChromaDB semantic index (best effort).

    Any failure, including a failure to import the embedder, is logged as a
    warning and swallowed so that it cannot affect the summary itself.

    Args:
        arxiv_id: Identifier of the paper; stored in the payload and passed
            to ``index_paper``.
        schema: Summary object supplying ``title_zh``, ``one_line``,
            ``motivation.problem`` and ``method.key_idea``.
        paper_meta: Plain values (``title_en`` / ``tags`` / ``paper_date``)
            that the caller extracted from the ORM object on the event-loop
            thread. Passing plain values instead of the ORM object avoids
            the risk of a ``DetachedInstanceError`` from cross-thread ORM
            access when this function runs in the thread pool.
    """
    try:
        # Imported inside the ``try`` so an import failure is handled by the
        # same best-effort ``except`` below.
        from app.services.embedder import index_paper

        texts_dict = {
            "arxiv_id": arxiv_id,
            "title_zh": schema.title_zh or "",
            # ``or ""`` (rather than a ``.get`` default) also normalises an
            # explicit ``None`` value to an empty string, matching how the
            # schema-derived fields are handled.
            "title_en": paper_meta.get("title_en") or "",
            "tags": paper_meta.get("tags") or "",
            "one_line": schema.one_line or "",
            "motivation_problem": schema.motivation.problem or "",
            "method_key_idea": schema.method.key_idea or "",
            "paper_date": paper_meta.get("paper_date") or "",
        }
        index_paper(arxiv_id, texts_dict)
    except Exception:
        logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
def _run_post_processing(
    arxiv_id: str, schema: SummarySchema, paper_meta: dict
) -> None:
    """Run the post-summary enrichment steps: image extraction, then ChromaDB indexing.

    Meant to be executed in the thread pool (called by ``_do_summarize_one``
    through ``asyncio.to_thread``) because the work is CPU/IO heavy. The
    order matches what ``_persist_summary`` used to do internally.

    Args:
        arxiv_id: Identifier of the paper being post-processed.
        schema: Summary object handed to both steps.
        paper_meta: Plain metadata values forwarded to ``_maybe_index_chroma``.
    """
    try:
        _maybe_extract_images(arxiv_id, schema)
        _maybe_index_chroma(arxiv_id, schema, paper_meta)
    except Exception:
        # Second safety net: each step already handles its own errors, and a
        # failure here must not affect the summary that was already saved.
        logger.warning(
            "Post-processing failed for %s (summary already persisted)", arxiv_id, exc_info=True
        )