feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
@@ -131,8 +131,12 @@ def _handle_summary_failure(

 def _persist_summary(
    db: Session, paper: Paper, json_data: dict, raw_output: str
-) -> str:
-    """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
+) -> tuple[str, SummarySchema]:
+    """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 (quality, schema)。
+
+    后处理（图片提取/ChromaDB）不再在此函数内执行，由调用方搬到线程池，
+    以免阻塞事件循环。返回 schema 供调用方在线程池里跑后处理。
+    """
    import time as _time

    arxiv_id = paper.arxiv_id
@@ -165,21 +169,10 @@ def _persist_summary(
        _t4 - _t3,
    )

-    # 触发性增强（失败不影响总结）
-    _t5 = _time.monotonic()
-    _maybe_extract_images(arxiv_id, schema)
-    _t6 = _time.monotonic()
-    _maybe_index_chroma(arxiv_id, paper, schema)
-    _t7 = _time.monotonic()
-
-    logger.info(
-        "  [%s] 后处理: 图片提取=%.2fs  ChromaDB=%.2fs",
-        arxiv_id,
-        _t6 - _t5,
-        _t7 - _t6,
-    )
-
-    return quality
+    # 后处理（图片提取 + ChromaDB 索引）已上移到调用方 _do_summarize_one，
+    # 经 asyncio.to_thread 在线程池跑——DB session 必须留在事件循环线程，
+    # 而 CPU/IO 密集的后处理搬走才不冻结事件循环。
+    return quality, schema


 # ── 清理 ────────────────────────────────────────────────────────────────
@@ -226,21 +219,44 @@ def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)


-def _maybe_index_chroma(arxiv_id: str, paper: Paper, schema: SummarySchema) -> None:
-    """写入 ChromaDB 语义索引（失败不影响总结）。"""
+def _maybe_index_chroma(arxiv_id: str, schema: SummarySchema, paper_meta: dict) -> None:
+    """Write the paper to the ChromaDB semantic index (best-effort; failures are only logged).
+
+    paper_meta carries plain values (title_en/tags/paper_date) that the caller read from the
+    ORM, so no ORM object is accessed here. NOTE(review): a stored None bypasses the "" default.
+    """
    try:
        from app.services.embedder import index_paper

        texts_dict = {
            "arxiv_id": arxiv_id,
            "title_zh": schema.title_zh or "",
-            "title_en": paper.title_en or "",
-            "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
+            "title_en": paper_meta.get("title_en", ""),
+            "tags": paper_meta.get("tags", ""),
            "one_line": schema.one_line or "",
            "motivation_problem": schema.motivation.problem or "",
            "method_key_idea": schema.method.key_idea or "",
-            "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
+            "paper_date": paper_meta.get("paper_date", ""),
        }
        index_paper(arxiv_id, texts_dict)
    except Exception:
        logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
+
+
+def _run_post_processing(
+    arxiv_id: str, schema: SummarySchema, paper_meta: dict
+) -> None:
+    """Run image extraction, then ChromaDB indexing, for an already-persisted summary.
+
+    Intended to run in a worker thread (per the change notes, via asyncio.to_thread from
+    _do_summarize_one — caller not shown). The outer except is only a backstop for the helpers.
+    """
+    try:
+        _maybe_extract_images(arxiv_id, schema)
+        _maybe_index_chroma(arxiv_id, schema, paper_meta)
+    except Exception:
+        logger.warning(
+            "Post-processing failed for %s (summary already persisted)",
+            arxiv_id,
+            exc_info=True,
+        )