feat: add concurrency safety, caption detection, admin enhancements, and performance improvements
This commit is contained in:
@@ -131,8 +131,12 @@ def _handle_summary_failure(
|
||||
|
||||
def _persist_summary(
|
||||
db: Session, paper: Paper, json_data: dict, raw_output: str
|
||||
) -> str:
|
||||
"""Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
|
||||
) -> tuple[str, SummarySchema]:
|
||||
"""Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 (quality, schema)。
|
||||
|
||||
后处理(图片提取/ChromaDB)不再在此函数内执行,由调用方搬到线程池,
|
||||
以免阻塞事件循环。返回 schema 供调用方在线程池里跑后处理。
|
||||
"""
|
||||
import time as _time
|
||||
|
||||
arxiv_id = paper.arxiv_id
|
||||
@@ -165,21 +169,10 @@ def _persist_summary(
|
||||
_t4 - _t3,
|
||||
)
|
||||
|
||||
# 触发性增强(失败不影响总结)
|
||||
_t5 = _time.monotonic()
|
||||
_maybe_extract_images(arxiv_id, schema)
|
||||
_t6 = _time.monotonic()
|
||||
_maybe_index_chroma(arxiv_id, paper, schema)
|
||||
_t7 = _time.monotonic()
|
||||
|
||||
logger.info(
|
||||
" [%s] 后处理: 图片提取=%.2fs ChromaDB=%.2fs",
|
||||
arxiv_id,
|
||||
_t6 - _t5,
|
||||
_t7 - _t6,
|
||||
)
|
||||
|
||||
return quality
|
||||
# 后处理(图片提取 + ChromaDB 索引)已上移到调用方 _do_summarize_one,
|
||||
# 经 asyncio.to_thread 在线程池跑——DB session 必须留在事件循环线程,
|
||||
# 而 CPU/IO 密集的后处理搬走才不冻结事件循环。
|
||||
return quality, schema
|
||||
|
||||
|
||||
# ── 清理 ────────────────────────────────────────────────────────────────
|
||||
@@ -226,21 +219,44 @@ def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
|
||||
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
|
||||
|
||||
|
||||
def _maybe_index_chroma(arxiv_id: str, paper: Paper, schema: SummarySchema) -> None:
|
||||
"""写入 ChromaDB 语义索引(失败不影响总结)。"""
|
||||
def _maybe_index_chroma(arxiv_id: str, schema: SummarySchema, paper_meta: dict) -> None:
    """Write the paper into the ChromaDB semantic index (best-effort).

    Any failure is logged as a warning and swallowed, so an indexing problem
    never affects the summary itself.

    Args:
        arxiv_id: Identifier of the paper; also forwarded to ``index_paper``.
        schema: Validated summary; supplies ``title_zh``, ``one_line``,
            ``motivation.problem`` and ``method.key_idea``.
        paper_meta: Plain values (``title_en`` / ``tags`` / ``paper_date``)
            that the caller extracted from the ORM object on the event-loop
            thread. Passing plain values avoids touching the ORM instance
            from a worker thread (DetachedInstanceError risk).
    """
    try:
        # Imported lazily inside the try so that an unavailable embedder is
        # handled like any other indexing failure.
        from app.services.embedder import index_paper

        texts_dict = {
            "arxiv_id": arxiv_id,
            "title_zh": schema.title_zh or "",
            # ``dict.get(key, "")`` still yields None when the key is present
            # with a None value, so normalise with ``or ""`` like the
            # schema-derived fields do.
            "title_en": paper_meta.get("title_en") or "",
            "tags": paper_meta.get("tags") or "",
            "one_line": schema.one_line or "",
            "motivation_problem": schema.motivation.problem or "",
            "method_key_idea": schema.method.key_idea or "",
            "paper_date": paper_meta.get("paper_date") or "",
        }
        index_paper(arxiv_id, texts_dict)
    except Exception:
        # Deliberate best-effort: record the failure and carry on.
        logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
|
||||
|
||||
|
||||
def _run_post_processing(
    arxiv_id: str,
    schema: SummarySchema,
    paper_meta: dict,
) -> None:
    """Run the post-summary enhancement steps for one paper.

    Intended to be executed in a worker thread (called from
    ``_do_summarize_one`` through ``asyncio.to_thread``). The steps run in the
    same order they previously had inside ``_persist_summary``: image
    extraction first, then ChromaDB indexing.

    Each step already guards itself with its own try/except; the handler here
    is a second safety net so that a failure can never affect a summary that
    has already been persisted.

    Args:
        arxiv_id: Identifier of the paper being post-processed.
        schema: Validated summary handed to both steps.
        paper_meta: Plain metadata values forwarded to the indexing step.
    """
    try:
        # Step 1: image extraction.
        _maybe_extract_images(arxiv_id, schema)
        # Step 2: semantic indexing.
        _maybe_index_chroma(arxiv_id, schema, paper_meta)
    except Exception:
        message = "Post-processing failed for %s (summary already persisted)"
        logger.warning(message, arxiv_id, exc_info=True)
|
||||
|
||||
Reference in New Issue
Block a user