feat: add admin dashboard, pipeline service, lightbox, and update dependencies

This commit is contained in:
2026-06-09 09:32:10 +08:00
parent 0d293422ac
commit 32978b3fc5
50 changed files with 4054 additions and 1618 deletions
+217 -225
View File
@@ -2,23 +2,24 @@
from __future__ import annotations
import asyncio
import json
import logging
import shutil
from datetime import datetime, timezone
from pathlib import Path
from pydantic import ValidationError
from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload
from sqlalchemy.orm import Session
from app.config import settings
from app.database import SessionLocal
from app.models import (
PAPER_DEFAULT_LOAD,
CrawlLog,
Paper,
PaperSummary,
PaperTag,
SummaryState,
SummaryStatus,
TaskLock,
)
@@ -42,7 +43,7 @@ from app.services.schemas import (
classify_validation_error,
flatten_for_db,
)
from app.utils import PAPERS_DIR, release_lock
from app.utils import TMP_DIR, release_lock, utc_now
logger = logging.getLogger(__name__)
@@ -96,8 +97,6 @@ def _update_summary_in_db(
"""将校验后的总结写入 DBpaper_summaries + papers + paper_tags + FTS5。"""
from sqlalchemy import text
now = datetime.now(timezone.utc)
# 1. paper_summariesupsert
existing = db.get(PaperSummary, paper.id)
flat = flatten_for_db(schema)
@@ -213,21 +212,14 @@ def _validate_summary(json_data: dict, arxiv_id: str) -> list[str]:
# ── 文件操作 ────────────────────────────────────────────────────────────
def _save_files(arxiv_id: str, schema: SummarySchema, raw_output: str) -> None:
"""保存 summary.json 和 raw_output.txt。"""
d = paper_dir(arxiv_id)
d.mkdir(parents=True, exist_ok=True)
(d / "summary.json").write_text(
schema.model_dump_json(ensure_ascii=False, indent=2),
encoding="utf-8",
)
(d / "raw_output.txt").write_text(raw_output, encoding="utf-8")
def _save_raw_output_only(arxiv_id: str, raw_output: str) -> None:
"""仅保存 raw_output.txt(失败时)。"""
def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) -> None:
    """Write the on-disk artifacts of one summarization run.

    ``raw_output.txt`` is always written.  ``summary.json`` is written only
    when ``schema`` is provided; callers pass ``None`` on the failure path to
    keep just the raw model output.

    Args:
        arxiv_id: Identifier used by ``paper_dir`` to locate the target directory.
        schema: Validated summary to serialize, or ``None`` to skip ``summary.json``.
        raw_output: Raw text produced by the generation step.
    """
    target_dir = paper_dir(arxiv_id)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Explicit None check: whether a summary file is written must follow the
    # ``SummarySchema | None`` contract, not the truthiness of the model object.
    if schema is not None:
        (target_dir / "summary.json").write_text(
            schema.model_dump_json(ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    (target_dir / "raw_output.txt").write_text(raw_output, encoding="utf-8")
@@ -240,26 +232,25 @@ async def summarize_one(
semaphore: asyncio.Semaphore | None = None,
*,
force: bool = False,
pdf_mode: str = "auto",
) -> dict:
"""总结单篇论文的完整流程。"""
import asyncio
arxiv_id = paper.arxiv_id
# 获取或创建 summary_status
if not paper.summary_status:
db.add(SummaryStatus(paper_id=paper.id, status="pending"))
db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
db.commit()
db.refresh(paper)
status = paper.summary_status
# 跳过已完成的(除非 force
if status.status == "done" and not force:
if status.status == SummaryState.DONE and not force:
return {"arxiv_id": arxiv_id, "status": "skipped", "reason": "already_done"}
# 跳过 permanent_failure(除非 force
if status.status == "permanent_failure" and not force:
if status.status == SummaryState.PERMANENT_FAILURE and not force:
return {
"arxiv_id": arxiv_id,
"status": "skipped",
@@ -269,182 +260,202 @@ async def summarize_one(
if semaphore:
await semaphore.acquire()
try:
return await _do_summarize_one(db, paper)
return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
finally:
if semaphore:
semaphore.release()
async def _do_summarize_one(db: Session, paper: Paper) -> dict:
"""实际的单篇总结执行(在 semaphore 保护下)。"""
import asyncio
async def _generate_with_retry(
    arxiv_id: str,
    meta_path: Path,
    pdf_path: Path,
    pdf_mode: str = "auto",
    *,
    max_attempts: int = 4,
) -> tuple[dict, str]:
    """Generate a summary through ``call_pi``, retrying until it validates.

    The first round is a plain ``call_pi`` request.  Every later round passes
    the previous round's errors as ``fix_errors`` together with the
    ``session_id`` returned by the previous call.

    Args:
        arxiv_id: Paper identifier; locates ``summary.json`` via ``paper_dir``.
        meta_path: Path of the metadata file handed to ``call_pi``.
        pdf_path: Path of the PDF handed to ``call_pi``.
        pdf_mode: Forwarded unchanged to ``call_pi``.
        max_attempts: Maximum number of generate/validate rounds (default 4).

    Returns:
        ``(json_data, raw_output)`` from the first round whose JSON passes
        ``_validate_summary``.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1, or if no round
            produced a valid summary.  In the latter case the exception has a
            ``raw_output`` attribute holding the output of the last round.
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")

    # The same file is both the stale artifact to clear and the preferred
    # result source, so build its path once.
    summary_file = paper_dir(arxiv_id) / "summary.json"

    validation_errors: list[str] = []
    json_data: dict | None = None
    raw_output = ""
    session_id = None

    for attempt in range(1, max_attempts + 1):
        # Drop any incomplete summary.json left behind by the previous round
        # so a stale file is never mistaken for this round's result.
        summary_file.unlink(missing_ok=True)

        if attempt == 1:
            raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
        else:
            raw_output, session_id = await call_pi(
                meta_path, pdf_path,
                fix_errors=validation_errors,
                session_id=session_id,
                pdf_mode=pdf_mode,
            )

        # Prefer the summary.json written during the call; otherwise fall back
        # to extracting JSON from the returned output.
        try:
            if summary_file.exists():
                json_data = json.loads(summary_file.read_text(encoding="utf-8"))
                logger.info("Read summary.json written by pi for %s", arxiv_id)
            else:
                json_data = extract_json(raw_output)
        except (json.JSONDecodeError, JsonNotFoundError) as exc:
            logger.warning(
                "JSON extraction failed for %s (attempt %d): %s",
                arxiv_id, attempt, str(exc)[:200],
            )
            # Treated like a validation error: it is fed back on the next round.
            validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
            continue

        validation_errors = _validate_summary(json_data, arxiv_id)
        if not validation_errors:
            break
        logger.warning(
            "Validation failed for %s (attempt %d): %s",
            arxiv_id, attempt, "; ".join(validation_errors),
        )

    if validation_errors:
        exc = ValueError(
            f"Summary validation failed after {max_attempts} attempts: "
            f"{'; '.join(validation_errors)}"
        )
        # Expose the last output so the failure handler can still persist it.
        exc.raw_output = raw_output
        raise exc

    return json_data, raw_output
def _persist_summary(
    db: Session, paper: Paper, json_data: dict, raw_output: str
) -> str:
    """Validate, grade and store a generated summary, then return its quality.

    Steps, in order: ``SummarySchema`` validation, ``assess_quality``, file
    output via ``_save_files``, DB write via ``_update_summary_in_db``, status
    update to DONE with a commit, and finally the two follow-up enhancement
    helpers (image extraction and semantic indexing).
    """
    schema = SummarySchema.model_validate(json_data)
    quality = assess_quality(schema)

    _save_files(paper.arxiv_id, schema, raw_output)
    _update_summary_in_db(db, paper, schema, quality, raw_output)

    # Status -> done
    summary_status = paper.summary_status
    summary_status.status = SummaryState.DONE
    summary_status.quality = quality
    summary_status.completed_at = utc_now()
    summary_status.raw_output_saved = True
    db.commit()

    # Follow-up enhancements run only after the commit; both helpers log and
    # swallow their own exceptions.
    _maybe_extract_images(paper.arxiv_id, schema)
    _maybe_index_chroma(paper.arxiv_id, paper, schema)

    return quality
def _handle_summary_failure(
    db: Session, paper: Paper, exc: Exception, raw_output: str,
) -> dict:
    """Record a failed summarization attempt and build the failure result.

    Logs the classified error, saves ``raw_output`` to disk when there is any,
    increments the retry counter, stores the error details on the paper's
    summary status and commits.

    Args:
        db: Session used to commit the status update.
        paper: Paper whose ``summary_status`` row is updated.
        exc: The exception that aborted the run.
        raw_output: Generation output captured before the failure; may be empty.

    Returns:
        A dict with ``arxiv_id``, ``status`` ("failed"), ``error_type``,
        a truncated ``error`` message and the updated ``retry_count``.
    """
    error_type = _classify_error(exc)
    message = str(exc)
    logger.error(
        "Summarize failed: %s error_type=%s %s",
        paper.arxiv_id, error_type, message[:200],
    )

    status = paper.summary_status

    if raw_output:
        _save_files(paper.arxiv_id, None, raw_output)
        status.raw_output_saved = True

    status.retry_count = (status.retry_count or 0) + 1
    status.error_type = error_type
    status.error = message[:2000]

    # After SUMMARY_MAX_RETRIES + 1 failed attempts the paper stops being
    # requeued; before that it goes back to PENDING.
    if status.retry_count >= settings.SUMMARY_MAX_RETRIES + 1:
        status.status = SummaryState.PERMANENT_FAILURE
    else:
        status.status = SummaryState.PENDING
    status.completed_at = utc_now()
    db.commit()

    return {
        "arxiv_id": paper.arxiv_id,
        "status": "failed",
        "error_type": error_type,
        "error": message[:200],
        "retry_count": status.retry_count,
    }
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
    """Extract images from the paper's PDF on a best-effort basis.

    Runs ``extract_images_from_pdf`` on ``TMP_DIR/<arxiv_id>/paper.pdf`` and,
    when the summary lists figures, narrows the result with
    ``filter_images_by_summary``.  Any exception is logged as a warning and
    suppressed so that it cannot fail the summary.
    """
    try:
        from app.services.pdf_image_extractor import (
            extract_images_from_pdf,
            filter_images_by_summary,
        )

        source_pdf = TMP_DIR / arxiv_id / "paper.pdf"
        extract_images_from_pdf(arxiv_id, source_pdf)

        referenced_figures = schema.figures
        if referenced_figures:
            filter_images_by_summary(arxiv_id, referenced_figures)
    except Exception:
        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
def _maybe_index_chroma(arxiv_id: str, paper: Paper, schema: SummarySchema) -> None:
    """Index the paper's summary texts via ``index_paper`` on a best-effort basis.

    Builds a flat dict of text fields from ``paper`` and ``schema`` (missing
    values become empty strings) and passes it to ``index_paper``.  Any
    exception is logged as a warning and suppressed so that it cannot fail
    the summary.
    """
    try:
        from app.services.embedder import index_paper

        # Fields are filled in a fixed order; that order is also the key
        # order of the resulting dict.
        document = {"arxiv_id": arxiv_id}
        document["title_zh"] = schema.title_zh or ""
        document["title_en"] = paper.title_en or ""

        tags = paper.tags
        document["tags"] = " ".join(t.tag for t in tags) if tags else ""

        document["one_line"] = schema.one_line or ""
        document["motivation_problem"] = schema.motivation.problem or ""
        document["method_key_idea"] = schema.method.key_idea or ""

        paper_date = paper.paper_date
        document["paper_date"] = paper_date.isoformat() if paper_date else ""

        index_paper(arxiv_id, document)
    except Exception:
        logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
async def _do_summarize_one(
    db: Session, paper: Paper, pdf_mode: str = "auto"
) -> dict:
    """Run the summarization pipeline for a single paper.

    Called by ``summarize_one`` while it holds the optional semaphore.  The
    flow is: mark the paper PROCESSING, write its metadata file, download the
    PDF, generate a validated summary, persist it.  Any exception is turned
    into a failure result by ``_handle_summary_failure``; temporary files are
    cleaned up in every case.

    Args:
        db: Session used for status updates and persistence.
        paper: Paper to summarize; its ``summary_status`` must already exist.
        pdf_mode: Forwarded to ``_generate_with_retry``.

    Returns:
        ``{"arxiv_id", "status": "done", "quality"}`` on success, otherwise
        the failure dict produced by ``_handle_summary_failure``.
    """
    arxiv_id = paper.arxiv_id

    # Status -> processing
    paper.summary_status.status = SummaryState.PROCESSING
    paper.summary_status.started_at = utc_now()
    db.commit()

    # Kept outside the try so the failure path can still save whatever output
    # was produced before the error.
    raw_output = ""
    try:
        # Write meta.json
        meta_path = write_meta_json(paper)

        # Download the PDF (presumably into TMP_DIR/<arxiv_id>/paper.pdf, the
        # path passed to the generator below; confirm against download_pdf).
        await download_pdf(arxiv_id, paper.pdf_url)

        json_data, raw_output = await _generate_with_retry(
            arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
            pdf_mode=pdf_mode,
        )

        quality = _persist_summary(db, paper, json_data, raw_output)
        logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
        return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
    except Exception as exc:
        # When _generate_with_retry gives up it attaches the last output to
        # the exception; prefer that over the (still empty) local value.
        fail_output = getattr(exc, "raw_output", raw_output)
        return _handle_summary_failure(db, paper, exc, fail_output)
    finally:
        cleanup_tmp(arxiv_id)
@@ -458,22 +469,18 @@ async def summarize_single(
arxiv_id: str,
*,
force: bool = True,
pdf_mode: str = "auto",
_session_factory=None,
) -> dict:
"""单篇总结入口(供 admin 路由和 CLI 调用)。
_session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。
"""
paper = (
db.query(Paper)
.filter(Paper.arxiv_id == arxiv_id)
.options(
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary_status),
)
.first()
)
paper = db.execute(
select(Paper)
.where(Paper.arxiv_id == arxiv_id)
.options(*PAPER_DEFAULT_LOAD)
).unique().scalar_one_or_none()
if not paper:
return {"status": "not_found", "arxiv_id": arxiv_id}
@@ -482,17 +489,12 @@ async def summarize_single(
# 每篇用独立 session 避免并发问题
paper_db = make_session()
try:
paper_in_new_session = (
paper_db.query(Paper)
.filter(Paper.arxiv_id == arxiv_id)
.options(
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary_status),
)
.first()
)
result = await summarize_one(paper_db, paper_in_new_session, force=force)
paper_in_new_session = paper_db.execute(
select(Paper)
.where(Paper.arxiv_id == arxiv_id)
.options(*PAPER_DEFAULT_LOAD)
).unique().scalar_one_or_none()
result = await summarize_one(paper_db, paper_in_new_session, force=force, pdf_mode=pdf_mode)
finally:
paper_db.close()
@@ -506,15 +508,14 @@ async def summarize_batch(
db: Session,
arxiv_ids: list[str] | None = None,
*,
pdf_mode: str = "auto",
_session_factory=None,
) -> dict:
"""批量总结入口。arxiv_ids=None 时处理所有 pending 论文。
_session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。
"""
import asyncio
now = datetime.now(timezone.utc)
now = utc_now()
# TaskLock 防重入
lock = TaskLock(
@@ -543,20 +544,16 @@ async def summarize_batch(
try:
# 查询待总结论文
query = db.query(Paper).options(
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary_status),
)
stmt = select(Paper).options(*PAPER_DEFAULT_LOAD)
if arxiv_ids:
query = query.filter(Paper.arxiv_id.in_(arxiv_ids))
stmt = stmt.where(Paper.arxiv_id.in_(arxiv_ids))
else:
# 只处理 pending 或 failed(可重试的)
query = query.join(SummaryStatus).filter(
SummaryStatus.status.in_(["pending", "failed"])
stmt = stmt.join(SummaryStatus).where(
SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.FAILED])
)
papers = query.all()
papers = db.execute(stmt).unique().scalars().all()
total = len(papers)
logger.info("Summarize batch: %d papers to process", total)
@@ -564,7 +561,7 @@ async def summarize_batch(
log_entry.status = "success"
log_entry.papers_found = 0
log_entry.papers_new = 0
log_entry.completed_at = datetime.now(timezone.utc)
log_entry.completed_at = utc_now()
release_lock(db, lock)
return {
"status": "success",
@@ -581,17 +578,12 @@ async def summarize_batch(
async def _process_paper(paper: Paper) -> dict:
paper_db = make_session()
try:
p = (
paper_db.query(Paper)
.filter(Paper.id == paper.id)
.options(
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary_status),
)
.first()
)
return await summarize_one(paper_db, p, semaphore)
p = paper_db.execute(
select(Paper)
.where(Paper.id == paper.id)
.options(*PAPER_DEFAULT_LOAD)
).unique().scalar_one_or_none()
return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode)
finally:
paper_db.close()
@@ -619,7 +611,7 @@ async def summarize_batch(
log_entry.status = "success" if failed == 0 else "failed"
log_entry.papers_found = total
log_entry.papers_new = done
log_entry.completed_at = datetime.now(timezone.utc)
log_entry.completed_at = utc_now()
db.commit()
logger.info(
@@ -641,7 +633,7 @@ async def summarize_batch(
logger.exception("Summarize batch failed")
log_entry.status = "failed"
log_entry.error = str(exc)[:2000]
log_entry.completed_at = datetime.now(timezone.utc)
log_entry.completed_at = utc_now()
db.commit()
return {"status": "failed", "error": str(exc)}