feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules - Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection - Add layout_detector service for PicoDet-S_layout_3cls integration - Add exceptions module with ConflictError and NotFoundError - Improve admin dashboard with better statistics and task management - Add design review document with system optimization suggestions - Add new tests for crawler, pdf_downloader, pipeline, and summary_utils - Update dependencies and configuration - Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -1,233 +1,42 @@
-"""AI 总结编排服务 — 协调 PDF 下载、pi CLI 调用、JSON 校验、DB 写入、语义索引。"""
+"""AI 总结编排服务 — 协调生成器、持久化、批量处理的顶层入口。"""

 from __future__ import annotations

 import asyncio
-import json
 import logging
-from pathlib import Path

-from pydantic import ValidationError
 from sqlalchemy import select
+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session

 from app.config import settings
 from app.database import SessionLocal
+from app.exceptions import ConflictError, NotFoundError
 from app.models import (
    PAPER_DEFAULT_LOAD,
    CrawlLog,
    Paper,
-    PaperSummary,
-    PaperTag,
    SummaryState,
    SummaryStatus,
    TaskLock,
+    get_paper_by_arxiv_id,
+    get_paper_by_id,
 )
-from app.services.pdf_downloader import (
-    PdfDownloadError,
-    cleanup_tmp,
-    download_pdf,
-    paper_dir,
+from app.services.pdf_downloader import download_pdf
+from app.services.summary_utils import write_meta_json
+from app.services.summary_generator import (
+    _generate_with_retry,
 )
-from app.services.summary_utils import (
-    JsonNotFoundError,
-    build_prompt,
-    extract_json,
-    write_meta_json,
-    extract_pdf_text,
+from app.services.summary_persister import (
+    _cleanup_old_images,
+    _handle_summary_failure,
+    _persist_summary,
 )
-from app.services.pi_client import (
-    PiProcessError,
-    PiTimeoutError,
-    call_pi,
-)
-from app.services import claude_backend
-from app.services.schemas import (
-    SummarySchema,
-    assess_quality,
-    classify_validation_error,
-    flatten_for_db,
-)
-from app.utils import TMP_DIR, release_lock, utc_now
+from app.utils import TMP_DIR, release_lock, truncate_error, utc_now

 logger = logging.getLogger(__name__)


-# ── 错误分类 ────────────────────────────────────────────────────────────
-
-
-def _classify_error(exc: Exception) -> str:
-    """将异常映射到 error_type 枚举值。"""
-    if isinstance(exc, PdfDownloadError):
-        return "pdf_download_failed"
-    if isinstance(exc, PiTimeoutError):
-        return "timeout"
-    if isinstance(exc, PiProcessError):
-        return "process_error"
-    if isinstance(exc, JsonNotFoundError):
-        return "json_not_found"
-    if isinstance(exc, json.JSONDecodeError):
-        return "json_invalid"
-    if isinstance(exc, ValidationError):
-        return classify_validation_error(exc)
-    return "unknown"
-
-
-# ── FTS5 文本构建 ───────────────────────────────────────────────────────
-
-
-def _build_fts_summary_text(schema: SummarySchema) -> str:
-    """拼接用于 FTS5 索引的总结文本。"""
-    parts = [
-        schema.one_line or "",
-        schema.motivation.problem or "",
-        schema.motivation.goal or "",
-        schema.method.overview or "",
-        schema.method.key_idea or "",
-        schema.results.main_findings or "",
-    ]
-    return " ".join(p for p in parts if p)
-
-
-# ── DB 更新 ─────────────────────────────────────────────────────────────
-
-
-def _update_summary_in_db(
-    db: Session,
-    paper: Paper,
-    schema: SummarySchema,
-    quality: str,
-    raw_output: str,
-) -> None:
-    """将校验后的总结写入 DB：paper_summaries + papers + paper_tags + FTS5。"""
-    from sqlalchemy import text
-
-    # 1. paper_summaries：upsert
-    existing = db.get(PaperSummary, paper.id)
-    flat = flatten_for_db(schema)
-    if existing:
-        for k, v in flat.items():
-            setattr(existing, k, v)
-    else:
-        db.add(PaperSummary(paper_id=paper.id, **flat))
-
-    # 2. papers 表
-    paper.title_zh = schema.title_zh
-    paper.summary_quality = quality
-    p_dir = paper_dir(paper.arxiv_id)
-    paper.summary_path = str(p_dir / "summary.json")
-    paper.raw_output_path = str(p_dir / "raw_output.txt")
-
-    # 3. AI 标签
-    existing_tag_names = {t.tag for t in paper.tags}
-    for tag_name in schema.tags:
-        if tag_name not in existing_tag_names:
-            db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="ai"))
-            existing_tag_names.add(tag_name)
-
-    # 4. FTS5 更新
-    summary_text = _build_fts_summary_text(schema)
-    db.execute(
-        text(
-            "UPDATE papers_fts SET title_zh=:title_zh, summary_text=:summary_text "
-            "WHERE rowid=:paper_id"
-        ),
-        {
-            "title_zh": schema.title_zh,
-            "summary_text": summary_text,
-            "paper_id": paper.id,
-        },
-    )
-
-    db.commit()
-    logger.info("DB updated: paper=%s quality=%s", paper.arxiv_id, quality)
-
-
-# ── JSON 验证 ──────────────────────────────────────────────────────────
-
-
-def _validate_summary(json_data: dict, arxiv_id: str) -> list[str]:
-    """验证 JSON 数据是否符合要求，返回错误列表（空=通过）。"""
-    errors: list[str] = []
-
-    if not isinstance(json_data, dict):
-        return ["顶层必须是 JSON 对象"]
-
-    # 必填字段
-    for f in ["arxiv_id", "title_zh", "one_line", "tags"]:
-        if f not in json_data or not json_data[f]:
-            errors.append(f"缺少必填字段: {f}")
-
-    # tags 必须是非空数组
-    tags = json_data.get("tags")
-    if not isinstance(tags, list) or len(tags) == 0:
-        errors.append("tags 必须是非空数组")
-
-    # 字符串段落字段（必须是 str 且 ≥50 字）
-    string_fields = [
-        ("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"),
-        ("method", "overview"), ("method", "key_idea"), ("method", "steps"),
-        ("method", "novelty"),
-        ("results", "main_findings"), ("results", "limitations"),
-        ("improvements", "weaknesses"), ("improvements", "future_work"),
-        ("improvements", "reproducibility"),
-    ]
-    for section, field in string_fields:
-        val = json_data.get(section, {}).get(field)
-        if isinstance(val, list):
-            errors.append(f"{section}.{field} 应该是字符串段落，不能是数组")
-        elif not isinstance(val, str) or len(val.strip()) < 50:
-            errors.append(
-                f"{section}.{field} 必须是详细段落（≥50字），"
-                f"当前: {type(val).__name__} ({len(str(val))}字)"
-            )
-
-    # benchmarks 必须是数组
-    benchmarks = json_data.get("results", {}).get("benchmarks")
-    if benchmarks is not None and not isinstance(benchmarks, list):
-        errors.append("results.benchmarks 必须是数组")
-
-    # prerequisites.concepts 必须是对象数组，每个有 term
-    concepts = json_data.get("prerequisites", {}).get("concepts")
-    if concepts is not None:
-        if not isinstance(concepts, list):
-            errors.append("prerequisites.concepts 必须是数组")
-        elif len(concepts) == 0:
-            errors.append("prerequisites.concepts 不能为空")
-        else:
-            for i, c in enumerate(concepts):
-                if isinstance(c, str):
-                    errors.append(f"prerequisites.concepts[{i}] 应该是对象 {{term,explanation,why_matters}}，不能是字符串")
-                elif isinstance(c, dict) and not c.get("term"):
-                    errors.append(f"prerequisites.concepts[{i}] 缺少 term 字段")
-
-    # figures 必须是数组，每个元素应有 id
-    figures = json_data.get("figures")
-    if figures is not None:
-        if not isinstance(figures, list):
-            errors.append("figures 必须是数组")
-        else:
-            for i, fig in enumerate(figures):
-                if isinstance(fig, dict) and not fig.get("id"):
-                    errors.append(f"figures[{i}] 缺少 id 字段")
-
-    return errors
-
-
-# ── 文件操作 ────────────────────────────────────────────────────────────
-
-
-def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) -> None:
-    d = paper_dir(arxiv_id)
-    d.mkdir(parents=True, exist_ok=True)
-    if schema:
-        (d / "summary.json").write_text(
-            schema.model_dump_json(ensure_ascii=False, indent=2),
-            encoding="utf-8",
-        )
-    (d / "raw_output.txt").write_text(raw_output, encoding="utf-8")
-
-
 # ── 单篇总结 ────────────────────────────────────────────────────────────


@@ -264,277 +73,7 @@ async def summarize_one(
    return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)


-async def _generate_with_retry(
-    arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto"
-) -> tuple[dict, str]:
-    """调用 AI 后端生成总结，最多 4 轮验证循环。
-
-    根据 settings.SUMMARY_BACKEND 选择 pi 或 claude 后端。
-
-    Returns:
-        (json_data, raw_output)
-    Raises:
-        ValueError: 4 轮验证仍未通过
-    """
-    import time as _time
-
-    backend = settings.SUMMARY_BACKEND
-    validation_errors: list[str] = []
-    json_data: dict | None = None
-    raw_output = ""
-    session_id = None
-
-    summary_file = paper_dir(arxiv_id) / "summary.json"
-
-    # claude 后端需要预构建 prompt（pi 后端在 call_pi 内部构建）
-    claude_prompt: str | None = None
-    if backend == "claude":
-        _t0 = _time.monotonic()
-        txt_path = extract_pdf_text(pdf_path, max_chars=None)
-        body = txt_path.read_text(encoding="utf-8")
-        if len(body) > 80_000:
-            trimmed = body[:80_000].rstrip()
-            txt_path.write_text(trimmed, encoding="utf-8")
-        claude_prompt = build_prompt(arxiv_id, meta_path, txt_path, "inject", None)
-        logger.info("  [%s] 构建prompt: %.2fs", arxiv_id, _time.monotonic() - _t0)
-
-    for attempt in range(1, 5):
-        # 清理上一轮写入的不完整文件
-        if summary_file.exists():
-            summary_file.unlink()
-
-        # 记录 AI 调用开始时间
-        _t_call_start = _time.monotonic()
-
-        if backend == "claude":
-            if attempt == 1:
-                raw_output, session_id = await claude_backend.call_claude(
-                    claude_prompt, session_id=None,
-                )
-            else:
-                retry_prompt = build_prompt(
-                    arxiv_id, meta_path,
-                    extract_pdf_text(pdf_path, max_chars=80000),
-                    "inject", fix_errors=validation_errors,
-                )
-                raw_output, session_id = await claude_backend.call_claude(
-                    retry_prompt, session_id=session_id, fix_errors=validation_errors,
-                )
-        else:
-            if attempt == 1:
-                raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
-            else:
-                raw_output, session_id = await call_pi(
-                    meta_path, pdf_path,
-                    fix_errors=validation_errors,
-                    session_id=session_id,
-                    pdf_mode=pdf_mode,
-                )
-
-        _t_call_end = _time.monotonic()
-
-        # 检查 summary.json 是否由 AI 子进程写入
-        file_written_by_ai = summary_file.exists()
-        file_mtime = summary_file.stat().st_mtime if file_written_by_ai else None
-        file_size = summary_file.stat().st_size if file_written_by_ai else 0
-
-        logger.info(
-            "  [%s] attempt %d AI调用: %.2fs  summary.json=%s%s",
-            arxiv_id, attempt,
-            _t_call_end - _t_call_start,
-            f"已写入({file_size}B)" if file_written_by_ai else "未写入",
-            f" mtime={file_mtime:.2f}" if file_mtime else "",
-        )
-
-        # 提取 JSON
-        _t_json_start = _time.monotonic()
-        try:
-            if file_written_by_ai:
-                json_data = json.loads(summary_file.read_text(encoding="utf-8"))
-                logger.info("  [%s] 从AI写入的summary.json读取", arxiv_id)
-            else:
-                json_data = extract_json(raw_output)
-        except (json.JSONDecodeError, JsonNotFoundError) as exc:
-            _t_json_end = _time.monotonic()
-            logger.warning(
-                "  [%s] JSON提取失败: %.2fs  %s",
-                arxiv_id, _t_json_end - _t_json_start, str(exc)[:200],
-            )
-            validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
-            continue
-        _t_json_end = _time.monotonic()
-
-        # 验证
-        _t_val_start = _time.monotonic()
-        validation_errors = _validate_summary(json_data, arxiv_id)
-        _t_val_end = _time.monotonic()
-
-        if not validation_errors:
-            logger.info(
-                "  [%s] JSON提取: %.2fs  验证: %.2fs  ✅",
-                arxiv_id,
-                _t_json_end - _t_json_start,
-                _t_val_end - _t_val_start,
-            )
-            break
-        logger.warning(
-            "  [%s] JSON提取: %.2fs  验证: %.2fs  ❌ %s",
-            arxiv_id,
-            _t_json_end - _t_json_start,
-            _t_val_end - _t_val_start,
-            "; ".join(validation_errors)[:200],
-        )
-
-    if validation_errors:
-        exc = ValueError(
-            f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}"
-        )
-        exc.raw_output = raw_output  # 供上层 _handle_summary_failure 使用
-        raise exc
-
-    return json_data, raw_output
-
-
-def _persist_summary(
-    db: Session, paper: Paper, json_data: dict, raw_output: str
-) -> str:
-    """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
-    import time as _time
-    arxiv_id = paper.arxiv_id
-
-    _t0 = _time.monotonic()
-    schema = SummarySchema.model_validate(json_data)
-    quality = assess_quality(schema)
-    _t1 = _time.monotonic()
-
-    _save_files(arxiv_id, schema, raw_output)
-    _t2 = _time.monotonic()
-
-    _update_summary_in_db(db, paper, schema, quality, raw_output)
-    _t3 = _time.monotonic()
-
-    # 状态 → done
-    paper.summary_status.status = SummaryState.DONE
-    paper.summary_status.quality = quality
-    paper.summary_status.completed_at = utc_now()
-    paper.summary_status.raw_output_saved = True
-    db.commit()
-    _t4 = _time.monotonic()
-
-    logger.info(
-        "  [%s] persist: pydantic=%.2fs  文件=%.2fs  DB写入=%.2fs  状态commit=%.2fs",
-        arxiv_id,
-        _t1 - _t0,
-        _t2 - _t1,
-        _t3 - _t2,
-        _t4 - _t3,
-    )
-
-    # 触发性增强（失败不影响总结）
-    _t5 = _time.monotonic()
-    _maybe_extract_images(arxiv_id, schema)
-    _t6 = _time.monotonic()
-    _maybe_index_chroma(arxiv_id, paper, schema)
-    _t7 = _time.monotonic()
-
-    logger.info(
-        "  [%s] 后处理: 图片提取=%.2fs  ChromaDB=%.2fs",
-        arxiv_id,
-        _t6 - _t5,
-        _t7 - _t6,
-    )
-
-    return quality
-
-
-def _handle_summary_failure(
-    db: Session, paper: Paper, exc: Exception, raw_output: str,
-) -> dict:
-    """记录失败：保存 raw_output、重试计数、错误分类。"""
-    error_type = _classify_error(exc)
-    logger.error(
-        "Summarize failed: %s error_type=%s %s",
-        paper.arxiv_id, error_type, str(exc)[:200],
-    )
-
-    status = paper.summary_status
-    if raw_output:
-        _save_files(paper.arxiv_id, None, raw_output)
-        status.raw_output_saved = True
-
-    status.retry_count = (status.retry_count or 0) + 1
-    status.error_type = error_type
-    status.error = str(exc)[:2000]
-
-    if status.retry_count >= settings.SUMMARY_MAX_RETRIES + 1:
-        status.status = SummaryState.PERMANENT_FAILURE
-    else:
-        status.status = SummaryState.PENDING
-
-    status.completed_at = utc_now()
-    db.commit()
-
-    return {
-        "arxiv_id": paper.arxiv_id,
-        "status": "failed",
-        "error_type": error_type,
-        "error": str(exc)[:200],
-        "retry_count": status.retry_count,
-    }
-
-
-def _cleanup_old_images(db: Session, paper: Paper) -> None:
-    """清理旧的图片文件和 figures_json，避免重新总结时残留。"""
-    arxiv_id = paper.arxiv_id
-    images_dir = paper_dir(arxiv_id) / "images"
-    if images_dir.exists():
-        for old_file in images_dir.iterdir():
-            if old_file.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif", ".svg") or old_file.name == "manifest.json":
-                old_file.unlink(missing_ok=True)
-    # 清除数据库中的 figures_json
-    if paper.summary and paper.summary.figures_json:
-        paper.summary.figures_json = None
-        db.commit()
-
-
-def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
-    """从 PDF 提取图片和表格（失败不影响总结）。"""
-    try:
-        from app.services.pdf_image_extractor import (
-            extract_images_from_pdf,
-            filter_images_by_summary,
-        )
-        pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
-        extract_images_from_pdf(arxiv_id, pdf_path)
-        if schema.figures:
-            filter_images_by_summary(arxiv_id, schema.figures)
-    except Exception:
-        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
-
-
-def _maybe_index_chroma(arxiv_id: str, paper: Paper, schema: SummarySchema) -> None:
-    """写入 ChromaDB 语义索引（失败不影响总结）。"""
-    try:
-        from app.services.embedder import index_paper
-
-        texts_dict = {
-            "arxiv_id": arxiv_id,
-            "title_zh": schema.title_zh or "",
-            "title_en": paper.title_en or "",
-            "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
-            "one_line": schema.one_line or "",
-            "motivation_problem": schema.motivation.problem or "",
-            "method_key_idea": schema.method.key_idea or "",
-            "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
-        }
-        index_paper(arxiv_id, texts_dict)
-    except Exception:
-        logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
-
-
-async def _do_summarize_one(
-    db: Session, paper: Paper, pdf_mode: str = "auto"
-) -> dict:
+async def _do_summarize_one(db: Session, paper: Paper, pdf_mode: str = "auto") -> dict:
    """实际的单篇总结执行（在 semaphore 保护下）。"""
    arxiv_id = paper.arxiv_id
    title_short = (paper.title_en or "")[:50]
@@ -548,6 +87,7 @@ async def _do_summarize_one(

    # 清理旧的图片文件和 figures_json，避免重新总结时残留
    import time as _time
+
    _t_cleanup_start = _time.monotonic()
    _cleanup_old_images(db, paper)
    _t_cleanup_end = _time.monotonic()
@@ -567,7 +107,9 @@ async def _do_summarize_one(

        logger.info("  [%s] 调用 pi 生成总结...", arxiv_id)
        json_data, raw_output = await _generate_with_retry(
-            arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
+            arxiv_id,
+            meta_path,
+            TMP_DIR / arxiv_id / "paper.pdf",
            pdf_mode=pdf_mode,
        )
        _t3 = _time.monotonic()
@@ -577,7 +119,9 @@ async def _do_summarize_one(
        _t4 = _time.monotonic()
        logger.info("  [%s] 持久化: %.2fs", arxiv_id, _t4 - _t3)

-        logger.info("✅ [%s] 完成: quality=%s  总耗时: %.2fs", arxiv_id, quality, _t4 - _t0)
+        logger.info(
+            "✅ [%s] 完成: quality=%s  总耗时: %.2fs", arxiv_id, quality, _t4 - _t0
+        )
        return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}

    except Exception as exc:
@@ -586,7 +130,7 @@ async def _do_summarize_one(
        return _handle_summary_failure(db, paper, exc, fail_output)

    finally:
-        cleanup_tmp(arxiv_id)
+        pass  # cleanup_tmp(arxiv_id)  # 暂时禁用，保留 PDF 用于调试图片提取


 # ── 单篇入口 ────────────────────────────────────────────────────────────
@@ -604,25 +148,19 @@ async def summarize_single(

    _session_factory: 可选的 session 工厂，测试时注入内存 DB 的 session。
    """
-    paper = db.execute(
-        select(Paper)
-        .where(Paper.arxiv_id == arxiv_id)
-        .options(*PAPER_DEFAULT_LOAD)
-    ).unique().scalar_one_or_none()
+    paper = get_paper_by_arxiv_id(db, arxiv_id)
    if not paper:
-        return {"status": "not_found", "arxiv_id": arxiv_id}
+        raise NotFoundError(f"Paper not found: {arxiv_id}")

    make_session = _session_factory or SessionLocal

    # 每篇用独立 session 避免并发问题
    paper_db = make_session()
    try:
-        paper_in_new_session = paper_db.execute(
-            select(Paper)
-            .where(Paper.arxiv_id == arxiv_id)
-            .options(*PAPER_DEFAULT_LOAD)
-        ).unique().scalar_one_or_none()
-        result = await summarize_one(paper_db, paper_in_new_session, force=force, pdf_mode=pdf_mode)
+        paper_in_new_session = get_paper_by_arxiv_id(paper_db, arxiv_id)
+        result = await summarize_one(
+            paper_db, paper_in_new_session, force=force, pdf_mode=pdf_mode
+        )
    finally:
        paper_db.close()

@@ -656,10 +194,10 @@ async def summarize_batch(
    try:
        db.add(lock)
        db.commit()
-    except Exception:
+    except IntegrityError:
        db.rollback()
        logger.warning("Summarize batch already running (lock conflict)")
-        return {"status": "conflict", "error": "summarize batch already running"}
+        raise ConflictError("summarize batch already running")

    # CrawlLog
    log_entry = CrawlLog(
@@ -717,19 +255,18 @@ async def summarize_batch(
                    break
                paper_db = make_session()
                try:
-                    p = paper_db.execute(
-                        select(Paper)
-                        .where(Paper.id == paper.id)
-                        .options(*PAPER_DEFAULT_LOAD)
-                    ).unique().scalar_one_or_none()
+                    p = get_paper_by_id(paper_db, paper.id)
                    result = await summarize_one(paper_db, p, pdf_mode=pdf_mode)
                    status = result.get("status", "failed")
                    progress[status] = progress.get(status, 0) + 1
                    finished = sum(progress.values())
                    logger.info(
                        "📊 进度: %d/%d (✅%d ❌%d ⏭️%d) — %s",
-                        finished, total,
-                        progress["done"], progress["failed"], progress["skipped"],
+                        finished,
+                        total,
+                        progress["done"],
+                        progress["failed"],
+                        progress["skipped"],
                        paper.arxiv_id,
                    )
                    results.append(result)
@@ -785,10 +322,10 @@ async def summarize_batch(
    except Exception as exc:
        logger.exception("Summarize batch failed")
        log_entry.status = "failed"
-        log_entry.error = str(exc)[:2000]
+        log_entry.error = truncate_error(exc, limit=2000)
        log_entry.completed_at = utc_now()
        db.commit()
-        return {"status": "failed", "error": str(exc)}
+        return {"status": "failed", "error": truncate_error(exc)}

    finally:
        release_lock(db, lock)