feat: add claude backend, refactor summary utilities, improve batch worker pattern, add pymupdf4llm

2026-06-12 22:25:57 +08:00
parent b42e9149e5
commit e2f0e1a8be
13 changed files with 1350 additions and 1010 deletions
@@ -1,6 +1,7 @@
 """CLI 工具 — 手动抓取论文。"""

 import asyncio
+import logging

 import typer
 from dotenv import load_dotenv
@@ -49,8 +50,11 @@ def crawl(
        typer.echo(f"📡 开始抓取 {target} ...")
        result = asyncio.run(crawl_daily(db, target, top_n))

-        # 未指定日期且今天无数据时，自动回退到昨天
-        if not date_str and result["status"] == "success" and result["found"] == 0:
+        # 未指定日期且今天失败或无数据时，自动回退到昨天
+        need_fallback = not date_str and (
+            result["status"] == "failed" or result["found"] == 0
+        )
+        if need_fallback:
            fallback = yesterday_str()
            existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
            if existing > 0:
@@ -84,6 +88,11 @@ def summarize(
        "--pdf-mode",
        help="PDF 传递方式：auto（自动选择）| inject（全量注入）| search（pi 自主搜索）",
    ),
+    backend: str = typer.Option(
+        None,
+        "--backend",
+        help="总结后端：pi | claude（留空则使用 .env 配置）",
+    ),
 ):
    """手动触发 AI 总结。"""
    from app.config import settings
@@ -97,9 +106,22 @@ def summarize(
        typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode}，只支持 auto / inject / search", err=True)
        raise typer.Exit(code=1)

+    if backend:
+        if backend not in ("pi", "claude"):
+            typer.echo(f"❌ 无效的 backend: {backend}，只支持 pi / claude", err=True)
+            raise typer.Exit(code=1)
+        settings.SUMMARY_BACKEND = backend
+
    os.makedirs(settings.db_path.parent, exist_ok=True)
    _init(engine)

+    # 配置 logging 输出到终端
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)-5s %(name)s | %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
    db = SessionLocal()
    try:
        if arxiv_id:
@@ -29,8 +29,10 @@ class Settings(BaseSettings):
    HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"

    # AI 总结
+    SUMMARY_BACKEND: str = "pi"  # "pi" | "claude"
    PI_BIN: str = ""
    SUMMARY_SKILL: str = "daily-paper-summary"
+    CLAUDE_BIN: str = "claude"
    SUMMARY_CONCURRENCY: int = 3
    SUMMARY_TIMEOUT_SECONDS: int = 1200
    SUMMARY_MAX_RETRIES: int = 2
@@ -0,0 +1,84 @@
+"""Claude CLI 后端 — 调用 claude CLI 子进程生成总结。
+
+和 pi_client.py 对称的接口，复用 prompt 构建、PDF 文本提取、JSON 提取逻辑。
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import uuid
+
+from app.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+class ClaudeTimeoutError(Exception):
+    """Raised when the claude subprocess exceeds settings.SUMMARY_TIMEOUT_SECONDS."""
+    pass
+
+
+class ClaudeProcessError(Exception):
+    """Raised when the claude CLI exits with a non-zero return code.
+
+    The return code and the full stderr text are kept as attributes; only
+    the first 500 characters of stderr are put into the exception message.
+    """
+    def __init__(self, returncode: int, stderr: str):
+        self.returncode = returncode
+        self.stderr = stderr
+        super().__init__(f"claude exited with code {returncode}: {stderr[:500]}")
+
+
+async def call_claude(
+    prompt: str,
+    session_id: str | None = None,
+    fix_errors: list[str] | None = None,
+) -> tuple[str, str]:
+    """调用 claude CLI print 模式，返回 (stdout 文本, session_id)。
+
+    和 call_pi() 对称的接口，但 claude CLI 不需要文件路径和 pdf_mode——
+    所有内容已在 prompt 中准备好。
+
+    Args:
+        prompt: 完整的 prompt 文本
+        session_id: session ID（首次为 None 时自动生成）
+        fix_errors: 上一轮验证错误列表（用于重试）
+    """
+    if session_id is None:
+        session_id = f"claude-summary-{uuid.uuid4().hex[:8]}"
+
+    cmd = [settings.CLAUDE_BIN, "-p", "--output-format", "text"]
+
+    if fix_errors and session_id:
+        # 重试：延续 session
+        cmd += ["--session-id", session_id, "--continue"]
+    else:
+        cmd += ["--session-id", session_id]
+
+    cmd.append(prompt)
+
+    logger.info(
+        "Calling claude (session=%s, fix=%s)",
+        session_id,
+        bool(fix_errors),
+    )
+
+    proc = await asyncio.create_subprocess_exec(
+        *cmd,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    try:
+        stdout, stderr = await asyncio.wait_for(
+            proc.communicate(),
+            timeout=settings.SUMMARY_TIMEOUT_SECONDS,
+        )
+    except asyncio.TimeoutError:
+        proc.kill()
+        await proc.wait()
+        raise ClaudeTimeoutError(
+            f"claude timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s"
+        )
+
+    if proc.returncode != 0:
+        raise ClaudeProcessError(
+            proc.returncode, stderr.decode("utf-8", errors="replace")
+        )
+
+    return stdout.decode("utf-8", errors="replace"), session_id
@@ -83,7 +83,7 @@ def _parse_paper(item: dict) -> dict:
        "upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0),
        "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
        "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
-        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
+        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else "",
        "authors": [
            a.get("name", a) if isinstance(a, dict) else a
            for a in paper_info.get("authors", [])
@@ -3,10 +3,13 @@
 from __future__ import annotations

 import logging
+import os
 import shutil
 from pathlib import Path

-from app.utils import PAPERS_DIR, TMP_DIR, make_http_client
+import requests
+
+from app.utils import PAPERS_DIR, TMP_DIR

 logger = logging.getLogger(__name__)

@@ -31,6 +34,22 @@ def tmp_dir(arxiv_id: str) -> Path:

 # ── PDF 下载 ────────────────────────────────────────────────────────────

+# 复用 TCP 连接的 session
+_http_session: requests.Session | None = None
+
+
+def _get_session() -> requests.Session:
+    """Return the module-wide requests.Session, creating it on first use.
+
+    The session is stored in the module global ``_http_session`` so later
+    calls get the same object back.
+    """
+    global _http_session
+    if _http_session is None:
+        _http_session = requests.Session()
+        _http_session.headers.update({"User-Agent": "hf-daily-papers/1.0"})
+        # Proxy: only the $PROXY_SERVER environment variable is consulted.
+        # NOTE(review): the original comment also promised a fallback to
+        # settings.http_proxy, but no such fallback is implemented here.
+        proxy = os.environ.get("PROXY_SERVER")
+        if proxy:
+            _http_session.proxies = {"http": proxy, "https": proxy}
+            logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy)
+    return _http_session
+

 async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
    """下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
@@ -42,10 +61,10 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
    dest = dest_dir / "paper.pdf"

    try:
-        async with make_http_client(follow_redirects=True) as client:
-            resp = await client.get(pdf_url)
-            resp.raise_for_status()
-            dest.write_bytes(resp.content)
+        session = _get_session()
+        resp = session.get(pdf_url, timeout=120, allow_redirects=True)
+        resp.raise_for_status()
+        dest.write_bytes(resp.content)
    except Exception as exc:
        raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc

@@ -1,17 +1,38 @@
-"""pi CLI 调用与 JSON 提取 — 调用 pi 生成总结，从输出中提取结构化 JSON。"""
+"""pi CLI 后端 — 调用 pi 子进程生成总结。
+
+通用工具函数（prompt 构建、PDF 提取、JSON 提取、meta.json）已移至 summary_utils.py。
+"""

 from __future__ import annotations

 import asyncio
-import json
 import logging
-import re
+import uuid
 from pathlib import Path

 from app.config import settings
+from app.services.summary_utils import (
+    JsonNotFoundError,
+    build_prompt,
+    extract_json,
+    extract_pdf_text,
+    write_meta_json,
+)

 logger = logging.getLogger(__name__)

+# 重新导出，保持向后兼容
+__all__ = [
+    "PiTimeoutError",
+    "PiProcessError",
+    "JsonNotFoundError",
+    "call_pi",
+    "write_meta_json",
+    "extract_pdf_text",
+    "build_prompt",
+    "extract_json",
+]
+

 # ── 自定义异常 ──────────────────────────────────────────────────────────

@@ -27,201 +48,6 @@ class PiProcessError(Exception):
        super().__init__(f"pi exited with code {returncode}: {stderr[:500]}")


-class JsonNotFoundError(Exception):
-    pass
-
-
-# ── meta.json ───────────────────────────────────────────────────────────
-
-
-def write_meta_json(paper) -> Path:
-    """写入 data/papers/{arxiv_id}/meta.json，返回路径。"""
-    from app.services.pdf_downloader import paper_dir
-
-    d = paper_dir(paper.arxiv_id)
-    d.mkdir(parents=True, exist_ok=True)
-    meta_path = d / "meta.json"
-
-    authors = [a.name for a in paper.authors]
-    tags = [t.tag for t in paper.tags]
-    meta = {
-        "arxiv_id": paper.arxiv_id,
-        "title_en": paper.title_en,
-        "abstract": paper.abstract or "",
-        "published_at": paper.published_at.isoformat() if paper.published_at else None,
-        "authors": authors,
-        "tags": tags,
-        "upvotes": paper.upvotes,
-    }
-    meta_path.write_text(
-        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
-    )
-    return meta_path
-
-
-# ── PDF 文本提取 ────────────────────────────────────────────────────────
-
-
-def _trim_body(text: str, max_chars: int | None = None) -> str:
-    """去除参考文献，保留正文+附录，超长时从末尾截断。
-
-    策略：
-    1. 去掉 References/Bibliography 段落（纯引用列表，对解读无用）
-    2. 正文 + 附录全部保留
-    3. 如果指定了 max_chars 且总长超过，从末尾截断（附录靠后，优先保留正文）
-    """
-    import re
-
-    # 找 References 段落的位置（在 Appendix 之后的那个）
-    # 简单策略：找到 References 标题，如果后面没有 Appendix 就全删
-    # 如果后面还有 Appendix，只删 References 到 Appendix 之间的内容
-    ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
-    if ref_match:
-        ref_start = ref_match.start()
-        # 看 References 之后有没有 Appendix
-        after_ref = text[ref_start:]
-        app_match = re.search(
-            r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
-        )
-        if app_match:
-            # References 之后有 Appendix：只删 References 段
-            ref_end = ref_start + app_match.start()
-            text = text[:ref_start] + text[ref_end:]
-        else:
-            # References 之后没有 Appendix：删掉从 References 到结尾
-            text = text[:ref_start].rstrip()
-
-    # 去掉 Acknowledgments（对解读无用）
-    ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
-    if ack_match:
-        # 只删 Acknowledgments 本身，不删后面的内容
-        next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
-        if next_section:
-            text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
-        else:
-            text = text[:ack_match.start()].rstrip()
-
-    # 最后：如果指定了上限且超长，从末尾截断（附录在后面，正文在前面，优先保留正文）
-    if max_chars is not None and len(text) > max_chars:
-        text = text[:max_chars].rstrip()
-
-    return text
-
-
-def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
-    """用 pymupdf 提取 PDF 正文文本，保存为 .txt。
-
-    max_chars=None 时不截断，给 search/auto 模式保留完整内容。
-    """
-    import pymupdf
-
-    txt_path = pdf_path.with_suffix(".txt")
-    if txt_path.exists():
-        # 缓存优先；如果需重新提取（不同 max_chars），先删旧文件
-        return txt_path
-
-    doc = pymupdf.open(str(pdf_path))
-    raw_text = "\n\n".join(page.get_text() for page in doc)
-    doc.close()
-
-    body = _trim_body(raw_text, max_chars=max_chars)
-    txt_path.write_text(body, encoding="utf-8")
-    logger.info(
-        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
-        txt_path,
-        len(raw_text),
-        len(body),
-        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
-    )
-    return txt_path
-
-
-# ── Prompt 构建 ─────────────────────────────────────────────────────────
-
-
-def _build_prompt(
-    arxiv_id: str,
-    meta_path: Path,
-    txt_path: Path,
-    pdf_mode: str,
-    fix_errors: list[str] | None = None,
-) -> str:
-    """根据模式构建 pi prompt。
-
-    inject: 全量注入，prompt 末尾包含论文全文内容
-    search: pi 自主 read 文件，prompt 只包含工作流指令
-    """
-    json_schema = (
-        "## 必须包含以下字段（不要自创字段名）：\n"
-        '{"arxiv_id": "...", '
-        '"title_zh": "中文标题", '
-        '"one_line": "一句话概括(≤50字)", '
-        '"tags": ["标签1","标签2"], '
-        '"difficulty": "入门/进阶/前沿", '
-        '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的（50-150字）","why_matters":"为什么读懂本文需要它"}]}, '
-        '"motivation": {"problem": "详细段落：现有方法的具体问题（包含具体场景和数据）", '
-        '"goal": "详细段落：本文的具体目标", '
-        '"gap": "详细段落：本文的独特切入角度"}, '
-        '"method": {"overview": "详细段落：方法整体思路（先直觉再技术路线）", '
-        '"key_idea": "详细段落：核心创新点（和已有方法的本质区别）", '
-        '"steps": "详细段落：方法步骤的完整描述（每步的输入输出和具体操作）", '
-        '"novelty": "详细段落：技术新颖性分析"}, '
-        '"results": {"main_findings": "详细段落：核心发现（带具体数字和指标，逐一分析每个实验）", '
-        '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
-        '"limitations": "详细段落：局限性分析（作者承认的+你自己的观察")}, '
-        '"improvements": {"weaknesses": "详细段落：独立分析的弱点（具体场景，每个弱点给改进方向）", '
-        '"future_work": "详细段落：未来研究方向（作者提出的+基于成果可延伸的）", '
-        '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度")}, '
-        '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
-        '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
-        "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
-        "section 必须是 motivation/method/results/limitations 之一，表示该图最适合展示在哪个章节。"
-        "}"
-    )
-
-    writing_requirements = (
-        "## 写作要求\n"
-        "- 每个字符串字段必须写成详细段落（200-500字），不要用列表或数组\n"
-        "- 必须包含论文中的具体数据、数字、实验指标\n"
-        "- 像资深同事给同事讲论文一样，专业但易懂\n"
-        "- 数学公式、符号、变量必须使用 LaTeX 格式：行内公式用 $...$，独立公式用 $$...$$\n"
-        "  例如：损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$，学习率 $\\eta$\n"
-    )
-
-    if fix_errors:
-        error_list = "\n".join(f"- {e}" for e in fix_errors)
-        return (
-            "你之前生成的 JSON 存在以下问题，请修正后重新用 write_file 保存到 "
-            f"data/papers/{arxiv_id}/summary.json：\n\n"
-            f"{error_list}\n\n"
-            "注意：所有字符串字段必须是详细段落（≥50字），不能是数组或列表。"
-            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
-        )
-
-    if pdf_mode == "search":
-        return (
-            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
-            "## 工作流程\n"
-            f"1. 先用 read 工具读取 {meta_path} 了解论文元信息（标题、作者、摘要）\n"
-            f"2. 再用 read 工具阅读 {txt_path}（论文正文全文），可以多次读取定位关键段落\n"
-            f"3. 充分理解后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
-            + writing_requirements
-            + "\n"
-            + json_schema
-        )
-    else:
-        return (
-            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
-            "## 工作流程\n"
-            "论文元信息和正文全文已在上文提供，请仔细阅读。\n"
-            f"1. 充分理解论文后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
-            "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
-            + writing_requirements
-            + "\n"
-            + json_schema
-        )
-
-
 # ── pi CLI 调用 ────────────────────────────────────────────────────────


@@ -264,12 +90,10 @@ async def call_pi(
        txt_path.write_text(trimmed, encoding="utf-8")
        logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed))

-    prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
+    prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)

    # 构建 session ID（每篇论文一个独立 session）
    if session_id is None:
-        import uuid
-
        session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"

    # 工具列表：search 模式需要 read 工具
@@ -297,6 +121,9 @@ async def call_pi(
        arxiv_id, bool(fix_errors), session_id, actual_mode,
    )

+    import time as _time
+    _t_sub_start = _time.monotonic()
+
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
@@ -312,69 +139,22 @@ async def call_pi(
        await proc.wait()
        raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s")

+    _t_sub_end = _time.monotonic()
+
+    # 检查 summary.json 是否由 pi 子进程写入
+    _summary_file = pdf_path.parent / "summary.json"
+    _file_info = ""
+    if _summary_file.exists():
+        _file_mtime = _summary_file.stat().st_mtime
+        _file_size = _summary_file.stat().st_size
+        _file_info = f"  summary.json={_file_size}B"
+
+    logger.info(
+        "pi subprocess for %s: %.2fs%s",
+        arxiv_id, _t_sub_end - _t_sub_start, _file_info,
+    )
+
    if proc.returncode != 0:
        raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))

    return stdout.decode("utf-8", errors="replace"), session_id
-
-
-# ── JSON 提取 ──────────────────────────────────────────────────────────
-
-
-def extract_json(raw_output: str) -> dict:
-    """从 pi 输出中提取 JSON dict。三步策略：直接解析 → 代码块 → 最大花括号块。"""
-    # 策略 1：整体直接解析
-    stripped = raw_output.strip()
-    try:
-        result = json.loads(stripped)
-        if isinstance(result, dict) and "title_zh" in result:
-            return result
-    except json.JSONDecodeError:
-        pass
-
-    # 策略 2：提取 ```json ... ``` 代码块
-    fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL)
-    for match in fence_pattern.finditer(raw_output):
-        try:
-            result = json.loads(match.group(1).strip())
-            if isinstance(result, dict) and "title_zh" in result:
-                return result
-        except json.JSONDecodeError:
-            continue
-
-    # 策略 3：匹配包含 title_zh 的最大 {...} 块
-    brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL)
-    for match in brace_pattern.finditer(raw_output):
-        try:
-            return json.loads(match.group(0))
-        except json.JSONDecodeError:
-            continue
-
-    # 更宽松：找到最大的 { ... } 平衡块
-    best = None
-    best_len = 0
-    for i, ch in enumerate(raw_output):
-        if ch != "{":
-            continue
-        depth = 0
-        for j in range(i, len(raw_output)):
-            if raw_output[j] == "{":
-                depth += 1
-            elif raw_output[j] == "}":
-                depth -= 1
-            if depth == 0:
-                candidate = raw_output[i : j + 1]
-                if len(candidate) > best_len:
-                    try:
-                        parsed = json.loads(candidate)
-                        if isinstance(parsed, dict):
-                            best = parsed
-                            best_len = len(candidate)
-                    except json.JSONDecodeError:
-                        pass
-                break
-
-    if best is not None:
-        return best
-
-    raise JsonNotFoundError("no JSON object found in pi output")
@@ -29,14 +29,19 @@ from app.services.pdf_downloader import (
    download_pdf,
    paper_dir,
 )
-from app.services.pi_client import (
+from app.services.summary_utils import (
    JsonNotFoundError,
+    build_prompt,
+    extract_json,
+    write_meta_json,
+    extract_pdf_text,
+)
+from app.services.pi_client import (
    PiProcessError,
    PiTimeoutError,
    call_pi,
-    extract_json,
-    write_meta_json,
 )
+from app.services import claude_backend
 from app.services.schemas import (
    SummarySchema,
    assess_quality,
@@ -229,7 +234,6 @@ def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) ->
 async def summarize_one(
    db: Session,
    paper: Paper,
-    semaphore: asyncio.Semaphore | None = None,
    *,
    force: bool = False,
    pdf_mode: str = "auto",
@@ -257,68 +261,128 @@ async def summarize_one(
            "reason": "permanent_failure",
        }

-    if semaphore:
-        await semaphore.acquire()
-    try:
-        return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
-    finally:
-        if semaphore:
-            semaphore.release()
+    return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)


 async def _generate_with_retry(
    arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto"
 ) -> tuple[dict, str]:
-    """调用 pi CLI 生成总结，最多 4 轮验证循环。
+    """调用 AI 后端生成总结，最多 4 轮验证循环。
+
+    根据 settings.SUMMARY_BACKEND 选择 pi 或 claude 后端。

    Returns:
        (json_data, raw_output)
    Raises:
        ValueError: 4 轮验证仍未通过
    """
+    import time as _time
+
+    backend = settings.SUMMARY_BACKEND
    validation_errors: list[str] = []
    json_data: dict | None = None
    raw_output = ""
    session_id = None

+    summary_file = paper_dir(arxiv_id) / "summary.json"
+
+    # claude 后端需要预构建 prompt（pi 后端在 call_pi 内部构建）
+    claude_prompt: str | None = None
+    if backend == "claude":
+        _t0 = _time.monotonic()
+        txt_path = extract_pdf_text(pdf_path, max_chars=None)
+        body = txt_path.read_text(encoding="utf-8")
+        if len(body) > 80_000:
+            trimmed = body[:80_000].rstrip()
+            txt_path.write_text(trimmed, encoding="utf-8")
+        claude_prompt = build_prompt(arxiv_id, meta_path, txt_path, "inject", None)
+        logger.info("  [%s] 构建prompt: %.2fs", arxiv_id, _time.monotonic() - _t0)
+
    for attempt in range(1, 5):
-        # 清理上一轮 pi 写的不完整文件
-        stale = paper_dir(arxiv_id) / "summary.json"
-        if stale.exists():
-            stale.unlink()
+        # 清理上一轮写入的不完整文件
+        if summary_file.exists():
+            summary_file.unlink()

-        if attempt == 1:
-            raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
+        # 记录 AI 调用开始时间
+        _t_call_start = _time.monotonic()
+
+        if backend == "claude":
+            if attempt == 1:
+                raw_output, session_id = await claude_backend.call_claude(
+                    claude_prompt, session_id=None,
+                )
+            else:
+                retry_prompt = build_prompt(
+                    arxiv_id, meta_path,
+                    extract_pdf_text(pdf_path, max_chars=80000),
+                    "inject", fix_errors=validation_errors,
+                )
+                raw_output, session_id = await claude_backend.call_claude(
+                    retry_prompt, session_id=session_id, fix_errors=validation_errors,
+                )
        else:
-            raw_output, session_id = await call_pi(
-                meta_path, pdf_path,
-                fix_errors=validation_errors,
-                session_id=session_id,
-                pdf_mode=pdf_mode,
-            )
+            if attempt == 1:
+                raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
+            else:
+                raw_output, session_id = await call_pi(
+                    meta_path, pdf_path,
+                    fix_errors=validation_errors,
+                    session_id=session_id,
+                    pdf_mode=pdf_mode,
+                )

-        # 优先读取 pi 写入的 summary.json，否则从 stdout 提取
-        summary_file = paper_dir(arxiv_id) / "summary.json"
+        _t_call_end = _time.monotonic()
+
+        # 检查 summary.json 是否由 AI 子进程写入
+        file_written_by_ai = summary_file.exists()
+        file_mtime = summary_file.stat().st_mtime if file_written_by_ai else None
+        file_size = summary_file.stat().st_size if file_written_by_ai else 0
+
+        logger.info(
+            "  [%s] attempt %d AI调用: %.2fs  summary.json=%s%s",
+            arxiv_id, attempt,
+            _t_call_end - _t_call_start,
+            f"已写入({file_size}B)" if file_written_by_ai else "未写入",
+            f" mtime={file_mtime:.2f}" if file_mtime else "",
+        )
+
+        # 提取 JSON
+        _t_json_start = _time.monotonic()
        try:
-            if summary_file.exists():
+            if file_written_by_ai:
                json_data = json.loads(summary_file.read_text(encoding="utf-8"))
-                logger.info("Read summary.json written by pi for %s", arxiv_id)
+                logger.info("  [%s] 从AI写入的summary.json读取", arxiv_id)
            else:
                json_data = extract_json(raw_output)
        except (json.JSONDecodeError, JsonNotFoundError) as exc:
+            _t_json_end = _time.monotonic()
            logger.warning(
-                "JSON extraction failed for %s (attempt %d): %s",
-                arxiv_id, attempt, str(exc)[:200],
+                "  [%s] JSON提取失败: %.2fs  %s",
+                arxiv_id, _t_json_end - _t_json_start, str(exc)[:200],
            )
            validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
            continue
+        _t_json_end = _time.monotonic()

+        # 验证
+        _t_val_start = _time.monotonic()
        validation_errors = _validate_summary(json_data, arxiv_id)
+        _t_val_end = _time.monotonic()
+
        if not validation_errors:
+            logger.info(
+                "  [%s] JSON提取: %.2fs  验证: %.2fs  ✅",
+                arxiv_id,
+                _t_json_end - _t_json_start,
+                _t_val_end - _t_val_start,
+            )
            break
        logger.warning(
-            "Validation failed for %s (attempt %d): %s",
-            arxiv_id, attempt, "; ".join(validation_errors),
+            "  [%s] JSON提取: %.2fs  验证: %.2fs  ❌ %s",
+            arxiv_id,
+            _t_json_end - _t_json_start,
+            _t_val_end - _t_val_start,
+            "; ".join(validation_errors)[:200],
        )

    if validation_errors:
@@ -335,11 +399,19 @@ def _persist_summary(
    db: Session, paper: Paper, json_data: dict, raw_output: str
 ) -> str:
    """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
+    import time as _time
+    arxiv_id = paper.arxiv_id
+
+    _t0 = _time.monotonic()
    schema = SummarySchema.model_validate(json_data)
    quality = assess_quality(schema)
+    _t1 = _time.monotonic()
+
+    _save_files(arxiv_id, schema, raw_output)
+    _t2 = _time.monotonic()

-    _save_files(paper.arxiv_id, schema, raw_output)
    _update_summary_in_db(db, paper, schema, quality, raw_output)
+    _t3 = _time.monotonic()

    # 状态 → done
    paper.summary_status.status = SummaryState.DONE
@@ -347,10 +419,30 @@ def _persist_summary(
    paper.summary_status.completed_at = utc_now()
    paper.summary_status.raw_output_saved = True
    db.commit()
+    _t4 = _time.monotonic()
+
+    logger.info(
+        "  [%s] persist: pydantic=%.2fs  文件=%.2fs  DB写入=%.2fs  状态commit=%.2fs",
+        arxiv_id,
+        _t1 - _t0,
+        _t2 - _t1,
+        _t3 - _t2,
+        _t4 - _t3,
+    )

    # 触发性增强（失败不影响总结）
-    _maybe_extract_images(paper.arxiv_id, schema)
-    _maybe_index_chroma(paper.arxiv_id, paper, schema)
+    _t5 = _time.monotonic()
+    _maybe_extract_images(arxiv_id, schema)
+    _t6 = _time.monotonic()
+    _maybe_index_chroma(arxiv_id, paper, schema)
+    _t7 = _time.monotonic()
+
+    logger.info(
+        "  [%s] 后处理: 图片提取=%.2fs  ChromaDB=%.2fs",
+        arxiv_id,
+        _t6 - _t5,
+        _t7 - _t6,
+    )

    return quality

@@ -445,28 +537,47 @@ async def _do_summarize_one(
 ) -> dict:
    """实际的单篇总结执行（在 semaphore 保护下）。"""
    arxiv_id = paper.arxiv_id
+    title_short = (paper.title_en or "")[:50]

    # 状态 → processing
    paper.summary_status.status = SummaryState.PROCESSING
    paper.summary_status.started_at = utc_now()
    db.commit()

+    logger.info("▶ [%s] 开始总结: %s", arxiv_id, title_short)
+
    # 清理旧的图片文件和 figures_json，避免重新总结时残留
+    import time as _time
+    _t_cleanup_start = _time.monotonic()
    _cleanup_old_images(db, paper)
+    _t_cleanup_end = _time.monotonic()
+    logger.info("  [%s] 清理旧数据: %.2fs", arxiv_id, _t_cleanup_end - _t_cleanup_start)

    raw_output = ""
    try:
-        meta_path = write_meta_json(paper)
-        await download_pdf(arxiv_id, paper.pdf_url)
+        _t0 = _time.monotonic()

+        meta_path = write_meta_json(paper)
+        _t1 = _time.monotonic()
+        logger.info("  [%s] meta.json: %.2fs", arxiv_id, _t1 - _t0)
+
+        await download_pdf(arxiv_id, paper.pdf_url)
+        _t2 = _time.monotonic()
+        logger.info("  [%s] 下载PDF: %.2fs", arxiv_id, _t2 - _t1)
+
+        logger.info("  [%s] 调用 pi 生成总结...", arxiv_id)
        json_data, raw_output = await _generate_with_retry(
            arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
            pdf_mode=pdf_mode,
        )
+        _t3 = _time.monotonic()
+        logger.info("  [%s] pi生成: %.2fs", arxiv_id, _t3 - _t2)

        quality = _persist_summary(db, paper, json_data, raw_output)
+        _t4 = _time.monotonic()
+        logger.info("  [%s] 持久化: %.2fs", arxiv_id, _t4 - _t3)

-        logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
+        logger.info("✅ [%s] 完成: quality=%s  总耗时: %.2fs", arxiv_id, quality, _t4 - _t0)
        return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}

    except Exception as exc:
@@ -588,42 +699,67 @@ async def summarize_batch(
                "total": 0,
            }

-        # 并发控制
-        semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY)
+        # 并发控制：worker 模式，避免 573 个协程同时打开 DB 连接耗尽连接池
+        concurrency = settings.SUMMARY_CONCURRENCY
        make_session = _session_factory or SessionLocal

-        async def _process_paper(paper: Paper) -> dict:
-            paper_db = make_session()
-            try:
-                p = paper_db.execute(
-                    select(Paper)
-                    .where(Paper.id == paper.id)
-                    .options(*PAPER_DEFAULT_LOAD)
-                ).unique().scalar_one_or_none()
-                return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode)
-            finally:
-                paper_db.close()
+        # 进度追踪
+        progress = {"done": 0, "failed": 0, "skipped": 0}
+        paper_queue: asyncio.Queue[Paper | None] = asyncio.Queue()
+        for p in papers:
+            paper_queue.put_nowait(p)

-        results = await asyncio.gather(
-            *[_process_paper(p) for p in papers],
+        async def _worker() -> list[dict]:
+            results: list[dict] = []
+            while True:
+                paper = paper_queue.get_nowait() if not paper_queue.empty() else None
+                if paper is None:
+                    break
+                paper_db = make_session()
+                try:
+                    p = paper_db.execute(
+                        select(Paper)
+                        .where(Paper.id == paper.id)
+                        .options(*PAPER_DEFAULT_LOAD)
+                    ).unique().scalar_one_or_none()
+                    result = await summarize_one(paper_db, p, pdf_mode=pdf_mode)
+                    status = result.get("status", "failed")
+                    progress[status] = progress.get(status, 0) + 1
+                    finished = sum(progress.values())
+                    logger.info(
+                        "📊 进度: %d/%d (✅%d ❌%d ⏭️%d) — %s",
+                        finished, total,
+                        progress["done"], progress["failed"], progress["skipped"],
+                        paper.arxiv_id,
+                    )
+                    results.append(result)
+                except Exception as exc:
+                    logger.error("Worker error: %s", exc)
+                    results.append({"status": "failed", "error": str(exc)})
+                finally:
+                    paper_db.close()
+            return results
+
+        worker_results = await asyncio.gather(
+            *[_worker() for _ in range(concurrency)],
            return_exceptions=True,
        )
+        results = []
+        for r in worker_results:
+            if isinstance(r, Exception):
+                logger.error("Unexpected error in batch: %s", r)
+                results.append(r)
+            elif isinstance(r, list):
+                results.extend(r)

-        # 统计结果
-        done = 0
-        failed = 0
-        skipped = 0
+        # 统计结果（progress 已在 worker 中实时更新）
+        done = progress["done"]
+        failed = progress["failed"]
+        skipped = progress["skipped"]
        for r in results:
            if isinstance(r, Exception):
                logger.error("Unexpected error in batch: %s", r)
                failed += 1
-            elif isinstance(r, dict):
-                if r.get("status") == "done":
-                    done += 1
-                elif r.get("status") == "skipped":
-                    skipped += 1
-                else:
-                    failed += 1

        log_entry.status = "success" if failed == 0 else "failed"
        log_entry.papers_found = total
@@ -0,0 +1,270 @@
+"""总结工具函数 — PDF 文本提取、prompt 构建、JSON 提取、meta.json 写入。
+
+与后端无关的通用逻辑，pi 和 claude 后端共享。
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+# ── 自定义异常 ──────────────────────────────────────────────────────────
+
+
+class JsonNotFoundError(Exception):
+    """Raised when no JSON object can be located in a piece of raw output."""
+
+
+# ── meta.json ───────────────────────────────────────────────────────────
+
+
+def write_meta_json(paper) -> Path:
+    """Dump the paper's metadata to ``meta.json`` inside its paper directory.
+
+    The directory is whatever ``paper_dir(paper.arxiv_id)`` returns; it is
+    created (including parents) when missing. The file is written as
+    UTF-8, pretty-printed JSON with non-ASCII characters kept as-is.
+
+    Returns the path of the written ``meta.json``.
+    """
+    from app.services.pdf_downloader import paper_dir
+
+    target_dir = paper_dir(paper.arxiv_id)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    author_names = [author.name for author in paper.authors]
+    tag_names = [entry.tag for entry in paper.tags]
+    published = paper.published_at.isoformat() if paper.published_at else None
+
+    payload = json.dumps(
+        {
+            "arxiv_id": paper.arxiv_id,
+            "title_en": paper.title_en,
+            "abstract": paper.abstract or "",
+            "published_at": published,
+            "authors": author_names,
+            "tags": tag_names,
+            "upvotes": paper.upvotes,
+        },
+        ensure_ascii=False,
+        indent=2,
+    )
+
+    out_path = target_dir / "meta.json"
+    out_path.write_text(payload, encoding="utf-8")
+    return out_path
+
+
+# ── PDF 文本提取 ────────────────────────────────────────────────────────
+
+
+# Section-heading patterns used by _trim_body. Each one only matches a line
+# that holds the heading and nothing else (apart from trailing whitespace).
+_REFERENCES_HEADING = re.compile(r"(?m)^(?:References|Bibliography|参考文献)\s*$")
+# Accepts both "Acknowledgment(s)" and "Acknowledgement(s)".
+_ACK_HEADING = re.compile(r"(?m)^(?:Acknowledge?ments?|致谢)\s*$")
+_APPENDIX_AFTER_REFS = re.compile(
+    r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$"
+)
+# Superset of _APPENDIX_AFTER_REFS: a heading such as "A Appendix" must be
+# recognised after Acknowledgments too, otherwise the appendix is dropped.
+_SECTION_AFTER_ACK = re.compile(
+    r"(?m)^(?:A\s+(?:Appendix|Supplementary)|A\s|Appendix|Supplementary|附录)\s*$"
+)
+
+
+def _trim_body(text: str, max_chars: int | None = None) -> str:
+    """Strip references and acknowledgments, keeping main body and appendix.
+
+    Steps:
+    1. Remove the References/Bibliography section. If an appendix heading
+       follows it, only the span between the two headings is removed;
+       otherwise everything from the heading to the end is removed.
+    2. Remove the Acknowledgments section in the same way.
+    3. If ``max_chars`` is given and the result is longer, cut from the end
+       (the appendix comes last, so the main body is what survives).
+
+    Args:
+        text: Raw extracted text.
+        max_chars: Optional upper bound on the returned length; ``None``
+            disables truncation.
+
+    Returns:
+        The trimmed text.
+    """
+    refs = _REFERENCES_HEADING.search(text)
+    if refs:
+        refs_start = refs.start()
+        # refs_start is at a line start, so searching from that offset is
+        # equivalent to searching the tail slice, but yields absolute offsets.
+        appendix = _APPENDIX_AFTER_REFS.search(text, refs_start)
+        if appendix:
+            # Appendix follows the references: drop only the references.
+            text = text[:refs_start] + text[appendix.start():]
+        else:
+            # Nothing useful after the references: drop through to the end.
+            text = text[:refs_start].rstrip()
+
+    ack = _ACK_HEADING.search(text)
+    if ack:
+        ack_start = ack.start()
+        following = _SECTION_AFTER_ACK.search(text, ack_start)
+        if following:
+            # Keep whatever section follows the acknowledgments.
+            text = text[:ack_start] + text[following.start():]
+        else:
+            text = text[:ack_start].rstrip()
+
+    # Truncate last so the limit applies to the already-cleaned text.
+    if max_chars is not None and len(text) > max_chars:
+        text = text[:max_chars].rstrip()
+
+    return text
+
+
+def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
+    """Extract a PDF's text with pymupdf and store it next to the PDF as ``.txt``.
+
+    The extracted text is passed through ``_trim_body`` before being written.
+    An existing ``.txt`` file is treated as a cache and returned untouched,
+    so ``max_chars`` has no effect on a cache hit; delete the file first to
+    force a fresh extraction with a different limit.
+
+    Args:
+        pdf_path: Path of the PDF to read.
+        max_chars: Optional length limit forwarded to ``_trim_body``;
+            ``None`` keeps the full text.
+
+    Returns:
+        Path of the ``.txt`` file (``pdf_path`` with its suffix replaced).
+    """
+    txt_path = pdf_path.with_suffix(".txt")
+    if txt_path.exists():
+        # Cache hit: reuse the previous extraction as-is.
+        return txt_path
+
+    # Imported only when an extraction is actually needed, so cache hits do
+    # not require pymupdf to be importable.
+    import pymupdf
+
+    # The context manager closes the document even if get_text() raises.
+    with pymupdf.open(str(pdf_path)) as doc:
+        # sort=True asks pymupdf to order text by position, intended to avoid
+        # interleaved columns in two-column papers.
+        raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)
+
+    body = _trim_body(raw_text, max_chars=max_chars)
+    txt_path.write_text(body, encoding="utf-8")
+    logger.info(
+        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
+        txt_path,
+        len(raw_text),
+        len(body),
+        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
+    )
+    return txt_path
+
+
+# ── Prompt 构建 ─────────────────────────────────────────────────────────
+
+
+def build_prompt(
+    arxiv_id: str,
+    meta_path: Path,
+    txt_path: Path,
+    pdf_mode: str,
+    fix_errors: list[str] | None = None,
+) -> str:
+    """Build the summarization prompt for one paper.
+
+    Three prompt shapes are produced:
+
+    * ``fix_errors`` non-empty: a short repair prompt listing the errors and
+      asking for ``summary.json`` to be rewritten. This takes precedence
+      over ``pdf_mode``.
+    * ``pdf_mode == "search"``: a workflow that tells the model to read
+      ``meta_path`` and ``txt_path`` itself, followed by the writing
+      requirements and the JSON schema.
+    * any other ``pdf_mode``: a workflow stating that metadata and full text
+      were already provided earlier in the conversation (this function does
+      not embed them), followed by the writing requirements and the schema.
+
+    Args:
+        arxiv_id: Paper id, used in the ``data/papers/{arxiv_id}/summary.json``
+            output path mentioned in the prompt.
+        meta_path: Metadata file path; only referenced in search mode.
+        txt_path: Extracted-text file path; only referenced in search mode.
+        pdf_mode: ``"search"`` selects the self-reading workflow.
+        fix_errors: Validation errors from a previous attempt, if any.
+
+    Returns:
+        The prompt text.
+    """
+    if fix_errors:
+        # Repair prompt: the schema is not repeated, so return before
+        # building it.
+        error_list = "\n".join(f"- {e}" for e in fix_errors)
+        return (
+            "你之前生成的 JSON 存在以下问题，请修正后重新用 write_file 保存到 "
+            f"data/papers/{arxiv_id}/summary.json：\n\n"
+            f"{error_list}\n\n"
+            "注意：所有字符串字段必须是详细段落（≥50字），不能是数组或列表。"
+            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
+        )
+
+    # The example object is kept on a single line and is itself valid JSON:
+    # the closing brace comes before the free-text note, and every string
+    # value is properly terminated.
+    json_schema = (
+        "## 必须包含以下字段（不要自创字段名）：\n"
+        '{"arxiv_id": "...", '
+        '"title_zh": "中文标题", '
+        '"one_line": "一句话概括(≤50字)", '
+        '"tags": ["标签1","标签2"], '
+        '"difficulty": "入门/进阶/前沿", '
+        '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的（50-150字）","why_matters":"为什么读懂本文需要它"}]}, '
+        '"motivation": {"problem": "详细段落：现有方法的具体问题（包含具体场景和数据）", '
+        '"goal": "详细段落：本文的具体目标", '
+        '"gap": "详细段落：本文的独特切入角度"}, '
+        '"method": {"overview": "详细段落：方法整体思路（先直觉再技术路线）", '
+        '"key_idea": "详细段落：核心创新点（和已有方法的本质区别）", '
+        '"steps": "详细段落：方法步骤的完整描述（每步的输入输出和具体操作）", '
+        '"novelty": "详细段落：技术新颖性分析"}, '
+        '"results": {"main_findings": "详细段落：核心发现（带具体数字和指标，逐一分析每个实验）", '
+        '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
+        '"limitations": "详细段落：局限性分析（作者承认的+你自己的观察）"}, '
+        '"improvements": {"weaknesses": "详细段落：独立分析的弱点（具体场景，每个弱点给改进方向）", '
+        '"future_work": "详细段落：未来研究方向（作者提出的+基于成果可延伸的）", '
+        '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度）"}, '
+        '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
+        '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
+        "}"
+        "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
+        "section 必须是 motivation/method/results/limitations 之一，表示该图最适合展示在哪个章节。"
+    )
+
+    writing_requirements = (
+        "## 写作要求\n"
+        "- 每个字符串字段必须写成详细段落（200-500字），不要用列表或数组\n"
+        "- 必须包含论文中的具体数据、数字、实验指标\n"
+        "- 像资深同事给同事讲论文一样，专业但易懂\n"
+        "- 数学公式、符号、变量必须使用 LaTeX 格式：行内公式用 $...$，独立公式用 $$...$$\n"
+        "  例如：损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$，学习率 $\\eta$\n"
+    )
+
+    if pdf_mode == "search":
+        # The model reads the files itself with its read tool.
+        workflow = (
+            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
+            "## 工作流程\n"
+            f"1. 先用 read 工具读取 {meta_path} 了解论文元信息（标题、作者、摘要）\n"
+            f"2. 再用 read 工具阅读 {txt_path}（论文正文全文），可以多次读取定位关键段落\n"
+            f"3. 充分理解后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
+        )
+    else:
+        # Metadata and full text are expected to have been supplied already.
+        workflow = (
+            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
+            "## 工作流程\n"
+            "论文元信息和正文全文已在上文提供，请仔细阅读。\n"
+            f"1. 充分理解论文后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
+            "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
+        )
+
+    return workflow + writing_requirements + "\n" + json_schema
+
+
+# ── JSON 提取 ──────────────────────────────────────────────────────────
+
+
+def extract_json(raw_output: str) -> dict:
+    """Extract a JSON object (dict) from free-form model output.
+
+    Strategies are tried in order; the first hit wins:
+
+    1. Parse the whole (stripped) output; accept a dict containing
+       ``"title_zh"``.
+    2. Parse each fenced code block (```` ``` ```` or ```` ```json ````);
+       accept a dict containing ``"title_zh"``.
+    3. Parse each flat ``{...}`` span (no nested braces) whose text contains
+       ``"title_zh"``.
+    4. Fallback: decode a JSON object starting at every ``{`` and return the
+       one spanning the most characters, whatever its keys.
+
+    Args:
+        raw_output: Raw text that may contain a JSON object.
+
+    Returns:
+        The extracted dict.
+
+    Raises:
+        JsonNotFoundError: If no strategy yields a JSON object.
+    """
+
+    def _is_summary(value) -> bool:
+        return isinstance(value, dict) and "title_zh" in value
+
+    # Strategy 1: the output is nothing but the JSON document.
+    try:
+        parsed = json.loads(raw_output.strip())
+    except json.JSONDecodeError:
+        pass
+    else:
+        if _is_summary(parsed):
+            return parsed
+
+    # Strategy 2: fenced code blocks.
+    for fence in re.finditer(r"```(?:json)?\s*\n(.*?)```", raw_output, re.DOTALL):
+        try:
+            parsed = json.loads(fence.group(1).strip())
+        except json.JSONDecodeError:
+            continue
+        if _is_summary(parsed):
+            return parsed
+
+    # Strategy 3: flat {...} spans mentioning "title_zh".
+    for flat in re.finditer(r"\{[^{}]*\"title_zh\"[^{}]*\}", raw_output, re.DOTALL):
+        try:
+            return json.loads(flat.group(0))
+        except json.JSONDecodeError:
+            continue
+
+    # Strategy 4: largest decodable object. raw_decode understands JSON
+    # string syntax, so braces inside string values (e.g. LaTeX fragments)
+    # cannot confuse the object boundaries the way brace counting would.
+    decoder = json.JSONDecoder()
+    best = None
+    best_len = 0
+    pos = raw_output.find("{")
+    while pos != -1:
+        try:
+            parsed, end = decoder.raw_decode(raw_output, pos)
+        except json.JSONDecodeError:
+            # Not an object start; nested objects further in may still parse.
+            pos = raw_output.find("{", pos + 1)
+            continue
+        # A value decoded from a position holding "{" is always a dict.
+        if end - pos > best_len:
+            best, best_len = parsed, end - pos
+        # Objects nested inside this one are shorter, so skip past it.
+        pos = raw_output.find("{", end)
+
+    if best is not None:
+        return best
+
+    raise JsonNotFoundError("no JSON object found in output")