feat: add claude backend, refactor summary utilities, improve batch worker pattern, add pymupdf4llm

This commit is contained in:
2026-06-12 22:25:57 +08:00
parent b42e9149e5
commit e2f0e1a8be
13 changed files with 1350 additions and 1010 deletions
+24 -2
View File
@@ -1,6 +1,7 @@
"""CLI 工具 — 手动抓取论文。"""
import asyncio
import logging
import typer
from dotenv import load_dotenv
@@ -49,8 +50,11 @@ def crawl(
typer.echo(f"📡 开始抓取 {target} ...")
result = asyncio.run(crawl_daily(db, target, top_n))
# 未指定日期且今天无数据时,自动回退到昨天
if not date_str and result["status"] == "success" and result["found"] == 0:
# 未指定日期且今天失败或无数据时,自动回退到昨天
need_fallback = not date_str and (
result["status"] == "failed" or result["found"] == 0
)
if need_fallback:
fallback = yesterday_str()
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
if existing > 0:
@@ -84,6 +88,11 @@ def summarize(
"--pdf-mode",
help="PDF 传递方式:auto(自动选择)| inject(全量注入)| searchpi 自主搜索)",
),
backend: str = typer.Option(
None,
"--backend",
help="总结后端:pi | claude(留空则使用 .env 配置)",
),
):
"""手动触发 AI 总结。"""
from app.config import settings
@@ -97,9 +106,22 @@ def summarize(
typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True)
raise typer.Exit(code=1)
if backend:
if backend not in ("pi", "claude"):
typer.echo(f"❌ 无效的 backend: {backend},只支持 pi / claude", err=True)
raise typer.Exit(code=1)
settings.SUMMARY_BACKEND = backend
os.makedirs(settings.db_path.parent, exist_ok=True)
_init(engine)
# 配置 logging 输出到终端
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-5s %(name)s | %(message)s",
datefmt="%H:%M:%S",
)
db = SessionLocal()
try:
if arxiv_id:
+2
View File
@@ -29,8 +29,10 @@ class Settings(BaseSettings):
HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"
# AI 总结
SUMMARY_BACKEND: str = "pi" # "pi" | "claude"
PI_BIN: str = ""
SUMMARY_SKILL: str = "daily-paper-summary"
CLAUDE_BIN: str = "claude"
SUMMARY_CONCURRENCY: int = 3
SUMMARY_TIMEOUT_SECONDS: int = 1200
SUMMARY_MAX_RETRIES: int = 2
+84
View File
@@ -0,0 +1,84 @@
"""Claude CLI 后端 — 调用 claude CLI 子进程生成总结。
和 pi_client.py 对称的接口,复用 prompt 构建、PDF 文本提取、JSON 提取逻辑。
"""
from __future__ import annotations
import asyncio
import logging
import uuid
from app.config import settings
logger = logging.getLogger(__name__)
class ClaudeTimeoutError(Exception):
    """Raised when the claude CLI subprocess does not finish within the timeout."""
class ClaudeProcessError(Exception):
    """Raised when the claude CLI exits with a non-zero return code.

    The full stderr text is kept on the instance; only its first 500
    characters are embedded in the exception message.
    """

    def __init__(self, returncode: int, stderr: str):
        message = f"claude exited with code {returncode}: {stderr[:500]}"
        super().__init__(message)
        # Exit status reported by the subprocess.
        self.returncode = returncode
        # Untruncated stderr output, already decoded to text by the caller.
        self.stderr = stderr
async def call_claude(
    prompt: str,
    session_id: str | None = None,
    fix_errors: list[str] | None = None,
) -> tuple[str, str]:
    """Run the claude CLI in print mode and return ``(stdout_text, session_id)``.

    The prompt is expected to be fully prepared by the caller; this function
    takes no file paths and no pdf_mode.

    Args:
        prompt: Complete prompt text, passed to the CLI as its final argument.
        session_id: Session to use. When ``None`` a fresh UUID is generated.
        fix_errors: Validation errors from the previous round. When non-empty
            and a ``session_id`` was supplied, the call resumes that session
            instead of starting a new one.

    Returns:
        The decoded stdout of the CLI and the session id that was used.

    Raises:
        ClaudeTimeoutError: The process exceeded ``SUMMARY_TIMEOUT_SECONDS``.
        ClaudeProcessError: The process exited with a non-zero return code.
    """
    # Decide before auto-generating an id: only a session the caller already
    # holds can be resumed.
    resume = bool(fix_errors) and session_id is not None
    if session_id is None:
        # --session-id only accepts a canonical UUID, so a prefixed short hex
        # id would be rejected by the CLI.
        session_id = str(uuid.uuid4())

    cmd = [settings.CLAUDE_BIN, "-p", "--output-format", "text"]
    if resume:
        # --resume targets this exact session; --continue would pick up the
        # most recent conversation in the working directory, which is not
        # necessarily ours when several summaries run concurrently.
        cmd += ["--resume", session_id]
    else:
        cmd += ["--session-id", session_id]
    # NOTE(review): the prompt travels as a single argv entry; very large
    # prompts may hit OS argument-size limits — confirm against real inputs.
    cmd.append(prompt)

    logger.info(
        "Calling claude (session=%s, fix=%s)",
        session_id,
        bool(fix_errors),
    )

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(),
            timeout=settings.SUMMARY_TIMEOUT_SECONDS,
        )
    except asyncio.TimeoutError:
        # Kill and reap the child so it does not linger after the timeout.
        proc.kill()
        await proc.wait()
        raise ClaudeTimeoutError(
            f"claude timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s"
        )

    if proc.returncode != 0:
        raise ClaudeProcessError(
            proc.returncode, stderr.decode("utf-8", errors="replace")
        )

    return stdout.decode("utf-8", errors="replace"), session_id
+1 -1
View File
@@ -83,7 +83,7 @@ def _parse_paper(item: dict) -> dict:
"upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0),
"hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
"arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
"pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
"pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else "",
"authors": [
a.get("name", a) if isinstance(a, dict) else a
for a in paper_info.get("authors", [])
+24 -5
View File
@@ -3,10 +3,13 @@
from __future__ import annotations
import logging
import os
import shutil
from pathlib import Path
from app.utils import PAPERS_DIR, TMP_DIR, make_http_client
import requests
from app.utils import PAPERS_DIR, TMP_DIR
logger = logging.getLogger(__name__)
@@ -31,6 +34,22 @@ def tmp_dir(arxiv_id: str) -> Path:
# ── PDF 下载 ────────────────────────────────────────────────────────────
# Lazily created, module-wide session so repeated downloads reuse TCP connections.
_http_session: requests.Session | None = None


def _get_session() -> requests.Session:
    """Return the shared ``requests.Session``, creating it on first use.

    The session carries a fixed User-Agent header. If the ``PROXY_SERVER``
    environment variable is set, it is used as both the HTTP and HTTPS proxy.
    """
    global _http_session
    if _http_session is None:
        session = requests.Session()
        session.headers.update({"User-Agent": "hf-daily-papers/1.0"})
        # Only $PROXY_SERVER is consulted here; no settings-based fallback
        # is applied by this function.
        proxy = os.environ.get("PROXY_SERVER")
        if proxy:
            session.proxies = {"http": proxy, "https": proxy}
            # Log only the part after any "user:password@" so proxy
            # credentials never end up in log output.
            logger.info(
                "PDF download using proxy from $PROXY_SERVER: %s",
                proxy.rsplit("@", 1)[-1],
            )
        # Publish the session only once it is fully configured.
        _http_session = session
    return _http_session
async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
"""下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
@@ -42,10 +61,10 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
dest = dest_dir / "paper.pdf"
try:
async with make_http_client(follow_redirects=True) as client:
resp = await client.get(pdf_url)
resp.raise_for_status()
dest.write_bytes(resp.content)
session = _get_session()
resp = session.get(pdf_url, timeout=120, allow_redirects=True)
resp.raise_for_status()
dest.write_bytes(resp.content)
except Exception as exc:
raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc
File diff suppressed because it is too large Load Diff
+43 -263
View File
@@ -1,17 +1,38 @@
"""pi CLI 调用与 JSON 提取 — 调用 pi 生成总结,从输出中提取结构化 JSON。"""
"""pi CLI 后端 — 调用 pi 子进程生成总结
通用工具函数(prompt 构建、PDF 提取、JSON 提取、meta.json)已移至 summary_utils.py。
"""
from __future__ import annotations
import asyncio
import json
import logging
import re
import uuid
from pathlib import Path
from app.config import settings
from app.services.summary_utils import (
JsonNotFoundError,
build_prompt,
extract_json,
extract_pdf_text,
write_meta_json,
)
logger = logging.getLogger(__name__)
# 重新导出,保持向后兼容
__all__ = [
"PiTimeoutError",
"PiProcessError",
"JsonNotFoundError",
"call_pi",
"write_meta_json",
"extract_pdf_text",
"build_prompt",
"extract_json",
]
# ── 自定义异常 ──────────────────────────────────────────────────────────
@@ -27,201 +48,6 @@ class PiProcessError(Exception):
super().__init__(f"pi exited with code {returncode}: {stderr[:500]}")
class JsonNotFoundError(Exception):
pass
# ── meta.json ───────────────────────────────────────────────────────────
def write_meta_json(paper) -> Path:
"""写入 data/papers/{arxiv_id}/meta.json,返回路径。"""
from app.services.pdf_downloader import paper_dir
d = paper_dir(paper.arxiv_id)
d.mkdir(parents=True, exist_ok=True)
meta_path = d / "meta.json"
authors = [a.name for a in paper.authors]
tags = [t.tag for t in paper.tags]
meta = {
"arxiv_id": paper.arxiv_id,
"title_en": paper.title_en,
"abstract": paper.abstract or "",
"published_at": paper.published_at.isoformat() if paper.published_at else None,
"authors": authors,
"tags": tags,
"upvotes": paper.upvotes,
}
meta_path.write_text(
json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
)
return meta_path
# ── PDF 文本提取 ────────────────────────────────────────────────────────
def _trim_body(text: str, max_chars: int | None = None) -> str:
"""去除参考文献,保留正文+附录,超长时从末尾截断。
策略:
1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用)
2. 正文 + 附录全部保留
3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文)
"""
import re
# 找 References 段落的位置(在 Appendix 之后的那个)
# 简单策略:找到 References 标题,如果后面没有 Appendix 就全删
# 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容
ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
if ref_match:
ref_start = ref_match.start()
# 看 References 之后有没有 Appendix
after_ref = text[ref_start:]
app_match = re.search(
r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
)
if app_match:
# References 之后有 Appendix:只删 References 段
ref_end = ref_start + app_match.start()
text = text[:ref_start] + text[ref_end:]
else:
# References 之后没有 Appendix:删掉从 References 到结尾
text = text[:ref_start].rstrip()
# 去掉 Acknowledgments(对解读无用)
ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
if ack_match:
# 只删 Acknowledgments 本身,不删后面的内容
next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
if next_section:
text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
else:
text = text[:ack_match.start()].rstrip()
# 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
if max_chars is not None and len(text) > max_chars:
text = text[:max_chars].rstrip()
return text
def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
"""用 pymupdf 提取 PDF 正文文本,保存为 .txt。
max_chars=None 时不截断,给 search/auto 模式保留完整内容。
"""
import pymupdf
txt_path = pdf_path.with_suffix(".txt")
if txt_path.exists():
# 缓存优先;如果需重新提取(不同 max_chars),先删旧文件
return txt_path
doc = pymupdf.open(str(pdf_path))
raw_text = "\n\n".join(page.get_text() for page in doc)
doc.close()
body = _trim_body(raw_text, max_chars=max_chars)
txt_path.write_text(body, encoding="utf-8")
logger.info(
"Extracted PDF text: %s (%d -> %d chars, -%d%%)",
txt_path,
len(raw_text),
len(body),
(1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
)
return txt_path
# ── Prompt 构建 ─────────────────────────────────────────────────────────
def _build_prompt(
arxiv_id: str,
meta_path: Path,
txt_path: Path,
pdf_mode: str,
fix_errors: list[str] | None = None,
) -> str:
"""根据模式构建 pi prompt。
inject: 全量注入,prompt 末尾包含论文全文内容
search: pi 自主 read 文件,prompt 只包含工作流指令
"""
json_schema = (
"## 必须包含以下字段(不要自创字段名):\n"
'{"arxiv_id": "...", '
'"title_zh": "中文标题", '
'"one_line": "一句话概括(≤50字)", '
'"tags": ["标签1","标签2"], '
'"difficulty": "入门/进阶/前沿", '
'"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
'"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
'"goal": "详细段落:本文的具体目标", '
'"gap": "详细段落:本文的独特切入角度"}, '
'"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
'"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
'"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
'"novelty": "详细段落:技术新颖性分析"}, '
'"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
'"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
'"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, '
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Tableid 严格使用 \"Figure N\"\"Table N\" 格式。"
"section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
"}"
)
writing_requirements = (
"## 写作要求\n"
"- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
"- 必须包含论文中的具体数据、数字、实验指标\n"
"- 像资深同事给同事讲论文一样,专业但易懂\n"
"- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
" 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n"
)
if fix_errors:
error_list = "\n".join(f"- {e}" for e in fix_errors)
return (
"你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
f"data/papers/{arxiv_id}/summary.json\n\n"
f"{error_list}\n\n"
"注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
"修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
)
if pdf_mode == "search":
return (
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
"## 工作流程\n"
f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n"
f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n"
f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
+ writing_requirements
+ "\n"
+ json_schema
)
else:
return (
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
"## 工作流程\n"
"论文元信息和正文全文已在上文提供,请仔细阅读。\n"
f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
"2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
+ writing_requirements
+ "\n"
+ json_schema
)
# ── pi CLI 调用 ────────────────────────────────────────────────────────
@@ -264,12 +90,10 @@ async def call_pi(
txt_path.write_text(trimmed, encoding="utf-8")
logger.info("Truncated %s for inject: %d%d chars", arxiv_id, txt_size, len(trimmed))
prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
# 构建 session ID(每篇论文一个独立 session)
if session_id is None:
import uuid
session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
# 工具列表:search 模式需要 read 工具
@@ -297,6 +121,9 @@ async def call_pi(
arxiv_id, bool(fix_errors), session_id, actual_mode,
)
import time as _time
_t_sub_start = _time.monotonic()
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
@@ -312,69 +139,22 @@ async def call_pi(
await proc.wait()
raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s")
_t_sub_end = _time.monotonic()
# 检查 summary.json 是否由 pi 子进程写入
_summary_file = pdf_path.parent / "summary.json"
_file_info = ""
if _summary_file.exists():
_file_mtime = _summary_file.stat().st_mtime
_file_size = _summary_file.stat().st_size
_file_info = f" summary.json={_file_size}B"
logger.info(
"pi subprocess for %s: %.2fs%s",
arxiv_id, _t_sub_end - _t_sub_start, _file_info,
)
if proc.returncode != 0:
raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
return stdout.decode("utf-8", errors="replace"), session_id
# ── JSON 提取 ──────────────────────────────────────────────────────────
def extract_json(raw_output: str) -> dict:
"""从 pi 输出中提取 JSON dict。三步策略:直接解析 → 代码块 → 最大花括号块。"""
# 策略 1:整体直接解析
stripped = raw_output.strip()
try:
result = json.loads(stripped)
if isinstance(result, dict) and "title_zh" in result:
return result
except json.JSONDecodeError:
pass
# 策略 2:提取 ```json ... ``` 代码块
fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL)
for match in fence_pattern.finditer(raw_output):
try:
result = json.loads(match.group(1).strip())
if isinstance(result, dict) and "title_zh" in result:
return result
except json.JSONDecodeError:
continue
# 策略 3:匹配包含 title_zh 的最大 {...} 块
brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL)
for match in brace_pattern.finditer(raw_output):
try:
return json.loads(match.group(0))
except json.JSONDecodeError:
continue
# 更宽松:找到最大的 { ... } 平衡块
best = None
best_len = 0
for i, ch in enumerate(raw_output):
if ch != "{":
continue
depth = 0
for j in range(i, len(raw_output)):
if raw_output[j] == "{":
depth += 1
elif raw_output[j] == "}":
depth -= 1
if depth == 0:
candidate = raw_output[i : j + 1]
if len(candidate) > best_len:
try:
parsed = json.loads(candidate)
if isinstance(parsed, dict):
best = parsed
best_len = len(candidate)
except json.JSONDecodeError:
pass
break
if best is not None:
return best
raise JsonNotFoundError("no JSON object found in pi output")
+200 -64
View File
@@ -29,14 +29,19 @@ from app.services.pdf_downloader import (
download_pdf,
paper_dir,
)
from app.services.pi_client import (
from app.services.summary_utils import (
JsonNotFoundError,
build_prompt,
extract_json,
write_meta_json,
extract_pdf_text,
)
from app.services.pi_client import (
PiProcessError,
PiTimeoutError,
call_pi,
extract_json,
write_meta_json,
)
from app.services import claude_backend
from app.services.schemas import (
SummarySchema,
assess_quality,
@@ -229,7 +234,6 @@ def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) ->
async def summarize_one(
db: Session,
paper: Paper,
semaphore: asyncio.Semaphore | None = None,
*,
force: bool = False,
pdf_mode: str = "auto",
@@ -257,68 +261,128 @@ async def summarize_one(
"reason": "permanent_failure",
}
if semaphore:
await semaphore.acquire()
try:
return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
finally:
if semaphore:
semaphore.release()
return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
async def _generate_with_retry(
arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto"
) -> tuple[dict, str]:
"""调用 pi CLI 生成总结,最多 4 轮验证循环。
"""调用 AI 后端生成总结,最多 4 轮验证循环。
根据 settings.SUMMARY_BACKEND 选择 pi 或 claude 后端。
Returns:
(json_data, raw_output)
Raises:
ValueError: 4 轮验证仍未通过
"""
import time as _time
backend = settings.SUMMARY_BACKEND
validation_errors: list[str] = []
json_data: dict | None = None
raw_output = ""
session_id = None
summary_file = paper_dir(arxiv_id) / "summary.json"
# claude 后端需要预构建 promptpi 后端在 call_pi 内部构建)
claude_prompt: str | None = None
if backend == "claude":
_t0 = _time.monotonic()
txt_path = extract_pdf_text(pdf_path, max_chars=None)
body = txt_path.read_text(encoding="utf-8")
if len(body) > 80_000:
trimmed = body[:80_000].rstrip()
txt_path.write_text(trimmed, encoding="utf-8")
claude_prompt = build_prompt(arxiv_id, meta_path, txt_path, "inject", None)
logger.info(" [%s] 构建prompt: %.2fs", arxiv_id, _time.monotonic() - _t0)
for attempt in range(1, 5):
# 清理上一轮 pi 写的不完整文件
stale = paper_dir(arxiv_id) / "summary.json"
if stale.exists():
stale.unlink()
# 清理上一轮写的不完整文件
if summary_file.exists():
summary_file.unlink()
if attempt == 1:
raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
# 记录 AI 调用开始时间
_t_call_start = _time.monotonic()
if backend == "claude":
if attempt == 1:
raw_output, session_id = await claude_backend.call_claude(
claude_prompt, session_id=None,
)
else:
retry_prompt = build_prompt(
arxiv_id, meta_path,
extract_pdf_text(pdf_path, max_chars=80000),
"inject", fix_errors=validation_errors,
)
raw_output, session_id = await claude_backend.call_claude(
retry_prompt, session_id=session_id, fix_errors=validation_errors,
)
else:
raw_output, session_id = await call_pi(
meta_path, pdf_path,
fix_errors=validation_errors,
session_id=session_id,
pdf_mode=pdf_mode,
)
if attempt == 1:
raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
else:
raw_output, session_id = await call_pi(
meta_path, pdf_path,
fix_errors=validation_errors,
session_id=session_id,
pdf_mode=pdf_mode,
)
# 优先读取 pi 写入的 summary.json,否则从 stdout 提取
summary_file = paper_dir(arxiv_id) / "summary.json"
_t_call_end = _time.monotonic()
# 检查 summary.json 是否由 AI 子进程写入
file_written_by_ai = summary_file.exists()
file_mtime = summary_file.stat().st_mtime if file_written_by_ai else None
file_size = summary_file.stat().st_size if file_written_by_ai else 0
logger.info(
" [%s] attempt %d AI调用: %.2fs summary.json=%s%s",
arxiv_id, attempt,
_t_call_end - _t_call_start,
f"已写入({file_size}B)" if file_written_by_ai else "未写入",
f" mtime={file_mtime:.2f}" if file_mtime else "",
)
# 提取 JSON
_t_json_start = _time.monotonic()
try:
if summary_file.exists():
if file_written_by_ai:
json_data = json.loads(summary_file.read_text(encoding="utf-8"))
logger.info("Read summary.json written by pi for %s", arxiv_id)
logger.info(" [%s] 从AI写入的summary.json读取", arxiv_id)
else:
json_data = extract_json(raw_output)
except (json.JSONDecodeError, JsonNotFoundError) as exc:
_t_json_end = _time.monotonic()
logger.warning(
"JSON extraction failed for %s (attempt %d): %s",
arxiv_id, attempt, str(exc)[:200],
" [%s] JSON提取失败: %.2fs %s",
arxiv_id, _t_json_end - _t_json_start, str(exc)[:200],
)
validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
continue
_t_json_end = _time.monotonic()
# 验证
_t_val_start = _time.monotonic()
validation_errors = _validate_summary(json_data, arxiv_id)
_t_val_end = _time.monotonic()
if not validation_errors:
logger.info(
" [%s] JSON提取: %.2fs 验证: %.2fs ✅",
arxiv_id,
_t_json_end - _t_json_start,
_t_val_end - _t_val_start,
)
break
logger.warning(
"Validation failed for %s (attempt %d): %s",
arxiv_id, attempt, "; ".join(validation_errors),
" [%s] JSON提取: %.2fs 验证: %.2fs ❌ %s",
arxiv_id,
_t_json_end - _t_json_start,
_t_val_end - _t_val_start,
"; ".join(validation_errors)[:200],
)
if validation_errors:
@@ -335,11 +399,19 @@ def _persist_summary(
db: Session, paper: Paper, json_data: dict, raw_output: str
) -> str:
"""Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
import time as _time
arxiv_id = paper.arxiv_id
_t0 = _time.monotonic()
schema = SummarySchema.model_validate(json_data)
quality = assess_quality(schema)
_t1 = _time.monotonic()
_save_files(arxiv_id, schema, raw_output)
_t2 = _time.monotonic()
_save_files(paper.arxiv_id, schema, raw_output)
_update_summary_in_db(db, paper, schema, quality, raw_output)
_t3 = _time.monotonic()
# 状态 → done
paper.summary_status.status = SummaryState.DONE
@@ -347,10 +419,30 @@ def _persist_summary(
paper.summary_status.completed_at = utc_now()
paper.summary_status.raw_output_saved = True
db.commit()
_t4 = _time.monotonic()
logger.info(
" [%s] persist: pydantic=%.2fs 文件=%.2fs DB写入=%.2fs 状态commit=%.2fs",
arxiv_id,
_t1 - _t0,
_t2 - _t1,
_t3 - _t2,
_t4 - _t3,
)
# 触发性增强(失败不影响总结)
_maybe_extract_images(paper.arxiv_id, schema)
_maybe_index_chroma(paper.arxiv_id, paper, schema)
_t5 = _time.monotonic()
_maybe_extract_images(arxiv_id, schema)
_t6 = _time.monotonic()
_maybe_index_chroma(arxiv_id, paper, schema)
_t7 = _time.monotonic()
logger.info(
" [%s] 后处理: 图片提取=%.2fs ChromaDB=%.2fs",
arxiv_id,
_t6 - _t5,
_t7 - _t6,
)
return quality
@@ -445,28 +537,47 @@ async def _do_summarize_one(
) -> dict:
"""实际的单篇总结执行(在 semaphore 保护下)。"""
arxiv_id = paper.arxiv_id
title_short = (paper.title_en or "")[:50]
# 状态 → processing
paper.summary_status.status = SummaryState.PROCESSING
paper.summary_status.started_at = utc_now()
db.commit()
logger.info("▶ [%s] 开始总结: %s", arxiv_id, title_short)
# 清理旧的图片文件和 figures_json,避免重新总结时残留
import time as _time
_t_cleanup_start = _time.monotonic()
_cleanup_old_images(db, paper)
_t_cleanup_end = _time.monotonic()
logger.info(" [%s] 清理旧数据: %.2fs", arxiv_id, _t_cleanup_end - _t_cleanup_start)
raw_output = ""
try:
meta_path = write_meta_json(paper)
await download_pdf(arxiv_id, paper.pdf_url)
_t0 = _time.monotonic()
meta_path = write_meta_json(paper)
_t1 = _time.monotonic()
logger.info(" [%s] meta.json: %.2fs", arxiv_id, _t1 - _t0)
await download_pdf(arxiv_id, paper.pdf_url)
_t2 = _time.monotonic()
logger.info(" [%s] 下载PDF: %.2fs", arxiv_id, _t2 - _t1)
logger.info(" [%s] 调用 pi 生成总结...", arxiv_id)
json_data, raw_output = await _generate_with_retry(
arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
pdf_mode=pdf_mode,
)
_t3 = _time.monotonic()
logger.info(" [%s] pi生成: %.2fs", arxiv_id, _t3 - _t2)
quality = _persist_summary(db, paper, json_data, raw_output)
_t4 = _time.monotonic()
logger.info(" [%s] 持久化: %.2fs", arxiv_id, _t4 - _t3)
logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
logger.info("✅ [%s] 完成: quality=%s 总耗时: %.2fs", arxiv_id, quality, _t4 - _t0)
return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
except Exception as exc:
@@ -588,42 +699,67 @@ async def summarize_batch(
"total": 0,
}
# 并发控制
semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY)
# 并发控制worker 模式,避免 573 个协程同时打开 DB 连接耗尽连接池
concurrency = settings.SUMMARY_CONCURRENCY
make_session = _session_factory or SessionLocal
async def _process_paper(paper: Paper) -> dict:
paper_db = make_session()
try:
p = paper_db.execute(
select(Paper)
.where(Paper.id == paper.id)
.options(*PAPER_DEFAULT_LOAD)
).unique().scalar_one_or_none()
return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode)
finally:
paper_db.close()
# 进度追踪
progress = {"done": 0, "failed": 0, "skipped": 0}
paper_queue: asyncio.Queue[Paper | None] = asyncio.Queue()
for p in papers:
paper_queue.put_nowait(p)
results = await asyncio.gather(
*[_process_paper(p) for p in papers],
async def _worker() -> list[dict]:
results: list[dict] = []
while True:
paper = paper_queue.get_nowait() if not paper_queue.empty() else None
if paper is None:
break
paper_db = make_session()
try:
p = paper_db.execute(
select(Paper)
.where(Paper.id == paper.id)
.options(*PAPER_DEFAULT_LOAD)
).unique().scalar_one_or_none()
result = await summarize_one(paper_db, p, pdf_mode=pdf_mode)
status = result.get("status", "failed")
progress[status] = progress.get(status, 0) + 1
finished = sum(progress.values())
logger.info(
"📊 进度: %d/%d (✅%d%d ⏭️%d) — %s",
finished, total,
progress["done"], progress["failed"], progress["skipped"],
paper.arxiv_id,
)
results.append(result)
except Exception as exc:
logger.error("Worker error: %s", exc)
results.append({"status": "failed", "error": str(exc)})
finally:
paper_db.close()
return results
worker_results = await asyncio.gather(
*[_worker() for _ in range(concurrency)],
return_exceptions=True,
)
results = []
for r in worker_results:
if isinstance(r, Exception):
logger.error("Unexpected error in batch: %s", r)
results.append(r)
elif isinstance(r, list):
results.extend(r)
# 统计结果
done = 0
failed = 0
skipped = 0
# 统计结果progress 已在 worker 中实时更新)
done = progress["done"]
failed = progress["failed"]
skipped = progress["skipped"]
for r in results:
if isinstance(r, Exception):
logger.error("Unexpected error in batch: %s", r)
failed += 1
elif isinstance(r, dict):
if r.get("status") == "done":
done += 1
elif r.get("status") == "skipped":
skipped += 1
else:
failed += 1
log_entry.status = "success" if failed == 0 else "failed"
log_entry.papers_found = total
+270
View File
@@ -0,0 +1,270 @@
"""总结工具函数 — PDF 文本提取、prompt 构建、JSON 提取、meta.json 写入。
与后端无关的通用逻辑,pi 和 claude 后端共享。
"""
from __future__ import annotations
import json
import logging
import re
from pathlib import Path
logger = logging.getLogger(__name__)
# ── 自定义异常 ──────────────────────────────────────────────────────────
class JsonNotFoundError(Exception):
    """Raised when no JSON object can be extracted from backend output."""
# ── meta.json ───────────────────────────────────────────────────────────
def write_meta_json(paper) -> Path:
    """Write the paper's metadata to ``meta.json`` in its paper directory.

    The directory is created if missing. Returns the path of the written file.
    """
    from app.services.pdf_downloader import paper_dir

    target_dir = paper_dir(paper.arxiv_id)
    target_dir.mkdir(parents=True, exist_ok=True)

    author_names = [author.name for author in paper.authors]
    tag_names = [entry.tag for entry in paper.tags]
    published = paper.published_at.isoformat() if paper.published_at else None

    payload = {
        "arxiv_id": paper.arxiv_id,
        "title_en": paper.title_en,
        "abstract": paper.abstract or "",
        "published_at": published,
        "authors": author_names,
        "tags": tag_names,
        "upvotes": paper.upvotes,
    }

    out_path = target_dir / "meta.json"
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    out_path.write_text(serialized, encoding="utf-8")
    return out_path
# ── PDF 文本提取 ────────────────────────────────────────────────────────
def _trim_body(text: str, max_chars: int | None = None) -> str:
    """Drop references and acknowledgments, keep body and appendix, then cap length.

    Steps:
        1. Remove the first References/Bibliography section. If an appendix
           heading follows it, only the span up to that heading is removed;
           otherwise everything from the heading to the end is removed.
        2. Remove the Acknowledgments section the same way.
        3. If ``max_chars`` is given and the text is longer, cut from the end
           (the appendix comes last, so the main body is kept preferentially).

    Args:
        text: Extracted paper text.
        max_chars: Optional upper bound on the returned length.

    Returns:
        The trimmed text.
    """
    ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
    if ref_match:
        ref_start = ref_match.start()
        app_match = re.search(
            r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$",
            text[ref_start:],
        )
        if app_match:
            # An appendix follows the references: cut only the references.
            text = text[:ref_start] + text[ref_start + app_match.start():]
        else:
            # Nothing useful after the references: cut to the end.
            text = text[:ref_start].rstrip()

    ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
    if ack_match:
        ack_start = ack_match.start()
        # This pattern must also accept the "A Appendix" / "A Supplementary"
        # headings recognised by the references pass above. Otherwise an
        # appendix that survived that pass would be deleted here together
        # with the acknowledgments.
        next_section = re.search(
            r"(?m)^(?:A\s+(?:Appendix|Supplementary)|A\s|Appendix|Supplementary|附录)\s*$",
            text[ack_start:],
        )
        if next_section:
            text = text[:ack_start] + text[ack_start + next_section.start():]
        else:
            text = text[:ack_start].rstrip()

    if max_chars is not None and len(text) > max_chars:
        text = text[:max_chars].rstrip()

    return text
def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
    """Extract the text of a PDF with pymupdf and save it next to the PDF as ``.txt``.

    Args:
        pdf_path: Path of the PDF file.
        max_chars: Optional length cap forwarded to ``_trim_body``; ``None``
            means no truncation.

    Returns:
        Path of the ``.txt`` file. If that file already exists it is returned
        unchanged and ``max_chars`` is not applied; delete the file first to
        force a re-extraction with a different limit.
    """
    txt_path = pdf_path.with_suffix(".txt")
    if txt_path.exists():
        return txt_path

    # Imported lazily so that a cache hit does not need pymupdf at all.
    import pymupdf

    # The context manager closes the document even if extraction raises.
    with pymupdf.open(str(pdf_path)) as doc:
        # sort=True enables reading-order detection, which avoids interleaved
        # columns in two-column papers.
        raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)

    body = _trim_body(raw_text, max_chars=max_chars)
    txt_path.write_text(body, encoding="utf-8")

    logger.info(
        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
        txt_path,
        len(raw_text),
        len(body),
        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
    )
    return txt_path
# ── Prompt 构建 ─────────────────────────────────────────────────────────
def build_prompt(
    arxiv_id: str,
    meta_path: Path,
    txt_path: Path,
    pdf_mode: str,
    fix_errors: list[str] | None = None,
) -> str:
    """Build the summarization prompt for the given mode.

    - ``fix_errors`` non-empty: returns a short correction prompt listing the
      errors; the mode and file paths are not used.
    - ``pdf_mode == "search"``: the prompt tells the model to read
      ``meta_path`` and ``txt_path`` itself.
    - any other mode (inject): the prompt states that metadata and full text
      were supplied earlier. This function does not embed the paper text, so
      the caller is presumably responsible for providing it — verify against
      the backend in use.
    """
    if fix_errors:
        bullet_lines = "\n".join(f"- {e}" for e in fix_errors)
        return "".join(
            [
                "你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 ",
                f"data/papers/{arxiv_id}/summary.json\n\n",
                f"{bullet_lines}\n\n",
                "注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。",
                "修正后请用 bash 运行 python scripts/validate_summary.py 验证。",
            ]
        )

    # Field-by-field schema the model must follow.
    schema_block = (
        "## 必须包含以下字段(不要自创字段名):\n"
        '{"arxiv_id": "...", '
        '"title_zh": "中文标题", '
        '"one_line": "一句话概括(≤50字)", '
        '"tags": ["标签1","标签2"], '
        '"difficulty": "入门/进阶/前沿", '
        '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
        '"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
        '"goal": "详细段落:本文的具体目标", '
        '"gap": "详细段落:本文的独特切入角度"}, '
        '"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
        '"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
        '"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
        '"novelty": "详细段落:技术新颖性分析"}, '
        '"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
        '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
        '"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, '
        '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
        '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
        '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
        '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
        '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
        "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Tableid 严格使用 \"Figure N\"\"Table N\" 格式。"
        "section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
        "}"
    )

    # Writing-style rules shared by both modes.
    style_rules = (
        "## 写作要求\n"
        "- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
        "- 必须包含论文中的具体数据、数字、实验指标\n"
        "- 像资深同事给同事讲论文一样,专业但易懂\n"
        "- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
        "  例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n"
    )

    header = "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n## 工作流程\n"

    if pdf_mode == "search":
        workflow = (
            f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n"
            f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n"
            f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
        )
    else:
        workflow = (
            "论文元信息和正文全文已在上文提供,请仔细阅读。\n"
            f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
            "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
        )

    return header + workflow + style_rules + "\n" + schema_block
# ── JSON 提取 ──────────────────────────────────────────────────────────
def extract_json(raw_output: str) -> dict:
    """Extract a JSON object from backend output.

    Strategies, tried in order:
        1. Parse the whole (stripped) output; accept a dict containing
           ``"title_zh"``.
        2. Parse each fenced code block; accept a dict containing
           ``"title_zh"``.
        3. Parse each flat ``{...}`` span (no nested braces) that mentions
           ``"title_zh"``.
        4. Fallback: decode a JSON object at every ``{`` and return the one
           spanning the most characters.

    Args:
        raw_output: Raw text produced by the backend.

    Returns:
        The extracted JSON object.

    Raises:
        JsonNotFoundError: No JSON object could be decoded from the output.
    """
    # Strategy 1: the output is nothing but the JSON document.
    try:
        result = json.loads(raw_output.strip())
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(result, dict) and "title_zh" in result:
            return result

    # Strategy 2: fenced code blocks.
    fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL)
    for match in fence_pattern.finditer(raw_output):
        try:
            result = json.loads(match.group(1).strip())
        except json.JSONDecodeError:
            continue
        if isinstance(result, dict) and "title_zh" in result:
            return result

    # Strategy 3: a flat {...} span containing "title_zh".
    brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL)
    for match in brace_pattern.finditer(raw_output):
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            continue

    # Strategy 4: let the JSON decoder find where each object ends. Unlike
    # counting braces by hand, this respects string literals, so a "{" or "}"
    # inside a string value cannot cut the candidate short.
    decoder = json.JSONDecoder()
    best = None
    best_len = 0
    pos = raw_output.find("{")
    while pos != -1:
        try:
            parsed, end = decoder.raw_decode(raw_output, pos)
        except json.JSONDecodeError:
            pos = raw_output.find("{", pos + 1)
            continue
        if isinstance(parsed, dict) and end - pos > best_len:
            best = parsed
            best_len = end - pos
        # Objects nested inside this one are necessarily shorter; skip them.
        pos = raw_output.find("{", end)

    if best is not None:
        return best

    raise JsonNotFoundError("no JSON object found in output")