feat: add claude backend, refactor summary utilities, improve batch worker pattern, add pymupdf4llm

This commit is contained in:
2026-06-12 22:25:57 +08:00
parent b42e9149e5
commit e2f0e1a8be
13 changed files with 1350 additions and 1010 deletions
+3
View File
@@ -19,8 +19,11 @@ HTTP_MAX_RETRIES=3
HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
# ─── AI 总结 ────────────────────────────── # ─── AI 总结 ──────────────────────────────
# 总结后端:pi | claude
SUMMARY_BACKEND=pi
PI_BIN= PI_BIN=
SUMMARY_SKILL=daily-paper-summary SUMMARY_SKILL=daily-paper-summary
CLAUDE_BIN=claude
SUMMARY_CONCURRENCY=3 SUMMARY_CONCURRENCY=3
SUMMARY_TIMEOUT_SECONDS=1200 SUMMARY_TIMEOUT_SECONDS=1200
SUMMARY_MAX_RETRIES=2 SUMMARY_MAX_RETRIES=2
+24 -2
View File
@@ -1,6 +1,7 @@
"""CLI 工具 — 手动抓取论文。""" """CLI 工具 — 手动抓取论文。"""
import asyncio import asyncio
import logging
import typer import typer
from dotenv import load_dotenv from dotenv import load_dotenv
@@ -49,8 +50,11 @@ def crawl(
typer.echo(f"📡 开始抓取 {target} ...") typer.echo(f"📡 开始抓取 {target} ...")
result = asyncio.run(crawl_daily(db, target, top_n)) result = asyncio.run(crawl_daily(db, target, top_n))
# 未指定日期且今天无数据时,自动回退到昨天 # 未指定日期且今天失败或无数据时,自动回退到昨天
if not date_str and result["status"] == "success" and result["found"] == 0: need_fallback = not date_str and (
result["status"] == "failed" or result["found"] == 0
)
if need_fallback:
fallback = yesterday_str() fallback = yesterday_str()
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0 existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
if existing > 0: if existing > 0:
@@ -84,6 +88,11 @@ def summarize(
"--pdf-mode", "--pdf-mode",
help="PDF 传递方式:auto(自动选择)| inject(全量注入)| searchpi 自主搜索)", help="PDF 传递方式:auto(自动选择)| inject(全量注入)| searchpi 自主搜索)",
), ),
backend: str = typer.Option(
None,
"--backend",
help="总结后端:pi | claude(留空则使用 .env 配置)",
),
): ):
"""手动触发 AI 总结。""" """手动触发 AI 总结。"""
from app.config import settings from app.config import settings
@@ -97,9 +106,22 @@ def summarize(
typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True) typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True)
raise typer.Exit(code=1) raise typer.Exit(code=1)
if backend:
if backend not in ("pi", "claude"):
typer.echo(f"❌ 无效的 backend: {backend},只支持 pi / claude", err=True)
raise typer.Exit(code=1)
settings.SUMMARY_BACKEND = backend
os.makedirs(settings.db_path.parent, exist_ok=True) os.makedirs(settings.db_path.parent, exist_ok=True)
_init(engine) _init(engine)
# 配置 logging 输出到终端
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-5s %(name)s | %(message)s",
datefmt="%H:%M:%S",
)
db = SessionLocal() db = SessionLocal()
try: try:
if arxiv_id: if arxiv_id:
+2
View File
@@ -29,8 +29,10 @@ class Settings(BaseSettings):
HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1" HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"
# AI 总结 # AI 总结
SUMMARY_BACKEND: str = "pi" # "pi" | "claude"
PI_BIN: str = "" PI_BIN: str = ""
SUMMARY_SKILL: str = "daily-paper-summary" SUMMARY_SKILL: str = "daily-paper-summary"
CLAUDE_BIN: str = "claude"
SUMMARY_CONCURRENCY: int = 3 SUMMARY_CONCURRENCY: int = 3
SUMMARY_TIMEOUT_SECONDS: int = 1200 SUMMARY_TIMEOUT_SECONDS: int = 1200
SUMMARY_MAX_RETRIES: int = 2 SUMMARY_MAX_RETRIES: int = 2
+84
View File
@@ -0,0 +1,84 @@
"""Claude CLI 后端 — 调用 claude CLI 子进程生成总结。
和 pi_client.py 对称的接口,复用 prompt 构建、PDF 文本提取、JSON 提取逻辑。
"""
from __future__ import annotations
import asyncio
import logging
import uuid
from app.config import settings
logger = logging.getLogger(__name__)
class ClaudeTimeoutError(Exception):
    """Raised when the claude CLI subprocess exceeds the configured timeout."""
class ClaudeProcessError(Exception):
    """Raised when the claude CLI exits with a non-zero return code.

    Attributes:
        returncode: Exit status passed in by the caller.
        stderr: Stderr text passed in by the caller, stored untruncated.
    """

    def __init__(self, returncode: int, stderr: str):
        # Keep the raw values on the instance so callers can inspect them.
        self.returncode, self.stderr = returncode, stderr
        # Only a 500-character preview goes into the message to keep logs short.
        stderr_preview = stderr[:500]
        message = f"claude exited with code {returncode}: {stderr_preview}"
        super().__init__(message)
async def call_claude(
    prompt: str,
    session_id: str | None = None,
    fix_errors: list[str] | None = None,
) -> tuple[str, str]:
    """Run the claude CLI in print mode and return ``(stdout_text, session_id)``.

    The prompt is passed as the final command-line argument; no file paths or
    pdf_mode are involved because the caller has already assembled the full
    prompt text.

    Args:
        prompt: Complete prompt text.
        session_id: Session to use. When ``None`` a fresh UUID is generated,
            because the CLI's ``--session-id`` flag only accepts valid UUIDs.
        fix_errors: Validation errors from the previous round. When non-empty
            and a ``session_id`` was supplied, the call resumes that session
            instead of starting a new one.

    Returns:
        A tuple of the decoded stdout and the session id that was used, so the
        caller can pass it back in on a retry.

    Raises:
        ClaudeTimeoutError: The process ran longer than
            ``settings.SUMMARY_TIMEOUT_SECONDS`` and was killed.
        ClaudeProcessError: The process exited with a non-zero return code.
    """
    # Resuming only makes sense for a session that already exists, i.e. one the
    # caller handed back to us together with the errors to fix.
    resume = bool(fix_errors) and session_id is not None
    if session_id is None:
        session_id = str(uuid.uuid4())

    cmd = [settings.CLAUDE_BIN, "-p", "--output-format", "text"]
    if resume:
        # --resume targets this exact session; --continue would pick whatever
        # conversation was most recent in the working directory, and the CLI
        # refuses --session-id combined with --continue.
        cmd += ["--resume", session_id]
    else:
        cmd += ["--session-id", session_id]
    cmd.append(prompt)

    logger.info(
        "Calling claude (session=%s, fix=%s)",
        session_id,
        bool(fix_errors),
    )

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(),
            timeout=settings.SUMMARY_TIMEOUT_SECONDS,
        )
    except asyncio.TimeoutError as exc:
        try:
            proc.kill()
        except ProcessLookupError:
            # The process exited between the timeout firing and the kill.
            pass
        # Reap the child so it does not linger as a zombie.
        await proc.wait()
        raise ClaudeTimeoutError(
            f"claude timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s"
        ) from exc

    if proc.returncode != 0:
        raise ClaudeProcessError(
            proc.returncode, stderr.decode("utf-8", errors="replace")
        )

    return stdout.decode("utf-8", errors="replace"), session_id
+1 -1
View File
@@ -83,7 +83,7 @@ def _parse_paper(item: dict) -> dict:
"upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0), "upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0),
"hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "", "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
"arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "", "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
"pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "", "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else "",
"authors": [ "authors": [
a.get("name", a) if isinstance(a, dict) else a a.get("name", a) if isinstance(a, dict) else a
for a in paper_info.get("authors", []) for a in paper_info.get("authors", [])
+24 -5
View File
@@ -3,10 +3,13 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
import os
import shutil import shutil
from pathlib import Path from pathlib import Path
from app.utils import PAPERS_DIR, TMP_DIR, make_http_client import requests
from app.utils import PAPERS_DIR, TMP_DIR
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -31,6 +34,22 @@ def tmp_dir(arxiv_id: str) -> Path:
# ── PDF 下载 ──────────────────────────────────────────────────────────── # ── PDF 下载 ────────────────────────────────────────────────────────────
# 复用 TCP 连接的 session
_http_session: requests.Session | None = None
def _get_session() -> requests.Session:
global _http_session
if _http_session is None:
_http_session = requests.Session()
_http_session.headers.update({"User-Agent": "hf-daily-papers/1.0"})
# 代理:优先 $PROXY_SERVER,其次 settings.http_proxy
proxy = os.environ.get("PROXY_SERVER")
if proxy:
_http_session.proxies = {"http": proxy, "https": proxy}
logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy)
return _http_session
async def download_pdf(arxiv_id: str, pdf_url: str) -> Path: async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
"""下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。""" """下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
@@ -42,10 +61,10 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
dest = dest_dir / "paper.pdf" dest = dest_dir / "paper.pdf"
try: try:
async with make_http_client(follow_redirects=True) as client: session = _get_session()
resp = await client.get(pdf_url) resp = session.get(pdf_url, timeout=120, allow_redirects=True)
resp.raise_for_status() resp.raise_for_status()
dest.write_bytes(resp.content) dest.write_bytes(resp.content)
except Exception as exc: except Exception as exc:
raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc
File diff suppressed because it is too large Load Diff
+43 -263
View File
@@ -1,17 +1,38 @@
"""pi CLI 调用与 JSON 提取 — 调用 pi 生成总结,从输出中提取结构化 JSON。""" """pi CLI 后端 — 调用 pi 子进程生成总结
通用工具函数(prompt 构建、PDF 提取、JSON 提取、meta.json)已移至 summary_utils.py。
"""
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
import json
import logging import logging
import re import uuid
from pathlib import Path from pathlib import Path
from app.config import settings from app.config import settings
from app.services.summary_utils import (
JsonNotFoundError,
build_prompt,
extract_json,
extract_pdf_text,
write_meta_json,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# 重新导出,保持向后兼容
__all__ = [
"PiTimeoutError",
"PiProcessError",
"JsonNotFoundError",
"call_pi",
"write_meta_json",
"extract_pdf_text",
"build_prompt",
"extract_json",
]
# ── 自定义异常 ────────────────────────────────────────────────────────── # ── 自定义异常 ──────────────────────────────────────────────────────────
@@ -27,201 +48,6 @@ class PiProcessError(Exception):
super().__init__(f"pi exited with code {returncode}: {stderr[:500]}") super().__init__(f"pi exited with code {returncode}: {stderr[:500]}")
class JsonNotFoundError(Exception):
pass
# ── meta.json ───────────────────────────────────────────────────────────
def write_meta_json(paper) -> Path:
"""写入 data/papers/{arxiv_id}/meta.json,返回路径。"""
from app.services.pdf_downloader import paper_dir
d = paper_dir(paper.arxiv_id)
d.mkdir(parents=True, exist_ok=True)
meta_path = d / "meta.json"
authors = [a.name for a in paper.authors]
tags = [t.tag for t in paper.tags]
meta = {
"arxiv_id": paper.arxiv_id,
"title_en": paper.title_en,
"abstract": paper.abstract or "",
"published_at": paper.published_at.isoformat() if paper.published_at else None,
"authors": authors,
"tags": tags,
"upvotes": paper.upvotes,
}
meta_path.write_text(
json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
)
return meta_path
# ── PDF 文本提取 ────────────────────────────────────────────────────────
def _trim_body(text: str, max_chars: int | None = None) -> str:
"""去除参考文献,保留正文+附录,超长时从末尾截断。
策略:
1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用)
2. 正文 + 附录全部保留
3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文)
"""
import re
# 找 References 段落的位置(在 Appendix 之后的那个)
# 简单策略:找到 References 标题,如果后面没有 Appendix 就全删
# 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容
ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
if ref_match:
ref_start = ref_match.start()
# 看 References 之后有没有 Appendix
after_ref = text[ref_start:]
app_match = re.search(
r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
)
if app_match:
# References 之后有 Appendix:只删 References 段
ref_end = ref_start + app_match.start()
text = text[:ref_start] + text[ref_end:]
else:
# References 之后没有 Appendix:删掉从 References 到结尾
text = text[:ref_start].rstrip()
# 去掉 Acknowledgments(对解读无用)
ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
if ack_match:
# 只删 Acknowledgments 本身,不删后面的内容
next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
if next_section:
text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
else:
text = text[:ack_match.start()].rstrip()
# 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
if max_chars is not None and len(text) > max_chars:
text = text[:max_chars].rstrip()
return text
def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
"""用 pymupdf 提取 PDF 正文文本,保存为 .txt。
max_chars=None 时不截断,给 search/auto 模式保留完整内容。
"""
import pymupdf
txt_path = pdf_path.with_suffix(".txt")
if txt_path.exists():
# 缓存优先;如果需重新提取(不同 max_chars),先删旧文件
return txt_path
doc = pymupdf.open(str(pdf_path))
raw_text = "\n\n".join(page.get_text() for page in doc)
doc.close()
body = _trim_body(raw_text, max_chars=max_chars)
txt_path.write_text(body, encoding="utf-8")
logger.info(
"Extracted PDF text: %s (%d -> %d chars, -%d%%)",
txt_path,
len(raw_text),
len(body),
(1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
)
return txt_path
# ── Prompt 构建 ─────────────────────────────────────────────────────────
def _build_prompt(
arxiv_id: str,
meta_path: Path,
txt_path: Path,
pdf_mode: str,
fix_errors: list[str] | None = None,
) -> str:
"""根据模式构建 pi prompt。
inject: 全量注入,prompt 末尾包含论文全文内容
search: pi 自主 read 文件,prompt 只包含工作流指令
"""
json_schema = (
"## 必须包含以下字段(不要自创字段名):\n"
'{"arxiv_id": "...", '
'"title_zh": "中文标题", '
'"one_line": "一句话概括(≤50字)", '
'"tags": ["标签1","标签2"], '
'"difficulty": "入门/进阶/前沿", '
'"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
'"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
'"goal": "详细段落:本文的具体目标", '
'"gap": "详细段落:本文的独特切入角度"}, '
'"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
'"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
'"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
'"novelty": "详细段落:技术新颖性分析"}, '
'"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
'"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
'"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, '
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Tableid 严格使用 \"Figure N\"\"Table N\" 格式。"
"section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
"}"
)
writing_requirements = (
"## 写作要求\n"
"- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
"- 必须包含论文中的具体数据、数字、实验指标\n"
"- 像资深同事给同事讲论文一样,专业但易懂\n"
"- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
" 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n"
)
if fix_errors:
error_list = "\n".join(f"- {e}" for e in fix_errors)
return (
"你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
f"data/papers/{arxiv_id}/summary.json\n\n"
f"{error_list}\n\n"
"注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
"修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
)
if pdf_mode == "search":
return (
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
"## 工作流程\n"
f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n"
f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n"
f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
+ writing_requirements
+ "\n"
+ json_schema
)
else:
return (
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
"## 工作流程\n"
"论文元信息和正文全文已在上文提供,请仔细阅读。\n"
f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
"2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
+ writing_requirements
+ "\n"
+ json_schema
)
# ── pi CLI 调用 ──────────────────────────────────────────────────────── # ── pi CLI 调用 ────────────────────────────────────────────────────────
@@ -264,12 +90,10 @@ async def call_pi(
txt_path.write_text(trimmed, encoding="utf-8") txt_path.write_text(trimmed, encoding="utf-8")
logger.info("Truncated %s for inject: %d%d chars", arxiv_id, txt_size, len(trimmed)) logger.info("Truncated %s for inject: %d%d chars", arxiv_id, txt_size, len(trimmed))
prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors) prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
# 构建 session ID(每篇论文一个独立 session) # 构建 session ID(每篇论文一个独立 session)
if session_id is None: if session_id is None:
import uuid
session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}" session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
# 工具列表:search 模式需要 read 工具 # 工具列表:search 模式需要 read 工具
@@ -297,6 +121,9 @@ async def call_pi(
arxiv_id, bool(fix_errors), session_id, actual_mode, arxiv_id, bool(fix_errors), session_id, actual_mode,
) )
import time as _time
_t_sub_start = _time.monotonic()
proc = await asyncio.create_subprocess_exec( proc = await asyncio.create_subprocess_exec(
*cmd, *cmd,
stdout=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE,
@@ -312,69 +139,22 @@ async def call_pi(
await proc.wait() await proc.wait()
raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s") raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s")
_t_sub_end = _time.monotonic()
# 检查 summary.json 是否由 pi 子进程写入
_summary_file = pdf_path.parent / "summary.json"
_file_info = ""
if _summary_file.exists():
_file_mtime = _summary_file.stat().st_mtime
_file_size = _summary_file.stat().st_size
_file_info = f" summary.json={_file_size}B"
logger.info(
"pi subprocess for %s: %.2fs%s",
arxiv_id, _t_sub_end - _t_sub_start, _file_info,
)
if proc.returncode != 0: if proc.returncode != 0:
raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace")) raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
return stdout.decode("utf-8", errors="replace"), session_id return stdout.decode("utf-8", errors="replace"), session_id
# ── JSON 提取 ──────────────────────────────────────────────────────────
def extract_json(raw_output: str) -> dict:
"""从 pi 输出中提取 JSON dict。三步策略:直接解析 → 代码块 → 最大花括号块。"""
# 策略 1:整体直接解析
stripped = raw_output.strip()
try:
result = json.loads(stripped)
if isinstance(result, dict) and "title_zh" in result:
return result
except json.JSONDecodeError:
pass
# 策略 2:提取 ```json ... ``` 代码块
fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL)
for match in fence_pattern.finditer(raw_output):
try:
result = json.loads(match.group(1).strip())
if isinstance(result, dict) and "title_zh" in result:
return result
except json.JSONDecodeError:
continue
# 策略 3:匹配包含 title_zh 的最大 {...} 块
brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL)
for match in brace_pattern.finditer(raw_output):
try:
return json.loads(match.group(0))
except json.JSONDecodeError:
continue
# 更宽松:找到最大的 { ... } 平衡块
best = None
best_len = 0
for i, ch in enumerate(raw_output):
if ch != "{":
continue
depth = 0
for j in range(i, len(raw_output)):
if raw_output[j] == "{":
depth += 1
elif raw_output[j] == "}":
depth -= 1
if depth == 0:
candidate = raw_output[i : j + 1]
if len(candidate) > best_len:
try:
parsed = json.loads(candidate)
if isinstance(parsed, dict):
best = parsed
best_len = len(candidate)
except json.JSONDecodeError:
pass
break
if best is not None:
return best
raise JsonNotFoundError("no JSON object found in pi output")
+200 -64
View File
@@ -29,14 +29,19 @@ from app.services.pdf_downloader import (
download_pdf, download_pdf,
paper_dir, paper_dir,
) )
from app.services.pi_client import ( from app.services.summary_utils import (
JsonNotFoundError, JsonNotFoundError,
build_prompt,
extract_json,
write_meta_json,
extract_pdf_text,
)
from app.services.pi_client import (
PiProcessError, PiProcessError,
PiTimeoutError, PiTimeoutError,
call_pi, call_pi,
extract_json,
write_meta_json,
) )
from app.services import claude_backend
from app.services.schemas import ( from app.services.schemas import (
SummarySchema, SummarySchema,
assess_quality, assess_quality,
@@ -229,7 +234,6 @@ def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) ->
async def summarize_one( async def summarize_one(
db: Session, db: Session,
paper: Paper, paper: Paper,
semaphore: asyncio.Semaphore | None = None,
*, *,
force: bool = False, force: bool = False,
pdf_mode: str = "auto", pdf_mode: str = "auto",
@@ -257,68 +261,128 @@ async def summarize_one(
"reason": "permanent_failure", "reason": "permanent_failure",
} }
if semaphore: return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
await semaphore.acquire()
try:
return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
finally:
if semaphore:
semaphore.release()
async def _generate_with_retry( async def _generate_with_retry(
arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto" arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto"
) -> tuple[dict, str]: ) -> tuple[dict, str]:
"""调用 pi CLI 生成总结,最多 4 轮验证循环。 """调用 AI 后端生成总结,最多 4 轮验证循环。
根据 settings.SUMMARY_BACKEND 选择 pi 或 claude 后端。
Returns: Returns:
(json_data, raw_output) (json_data, raw_output)
Raises: Raises:
ValueError: 4 轮验证仍未通过 ValueError: 4 轮验证仍未通过
""" """
import time as _time
backend = settings.SUMMARY_BACKEND
validation_errors: list[str] = [] validation_errors: list[str] = []
json_data: dict | None = None json_data: dict | None = None
raw_output = "" raw_output = ""
session_id = None session_id = None
summary_file = paper_dir(arxiv_id) / "summary.json"
# claude 后端需要预构建 promptpi 后端在 call_pi 内部构建)
claude_prompt: str | None = None
if backend == "claude":
_t0 = _time.monotonic()
txt_path = extract_pdf_text(pdf_path, max_chars=None)
body = txt_path.read_text(encoding="utf-8")
if len(body) > 80_000:
trimmed = body[:80_000].rstrip()
txt_path.write_text(trimmed, encoding="utf-8")
claude_prompt = build_prompt(arxiv_id, meta_path, txt_path, "inject", None)
logger.info(" [%s] 构建prompt: %.2fs", arxiv_id, _time.monotonic() - _t0)
for attempt in range(1, 5): for attempt in range(1, 5):
# 清理上一轮 pi 写的不完整文件 # 清理上一轮写的不完整文件
stale = paper_dir(arxiv_id) / "summary.json" if summary_file.exists():
if stale.exists(): summary_file.unlink()
stale.unlink()
if attempt == 1: # 记录 AI 调用开始时间
raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode) _t_call_start = _time.monotonic()
if backend == "claude":
if attempt == 1:
raw_output, session_id = await claude_backend.call_claude(
claude_prompt, session_id=None,
)
else:
retry_prompt = build_prompt(
arxiv_id, meta_path,
extract_pdf_text(pdf_path, max_chars=80000),
"inject", fix_errors=validation_errors,
)
raw_output, session_id = await claude_backend.call_claude(
retry_prompt, session_id=session_id, fix_errors=validation_errors,
)
else: else:
raw_output, session_id = await call_pi( if attempt == 1:
meta_path, pdf_path, raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
fix_errors=validation_errors, else:
session_id=session_id, raw_output, session_id = await call_pi(
pdf_mode=pdf_mode, meta_path, pdf_path,
) fix_errors=validation_errors,
session_id=session_id,
pdf_mode=pdf_mode,
)
# 优先读取 pi 写入的 summary.json,否则从 stdout 提取 _t_call_end = _time.monotonic()
summary_file = paper_dir(arxiv_id) / "summary.json"
# 检查 summary.json 是否由 AI 子进程写入
file_written_by_ai = summary_file.exists()
file_mtime = summary_file.stat().st_mtime if file_written_by_ai else None
file_size = summary_file.stat().st_size if file_written_by_ai else 0
logger.info(
" [%s] attempt %d AI调用: %.2fs summary.json=%s%s",
arxiv_id, attempt,
_t_call_end - _t_call_start,
f"已写入({file_size}B)" if file_written_by_ai else "未写入",
f" mtime={file_mtime:.2f}" if file_mtime else "",
)
# 提取 JSON
_t_json_start = _time.monotonic()
try: try:
if summary_file.exists(): if file_written_by_ai:
json_data = json.loads(summary_file.read_text(encoding="utf-8")) json_data = json.loads(summary_file.read_text(encoding="utf-8"))
logger.info("Read summary.json written by pi for %s", arxiv_id) logger.info(" [%s] 从AI写入的summary.json读取", arxiv_id)
else: else:
json_data = extract_json(raw_output) json_data = extract_json(raw_output)
except (json.JSONDecodeError, JsonNotFoundError) as exc: except (json.JSONDecodeError, JsonNotFoundError) as exc:
_t_json_end = _time.monotonic()
logger.warning( logger.warning(
"JSON extraction failed for %s (attempt %d): %s", " [%s] JSON提取失败: %.2fs %s",
arxiv_id, attempt, str(exc)[:200], arxiv_id, _t_json_end - _t_json_start, str(exc)[:200],
) )
validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"] validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
continue continue
_t_json_end = _time.monotonic()
# 验证
_t_val_start = _time.monotonic()
validation_errors = _validate_summary(json_data, arxiv_id) validation_errors = _validate_summary(json_data, arxiv_id)
_t_val_end = _time.monotonic()
if not validation_errors: if not validation_errors:
logger.info(
" [%s] JSON提取: %.2fs 验证: %.2fs ✅",
arxiv_id,
_t_json_end - _t_json_start,
_t_val_end - _t_val_start,
)
break break
logger.warning( logger.warning(
"Validation failed for %s (attempt %d): %s", " [%s] JSON提取: %.2fs 验证: %.2fs ❌ %s",
arxiv_id, attempt, "; ".join(validation_errors), arxiv_id,
_t_json_end - _t_json_start,
_t_val_end - _t_val_start,
"; ".join(validation_errors)[:200],
) )
if validation_errors: if validation_errors:
@@ -335,11 +399,19 @@ def _persist_summary(
db: Session, paper: Paper, json_data: dict, raw_output: str db: Session, paper: Paper, json_data: dict, raw_output: str
) -> str: ) -> str:
"""Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。""" """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
import time as _time
arxiv_id = paper.arxiv_id
_t0 = _time.monotonic()
schema = SummarySchema.model_validate(json_data) schema = SummarySchema.model_validate(json_data)
quality = assess_quality(schema) quality = assess_quality(schema)
_t1 = _time.monotonic()
_save_files(arxiv_id, schema, raw_output)
_t2 = _time.monotonic()
_save_files(paper.arxiv_id, schema, raw_output)
_update_summary_in_db(db, paper, schema, quality, raw_output) _update_summary_in_db(db, paper, schema, quality, raw_output)
_t3 = _time.monotonic()
# 状态 → done # 状态 → done
paper.summary_status.status = SummaryState.DONE paper.summary_status.status = SummaryState.DONE
@@ -347,10 +419,30 @@ def _persist_summary(
paper.summary_status.completed_at = utc_now() paper.summary_status.completed_at = utc_now()
paper.summary_status.raw_output_saved = True paper.summary_status.raw_output_saved = True
db.commit() db.commit()
_t4 = _time.monotonic()
logger.info(
" [%s] persist: pydantic=%.2fs 文件=%.2fs DB写入=%.2fs 状态commit=%.2fs",
arxiv_id,
_t1 - _t0,
_t2 - _t1,
_t3 - _t2,
_t4 - _t3,
)
# 触发性增强(失败不影响总结) # 触发性增强(失败不影响总结)
_maybe_extract_images(paper.arxiv_id, schema) _t5 = _time.monotonic()
_maybe_index_chroma(paper.arxiv_id, paper, schema) _maybe_extract_images(arxiv_id, schema)
_t6 = _time.monotonic()
_maybe_index_chroma(arxiv_id, paper, schema)
_t7 = _time.monotonic()
logger.info(
" [%s] 后处理: 图片提取=%.2fs ChromaDB=%.2fs",
arxiv_id,
_t6 - _t5,
_t7 - _t6,
)
return quality return quality
@@ -445,28 +537,47 @@ async def _do_summarize_one(
) -> dict: ) -> dict:
"""实际的单篇总结执行(在 semaphore 保护下)。""" """实际的单篇总结执行(在 semaphore 保护下)。"""
arxiv_id = paper.arxiv_id arxiv_id = paper.arxiv_id
title_short = (paper.title_en or "")[:50]
# 状态 → processing # 状态 → processing
paper.summary_status.status = SummaryState.PROCESSING paper.summary_status.status = SummaryState.PROCESSING
paper.summary_status.started_at = utc_now() paper.summary_status.started_at = utc_now()
db.commit() db.commit()
logger.info("▶ [%s] 开始总结: %s", arxiv_id, title_short)
# 清理旧的图片文件和 figures_json,避免重新总结时残留 # 清理旧的图片文件和 figures_json,避免重新总结时残留
import time as _time
_t_cleanup_start = _time.monotonic()
_cleanup_old_images(db, paper) _cleanup_old_images(db, paper)
_t_cleanup_end = _time.monotonic()
logger.info(" [%s] 清理旧数据: %.2fs", arxiv_id, _t_cleanup_end - _t_cleanup_start)
raw_output = "" raw_output = ""
try: try:
meta_path = write_meta_json(paper) _t0 = _time.monotonic()
await download_pdf(arxiv_id, paper.pdf_url)
meta_path = write_meta_json(paper)
_t1 = _time.monotonic()
logger.info(" [%s] meta.json: %.2fs", arxiv_id, _t1 - _t0)
await download_pdf(arxiv_id, paper.pdf_url)
_t2 = _time.monotonic()
logger.info(" [%s] 下载PDF: %.2fs", arxiv_id, _t2 - _t1)
logger.info(" [%s] 调用 pi 生成总结...", arxiv_id)
json_data, raw_output = await _generate_with_retry( json_data, raw_output = await _generate_with_retry(
arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf", arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
pdf_mode=pdf_mode, pdf_mode=pdf_mode,
) )
_t3 = _time.monotonic()
logger.info(" [%s] pi生成: %.2fs", arxiv_id, _t3 - _t2)
quality = _persist_summary(db, paper, json_data, raw_output) quality = _persist_summary(db, paper, json_data, raw_output)
_t4 = _time.monotonic()
logger.info(" [%s] 持久化: %.2fs", arxiv_id, _t4 - _t3)
logger.info("Summarize done: %s quality=%s", arxiv_id, quality) logger.info("✅ [%s] 完成: quality=%s 总耗时: %.2fs", arxiv_id, quality, _t4 - _t0)
return {"arxiv_id": arxiv_id, "status": "done", "quality": quality} return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
except Exception as exc: except Exception as exc:
@@ -588,42 +699,67 @@ async def summarize_batch(
"total": 0, "total": 0,
} }
# 并发控制 # 并发控制worker 模式,避免 573 个协程同时打开 DB 连接耗尽连接池
semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY) concurrency = settings.SUMMARY_CONCURRENCY
make_session = _session_factory or SessionLocal make_session = _session_factory or SessionLocal
async def _process_paper(paper: Paper) -> dict: # 进度追踪
paper_db = make_session() progress = {"done": 0, "failed": 0, "skipped": 0}
try: paper_queue: asyncio.Queue[Paper | None] = asyncio.Queue()
p = paper_db.execute( for p in papers:
select(Paper) paper_queue.put_nowait(p)
.where(Paper.id == paper.id)
.options(*PAPER_DEFAULT_LOAD)
).unique().scalar_one_or_none()
return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode)
finally:
paper_db.close()
results = await asyncio.gather( async def _worker() -> list[dict]:
*[_process_paper(p) for p in papers], results: list[dict] = []
while True:
paper = paper_queue.get_nowait() if not paper_queue.empty() else None
if paper is None:
break
paper_db = make_session()
try:
p = paper_db.execute(
select(Paper)
.where(Paper.id == paper.id)
.options(*PAPER_DEFAULT_LOAD)
).unique().scalar_one_or_none()
result = await summarize_one(paper_db, p, pdf_mode=pdf_mode)
status = result.get("status", "failed")
progress[status] = progress.get(status, 0) + 1
finished = sum(progress.values())
logger.info(
"📊 进度: %d/%d (✅%d%d ⏭️%d) — %s",
finished, total,
progress["done"], progress["failed"], progress["skipped"],
paper.arxiv_id,
)
results.append(result)
except Exception as exc:
logger.error("Worker error: %s", exc)
results.append({"status": "failed", "error": str(exc)})
finally:
paper_db.close()
return results
worker_results = await asyncio.gather(
*[_worker() for _ in range(concurrency)],
return_exceptions=True, return_exceptions=True,
) )
results = []
for r in worker_results:
if isinstance(r, Exception):
logger.error("Unexpected error in batch: %s", r)
results.append(r)
elif isinstance(r, list):
results.extend(r)
# 统计结果 # 统计结果progress 已在 worker 中实时更新)
done = 0 done = progress["done"]
failed = 0 failed = progress["failed"]
skipped = 0 skipped = progress["skipped"]
for r in results: for r in results:
if isinstance(r, Exception): if isinstance(r, Exception):
logger.error("Unexpected error in batch: %s", r) logger.error("Unexpected error in batch: %s", r)
failed += 1 failed += 1
elif isinstance(r, dict):
if r.get("status") == "done":
done += 1
elif r.get("status") == "skipped":
skipped += 1
else:
failed += 1
log_entry.status = "success" if failed == 0 else "failed" log_entry.status = "success" if failed == 0 else "failed"
log_entry.papers_found = total log_entry.papers_found = total
+270
View File
@@ -0,0 +1,270 @@
"""总结工具函数 — PDF 文本提取、prompt 构建、JSON 提取、meta.json 写入。
与后端无关的通用逻辑,pi 和 claude 后端共享。
"""
from __future__ import annotations
import json
import logging
import re
from pathlib import Path
logger = logging.getLogger(__name__)
# ── 自定义异常 ──────────────────────────────────────────────────────────
class JsonNotFoundError(Exception):
    """Raised when no JSON object can be extracted from backend output."""
# ── meta.json ───────────────────────────────────────────────────────────
def write_meta_json(paper) -> Path:
    """Write ``meta.json`` into the paper's directory and return its path.

    The directory is resolved with ``paper_dir(paper.arxiv_id)`` and created
    if missing. The file is UTF-8 JSON with non-ASCII characters kept as-is
    and a 2-space indent.
    """
    # Imported lazily, as in the rest of this module's use of pdf_downloader.
    from app.services.pdf_downloader import paper_dir

    target_dir = paper_dir(paper.arxiv_id)
    target_dir.mkdir(parents=True, exist_ok=True)
    out_path = target_dir / "meta.json"

    author_names = [author.name for author in paper.authors]
    tag_names = [entry.tag for entry in paper.tags]

    if paper.published_at:
        published = paper.published_at.isoformat()
    else:
        published = None

    payload = {
        "arxiv_id": paper.arxiv_id,
        "title_en": paper.title_en,
        "abstract": paper.abstract or "",
        "published_at": published,
        "authors": author_names,
        "tags": tag_names,
        "upvotes": paper.upvotes,
    }
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    out_path.write_text(serialized, encoding="utf-8")
    return out_path
# ── PDF 文本提取 ────────────────────────────────────────────────────────
def _trim_body(text: str, max_chars: int | None = None) -> str:
"""去除参考文献,保留正文+附录,超长时从末尾截断。
策略:
1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用)
2. 正文 + 附录全部保留
3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文)
"""
# 找 References 段落的位置(在 Appendix 之后的那个)
ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
if ref_match:
ref_start = ref_match.start()
# 看 References 之后有没有 Appendix
after_ref = text[ref_start:]
app_match = re.search(
r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
)
if app_match:
# References 之后有 Appendix:只删 References 段
ref_end = ref_start + app_match.start()
text = text[:ref_start] + text[ref_end:]
else:
# References 之后没有 Appendix:删掉从 References 到结尾
text = text[:ref_start].rstrip()
# 去掉 Acknowledgments(对解读无用)
ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
if ack_match:
# 只删 Acknowledgments 本身,不删后面的内容
next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
if next_section:
text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
else:
text = text[:ack_match.start()].rstrip()
# 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
if max_chars is not None and len(text) > max_chars:
text = text[:max_chars].rstrip()
return text
def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
"""用 pymupdf 提取 PDF 正文文本,保存为 .txt。
max_chars=None 时不截断,给 search/auto 模式保留完整内容。
"""
import pymupdf
txt_path = pdf_path.with_suffix(".txt")
if txt_path.exists():
# 缓存优先;如果需重新提取(不同 max_chars),先删旧文件
return txt_path
doc = pymupdf.open(str(pdf_path))
# sort=True 启用阅读顺序检测,避免双栏论文中跨栏错位
raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)
doc.close()
body = _trim_body(raw_text, max_chars=max_chars)
txt_path.write_text(body, encoding="utf-8")
logger.info(
"Extracted PDF text: %s (%d -> %d chars, -%d%%)",
txt_path,
len(raw_text),
len(body),
(1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
)
return txt_path
# ── Prompt 构建 ─────────────────────────────────────────────────────────
def build_prompt(
arxiv_id: str,
meta_path: Path,
txt_path: Path,
pdf_mode: str,
fix_errors: list[str] | None = None,
) -> str:
"""根据模式构建 prompt。
inject: 全量注入,prompt 末尾包含论文全文内容
search: pi 自主 read 文件,prompt 只包含工作流指令
"""
json_schema = (
"## 必须包含以下字段(不要自创字段名):\n"
'{"arxiv_id": "...", '
'"title_zh": "中文标题", '
'"one_line": "一句话概括(≤50字)", '
'"tags": ["标签1","标签2"], '
'"difficulty": "入门/进阶/前沿", '
'"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
'"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
'"goal": "详细段落:本文的具体目标", '
'"gap": "详细段落:本文的独特切入角度"}, '
'"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
'"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
'"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
'"novelty": "详细段落:技术新颖性分析"}, '
'"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
'"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
'"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, '
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Tableid 严格使用 \"Figure N\"\"Table N\" 格式。"
"section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
"}"
)
writing_requirements = (
"## 写作要求\n"
"- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
"- 必须包含论文中的具体数据、数字、实验指标\n"
"- 像资深同事给同事讲论文一样,专业但易懂\n"
"- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
" 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n"
)
if fix_errors:
error_list = "\n".join(f"- {e}" for e in fix_errors)
return (
"你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
f"data/papers/{arxiv_id}/summary.json\n\n"
f"{error_list}\n\n"
"注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
"修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
)
if pdf_mode == "search":
return (
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
"## 工作流程\n"
f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n"
f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n"
f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
+ writing_requirements
+ "\n"
+ json_schema
)
else:
return (
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
"## 工作流程\n"
"论文元信息和正文全文已在上文提供,请仔细阅读。\n"
f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
"2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
+ writing_requirements
+ "\n"
+ json_schema
)
# ── JSON 提取 ──────────────────────────────────────────────────────────
def extract_json(raw_output: str) -> dict:
    """Recover the summary JSON object from raw text output.

    Strategies are tried in order and the first hit wins:

    1. Parse the whole (stripped) output; accept a dict containing ``title_zh``.
    2. Parse each fenced code block (three backticks, optional ``json`` tag);
       accept the first dict containing ``title_zh``.
    3. Parse each flat ``{...}`` span (no nested braces) that mentions
       ``"title_zh"``; return the first one that is valid JSON.
    4. Fallback: decode a JSON value at every ``{`` and return the largest
       object found, whatever its keys.

    The fallback uses ``json.JSONDecoder.raw_decode``, which honours JSON
    string syntax, so braces inside string values (LaTeX, prose) no longer
    cut the candidate short or cause a nested sub-object to be returned in
    place of the enclosing one.

    Args:
        raw_output: Text that is expected to contain a JSON object.

    Returns:
        The parsed JSON object.

    Raises:
        JsonNotFoundError: If no JSON object can be decoded from the text.
    """
    # Strategy 1: the whole output is the JSON document.
    stripped = raw_output.strip()
    try:
        result = json.loads(stripped)
        if isinstance(result, dict) and "title_zh" in result:
            return result
    except json.JSONDecodeError:
        pass

    # Strategy 2: fenced code blocks.
    fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL)
    for match in fence_pattern.finditer(raw_output):
        try:
            result = json.loads(match.group(1).strip())
            if isinstance(result, dict) and "title_zh" in result:
                return result
        except json.JSONDecodeError:
            continue

    # Strategy 3: flat {...} spans that mention "title_zh".
    brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL)
    for match in brace_pattern.finditer(raw_output):
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            continue

    # Strategy 4: largest decodable object anywhere in the text.
    decoder = json.JSONDecoder()
    best = None
    best_len = 0
    pos = raw_output.find("{")
    while pos != -1:
        try:
            parsed, end = decoder.raw_decode(raw_output, pos)
        except json.JSONDecodeError:
            # Not a valid object here; try the next opening brace.
            pos = raw_output.find("{", pos + 1)
            continue
        if isinstance(parsed, dict) and end - pos > best_len:
            best = parsed
            best_len = end - pos
        # Braces inside the decoded span belong to it (nested or in strings)
        # and could only yield smaller candidates, so resume after it.
        pos = raw_output.find("{", end)

    if best is not None:
        return best

    raise JsonNotFoundError("no JSON object found in output")
+2 -1
View File
@@ -7,7 +7,7 @@ dependencies = [
"fastapi>=0.115", "fastapi>=0.115",
"uvicorn[standard]>=0.34", "uvicorn[standard]>=0.34",
"sqlalchemy>=2.0", "sqlalchemy>=2.0",
"httpx>=0.28", "httpx[http2]>=0.28",
"jinja2>=3.1", "jinja2>=3.1",
"python-multipart>=0.0.18", "python-multipart>=0.0.18",
"pydantic>=2.0", "pydantic>=2.0",
@@ -19,6 +19,7 @@ dependencies = [
"pymupdf>=1.25", "pymupdf>=1.25",
"itsdangerous>=2.2.0", "itsdangerous>=2.2.0",
"bleach>=6.4.0", "bleach>=6.4.0",
"pymupdf4llm>=1.27.2.3",
] ]
[project.optional-dependencies] [project.optional-dependencies]
+136 -109
View File
@@ -1,117 +1,144 @@
"""验证 summary JSON 是否符合 SummarySchema 要求。
用法:python scripts/validate_summary.py <json_file>
返回:exit 0 = 通过,exit 1 = 失败(错误信息输出到 stdout)
"""
import json import json
import sys import sys
from pathlib import Path
# JSON-Schema-style description of a paper summary document: the required
# top-level keys and, for each property, its expected type, its required
# sub-keys and (for "difficulty" and figures[].section) the allowed values.
schema = {
"type": "object",
"required": ["arxiv_id", "title_zh", "one_line", "tags", "difficulty",
"prerequisites", "motivation", "method", "results", "improvements", "figures"],
"properties": {
"arxiv_id": {"type": "string"},
"title_zh": {"type": "string"},
"one_line": {"type": "string"},
"tags": {"type": "array", "items": {"type": "string"}},
"difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]},
"prerequisites": {
"type": "object",
"required": ["concepts"],
"properties": {
"concepts": {"type": "array", "items": {
"type": "object",
"required": ["term", "explanation", "why_matters"],
"properties": {
"term": {"type": "string"},
"explanation": {"type": "string"},
"why_matters": {"type": "string"}
}
}}
}
},
"motivation": {
"type": "object",
"required": ["problem", "goal", "gap"],
"properties": {
"problem": {"type": "string"},
"goal": {"type": "string"},
"gap": {"type": "string"}
}
},
"method": {
"type": "object",
"required": ["overview", "key_idea", "steps", "novelty"],
"properties": {
"overview": {"type": "string"},
"key_idea": {"type": "string"},
"steps": {"type": "string"},
"novelty": {"type": "string"}
}
},
"results": {
"type": "object",
"required": ["main_findings", "benchmarks", "limitations"],
"properties": {
"main_findings": {"type": "string"},
"benchmarks": {"type": "array", "items": {
"type": "object",
"required": ["task", "metric", "this_work", "baseline", "improvement"],
"properties": {
"task": {"type": "string"},
"metric": {"type": "string"},
"this_work": {"type": "string"},
"baseline": {"type": "string"},
"improvement": {"type": "string"}
}
}},
"limitations": {"type": "string"}
}
},
"improvements": {
"type": "object",
"required": ["weaknesses", "future_work", "reproducibility"],
"properties": {
"weaknesses": {"type": "string"},
"future_work": {"type": "string"},
"reproducibility": {"type": "string"}
}
},
"figures": {
"type": "array",
"items": {
"type": "object",
"required": ["id", "caption", "description", "reason", "section"],
"properties": {
"id": {"type": "string"},
"caption": {"type": "string"},
"description": {"type": "string"},
"reason": {"type": "string"},
"section": {"type": "string", "enum": ["motivation", "method", "results", "limitations"]}
}
}
}
}
}
def validate(path: str) -> list[str]: def validate_file(filepath):
errors: list[str] = []
try: try:
data = json.loads(Path(path).read_text(encoding="utf-8")) with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
# Check required fields
for field in schema["required"]:
if field not in data:
print(f"❌ Missing field: {field}")
return False
# Validate nested structure
for field, spec in schema["properties"].items():
if field in data:
if spec["type"] == "string":
if not isinstance(data[field], str):
print(f"❌ Field '{field}' should be string")
return False
elif spec["type"] == "array":
if not isinstance(data[field], list):
print(f"❌ Field '{field}' should be array")
return False
elif spec["type"] == "object":
if not isinstance(data[field], dict):
print(f"❌ Field '{field}' should be object")
return False
if "required" in spec:
for subfield in spec["required"]:
if subfield not in data[field]:
print(f"❌ Missing subfield: {field}.{subfield}")
return False
# Validate section enum in figures
valid_sections = ["motivation", "method", "results", "limitations"]
for fig in data.get("figures", []):
if fig["section"] not in valid_sections:
print(f"❌ Invalid section in figure: {fig['section']}")
return False
print("✅ JSON validation passed!")
return True
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
return [f"JSON 解析失败: {e}"] print(f"JSON decode error: {e}")
return False
if not isinstance(data, dict): except Exception as e:
return ["顶层必须是 JSON 对象 (dict)"] print(f"❌ Validation error: {e}")
return False
# 必填字段
required_top = ["arxiv_id", "title_zh", "one_line", "tags"]
for f in required_top:
if f not in data or not data[f]:
errors.append(f"缺少必填字段: {f}")
# tags 必须是非空数组
tags = data.get("tags")
if isinstance(tags, list) and len(tags) == 0:
errors.append("tags 不能为空数组")
if not isinstance(tags, list):
errors.append("tags 必须是数组")
# motivation 子字段
motivation = data.get("motivation", {})
if not isinstance(motivation, dict):
errors.append("motivation 必须是对象")
else:
for f in ["problem", "goal", "gap"]:
val = motivation.get(f, "")
if not isinstance(val, str) or len(val.strip()) < 50:
errors.append(f"motivation.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)")
# method 子字段
method = data.get("method", {})
if not isinstance(method, dict):
errors.append("method 必须是对象")
else:
for f in ["overview", "key_idea", "steps", "novelty"]:
val = method.get(f, "")
if not isinstance(val, str) or len(val.strip()) < 50:
errors.append(f"method.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)")
# results 子字段
results = data.get("results", {})
if not isinstance(results, dict):
errors.append("results 必须是对象")
else:
for f in ["main_findings", "limitations"]:
val = results.get(f, "")
if not isinstance(val, str) or len(val.strip()) < 50:
errors.append(f"results.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)")
# benchmarks 可以是数组
benchmarks = results.get("benchmarks")
if benchmarks is not None and not isinstance(benchmarks, list):
errors.append("results.benchmarks 必须是数组")
# improvements 子字段
improvements = data.get("improvements", {})
if not isinstance(improvements, dict):
errors.append("improvements 必须是对象")
else:
for f in ["weaknesses", "future_work", "reproducibility"]:
val = improvements.get(f, "")
if not isinstance(val, str) or len(val.strip()) < 50:
errors.append(f"improvements.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)")
# 检查是否有字段误用数组(应该用字符串的)
string_fields = [
("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"),
("method", "overview"), ("method", "key_idea"), ("method", "steps"), ("method", "novelty"),
("results", "main_findings"), ("results", "limitations"),
("improvements", "weaknesses"), ("improvements", "future_work"), ("improvements", "reproducibility"),
]
for section, field in string_fields:
val = data.get(section, {}).get(field)
if isinstance(val, list):
errors.append(f"{section}.{field} 应该是字符串段落,不能是数组")
# figures 验证
figures = data.get("figures")
if figures is not None:
if not isinstance(figures, list):
errors.append("figures 必须是数组")
else:
for i, fig in enumerate(figures):
if isinstance(fig, dict) and not fig.get("id"):
errors.append(f"figures[{i}] 缺少 id 字段")
return errors
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) != 2: filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
print("用法: python scripts/validate_summary.py <json_file>") validate_file(filepath)
sys.exit(1)
errs = validate(sys.argv[1])
if errs:
print("❌ 验证失败:")
for e in errs:
print(f" - {e}")
sys.exit(1)
else:
print("✅ 验证通过")
sys.exit(0)
Generated
+91 -2
View File
@@ -684,6 +684,19 @@ wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
] ]
[[package]]
name = "h2"
version = "4.3.0"
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
dependencies = [
{ name = "hpack" },
{ name = "hyperframe" },
]
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" }
wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" },
]
[[package]] [[package]]
name = "hf-daily-papers" name = "hf-daily-papers"
version = "0.1.0" version = "0.1.0"
@@ -693,12 +706,13 @@ dependencies = [
{ name = "bleach" }, { name = "bleach" },
{ name = "chromadb" }, { name = "chromadb" },
{ name = "fastapi" }, { name = "fastapi" },
{ name = "httpx" }, { name = "httpx", extra = ["http2"] },
{ name = "itsdangerous" }, { name = "itsdangerous" },
{ name = "jinja2" }, { name = "jinja2" },
{ name = "pydantic" }, { name = "pydantic" },
{ name = "pydantic-settings" }, { name = "pydantic-settings" },
{ name = "pymupdf" }, { name = "pymupdf" },
{ name = "pymupdf4llm" },
{ name = "python-dotenv" }, { name = "python-dotenv" },
{ name = "python-multipart" }, { name = "python-multipart" },
{ name = "sqlalchemy" }, { name = "sqlalchemy" },
@@ -724,12 +738,13 @@ requires-dist = [
{ name = "bleach", specifier = ">=6.4.0" }, { name = "bleach", specifier = ">=6.4.0" },
{ name = "chromadb", specifier = ">=1.0" }, { name = "chromadb", specifier = ">=1.0" },
{ name = "fastapi", specifier = ">=0.115" }, { name = "fastapi", specifier = ">=0.115" },
{ name = "httpx", specifier = ">=0.28" }, { name = "httpx", extras = ["http2"], specifier = ">=0.28" },
{ name = "itsdangerous", specifier = ">=2.2.0" }, { name = "itsdangerous", specifier = ">=2.2.0" },
{ name = "jinja2", specifier = ">=3.1" }, { name = "jinja2", specifier = ">=3.1" },
{ name = "pydantic", specifier = ">=2.0" }, { name = "pydantic", specifier = ">=2.0" },
{ name = "pydantic-settings", specifier = ">=2.0" }, { name = "pydantic-settings", specifier = ">=2.0" },
{ name = "pymupdf", specifier = ">=1.25" }, { name = "pymupdf", specifier = ">=1.25" },
{ name = "pymupdf4llm", specifier = ">=1.27.2.3" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" },
{ name = "python-dotenv", specifier = ">=1.0" }, { name = "python-dotenv", specifier = ">=1.0" },
@@ -778,6 +793,15 @@ wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916, upload-time = "2026-05-06T06:18:21.7Z" }, { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916, upload-time = "2026-05-06T06:18:21.7Z" },
] ]
[[package]]
name = "hpack"
version = "4.1.0"
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" }
wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" },
]
[[package]] [[package]]
name = "httpcore" name = "httpcore"
version = "1.0.9" version = "1.0.9"
@@ -842,6 +866,11 @@ wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
] ]
[package.optional-dependencies]
http2 = [
{ name = "h2" },
]
[[package]] [[package]]
name = "huggingface-hub" name = "huggingface-hub"
version = "1.16.1" version = "1.16.1"
@@ -862,6 +891,15 @@ wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/49/79/621a7dbb80c70974f73a597275351ebe03ce5bc65cb5f8f4acb5859252bc/huggingface_hub-1.16.1-py3-none-any.whl", hash = "sha256:64340de934b9ce37857ef85a82de72f5629e8a270f9119eabb12bf495eb53c22", size = 668176, upload-time = "2026-05-21T18:39:58.596Z" }, { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/49/79/621a7dbb80c70974f73a597275351ebe03ce5bc65cb5f8f4acb5859252bc/huggingface_hub-1.16.1-py3-none-any.whl", hash = "sha256:64340de934b9ce37857ef85a82de72f5629e8a270f9119eabb12bf495eb53c22", size = 668176, upload-time = "2026-05-21T18:39:58.596Z" },
] ]
[[package]]
name = "hyperframe"
version = "6.1.0"
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" }
wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" },
]
[[package]] [[package]]
name = "idna" name = "idna"
version = "3.18" version = "3.18"
@@ -1223,6 +1261,15 @@ wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
] ]
[[package]]
name = "networkx"
version = "3.6.1"
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" }
wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
]
[[package]] [[package]]
name = "numpy" name = "numpy"
version = "2.4.6" version = "2.4.6"
@@ -1842,6 +1889,39 @@ wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/53/a4/b9e91aac82293f9c954654c85581ee8212b5b05efadc534b581141241e6f/pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2", size = 25000393, upload-time = "2026-04-24T14:11:01.669Z" }, { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/53/a4/b9e91aac82293f9c954654c85581ee8212b5b05efadc534b581141241e6f/pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2", size = 25000393, upload-time = "2026-04-24T14:11:01.669Z" },
] ]
[[package]]
name = "pymupdf-layout"
version = "1.27.2.3"
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
dependencies = [
{ name = "networkx" },
{ name = "numpy" },
{ name = "onnxruntime" },
{ name = "pymupdf" },
{ name = "pyyaml" },
]
wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/bc/ee/067726c3ee5574ad5c605d00d7419e264ef509d626a726f99388111f8216/pymupdf_layout-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:75c2ab3c0e8830ac2bc50cfd32d375a30768a2610dac72a02f08265336e0834f", size = 15799844, upload-time = "2026-04-24T14:11:13.177Z" },
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/0a/ba/46a7a36474722f9280d885f6eec878561a257d9378e52590b43d32ffb96c/pymupdf_layout-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:5656b09669dcd7c51f539afb6fdaf853602bab4cbc20479ee5ee1a85a4e32b60", size = 15795220, upload-time = "2026-04-24T14:11:23.17Z" },
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/84/87/bfdcca67346052943a4549814f2009b38f4d15ec025798cdf7dfa5f57c84/pymupdf_layout-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:fcf03aa815cbceebdb3263dd6a190de4547c46b1d168928836ec38738afe127d", size = 15805240, upload-time = "2026-04-24T14:11:33.465Z" },
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/32/e9/7ce6eaf97cebd46c3808593282e9eb99a60cddd6183e25a636980d5c7986/pymupdf_layout-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:303b9414216dfaf711ec7d807b6f1e4c3e0a92bbb4569340fcedd9d5593d16ca", size = 15806269, upload-time = "2026-04-24T14:11:43.481Z" },
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/bf/61/3b2417d8f2cdfaa0f4749cd9dafa3379cb5cdaddf4233165f1ff81953c30/pymupdf_layout-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:503b64d9b6b31ea3af79ef85cf7d36950c5048af468cb297684d2953553c62ad", size = 15809163, upload-time = "2026-04-24T14:11:53.956Z" },
]
[[package]]
name = "pymupdf4llm"
version = "1.27.2.3"
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
dependencies = [
{ name = "pymupdf" },
{ name = "pymupdf-layout" },
{ name = "tabulate" },
]
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/87/c0/e3830452d82032c3d82a9879616c05bf0c51e0dea03c1d80d57b3a6ec0d1/pymupdf4llm-1.27.2.3.tar.gz", hash = "sha256:42ec1a47ddc62be3f4f40c116d27618611c6f9fa366719016d9ddc3f3a3dc22b", size = 1406297, upload-time = "2026-04-24T14:13:18.843Z" }
wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/e6/38/84bf29f4dd72e6c450546df6ca8f53021f764fd945ba67dcc235d39bc20e/pymupdf4llm-1.27.2.3-py3-none-any.whl", hash = "sha256:bd724b79fa3f06a5b28d7a65f7acfa8de56e04bdb603ac2d6dff315e0d151aaa", size = 77348, upload-time = "2026-04-24T14:11:04.305Z" },
]
[[package]] [[package]]
name = "pypika" name = "pypika"
version = "0.51.1" version = "0.51.1"
@@ -2202,6 +2282,15 @@ wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1c/54/196d0c1db10af76baa4f64894448505d60d3cdf70ef92cbb35f46a4e4c71/starlette-1.2.1-py3-none-any.whl", hash = "sha256:4de0082d08c8f6764a85a54cf1120d6939507a19905c7768acad2a9f875d2b89", size = 73350, upload-time = "2026-05-31T01:07:50.09Z" }, { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1c/54/196d0c1db10af76baa4f64894448505d60d3cdf70ef92cbb35f46a4e4c71/starlette-1.2.1-py3-none-any.whl", hash = "sha256:4de0082d08c8f6764a85a54cf1120d6939507a19905c7768acad2a9f875d2b89", size = 73350, upload-time = "2026-05-31T01:07:50.09Z" },
] ]
[[package]]
name = "tabulate"
version = "0.10.0"
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/46/58/8c37dea7bbf769b20d58e7ace7e5edfe65b849442b00ffcdd56be88697c6/tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d", size = 91754, upload-time = "2026-03-04T18:55:34.402Z" }
wheels = [
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/99/55/db07de81b5c630da5cbf5c7df646580ca26dfaefa593667fc6f2fe016d2e/tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3", size = 39814, upload-time = "2026-03-04T18:55:31.284Z" },
]
[[package]] [[package]]
name = "tenacity" name = "tenacity"
version = "9.1.4" version = "9.1.4"