feat: add claude backend, refactor summary utilities, improve batch worker pattern, add pymupdf4llm
This commit is contained in:
@@ -19,8 +19,11 @@ HTTP_MAX_RETRIES=3
|
||||
HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
|
||||
|
||||
# ─── AI 总结 ──────────────────────────────
|
||||
# 总结后端:pi | claude
|
||||
SUMMARY_BACKEND=pi
|
||||
PI_BIN=
|
||||
SUMMARY_SKILL=daily-paper-summary
|
||||
CLAUDE_BIN=claude
|
||||
SUMMARY_CONCURRENCY=3
|
||||
SUMMARY_TIMEOUT_SECONDS=1200
|
||||
SUMMARY_MAX_RETRIES=2
|
||||
|
||||
+24
-2
@@ -1,6 +1,7 @@
|
||||
"""CLI 工具 — 手动抓取论文。"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
import typer
|
||||
from dotenv import load_dotenv
|
||||
@@ -49,8 +50,11 @@ def crawl(
|
||||
typer.echo(f"📡 开始抓取 {target} ...")
|
||||
result = asyncio.run(crawl_daily(db, target, top_n))
|
||||
|
||||
# 未指定日期且今天无数据时,自动回退到昨天
|
||||
if not date_str and result["status"] == "success" and result["found"] == 0:
|
||||
# 未指定日期且今天失败或无数据时,自动回退到昨天
|
||||
need_fallback = not date_str and (
|
||||
result["status"] == "failed" or result["found"] == 0
|
||||
)
|
||||
if need_fallback:
|
||||
fallback = yesterday_str()
|
||||
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
|
||||
if existing > 0:
|
||||
@@ -84,6 +88,11 @@ def summarize(
|
||||
"--pdf-mode",
|
||||
help="PDF 传递方式:auto(自动选择)| inject(全量注入)| search(pi 自主搜索)",
|
||||
),
|
||||
backend: str = typer.Option(
|
||||
None,
|
||||
"--backend",
|
||||
help="总结后端:pi | claude(留空则使用 .env 配置)",
|
||||
),
|
||||
):
|
||||
"""手动触发 AI 总结。"""
|
||||
from app.config import settings
|
||||
@@ -97,9 +106,22 @@ def summarize(
|
||||
typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True)
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
if backend:
|
||||
if backend not in ("pi", "claude"):
|
||||
typer.echo(f"❌ 无效的 backend: {backend},只支持 pi / claude", err=True)
|
||||
raise typer.Exit(code=1)
|
||||
settings.SUMMARY_BACKEND = backend
|
||||
|
||||
os.makedirs(settings.db_path.parent, exist_ok=True)
|
||||
_init(engine)
|
||||
|
||||
# 配置 logging 输出到终端
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)-5s %(name)s | %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
if arxiv_id:
|
||||
|
||||
@@ -29,8 +29,10 @@ class Settings(BaseSettings):
|
||||
HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"
|
||||
|
||||
# AI 总结
|
||||
SUMMARY_BACKEND: str = "pi" # "pi" | "claude"
|
||||
PI_BIN: str = ""
|
||||
SUMMARY_SKILL: str = "daily-paper-summary"
|
||||
CLAUDE_BIN: str = "claude"
|
||||
SUMMARY_CONCURRENCY: int = 3
|
||||
SUMMARY_TIMEOUT_SECONDS: int = 1200
|
||||
SUMMARY_MAX_RETRIES: int = 2
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
"""Claude CLI 后端 — 调用 claude CLI 子进程生成总结。
|
||||
|
||||
和 pi_client.py 对称的接口,复用 prompt 构建、PDF 文本提取、JSON 提取逻辑。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ClaudeTimeoutError(Exception):
    """Raised when the claude CLI subprocess does not finish within the timeout."""
|
||||
|
||||
|
||||
class ClaudeProcessError(Exception):
    """Raised when the claude CLI exits with a non-zero return code.

    The exit status and the complete stderr text are kept on the instance as
    ``returncode`` and ``stderr``; only the first 500 characters of stderr are
    included in the exception message.
    """

    def __init__(self, returncode: int, stderr: str):
        message = f"claude exited with code {returncode}: {stderr[:500]}"
        super().__init__(message)
        self.returncode = returncode
        self.stderr = stderr
|
||||
|
||||
|
||||
async def call_claude(
    prompt: str,
    session_id: str | None = None,
    fix_errors: list[str] | None = None,
) -> tuple[str, str]:
    """Run the claude CLI in print mode and return ``(stdout_text, session_id)``.

    The interface mirrors ``call_pi()``, but no file paths or ``pdf_mode`` are
    needed: everything the model should see is already contained in *prompt*.

    Args:
        prompt: Complete prompt text, passed to the CLI as its final argument.
        session_id: Session to resume on a retry. When ``None`` a fresh UUID
            is generated and a new session is started under that id.
        fix_errors: Validation errors from the previous round. A non-empty
            list together with a caller-supplied *session_id* marks this call
            as a retry that resumes that session.

    Returns:
        The decoded stdout of the CLI and the session id that was used.

    Raises:
        ClaudeTimeoutError: The CLI did not finish within
            ``settings.SUMMARY_TIMEOUT_SECONDS``; the child is killed first.
        ClaudeProcessError: The CLI exited with a non-zero return code.
    """
    # Only resume when the caller handed back an id from an earlier call; a
    # freshly generated id has no conversation behind it.
    resuming = bool(fix_errors) and session_id is not None
    if session_id is None:
        # The CLI requires --session-id to be a valid UUID.
        session_id = str(uuid.uuid4())

    cmd = [settings.CLAUDE_BIN, "-p", "--output-format", "text"]
    if resuming:
        # --resume targets exactly this session. --continue would pick the
        # most recent conversation in the working directory, which may belong
        # to another paper when several summaries run concurrently.
        cmd += ["--resume", session_id]
    else:
        cmd += ["--session-id", session_id]
    cmd.append(prompt)

    logger.info(
        "Calling claude (session=%s, fix=%s)",
        session_id,
        bool(fix_errors),
    )

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(),
            timeout=settings.SUMMARY_TIMEOUT_SECONDS,
        )
    except asyncio.TimeoutError:
        proc.kill()
        await proc.wait()
        raise ClaudeTimeoutError(
            f"claude timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s"
        )
    except asyncio.CancelledError:
        # Do not leave an orphaned CLI process behind when the task is cancelled.
        if proc.returncode is None:
            proc.kill()
            await proc.wait()
        raise

    if proc.returncode != 0:
        raise ClaudeProcessError(
            proc.returncode, stderr.decode("utf-8", errors="replace")
        )

    return stdout.decode("utf-8", errors="replace"), session_id
|
||||
@@ -83,7 +83,7 @@ def _parse_paper(item: dict) -> dict:
|
||||
"upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0),
|
||||
"hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
|
||||
"arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
|
||||
"pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
|
||||
"pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else "",
|
||||
"authors": [
|
||||
a.get("name", a) if isinstance(a, dict) else a
|
||||
for a in paper_info.get("authors", [])
|
||||
|
||||
@@ -3,10 +3,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from app.utils import PAPERS_DIR, TMP_DIR, make_http_client
|
||||
import requests
|
||||
|
||||
from app.utils import PAPERS_DIR, TMP_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -31,6 +34,22 @@ def tmp_dir(arxiv_id: str) -> Path:
|
||||
|
||||
# ── PDF 下载 ────────────────────────────────────────────────────────────
|
||||
|
||||
# 复用 TCP 连接的 session
|
||||
_http_session: requests.Session | None = None
|
||||
|
||||
|
||||
def _get_session() -> requests.Session:
    """Return the module-wide ``requests.Session``, creating it on first use.

    The session is stored in the module global ``_http_session`` so later
    calls get the same object. On creation it gets a fixed User-Agent header
    and, when the ``PROXY_SERVER`` environment variable is set to a non-empty
    value, that URL as proxy for both http and https. No other proxy source
    is consulted here.
    """
    global _http_session
    if _http_session is not None:
        return _http_session

    _http_session = session = requests.Session()
    session.headers.update({"User-Agent": "hf-daily-papers/1.0"})

    proxy_url = os.environ.get("PROXY_SERVER")
    if proxy_url:
        session.proxies = {"http": proxy_url, "https": proxy_url}
        logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy_url)
    return session
|
||||
|
||||
|
||||
async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
|
||||
"""下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
|
||||
@@ -42,10 +61,10 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
|
||||
dest = dest_dir / "paper.pdf"
|
||||
|
||||
try:
|
||||
async with make_http_client(follow_redirects=True) as client:
|
||||
resp = await client.get(pdf_url)
|
||||
resp.raise_for_status()
|
||||
dest.write_bytes(resp.content)
|
||||
session = _get_session()
|
||||
resp = session.get(pdf_url, timeout=120, allow_redirects=True)
|
||||
resp.raise_for_status()
|
||||
dest.write_bytes(resp.content)
|
||||
except Exception as exc:
|
||||
raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc
|
||||
|
||||
|
||||
+470
-563
File diff suppressed because it is too large
Load Diff
+43
-263
@@ -1,17 +1,38 @@
|
||||
"""pi CLI 调用与 JSON 提取 — 调用 pi 生成总结,从输出中提取结构化 JSON。"""
|
||||
"""pi CLI 后端 — 调用 pi 子进程生成总结。
|
||||
|
||||
通用工具函数(prompt 构建、PDF 提取、JSON 提取、meta.json)已移至 summary_utils.py。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from app.config import settings
|
||||
from app.services.summary_utils import (
|
||||
JsonNotFoundError,
|
||||
build_prompt,
|
||||
extract_json,
|
||||
extract_pdf_text,
|
||||
write_meta_json,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 重新导出,保持向后兼容
|
||||
__all__ = [
|
||||
"PiTimeoutError",
|
||||
"PiProcessError",
|
||||
"JsonNotFoundError",
|
||||
"call_pi",
|
||||
"write_meta_json",
|
||||
"extract_pdf_text",
|
||||
"build_prompt",
|
||||
"extract_json",
|
||||
]
|
||||
|
||||
|
||||
# ── 自定义异常 ──────────────────────────────────────────────────────────
|
||||
|
||||
@@ -27,201 +48,6 @@ class PiProcessError(Exception):
|
||||
super().__init__(f"pi exited with code {returncode}: {stderr[:500]}")
|
||||
|
||||
|
||||
class JsonNotFoundError(Exception):
    """Raised when no JSON object can be extracted from the model output."""
|
||||
|
||||
|
||||
# ── meta.json ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def write_meta_json(paper) -> Path:
    """Write ``meta.json`` into the paper's directory and return its path.

    The directory comes from ``paper_dir(paper.arxiv_id)`` and is created if
    missing. The file holds the paper's arxiv id, English title, abstract
    (empty string when unset), ISO-formatted publication time (``None`` when
    unset), author names, tag names and upvote count, serialized as indented
    UTF-8 JSON without ASCII escaping.
    """
    from app.services.pdf_downloader import paper_dir

    target_dir = paper_dir(paper.arxiv_id)
    target_dir.mkdir(parents=True, exist_ok=True)

    author_names = [author.name for author in paper.authors]
    tag_names = [entry.tag for entry in paper.tags]
    published = paper.published_at.isoformat() if paper.published_at else None

    payload = {
        "arxiv_id": paper.arxiv_id,
        "title_en": paper.title_en,
        "abstract": paper.abstract or "",
        "published_at": published,
        "authors": author_names,
        "tags": tag_names,
        "upvotes": paper.upvotes,
    }

    meta_path = target_dir / "meta.json"
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    meta_path.write_text(serialized, encoding="utf-8")
    return meta_path
|
||||
|
||||
|
||||
# ── PDF 文本提取 ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _trim_body(text: str, max_chars: int | None = None) -> str:
|
||||
"""去除参考文献,保留正文+附录,超长时从末尾截断。
|
||||
|
||||
策略:
|
||||
1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用)
|
||||
2. 正文 + 附录全部保留
|
||||
3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文)
|
||||
"""
|
||||
import re
|
||||
|
||||
# 找 References 段落的位置(在 Appendix 之后的那个)
|
||||
# 简单策略:找到 References 标题,如果后面没有 Appendix 就全删
|
||||
# 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容
|
||||
ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
|
||||
if ref_match:
|
||||
ref_start = ref_match.start()
|
||||
# 看 References 之后有没有 Appendix
|
||||
after_ref = text[ref_start:]
|
||||
app_match = re.search(
|
||||
r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
|
||||
)
|
||||
if app_match:
|
||||
# References 之后有 Appendix:只删 References 段
|
||||
ref_end = ref_start + app_match.start()
|
||||
text = text[:ref_start] + text[ref_end:]
|
||||
else:
|
||||
# References 之后没有 Appendix:删掉从 References 到结尾
|
||||
text = text[:ref_start].rstrip()
|
||||
|
||||
# 去掉 Acknowledgments(对解读无用)
|
||||
ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
|
||||
if ack_match:
|
||||
# 只删 Acknowledgments 本身,不删后面的内容
|
||||
next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
|
||||
if next_section:
|
||||
text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
|
||||
else:
|
||||
text = text[:ack_match.start()].rstrip()
|
||||
|
||||
# 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
|
||||
if max_chars is not None and len(text) > max_chars:
|
||||
text = text[:max_chars].rstrip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
    """Extract the text of a PDF with pymupdf and save it next to the PDF as ``.txt``.

    Args:
        pdf_path: Path of the PDF file; the output path is the same path with
            its suffix replaced by ``.txt``.
        max_chars: Optional length cap forwarded to ``_trim_body``. ``None``
            means no truncation.

    Returns:
        Path of the text file.

    Note:
        An existing ``.txt`` file is returned as-is without re-extraction, so
        a *max_chars* different from the one used when the file was first
        written has no effect; delete the file first to re-extract.
    """
    import pymupdf

    txt_path = pdf_path.with_suffix(".txt")
    if txt_path.exists():
        # Cache hit: reuse whatever was extracted earlier.
        return txt_path

    # The context manager closes the document even when a page fails to extract.
    with pymupdf.open(str(pdf_path)) as doc:
        raw_text = "\n\n".join(page.get_text() for page in doc)

    body = _trim_body(raw_text, max_chars=max_chars)
    txt_path.write_text(body, encoding="utf-8")
    logger.info(
        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
        txt_path,
        len(raw_text),
        len(body),
        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
    )
    return txt_path
|
||||
|
||||
|
||||
# ── Prompt 构建 ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _build_prompt(
    arxiv_id: str,
    meta_path: Path,
    txt_path: Path,
    pdf_mode: str,
    fix_errors: list[str] | None = None,
) -> str:
    """Build the prompt text sent to pi.

    Three shapes are produced:
    - *fix_errors* non-empty: a short repair prompt listing the errors and
      asking for a corrected ``data/papers/{arxiv_id}/summary.json``;
      *pdf_mode* is ignored in this case.
    - ``pdf_mode == "search"``: a workflow telling the model to read
      *meta_path* and *txt_path* itself before writing the summary file.
    - any other mode: a workflow stating that metadata and full text were
      already provided earlier. This function does not embed the paper text
      itself — presumably the caller supplies it; verify against caller.

    The last two shapes end with the writing requirements and the JSON field
    schema.
    """
    # Example object listing every required field; sent to the model verbatim.
    json_schema = (
        "## 必须包含以下字段(不要自创字段名):\n"
        '{"arxiv_id": "...", '
        '"title_zh": "中文标题", '
        '"one_line": "一句话概括(≤50字)", '
        '"tags": ["标签1","标签2"], '
        '"difficulty": "入门/进阶/前沿", '
        '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
        '"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
        '"goal": "详细段落:本文的具体目标", '
        '"gap": "详细段落:本文的独特切入角度"}, '
        '"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
        '"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
        '"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
        '"novelty": "详细段落:技术新颖性分析"}, '
        '"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
        '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
        # NOTE(review): the closing quote sits before the full-width ")" here,
        # so this part of the example is not well-formed JSON — confirm intent.
        '"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, '
        '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
        '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
        # NOTE(review): same quote/parenthesis ordering as in "limitations" above.
        '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
        '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
        '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
        "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
        "section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
        "}"
    )

    # Style rules shared by the search and non-search prompts.
    writing_requirements = (
        "## 写作要求\n"
        "- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
        "- 必须包含论文中的具体数据、数字、实验指标\n"
        "- 像资深同事给同事讲论文一样,专业但易懂\n"
        "- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
        " 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n"
    )

    # Retry round: only the error list and repair instructions are sent.
    if fix_errors:
        error_list = "\n".join(f"- {e}" for e in fix_errors)
        return (
            "你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
            f"data/papers/{arxiv_id}/summary.json:\n\n"
            f"{error_list}\n\n"
            "注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
        )

    if pdf_mode == "search":
        # Search mode: the model reads the metadata and text files on its own.
        return (
            "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
            "## 工作流程\n"
            f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n"
            f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n"
            f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
            + writing_requirements
            + "\n"
            + json_schema
        )
    else:
        # Any other mode: the prompt refers to content "provided above".
        return (
            "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
            "## 工作流程\n"
            "论文元信息和正文全文已在上文提供,请仔细阅读。\n"
            f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
            "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
            + writing_requirements
            + "\n"
            + json_schema
        )
|
||||
|
||||
|
||||
# ── pi CLI 调用 ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -264,12 +90,10 @@ async def call_pi(
|
||||
txt_path.write_text(trimmed, encoding="utf-8")
|
||||
logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed))
|
||||
|
||||
prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
|
||||
prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
|
||||
|
||||
# 构建 session ID(每篇论文一个独立 session)
|
||||
if session_id is None:
|
||||
import uuid
|
||||
|
||||
session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
|
||||
|
||||
# 工具列表:search 模式需要 read 工具
|
||||
@@ -297,6 +121,9 @@ async def call_pi(
|
||||
arxiv_id, bool(fix_errors), session_id, actual_mode,
|
||||
)
|
||||
|
||||
import time as _time
|
||||
_t_sub_start = _time.monotonic()
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
@@ -312,69 +139,22 @@ async def call_pi(
|
||||
await proc.wait()
|
||||
raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s")
|
||||
|
||||
_t_sub_end = _time.monotonic()
|
||||
|
||||
# 检查 summary.json 是否由 pi 子进程写入
|
||||
_summary_file = pdf_path.parent / "summary.json"
|
||||
_file_info = ""
|
||||
if _summary_file.exists():
|
||||
_file_mtime = _summary_file.stat().st_mtime
|
||||
_file_size = _summary_file.stat().st_size
|
||||
_file_info = f" summary.json={_file_size}B"
|
||||
|
||||
logger.info(
|
||||
"pi subprocess for %s: %.2fs%s",
|
||||
arxiv_id, _t_sub_end - _t_sub_start, _file_info,
|
||||
)
|
||||
|
||||
if proc.returncode != 0:
|
||||
raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
|
||||
|
||||
return stdout.decode("utf-8", errors="replace"), session_id
|
||||
|
||||
|
||||
# ── JSON 提取 ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_json(raw_output: str) -> dict:
    """Extract the summary JSON object from raw pi output.

    Strategies are tried in order; the first one that succeeds wins:
    1. Parse the whole (stripped) output; accept a dict containing ``title_zh``.
    2. Parse the content of each fenced code block (three backticks, optional
       ``json`` tag); accept a dict containing ``title_zh``.
    3. Parse each flat ``{...}`` span (no nested braces) that mentions
       ``"title_zh"``.
    4. Fallback: return the longest span that starts at a ``{`` and decodes
       as a JSON object, whether or not it contains ``title_zh``.

    The fallback uses the JSON decoder itself to find where an object ends,
    so braces inside string values (for example LaTeX or prose) do not
    confuse it.

    Args:
        raw_output: Text produced by the model, possibly with prose around
            the JSON.

    Returns:
        The extracted JSON object as a dict.

    Raises:
        JsonNotFoundError: No JSON object could be extracted.
    """

    def _is_summary(obj: object) -> bool:
        return isinstance(obj, dict) and "title_zh" in obj

    # Strategy 1: the output is the JSON document itself.
    try:
        result = json.loads(raw_output.strip())
    except json.JSONDecodeError:
        pass
    else:
        if _is_summary(result):
            return result

    # Strategy 2: fenced code blocks.
    for match in re.finditer(r"```(?:json)?\s*\n(.*?)```", raw_output, re.DOTALL):
        try:
            result = json.loads(match.group(1).strip())
        except json.JSONDecodeError:
            continue
        if _is_summary(result):
            return result

    # Strategy 3: a flat {...} span that mentions title_zh.
    for match in re.finditer(r"\{[^{}]*\"title_zh\"[^{}]*\}", raw_output, re.DOTALL):
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            continue

    # Fallback: the longest decodable object starting at any "{".
    decoder = json.JSONDecoder()
    best: dict | None = None
    best_len = 0
    start = raw_output.find("{")
    # Stop once the remaining text is too short to beat the current best.
    while start != -1 and len(raw_output) - start > best_len:
        try:
            parsed, end = decoder.raw_decode(raw_output, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict) and end - start > best_len:
                best = parsed
                best_len = end - start
        start = raw_output.find("{", start + 1)

    if best is not None:
        return best

    raise JsonNotFoundError("no JSON object found in pi output")
|
||||
|
||||
+200
-64
@@ -29,14 +29,19 @@ from app.services.pdf_downloader import (
|
||||
download_pdf,
|
||||
paper_dir,
|
||||
)
|
||||
from app.services.pi_client import (
|
||||
from app.services.summary_utils import (
|
||||
JsonNotFoundError,
|
||||
build_prompt,
|
||||
extract_json,
|
||||
write_meta_json,
|
||||
extract_pdf_text,
|
||||
)
|
||||
from app.services.pi_client import (
|
||||
PiProcessError,
|
||||
PiTimeoutError,
|
||||
call_pi,
|
||||
extract_json,
|
||||
write_meta_json,
|
||||
)
|
||||
from app.services import claude_backend
|
||||
from app.services.schemas import (
|
||||
SummarySchema,
|
||||
assess_quality,
|
||||
@@ -229,7 +234,6 @@ def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) ->
|
||||
async def summarize_one(
|
||||
db: Session,
|
||||
paper: Paper,
|
||||
semaphore: asyncio.Semaphore | None = None,
|
||||
*,
|
||||
force: bool = False,
|
||||
pdf_mode: str = "auto",
|
||||
@@ -257,68 +261,128 @@ async def summarize_one(
|
||||
"reason": "permanent_failure",
|
||||
}
|
||||
|
||||
if semaphore:
|
||||
await semaphore.acquire()
|
||||
try:
|
||||
return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
|
||||
finally:
|
||||
if semaphore:
|
||||
semaphore.release()
|
||||
return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
|
||||
|
||||
|
||||
async def _generate_with_retry(
|
||||
arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto"
|
||||
) -> tuple[dict, str]:
|
||||
"""调用 pi CLI 生成总结,最多 4 轮验证循环。
|
||||
"""调用 AI 后端生成总结,最多 4 轮验证循环。
|
||||
|
||||
根据 settings.SUMMARY_BACKEND 选择 pi 或 claude 后端。
|
||||
|
||||
Returns:
|
||||
(json_data, raw_output)
|
||||
Raises:
|
||||
ValueError: 4 轮验证仍未通过
|
||||
"""
|
||||
import time as _time
|
||||
|
||||
backend = settings.SUMMARY_BACKEND
|
||||
validation_errors: list[str] = []
|
||||
json_data: dict | None = None
|
||||
raw_output = ""
|
||||
session_id = None
|
||||
|
||||
summary_file = paper_dir(arxiv_id) / "summary.json"
|
||||
|
||||
# claude 后端需要预构建 prompt(pi 后端在 call_pi 内部构建)
|
||||
claude_prompt: str | None = None
|
||||
if backend == "claude":
|
||||
_t0 = _time.monotonic()
|
||||
txt_path = extract_pdf_text(pdf_path, max_chars=None)
|
||||
body = txt_path.read_text(encoding="utf-8")
|
||||
if len(body) > 80_000:
|
||||
trimmed = body[:80_000].rstrip()
|
||||
txt_path.write_text(trimmed, encoding="utf-8")
|
||||
claude_prompt = build_prompt(arxiv_id, meta_path, txt_path, "inject", None)
|
||||
logger.info(" [%s] 构建prompt: %.2fs", arxiv_id, _time.monotonic() - _t0)
|
||||
|
||||
for attempt in range(1, 5):
|
||||
# 清理上一轮 pi 写的不完整文件
|
||||
stale = paper_dir(arxiv_id) / "summary.json"
|
||||
if stale.exists():
|
||||
stale.unlink()
|
||||
# 清理上一轮写入的不完整文件
|
||||
if summary_file.exists():
|
||||
summary_file.unlink()
|
||||
|
||||
if attempt == 1:
|
||||
raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
|
||||
# 记录 AI 调用开始时间
|
||||
_t_call_start = _time.monotonic()
|
||||
|
||||
if backend == "claude":
|
||||
if attempt == 1:
|
||||
raw_output, session_id = await claude_backend.call_claude(
|
||||
claude_prompt, session_id=None,
|
||||
)
|
||||
else:
|
||||
retry_prompt = build_prompt(
|
||||
arxiv_id, meta_path,
|
||||
extract_pdf_text(pdf_path, max_chars=80000),
|
||||
"inject", fix_errors=validation_errors,
|
||||
)
|
||||
raw_output, session_id = await claude_backend.call_claude(
|
||||
retry_prompt, session_id=session_id, fix_errors=validation_errors,
|
||||
)
|
||||
else:
|
||||
raw_output, session_id = await call_pi(
|
||||
meta_path, pdf_path,
|
||||
fix_errors=validation_errors,
|
||||
session_id=session_id,
|
||||
pdf_mode=pdf_mode,
|
||||
)
|
||||
if attempt == 1:
|
||||
raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
|
||||
else:
|
||||
raw_output, session_id = await call_pi(
|
||||
meta_path, pdf_path,
|
||||
fix_errors=validation_errors,
|
||||
session_id=session_id,
|
||||
pdf_mode=pdf_mode,
|
||||
)
|
||||
|
||||
# 优先读取 pi 写入的 summary.json,否则从 stdout 提取
|
||||
summary_file = paper_dir(arxiv_id) / "summary.json"
|
||||
_t_call_end = _time.monotonic()
|
||||
|
||||
# 检查 summary.json 是否由 AI 子进程写入
|
||||
file_written_by_ai = summary_file.exists()
|
||||
file_mtime = summary_file.stat().st_mtime if file_written_by_ai else None
|
||||
file_size = summary_file.stat().st_size if file_written_by_ai else 0
|
||||
|
||||
logger.info(
|
||||
" [%s] attempt %d AI调用: %.2fs summary.json=%s%s",
|
||||
arxiv_id, attempt,
|
||||
_t_call_end - _t_call_start,
|
||||
f"已写入({file_size}B)" if file_written_by_ai else "未写入",
|
||||
f" mtime={file_mtime:.2f}" if file_mtime else "",
|
||||
)
|
||||
|
||||
# 提取 JSON
|
||||
_t_json_start = _time.monotonic()
|
||||
try:
|
||||
if summary_file.exists():
|
||||
if file_written_by_ai:
|
||||
json_data = json.loads(summary_file.read_text(encoding="utf-8"))
|
||||
logger.info("Read summary.json written by pi for %s", arxiv_id)
|
||||
logger.info(" [%s] 从AI写入的summary.json读取", arxiv_id)
|
||||
else:
|
||||
json_data = extract_json(raw_output)
|
||||
except (json.JSONDecodeError, JsonNotFoundError) as exc:
|
||||
_t_json_end = _time.monotonic()
|
||||
logger.warning(
|
||||
"JSON extraction failed for %s (attempt %d): %s",
|
||||
arxiv_id, attempt, str(exc)[:200],
|
||||
" [%s] JSON提取失败: %.2fs %s",
|
||||
arxiv_id, _t_json_end - _t_json_start, str(exc)[:200],
|
||||
)
|
||||
validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
|
||||
continue
|
||||
_t_json_end = _time.monotonic()
|
||||
|
||||
# 验证
|
||||
_t_val_start = _time.monotonic()
|
||||
validation_errors = _validate_summary(json_data, arxiv_id)
|
||||
_t_val_end = _time.monotonic()
|
||||
|
||||
if not validation_errors:
|
||||
logger.info(
|
||||
" [%s] JSON提取: %.2fs 验证: %.2fs ✅",
|
||||
arxiv_id,
|
||||
_t_json_end - _t_json_start,
|
||||
_t_val_end - _t_val_start,
|
||||
)
|
||||
break
|
||||
logger.warning(
|
||||
"Validation failed for %s (attempt %d): %s",
|
||||
arxiv_id, attempt, "; ".join(validation_errors),
|
||||
" [%s] JSON提取: %.2fs 验证: %.2fs ❌ %s",
|
||||
arxiv_id,
|
||||
_t_json_end - _t_json_start,
|
||||
_t_val_end - _t_val_start,
|
||||
"; ".join(validation_errors)[:200],
|
||||
)
|
||||
|
||||
if validation_errors:
|
||||
@@ -335,11 +399,19 @@ def _persist_summary(
|
||||
db: Session, paper: Paper, json_data: dict, raw_output: str
|
||||
) -> str:
|
||||
"""Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
|
||||
import time as _time
|
||||
arxiv_id = paper.arxiv_id
|
||||
|
||||
_t0 = _time.monotonic()
|
||||
schema = SummarySchema.model_validate(json_data)
|
||||
quality = assess_quality(schema)
|
||||
_t1 = _time.monotonic()
|
||||
|
||||
_save_files(arxiv_id, schema, raw_output)
|
||||
_t2 = _time.monotonic()
|
||||
|
||||
_save_files(paper.arxiv_id, schema, raw_output)
|
||||
_update_summary_in_db(db, paper, schema, quality, raw_output)
|
||||
_t3 = _time.monotonic()
|
||||
|
||||
# 状态 → done
|
||||
paper.summary_status.status = SummaryState.DONE
|
||||
@@ -347,10 +419,30 @@ def _persist_summary(
|
||||
paper.summary_status.completed_at = utc_now()
|
||||
paper.summary_status.raw_output_saved = True
|
||||
db.commit()
|
||||
_t4 = _time.monotonic()
|
||||
|
||||
logger.info(
|
||||
" [%s] persist: pydantic=%.2fs 文件=%.2fs DB写入=%.2fs 状态commit=%.2fs",
|
||||
arxiv_id,
|
||||
_t1 - _t0,
|
||||
_t2 - _t1,
|
||||
_t3 - _t2,
|
||||
_t4 - _t3,
|
||||
)
|
||||
|
||||
# 触发性增强(失败不影响总结)
|
||||
_maybe_extract_images(paper.arxiv_id, schema)
|
||||
_maybe_index_chroma(paper.arxiv_id, paper, schema)
|
||||
_t5 = _time.monotonic()
|
||||
_maybe_extract_images(arxiv_id, schema)
|
||||
_t6 = _time.monotonic()
|
||||
_maybe_index_chroma(arxiv_id, paper, schema)
|
||||
_t7 = _time.monotonic()
|
||||
|
||||
logger.info(
|
||||
" [%s] 后处理: 图片提取=%.2fs ChromaDB=%.2fs",
|
||||
arxiv_id,
|
||||
_t6 - _t5,
|
||||
_t7 - _t6,
|
||||
)
|
||||
|
||||
return quality
|
||||
|
||||
@@ -445,28 +537,47 @@ async def _do_summarize_one(
|
||||
) -> dict:
|
||||
"""实际的单篇总结执行(在 semaphore 保护下)。"""
|
||||
arxiv_id = paper.arxiv_id
|
||||
title_short = (paper.title_en or "")[:50]
|
||||
|
||||
# 状态 → processing
|
||||
paper.summary_status.status = SummaryState.PROCESSING
|
||||
paper.summary_status.started_at = utc_now()
|
||||
db.commit()
|
||||
|
||||
logger.info("▶ [%s] 开始总结: %s", arxiv_id, title_short)
|
||||
|
||||
# 清理旧的图片文件和 figures_json,避免重新总结时残留
|
||||
import time as _time
|
||||
_t_cleanup_start = _time.monotonic()
|
||||
_cleanup_old_images(db, paper)
|
||||
_t_cleanup_end = _time.monotonic()
|
||||
logger.info(" [%s] 清理旧数据: %.2fs", arxiv_id, _t_cleanup_end - _t_cleanup_start)
|
||||
|
||||
raw_output = ""
|
||||
try:
|
||||
meta_path = write_meta_json(paper)
|
||||
await download_pdf(arxiv_id, paper.pdf_url)
|
||||
_t0 = _time.monotonic()
|
||||
|
||||
meta_path = write_meta_json(paper)
|
||||
_t1 = _time.monotonic()
|
||||
logger.info(" [%s] meta.json: %.2fs", arxiv_id, _t1 - _t0)
|
||||
|
||||
await download_pdf(arxiv_id, paper.pdf_url)
|
||||
_t2 = _time.monotonic()
|
||||
logger.info(" [%s] 下载PDF: %.2fs", arxiv_id, _t2 - _t1)
|
||||
|
||||
logger.info(" [%s] 调用 pi 生成总结...", arxiv_id)
|
||||
json_data, raw_output = await _generate_with_retry(
|
||||
arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
|
||||
pdf_mode=pdf_mode,
|
||||
)
|
||||
_t3 = _time.monotonic()
|
||||
logger.info(" [%s] pi生成: %.2fs", arxiv_id, _t3 - _t2)
|
||||
|
||||
quality = _persist_summary(db, paper, json_data, raw_output)
|
||||
_t4 = _time.monotonic()
|
||||
logger.info(" [%s] 持久化: %.2fs", arxiv_id, _t4 - _t3)
|
||||
|
||||
logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
|
||||
logger.info("✅ [%s] 完成: quality=%s 总耗时: %.2fs", arxiv_id, quality, _t4 - _t0)
|
||||
return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
|
||||
|
||||
except Exception as exc:
|
||||
@@ -588,42 +699,67 @@ async def summarize_batch(
|
||||
"total": 0,
|
||||
}
|
||||
|
||||
# 并发控制
|
||||
semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY)
|
||||
# 并发控制:worker 模式,避免 573 个协程同时打开 DB 连接耗尽连接池
|
||||
concurrency = settings.SUMMARY_CONCURRENCY
|
||||
make_session = _session_factory or SessionLocal
|
||||
|
||||
async def _process_paper(paper: Paper) -> dict:
|
||||
paper_db = make_session()
|
||||
try:
|
||||
p = paper_db.execute(
|
||||
select(Paper)
|
||||
.where(Paper.id == paper.id)
|
||||
.options(*PAPER_DEFAULT_LOAD)
|
||||
).unique().scalar_one_or_none()
|
||||
return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode)
|
||||
finally:
|
||||
paper_db.close()
|
||||
# 进度追踪
|
||||
progress = {"done": 0, "failed": 0, "skipped": 0}
|
||||
paper_queue: asyncio.Queue[Paper | None] = asyncio.Queue()
|
||||
for p in papers:
|
||||
paper_queue.put_nowait(p)
|
||||
|
||||
results = await asyncio.gather(
|
||||
*[_process_paper(p) for p in papers],
|
||||
async def _worker() -> list[dict]:
|
||||
results: list[dict] = []
|
||||
while True:
|
||||
paper = paper_queue.get_nowait() if not paper_queue.empty() else None
|
||||
if paper is None:
|
||||
break
|
||||
paper_db = make_session()
|
||||
try:
|
||||
p = paper_db.execute(
|
||||
select(Paper)
|
||||
.where(Paper.id == paper.id)
|
||||
.options(*PAPER_DEFAULT_LOAD)
|
||||
).unique().scalar_one_or_none()
|
||||
result = await summarize_one(paper_db, p, pdf_mode=pdf_mode)
|
||||
status = result.get("status", "failed")
|
||||
progress[status] = progress.get(status, 0) + 1
|
||||
finished = sum(progress.values())
|
||||
logger.info(
|
||||
"📊 进度: %d/%d (✅%d ❌%d ⏭️%d) — %s",
|
||||
finished, total,
|
||||
progress["done"], progress["failed"], progress["skipped"],
|
||||
paper.arxiv_id,
|
||||
)
|
||||
results.append(result)
|
||||
except Exception as exc:
|
||||
logger.error("Worker error: %s", exc)
|
||||
results.append({"status": "failed", "error": str(exc)})
|
||||
finally:
|
||||
paper_db.close()
|
||||
return results
|
||||
|
||||
worker_results = await asyncio.gather(
|
||||
*[_worker() for _ in range(concurrency)],
|
||||
return_exceptions=True,
|
||||
)
|
||||
results = []
|
||||
for r in worker_results:
|
||||
if isinstance(r, Exception):
|
||||
logger.error("Unexpected error in batch: %s", r)
|
||||
results.append(r)
|
||||
elif isinstance(r, list):
|
||||
results.extend(r)
|
||||
|
||||
# 统计结果
|
||||
done = 0
|
||||
failed = 0
|
||||
skipped = 0
|
||||
# 统计结果(progress 已在 worker 中实时更新)
|
||||
done = progress["done"]
|
||||
failed = progress["failed"]
|
||||
skipped = progress["skipped"]
|
||||
for r in results:
|
||||
if isinstance(r, Exception):
|
||||
logger.error("Unexpected error in batch: %s", r)
|
||||
failed += 1
|
||||
elif isinstance(r, dict):
|
||||
if r.get("status") == "done":
|
||||
done += 1
|
||||
elif r.get("status") == "skipped":
|
||||
skipped += 1
|
||||
else:
|
||||
failed += 1
|
||||
|
||||
log_entry.status = "success" if failed == 0 else "failed"
|
||||
log_entry.papers_found = total
|
||||
|
||||
@@ -0,0 +1,270 @@
|
||||
"""总结工具函数 — PDF 文本提取、prompt 构建、JSON 提取、meta.json 写入。
|
||||
|
||||
与后端无关的通用逻辑,pi 和 claude 后端共享。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── 自定义异常 ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class JsonNotFoundError(Exception):
    """Raised when no JSON object can be extracted from raw output text."""
|
||||
|
||||
|
||||
# ── meta.json ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def write_meta_json(paper) -> Path:
    """Write ``meta.json`` for *paper* and return the path of the file.

    The directory comes from ``paper_dir(paper.arxiv_id)`` and is created
    (including parents) when it does not exist yet. The file is UTF-8 JSON,
    indented by two spaces, with non-ASCII characters kept as-is.

    Args:
        paper: Object exposing ``arxiv_id``, ``title_en``, ``abstract``,
            ``published_at``, ``upvotes``, plus iterable ``authors`` (items
            with ``.name``) and ``tags`` (items with ``.tag``).

    Returns:
        Path of the written ``meta.json``.
    """
    from app.services.pdf_downloader import paper_dir

    target_dir = paper_dir(paper.arxiv_id)
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / "meta.json"

    author_names = [author.name for author in paper.authors]
    tag_names = [entry.tag for entry in paper.tags]

    # A missing publication date is serialised as JSON null.
    if paper.published_at:
        published = paper.published_at.isoformat()
    else:
        published = None

    payload = {
        "arxiv_id": paper.arxiv_id,
        "title_en": paper.title_en,
        "abstract": paper.abstract or "",
        "published_at": published,
        "authors": author_names,
        "tags": tag_names,
        "upvotes": paper.upvotes,
    }
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    destination.write_text(serialized, encoding="utf-8")
    return destination
|
||||
|
||||
|
||||
# ── PDF 文本提取 ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _trim_body(text: str, max_chars: int | None = None) -> str:
|
||||
"""去除参考文献,保留正文+附录,超长时从末尾截断。
|
||||
|
||||
策略:
|
||||
1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用)
|
||||
2. 正文 + 附录全部保留
|
||||
3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文)
|
||||
"""
|
||||
# 找 References 段落的位置(在 Appendix 之后的那个)
|
||||
ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
|
||||
if ref_match:
|
||||
ref_start = ref_match.start()
|
||||
# 看 References 之后有没有 Appendix
|
||||
after_ref = text[ref_start:]
|
||||
app_match = re.search(
|
||||
r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
|
||||
)
|
||||
if app_match:
|
||||
# References 之后有 Appendix:只删 References 段
|
||||
ref_end = ref_start + app_match.start()
|
||||
text = text[:ref_start] + text[ref_end:]
|
||||
else:
|
||||
# References 之后没有 Appendix:删掉从 References 到结尾
|
||||
text = text[:ref_start].rstrip()
|
||||
|
||||
# 去掉 Acknowledgments(对解读无用)
|
||||
ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
|
||||
if ack_match:
|
||||
# 只删 Acknowledgments 本身,不删后面的内容
|
||||
next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
|
||||
if next_section:
|
||||
text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
|
||||
else:
|
||||
text = text[:ack_match.start()].rstrip()
|
||||
|
||||
# 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
|
||||
if max_chars is not None and len(text) > max_chars:
|
||||
text = text[:max_chars].rstrip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
    """Extract the body text of a PDF with pymupdf and save it next to the PDF.

    The output file is *pdf_path* with its suffix replaced by ``.txt``. An
    existing ``.txt`` file is treated as a cache and returned unchanged, even
    if it was produced with a different *max_chars*; delete it first to force
    a fresh extraction.

    Args:
        pdf_path: Path of the PDF to read.
        max_chars: Optional cap forwarded to ``_trim_body``. ``None`` keeps
            the full text (used by the search/auto modes).

    Returns:
        Path of the ``.txt`` file.
    """
    import pymupdf

    txt_path = pdf_path.with_suffix(".txt")
    if txt_path.exists():
        return txt_path

    doc = pymupdf.open(str(pdf_path))
    try:
        # sort=True enables reading-order detection, which avoids text from
        # the two columns of a paper being interleaved.
        raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)
    finally:
        # Release the document handle even when a page fails to extract.
        doc.close()

    body = _trim_body(raw_text, max_chars=max_chars)
    txt_path.write_text(body, encoding="utf-8")
    logger.info(
        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
        txt_path,
        len(raw_text),
        len(body),
        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
    )
    return txt_path
|
||||
|
||||
|
||||
# ── Prompt 构建 ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def build_prompt(
|
||||
arxiv_id: str,
|
||||
meta_path: Path,
|
||||
txt_path: Path,
|
||||
pdf_mode: str,
|
||||
fix_errors: list[str] | None = None,
|
||||
) -> str:
|
||||
"""根据模式构建 prompt。
|
||||
|
||||
inject: 全量注入,prompt 末尾包含论文全文内容
|
||||
search: pi 自主 read 文件,prompt 只包含工作流指令
|
||||
"""
|
||||
json_schema = (
|
||||
"## 必须包含以下字段(不要自创字段名):\n"
|
||||
'{"arxiv_id": "...", '
|
||||
'"title_zh": "中文标题", '
|
||||
'"one_line": "一句话概括(≤50字)", '
|
||||
'"tags": ["标签1","标签2"], '
|
||||
'"difficulty": "入门/进阶/前沿", '
|
||||
'"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
|
||||
'"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
|
||||
'"goal": "详细段落:本文的具体目标", '
|
||||
'"gap": "详细段落:本文的独特切入角度"}, '
|
||||
'"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
|
||||
'"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
|
||||
'"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
|
||||
'"novelty": "详细段落:技术新颖性分析"}, '
|
||||
'"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
|
||||
'"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
|
||||
'"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, '
|
||||
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
|
||||
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
|
||||
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
|
||||
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
|
||||
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
|
||||
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
|
||||
"section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
|
||||
"}"
|
||||
)
|
||||
|
||||
writing_requirements = (
|
||||
"## 写作要求\n"
|
||||
"- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
|
||||
"- 必须包含论文中的具体数据、数字、实验指标\n"
|
||||
"- 像资深同事给同事讲论文一样,专业但易懂\n"
|
||||
"- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
|
||||
" 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n"
|
||||
)
|
||||
|
||||
if fix_errors:
|
||||
error_list = "\n".join(f"- {e}" for e in fix_errors)
|
||||
return (
|
||||
"你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
|
||||
f"data/papers/{arxiv_id}/summary.json:\n\n"
|
||||
f"{error_list}\n\n"
|
||||
"注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
|
||||
"修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
|
||||
)
|
||||
|
||||
if pdf_mode == "search":
|
||||
return (
|
||||
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
|
||||
"## 工作流程\n"
|
||||
f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n"
|
||||
f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n"
|
||||
f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
|
||||
+ writing_requirements
|
||||
+ "\n"
|
||||
+ json_schema
|
||||
)
|
||||
else:
|
||||
return (
|
||||
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
|
||||
"## 工作流程\n"
|
||||
"论文元信息和正文全文已在上文提供,请仔细阅读。\n"
|
||||
f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
|
||||
"2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
|
||||
+ writing_requirements
|
||||
+ "\n"
|
||||
+ json_schema
|
||||
)
|
||||
|
||||
|
||||
# ── JSON 提取 ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_json(raw_output: str) -> dict:
    """Extract a JSON object (dict) from raw backend output.

    Strategies, tried in order:
      1. Parse the whole stripped output; accept a dict containing
         ``"title_zh"``.
      2. Parse each fenced code block (three backticks, optional ``json``
         tag); accept the first dict containing ``"title_zh"``.
      3. Parse each flat ``{...}`` span (no nested braces) that mentions
         ``"title_zh"``; return the first one that parses.
      4. Fallback: try to decode a JSON value at every ``{`` and return the
         longest dict found. Decoding uses the JSON parser itself rather than
         brace counting, so braces inside string values (e.g. LaTeX or an
         unbalanced ``}``) do not break extraction.

    Args:
        raw_output: Text that may contain a JSON object.

    Returns:
        The extracted dict.

    Raises:
        JsonNotFoundError: If no JSON object can be found.
    """
    def _is_summary(value) -> bool:
        return isinstance(value, dict) and "title_zh" in value

    # Strategy 1: the whole output is the JSON document.
    try:
        parsed = json.loads(raw_output.strip())
    except json.JSONDecodeError:
        pass
    else:
        if _is_summary(parsed):
            return parsed

    # Strategy 2: fenced code blocks.
    fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL)
    for match in fence_pattern.finditer(raw_output):
        try:
            parsed = json.loads(match.group(1).strip())
        except json.JSONDecodeError:
            continue
        if _is_summary(parsed):
            return parsed

    # Strategy 3: a flat {...} span that mentions title_zh.
    brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL)
    for match in brace_pattern.finditer(raw_output):
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            continue

    # Strategy 4: longest object decodable at any "{" position.
    decoder = json.JSONDecoder()
    best = None
    best_len = 0
    start = raw_output.find("{")
    while start != -1:
        try:
            parsed, end = decoder.raw_decode(raw_output, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict) and end - start > best_len:
                best = parsed
                best_len = end - start
        start = raw_output.find("{", start + 1)

    if best is not None:
        return best

    raise JsonNotFoundError("no JSON object found in output")
|
||||
+2
-1
@@ -7,7 +7,7 @@ dependencies = [
|
||||
"fastapi>=0.115",
|
||||
"uvicorn[standard]>=0.34",
|
||||
"sqlalchemy>=2.0",
|
||||
"httpx>=0.28",
|
||||
"httpx[http2]>=0.28",
|
||||
"jinja2>=3.1",
|
||||
"python-multipart>=0.0.18",
|
||||
"pydantic>=2.0",
|
||||
@@ -19,6 +19,7 @@ dependencies = [
|
||||
"pymupdf>=1.25",
|
||||
"itsdangerous>=2.2.0",
|
||||
"bleach>=6.4.0",
|
||||
"pymupdf4llm>=1.27.2.3",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
+136
-109
@@ -1,117 +1,144 @@
|
||||
"""验证 summary JSON 是否符合 SummarySchema 要求。
|
||||
|
||||
用法:python scripts/validate_summary.py <json_file>
|
||||
返回:exit 0 = 通过,exit 1 = 失败(错误信息输出到 stdout)
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
schema = {
|
||||
"type": "object",
|
||||
"required": ["arxiv_id", "title_zh", "one_line", "tags", "difficulty",
|
||||
"prerequisites", "motivation", "method", "results", "improvements", "figures"],
|
||||
"properties": {
|
||||
"arxiv_id": {"type": "string"},
|
||||
"title_zh": {"type": "string"},
|
||||
"one_line": {"type": "string"},
|
||||
"tags": {"type": "array", "items": {"type": "string"}},
|
||||
"difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]},
|
||||
"prerequisites": {
|
||||
"type": "object",
|
||||
"required": ["concepts"],
|
||||
"properties": {
|
||||
"concepts": {"type": "array", "items": {
|
||||
"type": "object",
|
||||
"required": ["term", "explanation", "why_matters"],
|
||||
"properties": {
|
||||
"term": {"type": "string"},
|
||||
"explanation": {"type": "string"},
|
||||
"why_matters": {"type": "string"}
|
||||
}
|
||||
}}
|
||||
}
|
||||
},
|
||||
"motivation": {
|
||||
"type": "object",
|
||||
"required": ["problem", "goal", "gap"],
|
||||
"properties": {
|
||||
"problem": {"type": "string"},
|
||||
"goal": {"type": "string"},
|
||||
"gap": {"type": "string"}
|
||||
}
|
||||
},
|
||||
"method": {
|
||||
"type": "object",
|
||||
"required": ["overview", "key_idea", "steps", "novelty"],
|
||||
"properties": {
|
||||
"overview": {"type": "string"},
|
||||
"key_idea": {"type": "string"},
|
||||
"steps": {"type": "string"},
|
||||
"novelty": {"type": "string"}
|
||||
}
|
||||
},
|
||||
"results": {
|
||||
"type": "object",
|
||||
"required": ["main_findings", "benchmarks", "limitations"],
|
||||
"properties": {
|
||||
"main_findings": {"type": "string"},
|
||||
"benchmarks": {"type": "array", "items": {
|
||||
"type": "object",
|
||||
"required": ["task", "metric", "this_work", "baseline", "improvement"],
|
||||
"properties": {
|
||||
"task": {"type": "string"},
|
||||
"metric": {"type": "string"},
|
||||
"this_work": {"type": "string"},
|
||||
"baseline": {"type": "string"},
|
||||
"improvement": {"type": "string"}
|
||||
}
|
||||
}},
|
||||
"limitations": {"type": "string"}
|
||||
}
|
||||
},
|
||||
"improvements": {
|
||||
"type": "object",
|
||||
"required": ["weaknesses", "future_work", "reproducibility"],
|
||||
"properties": {
|
||||
"weaknesses": {"type": "string"},
|
||||
"future_work": {"type": "string"},
|
||||
"reproducibility": {"type": "string"}
|
||||
}
|
||||
},
|
||||
"figures": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": ["id", "caption", "description", "reason", "section"],
|
||||
"properties": {
|
||||
"id": {"type": "string"},
|
||||
"caption": {"type": "string"},
|
||||
"description": {"type": "string"},
|
||||
"reason": {"type": "string"},
|
||||
"section": {"type": "string", "enum": ["motivation", "method", "results", "limitations"]}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def validate(path: str) -> list[str]:
|
||||
errors: list[str] = []
|
||||
def validate_file(filepath):
|
||||
try:
|
||||
data = json.loads(Path(path).read_text(encoding="utf-8"))
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Check required fields
|
||||
for field in schema["required"]:
|
||||
if field not in data:
|
||||
print(f"❌ Missing field: {field}")
|
||||
return False
|
||||
|
||||
# Validate nested structure
|
||||
for field, spec in schema["properties"].items():
|
||||
if field in data:
|
||||
if spec["type"] == "string":
|
||||
if not isinstance(data[field], str):
|
||||
print(f"❌ Field '{field}' should be string")
|
||||
return False
|
||||
elif spec["type"] == "array":
|
||||
if not isinstance(data[field], list):
|
||||
print(f"❌ Field '{field}' should be array")
|
||||
return False
|
||||
elif spec["type"] == "object":
|
||||
if not isinstance(data[field], dict):
|
||||
print(f"❌ Field '{field}' should be object")
|
||||
return False
|
||||
if "required" in spec:
|
||||
for subfield in spec["required"]:
|
||||
if subfield not in data[field]:
|
||||
print(f"❌ Missing subfield: {field}.{subfield}")
|
||||
return False
|
||||
|
||||
# Validate section enum in figures
|
||||
valid_sections = ["motivation", "method", "results", "limitations"]
|
||||
for fig in data.get("figures", []):
|
||||
if fig["section"] not in valid_sections:
|
||||
print(f"❌ Invalid section in figure: {fig['section']}")
|
||||
return False
|
||||
|
||||
print("✅ JSON validation passed!")
|
||||
return True
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
return [f"JSON 解析失败: {e}"]
|
||||
|
||||
if not isinstance(data, dict):
|
||||
return ["顶层必须是 JSON 对象 (dict)"]
|
||||
|
||||
# 必填字段
|
||||
required_top = ["arxiv_id", "title_zh", "one_line", "tags"]
|
||||
for f in required_top:
|
||||
if f not in data or not data[f]:
|
||||
errors.append(f"缺少必填字段: {f}")
|
||||
|
||||
# tags 必须是非空数组
|
||||
tags = data.get("tags")
|
||||
if isinstance(tags, list) and len(tags) == 0:
|
||||
errors.append("tags 不能为空数组")
|
||||
if not isinstance(tags, list):
|
||||
errors.append("tags 必须是数组")
|
||||
|
||||
# motivation 子字段
|
||||
motivation = data.get("motivation", {})
|
||||
if not isinstance(motivation, dict):
|
||||
errors.append("motivation 必须是对象")
|
||||
else:
|
||||
for f in ["problem", "goal", "gap"]:
|
||||
val = motivation.get(f, "")
|
||||
if not isinstance(val, str) or len(val.strip()) < 50:
|
||||
errors.append(f"motivation.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)")
|
||||
|
||||
# method 子字段
|
||||
method = data.get("method", {})
|
||||
if not isinstance(method, dict):
|
||||
errors.append("method 必须是对象")
|
||||
else:
|
||||
for f in ["overview", "key_idea", "steps", "novelty"]:
|
||||
val = method.get(f, "")
|
||||
if not isinstance(val, str) or len(val.strip()) < 50:
|
||||
errors.append(f"method.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)")
|
||||
|
||||
# results 子字段
|
||||
results = data.get("results", {})
|
||||
if not isinstance(results, dict):
|
||||
errors.append("results 必须是对象")
|
||||
else:
|
||||
for f in ["main_findings", "limitations"]:
|
||||
val = results.get(f, "")
|
||||
if not isinstance(val, str) or len(val.strip()) < 50:
|
||||
errors.append(f"results.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)")
|
||||
# benchmarks 可以是数组
|
||||
benchmarks = results.get("benchmarks")
|
||||
if benchmarks is not None and not isinstance(benchmarks, list):
|
||||
errors.append("results.benchmarks 必须是数组")
|
||||
|
||||
# improvements 子字段
|
||||
improvements = data.get("improvements", {})
|
||||
if not isinstance(improvements, dict):
|
||||
errors.append("improvements 必须是对象")
|
||||
else:
|
||||
for f in ["weaknesses", "future_work", "reproducibility"]:
|
||||
val = improvements.get(f, "")
|
||||
if not isinstance(val, str) or len(val.strip()) < 50:
|
||||
errors.append(f"improvements.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)")
|
||||
|
||||
# 检查是否有字段误用数组(应该用字符串的)
|
||||
string_fields = [
|
||||
("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"),
|
||||
("method", "overview"), ("method", "key_idea"), ("method", "steps"), ("method", "novelty"),
|
||||
("results", "main_findings"), ("results", "limitations"),
|
||||
("improvements", "weaknesses"), ("improvements", "future_work"), ("improvements", "reproducibility"),
|
||||
]
|
||||
for section, field in string_fields:
|
||||
val = data.get(section, {}).get(field)
|
||||
if isinstance(val, list):
|
||||
errors.append(f"{section}.{field} 应该是字符串段落,不能是数组")
|
||||
|
||||
# figures 验证
|
||||
figures = data.get("figures")
|
||||
if figures is not None:
|
||||
if not isinstance(figures, list):
|
||||
errors.append("figures 必须是数组")
|
||||
else:
|
||||
for i, fig in enumerate(figures):
|
||||
if isinstance(fig, dict) and not fig.get("id"):
|
||||
errors.append(f"figures[{i}] 缺少 id 字段")
|
||||
|
||||
return errors
|
||||
|
||||
print(f"❌ JSON decode error: {e}")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"❌ Validation error: {e}")
|
||||
return False
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 2:
|
||||
print("用法: python scripts/validate_summary.py <json_file>")
|
||||
sys.exit(1)
|
||||
|
||||
errs = validate(sys.argv[1])
|
||||
if errs:
|
||||
print("❌ 验证失败:")
|
||||
for e in errs:
|
||||
print(f" - {e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print("✅ 验证通过")
|
||||
sys.exit(0)
|
||||
filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
|
||||
validate_file(filepath)
|
||||
|
||||
@@ -684,6 +684,19 @@ wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "4.3.0"
|
||||
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||
dependencies = [
|
||||
{ name = "hpack" },
|
||||
{ name = "hyperframe" },
|
||||
]
|
||||
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" }
|
||||
wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hf-daily-papers"
|
||||
version = "0.1.0"
|
||||
@@ -693,12 +706,13 @@ dependencies = [
|
||||
{ name = "bleach" },
|
||||
{ name = "chromadb" },
|
||||
{ name = "fastapi" },
|
||||
{ name = "httpx" },
|
||||
{ name = "httpx", extra = ["http2"] },
|
||||
{ name = "itsdangerous" },
|
||||
{ name = "jinja2" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "pydantic-settings" },
|
||||
{ name = "pymupdf" },
|
||||
{ name = "pymupdf4llm" },
|
||||
{ name = "python-dotenv" },
|
||||
{ name = "python-multipart" },
|
||||
{ name = "sqlalchemy" },
|
||||
@@ -724,12 +738,13 @@ requires-dist = [
|
||||
{ name = "bleach", specifier = ">=6.4.0" },
|
||||
{ name = "chromadb", specifier = ">=1.0" },
|
||||
{ name = "fastapi", specifier = ">=0.115" },
|
||||
{ name = "httpx", specifier = ">=0.28" },
|
||||
{ name = "httpx", extras = ["http2"], specifier = ">=0.28" },
|
||||
{ name = "itsdangerous", specifier = ">=2.2.0" },
|
||||
{ name = "jinja2", specifier = ">=3.1" },
|
||||
{ name = "pydantic", specifier = ">=2.0" },
|
||||
{ name = "pydantic-settings", specifier = ">=2.0" },
|
||||
{ name = "pymupdf", specifier = ">=1.25" },
|
||||
{ name = "pymupdf4llm", specifier = ">=1.27.2.3" },
|
||||
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
|
||||
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" },
|
||||
{ name = "python-dotenv", specifier = ">=1.0" },
|
||||
@@ -778,6 +793,15 @@ wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916, upload-time = "2026-05-06T06:18:21.7Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hpack"
|
||||
version = "4.1.0"
|
||||
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" }
|
||||
wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "httpcore"
|
||||
version = "1.0.9"
|
||||
@@ -842,6 +866,11 @@ wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
http2 = [
|
||||
{ name = "h2" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "huggingface-hub"
|
||||
version = "1.16.1"
|
||||
@@ -862,6 +891,15 @@ wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/49/79/621a7dbb80c70974f73a597275351ebe03ce5bc65cb5f8f4acb5859252bc/huggingface_hub-1.16.1-py3-none-any.whl", hash = "sha256:64340de934b9ce37857ef85a82de72f5629e8a270f9119eabb12bf495eb53c22", size = 668176, upload-time = "2026-05-21T18:39:58.596Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyperframe"
|
||||
version = "6.1.0"
|
||||
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" }
|
||||
wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.18"
|
||||
@@ -1223,6 +1261,15 @@ wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "networkx"
|
||||
version = "3.6.1"
|
||||
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" }
|
||||
wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "numpy"
|
||||
version = "2.4.6"
|
||||
@@ -1842,6 +1889,39 @@ wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/53/a4/b9e91aac82293f9c954654c85581ee8212b5b05efadc534b581141241e6f/pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2", size = 25000393, upload-time = "2026-04-24T14:11:01.669Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pymupdf-layout"
|
||||
version = "1.27.2.3"
|
||||
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||
dependencies = [
|
||||
{ name = "networkx" },
|
||||
{ name = "numpy" },
|
||||
{ name = "onnxruntime" },
|
||||
{ name = "pymupdf" },
|
||||
{ name = "pyyaml" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/bc/ee/067726c3ee5574ad5c605d00d7419e264ef509d626a726f99388111f8216/pymupdf_layout-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:75c2ab3c0e8830ac2bc50cfd32d375a30768a2610dac72a02f08265336e0834f", size = 15799844, upload-time = "2026-04-24T14:11:13.177Z" },
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/0a/ba/46a7a36474722f9280d885f6eec878561a257d9378e52590b43d32ffb96c/pymupdf_layout-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:5656b09669dcd7c51f539afb6fdaf853602bab4cbc20479ee5ee1a85a4e32b60", size = 15795220, upload-time = "2026-04-24T14:11:23.17Z" },
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/84/87/bfdcca67346052943a4549814f2009b38f4d15ec025798cdf7dfa5f57c84/pymupdf_layout-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:fcf03aa815cbceebdb3263dd6a190de4547c46b1d168928836ec38738afe127d", size = 15805240, upload-time = "2026-04-24T14:11:33.465Z" },
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/32/e9/7ce6eaf97cebd46c3808593282e9eb99a60cddd6183e25a636980d5c7986/pymupdf_layout-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:303b9414216dfaf711ec7d807b6f1e4c3e0a92bbb4569340fcedd9d5593d16ca", size = 15806269, upload-time = "2026-04-24T14:11:43.481Z" },
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/bf/61/3b2417d8f2cdfaa0f4749cd9dafa3379cb5cdaddf4233165f1ff81953c30/pymupdf_layout-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:503b64d9b6b31ea3af79ef85cf7d36950c5048af468cb297684d2953553c62ad", size = 15809163, upload-time = "2026-04-24T14:11:53.956Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pymupdf4llm"
|
||||
version = "1.27.2.3"
|
||||
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||
dependencies = [
|
||||
{ name = "pymupdf" },
|
||||
{ name = "pymupdf-layout" },
|
||||
{ name = "tabulate" },
|
||||
]
|
||||
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/87/c0/e3830452d82032c3d82a9879616c05bf0c51e0dea03c1d80d57b3a6ec0d1/pymupdf4llm-1.27.2.3.tar.gz", hash = "sha256:42ec1a47ddc62be3f4f40c116d27618611c6f9fa366719016d9ddc3f3a3dc22b", size = 1406297, upload-time = "2026-04-24T14:13:18.843Z" }
|
||||
wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/e6/38/84bf29f4dd72e6c450546df6ca8f53021f764fd945ba67dcc235d39bc20e/pymupdf4llm-1.27.2.3-py3-none-any.whl", hash = "sha256:bd724b79fa3f06a5b28d7a65f7acfa8de56e04bdb603ac2d6dff315e0d151aaa", size = 77348, upload-time = "2026-04-24T14:11:04.305Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pypika"
|
||||
version = "0.51.1"
|
||||
@@ -2202,6 +2282,15 @@ wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1c/54/196d0c1db10af76baa4f64894448505d60d3cdf70ef92cbb35f46a4e4c71/starlette-1.2.1-py3-none-any.whl", hash = "sha256:4de0082d08c8f6764a85a54cf1120d6939507a19905c7768acad2a9f875d2b89", size = 73350, upload-time = "2026-05-31T01:07:50.09Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tabulate"
|
||||
version = "0.10.0"
|
||||
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/46/58/8c37dea7bbf769b20d58e7ace7e5edfe65b849442b00ffcdd56be88697c6/tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d", size = 91754, upload-time = "2026-03-04T18:55:34.402Z" }
|
||||
wheels = [
|
||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/99/55/db07de81b5c630da5cbf5c7df646580ca26dfaefa593667fc6f2fe016d2e/tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3", size = 39814, upload-time = "2026-03-04T18:55:31.284Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tenacity"
|
||||
version = "9.1.4"
|
||||
|
||||
Reference in New Issue
Block a user