diff --git a/.env.example b/.env.example index bfea3bf..19cfaae 100644 --- a/.env.example +++ b/.env.example @@ -19,8 +19,11 @@ HTTP_MAX_RETRIES=3 HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 # ─── AI 总结 ────────────────────────────── +# 总结后端:pi | claude +SUMMARY_BACKEND=pi PI_BIN= SUMMARY_SKILL=daily-paper-summary +CLAUDE_BIN=claude SUMMARY_CONCURRENCY=3 SUMMARY_TIMEOUT_SECONDS=1200 SUMMARY_MAX_RETRIES=2 diff --git a/app/cli.py b/app/cli.py index ea163df..452918b 100644 --- a/app/cli.py +++ b/app/cli.py @@ -1,6 +1,7 @@ """CLI 工具 — 手动抓取论文。""" import asyncio +import logging import typer from dotenv import load_dotenv @@ -49,8 +50,11 @@ def crawl( typer.echo(f"📡 开始抓取 {target} ...") result = asyncio.run(crawl_daily(db, target, top_n)) - # 未指定日期且今天无数据时,自动回退到昨天 - if not date_str and result["status"] == "success" and result["found"] == 0: + # 未指定日期且今天失败或无数据时,自动回退到昨天 + need_fallback = not date_str and ( + result["status"] == "failed" or result["found"] == 0 + ) + if need_fallback: fallback = yesterday_str() existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0 if existing > 0: @@ -84,6 +88,11 @@ def summarize( "--pdf-mode", help="PDF 传递方式:auto(自动选择)| inject(全量注入)| search(pi 自主搜索)", ), + backend: str = typer.Option( + None, + "--backend", + help="总结后端:pi | claude(留空则使用 .env 配置)", + ), ): """手动触发 AI 总结。""" from app.config import settings @@ -97,9 +106,22 @@ def summarize( typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True) raise typer.Exit(code=1) + if backend: + if backend not in ("pi", "claude"): + typer.echo(f"❌ 无效的 backend: {backend},只支持 pi / claude", err=True) + raise typer.Exit(code=1) + settings.SUMMARY_BACKEND = backend + os.makedirs(settings.db_path.parent, exist_ok=True) _init(engine) + # 配置 logging 输出到终端 + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)-5s %(name)s | %(message)s", + datefmt="%H:%M:%S", + ) + db = SessionLocal() try: if arxiv_id: diff --git a/app/config.py b/app/config.py index c6ddc18..09d4b44 100644 --- a/app/config.py +++ b/app/config.py @@ -29,8 +29,10 @@ class Settings(BaseSettings): HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1" # AI 总结 + SUMMARY_BACKEND: str = "pi" # "pi" | "claude" PI_BIN: str = "" SUMMARY_SKILL: str = "daily-paper-summary" + CLAUDE_BIN: str = "claude" SUMMARY_CONCURRENCY: int = 3 SUMMARY_TIMEOUT_SECONDS: int = 1200 SUMMARY_MAX_RETRIES: int = 2 diff --git a/app/services/claude_backend.py b/app/services/claude_backend.py new file mode 100644 index 0000000..76ac8f8 --- /dev/null +++ b/app/services/claude_backend.py @@ -0,0 +1,84 @@ +"""Claude CLI 后端 — 调用 claude CLI 子进程生成总结。 + +和 pi_client.py 对称的接口,复用 prompt 构建、PDF 文本提取、JSON 提取逻辑。 +""" + +from __future__ import annotations + +import asyncio +import logging +import uuid + +from app.config import settings + +logger = logging.getLogger(__name__) + + +class ClaudeTimeoutError(Exception): + pass + + +class ClaudeProcessError(Exception): + def __init__(self, returncode: int, stderr: str): + self.returncode = returncode + self.stderr = stderr + super().__init__(f"claude exited with code {returncode}: {stderr[:500]}") + + +async def call_claude( + prompt: str, + session_id: str | None = None, + fix_errors: list[str] | None = None, +) -> tuple[str, str]: + """调用 claude CLI print 模式,返回 (stdout 文本, session_id)。 + + 和 call_pi() 对称的接口,但 claude CLI 不需要文件路径和 pdf_mode—— + 所有内容已在 prompt 中准备好。 + + Args: + prompt: 完整的 prompt 文本 + session_id: session ID(首次为 None 时自动生成) + fix_errors: 上一轮验证错误列表(用于重试) + """ + if session_id is None: + session_id = f"claude-summary-{uuid.uuid4().hex[:8]}" + + cmd = [settings.CLAUDE_BIN, "-p", "--output-format", "text"] + + if fix_errors and session_id: + # 重试:延续 session + cmd += ["--session-id", session_id, "--continue"] + else: + cmd += ["--session-id", session_id] + + cmd.append(prompt) + + logger.info( + "Calling claude (session=%s, fix=%s)", + session_id, + bool(fix_errors), + ) + + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for( + proc.communicate(), + timeout=settings.SUMMARY_TIMEOUT_SECONDS, + ) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise ClaudeTimeoutError( + f"claude timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s" + ) + + if proc.returncode != 0: + raise ClaudeProcessError( + proc.returncode, stderr.decode("utf-8", errors="replace") + ) + + return stdout.decode("utf-8", errors="replace"), session_id diff --git a/app/services/crawler.py b/app/services/crawler.py index feb67c0..5942373 100644 --- a/app/services/crawler.py +++ b/app/services/crawler.py @@ -83,7 +83,7 @@ def _parse_paper(item: dict) -> dict: "upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0), "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "", "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "", - "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "", + "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else "", "authors": [ a.get("name", a) if isinstance(a, dict) else a for a in paper_info.get("authors", []) diff --git a/app/services/pdf_downloader.py b/app/services/pdf_downloader.py index 96da241..7b49ac0 100644 --- a/app/services/pdf_downloader.py +++ b/app/services/pdf_downloader.py @@ -3,10 +3,13 @@ from __future__ import annotations import logging +import os import shutil from pathlib import Path -from app.utils import PAPERS_DIR, TMP_DIR, make_http_client +import requests + +from app.utils import PAPERS_DIR, TMP_DIR logger = logging.getLogger(__name__) @@ -31,6 +34,22 @@ def tmp_dir(arxiv_id: str) -> Path: # ── PDF 下载 ──────────────────────────────────────────────────────────── +# 复用 TCP 连接的 session +_http_session: requests.Session | None = None + + +def _get_session() -> requests.Session: + global _http_session + if _http_session is None: + _http_session = requests.Session() + _http_session.headers.update({"User-Agent": "hf-daily-papers/1.0"}) + # 代理:优先 $PROXY_SERVER,其次 settings.http_proxy + proxy = os.environ.get("PROXY_SERVER") + if proxy: + _http_session.proxies = {"http": proxy, "https": proxy} + logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy) + return _http_session + async def download_pdf(arxiv_id: str, pdf_url: str) -> Path: """下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。""" @@ -42,10 +61,10 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path: dest = dest_dir / "paper.pdf" try: - async with make_http_client(follow_redirects=True) as client: - resp = await client.get(pdf_url) - resp.raise_for_status() - dest.write_bytes(resp.content) + session = _get_session() + resp = session.get(pdf_url, timeout=120, allow_redirects=True) + resp.raise_for_status() + dest.write_bytes(resp.content) except Exception as exc: raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc diff --git a/app/services/pdf_image_extractor.py b/app/services/pdf_image_extractor.py index 384171c..0894716 100644 --- a/app/services/pdf_image_extractor.py +++ b/app/services/pdf_image_extractor.py @@ -1,12 +1,12 @@ -"""PDF 图片与表格提取 — 基于 caption 定位的页面区域截图。 +"""PDF 图片与表格提取 — 基于 pymupdf4llm layout analysis。 -核心思路:学术论文排版极其规整,Figure caption 在图下方,Table caption 在表格上方。 -因此反过来:先找 caption 文字 → 向上/向下截取页面区域 → 渲染为 PNG。 +用 pymupdf4llm 的 layout analysis 检测 table / picture 区域, +再通过 caption 文字匹配确定 Figure/Table 编号,渲染为 JPEG。 -优势(相比提取嵌入位图): -- 复合图表不会被拆成碎片(整块截取) -- 矢量图也能截取(页面渲染包含一切) -- 不依赖 find_tables()(纯文本匹配 caption) +相比旧方案(caption 正则 + pdfplumber/find_tables/文本块扫描三套策略): +- layout analysis 直接给出区域 bbox,不存在相邻表格互相侵入的问题 +- 无需手动调参(最大高度、间隙阈值等) +- 页面级 caption 匹配:每个 caption 只分配给最近的 box,避免上下相邻表格抢夺同一个 caption """ from __future__ import annotations @@ -16,40 +16,18 @@ import logging import re from pathlib import Path +import pymupdf +import pymupdf4llm.helpers.document_layout as dl + from app.services.pdf_downloader import paper_dir from app.utils import TMP_DIR logger = logging.getLogger(__name__) -# ── 截取区域参数 ─────────────────────────────────────────────────────── - -# Figure: caption 上方搜索图的范围(点) -_FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围 -_FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度 -_FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度 - -# Table: caption 下方搜索表格的范围 -_TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围 -_TABLE_MIN_HEIGHT = 30 - -# caption 左右扩展(双栏论文中 caption 可能比表格窄) -_REGION_SIDE_PADDING = 10 -# 表格通常比 caption 文字宽,使用更大的水平扩展 -_TABLE_SIDE_PADDING = 60 - -# 正文行距的 ~1.5 倍 ≈ 空白间隙阈值(学术论文紧密排版,30pt 太宽松) -_CONTENT_GAP_THRESHOLD = 20 -# 密集表格数据块后的过渡阈值:表格块之后的段落间距常只有 12-18pt -_TABLE_DATA_GAP_THRESHOLD = 12 - - # ── Caption 正则 ─────────────────────────────────────────────────────── -# 要求以 Figure/Table 开头(避免匹配正文中的 "see Figure 3" 等) -# 支持三种 caption 格式: -# "Figure 1: Title" / "Figure 1. Title" / "Figure 1 Title"(无标点,空格分隔) -# 第三种需要后续紧跟大写字母(排除 "Figure 1 shows..." 等正文引用) -_CAPTION_RE = re.compile( +# 用于从 caption 文字中提取 Figure/Table 编号 +_FIGURE_CAPTION_RE = re.compile( r"^(?:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))", re.IGNORECASE, ) @@ -58,489 +36,471 @@ _TABLE_CAPTION_RE = re.compile( re.IGNORECASE, ) -# ── 停止信号:表格边界检测遇到以下内容时立即停止 ── - -# 下一个 Figure/Table caption(如 "Table 2:" "Figure 3:" "Figure 4 Title") -_CAPTION_STOP_RE = re.compile( - r"^(?:Table|Fig\.?|Figure)\s+\d+\s*(?:[:\.]\s*|\s+[A-Z])", - re.IGNORECASE, -) -# Section header(如 "6.2 Evolution" "D.1 Dependency" "7 Conclusion") -_SECTION_STOP_RE = re.compile( - r"^(\d{1,2}(?:\.\d+)?\s+[A-Z][a-z]|[A-Z]\.\d+\s+[A-Z][a-z])" -) +# caption 与 table/picture 的最大匹配距离(点) +_CAPTION_MATCH_DISTANCE = 100 +# 截图区域的外边距 +_REGION_PADDING = 5 +# 3x 渲染,保证清晰度 +_RENDER_ZOOM = 3 +# 相邻 box 聚类间距(点)— 同一 figure/table 的碎片间距通常 < 15pt +_CLUSTER_GAP = 15 -def _estimate_column_x(caption: dict) -> tuple[float, float]: - """估计 caption 所在列的水平边界(col_x0, col_x1)。 +# ── Box 聚类 ───────────────────────────────────────────────────────── - 双栏论文中 caption 宽度远小于页面宽度,据此判断左右列。 - 单栏或跨栏 caption(宽度 >65% 页宽)返回整页宽度。 - caption 居中对齐(中心接近页面中线)时按跨栏处理,使用宽范围。 + +class _BoxCluster: + """合并后的布局区域(由一个或多个相邻 LayoutBox 组成)。 + + pymupdf4llm 有时将一个大图拆成多个小 picture box(如视频帧网格), + 聚类后用整体 bbox 作为渲染区域。 """ - pw = caption["page_width"] - caption_w = caption["caption_x1"] - caption["caption_x0"] - # caption 宽度 >65% 页宽 → 单栏或跨栏 - if caption_w > pw * 0.65: - return 0, pw + __slots__ = ("x0", "y0", "x1", "y1", "boxclass") - cx = (caption["caption_x0"] + caption["caption_x1"]) / 2 - - # caption 居中(中心距页面中线 <8%)→ 可能是跨栏表格,使用宽范围 - if abs(cx - pw / 2) / pw < 0.08: - return ( - max(0, caption["caption_x0"] - _TABLE_SIDE_PADDING * 2), - min(pw, caption["caption_x1"] + _TABLE_SIDE_PADDING * 2), - ) - - if cx < pw / 2: - return 0, pw / 2 - else: - return pw / 2, pw + def __init__(self, boxes: list): + self.x0 = min(b.x0 for b in boxes) + self.y0 = min(b.y0 for b in boxes) + self.x1 = max(b.x1 for b in boxes) + self.y1 = max(b.y1 for b in boxes) + # table-fallback 归一化为 table(layout model 检测到表格但无法提取结构) + raw = boxes[0].boxclass + self.boxclass = "table" if raw == "table-fallback" else raw -def _find_captions(doc) -> list[dict]: - """扫描整个文档,找到所有 Figure/Table caption 的位置和信息。""" - captions = [] +def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]: + """将相邻的同类型 box 合并为聚类。 - for page_num in range(len(doc)): - page = doc[page_num] - page_width = page.rect.width - page_height = page.rect.height - blocks = page.get_text("blocks") - - for block in blocks: - if len(block) < 5: - continue - text = str(block[4]).strip() - if not text: - continue - - bx0, by0, bx1, by1 = block[0], block[1], block[2], block[3] - # 只取 block 第一行做匹配(避免 block 包含多段文字干扰) - first_line = text.split("\n")[0].strip() - - m = _CAPTION_RE.match(first_line) - if m: - captions.append( - { - "type": "figure", - "num": int(m.group(1)), - "label": f"Figure {m.group(1)}", - "page_num": page_num, - "caption_y0": by0, - "caption_y1": by1, - "caption_x0": bx0, - "caption_x1": bx1, - "caption_text": text, - "page_width": page_width, - "page_height": page_height, - } - ) - continue - - m = _TABLE_CAPTION_RE.match(first_line) - if m: - captions.append( - { - "type": "table", - "num": int(m.group(1)), - "label": f"Table {m.group(1)}", - "page_num": page_num, - "caption_y0": by0, - "caption_y1": by1, - "caption_x0": bx0, - "caption_x1": bx1, - "caption_text": text, - "page_width": page_width, - "page_height": page_height, - } - ) - - return captions - - -def _find_figure_top(page, caption: dict) -> float: - """向上扫描页面,找到 Figure 的上边界。 - - 策略: - 1. 优先用嵌入图片定位 — 收集 caption 上方所有相关图片 bbox, - 按 Y 轴聚类后取最大簇的最小 y 作为上界(处理 subfigure 组合图) - 2. 无图片时回退到文本块间隙检测(处理纯矢量图如 TikZ/matplotlib PDF) + 用 union-find 将间距 ≤ gap 的同类型 box 归为一组, + 每组生成一个 _BoxCluster(整体 bbox)。 """ - caption_y = caption["caption_y0"] - col_x0, col_x1 = _estimate_column_x(caption) - cx0 = max(col_x0, caption["caption_x0"] - _REGION_SIDE_PADDING) - cx1 = min(col_x1, caption["caption_x1"] + _REGION_SIDE_PADDING) - - # 同页上方最近的 Figure/Table caption(多 figure 同页时截断) - _caption_cutoff: float | None = None - for b in page.get_text("blocks"): - if len(b) < 5: - continue - by0, by1 = b[1], b[3] - if by1 >= caption_y or by1 <= caption_y - _FIGURE_MAX_HEIGHT: - continue - first_line = str(b[4]).strip().split("\n")[0].strip() - if _CAPTION_STOP_RE.match(first_line): - _caption_cutoff = by0 - break - - # ── 策略 1:嵌入图片聚类定位 ── - # 收集 caption 上方搜索范围内所有与 caption 水平区域重叠的图片 - image_tops: list[float] = [] - for img_info in page.get_image_info(): - bbox = img_info.get("bbox") - if bbox is None: - continue - if hasattr(bbox, "x0"): - ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 - else: - ix0, iy0, ix1, iy1 = bbox[0], bbox[1], bbox[2], bbox[3] - - # 图片底部必须在 caption 上方、且在搜索范围内 - if not (iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT): - continue - # 图片水平范围与 caption 所在列有重叠 - if not (ix1 > cx0 and ix0 < cx1): - continue - # 跳过属于上方另一个 figure 的图片 - if _caption_cutoff is not None and iy0 < _caption_cutoff: - continue - # 跳过极小图标(宽度或高度 <15pt,通常是 logo/符号) - if (ix1 - ix0) < 15 or (iy1 - iy0) < 15: - continue - - image_tops.append(iy0) - - if image_tops: - # 聚类:将 Y 轴接近的图片视为同一组(subfigure),最大簇的最小 y 即图上界 - image_tops.sort() - # 用简单单遍聚类:相邻图片 top 差 < 最大高度的 40% 视为同簇 - cluster_gap = _FIGURE_MAX_HEIGHT * 0.4 - clusters: list[list[float]] = [[image_tops[0]]] - for yt in image_tops[1:]: - if yt - clusters[-1][-1] < cluster_gap: - clusters[-1].append(yt) - else: - clusters.append([yt]) - # 取最大簇(图片数最多的)的最小 y - biggest = max(clusters, key=len) - figure_top = min(biggest) - else: - # ── 策略 2:文本块间隙检测(纯矢量图) ── - above_blocks: list[tuple[float, float, float, float]] = [] - for b in page.get_text("blocks"): - if len(b) < 5: - continue - bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3] - if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT: - if bx1 > cx0 and bx0 < cx1: - if col_x0 > 0 and bx0 < col_x0 - _REGION_SIDE_PADDING * 2: - continue - above_blocks.append((bx0, by0, bx1, by1)) - - if not above_blocks: - return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT) - - above_blocks.sort(key=lambda b: b[1], reverse=True) - prev_bottom = caption_y - for b in above_blocks: - if prev_bottom - b[3] > _CONTENT_GAP_THRESHOLD: - figure_top = prev_bottom - 5 - break - prev_bottom = b[1] - else: - figure_top = above_blocks[-1][1] - - # 同页 caption 截断 - if _caption_cutoff is not None: - figure_top = max(figure_top, _caption_cutoff + 5) - - # 限制最大高度 - if caption_y - figure_top > _FIGURE_MAX_HEIGHT: - figure_top = caption_y - _FIGURE_MAX_HEIGHT - - return max(0, figure_top) - - -def _find_figure_horizontal( - page, caption: dict, top: float, bottom: float -) -> tuple[float, float]: - """确定 Figure 的水平裁剪范围。 - - 取 caption 宽度和图片实际宽度的并集,避免截断比 caption 更宽的图。 - """ - pw = caption["page_width"] - x0 = caption["caption_x0"] - x1 = caption["caption_x1"] - - # 收集裁剪区域内所有嵌入图片的水平范围 - col_x0, col_x1 = _estimate_column_x(caption) - for img_info in page.get_image_info(): - bbox = img_info.get("bbox") - if bbox is None: - continue - if hasattr(bbox, "x0"): - ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 - else: - ix0, iy0, ix1, iy1 = bbox[0], bbox[1], bbox[2], bbox[3] - # 图片在裁剪区域内且在 caption 所在列 - if iy0 < bottom and iy1 > top and ix1 > col_x0 and ix0 < col_x1: - if (ix1 - ix0) < 15: - continue # 跳过小图标 - x0 = min(x0, ix0) - x1 = max(x1, ix1) - - return max(0, x0 - _REGION_SIDE_PADDING), min(pw, x1 + _REGION_SIDE_PADDING) - - -def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]: - """向下扫描页面,找到 Table 的下边界和水平范围。 - - 返回: (x0, bottom, x1) — 裁剪区域的左、下、右边界。 - 上边界由调用方根据 caption 位置确定。 - - 策略: - 1. 用 page.find_tables() 收集 caption 下方所有相邻的表格段,合并为一个完整区域 - (学术论文表格常被拆成表头行 + 数据行等多个 find_tables 段) - 2. 未命中时回退到文本块间隙检测 - """ - caption_y = caption["caption_y1"] # caption 底部作为扫描起点 - caption_x0 = caption["caption_x0"] - caption_x1 = caption["caption_x1"] - page_width = caption["page_width"] - - # ── 策略 1: find_tables() 结构化检测 + 合并相邻段 ── - try: - tables = page.find_tables() - except Exception: - tables = None - - if tables and tables.tables: - # 确定 caption 所在栏的范围(防止双栏论文中跨栏收集) - col_x0, col_x1 = _estimate_column_x(caption) - - # 收集 caption 下方附近且在同一栏内的表格段 bbox - segments: list[tuple[float, float, float, float]] = [] - for t in tables.tables: - tb = t.bbox - if isinstance(tb, (list, tuple)): - tx0, ty0, tx1, ty1 = ( - float(tb[0]), - float(tb[1]), - float(tb[2]), - float(tb[3]), - ) - else: - tx0, ty0, tx1, ty1 = ( - float(tb.x0), - float(tb.y0), - float(tb.x1), - float(tb.y1), - ) - - # 表格段上边在 caption 底部附近,且与 caption 同栏 - if ( - ty0 >= caption_y - 5 - and ty0 < caption_y + 200 - and tx1 > col_x0 - and tx0 < col_x1 - ): - segments.append((tx0, ty0, tx1, ty1)) - - if segments: - # 按 y 排序,合并相邻段(gap < 30pt 视为同一表格的连续部分) - segments.sort(key=lambda s: s[1]) - merged: list[tuple[float, float, float, float]] = [segments[0]] - for seg in segments[1:]: - prev = merged[-1] - gap = seg[1] - prev[3] # 当前段 top - 上一段 bottom - if gap < 30: - # 合并:取并集范围 - merged[-1] = ( - min(prev[0], seg[0]), - min(prev[1], seg[1]), - max(prev[2], seg[2]), - max(prev[3], seg[3]), - ) - else: - merged.append(seg) - - # 取第一个合并段(最靠近 caption 的完整表格) - final = merged[0] - tx0, ty0, tx1, ty1 = final - - # 限制最大高度 - if ty1 - caption_y > _TABLE_MAX_HEIGHT: - ty1 = caption_y + _TABLE_MAX_HEIGHT - x0 = max(0, min(caption_x0, tx0) - _REGION_SIDE_PADDING) - x1 = min(page_width, max(caption_x1, tx1) + _REGION_SIDE_PADDING) - logger.debug( - "Table detected by find_tables() (%d segments merged): " - "(%.0f,%.0f)-(%.0f,%.0f)", - len(segments), - x0, - caption_y, - x1, - ty1, - ) - return (x0, caption["caption_y0"], ty1, x1) - - # ── 策略 2: 回退到文本块间隙检测 ── - x0, t_top, t_bottom, x1 = _find_table_region_by_blocks(page, caption) - return (x0, t_top, t_bottom, x1) - - -def _scan_blocks_direction( - blocks: list, - start_y: float, - col_x0: float, - col_x1: float, - direction: int, - max_range: float, -) -> list[tuple[float, float, float, float]]: - """从 start_y 向上(direction=-1)或向下(direction=1)扫描文本块。 - - 收集间隙连续的块,遇到 stop 信号(caption / section header)或大间隙即停。 - 用 current_top/current_bottom 追踪连通区域边界,正确处理 y 重叠块。 - - Returns: - 收集到的块列表 [(x0, y0, x1, y1), ...] - """ - # 过滤在扫描范围内的块 - if direction > 0: # 向下 - candidates = [ - b - for b in blocks - if len(b) >= 5 - and b[1] > start_y - and b[1] < start_y + max_range - and b[2] > col_x0 - and b[0] < col_x1 - ] - candidates.sort(key=lambda b: b[1]) # 按 y0 升序 - else: # 向上 - candidates = [ - b - for b in blocks - if len(b) >= 5 - and b[3] <= start_y - and b[1] > start_y - max_range - and b[2] > col_x0 - and b[0] < col_x1 - ] - candidates.sort(key=lambda b: b[3], reverse=True) # 按 y1 降序(底部离 start_y 最近的在前) - - if not candidates: + if not boxes: return [] - # 从 start_y 开始,追踪连通区域边界 - connected: list[tuple[float, float, float, float]] = [] - boundary = start_y # 当前连通区域离 start_y 最近端的 y 坐标 - prev_was_dense_table = False + n = len(boxes) + parent = list(range(n)) - for b in candidates: + def find(x: int) -> int: + while parent[x] != x: + parent[x] = parent[parent[x]] + x = parent[x] + return x + + def union(a: int, b: int) -> None: + ra, rb = find(a), find(b) + if ra != rb: + parent[ra] = rb + + for i in range(n): + bi = boxes[i] + for j in range(i + 1, n): + bj = boxes[j] + if bi.boxclass != bj.boxclass: + continue + h_gap = max(0.0, max(bi.x0, bj.x0) - min(bi.x1, bj.x1)) + v_gap = max(0.0, max(bi.y0, bj.y0) - min(bi.y1, bj.y1)) + h_overlap = bi.x1 > bj.x0 - gap and bj.x1 > bi.x0 - gap + v_overlap = bi.y1 > bj.y0 - gap and bj.y1 > bi.y0 - gap + if (h_gap <= gap and v_overlap) or (v_gap <= gap and h_overlap): + union(i, j) + + groups: dict[int, list] = {} + for i in range(n): + groups.setdefault(find(i), []).append(boxes[i]) + + return [_BoxCluster(members) for members in groups.values()] + + +# ── 页面级 Caption 查找与匹配 ────────────────────────────────────────── + + +def _find_page_captions(page) -> list[dict]: + """查找页面上所有 Figure/Table caption 文字块。""" + blocks = page.get_text("blocks") + captions = [] + for b in blocks: + if len(b) < 5: + continue bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3] text = str(b[4]).strip() first_line = text.split("\n")[0].strip() - # stop 信号 - if _CAPTION_STOP_RE.match(first_line) or _SECTION_STOP_RE.match(first_line): - break - - # 检查当前块是否与连通区域相连(间隙 < 阈值) - if direction > 0: - gap = by0 - boundary + cap_type = None + m = _TABLE_CAPTION_RE.match(first_line) + if m: + cap_type = "table" else: - gap = boundary - by1 + m = _FIGURE_CAPTION_RE.match(first_line) + if m: + cap_type = "figure" + if m is None: + continue - # 密集表格数据块后使用更低的间隙阈值 - threshold = ( - _TABLE_DATA_GAP_THRESHOLD - if prev_was_dense_table - else _CONTENT_GAP_THRESHOLD + captions.append( + { + "label": f"{'Table' if cap_type == 'table' else 'Figure'} {m.group(1)}", + "type": cap_type, + "caption_text": text, + "caption_y0": by0, + "caption_y1": by1, + "caption_x0": bx0, + "caption_x1": bx1, + } ) - if gap > threshold: - break - - connected.append((bx0, by0, bx1, by1)) - - # 更新连通区域边界 - if direction > 0: - boundary = by1 # 向下扩展 - else: - boundary = min(boundary, by0) # 向上扩展 - - # 判断当前块是否为密集表格数据(行密度高) - lines = [l for l in text.split("\n") if l.strip()] - block_height = by1 - by0 - prev_was_dense_table = ( - len(lines) >= 4 - and block_height > 0 - and len(lines) / block_height >= 0.08 - ) - - return connected + return captions -def _find_table_region_by_blocks( - page, caption: dict -) -> tuple[float, float, float]: - """文本块间隙检测 — 作为 find_tables() 的 fallback。 +def _vertical_distance(cap_y0, cap_y1, box_y0, box_y1) -> float | None: + """计算 caption 到 box 的垂直距离。不邻接时返回 None。 - 向下扫描找表格下边界,向上扫描找表格上边界(处理 caption 在数据下方)。 - 使用 _scan_blocks_direction 统一双向扫描逻辑。 + 三种情况:caption 完全在 box 上方、完全在下方、与 box 有垂直重叠。 + 重叠(含部分溢出)视为 distance=0,确保 caption 延伸到 box 边界外时不会丢失。 """ - blocks = page.get_text("blocks") - caption_y0 = caption["caption_y0"] - caption_y1 = caption["caption_y1"] - caption_x0 = caption["caption_x0"] - caption_x1 = caption["caption_x1"] - page_width = caption["page_width"] - page_height = caption["page_height"] + # Caption 完全在 box 上方 + if cap_y1 <= box_y0: + dist = box_y0 - cap_y1 + return dist if dist <= _CAPTION_MATCH_DISTANCE else None + # Caption 完全在 box 下方 + if cap_y0 >= box_y1: + dist = cap_y0 - box_y1 + return dist if dist <= _CAPTION_MATCH_DISTANCE else None + # Caption 与 box 有垂直重叠(内部、部分溢出都算)→ 距离 0 + return 0 - col_x0, col_x1 = _estimate_column_x(caption) - # 向下扫描 - below = _scan_blocks_direction( - blocks, caption_y1, col_x0, col_x1, direction=1, max_range=_TABLE_MAX_HEIGHT - ) - # 向上扫描 - above = _scan_blocks_direction( - blocks, caption_y0, col_x0, col_x1, direction=-1, max_range=_TABLE_MAX_HEIGHT +def _same_column(cap: dict, box, page_width: float) -> bool: + """判断 caption 和 box 是否在同一列。 + + 双栏论文中左右栏间距有限,简单的水平重叠检查会跨列匹配。 + 策略:用中心 X 坐标判断各自在哪半边,只有同半边才算同列。 + 跨栏图表(caption 或 box 宽度 >65% 页宽)不受此限制。 + """ + cap_w = cap["caption_x1"] - cap["caption_x0"] + box_w = box.x1 - box.x0 + + # 跨栏元素:宽度超过页面的 65% + if cap_w > page_width * 0.65 or box_w > page_width * 0.65: + return True + + cap_cx = (cap["caption_x0"] + cap["caption_x1"]) / 2 + box_cx = (box.x0 + box.x1) / 2 + mid = page_width / 2 + + # 同在左半边或同在右半边 + return (cap_cx < mid) == (box_cx < mid) + + +def _match_captions_to_boxes( + page_boxes: list, captions: list[dict], page_width: float +) -> list[tuple[list[int], list[dict]]]: + """将 caption 分配给 box,允许一个 caption 匹配多个同类型 box。 + + 典型场景: + - Figure 由左右两个 picture box 组成,caption 同时靠近两者 + - Table 的视觉内容被 layout analysis 误分类为 picture,需要跨类型匹配 + + Returns: + [(box_indices, captions), ...] 每组是一个独立的渲染任务 + """ + # 每个 caption 找到所有距离在阈值内的 box + # 优先匹配同类型;如果找不到,再匹配任意 table/picture box + cap_to_boxes: dict[int, list[tuple[int, float]]] = {} + + for ci, cap in enumerate(captions): + same_type: list[tuple[int, float]] = [] + any_type: list[tuple[int, float]] = [] + expected = "table" if cap["type"] == "table" else "picture" + + for bi, box in enumerate(page_boxes): + # 列感知:双栏论文中只匹配同栏的 box + if not _same_column(cap, box, page_width): + continue + # 水平重叠检查(同列内仍需有重叠) + if not ( + cap["caption_x1"] > box.x0 - 5 and cap["caption_x0"] < box.x1 + 5 + ): + continue + dist = _vertical_distance( + cap["caption_y0"], cap["caption_y1"], box.y0, box.y1 + ) + if dist is None: + continue + entry = (bi, dist) + any_type.append(entry) + if box.boxclass == expected: + same_type.append(entry) + + # 优先用同类型匹配;没有时回退到任意类型;都没有则跳过 + if same_type: + cap_to_boxes[ci] = same_type + elif any_type: + cap_to_boxes[ci] = any_type + # else: 该 caption 无匹配 box,不加入 cap_to_boxes + + # 每个 caption → 最近的 box(用于分组),但记录所有匹配的 box + cap_primary: dict[int, int] = {} # caption → primary box index + cap_all_boxes: dict[int, list[int]] = {} # caption → all matched box indices + for ci, matches in cap_to_boxes.items(): + matches.sort(key=lambda x: x[1]) + cap_primary[ci] = matches[0][0] + # 所有距离最近的同组 box(距离差 < 20pt 视为同一组) + best_dist = matches[0][1] + cap_all_boxes[ci] = [bi for bi, d in matches if d <= best_dist + 20] + + # 按 primary box 分组 + box_to_caps: dict[int, list[int]] = {} + for ci, bi in cap_primary.items(): + box_to_caps.setdefault(bi, []).append(ci) + + # 构建渲染组:每个 caption 独立成组(共享 box 但各自渲染) + # 同类型同 label 的 caption 会合并;不同类型则分开 + used_captions: set[int] = set() + groups: list[tuple[list[int], list[dict]]] = [] + + for bi in sorted(box_to_caps.keys()): + cis = box_to_caps[bi] + for ci in cis: + if ci in used_captions: + continue + used_captions.add(ci) + + all_box_indices = set(cap_all_boxes.get(ci, [bi])) + # 只合并同 label 的 caption(同 figure/table 的重复 caption) + merged_captions = [captions[ci]] + for other_bi in all_box_indices: + if other_bi in box_to_caps: + for other_ci in box_to_caps[other_bi]: + if other_ci not in used_captions: + other_cap = captions[other_ci] + if other_cap["label"] == captions[ci]["label"]: + used_captions.add(other_ci) + merged_captions.append(other_cap) + groups.append((sorted(all_box_indices), merged_captions)) + + return groups + + +# ── 单页处理 ───────────────────────────────────────────────────────── + + +def _render_and_save( + page, + clip: pymupdf.Rect, + images_dest: Path, + manifest: dict, + label: str, + cap_type: str, + caption_text: str, + page_num_1based: int, + arxiv_id: str, +) -> bool: + """渲染页面区域并保存 JPEG,写入 manifest。成功返回 True。""" + mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM) + try: + pix = page.get_pixmap(matrix=mat, clip=clip) + except Exception: + logger.debug("Failed to render %s for %s", label, arxiv_id) + return False + + filename = f"{label.replace(' ', '_').lower()}.jpg" + (images_dest / filename).write_bytes(pix.tobytes("jpeg")) + + manifest[filename] = { + "page": page_num_1based, + "type": cap_type, + "label": label, + "caption_text": caption_text[:200] if caption_text else "", + "figures" if cap_type == "figure" else "tables": [label], + } + logger.debug( + "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) → %s", + label, + page_num_1based, + clip.x0, + clip.y0, + clip.x1, + clip.y1, + filename, ) + return True - # 确定上下边界 - scan_top = min(b[1] for b in above) if above else caption_y0 - scan_bottom = max(b[3] for b in below) if below else caption_y1 - top = scan_top - bottom = scan_bottom + 5 # 底部 padding +def _process_page( + doc, + page_idx: int, + page_layout, + images_dest: Path, + manifest: dict, + seen_labels: set, + arxiv_id: str, +) -> int: + """处理单页:caption 匹配 + orphan 兜底,返回本页提取数量。""" + page = doc[page_idx] + page_width = page.rect.width + page_num = page_idx + 1 + orphan_fig_counter = 0 + orphan_tbl_counter = 0 - if bottom - top > _TABLE_MAX_HEIGHT: - bottom = top + _TABLE_MAX_HEIGHT + # 收集本页的 table/picture box(跳过极小区域) + raw_boxes = [] + for box in page_layout.boxes: + if box.boxclass not in ("table", "table-fallback", "picture"): + continue + if (box.x1 - box.x0) < 20 or (box.y1 - box.y0) < 20: + continue + raw_boxes.append(box) - # 水平范围:caption + 所有纳入块 - all_blocks = above + below - if all_blocks: - content_x0 = min(caption_x0, min(b[0] for b in all_blocks)) - content_x1 = max(caption_x1, max(b[2] for b in all_blocks)) - else: - content_x0 = caption_x0 - content_x1 = caption_x1 + if not raw_boxes: + return 0 - x0 = max(0, content_x0 - _REGION_SIDE_PADDING) - x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING) + # 聚类:将同一 figure/table 的碎片 box 合并 + page_boxes = _cluster_boxes(raw_boxes) - return (x0, top, bottom, x1) + # 页面级匹配:查找所有 caption,分配给 box + captions = _find_page_captions(page) + groups = _match_captions_to_boxes(page_boxes, captions, page_width) + + # 只合并同 label 的 group(同一个 figure/table 的重复 caption) + # 不同 label 的 group 即使共享 box 也不合并(如 Figure 7 和 Figure 8), + # 渲染时用 caption 位置切割区域 + _merged_groups: set[int] = set() + merged_groups: list[tuple[list[int], list[dict]]] = [] + for gi, (box_indices, caps) in enumerate(groups): + if gi in _merged_groups: + continue + this_labels = {c["label"] for c in caps} + all_box_set = set(box_indices) + merge_targets = {gi} + for other_gi, (other_bi, other_caps) in enumerate(groups): + if other_gi <= gi or other_gi in _merged_groups: + continue + other_labels = {c["label"] for c in other_caps} + # 只在 label 有交集时合并(同一个 figure/table) + if this_labels & other_labels and all_box_set & set(other_bi): + merge_targets.add(other_gi) + all_box_set |= set(other_bi) + all_caps = [] + for mgi in sorted(merge_targets): + _merged_groups.add(mgi) + all_caps.extend(groups[mgi][1]) + merged_groups.append((sorted(all_box_set), all_caps)) + groups = merged_groups + + # ── 阶段 1:渲染有 caption 匹配的图/表 ── + matched_box_indices: set[int] = set() + extracted = 0 + + for box_indices, caps in groups: + matched_box_indices.update(box_indices) + + # 去重同一 label,跳过已处理的 + unique_caps = [] + for cap in caps: + if cap["label"] not in seen_labels: + seen_labels.add(cap["label"]) + unique_caps.append(cap) + if not unique_caps: + continue + + # 合并所有关联 box 的 bbox + bx0 = min(page_boxes[i].x0 for i in box_indices) + by0 = min(page_boxes[i].y0 for i in box_indices) + bx1 = max(page_boxes[i].x1 for i in box_indices) + by1 = max(page_boxes[i].y1 for i in box_indices) + + # 渲染区域:box + caption + all_cap_y0 = min(c["caption_y0"] for c in unique_caps) + all_cap_y1 = max(c["caption_y1"] for c in unique_caps) + all_cap_x0 = min(c["caption_x0"] for c in unique_caps) + all_cap_x1 = max(c["caption_x1"] for c in unique_caps) + + top = max(0, min(by0, all_cap_y0) - _REGION_PADDING) + bottom = max(by1, all_cap_y1) + _REGION_PADDING + rx0 = max(0, min(bx0, all_cap_x0) - _REGION_PADDING) + rx1 = min(page_width, max(bx1, all_cap_x1) + _REGION_PADDING) + + clip = pymupdf.Rect(rx0, top, rx1, bottom) + # 多个 caption 可能共享同一区域(如 subfigure),只需渲染一次 + jpeg_bytes = None + for cap in unique_caps: + if jpeg_bytes is None: + if not _render_and_save( + page, + clip, + images_dest, + manifest, + cap["label"], + cap["type"], + cap["caption_text"], + page_num, + arxiv_id, + ): + break + # 读取刚写入的 bytes 供后续同名 caption 复用 + filename = f"{cap['label'].replace(' ', '_').lower()}.jpg" + jpeg_bytes = (images_dest / filename).read_bytes() + extracted += 1 + else: + # 同区域的不同 caption(如 subfigure),复用图片 + filename = f"{cap['label'].replace(' ', '_').lower()}.jpg" + (images_dest / filename).write_bytes(jpeg_bytes) + cap_preview = cap["caption_text"][:200] + manifest[filename] = { + "page": page_num, + "type": cap["type"], + "label": cap["label"], + "caption_text": cap_preview, + "figures" if cap["type"] == "figure" else "tables": [cap["label"]], + } + extracted += 1 + + # ── 阶段 2:渲染无 caption 匹配的图/表(orphan boxes) ── + orphan_indices = set(range(len(page_boxes))) - matched_box_indices + for bi in sorted(orphan_indices): + box = page_boxes[bi] + cap_type = "figure" if box.boxclass == "picture" else "table" + + if cap_type == "figure": + orphan_fig_counter += 1 + label = f"Figure (p{page_num}-{orphan_fig_counter})" + else: + orphan_tbl_counter += 1 + label = f"Table (p{page_num}-{orphan_tbl_counter})" + + if label in seen_labels: + continue + seen_labels.add(label) + + clip = pymupdf.Rect( + max(0, box.x0 - _REGION_PADDING), + max(0, box.y0 - _REGION_PADDING), + min(page_width, box.x1 + _REGION_PADDING), + box.y1 + _REGION_PADDING, + ) + if _render_and_save( + page, + clip, + images_dest, + manifest, + label, + cap_type, + "", + page_num, + arxiv_id, + ): + extracted += 1 + + return extracted + + +# ── 核心提取 ─────────────────────────────────────────────────────────── def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: """从 PDF 提取 Figure/Table 截图,生成 manifest。 - 策略:找 caption → 定位区域 → 渲染页面截图。 + 用 pymupdf4llm layout analysis 检测 table/picture 区域, + 再通过 caption 文字确定编号,渲染为 JPEG。 Args: arxiv_id: 论文 ID @@ -549,8 +509,6 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: Returns: 提取的图片数量 """ - import pymupdf - if pdf_path is None: pdf_path = TMP_DIR / arxiv_id / "paper.pdf" @@ -561,7 +519,7 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: images_dest = paper_dir(arxiv_id) / "images" images_dest.mkdir(parents=True, exist_ok=True) - # 清理上次提取的旧图片,避免残留(同时清理 .png 和 .jpg) + # 清理上次提取的旧图片 for old_file in images_dest.iterdir(): if old_file.suffix.lower() in (".png", ".jpg", ".jpeg"): old_file.unlink() @@ -569,94 +527,43 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: (images_dest / "manifest.json").unlink() doc = pymupdf.open(str(pdf_path)) - captions = _find_captions(doc) - if not captions: - logger.info("No Figure/Table captions found in PDF for %s", arxiv_id) + # layout analysis + try: + parsed = dl.parse_document( + doc, filename=str(pdf_path), use_ocr=dl.OCRMode.NEVER + ) + except Exception: + logger.warning( + "pymupdf4llm layout analysis failed for %s", arxiv_id, exc_info=True + ) doc.close() return 0 - # 去重:同一页同一 label 可能匹配到多个 block(如正文引用 "Figure 7") - # 保留每个 (type, num) 的第一个匹配(即真正的 caption) - seen_labels: dict[str, dict] = {} - for cap in captions: - key = cap["label"] - if key not in seen_labels: - seen_labels[key] = cap - - unique_captions = list(seen_labels.values()) extracted = 0 manifest: dict[str, dict] = {} + seen_labels: set[str] = set() - zoom = 3 # 3x 渲染,保证清晰度 - - for cap in unique_captions: - page = doc[cap["page_num"]] - - if cap["type"] == "figure": - # Figure: caption 上方是图 → 向上找图的上边界 - top = _find_figure_top(page, cap) - # 上方多留 5pt 边距,确保图框边框、装饰线等不被截断 - top = max(0, top - 5) - bottom = cap["caption_y1"] + 5 # 包含 caption - # 水平范围:取 caption 宽度和图片实际宽度的并集 - x0, x1 = _find_figure_horizontal(page, cap, top, bottom) - - height = bottom - top - if height < _FIGURE_MIN_HEIGHT: - logger.debug( - "Figure %s too small (%.0fpt), skipping", cap["label"], height - ) - continue - - else: - # Table: 找表格区域(find_tables() → 块级 fallback,双向扫描) - x0, tbl_top, bottom, x1 = _find_table_region(page, cap) - top = max(0, tbl_top - 5) # 包含 caption 及上方数据,留 5pt margin - - height = bottom - top - if height < _TABLE_MIN_HEIGHT: - logger.debug( - "Table %s too small (%.0fpt), skipping", cap["label"], height - ) - continue - - # 渲染截取 - clip = pymupdf.Rect(x0, top, x1, bottom) - mat = pymupdf.Matrix(zoom, zoom) + for page_idx, page_layout in enumerate(parsed.pages): try: - pix = page.get_pixmap(matrix=mat, clip=clip) + extracted += _process_page( + doc, + page_idx, + page_layout, + images_dest=images_dest, + manifest=manifest, + seen_labels=seen_labels, + arxiv_id=arxiv_id, + ) except Exception: - logger.debug("Failed to render %s region for %s", cap["label"], arxiv_id) + logger.warning( + "Failed to process page %d for %s", + page_idx + 1, + arxiv_id, + exc_info=True, + ) continue - # 保存为 JPEG(比 PNG 小 5-10 倍,适合网络传输) - filename = f"{cap['label'].replace(' ', '_').lower()}.jpg" - jpeg_path = images_dest / filename - jpeg_bytes = pix.tobytes("jpeg") - jpeg_path.write_bytes(jpeg_bytes) - extracted += 1 - - cap_preview = cap["caption_text"][:200] if cap["caption_text"] else "" - manifest[filename] = { - "page": cap["page_num"] + 1, - "type": cap["type"], - "label": cap["label"], - "caption_text": cap_preview, - "figures" if cap["type"] == "figure" else "tables": [cap["label"]], - } - logger.debug( - "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s", - cap["label"], - cap["page_num"] + 1, - x0, - top, - x1, - bottom, - height, - filename, - ) - doc.close() # 保存 manifest @@ -665,17 +572,17 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: if extracted > 0: logger.info( - "Extracted %d figure/table screenshots from PDF for %s " - "(from %d captions found, %d unique)", + "Extracted %d figure/table screenshots from PDF for %s", extracted, arxiv_id, - len(captions), - len(unique_captions), ) return extracted +# ── 按 summary 过滤 ──────────────────────────────────────────────────── + + def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int: """根据 summary 中的 figures 字段过滤提取的图片/表格。 diff --git a/app/services/pi_client.py b/app/services/pi_client.py index 0a7f68c..5dce8f7 100644 --- a/app/services/pi_client.py +++ b/app/services/pi_client.py @@ -1,17 +1,38 @@ -"""pi CLI 调用与 JSON 提取 — 调用 pi 生成总结,从输出中提取结构化 JSON。""" +"""pi CLI 后端 — 调用 pi 子进程生成总结。 + +通用工具函数(prompt 构建、PDF 提取、JSON 提取、meta.json)已移至 summary_utils.py。 +""" from __future__ import annotations import asyncio -import json import logging -import re +import uuid from pathlib import Path from app.config import settings +from app.services.summary_utils import ( + JsonNotFoundError, + build_prompt, + extract_json, + extract_pdf_text, + write_meta_json, +) logger = logging.getLogger(__name__) +# 重新导出,保持向后兼容 +__all__ = [ + "PiTimeoutError", + "PiProcessError", + "JsonNotFoundError", + "call_pi", + "write_meta_json", + "extract_pdf_text", + "build_prompt", + "extract_json", +] + # ── 自定义异常 ────────────────────────────────────────────────────────── @@ -27,201 +48,6 @@ class PiProcessError(Exception): super().__init__(f"pi exited with code {returncode}: {stderr[:500]}") -class JsonNotFoundError(Exception): - pass - - -# ── meta.json ─────────────────────────────────────────────────────────── - - -def write_meta_json(paper) -> Path: - """写入 data/papers/{arxiv_id}/meta.json,返回路径。""" - from app.services.pdf_downloader import paper_dir - - d = paper_dir(paper.arxiv_id) - d.mkdir(parents=True, exist_ok=True) - meta_path = d / "meta.json" - - authors = [a.name for a in paper.authors] - tags = [t.tag for t in paper.tags] - meta = { - "arxiv_id": paper.arxiv_id, - "title_en": paper.title_en, - "abstract": paper.abstract or "", - "published_at": paper.published_at.isoformat() if paper.published_at else None, - "authors": authors, - "tags": tags, - "upvotes": paper.upvotes, - } - meta_path.write_text( - json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8" - ) - return meta_path - - -# ── PDF 文本提取 ──────────────────────────────────────────────────────── - - -def _trim_body(text: str, max_chars: int | None = None) -> str: - """去除参考文献,保留正文+附录,超长时从末尾截断。 - - 策略: - 1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用) - 2. 正文 + 附录全部保留 - 3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文) - """ - import re - - # 找 References 段落的位置(在 Appendix 之后的那个) - # 简单策略:找到 References 标题,如果后面没有 Appendix 就全删 - # 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容 - ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text) - if ref_match: - ref_start = ref_match.start() - # 看 References 之后有没有 Appendix - after_ref = text[ref_start:] - app_match = re.search( - r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref - ) - if app_match: - # References 之后有 Appendix:只删 References 段 - ref_end = ref_start + app_match.start() - text = text[:ref_start] + text[ref_end:] - else: - # References 之后没有 Appendix:删掉从 References 到结尾 - text = text[:ref_start].rstrip() - - # 去掉 Acknowledgments(对解读无用) - ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text) - if ack_match: - # 只删 Acknowledgments 本身,不删后面的内容 - next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():]) - if next_section: - text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():] - else: - text = text[:ack_match.start()].rstrip() - - # 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文) - if max_chars is not None and len(text) > max_chars: - text = text[:max_chars].rstrip() - - return text - - -def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path: - """用 pymupdf 提取 PDF 正文文本,保存为 .txt。 - - max_chars=None 时不截断,给 search/auto 模式保留完整内容。 - """ - import pymupdf - - txt_path = pdf_path.with_suffix(".txt") - if txt_path.exists(): - # 缓存优先;如果需重新提取(不同 max_chars),先删旧文件 - return txt_path - - doc = pymupdf.open(str(pdf_path)) - raw_text = "\n\n".join(page.get_text() for page in doc) - doc.close() - - body = _trim_body(raw_text, max_chars=max_chars) - txt_path.write_text(body, encoding="utf-8") - logger.info( - "Extracted PDF text: %s (%d -> %d chars, -%d%%)", - txt_path, - len(raw_text), - len(body), - (1 - len(body) / len(raw_text)) * 100 if raw_text else 0, - ) - return txt_path - - -# ── Prompt 构建 ───────────────────────────────────────────────────────── - - -def _build_prompt( - arxiv_id: str, - meta_path: Path, - txt_path: Path, - pdf_mode: str, - fix_errors: list[str] | None = None, -) -> str: - """根据模式构建 pi prompt。 - - inject: 全量注入,prompt 末尾包含论文全文内容 - search: pi 自主 read 文件,prompt 只包含工作流指令 - """ - json_schema = ( - "## 必须包含以下字段(不要自创字段名):\n" - '{"arxiv_id": "...", ' - '"title_zh": "中文标题", ' - '"one_line": "一句话概括(≤50字)", ' - '"tags": ["标签1","标签2"], ' - '"difficulty": "入门/进阶/前沿", ' - '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, ' - '"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", ' - '"goal": "详细段落:本文的具体目标", ' - '"gap": "详细段落:本文的独特切入角度"}, ' - '"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", ' - '"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", ' - '"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", ' - '"novelty": "详细段落:技术新颖性分析"}, ' - '"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", ' - '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], ' - '"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, ' - '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", ' - '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", ' - '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, ' - '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},' - '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]' - "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。" - "section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。" - "}" - ) - - writing_requirements = ( - "## 写作要求\n" - "- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n" - "- 必须包含论文中的具体数据、数字、实验指标\n" - "- 像资深同事给同事讲论文一样,专业但易懂\n" - "- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n" - " 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n" - ) - - if fix_errors: - error_list = "\n".join(f"- {e}" for e in fix_errors) - return ( - "你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 " - f"data/papers/{arxiv_id}/summary.json:\n\n" - f"{error_list}\n\n" - "注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。" - "修正后请用 bash 运行 python scripts/validate_summary.py 验证。" - ) - - if pdf_mode == "search": - return ( - "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n" - "## 工作流程\n" - f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n" - f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n" - f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n" - + writing_requirements - + "\n" - + json_schema - ) - else: - return ( - "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n" - "## 工作流程\n" - "论文元信息和正文全文已在上文提供,请仔细阅读。\n" - f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n" - "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n" - + writing_requirements - + "\n" - + json_schema - ) - - # ── pi CLI 调用 ──────────────────────────────────────────────────────── @@ -264,12 +90,10 @@ async def call_pi( txt_path.write_text(trimmed, encoding="utf-8") logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed)) - prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors) + prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors) # 构建 session ID(每篇论文一个独立 session) if session_id is None: - import uuid - session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}" # 工具列表:search 模式需要 read 工具 @@ -297,6 +121,9 @@ async def call_pi( arxiv_id, bool(fix_errors), session_id, actual_mode, ) + import time as _time + _t_sub_start = _time.monotonic() + proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, @@ -312,69 +139,22 @@ async def call_pi( await proc.wait() raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s") + _t_sub_end = _time.monotonic() + + # 检查 summary.json 是否由 pi 子进程写入 + _summary_file = pdf_path.parent / "summary.json" + _file_info = "" + if _summary_file.exists(): + _file_mtime = _summary_file.stat().st_mtime + _file_size = _summary_file.stat().st_size + _file_info = f" summary.json={_file_size}B" + + logger.info( + "pi subprocess for %s: %.2fs%s", + arxiv_id, _t_sub_end - _t_sub_start, _file_info, + ) + if proc.returncode != 0: raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace")) return stdout.decode("utf-8", errors="replace"), session_id - - -# ── JSON 提取 ────────────────────────────────────────────────────────── - - -def extract_json(raw_output: str) -> dict: - """从 pi 输出中提取 JSON dict。三步策略:直接解析 → 代码块 → 最大花括号块。""" - # 策略 1:整体直接解析 - stripped = raw_output.strip() - try: - result = json.loads(stripped) - if isinstance(result, dict) and "title_zh" in result: - return result - except json.JSONDecodeError: - pass - - # 策略 2:提取 ```json ... ``` 代码块 - fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL) - for match in fence_pattern.finditer(raw_output): - try: - result = json.loads(match.group(1).strip()) - if isinstance(result, dict) and "title_zh" in result: - return result - except json.JSONDecodeError: - continue - - # 策略 3:匹配包含 title_zh 的最大 {...} 块 - brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL) - for match in brace_pattern.finditer(raw_output): - try: - return json.loads(match.group(0)) - except json.JSONDecodeError: - continue - - # 更宽松:找到最大的 { ... } 平衡块 - best = None - best_len = 0 - for i, ch in enumerate(raw_output): - if ch != "{": - continue - depth = 0 - for j in range(i, len(raw_output)): - if raw_output[j] == "{": - depth += 1 - elif raw_output[j] == "}": - depth -= 1 - if depth == 0: - candidate = raw_output[i : j + 1] - if len(candidate) > best_len: - try: - parsed = json.loads(candidate) - if isinstance(parsed, dict): - best = parsed - best_len = len(candidate) - except json.JSONDecodeError: - pass - break - - if best is not None: - return best - - raise JsonNotFoundError("no JSON object found in pi output") diff --git a/app/services/summarizer.py b/app/services/summarizer.py index 9b1c6bd..8de9383 100644 --- a/app/services/summarizer.py +++ b/app/services/summarizer.py @@ -29,14 +29,19 @@ from app.services.pdf_downloader import ( download_pdf, paper_dir, ) -from app.services.pi_client import ( +from app.services.summary_utils import ( JsonNotFoundError, + build_prompt, + extract_json, + write_meta_json, + extract_pdf_text, +) +from app.services.pi_client import ( PiProcessError, PiTimeoutError, call_pi, - extract_json, - write_meta_json, ) +from app.services import claude_backend from app.services.schemas import ( SummarySchema, assess_quality, @@ -229,7 +234,6 @@ def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) -> async def summarize_one( db: Session, paper: Paper, - semaphore: asyncio.Semaphore | None = None, *, force: bool = False, pdf_mode: str = "auto", @@ -257,68 +261,128 @@ async def summarize_one( "reason": "permanent_failure", } - if semaphore: - await semaphore.acquire() - try: - return await _do_summarize_one(db, paper, pdf_mode=pdf_mode) - finally: - if semaphore: - semaphore.release() + return await _do_summarize_one(db, paper, pdf_mode=pdf_mode) async def _generate_with_retry( arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto" ) -> tuple[dict, str]: - """调用 pi CLI 生成总结,最多 4 轮验证循环。 + """调用 AI 后端生成总结,最多 4 轮验证循环。 + + 根据 settings.SUMMARY_BACKEND 选择 pi 或 claude 后端。 Returns: (json_data, raw_output) Raises: ValueError: 4 轮验证仍未通过 """ + import time as _time + + backend = settings.SUMMARY_BACKEND validation_errors: list[str] = [] json_data: dict | None = None raw_output = "" session_id = None + summary_file = paper_dir(arxiv_id) / "summary.json" + + # claude 后端需要预构建 prompt(pi 后端在 call_pi 内部构建) + claude_prompt: str | None = None + if backend == "claude": + _t0 = _time.monotonic() + txt_path = extract_pdf_text(pdf_path, max_chars=None) + body = txt_path.read_text(encoding="utf-8") + if len(body) > 80_000: + trimmed = body[:80_000].rstrip() + txt_path.write_text(trimmed, encoding="utf-8") + claude_prompt = build_prompt(arxiv_id, meta_path, txt_path, "inject", None) + logger.info(" [%s] 构建prompt: %.2fs", arxiv_id, _time.monotonic() - _t0) + for attempt in range(1, 5): - # 清理上一轮 pi 写的不完整文件 - stale = paper_dir(arxiv_id) / "summary.json" - if stale.exists(): - stale.unlink() + # 清理上一轮写入的不完整文件 + if summary_file.exists(): + summary_file.unlink() - if attempt == 1: - raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode) + # 记录 AI 调用开始时间 + _t_call_start = _time.monotonic() + + if backend == "claude": + if attempt == 1: + raw_output, session_id = await claude_backend.call_claude( + claude_prompt, session_id=None, + ) + else: + retry_prompt = build_prompt( + arxiv_id, meta_path, + extract_pdf_text(pdf_path, max_chars=80000), + "inject", fix_errors=validation_errors, + ) + raw_output, session_id = await claude_backend.call_claude( + retry_prompt, session_id=session_id, fix_errors=validation_errors, + ) else: - raw_output, session_id = await call_pi( - meta_path, pdf_path, - fix_errors=validation_errors, - session_id=session_id, - pdf_mode=pdf_mode, - ) + if attempt == 1: + raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode) + else: + raw_output, session_id = await call_pi( + meta_path, pdf_path, + fix_errors=validation_errors, + session_id=session_id, + pdf_mode=pdf_mode, + ) - # 优先读取 pi 写入的 summary.json,否则从 stdout 提取 - summary_file = paper_dir(arxiv_id) / "summary.json" + _t_call_end = _time.monotonic() + + # 检查 summary.json 是否由 AI 子进程写入 + file_written_by_ai = summary_file.exists() + file_mtime = summary_file.stat().st_mtime if file_written_by_ai else None + file_size = summary_file.stat().st_size if file_written_by_ai else 0 + + logger.info( + " [%s] attempt %d AI调用: %.2fs summary.json=%s%s", + arxiv_id, attempt, + _t_call_end - _t_call_start, + f"已写入({file_size}B)" if file_written_by_ai else "未写入", + f" mtime={file_mtime:.2f}" if file_mtime else "", + ) + + # 提取 JSON + _t_json_start = _time.monotonic() try: - if summary_file.exists(): + if file_written_by_ai: json_data = json.loads(summary_file.read_text(encoding="utf-8")) - logger.info("Read summary.json written by pi for %s", arxiv_id) + logger.info(" [%s] 从AI写入的summary.json读取", arxiv_id) else: json_data = extract_json(raw_output) except (json.JSONDecodeError, JsonNotFoundError) as exc: + _t_json_end = _time.monotonic() logger.warning( - "JSON extraction failed for %s (attempt %d): %s", - arxiv_id, attempt, str(exc)[:200], + " [%s] JSON提取失败: %.2fs %s", + arxiv_id, _t_json_end - _t_json_start, str(exc)[:200], ) validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"] continue + _t_json_end = _time.monotonic() + # 验证 + _t_val_start = _time.monotonic() validation_errors = _validate_summary(json_data, arxiv_id) + _t_val_end = _time.monotonic() + if not validation_errors: + logger.info( + " [%s] JSON提取: %.2fs 验证: %.2fs ✅", + arxiv_id, + _t_json_end - _t_json_start, + _t_val_end - _t_val_start, + ) break logger.warning( - "Validation failed for %s (attempt %d): %s", - arxiv_id, attempt, "; ".join(validation_errors), + " [%s] JSON提取: %.2fs 验证: %.2fs ❌ %s", + arxiv_id, + _t_json_end - _t_json_start, + _t_val_end - _t_val_start, + "; ".join(validation_errors)[:200], ) if validation_errors: @@ -335,11 +399,19 @@ def _persist_summary( db: Session, paper: Paper, json_data: dict, raw_output: str ) -> str: """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。""" + import time as _time + arxiv_id = paper.arxiv_id + + _t0 = _time.monotonic() schema = SummarySchema.model_validate(json_data) quality = assess_quality(schema) + _t1 = _time.monotonic() + + _save_files(arxiv_id, schema, raw_output) + _t2 = _time.monotonic() - _save_files(paper.arxiv_id, schema, raw_output) _update_summary_in_db(db, paper, schema, quality, raw_output) + _t3 = _time.monotonic() # 状态 → done paper.summary_status.status = SummaryState.DONE @@ -347,10 +419,30 @@ def _persist_summary( paper.summary_status.completed_at = utc_now() paper.summary_status.raw_output_saved = True db.commit() + _t4 = _time.monotonic() + + logger.info( + " [%s] persist: pydantic=%.2fs 文件=%.2fs DB写入=%.2fs 状态commit=%.2fs", + arxiv_id, + _t1 - _t0, + _t2 - _t1, + _t3 - _t2, + _t4 - _t3, + ) # 触发性增强(失败不影响总结) - _maybe_extract_images(paper.arxiv_id, schema) - _maybe_index_chroma(paper.arxiv_id, paper, schema) + _t5 = _time.monotonic() + _maybe_extract_images(arxiv_id, schema) + _t6 = _time.monotonic() + _maybe_index_chroma(arxiv_id, paper, schema) + _t7 = _time.monotonic() + + logger.info( + " [%s] 后处理: 图片提取=%.2fs ChromaDB=%.2fs", + arxiv_id, + _t6 - _t5, + _t7 - _t6, + ) return quality @@ -445,28 +537,47 @@ async def _do_summarize_one( ) -> dict: """实际的单篇总结执行(在 semaphore 保护下)。""" arxiv_id = paper.arxiv_id + title_short = (paper.title_en or "")[:50] # 状态 → processing paper.summary_status.status = SummaryState.PROCESSING paper.summary_status.started_at = utc_now() db.commit() + logger.info("▶ [%s] 开始总结: %s", arxiv_id, title_short) + # 清理旧的图片文件和 figures_json,避免重新总结时残留 + import time as _time + _t_cleanup_start = _time.monotonic() _cleanup_old_images(db, paper) + _t_cleanup_end = _time.monotonic() + logger.info(" [%s] 清理旧数据: %.2fs", arxiv_id, _t_cleanup_end - _t_cleanup_start) raw_output = "" try: - meta_path = write_meta_json(paper) - await download_pdf(arxiv_id, paper.pdf_url) + _t0 = _time.monotonic() + meta_path = write_meta_json(paper) + _t1 = _time.monotonic() + logger.info(" [%s] meta.json: %.2fs", arxiv_id, _t1 - _t0) + + await download_pdf(arxiv_id, paper.pdf_url) + _t2 = _time.monotonic() + logger.info(" [%s] 下载PDF: %.2fs", arxiv_id, _t2 - _t1) + + logger.info(" [%s] 调用 pi 生成总结...", arxiv_id) json_data, raw_output = await _generate_with_retry( arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf", pdf_mode=pdf_mode, ) + _t3 = _time.monotonic() + logger.info(" [%s] pi生成: %.2fs", arxiv_id, _t3 - _t2) quality = _persist_summary(db, paper, json_data, raw_output) + _t4 = _time.monotonic() + logger.info(" [%s] 持久化: %.2fs", arxiv_id, _t4 - _t3) - logger.info("Summarize done: %s quality=%s", arxiv_id, quality) + logger.info("✅ [%s] 完成: quality=%s 总耗时: %.2fs", arxiv_id, quality, _t4 - _t0) return {"arxiv_id": arxiv_id, "status": "done", "quality": quality} except Exception as exc: @@ -588,42 +699,67 @@ async def summarize_batch( "total": 0, } - # 并发控制 - semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY) + # 并发控制:worker 模式,避免 573 个协程同时打开 DB 连接耗尽连接池 + concurrency = settings.SUMMARY_CONCURRENCY make_session = _session_factory or SessionLocal - async def _process_paper(paper: Paper) -> dict: - paper_db = make_session() - try: - p = paper_db.execute( - select(Paper) - .where(Paper.id == paper.id) - .options(*PAPER_DEFAULT_LOAD) - ).unique().scalar_one_or_none() - return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode) - finally: - paper_db.close() + # 进度追踪 + progress = {"done": 0, "failed": 0, "skipped": 0} + paper_queue: asyncio.Queue[Paper | None] = asyncio.Queue() + for p in papers: + paper_queue.put_nowait(p) - results = await asyncio.gather( - *[_process_paper(p) for p in papers], + async def _worker() -> list[dict]: + results: list[dict] = [] + while True: + paper = paper_queue.get_nowait() if not paper_queue.empty() else None + if paper is None: + break + paper_db = make_session() + try: + p = paper_db.execute( + select(Paper) + .where(Paper.id == paper.id) + .options(*PAPER_DEFAULT_LOAD) + ).unique().scalar_one_or_none() + result = await summarize_one(paper_db, p, pdf_mode=pdf_mode) + status = result.get("status", "failed") + progress[status] = progress.get(status, 0) + 1 + finished = sum(progress.values()) + logger.info( + "📊 进度: %d/%d (✅%d ❌%d ⏭️%d) — %s", + finished, total, + progress["done"], progress["failed"], progress["skipped"], + paper.arxiv_id, + ) + results.append(result) + except Exception as exc: + logger.error("Worker error: %s", exc) + results.append({"status": "failed", "error": str(exc)}) + finally: + paper_db.close() + return results + + worker_results = await asyncio.gather( + *[_worker() for _ in range(concurrency)], return_exceptions=True, ) + results = [] + for r in worker_results: + if isinstance(r, Exception): + logger.error("Unexpected error in batch: %s", r) + results.append(r) + elif isinstance(r, list): + results.extend(r) - # 统计结果 - done = 0 - failed = 0 - skipped = 0 + # 统计结果(progress 已在 worker 中实时更新) + done = progress["done"] + failed = progress["failed"] + skipped = progress["skipped"] for r in results: if isinstance(r, Exception): logger.error("Unexpected error in batch: %s", r) failed += 1 - elif isinstance(r, dict): - if r.get("status") == "done": - done += 1 - elif r.get("status") == "skipped": - skipped += 1 - else: - failed += 1 log_entry.status = "success" if failed == 0 else "failed" log_entry.papers_found = total diff --git a/app/services/summary_utils.py b/app/services/summary_utils.py new file mode 100644 index 0000000..cfd2c53 --- /dev/null +++ b/app/services/summary_utils.py @@ -0,0 +1,270 @@ +"""总结工具函数 — PDF 文本提取、prompt 构建、JSON 提取、meta.json 写入。 + +与后端无关的通用逻辑,pi 和 claude 后端共享。 +""" + +from __future__ import annotations + +import json +import logging +import re +from pathlib import Path + +logger = logging.getLogger(__name__) + + +# ── 自定义异常 ────────────────────────────────────────────────────────── + + +class JsonNotFoundError(Exception): + pass + + +# ── meta.json ─────────────────────────────────────────────────────────── + + +def write_meta_json(paper) -> Path: + """写入 data/papers/{arxiv_id}/meta.json,返回路径。""" + from app.services.pdf_downloader import paper_dir + + d = paper_dir(paper.arxiv_id) + d.mkdir(parents=True, exist_ok=True) + meta_path = d / "meta.json" + + authors = [a.name for a in paper.authors] + tags = [t.tag for t in paper.tags] + meta = { + "arxiv_id": paper.arxiv_id, + "title_en": paper.title_en, + "abstract": paper.abstract or "", + "published_at": paper.published_at.isoformat() if paper.published_at else None, + "authors": authors, + "tags": tags, + "upvotes": paper.upvotes, + } + meta_path.write_text( + json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8" + ) + return meta_path + + +# ── PDF 文本提取 ──────────────────────────────────────────────────────── + + +def _trim_body(text: str, max_chars: int | None = None) -> str: + """去除参考文献,保留正文+附录,超长时从末尾截断。 + + 策略: + 1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用) + 2. 正文 + 附录全部保留 + 3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文) + """ + # 找 References 段落的位置(在 Appendix 之后的那个) + ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text) + if ref_match: + ref_start = ref_match.start() + # 看 References 之后有没有 Appendix + after_ref = text[ref_start:] + app_match = re.search( + r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref + ) + if app_match: + # References 之后有 Appendix:只删 References 段 + ref_end = ref_start + app_match.start() + text = text[:ref_start] + text[ref_end:] + else: + # References 之后没有 Appendix:删掉从 References 到结尾 + text = text[:ref_start].rstrip() + + # 去掉 Acknowledgments(对解读无用) + ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text) + if ack_match: + # 只删 Acknowledgments 本身,不删后面的内容 + next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():]) + if next_section: + text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():] + else: + text = text[:ack_match.start()].rstrip() + + # 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文) + if max_chars is not None and len(text) > max_chars: + text = text[:max_chars].rstrip() + + return text + + +def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path: + """用 pymupdf 提取 PDF 正文文本,保存为 .txt。 + + max_chars=None 时不截断,给 search/auto 模式保留完整内容。 + """ + import pymupdf + + txt_path = pdf_path.with_suffix(".txt") + if txt_path.exists(): + # 缓存优先;如果需重新提取(不同 max_chars),先删旧文件 + return txt_path + + doc = pymupdf.open(str(pdf_path)) + # sort=True 启用阅读顺序检测,避免双栏论文中跨栏错位 + raw_text = "\n\n".join(page.get_text(sort=True) for page in doc) + doc.close() + + body = _trim_body(raw_text, max_chars=max_chars) + txt_path.write_text(body, encoding="utf-8") + logger.info( + "Extracted PDF text: %s (%d -> %d chars, -%d%%)", + txt_path, + len(raw_text), + len(body), + (1 - len(body) / len(raw_text)) * 100 if raw_text else 0, + ) + return txt_path + + +# ── Prompt 构建 ───────────────────────────────────────────────────────── + + +def build_prompt( + arxiv_id: str, + meta_path: Path, + txt_path: Path, + pdf_mode: str, + fix_errors: list[str] | None = None, +) -> str: + """根据模式构建 prompt。 + + inject: 全量注入,prompt 末尾包含论文全文内容 + search: pi 自主 read 文件,prompt 只包含工作流指令 + """ + json_schema = ( + "## 必须包含以下字段(不要自创字段名):\n" + '{"arxiv_id": "...", ' + '"title_zh": "中文标题", ' + '"one_line": "一句话概括(≤50字)", ' + '"tags": ["标签1","标签2"], ' + '"difficulty": "入门/进阶/前沿", ' + '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, ' + '"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", ' + '"goal": "详细段落:本文的具体目标", ' + '"gap": "详细段落:本文的独特切入角度"}, ' + '"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", ' + '"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", ' + '"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", ' + '"novelty": "详细段落:技术新颖性分析"}, ' + '"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", ' + '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], ' + '"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, ' + '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", ' + '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", ' + '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, ' + '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},' + '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]' + "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。" + "section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。" + "}" + ) + + writing_requirements = ( + "## 写作要求\n" + "- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n" + "- 必须包含论文中的具体数据、数字、实验指标\n" + "- 像资深同事给同事讲论文一样,专业但易懂\n" + "- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n" + " 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n" + ) + + if fix_errors: + error_list = "\n".join(f"- {e}" for e in fix_errors) + return ( + "你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 " + f"data/papers/{arxiv_id}/summary.json:\n\n" + f"{error_list}\n\n" + "注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。" + "修正后请用 bash 运行 python scripts/validate_summary.py 验证。" + ) + + if pdf_mode == "search": + return ( + "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n" + "## 工作流程\n" + f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n" + f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n" + f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n" + + writing_requirements + + "\n" + + json_schema + ) + else: + return ( + "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n" + "## 工作流程\n" + "论文元信息和正文全文已在上文提供,请仔细阅读。\n" + f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n" + "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n" + + writing_requirements + + "\n" + + json_schema + ) + + +# ── JSON 提取 ────────────────────────────────────────────────────────── + + +def extract_json(raw_output: str) -> dict: + """从输出中提取 JSON dict。三步策略:直接解析 → 代码块 → 最大花括号块。""" + # 策略 1:整体直接解析 + stripped = raw_output.strip() + try: + result = json.loads(stripped) + if isinstance(result, dict) and "title_zh" in result: + return result + except json.JSONDecodeError: + pass + + # 策略 2:提取 ```json ... ``` 代码块 + fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL) + for match in fence_pattern.finditer(raw_output): + try: + result = json.loads(match.group(1).strip()) + if isinstance(result, dict) and "title_zh" in result: + return result + except json.JSONDecodeError: + continue + + # 策略 3:匹配包含 title_zh 的最大 {...} 块 + brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL) + for match in brace_pattern.finditer(raw_output): + try: + return json.loads(match.group(0)) + except json.JSONDecodeError: + continue + + # 更宽松:找到最大的 { ... } 平衡块 + best = None + best_len = 0 + for i, ch in enumerate(raw_output): + if ch != "{": + continue + depth = 0 + for j in range(i, len(raw_output)): + if raw_output[j] == "{": + depth += 1 + elif raw_output[j] == "}": + depth -= 1 + if depth == 0: + candidate = raw_output[i : j + 1] + if len(candidate) > best_len: + try: + parsed = json.loads(candidate) + if isinstance(parsed, dict): + best = parsed + best_len = len(candidate) + except json.JSONDecodeError: + pass + break + + if best is not None: + return best + + raise JsonNotFoundError("no JSON object found in output") diff --git a/pyproject.toml b/pyproject.toml index 2c0c9a8..1001609 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ dependencies = [ "fastapi>=0.115", "uvicorn[standard]>=0.34", "sqlalchemy>=2.0", - "httpx>=0.28", + "httpx[http2]>=0.28", "jinja2>=3.1", "python-multipart>=0.0.18", "pydantic>=2.0", @@ -19,6 +19,7 @@ dependencies = [ "pymupdf>=1.25", "itsdangerous>=2.2.0", "bleach>=6.4.0", + "pymupdf4llm>=1.27.2.3", ] [project.optional-dependencies] diff --git a/scripts/validate_summary.py b/scripts/validate_summary.py index 226b139..0bfcc5f 100644 --- a/scripts/validate_summary.py +++ b/scripts/validate_summary.py @@ -1,117 +1,144 @@ -"""验证 summary JSON 是否符合 SummarySchema 要求。 - -用法:python scripts/validate_summary.py -返回:exit 0 = 通过,exit 1 = 失败(错误信息输出到 stdout) -""" - import json import sys -from pathlib import Path +schema = { + "type": "object", + "required": ["arxiv_id", "title_zh", "one_line", "tags", "difficulty", + "prerequisites", "motivation", "method", "results", "improvements", "figures"], + "properties": { + "arxiv_id": {"type": "string"}, + "title_zh": {"type": "string"}, + "one_line": {"type": "string"}, + "tags": {"type": "array", "items": {"type": "string"}}, + "difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]}, + "prerequisites": { + "type": "object", + "required": ["concepts"], + "properties": { + "concepts": {"type": "array", "items": { + "type": "object", + "required": ["term", "explanation", "why_matters"], + "properties": { + "term": {"type": "string"}, + "explanation": {"type": "string"}, + "why_matters": {"type": "string"} + } + }} + } + }, + "motivation": { + "type": "object", + "required": ["problem", "goal", "gap"], + "properties": { + "problem": {"type": "string"}, + "goal": {"type": "string"}, + "gap": {"type": "string"} + } + }, + "method": { + "type": "object", + "required": ["overview", "key_idea", "steps", "novelty"], + "properties": { + "overview": {"type": "string"}, + "key_idea": {"type": "string"}, + "steps": {"type": "string"}, + "novelty": {"type": "string"} + } + }, + "results": { + "type": "object", + "required": ["main_findings", "benchmarks", "limitations"], + "properties": { + "main_findings": {"type": "string"}, + "benchmarks": {"type": "array", "items": { + "type": "object", + "required": ["task", "metric", "this_work", "baseline", "improvement"], + "properties": { + "task": {"type": "string"}, + "metric": {"type": "string"}, + "this_work": {"type": "string"}, + "baseline": {"type": "string"}, + "improvement": {"type": "string"} + } + }}, + "limitations": {"type": "string"} + } + }, + "improvements": { + "type": "object", + "required": ["weaknesses", "future_work", "reproducibility"], + "properties": { + "weaknesses": {"type": "string"}, + "future_work": {"type": "string"}, + "reproducibility": {"type": "string"} + } + }, + "figures": { + "type": "array", + "items": { + "type": "object", + "required": ["id", "caption", "description", "reason", "section"], + "properties": { + "id": {"type": "string"}, + "caption": {"type": "string"}, + "description": {"type": "string"}, + "reason": {"type": "string"}, + "section": {"type": "string", "enum": ["motivation", "method", "results", "limitations"]} + } + } + } + } +} -def validate(path: str) -> list[str]: - errors: list[str] = [] +def validate_file(filepath): try: - data = json.loads(Path(path).read_text(encoding="utf-8")) + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Check required fields + for field in schema["required"]: + if field not in data: + print(f"❌ Missing field: {field}") + return False + + # Validate nested structure + for field, spec in schema["properties"].items(): + if field in data: + if spec["type"] == "string": + if not isinstance(data[field], str): + print(f"❌ Field '{field}' should be string") + return False + elif spec["type"] == "array": + if not isinstance(data[field], list): + print(f"❌ Field '{field}' should be array") + return False + elif spec["type"] == "object": + if not isinstance(data[field], dict): + print(f"❌ Field '{field}' should be object") + return False + if "required" in spec: + for subfield in spec["required"]: + if subfield not in data[field]: + print(f"❌ Missing subfield: {field}.{subfield}") + return False + + # Validate section enum in figures + valid_sections = ["motivation", "method", "results", "limitations"] + for fig in data.get("figures", []): + if fig["section"] not in valid_sections: + print(f"❌ Invalid section in figure: {fig['section']}") + return False + + print("✅ JSON validation passed!") + return True + except json.JSONDecodeError as e: - return [f"JSON 解析失败: {e}"] - - if not isinstance(data, dict): - return ["顶层必须是 JSON 对象 (dict)"] - - # 必填字段 - required_top = ["arxiv_id", "title_zh", "one_line", "tags"] - for f in required_top: - if f not in data or not data[f]: - errors.append(f"缺少必填字段: {f}") - - # tags 必须是非空数组 - tags = data.get("tags") - if isinstance(tags, list) and len(tags) == 0: - errors.append("tags 不能为空数组") - if not isinstance(tags, list): - errors.append("tags 必须是数组") - - # motivation 子字段 - motivation = data.get("motivation", {}) - if not isinstance(motivation, dict): - errors.append("motivation 必须是对象") - else: - for f in ["problem", "goal", "gap"]: - val = motivation.get(f, "") - if not isinstance(val, str) or len(val.strip()) < 50: - errors.append(f"motivation.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)") - - # method 子字段 - method = data.get("method", {}) - if not isinstance(method, dict): - errors.append("method 必须是对象") - else: - for f in ["overview", "key_idea", "steps", "novelty"]: - val = method.get(f, "") - if not isinstance(val, str) or len(val.strip()) < 50: - errors.append(f"method.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)") - - # results 子字段 - results = data.get("results", {}) - if not isinstance(results, dict): - errors.append("results 必须是对象") - else: - for f in ["main_findings", "limitations"]: - val = results.get(f, "") - if not isinstance(val, str) or len(val.strip()) < 50: - errors.append(f"results.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)") - # benchmarks 可以是数组 - benchmarks = results.get("benchmarks") - if benchmarks is not None and not isinstance(benchmarks, list): - errors.append("results.benchmarks 必须是数组") - - # improvements 子字段 - improvements = data.get("improvements", {}) - if not isinstance(improvements, dict): - errors.append("improvements 必须是对象") - else: - for f in ["weaknesses", "future_work", "reproducibility"]: - val = improvements.get(f, "") - if not isinstance(val, str) or len(val.strip()) < 50: - errors.append(f"improvements.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)") - - # 检查是否有字段误用数组(应该用字符串的) - string_fields = [ - ("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"), - ("method", "overview"), ("method", "key_idea"), ("method", "steps"), ("method", "novelty"), - ("results", "main_findings"), ("results", "limitations"), - ("improvements", "weaknesses"), ("improvements", "future_work"), ("improvements", "reproducibility"), - ] - for section, field in string_fields: - val = data.get(section, {}).get(field) - if isinstance(val, list): - errors.append(f"{section}.{field} 应该是字符串段落,不能是数组") - - # figures 验证 - figures = data.get("figures") - if figures is not None: - if not isinstance(figures, list): - errors.append("figures 必须是数组") - else: - for i, fig in enumerate(figures): - if isinstance(fig, dict) and not fig.get("id"): - errors.append(f"figures[{i}] 缺少 id 字段") - - return errors - + print(f"❌ JSON decode error: {e}") + return False + except Exception as e: + print(f"❌ Validation error: {e}") + return False if __name__ == "__main__": - if len(sys.argv) != 2: - print("用法: python scripts/validate_summary.py ") - sys.exit(1) - - errs = validate(sys.argv[1]) - if errs: - print("❌ 验证失败:") - for e in errs: - print(f" - {e}") - sys.exit(1) - else: - print("✅ 验证通过") - sys.exit(0) + filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json" + validate_file(filepath) diff --git a/uv.lock b/uv.lock index b28e0d9..48a678d 100644 --- a/uv.lock +++ b/uv.lock @@ -684,6 +684,19 @@ wheels = [ { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "h2" +version = "4.3.0" +source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" } +dependencies = [ + { name = "hpack" }, + { name = "hyperframe" }, +] +sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" } +wheels = [ + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" }, +] + [[package]] name = "hf-daily-papers" version = "0.1.0" @@ -693,12 +706,13 @@ dependencies = [ { name = "bleach" }, { name = "chromadb" }, { name = "fastapi" }, - { name = "httpx" }, + { name = "httpx", extra = ["http2"] }, { name = "itsdangerous" }, { name = "jinja2" }, { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pymupdf" }, + { name = "pymupdf4llm" }, { name = "python-dotenv" }, { name = "python-multipart" }, { name = "sqlalchemy" }, @@ -724,12 +738,13 @@ requires-dist = [ { name = "bleach", specifier = ">=6.4.0" }, { name = "chromadb", specifier = ">=1.0" }, { name = "fastapi", specifier = ">=0.115" }, - { name = "httpx", specifier = ">=0.28" }, + { name = "httpx", extras = ["http2"], specifier = ">=0.28" }, { name = "itsdangerous", specifier = ">=2.2.0" }, { name = "jinja2", specifier = ">=3.1" }, { name = "pydantic", specifier = ">=2.0" }, { name = "pydantic-settings", specifier = ">=2.0" }, { name = "pymupdf", specifier = ">=1.25" }, + { name = "pymupdf4llm", specifier = ">=1.27.2.3" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" }, { name = "python-dotenv", specifier = ">=1.0" }, @@ -778,6 +793,15 @@ wheels = [ { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916, upload-time = "2026-05-06T06:18:21.7Z" }, ] +[[package]] +name = "hpack" +version = "4.1.0" +source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" } +sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" } +wheels = [ + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" }, +] + [[package]] name = "httpcore" version = "1.0.9" @@ -842,6 +866,11 @@ wheels = [ { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +http2 = [ + { name = "h2" }, +] + [[package]] name = "huggingface-hub" version = "1.16.1" @@ -862,6 +891,15 @@ wheels = [ { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/49/79/621a7dbb80c70974f73a597275351ebe03ce5bc65cb5f8f4acb5859252bc/huggingface_hub-1.16.1-py3-none-any.whl", hash = "sha256:64340de934b9ce37857ef85a82de72f5629e8a270f9119eabb12bf495eb53c22", size = 668176, upload-time = "2026-05-21T18:39:58.596Z" }, ] +[[package]] +name = "hyperframe" +version = "6.1.0" +source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" } +sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" } +wheels = [ + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" }, +] + [[package]] name = "idna" version = "3.18" @@ -1223,6 +1261,15 @@ wheels = [ { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, ] +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" } +sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + [[package]] name = "numpy" version = "2.4.6" @@ -1842,6 +1889,39 @@ wheels = [ { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/53/a4/b9e91aac82293f9c954654c85581ee8212b5b05efadc534b581141241e6f/pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2", size = 25000393, upload-time = "2026-04-24T14:11:01.669Z" }, ] +[[package]] +name = "pymupdf-layout" +version = "1.27.2.3" +source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" } +dependencies = [ + { name = "networkx" }, + { name = "numpy" }, + { name = "onnxruntime" }, + { name = "pymupdf" }, + { name = "pyyaml" }, +] +wheels = [ + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/bc/ee/067726c3ee5574ad5c605d00d7419e264ef509d626a726f99388111f8216/pymupdf_layout-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:75c2ab3c0e8830ac2bc50cfd32d375a30768a2610dac72a02f08265336e0834f", size = 15799844, upload-time = "2026-04-24T14:11:13.177Z" }, + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/0a/ba/46a7a36474722f9280d885f6eec878561a257d9378e52590b43d32ffb96c/pymupdf_layout-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:5656b09669dcd7c51f539afb6fdaf853602bab4cbc20479ee5ee1a85a4e32b60", size = 15795220, upload-time = "2026-04-24T14:11:23.17Z" }, + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/84/87/bfdcca67346052943a4549814f2009b38f4d15ec025798cdf7dfa5f57c84/pymupdf_layout-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:fcf03aa815cbceebdb3263dd6a190de4547c46b1d168928836ec38738afe127d", size = 15805240, upload-time = "2026-04-24T14:11:33.465Z" }, + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/32/e9/7ce6eaf97cebd46c3808593282e9eb99a60cddd6183e25a636980d5c7986/pymupdf_layout-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:303b9414216dfaf711ec7d807b6f1e4c3e0a92bbb4569340fcedd9d5593d16ca", size = 15806269, upload-time = "2026-04-24T14:11:43.481Z" }, + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/bf/61/3b2417d8f2cdfaa0f4749cd9dafa3379cb5cdaddf4233165f1ff81953c30/pymupdf_layout-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:503b64d9b6b31ea3af79ef85cf7d36950c5048af468cb297684d2953553c62ad", size = 15809163, upload-time = "2026-04-24T14:11:53.956Z" }, +] + +[[package]] +name = "pymupdf4llm" +version = "1.27.2.3" +source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" } +dependencies = [ + { name = "pymupdf" }, + { name = "pymupdf-layout" }, + { name = "tabulate" }, +] +sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/87/c0/e3830452d82032c3d82a9879616c05bf0c51e0dea03c1d80d57b3a6ec0d1/pymupdf4llm-1.27.2.3.tar.gz", hash = "sha256:42ec1a47ddc62be3f4f40c116d27618611c6f9fa366719016d9ddc3f3a3dc22b", size = 1406297, upload-time = "2026-04-24T14:13:18.843Z" } +wheels = [ + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/e6/38/84bf29f4dd72e6c450546df6ca8f53021f764fd945ba67dcc235d39bc20e/pymupdf4llm-1.27.2.3-py3-none-any.whl", hash = "sha256:bd724b79fa3f06a5b28d7a65f7acfa8de56e04bdb603ac2d6dff315e0d151aaa", size = 77348, upload-time = "2026-04-24T14:11:04.305Z" }, +] + [[package]] name = "pypika" version = "0.51.1" @@ -2202,6 +2282,15 @@ wheels = [ { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1c/54/196d0c1db10af76baa4f64894448505d60d3cdf70ef92cbb35f46a4e4c71/starlette-1.2.1-py3-none-any.whl", hash = "sha256:4de0082d08c8f6764a85a54cf1120d6939507a19905c7768acad2a9f875d2b89", size = 73350, upload-time = "2026-05-31T01:07:50.09Z" }, ] +[[package]] +name = "tabulate" +version = "0.10.0" +source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" } +sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/46/58/8c37dea7bbf769b20d58e7ace7e5edfe65b849442b00ffcdd56be88697c6/tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d", size = 91754, upload-time = "2026-03-04T18:55:34.402Z" } +wheels = [ + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/99/55/db07de81b5c630da5cbf5c7df646580ca26dfaefa593667fc6f2fe016d2e/tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3", size = 39814, upload-time = "2026-03-04T18:55:31.284Z" }, +] + [[package]] name = "tenacity" version = "9.1.4"