diff --git a/.env.example b/.env.example
index bfea3bf..19cfaae 100644
--- a/.env.example
+++ b/.env.example
@@ -19,8 +19,11 @@ HTTP_MAX_RETRIES=3
 HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
 
 # ─── AI 总结 ──────────────────────────────
+# 总结后端：pi | claude
+SUMMARY_BACKEND=pi
 PI_BIN=
 SUMMARY_SKILL=daily-paper-summary
+CLAUDE_BIN=claude
 SUMMARY_CONCURRENCY=3
 SUMMARY_TIMEOUT_SECONDS=1200
 SUMMARY_MAX_RETRIES=2
diff --git a/app/cli.py b/app/cli.py
index ea163df..452918b 100644
--- a/app/cli.py
+++ b/app/cli.py
@@ -1,6 +1,7 @@
 """CLI 工具 — 手动抓取论文。"""
 
 import asyncio
+import logging
 
 import typer
 from dotenv import load_dotenv
@@ -49,8 +50,11 @@ def crawl(
         typer.echo(f"📡 开始抓取 {target} ...")
         result = asyncio.run(crawl_daily(db, target, top_n))
 
-        # 未指定日期且今天无数据时，自动回退到昨天
-        if not date_str and result["status"] == "success" and result["found"] == 0:
+        # 未指定日期且今天失败或无数据时，自动回退到昨天
+        need_fallback = not date_str and (
+            result["status"] == "failed" or result["found"] == 0
+        )
+        if need_fallback:
             fallback = yesterday_str()
             existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
             if existing > 0:
@@ -84,6 +88,11 @@ def summarize(
         "--pdf-mode",
         help="PDF 传递方式：auto（自动选择）| inject（全量注入）| search（pi 自主搜索）",
     ),
+    backend: str = typer.Option(
+        None,
+        "--backend",
+        help="总结后端：pi | claude（留空则使用 .env 配置）",
+    ),
 ):
     """手动触发 AI 总结。"""
     from app.config import settings
@@ -97,9 +106,22 @@ def summarize(
         typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode}，只支持 auto / inject / search", err=True)
         raise typer.Exit(code=1)
 
+    if backend:
+        if backend not in ("pi", "claude"):
+            typer.echo(f"❌ 无效的 backend: {backend}，只支持 pi / claude", err=True)
+            raise typer.Exit(code=1)
+        settings.SUMMARY_BACKEND = backend
+
     os.makedirs(settings.db_path.parent, exist_ok=True)
     _init(engine)
 
+    # 配置 logging 输出到终端
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)-5s %(name)s | %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
     db = SessionLocal()
     try:
         if arxiv_id:
diff --git a/app/config.py b/app/config.py
index c6ddc18..09d4b44 100644
--- a/app/config.py
+++ b/app/config.py
@@ -29,8 +29,10 @@ class Settings(BaseSettings):
     HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"
 
     # AI 总结
+    SUMMARY_BACKEND: str = "pi"  # "pi" | "claude"
     PI_BIN: str = ""
     SUMMARY_SKILL: str = "daily-paper-summary"
+    CLAUDE_BIN: str = "claude"
     SUMMARY_CONCURRENCY: int = 3
     SUMMARY_TIMEOUT_SECONDS: int = 1200
     SUMMARY_MAX_RETRIES: int = 2
diff --git a/app/services/claude_backend.py b/app/services/claude_backend.py
new file mode 100644
index 0000000..76ac8f8
--- /dev/null
+++ b/app/services/claude_backend.py
@@ -0,0 +1,101 @@
+"""Claude CLI backend: summarize papers by running the claude CLI subprocess.
+
+Counterpart of pi_client.py with a symmetric interface, so prompt building,
+PDF text extraction and JSON extraction can be shared between both backends.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import uuid
+
+from app.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+class ClaudeTimeoutError(Exception):
+    """Raised when the claude subprocess exceeds SUMMARY_TIMEOUT_SECONDS."""
+
+
+class ClaudeProcessError(Exception):
+    """Raised when the claude subprocess exits with a non-zero status."""
+
+    def __init__(self, returncode: int, stderr: str):
+        self.returncode = returncode
+        self.stderr = stderr
+        # Keep the message bounded; the full text stays available on .stderr.
+        super().__init__(f"claude exited with code {returncode}: {stderr[:500]}")
+
+
+async def call_claude(
+    prompt: str,
+    session_id: str | None = None,
+    fix_errors: list[str] | None = None,
+) -> tuple[str, str]:
+    """Run the claude CLI in print mode and return (stdout text, session_id).
+
+    Symmetric with call_pi(), but no file paths or pdf_mode are needed: the
+    caller is expected to have put everything into ``prompt`` already.
+
+    Args:
+        prompt: Complete prompt text, sent to the CLI on stdin.
+        session_id: Session to use; a fresh UUID is generated when None.
+        fix_errors: Validation errors from the previous round. Only its
+            truthiness is used here, to decide whether to resume the session;
+            the error text itself must already be part of ``prompt``.
+
+    Raises:
+        ClaudeTimeoutError: The process ran longer than
+            settings.SUMMARY_TIMEOUT_SECONDS and was killed.
+        ClaudeProcessError: The process exited with a non-zero return code.
+    """
+    # A retry can only resume a session that an earlier call created.
+    resume = bool(fix_errors) and session_id is not None
+    if session_id is None:
+        # --session-id only accepts a valid UUID.
+        session_id = str(uuid.uuid4())
+
+    cmd = [settings.CLAUDE_BIN, "-p", "--output-format", "text"]
+
+    if resume:
+        # --resume targets this exact session. --continue would pick the most
+        # recent conversation in the working directory, which can belong to
+        # another paper when summaries run concurrently.
+        cmd += ["--resume", session_id]
+    else:
+        cmd += ["--session-id", session_id]
+
+    logger.info(
+        "Calling claude (session=%s, fix=%s)",
+        session_id,
+        bool(fix_errors),
+    )
+
+    # The prompt goes through stdin rather than argv: it may contain a whole
+    # paper, and a single argv entry is size-limited by the OS.
+    proc = await asyncio.create_subprocess_exec(
+        *cmd,
+        stdin=asyncio.subprocess.PIPE,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    try:
+        stdout, stderr = await asyncio.wait_for(
+            proc.communicate(prompt.encode("utf-8")),
+            timeout=settings.SUMMARY_TIMEOUT_SECONDS,
+        )
+    except asyncio.TimeoutError:
+        proc.kill()
+        await proc.wait()
+        raise ClaudeTimeoutError(
+            f"claude timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s"
+        )
+
+    if proc.returncode != 0:
+        raise ClaudeProcessError(
+            proc.returncode, stderr.decode("utf-8", errors="replace")
+        )
+
+    return stdout.decode("utf-8", errors="replace"), session_id
diff --git a/app/services/crawler.py b/app/services/crawler.py
index feb67c0..5942373 100644
--- a/app/services/crawler.py
+++ b/app/services/crawler.py
@@ -83,7 +83,7 @@ def _parse_paper(item: dict) -> dict:
         "upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0),
         "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
         "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
-        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
+        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else "",
         "authors": [
             a.get("name", a) if isinstance(a, dict) else a
             for a in paper_info.get("authors", [])
diff --git a/app/services/pdf_downloader.py b/app/services/pdf_downloader.py
index 96da241..7b49ac0 100644
--- a/app/services/pdf_downloader.py
+++ b/app/services/pdf_downloader.py
@@ -3,10 +3,13 @@
 from __future__ import annotations
 
 import logging
+import os
 import shutil
 from pathlib import Path
 
-from app.utils import PAPERS_DIR, TMP_DIR, make_http_client
+import requests
+
+from app.utils import PAPERS_DIR, TMP_DIR
 
 logger = logging.getLogger(__name__)
 
@@ -31,6 +34,22 @@ def tmp_dir(arxiv_id: str) -> Path:
 
 # ── PDF 下载 ────────────────────────────────────────────────────────────
 
+# 复用 TCP 连接的 session
+_http_session: requests.Session | None = None
+
+
+def _get_session() -> requests.Session:
+    global _http_session
+    if _http_session is None:
+        _http_session = requests.Session()
+        _http_session.headers.update({"User-Agent": "hf-daily-papers/1.0"})
+        # Proxy: only $PROXY_SERVER is read here; there is no fallback to settings.
+        proxy = os.environ.get("PROXY_SERVER")
+        if proxy:
+            _http_session.proxies = {"http": proxy, "https": proxy}
+            logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy)
+    return _http_session
+
 
 async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
     """下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
@@ -42,10 +61,10 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
     dest = dest_dir / "paper.pdf"
 
     try:
-        async with make_http_client(follow_redirects=True) as client:
-            resp = await client.get(pdf_url)
-            resp.raise_for_status()
-            dest.write_bytes(resp.content)
+        session = _get_session()
+        resp = session.get(pdf_url, timeout=120, allow_redirects=True)
+        resp.raise_for_status()
+        dest.write_bytes(resp.content)
     except Exception as exc:
         raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc
 
diff --git a/app/services/pdf_image_extractor.py b/app/services/pdf_image_extractor.py
index 384171c..0894716 100644
--- a/app/services/pdf_image_extractor.py
+++ b/app/services/pdf_image_extractor.py
@@ -1,12 +1,12 @@
-"""PDF 图片与表格提取 — 基于 caption 定位的页面区域截图。
+"""PDF 图片与表格提取 — 基于 pymupdf4llm layout analysis。
 
-核心思路：学术论文排版极其规整，Figure caption 在图下方，Table caption 在表格上方。
-因此反过来：先找 caption 文字 → 向上/向下截取页面区域 → 渲染为 PNG。
+用 pymupdf4llm 的 layout analysis 检测 table / picture 区域，
+再通过 caption 文字匹配确定 Figure/Table 编号，渲染为 JPEG。
 
-优势（相比提取嵌入位图）：
-- 复合图表不会被拆成碎片（整块截取）
-- 矢量图也能截取（页面渲染包含一切）
-- 不依赖 find_tables()（纯文本匹配 caption）
+相比旧方案（caption 正则 + pdfplumber/find_tables/文本块扫描三套策略）：
+- layout analysis 直接给出区域 bbox，不存在相邻表格互相侵入的问题
+- 无需手动调参（最大高度、间隙阈值等）
+- 页面级 caption 匹配：每个 caption 只分配给最近的 box，避免上下相邻表格抢夺同一个 caption
 """
 
 from __future__ import annotations
@@ -16,40 +16,18 @@ import logging
 import re
 from pathlib import Path
 
+import pymupdf
+import pymupdf4llm.helpers.document_layout as dl
+
 from app.services.pdf_downloader import paper_dir
 from app.utils import TMP_DIR
 
 logger = logging.getLogger(__name__)
 
-# ── 截取区域参数 ───────────────────────────────────────────────────────
-
-# Figure: caption 上方搜索图的范围（点）
-_FIGURE_MAX_HEIGHT = 450  # 最大向上搜索范围
-_FIGURE_MIN_HEIGHT = 50  # 最小有效截图高度
-_FIGURE_DEFAULT_HEIGHT = 280  # 上方未找到内容块时的默认图高度
-
-# Table: caption 下方搜索表格的范围
-_TABLE_MAX_HEIGHT = 500  # 最大向下搜索范围
-_TABLE_MIN_HEIGHT = 30
-
-# caption 左右扩展（双栏论文中 caption 可能比表格窄）
-_REGION_SIDE_PADDING = 10
-# 表格通常比 caption 文字宽，使用更大的水平扩展
-_TABLE_SIDE_PADDING = 60
-
-# 正文行距的 ~1.5 倍 ≈ 空白间隙阈值（学术论文紧密排版，30pt 太宽松）
-_CONTENT_GAP_THRESHOLD = 20
-# 密集表格数据块后的过渡阈值：表格块之后的段落间距常只有 12-18pt
-_TABLE_DATA_GAP_THRESHOLD = 12
-
-
 # ── Caption 正则 ───────────────────────────────────────────────────────
 
-# 要求以 Figure/Table 开头（避免匹配正文中的 "see Figure 3" 等）
-# 支持三种 caption 格式：
-#   "Figure 1: Title" / "Figure 1. Title" / "Figure 1 Title"（无标点，空格分隔）
-# 第三种需要后续紧跟大写字母（排除 "Figure 1 shows..." 等正文引用）
-_CAPTION_RE = re.compile(
+# 用于从 caption 文字中提取 Figure/Table 编号
+_FIGURE_CAPTION_RE = re.compile(
     r"^(?:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))",
     re.IGNORECASE,
 )
@@ -58,489 +36,471 @@ _TABLE_CAPTION_RE = re.compile(
     re.IGNORECASE,
 )
 
-# ── 停止信号：表格边界检测遇到以下内容时立即停止 ──
-
-# 下一个 Figure/Table caption（如 "Table 2:" "Figure 3:" "Figure 4 Title"）
-_CAPTION_STOP_RE = re.compile(
-    r"^(?:Table|Fig\.?|Figure)\s+\d+\s*(?:[:\.]\s*|\s+[A-Z])",
-    re.IGNORECASE,
-)
-# Section header（如 "6.2 Evolution" "D.1 Dependency" "7 Conclusion"）
-_SECTION_STOP_RE = re.compile(
-    r"^(\d{1,2}(?:\.\d+)?\s+[A-Z][a-z]|[A-Z]\.\d+\s+[A-Z][a-z])"
-)
+# caption 与 table/picture 的最大匹配距离（点）
+_CAPTION_MATCH_DISTANCE = 100
+# 截图区域的外边距
+_REGION_PADDING = 5
+# 3x 渲染，保证清晰度
+_RENDER_ZOOM = 3
+# 相邻 box 聚类间距（点）— 同一 figure/table 的碎片间距通常 < 15pt
+_CLUSTER_GAP = 15
 
 
-def _estimate_column_x(caption: dict) -> tuple[float, float]:
-    """估计 caption 所在列的水平边界（col_x0, col_x1）。
+# ── Box 聚类 ─────────────────────────────────────────────────────────
 
-    双栏论文中 caption 宽度远小于页面宽度，据此判断左右列。
-    单栏或跨栏 caption（宽度 >65% 页宽）返回整页宽度。
-    caption 居中对齐（中心接近页面中线）时按跨栏处理，使用宽范围。
+
+class _BoxCluster:
+    """合并后的布局区域（由一个或多个相邻 LayoutBox 组成）。
+
+    pymupdf4llm 有时将一个大图拆成多个小 picture box（如视频帧网格），
+    聚类后用整体 bbox 作为渲染区域。
     """
-    pw = caption["page_width"]
-    caption_w = caption["caption_x1"] - caption["caption_x0"]
 
-    # caption 宽度 >65% 页宽 → 单栏或跨栏
-    if caption_w > pw * 0.65:
-        return 0, pw
+    __slots__ = ("x0", "y0", "x1", "y1", "boxclass")
 
-    cx = (caption["caption_x0"] + caption["caption_x1"]) / 2
-
-    # caption 居中（中心距页面中线 <8%）→ 可能是跨栏表格，使用宽范围
-    if abs(cx - pw / 2) / pw < 0.08:
-        return (
-            max(0, caption["caption_x0"] - _TABLE_SIDE_PADDING * 2),
-            min(pw, caption["caption_x1"] + _TABLE_SIDE_PADDING * 2),
-        )
-
-    if cx < pw / 2:
-        return 0, pw / 2
-    else:
-        return pw / 2, pw
+    def __init__(self, boxes: list):
+        self.x0 = min(b.x0 for b in boxes)
+        self.y0 = min(b.y0 for b in boxes)
+        self.x1 = max(b.x1 for b in boxes)
+        self.y1 = max(b.y1 for b in boxes)
+        # table-fallback 归一化为 table（layout model 检测到表格但无法提取结构）
+        raw = boxes[0].boxclass
+        self.boxclass = "table" if raw == "table-fallback" else raw
 
 
-def _find_captions(doc) -> list[dict]:
-    """扫描整个文档，找到所有 Figure/Table caption 的位置和信息。"""
-    captions = []
+def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
+    """将相邻的同类型 box 合并为聚类。
 
-    for page_num in range(len(doc)):
-        page = doc[page_num]
-        page_width = page.rect.width
-        page_height = page.rect.height
-        blocks = page.get_text("blocks")
-
-        for block in blocks:
-            if len(block) < 5:
-                continue
-            text = str(block[4]).strip()
-            if not text:
-                continue
-
-            bx0, by0, bx1, by1 = block[0], block[1], block[2], block[3]
-            # 只取 block 第一行做匹配（避免 block 包含多段文字干扰）
-            first_line = text.split("\n")[0].strip()
-
-            m = _CAPTION_RE.match(first_line)
-            if m:
-                captions.append(
-                    {
-                        "type": "figure",
-                        "num": int(m.group(1)),
-                        "label": f"Figure {m.group(1)}",
-                        "page_num": page_num,
-                        "caption_y0": by0,
-                        "caption_y1": by1,
-                        "caption_x0": bx0,
-                        "caption_x1": bx1,
-                        "caption_text": text,
-                        "page_width": page_width,
-                        "page_height": page_height,
-                    }
-                )
-                continue
-
-            m = _TABLE_CAPTION_RE.match(first_line)
-            if m:
-                captions.append(
-                    {
-                        "type": "table",
-                        "num": int(m.group(1)),
-                        "label": f"Table {m.group(1)}",
-                        "page_num": page_num,
-                        "caption_y0": by0,
-                        "caption_y1": by1,
-                        "caption_x0": bx0,
-                        "caption_x1": bx1,
-                        "caption_text": text,
-                        "page_width": page_width,
-                        "page_height": page_height,
-                    }
-                )
-
-    return captions
-
-
-def _find_figure_top(page, caption: dict) -> float:
-    """向上扫描页面，找到 Figure 的上边界。
-
-    策略：
-    1. 优先用嵌入图片定位 — 收集 caption 上方所有相关图片 bbox，
-       按 Y 轴聚类后取最大簇的最小 y 作为上界（处理 subfigure 组合图）
-    2. 无图片时回退到文本块间隙检测（处理纯矢量图如 TikZ/matplotlib PDF）
+    用 union-find 将间距 ≤ gap 的同类型 box 归为一组，
+    每组生成一个 _BoxCluster（整体 bbox）。
     """
-    caption_y = caption["caption_y0"]
-    col_x0, col_x1 = _estimate_column_x(caption)
-    cx0 = max(col_x0, caption["caption_x0"] - _REGION_SIDE_PADDING)
-    cx1 = min(col_x1, caption["caption_x1"] + _REGION_SIDE_PADDING)
-
-    # 同页上方最近的 Figure/Table caption（多 figure 同页时截断）
-    _caption_cutoff: float | None = None
-    for b in page.get_text("blocks"):
-        if len(b) < 5:
-            continue
-        by0, by1 = b[1], b[3]
-        if by1 >= caption_y or by1 <= caption_y - _FIGURE_MAX_HEIGHT:
-            continue
-        first_line = str(b[4]).strip().split("\n")[0].strip()
-        if _CAPTION_STOP_RE.match(first_line):
-            _caption_cutoff = by0
-            break
-
-    # ── 策略 1：嵌入图片聚类定位 ──
-    # 收集 caption 上方搜索范围内所有与 caption 水平区域重叠的图片
-    image_tops: list[float] = []
-    for img_info in page.get_image_info():
-        bbox = img_info.get("bbox")
-        if bbox is None:
-            continue
-        if hasattr(bbox, "x0"):
-            ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
-        else:
-            ix0, iy0, ix1, iy1 = bbox[0], bbox[1], bbox[2], bbox[3]
-
-        # 图片底部必须在 caption 上方、且在搜索范围内
-        if not (iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT):
-            continue
-        # 图片水平范围与 caption 所在列有重叠
-        if not (ix1 > cx0 and ix0 < cx1):
-            continue
-        # 跳过属于上方另一个 figure 的图片
-        if _caption_cutoff is not None and iy0 < _caption_cutoff:
-            continue
-        # 跳过极小图标（宽度或高度 <15pt，通常是 logo/符号）
-        if (ix1 - ix0) < 15 or (iy1 - iy0) < 15:
-            continue
-
-        image_tops.append(iy0)
-
-    if image_tops:
-        # 聚类：将 Y 轴接近的图片视为同一组（subfigure），最大簇的最小 y 即图上界
-        image_tops.sort()
-        # 用简单单遍聚类：相邻图片 top 差 < 最大高度的 40% 视为同簇
-        cluster_gap = _FIGURE_MAX_HEIGHT * 0.4
-        clusters: list[list[float]] = [[image_tops[0]]]
-        for yt in image_tops[1:]:
-            if yt - clusters[-1][-1] < cluster_gap:
-                clusters[-1].append(yt)
-            else:
-                clusters.append([yt])
-        # 取最大簇（图片数最多的）的最小 y
-        biggest = max(clusters, key=len)
-        figure_top = min(biggest)
-    else:
-        # ── 策略 2：文本块间隙检测（纯矢量图） ──
-        above_blocks: list[tuple[float, float, float, float]] = []
-        for b in page.get_text("blocks"):
-            if len(b) < 5:
-                continue
-            bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
-            if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT:
-                if bx1 > cx0 and bx0 < cx1:
-                    if col_x0 > 0 and bx0 < col_x0 - _REGION_SIDE_PADDING * 2:
-                        continue
-                    above_blocks.append((bx0, by0, bx1, by1))
-
-        if not above_blocks:
-            return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)
-
-        above_blocks.sort(key=lambda b: b[1], reverse=True)
-        prev_bottom = caption_y
-        for b in above_blocks:
-            if prev_bottom - b[3] > _CONTENT_GAP_THRESHOLD:
-                figure_top = prev_bottom - 5
-                break
-            prev_bottom = b[1]
-        else:
-            figure_top = above_blocks[-1][1]
-
-    # 同页 caption 截断
-    if _caption_cutoff is not None:
-        figure_top = max(figure_top, _caption_cutoff + 5)
-
-    # 限制最大高度
-    if caption_y - figure_top > _FIGURE_MAX_HEIGHT:
-        figure_top = caption_y - _FIGURE_MAX_HEIGHT
-
-    return max(0, figure_top)
-
-
-def _find_figure_horizontal(
-    page, caption: dict, top: float, bottom: float
-) -> tuple[float, float]:
-    """确定 Figure 的水平裁剪范围。
-
-    取 caption 宽度和图片实际宽度的并集，避免截断比 caption 更宽的图。
-    """
-    pw = caption["page_width"]
-    x0 = caption["caption_x0"]
-    x1 = caption["caption_x1"]
-
-    # 收集裁剪区域内所有嵌入图片的水平范围
-    col_x0, col_x1 = _estimate_column_x(caption)
-    for img_info in page.get_image_info():
-        bbox = img_info.get("bbox")
-        if bbox is None:
-            continue
-        if hasattr(bbox, "x0"):
-            ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
-        else:
-            ix0, iy0, ix1, iy1 = bbox[0], bbox[1], bbox[2], bbox[3]
-        # 图片在裁剪区域内且在 caption 所在列
-        if iy0 < bottom and iy1 > top and ix1 > col_x0 and ix0 < col_x1:
-            if (ix1 - ix0) < 15:
-                continue  # 跳过小图标
-            x0 = min(x0, ix0)
-            x1 = max(x1, ix1)
-
-    return max(0, x0 - _REGION_SIDE_PADDING), min(pw, x1 + _REGION_SIDE_PADDING)
-
-
-def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]:
-    """向下扫描页面，找到 Table 的下边界和水平范围。
-
-    返回: (x0, bottom, x1) — 裁剪区域的左、下、右边界。
-    上边界由调用方根据 caption 位置确定。
-
-    策略：
-    1. 用 page.find_tables() 收集 caption 下方所有相邻的表格段，合并为一个完整区域
-       （学术论文表格常被拆成表头行 + 数据行等多个 find_tables 段）
-    2. 未命中时回退到文本块间隙检测
-    """
-    caption_y = caption["caption_y1"]  # caption 底部作为扫描起点
-    caption_x0 = caption["caption_x0"]
-    caption_x1 = caption["caption_x1"]
-    page_width = caption["page_width"]
-
-    # ── 策略 1: find_tables() 结构化检测 + 合并相邻段 ──
-    try:
-        tables = page.find_tables()
-    except Exception:
-        tables = None
-
-    if tables and tables.tables:
-        # 确定 caption 所在栏的范围（防止双栏论文中跨栏收集）
-        col_x0, col_x1 = _estimate_column_x(caption)
-
-        # 收集 caption 下方附近且在同一栏内的表格段 bbox
-        segments: list[tuple[float, float, float, float]] = []
-        for t in tables.tables:
-            tb = t.bbox
-            if isinstance(tb, (list, tuple)):
-                tx0, ty0, tx1, ty1 = (
-                    float(tb[0]),
-                    float(tb[1]),
-                    float(tb[2]),
-                    float(tb[3]),
-                )
-            else:
-                tx0, ty0, tx1, ty1 = (
-                    float(tb.x0),
-                    float(tb.y0),
-                    float(tb.x1),
-                    float(tb.y1),
-                )
-
-            # 表格段上边在 caption 底部附近，且与 caption 同栏
-            if (
-                ty0 >= caption_y - 5
-                and ty0 < caption_y + 200
-                and tx1 > col_x0
-                and tx0 < col_x1
-            ):
-                segments.append((tx0, ty0, tx1, ty1))
-
-        if segments:
-            # 按 y 排序，合并相邻段（gap < 30pt 视为同一表格的连续部分）
-            segments.sort(key=lambda s: s[1])
-            merged: list[tuple[float, float, float, float]] = [segments[0]]
-            for seg in segments[1:]:
-                prev = merged[-1]
-                gap = seg[1] - prev[3]  # 当前段 top - 上一段 bottom
-                if gap < 30:
-                    # 合并：取并集范围
-                    merged[-1] = (
-                        min(prev[0], seg[0]),
-                        min(prev[1], seg[1]),
-                        max(prev[2], seg[2]),
-                        max(prev[3], seg[3]),
-                    )
-                else:
-                    merged.append(seg)
-
-            # 取第一个合并段（最靠近 caption 的完整表格）
-            final = merged[0]
-            tx0, ty0, tx1, ty1 = final
-
-            # 限制最大高度
-            if ty1 - caption_y > _TABLE_MAX_HEIGHT:
-                ty1 = caption_y + _TABLE_MAX_HEIGHT
-            x0 = max(0, min(caption_x0, tx0) - _REGION_SIDE_PADDING)
-            x1 = min(page_width, max(caption_x1, tx1) + _REGION_SIDE_PADDING)
-            logger.debug(
-                "Table detected by find_tables() (%d segments merged): "
-                "(%.0f,%.0f)-(%.0f,%.0f)",
-                len(segments),
-                x0,
-                caption_y,
-                x1,
-                ty1,
-            )
-            return (x0, caption["caption_y0"], ty1, x1)
-
-    # ── 策略 2: 回退到文本块间隙检测 ──
-    x0, t_top, t_bottom, x1 = _find_table_region_by_blocks(page, caption)
-    return (x0, t_top, t_bottom, x1)
-
-
-def _scan_blocks_direction(
-    blocks: list,
-    start_y: float,
-    col_x0: float,
-    col_x1: float,
-    direction: int,
-    max_range: float,
-) -> list[tuple[float, float, float, float]]:
-    """从 start_y 向上(direction=-1)或向下(direction=1)扫描文本块。
-
-    收集间隙连续的块，遇到 stop 信号（caption / section header）或大间隙即停。
-    用 current_top/current_bottom 追踪连通区域边界，正确处理 y 重叠块。
-
-    Returns:
-        收集到的块列表 [(x0, y0, x1, y1), ...]
-    """
-    # 过滤在扫描范围内的块
-    if direction > 0:  # 向下
-        candidates = [
-            b
-            for b in blocks
-            if len(b) >= 5
-            and b[1] > start_y
-            and b[1] < start_y + max_range
-            and b[2] > col_x0
-            and b[0] < col_x1
-        ]
-        candidates.sort(key=lambda b: b[1])  # 按 y0 升序
-    else:  # 向上
-        candidates = [
-            b
-            for b in blocks
-            if len(b) >= 5
-            and b[3] <= start_y
-            and b[1] > start_y - max_range
-            and b[2] > col_x0
-            and b[0] < col_x1
-        ]
-        candidates.sort(key=lambda b: b[3], reverse=True)  # 按 y1 降序（底部离 start_y 最近的在前）
-
-    if not candidates:
+    if not boxes:
         return []
 
-    # 从 start_y 开始，追踪连通区域边界
-    connected: list[tuple[float, float, float, float]] = []
-    boundary = start_y  # 当前连通区域离 start_y 最近端的 y 坐标
-    prev_was_dense_table = False
+    n = len(boxes)
+    parent = list(range(n))
 
-    for b in candidates:
+    def find(x: int) -> int:
+        while parent[x] != x:
+            parent[x] = parent[parent[x]]
+            x = parent[x]
+        return x
+
+    def union(a: int, b: int) -> None:
+        ra, rb = find(a), find(b)
+        if ra != rb:
+            parent[ra] = rb
+
+    for i in range(n):
+        bi = boxes[i]
+        for j in range(i + 1, n):
+            bj = boxes[j]
+            if bi.boxclass != bj.boxclass:
+                continue
+            h_gap = max(0.0, max(bi.x0, bj.x0) - min(bi.x1, bj.x1))
+            v_gap = max(0.0, max(bi.y0, bj.y0) - min(bi.y1, bj.y1))
+            h_overlap = bi.x1 > bj.x0 - gap and bj.x1 > bi.x0 - gap
+            v_overlap = bi.y1 > bj.y0 - gap and bj.y1 > bi.y0 - gap
+            if (h_gap <= gap and v_overlap) or (v_gap <= gap and h_overlap):
+                union(i, j)
+
+    groups: dict[int, list] = {}
+    for i in range(n):
+        groups.setdefault(find(i), []).append(boxes[i])
+
+    return [_BoxCluster(members) for members in groups.values()]
+
+
+# ── 页面级 Caption 查找与匹配 ──────────────────────────────────────────
+
+
+def _find_page_captions(page) -> list[dict]:
+    """查找页面上所有 Figure/Table caption 文字块。"""
+    blocks = page.get_text("blocks")
+    captions = []
+    for b in blocks:
+        if len(b) < 5:
+            continue
         bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
         text = str(b[4]).strip()
         first_line = text.split("\n")[0].strip()
 
-        # stop 信号
-        if _CAPTION_STOP_RE.match(first_line) or _SECTION_STOP_RE.match(first_line):
-            break
-
-        # 检查当前块是否与连通区域相连（间隙 < 阈值）
-        if direction > 0:
-            gap = by0 - boundary
+        cap_type = None
+        m = _TABLE_CAPTION_RE.match(first_line)
+        if m:
+            cap_type = "table"
         else:
-            gap = boundary - by1
+            m = _FIGURE_CAPTION_RE.match(first_line)
+            if m:
+                cap_type = "figure"
+        if m is None:
+            continue
 
-        # 密集表格数据块后使用更低的间隙阈值
-        threshold = (
-            _TABLE_DATA_GAP_THRESHOLD
-            if prev_was_dense_table
-            else _CONTENT_GAP_THRESHOLD
+        captions.append(
+            {
+                "label": f"{'Table' if cap_type == 'table' else 'Figure'} {m.group(1)}",
+                "type": cap_type,
+                "caption_text": text,
+                "caption_y0": by0,
+                "caption_y1": by1,
+                "caption_x0": bx0,
+                "caption_x1": bx1,
+            }
         )
-        if gap > threshold:
-            break
-
-        connected.append((bx0, by0, bx1, by1))
-
-        # 更新连通区域边界
-        if direction > 0:
-            boundary = by1  # 向下扩展
-        else:
-            boundary = min(boundary, by0)  # 向上扩展
-
-        # 判断当前块是否为密集表格数据（行密度高）
-        lines = [l for l in text.split("\n") if l.strip()]
-        block_height = by1 - by0
-        prev_was_dense_table = (
-            len(lines) >= 4
-            and block_height > 0
-            and len(lines) / block_height >= 0.08
-        )
-
-    return connected
+    return captions
 
 
-def _find_table_region_by_blocks(
-    page, caption: dict
-) -> tuple[float, float, float]:
-    """文本块间隙检测 — 作为 find_tables() 的 fallback。
+def _vertical_distance(cap_y0, cap_y1, box_y0, box_y1) -> float | None:
+    """计算 caption 到 box 的垂直距离。不邻接时返回 None。
 
-    向下扫描找表格下边界，向上扫描找表格上边界（处理 caption 在数据下方）。
-    使用 _scan_blocks_direction 统一双向扫描逻辑。
+    三种情况：caption 完全在 box 上方、完全在下方、与 box 有垂直重叠。
+    重叠（含部分溢出）视为 distance=0，确保 caption 延伸到 box 边界外时不会丢失。
     """
-    blocks = page.get_text("blocks")
-    caption_y0 = caption["caption_y0"]
-    caption_y1 = caption["caption_y1"]
-    caption_x0 = caption["caption_x0"]
-    caption_x1 = caption["caption_x1"]
-    page_width = caption["page_width"]
-    page_height = caption["page_height"]
+    # Caption 完全在 box 上方
+    if cap_y1 <= box_y0:
+        dist = box_y0 - cap_y1
+        return dist if dist <= _CAPTION_MATCH_DISTANCE else None
+    # Caption 完全在 box 下方
+    if cap_y0 >= box_y1:
+        dist = cap_y0 - box_y1
+        return dist if dist <= _CAPTION_MATCH_DISTANCE else None
+    # Caption 与 box 有垂直重叠（内部、部分溢出都算）→ 距离 0
+    return 0
 
-    col_x0, col_x1 = _estimate_column_x(caption)
 
-    # 向下扫描
-    below = _scan_blocks_direction(
-        blocks, caption_y1, col_x0, col_x1, direction=1, max_range=_TABLE_MAX_HEIGHT
-    )
-    # 向上扫描
-    above = _scan_blocks_direction(
-        blocks, caption_y0, col_x0, col_x1, direction=-1, max_range=_TABLE_MAX_HEIGHT
+def _same_column(cap: dict, box, page_width: float) -> bool:
+    """判断 caption 和 box 是否在同一列。
+
+    双栏论文中左右栏间距有限，简单的水平重叠检查会跨列匹配。
+    策略：用中心 X 坐标判断各自在哪半边，只有同半边才算同列。
+    跨栏图表（caption 或 box 宽度 >65% 页宽）不受此限制。
+    """
+    cap_w = cap["caption_x1"] - cap["caption_x0"]
+    box_w = box.x1 - box.x0
+
+    # 跨栏元素：宽度超过页面的 65%
+    if cap_w > page_width * 0.65 or box_w > page_width * 0.65:
+        return True
+
+    cap_cx = (cap["caption_x0"] + cap["caption_x1"]) / 2
+    box_cx = (box.x0 + box.x1) / 2
+    mid = page_width / 2
+
+    # 同在左半边或同在右半边
+    return (cap_cx < mid) == (box_cx < mid)
+
+
+def _match_captions_to_boxes(
+    page_boxes: list, captions: list[dict], page_width: float
+) -> list[tuple[list[int], list[dict]]]:
+    """将 caption 分配给 box，允许一个 caption 匹配多个同类型 box。
+
+    典型场景：
+    - Figure 由左右两个 picture box 组成，caption 同时靠近两者
+    - Table 的视觉内容被 layout analysis 误分类为 picture，需要跨类型匹配
+
+    Returns:
+        [(box_indices, captions), ...] 每组是一个独立的渲染任务
+    """
+    # 每个 caption 找到所有距离在阈值内的 box
+    # 优先匹配同类型；如果找不到，再匹配任意 table/picture box
+    cap_to_boxes: dict[int, list[tuple[int, float]]] = {}
+
+    for ci, cap in enumerate(captions):
+        same_type: list[tuple[int, float]] = []
+        any_type: list[tuple[int, float]] = []
+        expected = "table" if cap["type"] == "table" else "picture"
+
+        for bi, box in enumerate(page_boxes):
+            # 列感知：双栏论文中只匹配同栏的 box
+            if not _same_column(cap, box, page_width):
+                continue
+            # 水平重叠检查（同列内仍需有重叠）
+            if not (
+                cap["caption_x1"] > box.x0 - 5 and cap["caption_x0"] < box.x1 + 5
+            ):
+                continue
+            dist = _vertical_distance(
+                cap["caption_y0"], cap["caption_y1"], box.y0, box.y1
+            )
+            if dist is None:
+                continue
+            entry = (bi, dist)
+            any_type.append(entry)
+            if box.boxclass == expected:
+                same_type.append(entry)
+
+        # 优先用同类型匹配；没有时回退到任意类型；都没有则跳过
+        if same_type:
+            cap_to_boxes[ci] = same_type
+        elif any_type:
+            cap_to_boxes[ci] = any_type
+        # else: 该 caption 无匹配 box，不加入 cap_to_boxes
+
+    # 每个 caption → 最近的 box（用于分组），但记录所有匹配的 box
+    cap_primary: dict[int, int] = {}  # caption → primary box index
+    cap_all_boxes: dict[int, list[int]] = {}  # caption → all matched box indices
+    for ci, matches in cap_to_boxes.items():
+        matches.sort(key=lambda x: x[1])
+        cap_primary[ci] = matches[0][0]
+        # 所有距离最近的同组 box（距离差 < 20pt 视为同一组）
+        best_dist = matches[0][1]
+        cap_all_boxes[ci] = [bi for bi, d in matches if d <= best_dist + 20]
+
+    # 按 primary box 分组
+    box_to_caps: dict[int, list[int]] = {}
+    for ci, bi in cap_primary.items():
+        box_to_caps.setdefault(bi, []).append(ci)
+
+    # 构建渲染组：每个 caption 独立成组（共享 box 但各自渲染）
+    # 同类型同 label 的 caption 会合并；不同类型则分开
+    used_captions: set[int] = set()
+    groups: list[tuple[list[int], list[dict]]] = []
+
+    for bi in sorted(box_to_caps.keys()):
+        cis = box_to_caps[bi]
+        for ci in cis:
+            if ci in used_captions:
+                continue
+            used_captions.add(ci)
+
+            all_box_indices = set(cap_all_boxes.get(ci, [bi]))
+            # 只合并同 label 的 caption（同 figure/table 的重复 caption）
+            merged_captions = [captions[ci]]
+            for other_bi in all_box_indices:
+                if other_bi in box_to_caps:
+                    for other_ci in box_to_caps[other_bi]:
+                        if other_ci not in used_captions:
+                            other_cap = captions[other_ci]
+                            if other_cap["label"] == captions[ci]["label"]:
+                                used_captions.add(other_ci)
+                                merged_captions.append(other_cap)
+            groups.append((sorted(all_box_indices), merged_captions))
+
+    return groups
+
+
+# ── 单页处理 ─────────────────────────────────────────────────────────
+
+
+def _render_and_save(
+    page,
+    clip: pymupdf.Rect,
+    images_dest: Path,
+    manifest: dict,
+    label: str,
+    cap_type: str,
+    caption_text: str,
+    page_num_1based: int,
+    arxiv_id: str,
+) -> bool:
+    """Render a page region, save it as JPEG and record it in manifest; True on success."""
+    mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
+    try:
+        pix = page.get_pixmap(matrix=mat, clip=clip)
+    except Exception:
+        logger.debug("Failed to render %s for %s", label, arxiv_id, exc_info=True)
+        return False
+
+    filename = f"{label.replace(' ', '_').lower()}.jpg"
+    (images_dest / filename).write_bytes(pix.tobytes("jpeg"))
+
+    manifest[filename] = {
+        "page": page_num_1based,
+        "type": cap_type,
+        "label": label,
+        "caption_text": caption_text[:200] if caption_text else "",
+        "figures" if cap_type == "figure" else "tables": [label],
+    }
+    logger.debug(
+        "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) → %s",
+        label,
+        page_num_1based,
+        clip.x0,
+        clip.y0,
+        clip.x1,
+        clip.y1,
+        filename,
     )
+    return True
 
-    # 确定上下边界
-    scan_top = min(b[1] for b in above) if above else caption_y0
-    scan_bottom = max(b[3] for b in below) if below else caption_y1
 
-    top = scan_top
-    bottom = scan_bottom + 5  # 底部 padding
+def _process_page(
+    doc,
+    page_idx: int,
+    page_layout,
+    images_dest: Path,
+    manifest: dict,
+    seen_labels: set,
+    arxiv_id: str,
+) -> int:
+    """Process one page: caption matching plus orphan fallback; return the count extracted."""
+    page = doc[page_idx]
+    page_width = page.rect.width
+    page_num = page_idx + 1
+    orphan_fig_counter = 0
+    orphan_tbl_counter = 0
 
-    if bottom - top > _TABLE_MAX_HEIGHT:
-        bottom = top + _TABLE_MAX_HEIGHT
+    # Collect this page's table/picture boxes (skip regions under 20pt on a side)
+    raw_boxes = []
+    for box in page_layout.boxes:
+        if box.boxclass not in ("table", "table-fallback", "picture"):
+            continue
+        if (box.x1 - box.x0) < 20 or (box.y1 - box.y0) < 20:
+            continue
+        raw_boxes.append(box)
 
-    # 水平范围：caption + 所有纳入块
-    all_blocks = above + below
-    if all_blocks:
-        content_x0 = min(caption_x0, min(b[0] for b in all_blocks))
-        content_x1 = max(caption_x1, max(b[2] for b in all_blocks))
-    else:
-        content_x0 = caption_x0
-        content_x1 = caption_x1
+    if not raw_boxes:
+        return 0
 
-    x0 = max(0, content_x0 - _REGION_SIDE_PADDING)
-    x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING)
+    # Cluster: merge fragment boxes that belong to the same figure/table
+    page_boxes = _cluster_boxes(raw_boxes)
 
-    return (x0, top, bottom, x1)
+    # Page-level matching: find every caption and assign it to boxes
+    captions = _find_page_captions(page)
+    groups = _match_captions_to_boxes(page_boxes, captions, page_width)
+
+    # Only merge groups that share a label (duplicate captions of one figure/table).
+    # Groups with different labels stay separate even if they share a box
+    # (e.g. Figure 7 and Figure 8); each is rendered with its own caption extent.
+    _merged_groups: set[int] = set()
+    merged_groups: list[tuple[list[int], list[dict]]] = []
+    for gi, (box_indices, caps) in enumerate(groups):
+        if gi in _merged_groups:
+            continue
+        this_labels = {c["label"] for c in caps}
+        all_box_set = set(box_indices)
+        merge_targets = {gi}
+        for other_gi, (other_bi, other_caps) in enumerate(groups):
+            if other_gi <= gi or other_gi in _merged_groups:
+                continue
+            other_labels = {c["label"] for c in other_caps}
+            # Merge only when both labels and boxes overlap (same figure/table)
+            if this_labels & other_labels and all_box_set & set(other_bi):
+                merge_targets.add(other_gi)
+                all_box_set |= set(other_bi)
+        all_caps = []
+        for mgi in sorted(merge_targets):
+            _merged_groups.add(mgi)
+            all_caps.extend(groups[mgi][1])
+        merged_groups.append((sorted(all_box_set), all_caps))
+    groups = merged_groups
+
+    # ── Phase 1: render figures/tables that matched a caption ──
+    matched_box_indices: set[int] = set()
+    extracted = 0
+
+    for box_indices, caps in groups:
+        matched_box_indices.update(box_indices)
+
+        # De-duplicate by label, skipping labels that were already handled
+        unique_caps = []
+        for cap in caps:
+            if cap["label"] not in seen_labels:
+                seen_labels.add(cap["label"])
+                unique_caps.append(cap)
+        if not unique_caps:
+            continue
+
+        # Union the bboxes of all associated boxes
+        bx0 = min(page_boxes[i].x0 for i in box_indices)
+        by0 = min(page_boxes[i].y0 for i in box_indices)
+        bx1 = max(page_boxes[i].x1 for i in box_indices)
+        by1 = max(page_boxes[i].y1 for i in box_indices)
+
+        # Render region: boxes + captions
+        all_cap_y0 = min(c["caption_y0"] for c in unique_caps)
+        all_cap_y1 = max(c["caption_y1"] for c in unique_caps)
+        all_cap_x0 = min(c["caption_x0"] for c in unique_caps)
+        all_cap_x1 = max(c["caption_x1"] for c in unique_caps)
+
+        top = max(0, min(by0, all_cap_y0) - _REGION_PADDING)
+        bottom = max(by1, all_cap_y1) + _REGION_PADDING
+        rx0 = max(0, min(bx0, all_cap_x0) - _REGION_PADDING)
+        rx1 = min(page_width, max(bx1, all_cap_x1) + _REGION_PADDING)
+
+        clip = pymupdf.Rect(rx0, top, rx1, bottom)
+        # Several captions may share one region (e.g. subfigures); render only once
+        jpeg_bytes = None
+        for cap in unique_caps:
+            if jpeg_bytes is None:
+                if not _render_and_save(
+                    page,
+                    clip,
+                    images_dest,
+                    manifest,
+                    cap["label"],
+                    cap["type"],
+                    cap["caption_text"],
+                    page_num,
+                    arxiv_id,
+                ):
+                    break
+                # Read back the bytes just written so later captions in this group reuse them
+                filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
+                jpeg_bytes = (images_dest / filename).read_bytes()
+                extracted += 1
+            else:
+                # A different caption over the same region (e.g. a subfigure): reuse the image
+                filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
+                (images_dest / filename).write_bytes(jpeg_bytes)
+                cap_preview = (cap["caption_text"] or "")[:200]
+                manifest[filename] = {
+                    "page": page_num,
+                    "type": cap["type"],
+                    "label": cap["label"],
+                    "caption_text": cap_preview,
+                    "figures" if cap["type"] == "figure" else "tables": [cap["label"]],
+                }
+                extracted += 1
+
+    # ── Phase 2: render figures/tables with no caption match (orphan boxes) ──
+    orphan_indices = set(range(len(page_boxes))) - matched_box_indices
+    for bi in sorted(orphan_indices):
+        box = page_boxes[bi]
+        cap_type = "figure" if box.boxclass == "picture" else "table"
+
+        if cap_type == "figure":
+            orphan_fig_counter += 1
+            label = f"Figure (p{page_num}-{orphan_fig_counter})"
+        else:
+            orphan_tbl_counter += 1
+            label = f"Table (p{page_num}-{orphan_tbl_counter})"
+
+        if label in seen_labels:
+            continue
+        seen_labels.add(label)
+
+        clip = pymupdf.Rect(
+            max(0, box.x0 - _REGION_PADDING),
+            max(0, box.y0 - _REGION_PADDING),
+            min(page_width, box.x1 + _REGION_PADDING),
+            box.y1 + _REGION_PADDING,
+        )
+        if _render_and_save(
+            page,
+            clip,
+            images_dest,
+            manifest,
+            label,
+            cap_type,
+            "",
+            page_num,
+            arxiv_id,
+        ):
+            extracted += 1
+
+    return extracted
+
+
+# ── 核心提取 ───────────────────────────────────────────────────────────
 
 
 def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
     """从 PDF 提取 Figure/Table 截图，生成 manifest。
 
-    策略：找 caption → 定位区域 → 渲染页面截图。
+    用 pymupdf4llm layout analysis 检测 table/picture 区域，
+    再通过 caption 文字确定编号，渲染为 JPEG。
 
     Args:
         arxiv_id: 论文 ID
@@ -549,8 +509,6 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
     Returns:
         提取的图片数量
     """
-    import pymupdf
-
     if pdf_path is None:
         pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
 
@@ -561,7 +519,7 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
     images_dest = paper_dir(arxiv_id) / "images"
     images_dest.mkdir(parents=True, exist_ok=True)
 
-    # 清理上次提取的旧图片，避免残留（同时清理 .png 和 .jpg）
+    # 清理上次提取的旧图片
     for old_file in images_dest.iterdir():
         if old_file.suffix.lower() in (".png", ".jpg", ".jpeg"):
             old_file.unlink()
@@ -569,94 +527,43 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
         (images_dest / "manifest.json").unlink()
 
     doc = pymupdf.open(str(pdf_path))
-    captions = _find_captions(doc)
 
-    if not captions:
-        logger.info("No Figure/Table captions found in PDF for %s", arxiv_id)
+    # layout analysis
+    try:
+        parsed = dl.parse_document(
+            doc, filename=str(pdf_path), use_ocr=dl.OCRMode.NEVER
+        )
+    except Exception:
+        logger.warning(
+            "pymupdf4llm layout analysis failed for %s", arxiv_id, exc_info=True
+        )
         doc.close()
         return 0
 
-    # 去重：同一页同一 label 可能匹配到多个 block（如正文引用 "Figure 7"）
-    # 保留每个 (type, num) 的第一个匹配（即真正的 caption）
-    seen_labels: dict[str, dict] = {}
-    for cap in captions:
-        key = cap["label"]
-        if key not in seen_labels:
-            seen_labels[key] = cap
-
-    unique_captions = list(seen_labels.values())
     extracted = 0
     manifest: dict[str, dict] = {}
+    seen_labels: set[str] = set()
 
-    zoom = 3  # 3x 渲染，保证清晰度
-
-    for cap in unique_captions:
-        page = doc[cap["page_num"]]
-
-        if cap["type"] == "figure":
-            # Figure: caption 上方是图 → 向上找图的上边界
-            top = _find_figure_top(page, cap)
-            # 上方多留 5pt 边距，确保图框边框、装饰线等不被截断
-            top = max(0, top - 5)
-            bottom = cap["caption_y1"] + 5  # 包含 caption
-            # 水平范围：取 caption 宽度和图片实际宽度的并集
-            x0, x1 = _find_figure_horizontal(page, cap, top, bottom)
-
-            height = bottom - top
-            if height < _FIGURE_MIN_HEIGHT:
-                logger.debug(
-                    "Figure %s too small (%.0fpt), skipping", cap["label"], height
-                )
-                continue
-
-        else:
-            # Table: 找表格区域（find_tables() → 块级 fallback，双向扫描）
-            x0, tbl_top, bottom, x1 = _find_table_region(page, cap)
-            top = max(0, tbl_top - 5)  # 包含 caption 及上方数据，留 5pt margin
-
-            height = bottom - top
-            if height < _TABLE_MIN_HEIGHT:
-                logger.debug(
-                    "Table %s too small (%.0fpt), skipping", cap["label"], height
-                )
-                continue
-
-        # 渲染截取
-        clip = pymupdf.Rect(x0, top, x1, bottom)
-        mat = pymupdf.Matrix(zoom, zoom)
+    for page_idx, page_layout in enumerate(parsed.pages):
         try:
-            pix = page.get_pixmap(matrix=mat, clip=clip)
+            extracted += _process_page(
+                doc,
+                page_idx,
+                page_layout,
+                images_dest=images_dest,
+                manifest=manifest,
+                seen_labels=seen_labels,
+                arxiv_id=arxiv_id,
+            )
         except Exception:
-            logger.debug("Failed to render %s region for %s", cap["label"], arxiv_id)
+            logger.warning(
+                "Failed to process page %d for %s",
+                page_idx + 1,
+                arxiv_id,
+                exc_info=True,
+            )
             continue
 
-        # 保存为 JPEG（比 PNG 小 5-10 倍，适合网络传输）
-        filename = f"{cap['label'].replace(' ', '_').lower()}.jpg"
-        jpeg_path = images_dest / filename
-        jpeg_bytes = pix.tobytes("jpeg")
-        jpeg_path.write_bytes(jpeg_bytes)
-        extracted += 1
-
-        cap_preview = cap["caption_text"][:200] if cap["caption_text"] else ""
-        manifest[filename] = {
-            "page": cap["page_num"] + 1,
-            "type": cap["type"],
-            "label": cap["label"],
-            "caption_text": cap_preview,
-            "figures" if cap["type"] == "figure" else "tables": [cap["label"]],
-        }
-        logger.debug(
-            "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s",
-            cap["label"],
-            cap["page_num"] + 1,
-            x0,
-            top,
-            x1,
-            bottom,
-            height,
-            filename,
-        )
-
     doc.close()
 
     # 保存 manifest
@@ -665,17 +572,17 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
 
     if extracted > 0:
         logger.info(
-            "Extracted %d figure/table screenshots from PDF for %s "
-            "(from %d captions found, %d unique)",
+            "Extracted %d figure/table screenshots from PDF for %s",
             extracted,
             arxiv_id,
-            len(captions),
-            len(unique_captions),
         )
 
     return extracted
 
 
+# ── 按 summary 过滤 ────────────────────────────────────────────────────
+
+
 def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
     """根据 summary 中的 figures 字段过滤提取的图片/表格。
 
diff --git a/app/services/pi_client.py b/app/services/pi_client.py
index 0a7f68c..5dce8f7 100644
--- a/app/services/pi_client.py
+++ b/app/services/pi_client.py
@@ -1,17 +1,38 @@
-"""pi CLI 调用与 JSON 提取 — 调用 pi 生成总结，从输出中提取结构化 JSON。"""
+"""pi CLI 后端 — 调用 pi 子进程生成总结。
+
+通用工具函数（prompt 构建、PDF 提取、JSON 提取、meta.json）已移至 summary_utils.py。
+"""
 
 from __future__ import annotations
 
 import asyncio
-import json
 import logging
-import re
+import uuid
 from pathlib import Path
 
 from app.config import settings
+from app.services.summary_utils import (
+    JsonNotFoundError,
+    build_prompt,
+    extract_json,
+    extract_pdf_text,
+    write_meta_json,
+)
 
 logger = logging.getLogger(__name__)
 
+# 重新导出，保持向后兼容
+__all__ = [
+    "PiTimeoutError",
+    "PiProcessError",
+    "JsonNotFoundError",
+    "call_pi",
+    "write_meta_json",
+    "extract_pdf_text",
+    "build_prompt",
+    "extract_json",
+]
+
 
 # ── 自定义异常 ──────────────────────────────────────────────────────────
 
@@ -27,201 +48,6 @@ class PiProcessError(Exception):
         super().__init__(f"pi exited with code {returncode}: {stderr[:500]}")
 
 
-class JsonNotFoundError(Exception):
-    pass
-
-
-# ── meta.json ───────────────────────────────────────────────────────────
-
-
-def write_meta_json(paper) -> Path:
-    """写入 data/papers/{arxiv_id}/meta.json，返回路径。"""
-    from app.services.pdf_downloader import paper_dir
-
-    d = paper_dir(paper.arxiv_id)
-    d.mkdir(parents=True, exist_ok=True)
-    meta_path = d / "meta.json"
-
-    authors = [a.name for a in paper.authors]
-    tags = [t.tag for t in paper.tags]
-    meta = {
-        "arxiv_id": paper.arxiv_id,
-        "title_en": paper.title_en,
-        "abstract": paper.abstract or "",
-        "published_at": paper.published_at.isoformat() if paper.published_at else None,
-        "authors": authors,
-        "tags": tags,
-        "upvotes": paper.upvotes,
-    }
-    meta_path.write_text(
-        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
-    )
-    return meta_path
-
-
-# ── PDF 文本提取 ────────────────────────────────────────────────────────
-
-
-def _trim_body(text: str, max_chars: int | None = None) -> str:
-    """去除参考文献，保留正文+附录，超长时从末尾截断。
-
-    策略：
-    1. 去掉 References/Bibliography 段落（纯引用列表，对解读无用）
-    2. 正文 + 附录全部保留
-    3. 如果指定了 max_chars 且总长超过，从末尾截断（附录靠后，优先保留正文）
-    """
-    import re
-
-    # 找 References 段落的位置（在 Appendix 之后的那个）
-    # 简单策略：找到 References 标题，如果后面没有 Appendix 就全删
-    # 如果后面还有 Appendix，只删 References 到 Appendix 之间的内容
-    ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
-    if ref_match:
-        ref_start = ref_match.start()
-        # 看 References 之后有没有 Appendix
-        after_ref = text[ref_start:]
-        app_match = re.search(
-            r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
-        )
-        if app_match:
-            # References 之后有 Appendix：只删 References 段
-            ref_end = ref_start + app_match.start()
-            text = text[:ref_start] + text[ref_end:]
-        else:
-            # References 之后没有 Appendix：删掉从 References 到结尾
-            text = text[:ref_start].rstrip()
-
-    # 去掉 Acknowledgments（对解读无用）
-    ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
-    if ack_match:
-        # 只删 Acknowledgments 本身，不删后面的内容
-        next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
-        if next_section:
-            text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
-        else:
-            text = text[:ack_match.start()].rstrip()
-
-    # 最后：如果指定了上限且超长，从末尾截断（附录在后面，正文在前面，优先保留正文）
-    if max_chars is not None and len(text) > max_chars:
-        text = text[:max_chars].rstrip()
-
-    return text
-
-
-def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
-    """用 pymupdf 提取 PDF 正文文本，保存为 .txt。
-
-    max_chars=None 时不截断，给 search/auto 模式保留完整内容。
-    """
-    import pymupdf
-
-    txt_path = pdf_path.with_suffix(".txt")
-    if txt_path.exists():
-        # 缓存优先；如果需重新提取（不同 max_chars），先删旧文件
-        return txt_path
-
-    doc = pymupdf.open(str(pdf_path))
-    raw_text = "\n\n".join(page.get_text() for page in doc)
-    doc.close()
-
-    body = _trim_body(raw_text, max_chars=max_chars)
-    txt_path.write_text(body, encoding="utf-8")
-    logger.info(
-        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
-        txt_path,
-        len(raw_text),
-        len(body),
-        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
-    )
-    return txt_path
-
-
-# ── Prompt 构建 ─────────────────────────────────────────────────────────
-
-
-def _build_prompt(
-    arxiv_id: str,
-    meta_path: Path,
-    txt_path: Path,
-    pdf_mode: str,
-    fix_errors: list[str] | None = None,
-) -> str:
-    """根据模式构建 pi prompt。
-
-    inject: 全量注入，prompt 末尾包含论文全文内容
-    search: pi 自主 read 文件，prompt 只包含工作流指令
-    """
-    json_schema = (
-        "## 必须包含以下字段（不要自创字段名）：\n"
-        '{"arxiv_id": "...", '
-        '"title_zh": "中文标题", '
-        '"one_line": "一句话概括(≤50字)", '
-        '"tags": ["标签1","标签2"], '
-        '"difficulty": "入门/进阶/前沿", '
-        '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的（50-150字）","why_matters":"为什么读懂本文需要它"}]}, '
-        '"motivation": {"problem": "详细段落：现有方法的具体问题（包含具体场景和数据）", '
-        '"goal": "详细段落：本文的具体目标", '
-        '"gap": "详细段落：本文的独特切入角度"}, '
-        '"method": {"overview": "详细段落：方法整体思路（先直觉再技术路线）", '
-        '"key_idea": "详细段落：核心创新点（和已有方法的本质区别）", '
-        '"steps": "详细段落：方法步骤的完整描述（每步的输入输出和具体操作）", '
-        '"novelty": "详细段落：技术新颖性分析"}, '
-        '"results": {"main_findings": "详细段落：核心发现（带具体数字和指标，逐一分析每个实验）", '
-        '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
-        '"limitations": "详细段落：局限性分析（作者承认的+你自己的观察")}, '
-        '"improvements": {"weaknesses": "详细段落：独立分析的弱点（具体场景，每个弱点给改进方向）", '
-        '"future_work": "详细段落：未来研究方向（作者提出的+基于成果可延伸的）", '
-        '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度")}, '
-        '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
-        '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
-        "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
-        "section 必须是 motivation/method/results/limitations 之一，表示该图最适合展示在哪个章节。"
-        "}"
-    )
-
-    writing_requirements = (
-        "## 写作要求\n"
-        "- 每个字符串字段必须写成详细段落（200-500字），不要用列表或数组\n"
-        "- 必须包含论文中的具体数据、数字、实验指标\n"
-        "- 像资深同事给同事讲论文一样，专业但易懂\n"
-        "- 数学公式、符号、变量必须使用 LaTeX 格式：行内公式用 $...$，独立公式用 $$...$$\n"
-        "  例如：损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$，学习率 $\\eta$\n"
-    )
-
-    if fix_errors:
-        error_list = "\n".join(f"- {e}" for e in fix_errors)
-        return (
-            "你之前生成的 JSON 存在以下问题，请修正后重新用 write_file 保存到 "
-            f"data/papers/{arxiv_id}/summary.json：\n\n"
-            f"{error_list}\n\n"
-            "注意：所有字符串字段必须是详细段落（≥50字），不能是数组或列表。"
-            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
-        )
-
-    if pdf_mode == "search":
-        return (
-            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
-            "## 工作流程\n"
-            f"1. 先用 read 工具读取 {meta_path} 了解论文元信息（标题、作者、摘要）\n"
-            f"2. 再用 read 工具阅读 {txt_path}（论文正文全文），可以多次读取定位关键段落\n"
-            f"3. 充分理解后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
-            + writing_requirements
-            + "\n"
-            + json_schema
-        )
-    else:
-        return (
-            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
-            "## 工作流程\n"
-            "论文元信息和正文全文已在上文提供，请仔细阅读。\n"
-            f"1. 充分理解论文后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
-            "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
-            + writing_requirements
-            + "\n"
-            + json_schema
-        )
-
-
 # ── pi CLI 调用 ────────────────────────────────────────────────────────
 
 
@@ -264,12 +90,10 @@ async def call_pi(
         txt_path.write_text(trimmed, encoding="utf-8")
         logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed))
 
-    prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
+    prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
 
     # 构建 session ID（每篇论文一个独立 session）
     if session_id is None:
-        import uuid
-
         session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
 
     # 工具列表：search 模式需要 read 工具
@@ -297,6 +121,9 @@ async def call_pi(
         arxiv_id, bool(fix_errors), session_id, actual_mode,
     )
 
+    import time as _time
+    _t_sub_start = _time.monotonic()
+
     proc = await asyncio.create_subprocess_exec(
         *cmd,
         stdout=asyncio.subprocess.PIPE,
@@ -312,69 +139,22 @@ async def call_pi(
         await proc.wait()
         raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s")
 
+    _t_sub_end = _time.monotonic()
+
+    # Report the size of summary.json (if present) alongside the subprocess timing
+    _summary_file = pdf_path.parent / "summary.json"
+    _file_info = ""
+    if _summary_file.exists():
+        # Only the size is needed for the log line
+        _file_size = _summary_file.stat().st_size
+        _file_info = f"  summary.json={_file_size}B"
+
+    logger.info(
+        "pi subprocess for %s: %.2fs%s",
+        arxiv_id, _t_sub_end - _t_sub_start, _file_info,
+    )
+
     if proc.returncode != 0:
         raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
 
     return stdout.decode("utf-8", errors="replace"), session_id
-
-
-# ── JSON 提取 ──────────────────────────────────────────────────────────
-
-
-def extract_json(raw_output: str) -> dict:
-    """从 pi 输出中提取 JSON dict。三步策略：直接解析 → 代码块 → 最大花括号块。"""
-    # 策略 1：整体直接解析
-    stripped = raw_output.strip()
-    try:
-        result = json.loads(stripped)
-        if isinstance(result, dict) and "title_zh" in result:
-            return result
-    except json.JSONDecodeError:
-        pass
-
-    # 策略 2：提取 ```json ... ``` 代码块
-    fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL)
-    for match in fence_pattern.finditer(raw_output):
-        try:
-            result = json.loads(match.group(1).strip())
-            if isinstance(result, dict) and "title_zh" in result:
-                return result
-        except json.JSONDecodeError:
-            continue
-
-    # 策略 3：匹配包含 title_zh 的最大 {...} 块
-    brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL)
-    for match in brace_pattern.finditer(raw_output):
-        try:
-            return json.loads(match.group(0))
-        except json.JSONDecodeError:
-            continue
-
-    # 更宽松：找到最大的 { ... } 平衡块
-    best = None
-    best_len = 0
-    for i, ch in enumerate(raw_output):
-        if ch != "{":
-            continue
-        depth = 0
-        for j in range(i, len(raw_output)):
-            if raw_output[j] == "{":
-                depth += 1
-            elif raw_output[j] == "}":
-                depth -= 1
-            if depth == 0:
-                candidate = raw_output[i : j + 1]
-                if len(candidate) > best_len:
-                    try:
-                        parsed = json.loads(candidate)
-                        if isinstance(parsed, dict):
-                            best = parsed
-                            best_len = len(candidate)
-                    except json.JSONDecodeError:
-                        pass
-                break
-
-    if best is not None:
-        return best
-
-    raise JsonNotFoundError("no JSON object found in pi output")
diff --git a/app/services/summarizer.py b/app/services/summarizer.py
index 9b1c6bd..8de9383 100644
--- a/app/services/summarizer.py
+++ b/app/services/summarizer.py
@@ -29,14 +29,19 @@ from app.services.pdf_downloader import (
     download_pdf,
     paper_dir,
 )
-from app.services.pi_client import (
+from app.services.summary_utils import (
     JsonNotFoundError,
+    build_prompt,
+    extract_json,
+    write_meta_json,
+    extract_pdf_text,
+)
+from app.services.pi_client import (
     PiProcessError,
     PiTimeoutError,
     call_pi,
-    extract_json,
-    write_meta_json,
 )
+from app.services import claude_backend
 from app.services.schemas import (
     SummarySchema,
     assess_quality,
@@ -229,7 +234,6 @@ def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) ->
 async def summarize_one(
     db: Session,
     paper: Paper,
-    semaphore: asyncio.Semaphore | None = None,
     *,
     force: bool = False,
     pdf_mode: str = "auto",
@@ -257,68 +261,128 @@ async def summarize_one(
             "reason": "permanent_failure",
         }
 
-    if semaphore:
-        await semaphore.acquire()
-    try:
-        return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
-    finally:
-        if semaphore:
-            semaphore.release()
+    return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
 
 
 async def _generate_with_retry(
     arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto"
 ) -> tuple[dict, str]:
-    """调用 pi CLI 生成总结，最多 4 轮验证循环。
+    """调用 AI 后端生成总结，最多 4 轮验证循环。
+
+    根据 settings.SUMMARY_BACKEND 选择 pi 或 claude 后端。
 
     Returns:
         (json_data, raw_output)
     Raises:
         ValueError: 4 轮验证仍未通过
     """
+    import time as _time
+
+    backend = settings.SUMMARY_BACKEND
     validation_errors: list[str] = []
     json_data: dict | None = None
     raw_output = ""
     session_id = None
 
+    summary_file = paper_dir(arxiv_id) / "summary.json"
+
+    # claude 后端需要预构建 prompt（pi 后端在 call_pi 内部构建）
+    claude_prompt: str | None = None
+    if backend == "claude":
+        _t0 = _time.monotonic()
+        txt_path = extract_pdf_text(pdf_path, max_chars=None)
+        body = txt_path.read_text(encoding="utf-8")
+        if len(body) > 80_000:
+            trimmed = body[:80_000].rstrip()
+            txt_path.write_text(trimmed, encoding="utf-8")
+        claude_prompt = build_prompt(arxiv_id, meta_path, txt_path, "inject", None)
+        logger.info("  [%s] 构建prompt: %.2fs", arxiv_id, _time.monotonic() - _t0)
+
     for attempt in range(1, 5):
-        # 清理上一轮 pi 写的不完整文件
-        stale = paper_dir(arxiv_id) / "summary.json"
-        if stale.exists():
-            stale.unlink()
+        # 清理上一轮写入的不完整文件
+        if summary_file.exists():
+            summary_file.unlink()
 
-        if attempt == 1:
-            raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
+        # 记录 AI 调用开始时间
+        _t_call_start = _time.monotonic()
+
+        if backend == "claude":
+            if attempt == 1:
+                raw_output, session_id = await claude_backend.call_claude(
+                    claude_prompt, session_id=None,
+                )
+            else:
+                retry_prompt = build_prompt(
+                    arxiv_id, meta_path,
+                    extract_pdf_text(pdf_path, max_chars=80000),
+                    "inject", fix_errors=validation_errors,
+                )
+                raw_output, session_id = await claude_backend.call_claude(
+                    retry_prompt, session_id=session_id, fix_errors=validation_errors,
+                )
         else:
-            raw_output, session_id = await call_pi(
-                meta_path, pdf_path,
-                fix_errors=validation_errors,
-                session_id=session_id,
-                pdf_mode=pdf_mode,
-            )
+            if attempt == 1:
+                raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
+            else:
+                raw_output, session_id = await call_pi(
+                    meta_path, pdf_path,
+                    fix_errors=validation_errors,
+                    session_id=session_id,
+                    pdf_mode=pdf_mode,
+                )
 
-        # 优先读取 pi 写入的 summary.json，否则从 stdout 提取
-        summary_file = paper_dir(arxiv_id) / "summary.json"
+        _t_call_end = _time.monotonic()
+
+        # 检查 summary.json 是否由 AI 子进程写入
+        file_written_by_ai = summary_file.exists()
+        file_mtime = summary_file.stat().st_mtime if file_written_by_ai else None
+        file_size = summary_file.stat().st_size if file_written_by_ai else 0
+
+        logger.info(
+            "  [%s] attempt %d AI调用: %.2fs  summary.json=%s%s",
+            arxiv_id, attempt,
+            _t_call_end - _t_call_start,
+            f"已写入({file_size}B)" if file_written_by_ai else "未写入",
+            f" mtime={file_mtime:.2f}" if file_mtime else "",
+        )
+
+        # 提取 JSON
+        _t_json_start = _time.monotonic()
         try:
-            if summary_file.exists():
+            if file_written_by_ai:
                 json_data = json.loads(summary_file.read_text(encoding="utf-8"))
-                logger.info("Read summary.json written by pi for %s", arxiv_id)
+                logger.info("  [%s] 从AI写入的summary.json读取", arxiv_id)
             else:
                 json_data = extract_json(raw_output)
         except (json.JSONDecodeError, JsonNotFoundError) as exc:
+            _t_json_end = _time.monotonic()
             logger.warning(
-                "JSON extraction failed for %s (attempt %d): %s",
-                arxiv_id, attempt, str(exc)[:200],
+                "  [%s] JSON提取失败: %.2fs  %s",
+                arxiv_id, _t_json_end - _t_json_start, str(exc)[:200],
             )
             validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
             continue
+        _t_json_end = _time.monotonic()
 
+        # 验证
+        _t_val_start = _time.monotonic()
         validation_errors = _validate_summary(json_data, arxiv_id)
+        _t_val_end = _time.monotonic()
+
         if not validation_errors:
+            logger.info(
+                "  [%s] JSON提取: %.2fs  验证: %.2fs  ✅",
+                arxiv_id,
+                _t_json_end - _t_json_start,
+                _t_val_end - _t_val_start,
+            )
             break
         logger.warning(
-            "Validation failed for %s (attempt %d): %s",
-            arxiv_id, attempt, "; ".join(validation_errors),
+            "  [%s] JSON提取: %.2fs  验证: %.2fs  ❌ %s",
+            arxiv_id,
+            _t_json_end - _t_json_start,
+            _t_val_end - _t_val_start,
+            "; ".join(validation_errors)[:200],
         )
 
     if validation_errors:
@@ -335,11 +399,19 @@ def _persist_summary(
     db: Session, paper: Paper, json_data: dict, raw_output: str
 ) -> str:
     """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
+    import time as _time
+    arxiv_id = paper.arxiv_id
+
+    _t0 = _time.monotonic()
     schema = SummarySchema.model_validate(json_data)
     quality = assess_quality(schema)
+    _t1 = _time.monotonic()
+
+    _save_files(arxiv_id, schema, raw_output)
+    _t2 = _time.monotonic()
 
-    _save_files(paper.arxiv_id, schema, raw_output)
     _update_summary_in_db(db, paper, schema, quality, raw_output)
+    _t3 = _time.monotonic()
 
     # 状态 → done
     paper.summary_status.status = SummaryState.DONE
@@ -347,10 +419,30 @@ def _persist_summary(
     paper.summary_status.completed_at = utc_now()
     paper.summary_status.raw_output_saved = True
     db.commit()
+    _t4 = _time.monotonic()
+
+    logger.info(
+        "  [%s] persist: pydantic=%.2fs  文件=%.2fs  DB写入=%.2fs  状态commit=%.2fs",
+        arxiv_id,
+        _t1 - _t0,
+        _t2 - _t1,
+        _t3 - _t2,
+        _t4 - _t3,
+    )
 
     # 触发性增强（失败不影响总结）
-    _maybe_extract_images(paper.arxiv_id, schema)
-    _maybe_index_chroma(paper.arxiv_id, paper, schema)
+    _t5 = _time.monotonic()
+    _maybe_extract_images(arxiv_id, schema)
+    _t6 = _time.monotonic()
+    _maybe_index_chroma(arxiv_id, paper, schema)
+    _t7 = _time.monotonic()
+
+    logger.info(
+        "  [%s] 后处理: 图片提取=%.2fs  ChromaDB=%.2fs",
+        arxiv_id,
+        _t6 - _t5,
+        _t7 - _t6,
+    )
 
     return quality
 
@@ -445,28 +537,47 @@ async def _do_summarize_one(
 ) -> dict:
     """实际的单篇总结执行（在 semaphore 保护下）。"""
     arxiv_id = paper.arxiv_id
+    title_short = (paper.title_en or "")[:50]
 
     # 状态 → processing
     paper.summary_status.status = SummaryState.PROCESSING
     paper.summary_status.started_at = utc_now()
     db.commit()
 
+    logger.info("▶ [%s] 开始总结: %s", arxiv_id, title_short)
+
     # 清理旧的图片文件和 figures_json，避免重新总结时残留
+    import time as _time
+    _t_cleanup_start = _time.monotonic()
     _cleanup_old_images(db, paper)
+    _t_cleanup_end = _time.monotonic()
+    logger.info("  [%s] 清理旧数据: %.2fs", arxiv_id, _t_cleanup_end - _t_cleanup_start)
 
     raw_output = ""
     try:
-        meta_path = write_meta_json(paper)
-        await download_pdf(arxiv_id, paper.pdf_url)
+        _t0 = _time.monotonic()
 
+        meta_path = write_meta_json(paper)
+        _t1 = _time.monotonic()
+        logger.info("  [%s] meta.json: %.2fs", arxiv_id, _t1 - _t0)
+
+        await download_pdf(arxiv_id, paper.pdf_url)
+        _t2 = _time.monotonic()
+        logger.info("  [%s] 下载PDF: %.2fs", arxiv_id, _t2 - _t1)
+
+        logger.info("  [%s] 调用 pi 生成总结...", arxiv_id)
         json_data, raw_output = await _generate_with_retry(
             arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
             pdf_mode=pdf_mode,
         )
+        _t3 = _time.monotonic()
+        logger.info("  [%s] pi生成: %.2fs", arxiv_id, _t3 - _t2)
 
         quality = _persist_summary(db, paper, json_data, raw_output)
+        _t4 = _time.monotonic()
+        logger.info("  [%s] 持久化: %.2fs", arxiv_id, _t4 - _t3)
 
-        logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
+        logger.info("✅ [%s] 完成: quality=%s  总耗时: %.2fs", arxiv_id, quality, _t4 - _t0)
         return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
 
     except Exception as exc:
@@ -588,42 +699,67 @@ async def summarize_batch(
                 "total": 0,
             }
 
-        # 并发控制
-        semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY)
+        # 并发控制：worker 模式，避免 573 个协程同时打开 DB 连接耗尽连接池
+        concurrency = settings.SUMMARY_CONCURRENCY
         make_session = _session_factory or SessionLocal
 
-        async def _process_paper(paper: Paper) -> dict:
-            paper_db = make_session()
-            try:
-                p = paper_db.execute(
-                    select(Paper)
-                    .where(Paper.id == paper.id)
-                    .options(*PAPER_DEFAULT_LOAD)
-                ).unique().scalar_one_or_none()
-                return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode)
-            finally:
-                paper_db.close()
+        # 进度追踪
+        progress = {"done": 0, "failed": 0, "skipped": 0}
+        paper_queue: asyncio.Queue[Paper | None] = asyncio.Queue()
+        for p in papers:
+            paper_queue.put_nowait(p)
 
-        results = await asyncio.gather(
-            *[_process_paper(p) for p in papers],
+        async def _worker() -> list[dict]:
+            results: list[dict] = []
+            while True:
+                paper = paper_queue.get_nowait() if not paper_queue.empty() else None
+                if paper is None:
+                    break
+                paper_db = make_session()
+                try:
+                    p = paper_db.execute(
+                        select(Paper)
+                        .where(Paper.id == paper.id)
+                        .options(*PAPER_DEFAULT_LOAD)
+                    ).unique().scalar_one_or_none()
+                    result = await summarize_one(paper_db, p, pdf_mode=pdf_mode)
+                    status = result.get("status", "failed")
+                    progress[status] = progress.get(status, 0) + 1
+                    finished = sum(progress.values())
+                    logger.info(
+                        "📊 进度: %d/%d (✅%d ❌%d ⏭️%d) — %s",
+                        finished, total,
+                        progress["done"], progress["failed"], progress["skipped"],
+                        paper.arxiv_id,
+                    )
+                    results.append(result)
+                except Exception as exc:
+                    logger.error("Worker error: %s", exc)
+                    results.append({"status": "failed", "error": str(exc)})
+                finally:
+                    paper_db.close()
+            return results
+
+        worker_results = await asyncio.gather(
+            *[_worker() for _ in range(concurrency)],
             return_exceptions=True,
         )
+        results = []
+        for r in worker_results:
+            if isinstance(r, Exception):
+                logger.error("Unexpected error in batch: %s", r)
+                results.append(r)
+            elif isinstance(r, list):
+                results.extend(r)
 
-        # 统计结果
-        done = 0
-        failed = 0
-        skipped = 0
+        # 统计结果（progress 已在 worker 中实时更新）
+        done = progress["done"]
+        failed = progress["failed"]
+        skipped = progress["skipped"]
         for r in results:
             if isinstance(r, Exception):
                 logger.error("Unexpected error in batch: %s", r)
                 failed += 1
-            elif isinstance(r, dict):
-                if r.get("status") == "done":
-                    done += 1
-                elif r.get("status") == "skipped":
-                    skipped += 1
-                else:
-                    failed += 1
 
         log_entry.status = "success" if failed == 0 else "failed"
         log_entry.papers_found = total
diff --git a/app/services/summary_utils.py b/app/services/summary_utils.py
new file mode 100644
index 0000000..cfd2c53
--- /dev/null
+++ b/app/services/summary_utils.py
@@ -0,0 +1,270 @@
+"""总结工具函数 — PDF 文本提取、prompt 构建、JSON 提取、meta.json 写入。
+
+与后端无关的通用逻辑，pi 和 claude 后端共享。
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+# ── 自定义异常 ──────────────────────────────────────────────────────────
+
+
+class JsonNotFoundError(Exception):
+    """Raised when no JSON object can be found in the raw output text."""
+
+
+# ── meta.json ───────────────────────────────────────────────────────────
+
+
+def write_meta_json(paper) -> Path:
+    """Write ``meta.json`` into the paper's directory and return its path.
+
+    The directory is ``paper_dir(paper.arxiv_id)`` and is created on demand;
+    the file is UTF-8, 2-space indented JSON with non-ASCII text kept as-is.
+    """
+    from app.services.pdf_downloader import paper_dir
+
+    target_dir = paper_dir(paper.arxiv_id)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    published = paper.published_at.isoformat() if paper.published_at else None
+    payload = {
+        "arxiv_id": paper.arxiv_id,
+        "title_en": paper.title_en,
+        "abstract": paper.abstract or "",
+        "published_at": published,
+        "authors": [author.name for author in paper.authors],
+        "tags": [entry.tag for entry in paper.tags],
+        "upvotes": paper.upvotes,
+    }
+    out_path = target_dir / "meta.json"
+    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+    return out_path
+
+
+# ── PDF 文本提取 ────────────────────────────────────────────────────────
+
+
+def _trim_body(text: str, max_chars: int | None = None) -> str:
+    """Drop the reference list and acknowledgments, then optionally truncate.
+
+    1. A whole-line ``References``/``Bibliography``/``参考文献`` heading starts a
+       span removed up to the next appendix heading, or to the end of the text
+       when none follows.
+    2. A whole-line ``Acknowledgment(s)``/``致谢`` heading is removed the same way.
+    3. If ``max_chars`` is set and the result is longer, the tail is cut off, so
+       the body (which precedes any appendix) is what survives.
+    """
+    ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
+    if ref_match:
+        ref_start = ref_match.start()
+        app_match = re.search(
+            r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$",
+            text[ref_start:],
+        )
+        if app_match:
+            text = text[:ref_start] + text[ref_start + app_match.start():]
+        else:
+            text = text[:ref_start].rstrip()
+
+    ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
+    if ack_match:
+        ack_start = ack_match.start()
+        # Also accept "A Appendix" / "A Supplementary", as the references step
+        # does; without it such an appendix was deleted with the acknowledgments.
+        next_section = re.search(
+            r"(?m)^(?:A\s+(?:Appendix|Supplementary)|A\s|Appendix|Supplementary|附录)\s*$",
+            text[ack_start:],
+        )
+        if next_section:
+            text = text[:ack_start] + text[ack_start + next_section.start():]
+        else:
+            text = text[:ack_start].rstrip()
+
+    if max_chars is not None and len(text) > max_chars:
+        text = text[:max_chars].rstrip()
+
+    return text
+
+
+def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
+    """Extract the PDF's text with pymupdf into a sibling ``.txt`` and return it.
+
+    An existing ``.txt`` is returned untouched, so ``max_chars`` only applies
+    on first extraction; delete the file to re-extract with another limit.
+    """
+    import pymupdf
+
+    txt_path = pdf_path.with_suffix(".txt")
+    if txt_path.exists():
+        return txt_path
+
+    # The context manager closes the document even when a page fails to
+    # extract; sort=True has pymupdf order the text by position on the page.
+    with pymupdf.open(str(pdf_path)) as doc:
+        raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)
+
+    body = _trim_body(raw_text, max_chars=max_chars)
+    txt_path.write_text(body, encoding="utf-8")
+    logger.info(
+        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
+        txt_path,
+        len(raw_text),
+        len(body),
+        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
+    )
+    return txt_path
+
+
+# ── Prompt 构建 ─────────────────────────────────────────────────────────
+
+
+def build_prompt(
+    arxiv_id: str,
+    meta_path: Path,
+    txt_path: Path,
+    pdf_mode: str,
+    fix_errors: list[str] | None = None,
+) -> str:
+    """Build the prompt for a fresh summary, or a repair prompt on retry.
+
+    ``search``: the prompt tells the model to read meta_path and txt_path itself.
+    Otherwise: it says the text was supplied earlier. fix_errors overrides both.
+    """
+    json_schema = (  # schema example shown to the model
+        "## 必须包含以下字段（不要自创字段名）：\n"
+        '{"arxiv_id": "...", '
+        '"title_zh": "中文标题", '
+        '"one_line": "一句话概括(≤50字)", '
+        '"tags": ["标签1","标签2"], '
+        '"difficulty": "入门/进阶/前沿", '
+        '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的（50-150字）","why_matters":"为什么读懂本文需要它"}]}, '
+        '"motivation": {"problem": "详细段落：现有方法的具体问题（包含具体场景和数据）", '
+        '"goal": "详细段落：本文的具体目标", '
+        '"gap": "详细段落：本文的独特切入角度"}, '
+        '"method": {"overview": "详细段落：方法整体思路（先直觉再技术路线）", '
+        '"key_idea": "详细段落：核心创新点（和已有方法的本质区别）", '
+        '"steps": "详细段落：方法步骤的完整描述（每步的输入输出和具体操作）", '
+        '"novelty": "详细段落：技术新颖性分析"}, '
+        '"results": {"main_findings": "详细段落：核心发现（带具体数字和指标，逐一分析每个实验）", '
+        '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
+        '"limitations": "详细段落：局限性分析（作者承认的+你自己的观察")}, '
+        '"improvements": {"weaknesses": "详细段落：独立分析的弱点（具体场景，每个弱点给改进方向）", '
+        '"future_work": "详细段落：未来研究方向（作者提出的+基于成果可延伸的）", '
+        '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度")}, '
+        '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
+        '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
+        "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
+        "section 必须是 motivation/method/results/limitations 之一，表示该图最适合展示在哪个章节。"
+        "}"
+    )
+
+    writing_requirements = (
+        "## 写作要求\n"
+        "- 每个字符串字段必须写成详细段落（200-500字），不要用列表或数组\n"
+        "- 必须包含论文中的具体数据、数字、实验指标\n"
+        "- 像资深同事给同事讲论文一样，专业但易懂\n"
+        "- 数学公式、符号、变量必须使用 LaTeX 格式：行内公式用 $...$，独立公式用 $$...$$\n"
+        "  例如：损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$，学习率 $\\eta$\n"
+    )
+
+    if fix_errors:  # retry: return only the repair prompt listing the errors
+        error_list = "\n".join(f"- {e}" for e in fix_errors)
+        return (
+            "你之前生成的 JSON 存在以下问题，请修正后重新用 write_file 保存到 "
+            f"data/papers/{arxiv_id}/summary.json：\n\n"
+            f"{error_list}\n\n"
+            "注意：所有字符串字段必须是详细段落（≥50字），不能是数组或列表。"
+            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
+        )
+
+    if pdf_mode == "search":
+        return (
+            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
+            "## 工作流程\n"
+            f"1. 先用 read 工具读取 {meta_path} 了解论文元信息（标题、作者、摘要）\n"
+            f"2. 再用 read 工具阅读 {txt_path}（论文正文全文），可以多次读取定位关键段落\n"
+            f"3. 充分理解后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
+            + writing_requirements
+            + "\n"
+            + json_schema
+        )
+    else:
+        return (
+            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
+            "## 工作流程\n"
+            "论文元信息和正文全文已在上文提供，请仔细阅读。\n"
+            f"1. 充分理解论文后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
+            "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
+            + writing_requirements
+            + "\n"
+            + json_schema
+        )
+
+
+# ── JSON 提取 ──────────────────────────────────────────────────────────
+
+
+def extract_json(raw_output: str) -> dict:
+    """Extract the summary JSON object from raw output text.
+
+    Strategies are tried in order:
+    1. parse the whole (stripped) output;
+    2. parse each ``` / ```json fenced block;
+    3. parse each flat ``{...}`` span that contains ``"title_zh"``;
+    4. decode a JSON object at every ``{`` and keep the longest one.
+    Strategies 1-2 only accept a dict that has a ``title_zh`` key.
+
+    Raises:
+        JsonNotFoundError: if no strategy yields a JSON object.
+    """
+    # Strategy 1: the output is nothing but the JSON document.
+    stripped = raw_output.strip()
+    try:
+        result = json.loads(stripped)
+        if isinstance(result, dict) and "title_zh" in result:
+            return result
+    except json.JSONDecodeError:
+        pass
+
+    # Strategy 2: fenced code blocks.
+    fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL)
+    for match in fence_pattern.finditer(raw_output):
+        try:
+            result = json.loads(match.group(1).strip())
+            if isinstance(result, dict) and "title_zh" in result:
+                return result
+        except json.JSONDecodeError:
+            continue
+
+    # Strategy 3: a span with no nested braces that mentions "title_zh".
+    brace_pattern = re.compile(r"\{[^{}]*\"title_zh\"[^{}]*\}", re.DOTALL)
+    for match in brace_pattern.finditer(raw_output):
+        try:
+            return json.loads(match.group(0))
+        except json.JSONDecodeError:
+            continue
+
+    # Strategy 4: longest decodable object. raw_decode() understands string
+    # literals, so a lone "{" or "}" inside a value cannot derail the scan.
+    decoder = json.JSONDecoder()
+    best, best_len = None, 0
+    start = raw_output.find("{")
+    while start != -1:
+        try:
+            parsed, end = decoder.raw_decode(raw_output, start)
+        except json.JSONDecodeError:
+            parsed, end = None, start
+        if isinstance(parsed, dict) and end - start > best_len:
+            best, best_len = parsed, end - start
+        start = raw_output.find("{", start + 1)
+
+    if best is not None:
+        return best
+
+    raise JsonNotFoundError("no JSON object found in output")
diff --git a/pyproject.toml b/pyproject.toml
index 2c0c9a8..1001609 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ dependencies = [
     "fastapi>=0.115",
     "uvicorn[standard]>=0.34",
     "sqlalchemy>=2.0",
-    "httpx>=0.28",
+    "httpx[http2]>=0.28",
     "jinja2>=3.1",
     "python-multipart>=0.0.18",
     "pydantic>=2.0",
@@ -19,6 +19,7 @@ dependencies = [
     "pymupdf>=1.25",
     "itsdangerous>=2.2.0",
     "bleach>=6.4.0",
+    "pymupdf4llm>=1.27.2.3",
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/validate_summary.py b/scripts/validate_summary.py
index 226b139..0bfcc5f 100644
--- a/scripts/validate_summary.py
+++ b/scripts/validate_summary.py
@@ -1,117 +1,144 @@
-"""验证 summary JSON 是否符合 SummarySchema 要求。
-
-用法：python scripts/validate_summary.py <json_file>
-返回：exit 0 = 通过，exit 1 = 失败（错误信息输出到 stdout）
-"""
-
 import json
 import sys
-from pathlib import Path
 
+schema = {
+    "type": "object",
+    "required": ["arxiv_id", "title_zh", "one_line", "tags", "difficulty", 
+                 "prerequisites", "motivation", "method", "results", "improvements", "figures"],
+    "properties": {
+        "arxiv_id": {"type": "string"},
+        "title_zh": {"type": "string"},
+        "one_line": {"type": "string"},
+        "tags": {"type": "array", "items": {"type": "string"}},
+        "difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]},
+        "prerequisites": {
+            "type": "object",
+            "required": ["concepts"],
+            "properties": {
+                "concepts": {"type": "array", "items": {
+                    "type": "object",
+                    "required": ["term", "explanation", "why_matters"],
+                    "properties": {
+                        "term": {"type": "string"},
+                        "explanation": {"type": "string"},
+                        "why_matters": {"type": "string"}
+                    }
+                }}
+            }
+        },
+        "motivation": {
+            "type": "object",
+            "required": ["problem", "goal", "gap"],
+            "properties": {
+                "problem": {"type": "string"},
+                "goal": {"type": "string"},
+                "gap": {"type": "string"}
+            }
+        },
+        "method": {
+            "type": "object",
+            "required": ["overview", "key_idea", "steps", "novelty"],
+            "properties": {
+                "overview": {"type": "string"},
+                "key_idea": {"type": "string"},
+                "steps": {"type": "string"},
+                "novelty": {"type": "string"}
+            }
+        },
+        "results": {
+            "type": "object",
+            "required": ["main_findings", "benchmarks", "limitations"],
+            "properties": {
+                "main_findings": {"type": "string"},
+                "benchmarks": {"type": "array", "items": {
+                    "type": "object",
+                    "required": ["task", "metric", "this_work", "baseline", "improvement"],
+                    "properties": {
+                        "task": {"type": "string"},
+                        "metric": {"type": "string"},
+                        "this_work": {"type": "string"},
+                        "baseline": {"type": "string"},
+                        "improvement": {"type": "string"}
+                    }
+                }},
+                "limitations": {"type": "string"}
+            }
+        },
+        "improvements": {
+            "type": "object",
+            "required": ["weaknesses", "future_work", "reproducibility"],
+            "properties": {
+                "weaknesses": {"type": "string"},
+                "future_work": {"type": "string"},
+                "reproducibility": {"type": "string"}
+            }
+        },
+        "figures": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "required": ["id", "caption", "description", "reason", "section"],
+                "properties": {
+                    "id": {"type": "string"},
+                    "caption": {"type": "string"},
+                    "description": {"type": "string"},
+                    "reason": {"type": "string"},
+                    "section": {"type": "string", "enum": ["motivation", "method", "results", "limitations"]}
+                }
+            }
+        }
+    }
+}
 
-def validate(path: str) -> list[str]:
-    errors: list[str] = []
+def validate_file(filepath):
     try:
-        data = json.loads(Path(path).read_text(encoding="utf-8"))
+        with open(filepath, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        
+        # Check required fields
+        for field in schema["required"]:
+            if field not in data:
+                print(f"❌ Missing field: {field}")
+                return False
+        
+        # Validate nested structure
+        for field, spec in schema["properties"].items():
+            if field in data:
+                if spec["type"] == "string":
+                    if not isinstance(data[field], str):
+                        print(f"❌ Field '{field}' should be string")
+                        return False
+                elif spec["type"] == "array":
+                    if not isinstance(data[field], list):
+                        print(f"❌ Field '{field}' should be array")
+                        return False
+                elif spec["type"] == "object":
+                    if not isinstance(data[field], dict):
+                        print(f"❌ Field '{field}' should be object")
+                        return False
+                    if "required" in spec:
+                        for subfield in spec["required"]:
+                            if subfield not in data[field]:
+                                print(f"❌ Missing subfield: {field}.{subfield}")
+                                return False
+        
+        # Validate section enum in figures
+        valid_sections = ["motivation", "method", "results", "limitations"]
+        for fig in data.get("figures", []):
+            if fig["section"] not in valid_sections:
+                print(f"❌ Invalid section in figure: {fig['section']}")
+                return False
+        
+        print("✅ JSON validation passed!")
+        return True
+        
     except json.JSONDecodeError as e:
-        return [f"JSON 解析失败: {e}"]
-
-    if not isinstance(data, dict):
-        return ["顶层必须是 JSON 对象 (dict)"]
-
-    # 必填字段
-    required_top = ["arxiv_id", "title_zh", "one_line", "tags"]
-    for f in required_top:
-        if f not in data or not data[f]:
-            errors.append(f"缺少必填字段: {f}")
-
-    # tags 必须是非空数组
-    tags = data.get("tags")
-    if isinstance(tags, list) and len(tags) == 0:
-        errors.append("tags 不能为空数组")
-    if not isinstance(tags, list):
-        errors.append("tags 必须是数组")
-
-    # motivation 子字段
-    motivation = data.get("motivation", {})
-    if not isinstance(motivation, dict):
-        errors.append("motivation 必须是对象")
-    else:
-        for f in ["problem", "goal", "gap"]:
-            val = motivation.get(f, "")
-            if not isinstance(val, str) or len(val.strip()) < 50:
-                errors.append(f"motivation.{f} 必须是详细段落（≥50字），当前: {type(val).__name__} ({len(str(val))}字)")
-
-    # method 子字段
-    method = data.get("method", {})
-    if not isinstance(method, dict):
-        errors.append("method 必须是对象")
-    else:
-        for f in ["overview", "key_idea", "steps", "novelty"]:
-            val = method.get(f, "")
-            if not isinstance(val, str) or len(val.strip()) < 50:
-                errors.append(f"method.{f} 必须是详细段落（≥50字），当前: {type(val).__name__} ({len(str(val))}字)")
-
-    # results 子字段
-    results = data.get("results", {})
-    if not isinstance(results, dict):
-        errors.append("results 必须是对象")
-    else:
-        for f in ["main_findings", "limitations"]:
-            val = results.get(f, "")
-            if not isinstance(val, str) or len(val.strip()) < 50:
-                errors.append(f"results.{f} 必须是详细段落（≥50字），当前: {type(val).__name__} ({len(str(val))}字)")
-        # benchmarks 可以是数组
-        benchmarks = results.get("benchmarks")
-        if benchmarks is not None and not isinstance(benchmarks, list):
-            errors.append("results.benchmarks 必须是数组")
-
-    # improvements 子字段
-    improvements = data.get("improvements", {})
-    if not isinstance(improvements, dict):
-        errors.append("improvements 必须是对象")
-    else:
-        for f in ["weaknesses", "future_work", "reproducibility"]:
-            val = improvements.get(f, "")
-            if not isinstance(val, str) or len(val.strip()) < 50:
-                errors.append(f"improvements.{f} 必须是详细段落（≥50字），当前: {type(val).__name__} ({len(str(val))}字)")
-
-    # 检查是否有字段误用数组（应该用字符串的）
-    string_fields = [
-        ("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"),
-        ("method", "overview"), ("method", "key_idea"), ("method", "steps"), ("method", "novelty"),
-        ("results", "main_findings"), ("results", "limitations"),
-        ("improvements", "weaknesses"), ("improvements", "future_work"), ("improvements", "reproducibility"),
-    ]
-    for section, field in string_fields:
-        val = data.get(section, {}).get(field)
-        if isinstance(val, list):
-            errors.append(f"{section}.{field} 应该是字符串段落，不能是数组")
-
-    # figures 验证
-    figures = data.get("figures")
-    if figures is not None:
-        if not isinstance(figures, list):
-            errors.append("figures 必须是数组")
-        else:
-            for i, fig in enumerate(figures):
-                if isinstance(fig, dict) and not fig.get("id"):
-                    errors.append(f"figures[{i}] 缺少 id 字段")
-
-    return errors
-
+        print(f"❌ JSON decode error: {e}")
+        return False
+    except Exception as e:
+        print(f"❌ Validation error: {e}")
+        return False
 
 if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("用法: python scripts/validate_summary.py <json_file>")
-        sys.exit(1)
-
-    errs = validate(sys.argv[1])
-    if errs:
-        print("❌ 验证失败:")
-        for e in errs:
-            print(f"  - {e}")
-        sys.exit(1)
-    else:
-        print("✅ 验证通过")
-        sys.exit(0)
+    filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
+    sys.exit(0 if validate_file(filepath) else 1)  # exit 1 on failure so callers can detect it
diff --git a/uv.lock b/uv.lock
index b28e0d9..48a678d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -684,6 +684,19 @@ wheels = [
     { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
 ]
 
+[[package]]
+name = "h2"
+version = "4.3.0"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+dependencies = [
+    { name = "hpack" },
+    { name = "hyperframe" },
+]
+sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" }
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" },
+]
+
 [[package]]
 name = "hf-daily-papers"
 version = "0.1.0"
@@ -693,12 +706,13 @@ dependencies = [
     { name = "bleach" },
     { name = "chromadb" },
     { name = "fastapi" },
-    { name = "httpx" },
+    { name = "httpx", extra = ["http2"] },
     { name = "itsdangerous" },
     { name = "jinja2" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "pymupdf" },
+    { name = "pymupdf4llm" },
     { name = "python-dotenv" },
     { name = "python-multipart" },
     { name = "sqlalchemy" },
@@ -724,12 +738,13 @@ requires-dist = [
     { name = "bleach", specifier = ">=6.4.0" },
     { name = "chromadb", specifier = ">=1.0" },
     { name = "fastapi", specifier = ">=0.115" },
-    { name = "httpx", specifier = ">=0.28" },
+    { name = "httpx", extras = ["http2"], specifier = ">=0.28" },
     { name = "itsdangerous", specifier = ">=2.2.0" },
     { name = "jinja2", specifier = ">=3.1" },
     { name = "pydantic", specifier = ">=2.0" },
     { name = "pydantic-settings", specifier = ">=2.0" },
     { name = "pymupdf", specifier = ">=1.25" },
+    { name = "pymupdf4llm", specifier = ">=1.27.2.3" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
     { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" },
     { name = "python-dotenv", specifier = ">=1.0" },
@@ -778,6 +793,15 @@ wheels = [
     { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916, upload-time = "2026-05-06T06:18:21.7Z" },
 ]
 
+[[package]]
+name = "hpack"
+version = "4.1.0"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" }
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" },
+]
+
 [[package]]
 name = "httpcore"
 version = "1.0.9"
@@ -842,6 +866,11 @@ wheels = [
     { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
 ]
 
+[package.optional-dependencies]
+http2 = [
+    { name = "h2" },
+]
+
 [[package]]
 name = "huggingface-hub"
 version = "1.16.1"
@@ -862,6 +891,15 @@ wheels = [
     { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/49/79/621a7dbb80c70974f73a597275351ebe03ce5bc65cb5f8f4acb5859252bc/huggingface_hub-1.16.1-py3-none-any.whl", hash = "sha256:64340de934b9ce37857ef85a82de72f5629e8a270f9119eabb12bf495eb53c22", size = 668176, upload-time = "2026-05-21T18:39:58.596Z" },
 ]
 
+[[package]]
+name = "hyperframe"
+version = "6.1.0"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" }
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" },
+]
+
 [[package]]
 name = "idna"
 version = "3.18"
@@ -1223,6 +1261,15 @@ wheels = [
     { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" },
 ]
 
+[[package]]
+name = "networkx"
+version = "3.6.1"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" }
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
+]
+
 [[package]]
 name = "numpy"
 version = "2.4.6"
@@ -1842,6 +1889,39 @@ wheels = [
     { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/53/a4/b9e91aac82293f9c954654c85581ee8212b5b05efadc534b581141241e6f/pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2", size = 25000393, upload-time = "2026-04-24T14:11:01.669Z" },
 ]
 
+[[package]]
+name = "pymupdf-layout"
+version = "1.27.2.3"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+dependencies = [
+    { name = "networkx" },
+    { name = "numpy" },
+    { name = "onnxruntime" },
+    { name = "pymupdf" },
+    { name = "pyyaml" },
+]
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/bc/ee/067726c3ee5574ad5c605d00d7419e264ef509d626a726f99388111f8216/pymupdf_layout-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:75c2ab3c0e8830ac2bc50cfd32d375a30768a2610dac72a02f08265336e0834f", size = 15799844, upload-time = "2026-04-24T14:11:13.177Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/0a/ba/46a7a36474722f9280d885f6eec878561a257d9378e52590b43d32ffb96c/pymupdf_layout-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:5656b09669dcd7c51f539afb6fdaf853602bab4cbc20479ee5ee1a85a4e32b60", size = 15795220, upload-time = "2026-04-24T14:11:23.17Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/84/87/bfdcca67346052943a4549814f2009b38f4d15ec025798cdf7dfa5f57c84/pymupdf_layout-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:fcf03aa815cbceebdb3263dd6a190de4547c46b1d168928836ec38738afe127d", size = 15805240, upload-time = "2026-04-24T14:11:33.465Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/32/e9/7ce6eaf97cebd46c3808593282e9eb99a60cddd6183e25a636980d5c7986/pymupdf_layout-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:303b9414216dfaf711ec7d807b6f1e4c3e0a92bbb4569340fcedd9d5593d16ca", size = 15806269, upload-time = "2026-04-24T14:11:43.481Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/bf/61/3b2417d8f2cdfaa0f4749cd9dafa3379cb5cdaddf4233165f1ff81953c30/pymupdf_layout-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:503b64d9b6b31ea3af79ef85cf7d36950c5048af468cb297684d2953553c62ad", size = 15809163, upload-time = "2026-04-24T14:11:53.956Z" },
+]
+
+[[package]]
+name = "pymupdf4llm"
+version = "1.27.2.3"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+dependencies = [
+    { name = "pymupdf" },
+    { name = "pymupdf-layout" },
+    { name = "tabulate" },
+]
+sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/87/c0/e3830452d82032c3d82a9879616c05bf0c51e0dea03c1d80d57b3a6ec0d1/pymupdf4llm-1.27.2.3.tar.gz", hash = "sha256:42ec1a47ddc62be3f4f40c116d27618611c6f9fa366719016d9ddc3f3a3dc22b", size = 1406297, upload-time = "2026-04-24T14:13:18.843Z" }
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/e6/38/84bf29f4dd72e6c450546df6ca8f53021f764fd945ba67dcc235d39bc20e/pymupdf4llm-1.27.2.3-py3-none-any.whl", hash = "sha256:bd724b79fa3f06a5b28d7a65f7acfa8de56e04bdb603ac2d6dff315e0d151aaa", size = 77348, upload-time = "2026-04-24T14:11:04.305Z" },
+]
+
 [[package]]
 name = "pypika"
 version = "0.51.1"
@@ -2202,6 +2282,15 @@ wheels = [
     { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/1c/54/196d0c1db10af76baa4f64894448505d60d3cdf70ef92cbb35f46a4e4c71/starlette-1.2.1-py3-none-any.whl", hash = "sha256:4de0082d08c8f6764a85a54cf1120d6939507a19905c7768acad2a9f875d2b89", size = 73350, upload-time = "2026-05-31T01:07:50.09Z" },
 ]
 
+[[package]]
+name = "tabulate"
+version = "0.10.0"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/46/58/8c37dea7bbf769b20d58e7ace7e5edfe65b849442b00ffcdd56be88697c6/tabulate-0.10.0.tar.gz", hash = "sha256:e2cfde8f79420f6deeffdeda9aaec3b6bc5abce947655d17ac662b126e48a60d", size = 91754, upload-time = "2026-03-04T18:55:34.402Z" }
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/99/55/db07de81b5c630da5cbf5c7df646580ca26dfaefa593667fc6f2fe016d2e/tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3", size = 39814, upload-time = "2026-03-04T18:55:31.284Z" },
+]
+
 [[package]]
 name = "tenacity"
 version = "9.1.4"