feat: enhance UI, refactor services, improve templates and tests

- Replace image_extractor with pdf_image_extractor service
- Enhance pi_client with expanded API capabilities
- Improve summarizer service with additional features
- Update admin routes with more endpoints
- Add login page template
- Enhance detail page with comprehensive layout
- Improve search and trends pages
- Update base template with additional elements
- Refactor tests for better coverage
- Add validate_summary script
- Update project configuration and dependencies
2026-06-07 19:38:58 +08:00
parent 4a72c35452
commit 0d293422ac
32 changed files with 2003 additions and 586 deletions
@@ -1,12 +1,14 @@
 # ─── 应用 ────────────────────────────────
-APP_HOST=0.0.0.0
+APP_HOST=127.0.0.1
 APP_PORT=8000
 APP_DEBUG=false
 BASE_URL=http://127.0.0.1:8000
 APP_TIMEZONE=Asia/Shanghai

 # ─── 安全 ────────────────────────────────
-ADMIN_TOKEN=your_admin_token_here
+ADMIN_USERNAME=admin
+ADMIN_PASSWORD=your_secure_password
+SECRET_KEY=your_random_secret_key

 # ─── HuggingFace / arXiv ────────────────
 HF_API_BASE=https://huggingface.co/api
@@ -19,7 +21,7 @@ HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
 # ─── AI 总结 ──────────────────────────────
 PI_BIN=
 SUMMARY_SKILL=daily-paper-summary
-SUMMARY_CONCURRENCY=2
+SUMMARY_CONCURRENCY=3
 SUMMARY_TIMEOUT_SECONDS=300
 SUMMARY_MAX_RETRIES=1

@@ -49,6 +49,7 @@ paper/
 ├── pyproject.toml
 │
 ├── app/
+│   ├── __init__.py
 │   ├── main.py              # FastAPI 入口（lifespan 管理）
 │   ├── config.py            # pydantic-settings 配置加载
 │   ├── database.py          # SQLAlchemy 引擎、会话与 FTS5
@@ -57,6 +58,7 @@ paper/
 │   ├── cli.py               # Typer CLI（crawl / summarize / init-db）
 │   │
 │   ├── routes/              # 页面与 API 路由
+│   │   ├── __init__.py
 │   │   ├── pages.py         # 首页、日期页、论文详情
 │   │   ├── admin.py         # Token 鉴权管理接口
 │   │   ├── search.py        # 搜索、阅读列表、RSS
@@ -65,6 +67,7 @@ paper/
 │   │   └── compare.py       # 论文对比页
 │   │
 │   ├── services/            # 业务逻辑层
+│   │   ├── __init__.py
 │   │   ├── crawler.py       # HuggingFace API 爬虫
 │   │   ├── summarizer.py    # AI 总结编排
 │   │   ├── searcher.py      # FTS5 + 语义搜索
@@ -103,7 +106,7 @@ paper/
 │   ├── init_db.py           # 数据库初始化
 │   └── manual_crawl.py      # 手动抓取脚本
 │
-├── tests/                   # 10 个测试模块
+├── tests/                   # 9 个测试模块
 │   ├── conftest.py          # 测试夹具（内存 DB、样本数据）
 │   └── test_*.py            # 各模块测试
 │
@@ -117,7 +120,7 @@ paper/
 ### 1. 准备环境

 - Python **3.12+**
- 可选：[`pi`](https://github.com/) CLI（用于 AI 总结）
+- 可选：[`pi`](https://www.npmjs.com/package/@mariozechner/pi-coding-agent) CLI（用于 AI 总结）

 ### 2. 安装依赖

@@ -139,14 +142,30 @@ cp .env.example .env
 | 变量 | 默认值 | 说明 |
 |------|--------|------|
 | `APP_HOST` / `APP_PORT` | `127.0.0.1` / `8000` | 服务监听地址 |
+| `APP_DEBUG` | `false` | 调试模式（开启 uvicorn reload） |
+| `BASE_URL` | `http://127.0.0.1:8000` | 站点根 URL（用于 RSS 生成） |
+| `APP_TIMEZONE` | `Asia/Shanghai` | 时区 |
 | `ADMIN_TOKEN` | `change-me` | **必须修改** — 管理接口鉴权 |
+| `HF_API_BASE` | `https://huggingface.co/api` | HuggingFace API 地址 |
+| `HF_PROXY` | — | HTTP 代理 |
 | `TOP_N` | `20` | 每日抓取 Top N 论文 |
+| `HTTP_TIMEOUT_SECONDS` | `30` | HTTP 请求超时 |
+| `HTTP_MAX_RETRIES` | `3` | HTTP 最大重试次数 |
+| `PI_BIN` | — | `pi` CLI 路径 |
+| `SUMMARY_SKILL` | `daily-paper-summary` | pi 总结技能名 |
+| `SUMMARY_CONCURRENCY` | `3` | 最大并行总结数 |
+| `SUMMARY_TIMEOUT_SECONDS` | `300` | 单篇总结超时 |
+| `SUMMARY_MAX_RETRIES` | `1` | 总结最大重试次数 |
 | `SCHEDULER_ENABLED` | `false` | 启用每日自动抓取 |
-| `SCHEDULE_HOUR` / `SCHEDULE_MINUTE` | `8` / `0` | 定时任务时间（Asia/Shanghai） |
+| `SCHEDULE_HOUR` / `SCHEDULE_MINUTE` | `8` / `0` | 定时任务时间（APP_TIMEZONE） |
+| `APP_WORKERS` | `1` | Uvicorn worker 数（必须为 1） |
 | `DATABASE_URL` | `sqlite:///data/db/papers.db` | 数据库路径 |
 | `CHROMA_ENABLED` | `false` | 启用语义搜索 |
-| `PI_BIN` | — | `pi` CLI 路径 |
-| `SUMMARY_CONCURRENCY` | `3` | 最大并行总结数 |
+| `CHROMA_DIR` | `data/chroma` | ChromaDB 数据目录 |
+| `EMBED_API_BASE` | — | Embedding API 地址 |
+| `EMBED_API_KEY` | — | Embedding API Key |
+| `EMBED_MODEL` | — | Embedding 模型名 |
+| `EMBED_DIMENSIONS` | `0` | 向量维度 |

 ### 4. 初始化数据库

@@ -158,10 +177,10 @@ python scripts/init_db.py
 ### 5. 启动服务

 ```bash
-uvicorn app.main:app --host 127.0.0.1 --port 8000 --workers 1
+uvicorn app.main:app --host 127.0.0.1 --port 8000
 ```

-> 调度器依赖单 worker：`--workers` 必须为 `1`，否则每日任务会被重复触发。
+> 调度器依赖单 worker：`--workers` 不可大于 `1`，否则每日任务会被重复触发。

 打开浏览器访问 `http://127.0.0.1:8000` 即可。

@@ -172,9 +191,9 @@ uvicorn app.main:app --host 127.0.0.1 --port 8000 --workers 1
 ### 手动抓取指定日期

 ```bash
-python scripts/manual_crawl.py --date 2025-01-15
+python scripts/manual_crawl.py 2025-01-15
 # 或
-python -m app.cli crawl --date 2025-01-15 --top 20
+python -m app.cli crawl 2025-01-15 --top 20
 ```

 ### 手动触发总结
@@ -24,7 +24,7 @@ def crawl(
    """手动抓取指定日期的 HuggingFace Daily Papers。"""
    from app.config import settings
    from app.database import SessionLocal, engine
-    from app.models import init_db as _init
+    from app.database import init_db as _init
    from app.services.crawler import crawl_daily

    target = date_str or date.today().isoformat()
@@ -60,7 +60,7 @@ def summarize(
    """手动触发 AI 总结。"""
    from app.config import settings
    from app.database import SessionLocal, engine
-    from app.models import init_db as _init
+    from app.database import init_db as _init
    from app.services.summarizer import summarize_batch, summarize_single

    import os
@@ -96,7 +96,7 @@ def init_db():
    """初始化数据库表。"""
    from app.config import settings
    from app.database import engine
-    from app.models import init_db as _init
+    from app.database import init_db as _init

    import os

@@ -16,7 +16,9 @@ class Settings(BaseSettings):
    APP_TIMEZONE: str = "Asia/Shanghai"

    # 安全
-    ADMIN_TOKEN: str = "change-me"
+    ADMIN_USERNAME: str = "admin"
+    ADMIN_PASSWORD: str = ""
+    SECRET_KEY: str = "change-me"

    # HuggingFace / arXiv
    HF_API_BASE: str = "https://huggingface.co/api"
@@ -62,8 +62,39 @@ def get_db():
        db.close()


+def _migrate(engine) -> None:
+    """Add columns that are missing from already-existing tables.
+
+    For every table listed below, the current columns are read with
+    ``PRAGMA table_info`` and each absent column is created with
+    ``ALTER TABLE ... ADD COLUMN``. All changes are committed together
+    at the end.
+    """
+    import logging
+
+    log = logging.getLogger(__name__)
+
+    # Columns that must exist: {table name: [(column name, SQL type), ...]}
+    required_columns: dict[str, list[tuple[str, str]]] = {
+        "paper_summaries": [
+            ("figures_json", "TEXT"),
+        ],
+    }
+
+    with engine.connect() as connection:
+        for table_name, wanted in required_columns.items():
+            table_info = connection.execute(
+                text(f"PRAGMA table_info({table_name})")
+            )
+            # Index 1 of each row is taken as the column name.
+            present = {info_row[1] for info_row in table_info}
+            for column, sql_type in wanted:
+                if column in present:
+                    continue
+                connection.execute(
+                    text(
+                        f"ALTER TABLE {table_name} ADD COLUMN {column} {sql_type}"
+                    )
+                )
+                log.info("Migrated: %s.%s added", table_name, column)
+        connection.commit()
+
+
 def init_db(engine):
-    """创建所有 ORM 表 + FTS5 虚拟表。"""
+    """创建所有 ORM 表 + FTS5 虚拟表 + 自动迁移。"""
    from app.models import Base  # noqa: F811 — 避免循环导入，延迟导入

    Base.metadata.create_all(engine)
@@ -71,3 +102,4 @@ def init_db(engine):
        conn.execute(text(FTS5_CREATE_SQL))
        conn.execute(text(FTS5_TRIGGER_INDEX))
        conn.commit()
+    _migrate(engine)
@@ -6,6 +6,7 @@ from contextlib import asynccontextmanager

 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
+from starlette.middleware.sessions import SessionMiddleware

 from app.config import settings
 from app.database import engine, init_db
@@ -56,17 +57,17 @@ def create_app() -> FastAPI:
    init_db(engine)
    logger.info("Database initialized at %s", settings.db_path)

-    # 安全警告
-    if settings.ADMIN_TOKEN == "change-me":
-        logger.warning(
-            "⚠️  ADMIN_TOKEN is the default value 'change-me'. Please change it in .env!"
-        )
+    # Session 中间件
+    app.add_middleware(SessionMiddleware, secret_key=settings.SECRET_KEY)

-    if settings.APP_HOST not in ("127.0.0.1", "localhost", "::1"):
+    # 安全警告
+    if settings.SECRET_KEY == "change-me":
        logger.warning(
-            "⚠️  APP_HOST=%s is not localhost. "
-            "Ensure ADMIN_TOKEN is properly set and access is restricted.",
-            settings.APP_HOST,
+            "⚠️  SECRET_KEY is the default value 'change-me'. Please change it in .env!"
+        )
+    if not settings.ADMIN_PASSWORD:
+        logger.warning(
+            "⚠️  ADMIN_PASSWORD is empty. Please set it in .env!"
        )

    # 静态文件
@@ -131,6 +131,7 @@ class PaperSummary(Base):
    weaknesses_json = Column(Text)
    future_work_json = Column(Text)
    reproducibility = Column(String)
+    figures_json = Column(Text)
    full_json = Column(Text, nullable=False)
    updated_at = Column(DateTime, nullable=False)

@@ -1,11 +1,12 @@
-"""管理接口 — 抓取、总结、清理、删除、日志，需要 ADMIN_TOKEN 鉴权。"""
+"""管理接口 — 抓取、总结、清理、删除、日志，需要登录鉴权。"""

 from __future__ import annotations

+import hashlib
 from datetime import date, datetime, timezone

-from fastapi import APIRouter, Depends, HTTPException, Query, Request
-from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
+from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
+from fastapi.responses import RedirectResponse
 from pydantic import BaseModel, field_validator
 from sqlalchemy import select
 from sqlalchemy.orm import Session
@@ -19,16 +20,65 @@ from app.services.summarizer import summarize_batch, summarize_single
 from app.utils import release_lock, templates, today_str

 router = APIRouter(prefix="/admin", tags=["admin"])
-security = HTTPBearer()


-async def verify_admin(
-    credentials: HTTPAuthorizationCredentials = Depends(security),
-) -> str:
-    """验证 ADMIN_TOKEN。"""
-    if credentials.credentials != settings.ADMIN_TOKEN:
-        raise HTTPException(status_code=401, detail="Invalid admin token")
-    return credentials.credentials
+# ── 认证 ──────────────────────────────────────────────────────────────
+
+
+def _check_password(password: str) -> bool:
+    """Check a submitted password against ``settings.ADMIN_PASSWORD``.
+
+    The stored value may be either the plaintext password or its SHA-256
+    hex digest; a match against either form is accepted. An empty stored
+    password never matches.
+
+    Both comparisons go through ``hmac.compare_digest`` and are always
+    evaluated, so the check does not short-circuit on the first differing
+    character.
+    """
+    import hmac
+
+    stored = settings.ADMIN_PASSWORD
+    if not stored:
+        return False
+
+    # Compare as bytes: compare_digest rejects non-ASCII str arguments.
+    expected = stored.encode()
+    supplied = password.encode()
+    plain_ok = hmac.compare_digest(supplied, expected)
+    hashed_ok = hmac.compare_digest(
+        hashlib.sha256(supplied).hexdigest().encode(), expected
+    )
+    # NOTE(review): because a plaintext match is accepted, a stored SHA-256
+    # digest can itself be typed in as the password — confirm this is intended.
+    return plain_ok or hashed_ok
+
+
+async def verify_admin(request: Request) -> None:
+    """Require a logged-in admin session.
+
+    Raises an ``HTTPException`` with status 303 and a ``Location`` header of
+    ``/admin/login`` when the session has no truthy ``is_admin`` entry.
+    """
+    logged_in = bool(request.session.get("is_admin"))
+    if logged_in:
+        return
+    raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
+
+
+def verify_admin_page(request: Request) -> None:
+    """Synchronous variant of the admin check, intended for template routes.
+
+    Raises an ``HTTPException`` with status 303 and a ``Location`` header of
+    ``/admin/login`` unless the session carries a truthy ``is_admin`` entry.
+    """
+    admin_flag = request.session.get("is_admin")
+    if admin_flag:
+        return None
+    raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
+
+
+# ── 登录 / 登出 ──────────────────────────────────────────────────────
+
+
+@router.get("/login")
+async def admin_login_page(request: Request):
+    """Render the login form; an already signed-in admin goes to ``/admin/logs``."""
+    already_admin = request.session.get("is_admin")
+    if not already_admin:
+        return templates.TemplateResponse(request, "login.html", {"error": None})
+    return RedirectResponse("/admin/logs", status_code=303)
+
+
+@router.post("/login")
+async def admin_login_submit(
+    request: Request,
+    username: str = Form(""),
+    password: str = Form(""),
+):
+    """Handle the login form.
+
+    On matching credentials the session is marked ``is_admin`` and the client
+    is redirected to ``/admin/logs``; otherwise the login page is rendered
+    again with an error message.
+    """
+    # The password is only checked once the username has matched.
+    credentials_ok = username == settings.ADMIN_USERNAME and _check_password(
+        password
+    )
+    if not credentials_ok:
+        return templates.TemplateResponse(
+            request, "login.html", {"error": "用户名或密码错误"}
+        )
+    request.session["is_admin"] = True
+    return RedirectResponse("/admin/logs", status_code=303)
+
+
+@router.post("/logout")
+async def admin_logout(request: Request):
+    """Log out by clearing the session, then redirect to the login page."""
+    session = request.session
+    session.clear()
+    return RedirectResponse(url="/admin/login", status_code=303)


 # ── 请求模型 ──────────────────────────────────────────────────────────
@@ -53,7 +103,7 @@ class DeleteRequest(BaseModel):

@router.post("/crawl")
 async def admin_crawl(
-    _admin: str = Depends(verify_admin),
+    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    date: str | None = Query(None, description="YYYY-MM-DD，默认今天"),
 ):
@@ -92,7 +142,7 @@ async def admin_crawl(

@router.post("/summarize")
 async def admin_summarize_batch(
-    _admin: str = Depends(verify_admin),
+    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
    """批量总结所有 pending 论文。"""
@@ -107,7 +157,7 @@ async def admin_summarize_batch(
@router.post("/summarize/{arxiv_id}")
 async def admin_summarize_single(
    arxiv_id: str,
-    _admin: str = Depends(verify_admin),
+    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
    """总结或重跑单篇论文。"""
@@ -122,7 +172,7 @@ async def admin_summarize_single(

@router.post("/cleanup")
 async def admin_cleanup(
-    _admin: str = Depends(verify_admin),
+    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
    """清理 data/tmp/ 中超过 24 小时的临时文件。"""
@@ -159,7 +209,7 @@ async def admin_cleanup(
@router.post("/delete")
 async def admin_delete(
    body: DeleteRequest,
-    _admin: str = Depends(verify_admin),
+    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
    """删除指定日期范围内的论文（需要 confirm='DELETE' 二次确认）。"""
@@ -181,7 +231,7 @@ async def admin_delete(
@router.get("/logs")
 async def admin_logs(
    request: Request,
-    _admin: str = Depends(verify_admin),
+    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
@@ -107,6 +107,44 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
    # 图片画廊
    images = _get_paper_images(arxiv_id)

+    # 预处理 JSON 字段供模板直接使用
+    import json as _json
+
+    prereqs = {}
+    if paper.summary and paper.summary.prerequisites_json:
+        try:
+            prereqs = _json.loads(paper.summary.prerequisites_json)
+        except (ValueError, TypeError):
+            pass
+
+    benchmarks = []
+    if paper.summary and paper.summary.results_benchmarks_json:
+        try:
+            benchmarks = _json.loads(paper.summary.results_benchmarks_json)
+        except (ValueError, TypeError):
+            pass
+
+    figures_raw = []
+    if paper.summary and paper.summary.figures_json:
+        try:
+            figures_raw = _json.loads(paper.summary.figures_json)
+        except (ValueError, TypeError):
+            pass
+
+    linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
+
+    # 拆分：table_figures（有截图的 Table 类型）→ 实验结果区域展示截图
+    #       figures（其余）→ 论文图表画廊
+    table_figures = []
+    figures = []
+    for fig in linked_figures:
+        fig_id = fig.get("id", "")
+        is_table = fig_id.lower().startswith("table")
+        if is_table and fig.get("image_url"):
+            table_figures.append(fig)
+        else:
+            figures.append(fig)
+
    return templates.TemplateResponse(
        request,
        "detail.html",
@@ -115,6 +153,10 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
            "summary_state": summary_state,
            "similar_papers": similar_papers,
            "paper_images": images,
+            "prereqs": prereqs,
+            "benchmarks": benchmarks,
+            "figures": figures,
+            "table_figures": table_figures,
            "chroma_enabled": settings.CHROMA_ENABLED,
            "page_title": paper.title_zh or paper.title_en,
        },
@@ -232,3 +274,48 @@ def _get_paper_images(arxiv_id: str) -> list[dict]:
                }
            )
    return images
+
+
+def _link_figures_with_images(
+    figures: list[dict], images: list[dict], arxiv_id: str
+) -> list[dict]:
+    """Attach extracted image URLs to the figure entries of a summary.
+
+    Reads ``data/papers/{arxiv_id}/images/manifest.json``, which maps each
+    extracted file name to the figure/table labels recorded for it, and sets
+    ``image_url`` on every entry of ``figures`` whose normalized ``id``
+    ("Figure N" / "Table N") appears in that manifest.
+
+    Args:
+        figures: Figure metadata dicts from the summary; updated in place.
+        images: Extracted images of the paper; only used to skip the lookup
+            when nothing was extracted.
+        arxiv_id: Paper ID used to locate the manifest and build the URLs.
+
+    Returns:
+        The same ``figures`` object. It is returned unchanged when there is
+        nothing to link or the manifest is missing, unreadable or malformed.
+    """
+    if not figures or not images:
+        return figures
+
+    import json as _json
+    import re
+
+    manifest_path = Path("data/papers") / arxiv_id / "images" / "manifest.json"
+    if not manifest_path.exists():
+        return figures
+
+    try:
+        manifest = _json.loads(manifest_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError, TypeError):
+        # Unreadable or corrupt manifest: show the page without linked images.
+        return figures
+    if not isinstance(manifest, dict):
+        return figures
+
+    # Build the label -> image URL mapping, ignoring malformed entries.
+    id_to_url: dict[str, str] = {}
+    for filename, info in manifest.items():
+        if not isinstance(info, dict):
+            continue
+        url = f"/papers/{arxiv_id}/images/{filename}"
+        for key in ("figures", "tables"):
+            labels = info.get(key)
+            if not isinstance(labels, list):
+                continue
+            for label in labels:
+                if isinstance(label, str):
+                    id_to_url[label] = url
+
+    # Compiled once instead of once per figure.
+    figure_re = re.compile(r"(?:Fig\.?|Figure)\s*(\d+)", re.IGNORECASE)
+    table_re = re.compile(r"Table\s*(\d+)", re.IGNORECASE)
+
+    # Normalize each summary figure ID before looking it up.
+    for fig in figures:
+        if not isinstance(fig, dict):
+            continue
+        raw_id = str(fig.get("id") or "")
+        figure_match = figure_re.match(raw_id)
+        if figure_match:
+            normalized = f"Figure {figure_match.group(1)}"
+        else:
+            table_match = table_re.match(raw_id)
+            normalized = f"Table {table_match.group(1)}" if table_match else raw_id
+
+        if normalized in id_to_url:
+            fig["image_url"] = id_to_url[normalized]
+
+    return figures
@@ -1,83 +0,0 @@
-"""LaTeX 图片提取 — 从 arXiv 源码中扫描 \\includegraphics 并提取图片文件。"""
-
-from __future__ import annotations
-
-import logging
-import re
-import shutil
-from pathlib import Path
-
-from app.services.pdf_downloader import download_source_zip, paper_dir, tmp_dir
-
-logger = logging.getLogger(__name__)
-
-_INCLUDEGRAPHICS_RE = re.compile(
-    r"\\includegraphics\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}", re.MULTILINE
-)
-_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".eps"}
-
-
-async def extract_images_from_source(arxiv_id: str) -> int:
-    """从 LaTeX 源码中提取图片文件。
-
-    流程：
-    1. 下载源码 zip 到 data/tmp/{arxiv_id}/source/
-    2. 扫描 .tex 文件中的 \\includegraphics
-    3. 复制图片到 data/papers/{arxiv_id}/images/
-    4. 清理源码临时文件
-
-    Returns:
-        提取的图片数量
-    """
-    tmp_source = tmp_dir(arxiv_id) / "source"
-    images_dest = paper_dir(arxiv_id) / "images"
-
-    try:
-        # 下载源码 zip（如果还没下载）
-        if not tmp_source.exists():
-            source_url = f"https://arxiv.org/e-print/{arxiv_id}"
-            await download_source_zip(arxiv_id, source_url, tmp_source)
-
-        if not tmp_source.exists():
-            return 0
-
-        # 扫描 .tex 文件，收集图片路径
-        image_paths: set[str] = set()
-        for tex_file in tmp_source.rglob("*.tex"):
-            try:
-                content = tex_file.read_text(encoding="utf-8", errors="replace")
-                for match in _INCLUDEGRAPHICS_RE.finditer(content):
-                    img_path = match.group(1).strip()
-                    image_paths.add(img_path)
-            except Exception:
-                continue
-
-        if not image_paths:
-            return 0
-
-        # 查找并复制图片
-        images_dest.mkdir(parents=True, exist_ok=True)
-        copied = 0
-        for img_rel in image_paths:
-            # 尝试在源码目录中找到文件
-            for ext in ("", ".png", ".jpg", ".jpeg", ".gif", ".pdf", ".eps"):
-                candidate = tmp_source / (img_rel + ext)
-                if candidate.is_file():
-                    dest_name = candidate.name
-                    # 避免文件名冲突
-                    dest = images_dest / dest_name
-                    if dest.exists():
-                        stem = dest.stem
-                        suffix = dest.suffix
-                        dest = images_dest / f"{stem}_{copied}{suffix}"
-                    shutil.copy2(candidate, dest)
-                    copied += 1
-                    break
-
-        if copied > 0:
-            logger.info("Extracted %d images from source for %s", copied, arxiv_id)
-        return copied
-
-    except Exception:
-        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
-        return 0
@@ -0,0 +1,261 @@
+"""PDF 图片与表格提取 — 从 PDF 中提取嵌入图片和表格截图。
+
+策略：
+1. 提取 PDF 中嵌入的图片（图表、插图等）
+2. 检测表格区域，渲染为截图
+3. 同时搜索页面中的 Figure/Table 标注，记录到 manifest
+4. 过滤掉过小的图片
+5. 保存到 data/papers/{arxiv_id}/images/
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from pathlib import Path
+
+from app.services.pdf_downloader import paper_dir
+
+logger = logging.getLogger(__name__)
+
+# 最小面积阈值（像素），小于此值的图片视为图标/装饰
+_MIN_AREA = 10_000  # ~100x100
+_MIN_DIM = 80
+
+# Figure/Table 标注与图片/表格的最大垂直距离（点）
+_MAX_LABEL_DISTANCE = 120
+
+# Figure/Table 标注的正则
+_FIGURE_RE = re.compile(r'\b(?:Fig\.?|Figure)\s*(\d+)\b', re.IGNORECASE)
+_TABLE_RE = re.compile(r'\bTable\s*(\d+)\b', re.IGNORECASE)
+
+
+def _find_nearby_labels(
+    rects: list, labels: dict[str, list[tuple[int, float]]], page_num: int
+) -> list[str]:
+    """Return the Figure/Table labels located close to the given rectangles.
+
+    A label counts as close when one of its recorded positions is on
+    ``page_num`` and its vertical position lies within
+    ``_MAX_LABEL_DISTANCE`` points of the rectangle's top or bottom edge.
+    Each label is reported once, in the order it is first matched.
+    """
+    nearby: list[str] = []
+    for region in rects:
+        # Rectangles arrive either as (x0, y0, x1, y1) sequences or as
+        # objects exposing y0 / y1 attributes.
+        if isinstance(region, (list, tuple)):
+            top, bottom = region[1], region[3]
+        else:
+            top, bottom = region.y0, region.y1
+
+        for key, occurrences in labels.items():
+            if key in nearby:
+                continue
+            is_close = any(
+                label_page == page_num
+                and min(abs(label_y - top), abs(label_y - bottom))
+                <= _MAX_LABEL_DISTANCE
+                for label_page, label_y in occurrences
+            )
+            if is_close:
+                nearby.append(key)
+    return nearby
+
+
+def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
+    """Extract embedded images and table screenshots from a paper's PDF.
+
+    Every kept file is saved as PNG under ``paper_dir(arxiv_id) / "images"``
+    and recorded in ``manifest.json`` in the same directory, together with
+    its page number and the "Figure N" / "Table N" labels found near it.
+
+    Args:
+        arxiv_id: Paper ID.
+        pdf_path: PDF to read; defaults to ``data/tmp/{arxiv_id}/paper.pdf``.
+
+    Returns:
+        Number of image and table files written (0 when the PDF is missing).
+    """
+    import hashlib
+
+    import pymupdf
+
+    if pdf_path is None:
+        pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf"
+
+    if not pdf_path.exists():
+        logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path)
+        return 0
+
+    images_dest = paper_dir(arxiv_id) / "images"
+    images_dest.mkdir(parents=True, exist_ok=True)
+
+    extracted = 0
+    # Digests of pixel data already written, so identical images are saved once.
+    seen_digests: set[bytes] = set()
+    # {label: [(page index, vertical centre of the text block mentioning it)]}
+    figure_labels: dict[str, list[tuple[int, float]]] = {}
+    table_labels: dict[str, list[tuple[int, float]]] = {}
+    # Metadata for every extracted file, keyed by file name.
+    manifest: dict[str, dict] = {}
+
+    doc = pymupdf.open(str(pdf_path))
+    try:
+        # ── Pass 1: record where each Figure/Table label is mentioned ──
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            text_dict = page.get_text("dict")
+            for block in text_dict.get("blocks", []):
+                if block.get("type") != 0:  # text blocks only
+                    continue
+                block_text = "".join(
+                    span.get("text", "")
+                    for line in block.get("lines", [])
+                    for span in line.get("spans", [])
+                )
+                bbox = block.get("bbox", [0, 0, 0, 0])
+                y_center = (bbox[1] + bbox[3]) / 2
+                for m in _FIGURE_RE.finditer(block_text):
+                    key = f"Figure {m.group(1)}"
+                    figure_labels.setdefault(key, []).append((page_num, y_center))
+                for m in _TABLE_RE.finditer(block_text):
+                    key = f"Table {m.group(1)}"
+                    table_labels.setdefault(key, []).append((page_num, y_center))
+
+        # ── Pass 2: extract images and table screenshots page by page ──
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+
+            # ── 1. Embedded images ──
+            image_list = page.get_images(full=True)
+            for img_index, img_info in enumerate(image_list):
+                xref = img_info[0]
+                try:
+                    pix = pymupdf.Pixmap(doc, xref)
+                except Exception:
+                    continue
+
+                # Small images are treated as icons / decoration.
+                if pix.width < _MIN_DIM or pix.height < _MIN_DIM:
+                    continue
+                if pix.width * pix.height < _MIN_AREA:
+                    continue
+
+                # PNG output needs gray or RGB data, so CMYK pixmaps (with or
+                # without alpha) are converted before hashing and saving.
+                if pix.n - pix.alpha > 3:
+                    try:
+                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
+                    except Exception:
+                        continue
+
+                # Deduplicate on the full pixel data plus dimensions; a prefix
+                # of the encoded bytes can collide for different images.
+                hasher = hashlib.sha256(
+                    f"{pix.width}x{pix.height}x{pix.n}:".encode()
+                )
+                hasher.update(pix.samples)
+                digest = hasher.digest()
+                if digest in seen_digests:
+                    continue
+                seen_digests.add(digest)
+
+                filename = f"page{page_num + 1}_img{img_index + 1}.png"
+                pix.save(str(images_dest / filename))
+                extracted += 1
+                logger.debug("Image: %s (%dx%d)", filename, pix.width, pix.height)
+
+                # Figure labels located near where this image is drawn.
+                img_rects = page.get_image_rects(xref)
+                matched = _find_nearby_labels(img_rects, figure_labels, page_num)
+                manifest[filename] = {
+                    "page": page_num + 1,
+                    "type": "image",
+                    "figures": matched,
+                }
+
+            # ── 2. Table screenshots ──
+            try:
+                tables = page.find_tables()
+            except Exception:
+                tables = None
+
+            if tables and tables.tables:
+                for table_index, table in enumerate(tables.tables):
+                    bbox = table.bbox
+                    if not bbox:
+                        continue
+
+                    margin = 5
+                    if isinstance(bbox, (list, tuple)):
+                        x0, y0, x1, y1 = bbox
+                    else:
+                        x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
+                    clip_rect = pymupdf.Rect(
+                        x0 - margin, y0 - margin, x1 + margin, y1 + margin
+                    )
+
+                    # Render the clipped region at 2x zoom.
+                    zoom = 2
+                    mat = pymupdf.Matrix(zoom, zoom)
+                    try:
+                        pix = page.get_pixmap(matrix=mat, clip=clip_rect)
+                    except Exception:
+                        continue
+
+                    if pix.width < _MIN_DIM * 2 or pix.height < 30 * 2:
+                        continue
+
+                    filename = f"page{page_num + 1}_table{table_index + 1}.png"
+                    pix.save(str(images_dest / filename))
+                    extracted += 1
+                    logger.debug(
+                        "Table: %s (%dx%d)", filename, pix.width, pix.height
+                    )
+
+                    # Table labels located near this table region.
+                    table_rect = pymupdf.Rect(x0, y0, x1, y1)
+                    matched = _find_nearby_labels(
+                        [table_rect], table_labels, page_num
+                    )
+                    manifest[filename] = {
+                        "page": page_num + 1,
+                        "type": "table",
+                        "tables": matched,
+                    }
+    finally:
+        # Release the document even when extraction fails part-way.
+        doc.close()
+
+    # Save the manifest as UTF-8, the encoding its readers use.
+    manifest_path = images_dest / "manifest.json"
+    manifest_path.write_text(
+        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+
+    if extracted > 0:
+        logger.info("Extracted %d images+tables from PDF for %s", extracted, arxiv_id)
+    return extracted
+
+
+def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
+    """Delete extracted PNG files that the summary's ``figures`` do not reference.
+
+    Matching is done through ``manifest.json`` only; the PDF is not needed.
+    Every file is kept when the summary has no usable "Figure N" / "Table N"
+    IDs, when no manifest entry matches, or when the manifest cannot be read.
+
+    Args:
+        arxiv_id: Paper ID.
+        figures: Figure entries of the summary; each ``id`` is normalized to
+            "Figure N" or "Table N" before matching.
+
+    Returns:
+        Number of PNG files left in the images directory, or 0 when
+        ``figures`` is empty or there are no images / no manifest.
+    """
+    if not figures:
+        return 0
+
+    images_dir = paper_dir(arxiv_id) / "images"
+    manifest_path = images_dir / "manifest.json"
+
+    if not images_dir.exists() or not manifest_path.exists():
+        return 0
+
+    all_files = [f for f in images_dir.iterdir() if f.suffix == ".png"]
+    if not all_files:
+        return 0
+
+    try:
+        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        manifest = None
+    if not isinstance(manifest, dict):
+        # Without a usable manifest nothing can be matched, so delete nothing.
+        logger.warning("Unreadable manifest for %s, keeping all", arxiv_id)
+        return len(all_files)
+
+    figure_re = re.compile(r'(?:Fig\.?|Figure)\s*(\d+)', re.IGNORECASE)
+    table_re = re.compile(r'Table\s*(\d+)', re.IGNORECASE)
+
+    # Collect every Figure/Table ID the summary references (normalized).
+    referenced_ids: set[str] = set()
+    for fig in figures:
+        if not isinstance(fig, dict):
+            continue
+        fig_id = str(fig.get("id") or "")
+        m = figure_re.match(fig_id)
+        if m:
+            referenced_ids.add(f"Figure {m.group(1)}")
+        m2 = table_re.match(fig_id)
+        if m2:
+            referenced_ids.add(f"Table {m2.group(1)}")
+
+    if not referenced_ids:
+        logger.warning("No valid figure/table IDs in summary for %s", arxiv_id)
+        return len(all_files)
+
+    # A file is kept when any label recorded for it is referenced.
+    keep_filenames: set[str] = set()
+    for filename, info in manifest.items():
+        if not isinstance(info, dict):
+            continue
+        file_refs = list(info.get("figures") or []) + list(info.get("tables") or [])
+        if any(ref in referenced_ids for ref in file_refs):
+            keep_filenames.add(filename)
+
+    if not keep_filenames:
+        logger.warning(
+            "No manifest matches for %s (refs=%s), keeping all",
+            arxiv_id, referenced_ids,
+        )
+        return len(all_files)
+
+    removed = 0
+    for f in all_files:
+        if f.name not in keep_filenames:
+            f.unlink()
+            removed += 1
+
+    kept = len(all_files) - removed
+    logger.info("Filtered images for %s: kept %d, removed %d (refs=%s)", arxiv_id, kept, removed, referenced_ids)
+    return kept
@@ -59,23 +59,179 @@ def write_meta_json(paper) -> Path:
    return meta_path


+# ── PDF 文本提取 ────────────────────────────────────────────────────────
+
+
+def _trim_body(text: str, max_chars: int = 80_000) -> str:
+    """Strip the reference and acknowledgment sections, then cap the length.
+
+    Steps:
+    1. Find the first References/Bibliography heading. If an appendix heading
+       follows it, only the text between the two headings is removed;
+       otherwise everything from the heading to the end is dropped.
+    2. Remove the Acknowledgments section the same way.
+    3. If the result is still longer than ``max_chars``, keep the first
+       ``max_chars`` characters (appendices come last, so the main body is
+       kept preferentially) and strip trailing whitespace.
+
+    Args:
+        text: Raw text extracted from the PDF.
+        max_chars: Maximum length of the returned text.
+
+    Returns:
+        The trimmed text.
+    """
+    import re
+
+    # Papers are laid out either body -> Appendix -> References or
+    # body -> References -> Appendix, so only the References block is removed.
+    ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
+    if ref_match:
+        ref_start = ref_match.start()
+        # Look for an appendix heading after the References heading.
+        after_ref = text[ref_start:]
+        app_match = re.search(
+            r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
+        )
+        if app_match:
+            # Appendix follows: cut only the References section.
+            ref_end = ref_start + app_match.start()
+            text = text[:ref_start] + text[ref_end:]
+        else:
+            # Nothing useful follows: cut from References to the end.
+            text = text[:ref_start].rstrip()
+
+    # Acknowledgments are dropped as well; they do not help interpretation.
+    ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
+    if ack_match:
+        # Keep whatever section follows the acknowledgments.
+        next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
+        if next_section:
+            text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
+        else:
+            text = text[:ack_match.start()].rstrip()
+
+    # Still too long: truncate from the end.
+    if len(text) > max_chars:
+        text = text[:max_chars].rstrip()
+
+    return text
+
+
+def extract_pdf_text(pdf_path: Path) -> Path:
+    """Extract a PDF's text with pymupdf and save it next to the PDF as ``.txt``.
+
+    The page texts are joined with blank lines and passed through
+    ``_trim_body`` before being written. An existing ``.txt`` file is reused
+    as-is, without reading the PDF again.
+
+    Args:
+        pdf_path: Path of the PDF file.
+
+    Returns:
+        Path of the text file (``pdf_path`` with its suffix replaced by ``.txt``).
+    """
+    import pymupdf
+
+    txt_path = pdf_path.with_suffix(".txt")
+    if txt_path.exists():
+        return txt_path
+
+    doc = pymupdf.open(str(pdf_path))
+    try:
+        raw_text = "\n\n".join(page.get_text() for page in doc)
+    finally:
+        # Release the document even if text extraction fails.
+        doc.close()
+
+    body = _trim_body(raw_text)
+    txt_path.write_text(body, encoding="utf-8")
+    logger.info(
+        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
+        txt_path,
+        len(raw_text),
+        len(body),
+        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
+    )
+    return txt_path
+
+
 # ── pi CLI 调用 ────────────────────────────────────────────────────────


-async def call_pi(meta_path: Path, pdf_path: Path) -> str:
-    """调用 pi CLI 非交互模式，返回 stdout 文本。"""
+async def call_pi(
+    meta_path: Path,
+    pdf_path: Path,
+    fix_errors: list[str] | None = None,
+    session_id: str | None = None,
+) -> tuple[str, str]:
+    """调用 pi CLI 非交互模式，返回 (stdout 文本, session_id)。
+
+    fix_errors: 如果非空，表示上一次验证失败的错误列表，pi 需要修正这些问题。
+    session_id: 如果非空，用 --continue 延续该 session；否则创建新 session。
+    """
    arxiv_id = meta_path.parent.name
+
+    # 将 PDF 转为文本文件，以 @txt 方式传给 pi
+    txt_path = extract_pdf_text(pdf_path)
+
+    if fix_errors:
+        # 验证失败后的修正提示（同一 session 内，pi 能看到之前写的文件）
+        error_list = "\n".join(f"- {e}" for e in fix_errors)
+        prompt_text = (
+            "你之前生成的 JSON 存在以下问题，请修正后重新用 write_file 保存到 "
+            f"data/papers/{arxiv_id}/summary.json：\n\n"
+            f"{error_list}\n\n"
+            "注意：所有字符串字段必须是详细段落（≥50字），不能是数组或列表。"
+            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
+        )
+    else:
+        prompt_text = (
+            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。"
+            "只输出一个 JSON 对象，不要输出其他内容。\n\n"
+            "## 写作要求\n"
+            "- 每个字符串字段必须写成详细段落（200-500字），不要用列表或数组\n"
+            "- 必须包含论文中的具体数据、数字、实验指标\n"
+            "- 像资深同事给同事讲论文一样，专业但易懂\n"
+            "- 数学公式、符号、变量必须使用 LaTeX 格式：行内公式用 $...$，独立公式用 $$...$$\n"
+            "  例如：损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$，学习率 $\\eta$\n\n"
+            "## 必须包含以下字段（不要自创字段名）：\n"
+            '{"arxiv_id": "...", '
+            '"title_zh": "中文标题", '
+            '"one_line": "一句话概括(≤50字)", '
+            '"tags": ["标签1","标签2"], '
+            '"difficulty": "入门/进阶/前沿", '
+            '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的（50-150字）","why_matters":"为什么读懂本文需要它"}]}, '
+            '"motivation": {"problem": "详细段落：现有方法的具体问题（包含具体场景和数据）", '
+            '"goal": "详细段落：本文的具体目标", '
+            '"gap": "详细段落：本文的独特切入角度"}, '
+            '"method": {"overview": "详细段落：方法整体思路（先直觉再技术路线）", '
+            '"key_idea": "详细段落：核心创新点（和已有方法的本质区别）", '
+            '"steps": "详细段落：方法步骤的完整描述（每步的输入输出和具体操作）", '
+            '"novelty": "详细段落：技术新颖性分析"}, '
+            '"results": {"main_findings": "详细段落：核心发现（带具体数字和指标，逐一分析每个实验）", '
+            '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
+            '"limitations": "详细段落：局限性分析（作者承认的+你自己的观察）"}, '
+            '"improvements": {"weaknesses": "详细段落：独立分析的弱点（具体场景，每个弱点给改进方向）", '
+            '"future_work": "详细段落：未来研究方向（作者提出的+基于成果可延伸的）", '
+            '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度）"}, '
+            '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
+            '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
+            "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
+            "}\n\n"
+            "请深度解读以下论文："
+        )
+
+    # 构建 session ID（每篇论文一个独立 session）
+    if session_id is None:
+        import uuid
+
+        session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
+
    cmd = [
        settings.PI_BIN,
        "-p",
-        "--no-tools",
+        "--tools", "bash,write_file",
+    ]
+    if fix_errors:
+        cmd += ["--session", session_id, "--continue"]
+    else:
+        cmd += ["--session-id", session_id]
+    cmd += [
        "--skill",
        settings.SUMMARY_SKILL,
-        "请深度解读以下论文，并按指定 JSON schema 输出：",
-        f"@{meta_path}",
-        f"@{pdf_path}",
+        prompt_text,
    ]
-    logger.info("Calling pi for %s", arxiv_id)
+    if not fix_errors:
+        # 首次调用传文件，后续 --continue 不需要（session 内已有）
+        cmd += [f"@{meta_path}", f"@{txt_path}"]
+
+    logger.info("Calling pi for %s (fix=%s, session=%s)", arxiv_id, bool(fix_errors), session_id)

    proc = await asyncio.create_subprocess_exec(
        *cmd,
@@ -95,7 +251,7 @@ async def call_pi(meta_path: Path, pdf_path: Path) -> str:
    if proc.returncode != 0:
        raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))

-    return stdout.decode("utf-8", errors="replace")
+    return stdout.decode("utf-8", errors="replace"), session_id


 # ── JSON 提取 ──────────────────────────────────────────────────────────
@@ -12,8 +12,7 @@ from pydantic import BaseModel, Field, ValidationError, field_validator


 class PrerequisitesSchema(BaseModel):
-    concepts: list[str] = Field(default_factory=list)
-    level: str = ""
+    concepts: list[dict] = Field(default_factory=list)


 class MotivationSchema(BaseModel):
@@ -32,7 +31,7 @@ class MotivationSchema(BaseModel):
 class MethodSchema(BaseModel):
    overview: str = ""
    key_idea: str
-    steps: list[str] = Field(default_factory=list)
+    steps: str = ""
    novelty: str = ""

    @field_validator("key_idea")
@@ -44,14 +43,14 @@ class MethodSchema(BaseModel):


 class ResultsSchema(BaseModel):
-    main_findings: list[str] = Field(default_factory=list)
-    benchmarks: list[dict] = Field(default_factory=list)
-    limitations: list[str] = Field(default_factory=list)
+    main_findings: str = ""
+    benchmarks: list[str | dict] = Field(default_factory=list)
+    limitations: str = ""


 class ImprovementsSchema(BaseModel):
-    weaknesses: list[str] = Field(default_factory=list)
-    future_work: list[str] = Field(default_factory=list)
+    weaknesses: str = ""
+    future_work: str = ""
    reproducibility: str = ""


@@ -71,6 +70,7 @@ class SummarySchema(BaseModel):
    method: MethodSchema
    results: ResultsSchema = Field(default_factory=ResultsSchema)
    improvements: ImprovementsSchema = Field(default_factory=ImprovementsSchema)
+    figures: list[dict] = Field(default_factory=list)

    @field_validator("title_zh", "one_line")
    @classmethod
@@ -116,7 +116,7 @@ def assess_quality(schema: SummarySchema) -> str:
        missing_important += 1
    if not schema.method.overview.strip():
        missing_important += 1
-    if not schema.results.main_findings:
+    if not schema.results.main_findings.strip():
        missing_important += 1

    if missing_important == 0:
@@ -140,22 +140,17 @@ def flatten_for_db(schema: SummarySchema) -> dict:
        "motivation_gap": schema.motivation.gap,
        "method_overview": schema.method.overview,
        "method_key_idea": schema.method.key_idea,
-        "method_steps_json": json.dumps(schema.method.steps, ensure_ascii=False),
+        "method_steps_json": schema.method.steps,
        "method_novelty": schema.method.novelty,
-        "results_main_json": json.dumps(
-            schema.results.main_findings, ensure_ascii=False
-        ),
+        "results_main_json": schema.results.main_findings,
        "results_benchmarks_json": json.dumps(
            schema.results.benchmarks, ensure_ascii=False
        ),
-        "limitations_json": json.dumps(schema.results.limitations, ensure_ascii=False),
-        "weaknesses_json": json.dumps(
-            schema.improvements.weaknesses, ensure_ascii=False
-        ),
-        "future_work_json": json.dumps(
-            schema.improvements.future_work, ensure_ascii=False
-        ),
+        "limitations_json": schema.results.limitations,
+        "weaknesses_json": schema.improvements.weaknesses,
+        "future_work_json": schema.improvements.future_work,
        "reproducibility": schema.improvements.reproducibility,
+        "figures_json": json.dumps(schema.figures, ensure_ascii=False),
        "full_json": schema.model_dump_json(ensure_ascii=False),
        "updated_at": datetime.now(timezone.utc),
    }
@@ -22,7 +22,6 @@ from app.models import (
    SummaryStatus,
    TaskLock,
 )
-from app.services.image_extractor import extract_images_from_source
 from app.services.pdf_downloader import (
    PdfDownloadError,
    cleanup_tmp,
@@ -77,10 +76,9 @@ def _build_fts_summary_text(schema: SummarySchema) -> str:
        schema.one_line or "",
        schema.motivation.problem or "",
        schema.motivation.goal or "",
-        schema.method_overview if hasattr(schema, "method_overview") else "",
        schema.method.overview or "",
        schema.method.key_idea or "",
-        " ".join(schema.results.main_findings or []),
+        schema.results.main_findings or "",
    ]
    return " ".join(p for p in parts if p)

@@ -141,6 +139,77 @@ def _update_summary_in_db(
    logger.info("DB updated: paper=%s quality=%s", paper.arxiv_id, quality)


+# ── JSON 验证 ──────────────────────────────────────────────────────────
+
+
+def _validate_summary(json_data: dict, arxiv_id: str) -> list[str]:
+    """Validate a decoded summary payload and return a list of error messages.
+
+    An empty list means every check passed. The messages are kept in Chinese
+    because the caller hands them back to ``call_pi`` as ``fix_errors``.
+
+    Unlike a plain chained ``.get()``, this never raises on malformed input:
+    a section that is present but is not a JSON object (``null``, a string,
+    a list, ...) is reported as an error and then treated as empty.
+
+    Args:
+        json_data: The decoded JSON value; anything but a dict is rejected.
+        arxiv_id: Not used by the checks; kept so existing callers still work.
+
+    Returns:
+        Error strings in check order: required fields, tags, section types,
+        paragraph fields, benchmarks, prerequisites, figures.
+    """
+    if not isinstance(json_data, dict):
+        return ["顶层必须是 JSON 对象"]
+
+    errors: list[str] = []
+
+    # Required top-level fields must be present and truthy.
+    for f in ["arxiv_id", "title_zh", "one_line", "tags"]:
+        if not json_data.get(f):
+            errors.append(f"缺少必填字段: {f}")
+
+    # tags must be a non-empty array.
+    tags = json_data.get("tags")
+    if not isinstance(tags, list) or len(tags) == 0:
+        errors.append("tags 必须是非空数组")
+
+    # Resolve every nested section once. A missing section counts as empty;
+    # a present-but-non-object section is reported instead of crashing later
+    # on ``.get``.
+    sections: dict[str, dict] = {}
+    for name in ("prerequisites", "motivation", "method", "results", "improvements"):
+        value = json_data.get(name, {})
+        if not isinstance(value, dict):
+            errors.append(f"{name} 必须是 JSON 对象")
+            value = {}
+        sections[name] = value
+
+    # Paragraph fields: must be a str with at least 50 characters after strip.
+    string_fields = [
+        ("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"),
+        ("method", "overview"), ("method", "key_idea"), ("method", "steps"),
+        ("method", "novelty"),
+        ("results", "main_findings"), ("results", "limitations"),
+        ("improvements", "weaknesses"), ("improvements", "future_work"),
+        ("improvements", "reproducibility"),
+    ]
+    for section, field in string_fields:
+        val = sections[section].get(field)
+        if isinstance(val, list):
+            errors.append(f"{section}.{field} 应该是字符串段落，不能是数组")
+        elif not isinstance(val, str) or len(val.strip()) < 50:
+            errors.append(
+                f"{section}.{field} 必须是详细段落（≥50字），"
+                f"当前: {type(val).__name__} ({len(str(val))}字)"
+            )
+
+    # benchmarks is optional, but must be an array when present.
+    benchmarks = sections["results"].get("benchmarks")
+    if benchmarks is not None and not isinstance(benchmarks, list):
+        errors.append("results.benchmarks 必须是数组")
+
+    # prerequisites.concepts is optional; when present it must be a non-empty
+    # array of objects that each carry a ``term``.
+    concepts = sections["prerequisites"].get("concepts")
+    if concepts is not None:
+        if not isinstance(concepts, list):
+            errors.append("prerequisites.concepts 必须是数组")
+        elif len(concepts) == 0:
+            errors.append("prerequisites.concepts 不能为空")
+        else:
+            for i, c in enumerate(concepts):
+                if isinstance(c, str):
+                    errors.append(f"prerequisites.concepts[{i}] 应该是对象 {{term,explanation,why_matters}}，不能是字符串")
+                elif not isinstance(c, dict):
+                    # Numbers, lists, null, ... would otherwise slip through.
+                    errors.append(f"prerequisites.concepts[{i}] 应该是对象 {{term,explanation,why_matters}}")
+                elif not c.get("term"):
+                    errors.append(f"prerequisites.concepts[{i}] 缺少 term 字段")
+
+    # figures is optional; when present it must be an array of objects with ``id``.
+    figures = json_data.get("figures")
+    if figures is not None:
+        if not isinstance(figures, list):
+            errors.append("figures 必须是数组")
+        else:
+            for i, fig in enumerate(figures):
+                if not isinstance(fig, dict):
+                    errors.append(f"figures[{i}] 应该是对象")
+                elif not fig.get("id"):
+                    errors.append(f"figures[{i}] 缺少 id 字段")
+
+    return errors
+
+
 # ── 文件操作 ────────────────────────────────────────────────────────────


@@ -227,11 +296,64 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
        # 下载 PDF
        await download_pdf(arxiv_id, paper.pdf_url)

-        # 调用 pi
-        raw_output = await call_pi(meta_path, Path("data/tmp") / arxiv_id / "paper.pdf")
+        # 带验证的生成循环：最多 4 轮，同一 session 内 pi 可看到之前写的文件
+        json_data = None
+        validation_errors = []
+        session_id = None
+        for attempt in range(1, 5):
+            # 清理上一轮 pi 通过 write_file 写的不完整文件
+            stale = paper_dir(arxiv_id) / "summary.json"
+            if stale.exists():
+                stale.unlink()

-        # 提取 JSON
+            if attempt == 1:
+                raw_output, session_id = await call_pi(
+                    meta_path, Path("data/tmp") / arxiv_id / "paper.pdf"
+                )
+            else:
+                # 验证失败，同一 session 内带着错误信息让 pi 修正
+                raw_output, session_id = await call_pi(
+                    meta_path,
+                    Path("data/tmp") / arxiv_id / "paper.pdf",
+                    fix_errors=validation_errors,
+                    session_id=session_id,
+                )
+
+            # 优先从 pi write_file 写入的 summary.json 读取，否则从 stdout 提取
+            # 如果都失败，当作验证错误，继续下一次尝试
+            json_data = None
+            summary_file = paper_dir(arxiv_id) / "summary.json"
+            try:
+                if summary_file.exists():
+                    json_data = json.loads(summary_file.read_text(encoding="utf-8"))
+                    logger.info("Read summary.json written by pi for %s", arxiv_id)
+                else:
                    json_data = extract_json(raw_output)
+            except (json.JSONDecodeError, JsonNotFoundError) as exc:
+                logger.warning(
+                    "JSON extraction failed for %s (attempt %d): %s",
+                    arxiv_id,
+                    attempt,
+                    str(exc)[:200],
+                )
+                validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
+                continue
+
+            # 运行验证脚本
+            validation_errors = _validate_summary(json_data, arxiv_id)
+            if not validation_errors:
+                break
+            logger.warning(
+                "Validation failed for %s (attempt %d): %s",
+                arxiv_id,
+                attempt,
+                "; ".join(validation_errors),
+            )
+
+        if validation_errors:
+            raise ValueError(
+                f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}"
+            )

        # Pydantic 校验
        schema = SummarySchema.model_validate(json_data)
@@ -252,9 +374,17 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
        status.raw_output_saved = True
        db.commit()

-        # LaTeX 图片提取（可选增强，失败不影响总结）
+        # PDF 图片提取（可选增强，失败不影响总结）
        try:
-            await extract_images_from_source(arxiv_id)
+            from app.services.pdf_image_extractor import (
+                extract_images_from_pdf,
+                filter_images_by_summary,
+            )
+            pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf"
+            extract_images_from_pdf(arxiv_id, pdf_path)
+            # 根据 summary 中 figures 字段过滤，只保留被引用的图表
+            if schema.figures:
+                filter_images_by_summary(arxiv_id, schema.figures)
        except Exception:
            logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)

@@ -268,8 +398,8 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
                "title_en": paper.title_en or "",
                "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
                "one_line": schema.one_line or "",
-                "motivation_problem": schema.motivation_problem or "",
-                "method_key_idea": schema.method_key_idea or "",
+                "motivation_problem": schema.motivation.problem or "",
+                "method_key_idea": schema.method.key_idea or "",
                "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
            }
            index_paper(arxiv_id, texts_dict)
@@ -1,17 +1,27 @@
 /* ── kami 风格参考：纸张质感、留白、墨蓝强调色 ─────────────────── */
 :root {
-  --bg: #faf8f5;
-  --surface: #ffffff;
-  --ink: #1a1a2e;
-  --ink-light: #4a4a6a;
-  --accent: #2d5f8a;
-  --accent-hover: #1d4a6f;
-  --border: #e8e4df;
-  --shadow: rgba(0, 0, 0, 0.06);
+  /* 色 — Kami warm palette */
+  --bg: #f5f4ed;                                    /* parchment */
+  --surface: #faf9f5;                               /* ivory */
+  --ink: #141413;                                    /* near black */
+  --ink-light: #3d3d3a;                              /* dark warm */
+  --ink-sub: #504e49;                                /* olive subtext */
+  --ink-muted: #6b6a64;                              /* stone tertiary */
+  --accent: #1B365D;                                 /* ink blue */
+  --accent-hover: #142d4a;                           /* ink blue deep */
+  --accent-bg: rgba(27, 54, 93, 0.06);              /* brand whisper */
+  --border: #e8e6dc;                                 /* warm border */
+  --border-soft: #e5e3d8;                            /* soft row separator */
+  --shadow: rgba(0, 0, 0, 0.05);                     /* whisper shadow */
  --radius: 8px;
-  --font-body: "Noto Serif SC", "Georgia", serif;
-  --font-sans: "Inter", "Noto Sans SC", system-ui, sans-serif;
-  --max-width: 960px;
+
+  /* 字体 — Kami serif-first */
+  --font-body: "TsangerJinKai02", "Source Han Serif SC", "Noto Serif CJK SC", "Songti SC", "STSong", Georgia, serif;
+  --font-sans: var(--font-body);                     /* Kami: sans = serif */
+  --mono: "JetBrains Mono", "SF Mono", "Fira Code", Consolas, Monaco, monospace;
+
+  /* 布局 */
+  --max-width: 1080px;
 }

 *,
@@ -60,7 +70,7 @@ a:hover {
 .nav-brand {
  font-family: var(--font-body);
  font-size: 1.2rem;
-  font-weight: 700;
+  font-weight: 500;
  color: var(--ink);
 }

@@ -96,7 +106,7 @@ a:hover {
 .date-title {
  font-family: var(--font-body);
  font-size: 1.5rem;
-  font-weight: 700;
+  font-weight: 500;
 }

 .date-nav-btn {
@@ -156,7 +166,7 @@ a:hover {

 .paper-card {
  background: var(--surface);
-  border: 1px solid var(--border);
+  border: 0.5px solid var(--border);
  border-radius: var(--radius);
  padding: 20px 24px;
  transition: box-shadow 0.2s;
@@ -175,7 +185,7 @@ a:hover {
 .paper-title {
  font-family: var(--font-body);
  font-size: 1.1rem;
-  font-weight: 600;
+  font-weight: 500;
  line-height: 1.5;
  flex: 1;
 }
@@ -190,6 +200,7 @@ a:hover {
  font-size: 0.85rem;
  color: var(--ink-light);
  white-space: nowrap;
+  font-variant-numeric: tabular-nums;
 }

 .paper-one-line,
@@ -215,12 +226,14 @@ a:hover {

 .tag {
  display: inline-block;
-  padding: 2px 8px;
-  background: #eef3f8;
+  padding: 1px 5px;
+  background: #EEF2F7;
  color: var(--accent);
-  border-radius: 3px;
+  border-radius: 2px;
  font-size: 0.75rem;
-  font-weight: 500;
+  font-weight: 600;
+  letter-spacing: 0.4px;
+  text-transform: uppercase;
 }

 .paper-footer {
@@ -233,28 +246,28 @@ a:hover {
 .summary-badge {
  font-size: 0.8rem;
  padding: 2px 8px;
-  border-radius: 3px;
+  border-radius: 2px;
 }
 .summary-none {
-  background: #f0f0f0;
-  color: #888;
+  background: var(--border);
+  color: var(--ink-muted);
 }
 .summary-pending {
-  background: #fff3e0;
-  color: #e67e22;
+  background: rgba(27, 54, 93, 0.06);
+  color: var(--ink-sub);
 }
 .summary-processing {
-  background: #e3f2fd;
-  color: #1976d2;
+  background: rgba(27, 54, 93, 0.10);
+  color: var(--accent);
 }
 .summary-done {
-  background: #e8f5e9;
-  color: #388e3c;
+  background: rgba(27, 54, 93, 0.08);
+  color: #3d6e3d;
 }
 .summary-failed,
 .summary-permanent_failure {
-  background: #fce4ec;
-  color: #c62828;
+  background: rgba(140, 40, 40, 0.08);
+  color: #8c2828;
 }

 .btn-detail {
@@ -293,7 +306,7 @@ a:hover {
 .detail-title {
  font-family: var(--font-body);
  font-size: 1.6rem;
-  font-weight: 700;
+  font-weight: 500;
  line-height: 1.4;
  margin-bottom: 12px;
 }
@@ -352,7 +365,7 @@ a:hover {
 .summary-section h2 {
  font-family: var(--font-body);
  font-size: 1.05rem;
-  font-weight: 600;
+  font-weight: 500;
  margin-bottom: 8px;
  color: var(--accent);
 }
@@ -385,27 +398,27 @@ a:hover {
  margin-bottom: 24px;
 }
 .summary-placeholder.processing {
-  background: #e3f2fd;
+  background: rgba(27, 54, 93, 0.06);
 }
 .summary-placeholder.failed {
-  background: #fce4ec;
+  background: rgba(140, 40, 40, 0.06);
 }
 .summary-placeholder.none {
-  background: #f5f5f5;
+  background: var(--border);
 }
 .error-detail {
  font-size: 0.85rem;
-  color: #c62828;
+  color: #8c2828;
  margin-top: 8px;
 }

 .quality-warning {
  padding: 10px 16px;
-  background: #fff8e1;
-  border: 1px solid #ffe082;
+  background: rgba(27, 54, 93, 0.06);
+  border: 1px solid var(--border-soft);
  border-radius: var(--radius);
  font-size: 0.85rem;
-  color: #f57f17;
+  color: var(--ink-sub);
  margin-bottom: 16px;
 }

@@ -528,7 +541,7 @@ a:hover {
 }
 .sort-toggle a.active {
  color: var(--accent);
-  font-weight: 600;
+  font-weight: 500;
 }
 .sort-toggle a:hover {
  color: var(--accent);
@@ -541,7 +554,7 @@ a:hover {

 /* ── Search Highlight ───────────────────────────────────────────── */
 mark {
-  background: #fff3cd;
+  background: rgba(27, 54, 93, 0.10);
  color: var(--ink);
  padding: 1px 2px;
  border-radius: 2px;
@@ -590,7 +603,7 @@ mark {
 .page-heading {
  font-family: var(--font-body);
  font-size: 1.5rem;
-  font-weight: 700;
+  font-weight: 500;
  margin-bottom: 20px;
 }

@@ -656,44 +669,60 @@ mark {
  color: var(--accent);
 }
 .btn-bookmark.active {
-  color: #f0a500;
+  color: var(--accent);
 }

 /* ── Reading Badge ──────────────────────────────────────────────── */
 .reading-badge {
  font-size: 0.75rem;
  padding: 2px 6px;
-  border-radius: 3px;
+  border-radius: 2px;
 }
 .reading-unread {
-  background: #f0f0f0;
-  color: #888;
+  background: var(--border);
+  color: var(--ink-muted);
 }
 .reading-skimmed {
-  background: #e3f2fd;
-  color: #1976d2;
+  background: rgba(27, 54, 93, 0.08);
+  color: var(--accent);
 }
 .reading-read_summary {
-  background: #e8f5e9;
-  color: #388e3c;
+  background: rgba(27, 54, 93, 0.06);
+  color: #3d6e3d;
 }
 .reading-read_full {
-  background: #e8f5e9;
-  color: #2e7d32;
+  background: rgba(27, 54, 93, 0.10);
+  color: #3d6e3d;
  font-weight: 500;
 }

 /* ── Responsive ─────────────────────────────────────────────────── */
-@media (max-width: 640px) {
+@media (max-width: 880px) {
+  .container {
+    padding: 20px 32px;
+  }
+  .charts-grid {
+    grid-template-columns: 1fr;
+  }
+}
+
+@media (max-width: 480px) {
  .container {
    padding: 16px;
  }
  .nav-bar {
    padding: 10px 16px;
+    flex-wrap: wrap;
  }
  .nav-search-input {
    width: 120px;
  }
+  .nav-links {
+    gap: 12px;
+    margin-left: 0;
+    width: 100%;
+    justify-content: center;
+  }
  .date-nav {
    gap: 8px;
  }
@@ -757,8 +786,9 @@ mark {
  color: var(--accent);
  white-space: nowrap;
  padding: 2px 8px;
-  background: #eef3f8;
+  background: #EEF2F7;
  border-radius: 4px;
+  font-variant-numeric: tabular-nums;
 }

 /* ── Similar Papers ────────────────────────────────────────────── */
@@ -770,7 +800,7 @@ mark {
 .similar-papers h2 {
  font-family: var(--font-body);
  font-size: 1.1rem;
-  font-weight: 600;
+  font-weight: 500;
  margin-bottom: 12px;
  color: var(--accent);
 }
@@ -800,7 +830,7 @@ mark {
 .trends-page h1 {
  font-family: var(--font-body);
  font-size: 1.5rem;
-  font-weight: 700;
+  font-weight: 500;
  margin-bottom: 24px;
 }
 .charts-grid {
@@ -818,7 +848,7 @@ mark {
 .chart-card h2 {
  font-family: var(--font-body);
  font-size: 1rem;
-  font-weight: 600;
+  font-weight: 500;
  margin-bottom: 12px;
  color: var(--accent);
 }
@@ -826,17 +856,12 @@ mark {
  width: 100% !important;
  max-height: 300px;
 }
-@media (max-width: 768px) {
-  .charts-grid {
-    grid-template-columns: 1fr;
-  }
-}

 /* ── Compare Page ──────────────────────────────────────────────── */
 .compare-page h1 {
  font-family: var(--font-body);
  font-size: 1.5rem;
-  font-weight: 700;
+  font-weight: 500;
  margin-bottom: 24px;
 }
 .compare-table-wrapper {
@@ -860,7 +885,7 @@ mark {
 }
 .compare-table th {
  background: var(--bg);
-  font-weight: 600;
+  font-weight: 500;
  color: var(--ink-light);
  white-space: nowrap;
  min-width: 100px;
@@ -887,7 +912,7 @@ mark {
 .image-gallery h2 {
  font-family: var(--font-body);
  font-size: 1.05rem;
-  font-weight: 600;
+  font-weight: 500;
  margin-bottom: 12px;
  color: var(--accent);
 }
@@ -913,3 +938,138 @@ mark {
  color: var(--ink-light);
  text-align: center;
 }
+
+/* ── 前置知识卡片 ── */
+.prerequisites-list {
+  display: grid;
+  gap: 1rem;
+}
+.concept-card {
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: 8px;
+  padding: 1rem 1.2rem;
+}
+.concept-card h3 {
+  margin: 0 0 0.4rem 0;
+  font-size: 1rem;
+  color: var(--accent);
+}
+.concept-card p {
+  margin: 0.3rem 0 0 0;
+  font-size: 0.92rem;
+  line-height: 1.6;
+  color: var(--ink);
+}
+.concept-why {
+  font-style: italic;
+  color: var(--ink-light) !important;
+  border-left: 3px solid var(--accent);
+  padding-left: 0.8rem;
+  margin-top: 0.5rem !important;
+}
+
+/* ── 核心创新点 ── */
+.key-idea {
+  background: linear-gradient(135deg, var(--accent-bg), var(--surface));
+  border-left: 4px solid var(--accent);
+  padding: 1rem 1.2rem;
+  border-radius: 0 8px 8px 0;
+  margin: 1rem 0;
+}
+
+/* ── 可折叠详情 ── */
+.summary-section details {
+  margin: 0.8rem 0;
+}
+.summary-section details summary {
+  cursor: pointer;
+  font-weight: 500;
+  color: var(--accent);
+  padding: 0.4rem 0;
+  user-select: none;
+}
+.summary-section details summary:hover {
+  text-decoration: underline;
+}
+.summary-section details[open] summary {
+  margin-bottom: 0.5rem;
+}
+
+/* ── 内联图片 ── */
+.inline-figure {
+  margin: 1.2rem 0;
+  text-align: center;
+}
+.inline-figure img {
+  max-width: 100%;
+  border-radius: 6px;
+  box-shadow: 0 2px 8px rgba(0,0,0,0.08);
+  cursor: zoom-in;
+  transition: box-shadow 0.2s;
+}
+.inline-figure img:hover {
+  box-shadow: 0 4px 16px rgba(0,0,0,0.14);
+}
+.inline-figure figcaption {
+  margin-top: 0.4rem;
+  font-size: 0.85rem;
+  color: var(--ink-light);
+}
+
+/* ── 图片灯箱 ── */
+.lightbox-overlay {
+  position: fixed;
+  top: 0;
+  left: 0;
+  right: 0;
+  bottom: 0;
+  z-index: 9999;
+  background: rgba(0, 0, 0, 0.85);
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  cursor: zoom-out;
+  opacity: 0;
+  visibility: hidden;
+  transition: opacity 0.2s, visibility 0.2s;
+}
+.lightbox-overlay.active {
+  opacity: 1;
+  visibility: visible;
+}
+.lightbox-overlay img {
+  max-width: 95vw;
+  max-height: 95vh;
+  object-fit: contain;
+  border-radius: 4px;
+  box-shadow: 0 0 40px rgba(0, 0, 0, 0.4);
+}
+
+/* ── Benchmark 表格 ── */
+.benchmarks-table {
+  width: 100%;
+  border-collapse: collapse;
+  margin: 1rem 0;
+  font-size: 0.9rem;
+}
+.benchmarks-table th {
+  background: var(--bg);
+  font-weight: 500;
+  padding: 0.5rem 0.8rem;
+  text-align: left;
+  border-bottom: 2px solid var(--border);
+}
+.benchmarks-table td {
+  padding: 0.5rem 0.8rem;
+  border-bottom: 1px solid var(--border);
+}
+.benchmarks-table .improvement {
+  color: #3d6e3d;
+  font-weight: 500;
+}
+
+/* ── 研究动机 ── */
+.motivation-block p {
+  margin-bottom: 0.8rem;
+}
@@ -0,0 +1,11 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32">
+  <!-- Site favicon on a 32x32 viewBox: a rounded (rx=6) square filled with #1B365D,
+       overlaid with two outlines mirrored about x=16 and four near-zero-length
+       strokes that render as dots because of the round line caps. -->
+  <rect width="32" height="32" rx="6" fill="#1B365D"/>
+  <g fill="none" stroke="#f5f4ed" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round">
+    <path d="M8 7h6a2 2 0 0 1 2 2v16l-1-1-2 1-2-1-2 1V9a1 1 0 0 1 1-1z"/>
+    <path d="M24 7h-6a2 2 0 0 0-2 2v16l1-1 2 1 2-1 2 1V9a1 1 0 0 0-1-1z"/>
+    <line x1="12" y1="12" x2="12" y2="12.01"/>
+    <line x1="12" y1="16" x2="12" y2="16.01"/>
+    <line x1="20" y1="12" x2="20" y2="12.01"/>
+    <line x1="20" y1="16" x2="20" y2="16.01"/>
+  </g>
+</svg>
@@ -36,9 +36,17 @@
            </td>
            <td>
              <span class="status-badge status-{{ log.status }}">
-                {% if log.status == 'success' %}✓ 成功 {% elif log.status ==
-                'running' %}⟳ 运行中 {% elif log.status == 'failed' %}✗ 失败 {%
-                else %}{{ log.status }}{% endif %}
+                {# djlint:off #}
+                {% if log.status == 'success' %}
+                  ✓ 成功
+                {% elif log.status == 'running' %}
+                  ⟳ 运行中
+                {% elif log.status == 'failed' %}
+                  ✗ 失败
+                {% else %}
+                  {{ log.status }}
+                {% endif %}
+                {# djlint:on #}
              </span>
            </td>
            <td>{{ log.date or '-' }}</td>
@@ -97,9 +105,17 @@
            <td>{{ job.paper_count or 0 }}</td>
            <td>
              <span class="status-badge status-{{ job.status }}">
-                {% if job.status == 'success' %}✓ 成功 {% elif job.status ==
-                'running' %}⟳ 运行中 {% elif job.status == 'failed' %}✗ 失败 {%
-                else %}{{ job.status }}{% endif %}
+                {# djlint:off #}
+                {% if job.status == 'success' %}
+                  ✓ 成功
+                {% elif job.status == 'running' %}
+                  ⟳ 运行中
+                {% elif job.status == 'failed' %}
+                  ✗ 失败
+                {% else %}
+                  {{ job.status }}
+                {% endif %}
+                {# djlint:on #}
              </span>
            </td>
            <td class="time-cell">
@@ -345,21 +361,23 @@
 {% endblock %} {% block scripts %}
 <script>
  function adminAction(action) {
-    const token = prompt("请输入 Admin Token:");
-    if (!token) return;
-
    const url = "/admin/" + action;
    fetch(url, {
      method: "POST",
-      headers: {
-        Authorization: "Bearer " + token,
-        "Content-Type": "application/json",
-      },
+      headers: { "Content-Type": "application/json" },
+    })
+      .then((r) => {
+        if (r.status === 303 || r.status === 401) {
+          window.location.href = "/admin/login";
+          return;
+        }
+        return r.json();
      })
-      .then((r) => r.json())
      .then((data) => {
+        if (data) {
          alert(JSON.stringify(data, null, 2));
          location.reload();
+        }
      })
      .catch((err) => {
        alert("请求失败: " + err.message);
@@ -4,7 +4,9 @@
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>{% block title %}HF Daily Papers{% endblock %}</title>
+    <link rel="icon" type="image/svg+xml" href="/static/favicon.svg" />
    <link rel="stylesheet" href="/static/css/style.css" />
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css" />
  </head>
  <body>
    <header class="site-header">
@@ -23,7 +25,13 @@
          <a href="/search">搜索</a>
          <a href="/trends">趋势</a>
          <a href="/reading-list">阅读列表</a>
+          {% if is_admin %}
           <a href="/admin/logs">管理</a>
+          {# The logout form is the next sibling of this link, not an ancestor, so closest('form') would return null. #}
+          <a href="/admin/logout" onclick="event.preventDefault();this.nextElementSibling.submit()">退出</a>
+          <form action="/admin/logout" method="post" style="display:none"></form>
+          {% else %}
+          <a href="/admin/login">管理</a>
+          {% endif %}
        </div>
      </nav>
    </header>
@@ -57,45 +57,158 @@ endblock %} {% block content %}
  <div class="quality-warning">📝 总结部分字段不完整</div>
  {% endif %} {% if paper.summary.one_line %}
  <section class="summary-section">
-    <h2>一句话摘要</h2>
    <p class="one-line">{{ paper.summary.one_line }}</p>
  </section>
-  {% endif %} {% if paper.summary.difficulty %}
+  {% endif %}
+
+  {# ── 前置知识 ── #}
+  {% if prereqs and prereqs.concepts %}
  <section class="summary-section">
-    <h2>难度</h2>
-    <p>{{ paper.summary.difficulty }}</p>
+    <h2>前置知识</h2>
+    <div class="prerequisites-list">
+      {% for c in prereqs.concepts %}
+      <div class="concept-card">
+        <h3>{{ c.term }}</h3>
+        <p>{{ c.explanation }}</p>
+        {% if c.why_matters %}
+        <p class="concept-why">{{ c.why_matters }}</p>
+        {% endif %}
+      </div>
+      {% endfor %}
+    </div>
  </section>
-  {% endif %} {% if paper.summary.motivation_problem %}
+  {% endif %}
+
+  {# ── 研究动机 ── #}
+  {% if paper.summary.motivation_problem %}
  <section class="summary-section">
    <h2>研究动机</h2>
+    <div class="motivation-block">
      {% if paper.summary.motivation_problem %}
-    <p><strong>问题：</strong>{{ paper.summary.motivation_problem }}</p>
-    {% endif %} {% if paper.summary.motivation_goal %}
-    <p><strong>目标：</strong>{{ paper.summary.motivation_goal }}</p>
-    {% endif %} {% if paper.summary.motivation_gap %}
-    <p><strong>差距：</strong>{{ paper.summary.motivation_gap }}</p>
+      <p>{{ paper.summary.motivation_problem }}</p>
      {% endif %}
+      {% if paper.summary.motivation_goal %}
+      <p>本文的目标是{{ paper.summary.motivation_goal }}</p>
+      {% endif %}
+      {% if paper.summary.motivation_gap %}
+      <p>与已有工作不同的是，{{ paper.summary.motivation_gap }}</p>
+      {% endif %}
+    </div>
  </section>
-  {% endif %} {% if paper.summary.method_key_idea %}
+  {% endif %}
+
+  {# ── 核心方法 ── #}
+  {% if paper.summary.method_key_idea %}
  <section class="summary-section">
    <h2>核心方法</h2>
    {% if paper.summary.method_overview %}
    <p>{{ paper.summary.method_overview }}</p>
    {% endif %}
-    <p><strong>关键思路：</strong>{{ paper.summary.method_key_idea }}</p>
+    <div class="key-idea">
+      <p>{{ paper.summary.method_key_idea }}</p>
+    </div>
+    {% if paper.summary.method_steps_json %}
+    <details>
+      <summary>方法步骤详情</summary>
+      <p>{{ paper.summary.method_steps_json }}</p>
+    </details>
+    {% endif %}
    {% if paper.summary.method_novelty %}
-    <p><strong>新颖性：</strong>{{ paper.summary.method_novelty }}</p>
+    <details>
+      <summary>技术新颖性</summary>
+      <p>{{ paper.summary.method_novelty }}</p>
+    </details>
    {% endif %}
  </section>
-  {% endif %} {% if paper.summary.results_main_json %}
+  {% endif %}
+
+  {# ── 实验结果 ── #}
+  {% if paper.summary.results_main_json %}
  <section class="summary-section">
    <h2>实验结果</h2>
    <p>{{ paper.summary.results_main_json }}</p>
+    {% if table_figures and table_figures|length > 0 %}
+    {# 优先展示原文表格截图 #}
+    {% for tf in table_figures %}
+    <figure class="inline-figure table-screenshot">
+      <img src="{{ tf.image_url }}" alt="{{ tf.caption or tf.id }}" loading="lazy" />
+      <figcaption>
+        <strong>{{ tf.id }}</strong>{% if tf.caption %}: {{ tf.caption }}{% endif %}
+      </figcaption>
+    </figure>
+    {% endfor %}
+    {% if benchmarks and benchmarks|length > 0 %}
+    <details>
+      <summary>查看结构化数据</summary>
+      <table class="benchmarks-table">
+        <thead>
+          <tr><th>任务</th><th>指标</th><th>本文</th><th>基线</th><th>提升</th></tr>
+        </thead>
+        <tbody>
+          {% for b in benchmarks %}
+          {% if b is mapping %}
+          <tr>
+            <td>{{ b.get('task','') }}</td>
+            <td>{{ b.get('metric','') }}</td>
+            <td><strong>{{ b.get('this_work','') }}</strong></td>
+            <td>{{ b.get('baseline','') }}</td>
+            <td class="improvement">{{ b.get('improvement','') }}</td>
+          </tr>
+          {% endif %}
+          {% endfor %}
+        </tbody>
+      </table>
+    </details>
+    {% endif %}
+    {% elif benchmarks and benchmarks|length > 0 %}
+    {# 无截图时回退到 HTML 表格 #}
+    <table class="benchmarks-table">
+      <thead>
+        <tr><th>任务</th><th>指标</th><th>本文</th><th>基线</th><th>提升</th></tr>
+      </thead>
+      <tbody>
+        {% for b in benchmarks %}
+        {% if b is mapping %}
+        <tr>
+          <td>{{ b.get('task','') }}</td>
+          <td>{{ b.get('metric','') }}</td>
+          <td><strong>{{ b.get('this_work','') }}</strong></td>
+          <td>{{ b.get('baseline','') }}</td>
+          <td class="improvement">{{ b.get('improvement','') }}</td>
+        </tr>
+        {% endif %}
+        {% endfor %}
+      </tbody>
+    </table>
+    {% endif %}
  </section>
-  {% endif %} {% if paper.summary.limitations_json %}
+  {% endif %}
+
+  {# ── 局限与改进 ── #}
+  {% if paper.summary.limitations_json or paper.summary.weaknesses_json or paper.summary.future_work_json %}
  <section class="summary-section">
    <h2>局限与改进</h2>
+    {% if paper.summary.limitations_json %}
    <p>{{ paper.summary.limitations_json }}</p>
+    {% endif %}
+    {% if paper.summary.weaknesses_json %}
+    <details>
+      <summary>独立分析的弱点</summary>
+      <p>{{ paper.summary.weaknesses_json }}</p>
+    </details>
+    {% endif %}
+    {% if paper.summary.future_work_json %}
+    <details>
+      <summary>未来方向</summary>
+      <p>{{ paper.summary.future_work_json }}</p>
+    </details>
+    {% endif %}
+    {% if paper.summary.reproducibility %}
+    <details>
+      <summary>复现评估</summary>
+      <p>{{ paper.summary.reproducibility }}</p>
+    </details>
+    {% endif %}
  </section>
  {% endif %} {% elif summary_state == 'processing' %}
  <div class="summary-placeholder processing">
@@ -123,9 +236,30 @@ endblock %} {% block content %}
    <h2>Abstract</h2>
    <p class="abstract-en">{{ paper.abstract }}</p>
  </section>
-  {% endif %} {# 图片画廊 #} {% if paper_images %}
+  {% endif %}
+
+  {# ── 论文图表（关联 figures 元数据）── #}
+  {% if figures or paper_images %}
  <section class="image-gallery">
-    <h2>论文图片</h2>
+    <h2>论文图表</h2>
+    {% for fig in figures %}
+    <figure class="inline-figure">
+      {% if fig.image_url %}
+      <img src="{{ fig.image_url }}" alt="{{ fig.caption or fig.id }}" loading="lazy" />
+      {% endif %}
+      <figcaption>
+        <strong>{{ fig.id }}</strong>{% if fig.caption %}: {{ fig.caption }}{% endif %}
+        {% if fig.description %}
+        <p>{{ fig.description }}</p>
+        {% endif %}
+        {% if fig.reason %}
+        <p class="concept-why">{{ fig.reason }}</p>
+        {% endif %}
+      </figcaption>
+    </figure>
+    {% endfor %}
+    {# 如果有图片但没有对应的 figures 元数据，仍然展示 #}
+    {% if not figures and paper_images %}
    <div class="gallery-grid">
      {% for img in paper_images %}
      <div class="gallery-item">
@@ -134,8 +268,9 @@ endblock %} {% block content %}
      </div>
      {% endfor %}
    </div>
+    {% endif %}
  </section>
-  {% endif %} {# 相似论文推荐 #} {% if similar_papers %}
+  {% endif %} {% if similar_papers %}
  <section class="similar-papers">
    <h2>相似论文推荐</h2>
    {% for sp in similar_papers %}
@@ -152,3 +287,234 @@ endblock %} {% block content %}
  {% endif %}
 </article>
 {% endblock %}
+
+{% block scripts %}
+<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
+<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js"
+  onload="renderMathInElement(document.querySelector('.paper-detail'),{delimiters:[{left:'$$',right:'$$',display:true},{left:'$',right:'$',display:false}]});">
+</script>
+<style>
+.lightbox-overlay {
+  position: fixed !important;
+  top: 0 !important;
+  left: 0 !important;
+  right: 0 !important;
+  bottom: 0 !important;
+  width: 100vw !important;
+  height: 100vh !important;
+  z-index: 99999 !important;
+  background: rgba(0, 0, 0, 0.85);
+  overflow: hidden;
+  margin: 0 !important;
+  padding: 0 !important;
+  opacity: 0;
+  transition: opacity 0.2s;
+}
+.lightbox-overlay.active {
+  opacity: 1;
+}
+.lightbox-overlay img {
+  position: absolute;
+  transform-origin: 0 0;
+  border-radius: 4px;
+  box-shadow: 0 0 40px rgba(0, 0, 0, 0.4);
+  cursor: grab;
+  user-select: none;
+  -webkit-user-drag: none;
+}
+.lightbox-overlay img.dragging {
+  cursor: grabbing;
+}
+/* 工具栏 */
+.lightbox-toolbar {
+  position: absolute;
+  bottom: 24px;
+  left: 50%;
+  transform: translateX(-50%);
+  display: flex;
+  gap: 8px;
+  background: rgba(0, 0, 0, 0.6);
+  padding: 8px 14px;
+  border-radius: 24px;
+  z-index: 100000;
+}
+.lightbox-toolbar button {
+  background: none;
+  border: 1px solid rgba(255,255,255,0.3);
+  color: #fff;
+  width: 36px;
+  height: 36px;
+  border-radius: 50%;
+  font-size: 1.1rem;
+  cursor: pointer;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  transition: background 0.15s;
+}
+.lightbox-toolbar button:hover {
+  background: rgba(255,255,255,0.15);
+}
+</style>
+<script>
+(function() {
+  function openLightbox(src, alt) {
+    var existing = document.querySelector('.lightbox-overlay');
+    if (existing) existing.remove();
+
+    var overlay = document.createElement('div');
+    overlay.className = 'lightbox-overlay';
+
+    var img = document.createElement('img');
+    img.src = src;
+    img.alt = alt || '';
+    img.draggable = false;
+
+    // 工具栏
+    var toolbar = document.createElement('div');
+    toolbar.className = 'lightbox-toolbar';
+    toolbar.innerHTML =
+      '<button title="缩小">−</button>' +
+      '<button title="放大">+</button>' +
+      '<button title="适合窗口">⊡</button>' +
+      '<button title="原始大小">1:1</button>' +
+      '<button title="关闭">✕</button>';
+
+    overlay.appendChild(img);
+    overlay.appendChild(toolbar);
+    document.body.appendChild(overlay);
+
+    // 视图状态
+    var scale = 1, tx = 0, ty = 0;
+    var baseW = 0, baseH = 0;
+    var dragging = false, dragStartX = 0, dragStartY = 0, startTx = 0, startTy = 0;
+
+    function apply() {
+      img.style.transform = 'translate(' + tx + 'px,' + ty + 'px) scale(' + scale + ')';
+    }
+
+    function fitToScreen() {
+      if (!baseW) return;
+      var sw = window.innerWidth, sh = window.innerHeight;
+      scale = Math.min(sw * 0.9 / baseW, sh * 0.9 / baseH, 1);
+      tx = (sw - baseW * scale) / 2;
+      ty = (sh - baseH * scale) / 2;
+      apply();
+    }
+
+    function resetOrigin() {
+      scale = 1;
+      tx = (window.innerWidth - baseW) / 2;
+      ty = (window.innerHeight - baseH) / 2;
+      apply();
+    }
+
+    // Zoom by `factor` (result clamped to the 0.1–20 scale range) while keeping
+    // the image point currently shown at overlay position (cx, cy) in place.
+    function zoomAt(factor, cx, cy) {
+      var newScale = Math.max(0.1, Math.min(scale * factor, 20));
+      var ratio = newScale / scale;
+      // Re-anchor the translation on both axes with the same formula so the
+      // point under (cx, cy) does not move when the scale changes.
+      tx = cx - (cx - tx) * ratio;
+      ty = cy - (cy - ty) * ratio;
+      scale = newScale;
+      apply();
+    }
+
+    // Zoom around the centre of the window (used by the toolbar buttons and
+    // the keyboard shortcuts).
+    function zoomCenter(factor) {
+      zoomAt(factor, window.innerWidth / 2, window.innerHeight / 2);
+    }
+
+    // 图片加载后初始化
+    img.onload = function() {
+      baseW = img.naturalWidth;
+      baseH = img.naturalHeight;
+      fitToScreen();
+    };
+    // 如果已缓存
+    if (img.complete && img.naturalWidth) {
+      baseW = img.naturalWidth;
+      baseH = img.naturalHeight;
+      fitToScreen();
+    }
+
+    // 工具栏按钮
+    var btns = toolbar.querySelectorAll('button');
+    // 缩小 / 放大 / 适合 / 原始 / 关闭
+    btns[0].onclick = function(e) { e.stopPropagation(); zoomCenter(0.7); };
+    btns[1].onclick = function(e) { e.stopPropagation(); zoomCenter(1.4); };
+    btns[2].onclick = function(e) { e.stopPropagation(); fitToScreen(); };
+    btns[3].onclick = function(e) { e.stopPropagation(); resetOrigin(); };
+    btns[4].onclick = function(e) { e.stopPropagation(); close(); };
+
+    // 滚轮缩放（以鼠标为中心）
+    overlay.addEventListener('wheel', function(e) {
+      e.preventDefault();
+      var factor = e.deltaY < 0 ? 1.15 : 0.87;
+      var rect = overlay.getBoundingClientRect();
+      var cx = e.clientX - rect.left;
+      var cy = e.clientY - rect.top;
+      var newScale = Math.max(0.1, Math.min(scale * factor, 20));
+      tx = cx - (cx - tx) * (newScale / scale);
+      ty = cy - (cy - ty) * (newScale / scale);
+      scale = newScale;
+      apply();
+    }, { passive: false });
+
+    // 拖拽平移
+    overlay.addEventListener('pointerdown', function(e) {
+      if (e.target.closest('.lightbox-toolbar')) return;
+      dragging = true;
+      dragStartX = e.clientX;
+      dragStartY = e.clientY;
+      startTx = tx;
+      startTy = ty;
+      img.classList.add('dragging');
+      overlay.setPointerCapture(e.pointerId);
+    });
+    overlay.addEventListener('pointermove', function(e) {
+      if (!dragging) return;
+      tx = startTx + (e.clientX - dragStartX);
+      ty = startTy + (e.clientY - dragStartY);
+      apply();
+    });
+    overlay.addEventListener('pointerup', function() {
+      dragging = false;
+      img.classList.remove('dragging');
+    });
+
+    // ESC 关闭
+    function onKey(e) {
+      if (e.key === 'Escape') { close(); }
+      else if (e.key === '+' || e.key === '=') { zoomCenter(1.4); }
+      else if (e.key === '-') { zoomCenter(0.7); }
+      else if (e.key === '0') { fitToScreen(); }
+    }
+
+    function close() {
+      overlay.remove();
+      document.removeEventListener('keydown', onKey);
+    }
+
+    document.addEventListener('keydown', onKey);
+
+    // 激活动画
+    requestAnimationFrame(function() {
+      overlay.classList.add('active');
+    });
+  }
+
+  document.addEventListener('click', function(e) {
+    var img = e.target;
+    if (img.tagName !== 'IMG') return;
+    if (!img.closest('.inline-figure') && !img.closest('.gallery-item')) return;
+    if (img.closest('.lightbox-overlay')) return;
+    e.preventDefault();
+    openLightbox(img.src, img.alt);
+  });
+})();
+</script>
+{% endblock %}
@@ -0,0 +1,150 @@
+{% extends "base.html" %}
+{% block title %}登录 — HF Daily Papers{% endblock %}
+{% block content %}
+<div class="login-page">
+  <div class="login-card">
+    <div class="login-header">
+      <h1 class="login-title">🔑 管理员登录</h1>
+      <p class="login-subtitle">请输入管理员账号和密码</p>
+    </div>
+
+    {% if error %}
+    <div class="login-error">
+      {{ error }}
+    </div>
+    {% endif %}
+
+    <form class="login-form" action="/admin/login" method="post">
+      <div class="login-field">
+        <label for="username">用户名</label>
+        <input
+          type="text"
+          id="username"
+          name="username"
+          placeholder="请输入用户名"
+          required
+          autofocus
+        />
+      </div>
+      <div class="login-field">
+        <label for="password">密码</label>
+        <input
+          type="password"
+          id="password"
+          name="password"
+          placeholder="请输入密码"
+          required
+        />
+      </div>
+      <button type="submit" class="login-btn">登 录</button>
+    </form>
+  </div>
+</div>
+
+<style>
+  .login-page {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    min-height: 60vh;
+    padding: 40px 16px;
+  }
+
+  .login-card {
+    width: 100%;
+    max-width: 400px;
+    background: var(--surface);
+    border: 1px solid var(--border);
+    border-radius: var(--radius-lg);
+    padding: 36px 32px;
+    box-shadow: 0 4px 24px var(--shadow);
+  }
+
+  .login-header {
+    text-align: center;
+    margin-bottom: 28px;
+  }
+
+  .login-title {
+    font-family: var(--font-body);
+    font-size: 1.4rem;
+    font-weight: 700;
+    color: var(--ink);
+    margin: 0 0 8px;
+  }
+
+  .login-subtitle {
+    font-size: 0.9rem;
+    color: var(--ink-light);
+    margin: 0;
+  }
+
+  .login-error {
+    background: #fce4ec;
+    color: #c62828;
+    padding: 10px 14px;
+    border-radius: var(--radius);
+    font-size: 0.85rem;
+    margin-bottom: 20px;
+    text-align: center;
+  }
+
+  .login-form {
+    display: flex;
+    flex-direction: column;
+    gap: 18px;
+  }
+
+  .login-field label {
+    display: block;
+    font-size: 0.85rem;
+    font-weight: 600;
+    color: var(--ink);
+    margin-bottom: 6px;
+  }
+
+  .login-field input {
+    width: 100%;
+    padding: 10px 14px;
+    border: 1px solid var(--border);
+    border-radius: var(--radius);
+    font-size: 0.9rem;
+    font-family: var(--font-sans);
+    background: var(--bg);
+    color: var(--ink);
+    transition: border-color 0.2s;
+    box-sizing: border-box;
+  }
+
+  .login-field input:focus {
+    outline: none;
+    border-color: var(--accent);
+    box-shadow: 0 0 0 3px rgba(27, 54, 93, 0.1);
+  }
+
+  .login-btn {
+    width: 100%;
+    padding: 12px;
+    background: var(--accent);
+    color: #fff;
+    border: none;
+    border-radius: var(--radius);
+    font-size: 0.95rem;
+    font-weight: 600;
+    cursor: pointer;
+    transition: background 0.2s;
+    font-family: var(--font-sans);
+    margin-top: 4px;
+  }
+
+  .login-btn:hover {
+    background: var(--accent-hover);
+  }
+
+  @media (max-width: 480px) {
+    .login-card {
+      padding: 28px 20px;
+    }
+  }
+</style>
+{% endblock %}
@@ -34,18 +34,31 @@
      <span
        class="summary-badge summary-{{ paper.summary_status.status if paper.summary_status else 'none' }}"
      >
-        {% if not paper.summary_status or paper.summary_status.status ==
-        'pending' %} 未总结 {% elif paper.summary_status.status == 'processing'
-        %} 🔄 总结中 {% elif paper.summary_status.status == 'failed' or
-        paper.summary_status.status == 'permanent_failure' %} ❌ 总结失败 {%
-        elif paper.summary_status.status == 'done' %} ✅ 已总结 {% endif %}
+        {# djlint:off #}
+        {% if not paper.summary_status or paper.summary_status.status == 'pending' %}
+          未总结
+        {% elif paper.summary_status.status == 'processing' %}
+          🔄 总结中
+        {% elif paper.summary_status.status == 'failed' or paper.summary_status.status == 'permanent_failure' %}
+          ❌ 总结失败
+        {% elif paper.summary_status.status == 'done' %}
+          ✅ 已总结
+        {% endif %}
+        {# djlint:on #}
      </span>
      {% if paper.reading_status %}
      <span class="reading-badge reading-{{ paper.reading_status.status }}">
-        {% if paper.reading_status.status == 'unread' %}未读 {% elif
-        paper.reading_status.status == 'skimmed' %}已浏览 {% elif
-        paper.reading_status.status == 'read_summary' %}已读摘要 {% elif
-        paper.reading_status.status == 'read_full' %}已读原文 {% endif %}
+        {# djlint:off #}
+        {% if paper.reading_status.status == 'unread' %}
+          未读
+        {% elif paper.reading_status.status == 'skimmed' %}
+          已浏览
+        {% elif paper.reading_status.status == 'read_summary' %}
+          已读摘要
+        {% elif paper.reading_status.status == 'read_full' %}
+          已读原文
+        {% endif %}
+        {# djlint:on #}
      </span>
      {% endif %}
    </div>
@@ -22,16 +22,7 @@ endblock %} {% block content %}
          type="radio"
          name="mode"
          value="keyword"
-          {%
-          if
-          mode=""
-          ="keyword"
-          or
-          not
-          mode
-          %}checked{%
-          endif
-          %}
+          {% if mode == "keyword" or not mode %}checked{% endif %}
        />
        关键词
      </label>
@@ -40,13 +31,7 @@ endblock %} {% block content %}
          type="radio"
          name="mode"
          value="semantic"
-          {%
-          if
-          mode=""
-          ="semantic"
-          %}checked{%
-          endif
-          %}
+          {% if mode == "semantic" %}checked{% endif %}
        />
        语义搜索
      </label>
@@ -142,11 +127,17 @@ endblock %} {% block content %}
        <span
          class="summary-badge summary-{{ paper.summary_status.status if paper.summary_status else 'none' }}"
        >
-          {% if not paper.summary_status or paper.summary_status.status ==
-          'pending' %} 未总结 {% elif paper.summary_status.status ==
-          'processing' %} 🔄 总结中 {% elif paper.summary_status.status in
-          ('failed', 'permanent_failure') %} ❌ 总结失败 {% elif
-          paper.summary_status.status == 'done' %} ✅ 已总结 {% endif %}
+          {# djlint:off #}
+          {% if not paper.summary_status or paper.summary_status.status == 'pending' %}
+            未总结
+          {% elif paper.summary_status.status == 'processing' %}
+            🔄 总结中
+          {% elif paper.summary_status.status in ('failed', 'permanent_failure') %}
+            ❌ 总结失败
+          {% elif paper.summary_status.status == 'done' %}
+            ✅ 已总结
+          {% endif %}
+          {# djlint:on #}
        </span>
        <a href="/paper/{{ paper.arxiv_id }}" class="btn-detail">详情 →</a>
      </div>
@@ -32,20 +32,20 @@ endblock %} {% block content %}
 {% endblock %} {% block scripts %}
 <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.7/dist/chart.umd.min.js"></script>
 <script>
-  // 颜色配置（kami 风格墨蓝色系）
+  // 颜色配置（Kami ink-blue 暖调色系）
  const COLORS = {
-    primary: '#2d5f8a',
-    primaryLight: 'rgba(45, 95, 138, 0.2)',
-    accent: '#5a9bc7',
-    success: '#388e3c',
-    warning: '#f57f17',
-    danger: '#c62828',
-    muted: '#4a4a6a',
+    primary: '#1B365D',
+    primaryLight: 'rgba(27, 54, 93, 0.12)',
+    accent: '#2a4d7a',
+    success: '#3d6e3d',
+    warning: '#7a6430',
+    danger: '#8c2828',
+    muted: '#6b6a64',
    palette: [
-      '#2d5f8a', '#5a9bc7', '#388e3c', '#f57f17', '#c62828',
-      '#7b1fa2', '#00838f', '#ef6c00', '#455a64', '#827717',
-      '#1565c0', '#ad1457', '#00695c', '#e65100', '#283593',
-      '#9e9d24', '#6a1b9a', '#00838f', '#4e342e', '#37474f',
+      '#1B365D', '#2a4d7a', '#3d6e3d', '#7a6430', '#8c2828',
+      '#4a4070', '#2d6b6e', '#8a5a2a', '#504e49', '#5c6030',
+      '#2b4a80', '#70304a', '#2d5e56', '#7a4a10', '#353a60',
+      '#6a6a28', '#552a5a', '#2d6b6e', '#4a3828', '#3d4450',
    ],
  };

@@ -19,7 +19,17 @@ TMP_DIR = DATA_DIR / "tmp"

 # ── 模板单例 ──────────────────────────────────────────────────────────

-templates = Jinja2Templates(directory="app/templates")
+
+class _Templates(Jinja2Templates):
+    """Jinja2Templates subclass that adds ``is_admin`` to every template context."""
+
+    def TemplateResponse(self, request, name, context=None, **kwargs):
+        """Render *name*, defaulting ``is_admin`` from the request session.
+
+        A falsy *context* is replaced by a new dict; otherwise the given dict
+        is updated in place. An ``is_admin`` key that is already present is
+        left untouched.
+        """
+        template_context = context if context else {}
+        session_flag = request.session.get("is_admin", False)
+        if "is_admin" not in template_context:
+            template_context["is_admin"] = session_flag
+        return super().TemplateResponse(request, name, template_context, **kwargs)
+
+
+templates = _Templates(directory="app/templates")


 # ── 时区工具 ──────────────────────────────────────────────────────────
@@ -16,6 +16,8 @@ dependencies = [
    "python-dotenv>=1.0",
    "apscheduler>=3.10",
    "chromadb>=1.0",
+    "pymupdf>=1.25",
+    "itsdangerous>=2.2.0",
 ]

 [project.optional-dependencies]
@@ -0,0 +1,117 @@
+"""验证 summary JSON 是否符合 SummarySchema 要求。
+
+用法：python scripts/validate_summary.py <json_file>
+返回：exit 0 = 通过，exit 1 = 失败（错误信息输出到 stdout）
+"""
+
+import json
+import sys
+from pathlib import Path
+
+
+def _check_paragraph_fields(
+    section: str, content: dict, fields: list[str], errors: list[str]
+) -> None:
+    """Append an error for each field in *fields* that is not a detailed paragraph.
+
+    A field passes when its value in *content* is a ``str`` whose stripped
+    length is at least 50 characters; a missing field is treated as ``""``.
+    """
+    for name in fields:
+        val = content.get(name, "")
+        if not isinstance(val, str) or len(val.strip()) < 50:
+            errors.append(
+                f"{section}.{name} 必须是详细段落（≥50字），当前: "
+                f"{type(val).__name__} ({len(str(val))}字)"
+            )
+
+
+def validate(path: str) -> list[str]:
+    """Validate the summary JSON file at *path*.
+
+    Checks run in this order: required top-level fields, ``tags`` type,
+    paragraph fields of ``motivation`` / ``method`` / ``results`` /
+    ``improvements`` (plus ``results.benchmarks`` type), paragraph fields
+    that were supplied as arrays, and ``figures`` entries.
+
+    Args:
+        path: Path of the JSON file to check.
+
+    Returns:
+        A list of error messages; empty when the file passes. A file that
+        cannot be read, is not valid JSON, or whose top level is not an
+        object yields a single-item list.
+    """
+    try:
+        data = json.loads(Path(path).read_text(encoding="utf-8"))
+    except json.JSONDecodeError as e:
+        return [f"JSON 解析失败: {e}"]
+    except (OSError, UnicodeDecodeError) as e:
+        # Report unreadable or non-UTF-8 files as a validation failure
+        # instead of letting a traceback escape.
+        return [f"无法读取文件: {e}"]
+
+    if not isinstance(data, dict):
+        return ["顶层必须是 JSON 对象 (dict)"]
+
+    errors: list[str] = []
+
+    # Required top-level fields must be present and truthy.
+    for field_name in ("arxiv_id", "title_zh", "one_line", "tags"):
+        if not data.get(field_name):
+            errors.append(f"缺少必填字段: {field_name}")
+
+    # tags must be a non-empty array.
+    tags = data.get("tags")
+    if not isinstance(tags, list):
+        errors.append("tags 必须是数组")
+    elif not tags:
+        errors.append("tags 不能为空数组")
+
+    # Sections whose listed fields must each be a detailed paragraph string.
+    paragraph_fields = {
+        "motivation": ["problem", "goal", "gap"],
+        "method": ["overview", "key_idea", "steps", "novelty"],
+        "results": ["main_findings", "limitations"],
+        "improvements": ["weaknesses", "future_work", "reproducibility"],
+    }
+    for section, fields in paragraph_fields.items():
+        content = data.get(section, {})
+        if not isinstance(content, dict):
+            errors.append(f"{section} 必须是对象")
+            continue
+        _check_paragraph_fields(section, content, fields, errors)
+        if section == "results":
+            # benchmarks is optional, but must be an array when present.
+            benchmarks = content.get("benchmarks")
+            if benchmarks is not None and not isinstance(benchmarks, list):
+                errors.append("results.benchmarks 必须是数组")
+
+    # Flag paragraph fields that were supplied as arrays instead of strings.
+    for section, fields in paragraph_fields.items():
+        content = data.get(section)
+        if not isinstance(content, dict):
+            # A non-object section was already reported above; it has no
+            # fields to inspect, so skip it rather than call .get() on it.
+            continue
+        for name in fields:
+            if isinstance(content.get(name), list):
+                errors.append(f"{section}.{name} 应该是字符串段落，不能是数组")
+
+    # figures is optional; when present it must be an array whose dict
+    # entries carry a non-empty id.
+    figures = data.get("figures")
+    if figures is not None:
+        if not isinstance(figures, list):
+            errors.append("figures 必须是数组")
+        else:
+            for i, fig in enumerate(figures):
+                if isinstance(fig, dict) and not fig.get("id"):
+                    errors.append(f"figures[{i}] 缺少 id 字段")
+
+    return errors
+
+
+if __name__ == "__main__":
+    # Exactly one command-line argument is expected: the JSON file to check.
+    if len(sys.argv) != 2:
+        print("用法: python scripts/validate_summary.py <json_file>")
+        sys.exit(1)
+
+    problems = validate(sys.argv[1])
+    if not problems:
+        print("✅ 验证通过")
+        sys.exit(0)
+
+    # Failure path: list every problem on stdout and exit non-zero.
+    print("❌ 验证失败:")
+    for problem in problems:
+        print(f"  - {problem}")
+    sys.exit(1)
@@ -87,7 +87,8 @@ def client(db_engine, db_session):
 # ── 样例数据 ────────────────────────────────────────────────────────────

 SAMPLE_ARXIV_ID = "2401.12345"
-ADMIN_TOKEN = "test-admin-token-12345"
+_TEST_ADMIN_USERNAME = "admin"
+_TEST_ADMIN_PASSWORD = "test-password-12345"


@pytest.fixture
@@ -138,46 +139,56 @@ def sample_paper(db_session):
 def sample_summary_dict() -> dict:
    """完整合法的 summary dict。"""
    return {
+        "arxiv_id": "2401.12345",
        "title_zh": "测试论文中文标题",
        "one_line": "这是一篇关于自然语言处理的测试论文的一句话总结。",
        "tags": ["自然语言处理", "大语言模型", "Transformer"],
        "difficulty": "中级",
        "prerequisites": {
-            "concepts": ["Transformer", "注意力机制"],
-            "level": "中级",
+            "concepts": [
+                {
+                    "term": "Transformer",
+                    "explanation": "一种基于自注意力机制的序列到序列模型架构，广泛用于NLP任务。",
+                    "why_matters": "本文方法基于 Transformer 架构进行改进。",
+                },
+                {
+                    "term": "注意力机制",
+                    "explanation": "允许模型在处理序列时动态关注不同位置的信息的机制。",
+                    "why_matters": "理解注意力机制是理解本文方法的基础。",
+                },
+            ],
        },
        "motivation": {
-            "problem": "现有模型在长文本理解上存在不足。",
-            "goal": "提出一种新的注意力机制来提升长文本建模能力。",
-            "gap": "当前方法计算复杂度过高。",
+            "problem": "现有模型在长文本理解上存在不足，主要体现在注意力计算复杂度随序列长度二次增长，导致实际应用中无法处理超长文本输入。",
+            "goal": "提出一种新的稀疏注意力机制来有效提升长文本建模能力，在保持模型整体性能的同时大幅降低计算开销和显存占用。",
+            "gap": "当前方法计算复杂度过高，已有的稀疏注意力方案在保留全局信息方面存在明显不足，导致长距离依赖建模效果不佳。",
        },
        "method": {
-            "overview": "提出了一种高效的稀疏注意力机制。",
-            "key_idea": "使用局部-全局混合的注意力模式来降低计算复杂度。",
-            "steps": [
-                "分析现有注意力机制的瓶颈",
-                "设计稀疏注意力模式",
-                "在多个基准上验证效果",
-            ],
-            "novelty": "首次将局部-全局注意力模式结合应用于长文本建模。",
+            "overview": "提出了一种高效的稀疏注意力机制，通过局部-全局混合的注意力模式，在降低计算复杂度的同时保留了关键的全局信息流动。",
+            "key_idea": "使用局部-全局混合的注意力模式来降低计算复杂度，局部窗口捕获短距离依赖，全局采样点维护长距离信息传递。",
+            "steps": "首先分析现有注意力机制的计算瓶颈，发现全连接注意力中大部分注意力权重接近于零。然后设计了一种混合稀疏注意力模式，包含局部滑动窗口和全局随机采样两条路径。最后在多个长文本基准数据集上进行了全面的实验验证。",
+            "novelty": "首次将局部-全局注意力模式结合应用于长文本建模，通过可学习的采样策略动态调整全局注意力点的位置，而非固定模式。",
        },
        "results": {
-            "main_findings": [
-                "在长文本基准上取得了 SOTA 结果",
-                "推理速度提升了 2 倍",
-            ],
+            "main_findings": "在长文本基准 LongBench 上取得了 SOTA 结果，平均得分提升 3.2 个百分点。推理速度相比全注意力提升了 2 倍，显存占用降低 60%。在 32k 序列长度下仍保持与全注意力相当的生成质量。",
            "benchmarks": [
-                {"dataset": "LongBench", "score": 85.3},
-            ],
-            "limitations": [
-                "在超长文本（>100k tokens）上效果有所下降",
+                {"task": "长文本摘要", "metric": "ROUGE-L", "this_work": "42.1", "baseline": "38.9", "improvement": "+3.2"},
            ],
+            "limitations": "在超长文本（>100k tokens）上效果有所下降，主要原因是全局采样点数量不足以覆盖所有关键信息。此外，在小规模数据集上的优势不如大规模数据集明显。",
        },
        "improvements": {
-            "weaknesses": ["仅验证了英文数据"],
-            "future_work": ["扩展到多语言场景"],
-            "reproducibility": "代码已开源，模型权重可下载。",
+            "weaknesses": "仅验证了英文数据，未在中文等多语言场景下测试。全局采样策略在极端长度的文本上可能需要增加采样点数量，增加了工程复杂度。",
+            "future_work": "扩展到多语言场景，研究自适应采样策略，使模型能根据输入内容动态调整全局注意力点的分配。同时探索与 Flash Attention 等底层优化的兼容性。",
+            "reproducibility": "代码已在 GitHub 开源，提供了完整的训练脚本和预训练模型权重。实验使用了公开数据集，硬件需求为 8×A100 GPU。",
        },
+        "figures": [
+            {
+                "id": "Figure 1",
+                "caption": "稀疏注意力机制的整体架构图",
+                "description": "展示了局部窗口注意力和全局采样注意力的组合方式，以及信息如何在两种路径间流动。",
+                "reason": "帮助理解本文方法的核心设计思想，直观展示了局部-全局混合模式的工作原理。",
+            },
+        ],
    }


@@ -200,21 +211,21 @@ def mock_pi_output(sample_summary_json) -> str:


@pytest.fixture
-def admin_token():
-    """返回测试用的 ADMIN_TOKEN（需要配合 monkeypatch 使用）。"""
-    return ADMIN_TOKEN
+def auth_client(client, monkeypatch):
+    """已登录的 TestClient（session cookie 自动携带）。"""
+    from app.config import settings

-
-@pytest.fixture
-def admin_headers(admin_token):
-    """带 Bearer token 的请求头。"""
-    return {"Authorization": f"Bearer {admin_token}"}
-
-
-@pytest.fixture
-def wrong_admin_headers():
-    """错误的 Authorization 请求头。"""
-    return {"Authorization": "Bearer wrong-token"}
+    monkeypatch.setattr(settings, "ADMIN_USERNAME", _TEST_ADMIN_USERNAME)
+    monkeypatch.setattr(settings, "ADMIN_PASSWORD", _TEST_ADMIN_PASSWORD)
+    monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
+    # 登录获取 session cookie
+    resp = client.post(
+        "/admin/login",
+        data={"username": _TEST_ADMIN_USERNAME, "password": _TEST_ADMIN_PASSWORD},
+        follow_redirects=False,
+    )
+    assert resp.status_code == 303
+    return client


 # ── 多样例数据 ────────────────────────────────────────────────────────────
@@ -16,19 +16,6 @@ from app.models import (
 )


-# ── Fixtures ────────────────────────────────────────────────────────────
-
-ADMIN_TOKEN = "test-admin-token-12345"
-
-
-@pytest.fixture
-def auth_client(client, monkeypatch):
-    """带 admin token monkeypatch 的 TestClient。"""
-    monkeypatch.setattr(settings, "ADMIN_TOKEN", ADMIN_TOKEN)
-    monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
-    return client
-
-
 # ═══════════════════════════════════════════════════════════════════════
 # Admin Routes — 鉴权测试
 # ═══════════════════════════════════════════════════════════════════════
@@ -37,46 +24,70 @@ def auth_client(client, monkeypatch):
 class TestAdminAuth:
    """管理接口鉴权测试。"""

-    def test_no_token_returns_403(self, auth_client):
-        """无 token 时请求管理接口应返回 403。"""
-        resp = auth_client.post("/admin/crawl")
-        assert resp.status_code in (403, 401)
+    def test_unauthenticated_redirects_to_login(self, client, monkeypatch):
+        """An unauthenticated request for an admin page is redirected to the login page."""
+        # Use the plain `client` fixture here: `auth_client` is already logged in.
+        monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password")
+        resp = client.get("/admin/logs", follow_redirects=False)
+        assert resp.status_code == 303
+        assert "/admin/login" in resp.headers.get("location", "")

-    def test_wrong_token_returns_401(self, auth_client, wrong_admin_headers):
-        """错误 token 应返回 401。"""
-        resp = auth_client.post("/admin/crawl", headers=wrong_admin_headers)
-        assert resp.status_code == 401
+    def test_no_session_returns_303(self, client, monkeypatch):
+        """Without a session, an admin endpoint answers with a 303 redirect to the login page."""
+        monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password")
+        response = client.post("/admin/crawl", follow_redirects=False)
+        redirect_target = response.headers.get("location", "")
+        assert response.status_code == 303
+        assert "/admin/login" in redirect_target

-    def test_correct_token_accepted(self, auth_client, admin_headers):
-        """正确 token 应被接受（crawl 可能会失败但不是 401）。"""
+    def test_wrong_password_shows_error(self, client, monkeypatch):
+        """A wrong password returns the login page (HTTP 200) containing an error message."""
+        monkeypatch.setattr(settings, "ADMIN_USERNAME", "admin")
+        monkeypatch.setattr(settings, "ADMIN_PASSWORD", "correct-pass")
+        credentials = {"username": "admin", "password": "wrong-pass"}
+        response = client.post("/admin/login", data=credentials, follow_redirects=False)
+        assert response.status_code == 200
+        page = response.text
+        assert "错误" in page or "error" in page.lower()
+
+    def test_correct_login_redirects_to_logs(self, client, monkeypatch):
+        """A successful login answers with a 303 redirect to /admin/logs."""
+        monkeypatch.setattr(settings, "ADMIN_USERNAME", "admin")
+        monkeypatch.setattr(settings, "ADMIN_PASSWORD", "test-pass")
+        credentials = {"username": "admin", "password": "test-pass"}
+        response = client.post("/admin/login", data=credentials, follow_redirects=False)
+        redirect_target = response.headers.get("location", "")
+        assert response.status_code == 303
+        assert "/admin/logs" in redirect_target
+
+    def test_logout_clears_session(self, auth_client, monkeypatch):
+        """Logging out clears the session, so admin pages redirect again."""
+        monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
+        logout_response = auth_client.post("/admin/logout", follow_redirects=False)
+        assert logout_response.status_code == 303
+        # Once logged out, requesting an admin page must be redirected.
+        page_response = auth_client.get("/admin/logs", follow_redirects=False)
+        assert page_response.status_code == 303
+
+    def test_correct_session_accepted(self, auth_client):
+        """已登录 session 应被接受（crawl 可能会失败但不是 303）。"""
        with patch(
            "app.routes.admin.crawl_daily", new_callable=AsyncMock
        ) as mock_crawl:
            mock_crawl.return_value = {"found": 0, "new": 0, "status": "success"}
-            resp = auth_client.post("/admin/crawl", headers=admin_headers)
-            assert resp.status_code != 401
+            resp = auth_client.post("/admin/crawl")
+            assert resp.status_code != 303

    # ── summarize route auth ────────────────────────────────────────

-    def test_no_token_returns_401_for_summarize(self, client):
-        """无 Bearer token 返回 401。"""
-        resp = client.post("/admin/summarize")
-        assert resp.status_code in (401, 403)
+    def test_no_session_returns_303_for_summarize(self, client, monkeypatch):
+        """无 session 返回 303。"""
+        monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password")
+        resp = client.post("/admin/summarize", follow_redirects=False)
+        assert resp.status_code == 303

-    def test_wrong_token_returns_401_for_summarize(self, client):
-        resp = client.post(
-            "/admin/summarize",
-            headers={"Authorization": "Bearer wrong-token"},
-        )
-        assert resp.status_code == 401
-
-    def test_correct_token_batch_summarize(self, client, admin_headers):
-        """正确 token 调用 batch summarize，mock 掉服务层。"""
-        import app.config as config_mod
-
-        original = config_mod.settings.ADMIN_TOKEN
-        config_mod.settings.ADMIN_TOKEN = ADMIN_TOKEN
-        try:
+    def test_correct_session_batch_summarize(self, auth_client):
+        """已登录调用 batch summarize，mock 掉服务层。"""
        with patch(
            "app.routes.admin.summarize_batch", new_callable=AsyncMock
        ) as mock:
@@ -86,31 +97,19 @@ class TestAdminAuth:
                "failed": 0,
                "total": 0,
            }
-                resp = client.post("/admin/summarize", headers=admin_headers)
+            resp = auth_client.post("/admin/summarize")
            assert resp.status_code == 200
            assert resp.json()["status"] == "success"
-        finally:
-            config_mod.settings.ADMIN_TOKEN = original

-    def test_single_paper_not_found(self, client, admin_headers):
+    def test_single_paper_not_found(self, auth_client):
        """单篇总结不存在的论文返回 404。"""
-        import app.config as config_mod
-
-        original = config_mod.settings.ADMIN_TOKEN
-        config_mod.settings.ADMIN_TOKEN = ADMIN_TOKEN
-        try:
        with patch(
            "app.routes.admin.summarize_single",
            new_callable=AsyncMock,
            return_value={"status": "not_found", "arxiv_id": "nonexistent.99999"},
        ):
-                resp = client.post(
-                    "/admin/summarize/nonexistent.99999",
-                    headers=admin_headers,
-                )
+            resp = auth_client.post("/admin/summarize/nonexistent.99999")
            assert resp.status_code == 404
-        finally:
-            config_mod.settings.ADMIN_TOKEN = original


 # ═══════════════════════════════════════════════════════════════════════
@@ -121,27 +120,25 @@ class TestAdminAuth:
 class TestAdminCrawl:
    """POST /admin/crawl 测试。"""

-    def test_crawl_default_today(self, auth_client, admin_headers):
+    def test_crawl_default_today(self, auth_client):
        """不指定日期时默认抓取今天。"""
        with patch(
            "app.routes.admin.crawl_daily", new_callable=AsyncMock
        ) as mock_crawl:
            mock_crawl.return_value = {"found": 5, "new": 3, "status": "success"}
-            resp = auth_client.post("/admin/crawl", headers=admin_headers)
+            resp = auth_client.post("/admin/crawl")
            assert resp.status_code == 200
            data = resp.json()
            assert data["status"] == "success"
            mock_crawl.assert_called_once()

-    def test_crawl_specific_date(self, auth_client, admin_headers):
+    def test_crawl_specific_date(self, auth_client):
        """指定日期抓取。"""
        with patch(
            "app.routes.admin.crawl_daily", new_callable=AsyncMock
        ) as mock_crawl:
            mock_crawl.return_value = {"found": 2, "new": 1, "status": "success"}
-            resp = auth_client.post(
-                "/admin/crawl?date=2024-01-15", headers=admin_headers
-            )
+            resp = auth_client.post("/admin/crawl?date=2024-01-15")
            assert resp.status_code == 200
            mock_crawl.assert_called_once()
            call_args = mock_crawl.call_args
@@ -156,21 +153,21 @@ class TestAdminCrawl:
 class TestAdminCleanup:
    """POST /admin/cleanup 测试。"""

-    def test_cleanup_returns_stats(self, auth_client, admin_headers):
+    def test_cleanup_returns_stats(self, auth_client):
        """清理应返回统计信息。"""
        with patch("app.routes.admin.cleanup_tmp") as mock_cleanup:
            mock_cleanup.return_value = {"scanned": 3, "removed": 1, "errors": []}
-            resp = auth_client.post("/admin/cleanup", headers=admin_headers)
+            resp = auth_client.post("/admin/cleanup")
            assert resp.status_code == 200
            data = resp.json()
            assert data["scanned"] == 3
            assert data["removed"] == 1

-    def test_cleanup_writes_log(self, auth_client, admin_headers, db_session):
+    def test_cleanup_writes_log(self, auth_client, db_session):
        """清理应写入 crawl_logs。"""
        with patch("app.routes.admin.cleanup_tmp") as mock_cleanup:
            mock_cleanup.return_value = {"scanned": 0, "removed": 0, "errors": []}
-            auth_client.post("/admin/cleanup", headers=admin_headers)
+            auth_client.post("/admin/cleanup")

        logs = (
            db_session.execute(select(CrawlLog).where(CrawlLog.task == "cleanup"))
@@ -189,7 +186,7 @@ class TestAdminCleanup:
 class TestAdminDelete:
    """POST /admin/delete 测试。"""

-    def test_delete_requires_confirm(self, auth_client, admin_headers):
+    def test_delete_requires_confirm(self, auth_client):
        """confirm 不是 'DELETE' 时应返回 422。"""
        resp = auth_client.post(
            "/admin/delete",
@@ -199,12 +196,11 @@ class TestAdminDelete:
                "include_notes": True,
                "confirm": "WRONG",
            },
-            headers=admin_headers,
        )
        assert resp.status_code == 422

    def test_delete_with_confirm(
-        self, auth_client, admin_headers, db_session, sample_papers_range
+        self, auth_client, db_session, sample_papers_range
    ):
        """confirm='DELETE' 时应执行删除。"""
        resp = auth_client.post(
@@ -215,13 +211,12 @@ class TestAdminDelete:
                "include_notes": True,
                "confirm": "DELETE",
            },
-            headers=admin_headers,
        )
        assert resp.status_code == 200
        data = resp.json()
        assert data["deleted"] == 3

-    def test_delete_invalid_date_range(self, auth_client, admin_headers):
+    def test_delete_invalid_date_range(self, auth_client):
        """date_start > date_end 应返回 400。"""
        resp = auth_client.post(
            "/admin/delete",
@@ -230,11 +225,10 @@ class TestAdminDelete:
                "date_end": "2024-01-10",
                "confirm": "DELETE",
            },
-            headers=admin_headers,
        )
        assert resp.status_code == 400

-    def test_delete_without_confirm_field(self, auth_client, admin_headers):
+    def test_delete_without_confirm_field(self, auth_client):
        """缺少 confirm 字段应返回 422。"""
        resp = auth_client.post(
            "/admin/delete",
@@ -242,7 +236,6 @@ class TestAdminDelete:
                "date_start": "2024-01-10",
                "date_end": "2024-01-12",
            },
-            headers=admin_headers,
        )
        assert resp.status_code == 422

@@ -255,19 +248,20 @@ class TestAdminDelete:
 class TestAdminLogs:
    """GET /admin/logs 测试。"""

-    def test_logs_returns_page(self, auth_client, admin_headers):
+    def test_logs_returns_page(self, auth_client):
        """应返回管理日志页面。"""
-        resp = auth_client.get("/admin/logs", headers=admin_headers)
+        resp = auth_client.get("/admin/logs")
        assert resp.status_code == 200
        assert "text/html" in resp.headers.get("content-type", "")

-    def test_logs_requires_auth(self, auth_client):
+    def test_logs_requires_auth(self, client, monkeypatch):
        """日志页面需要鉴权。"""
-        resp = auth_client.get("/admin/logs")
-        assert resp.status_code in (403, 401)
+        monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password")
+        resp = client.get("/admin/logs", follow_redirects=False)
+        assert resp.status_code == 303

    def test_logs_contains_data(
-        self, auth_client, admin_headers, db_session, sample_papers_range
+        self, auth_client, db_session, sample_papers_range
    ):
        """日志页面应包含日志数据。"""
        # 先创建一条日志
@@ -282,7 +276,7 @@ class TestAdminLogs:
        )
        db_session.commit()

-        resp = auth_client.get("/admin/logs", headers=admin_headers)
+        resp = auth_client.get("/admin/logs")
        assert resp.status_code == 200
        assert "crawl" in resp.text.lower() or "日志" in resp.text

@@ -1,107 +0,0 @@
-"""LaTeX 图片提取测试 — 从 .tex 源码中提取图片文件。"""
-
-from __future__ import annotations
-
-import pytest
-
-
-# ═══════════════════════════════════════════════════════════════════════
-# Image Extraction
-# ═══════════════════════════════════════════════════════════════════════
-
-
-class TestImageExtraction:
-    """LaTeX 图片提取测试。"""
-
-    @pytest.mark.asyncio
-    async def test_extract_images_from_source_no_dir(self, monkeypatch, tmp_path):
-        """源码目录不存在时返回 0。"""
-        monkeypatch.setattr(
-            "app.services.pdf_downloader.tmp_dir", lambda x: tmp_path / "tmp" / x
-        )
-        monkeypatch.setattr(
-            "app.services.pdf_downloader.paper_dir", lambda x: tmp_path / "papers" / x
-        )
-        from app.services.image_extractor import extract_images_from_source
-
-        result = await extract_images_from_source("2401.99999")
-        assert result == 0
-
-    @pytest.mark.asyncio
-    async def test_extract_images_from_tex(self, monkeypatch, tmp_path):
-        """从 .tex 文件中提取图片。"""
-        from app.services.image_extractor import extract_images_from_source
-
-        tmp_source = tmp_path / "tmp" / "2401.00001" / "source"
-        tmp_source.mkdir(parents=True)
-
-        images_dir = tmp_source / "figs"
-        images_dir.mkdir()
-        (images_dir / "figure1.png").write_bytes(b"\x89PNG\r\n")
-        (images_dir / "figure2.jpg").write_bytes(b"\xff\xd8\xff\xe0")
-
-        # 创建 .tex 文件
-        tex_content = r"""
-\documentclass{article}
-\begin{document}
-\begin{figure}
-  \includegraphics[width=0.8\textwidth]{figs/figure1.png}
-  \includegraphics{figs/figure2.jpg}
-  \includegraphics[angle=90]{figs/nonexistent.pdf}
-\end{figure}
-\end{document}
-"""
-        (tmp_source / "main.tex").write_text(tex_content)
-
-        papers_dir = tmp_path / "papers" / "2401.00001"
-        monkeypatch.setattr(
-            "app.services.image_extractor.tmp_dir", lambda x: tmp_path / "tmp" / x
-        )
-        monkeypatch.setattr(
-            "app.services.image_extractor.paper_dir", lambda x: tmp_path / "papers" / x
-        )
-
-        # Mock download_source_zip to avoid real network call (source dir already exists)
-        async def _noop_download(*args, **kwargs):
-            pass
-
-        monkeypatch.setattr(
-            "app.services.image_extractor.download_source_zip", _noop_download
-        )
-
-        result = await extract_images_from_source("2401.00001")
-
-        assert result == 2
-        dest_images = papers_dir / "images"
-        assert dest_images.exists()
-        assert (dest_images / "figure1.png").exists()
-        assert (dest_images / "figure2.jpg").exists()
-
-    @pytest.mark.asyncio
-    async def test_extract_images_empty_tex(self, monkeypatch, tmp_path):
-        """.tex 文件无图片时返回 0。"""
-        from app.services.image_extractor import extract_images_from_source
-
-        tmp_source = tmp_path / "tmp" / "2401.00002" / "source"
-        tmp_source.mkdir(parents=True)
-        (tmp_source / "main.tex").write_text(
-            r"\documentclass{article}\begin{document}Hello\end{document}"
-        )
-
-        monkeypatch.setattr(
-            "app.services.image_extractor.tmp_dir", lambda x: tmp_path / "tmp" / x
-        )
-        monkeypatch.setattr(
-            "app.services.image_extractor.paper_dir", lambda x: tmp_path / "papers" / x
-        )
-
-        # Mock download_source_zip to avoid real network call
-        async def _noop_download(*args, **kwargs):
-            pass
-
-        monkeypatch.setattr(
-            "app.services.image_extractor.download_source_zip", _noop_download
-        )
-
-        result = await extract_images_from_source("2401.00002")
-        assert result == 0
@@ -64,10 +64,9 @@ class TestSummarySchema:
            SummarySchema.model_validate(sample_summary_dict)

    def test_extra_fields_ignored(self, sample_summary_dict):
-        sample_summary_dict["figures"] = ["fig1.png"]
        sample_summary_dict["takeaway"] = "important paper"
        schema = SummarySchema.model_validate(sample_summary_dict)
-        assert not hasattr(schema, "figures")
+        assert not hasattr(schema, "takeaway")
        assert schema.title_zh  # 正常解析

    def test_flatten_for_db(self, sample_summary_dict):
@@ -80,7 +79,7 @@ class TestSummarySchema:
        assert "updated_at" in flat
        # JSON 字段可解析
        assert isinstance(json.loads(flat["prerequisites_json"]), dict)
-        assert isinstance(json.loads(flat["method_steps_json"]), list)
+        assert isinstance(flat["figures_json"], str)  # figures 序列化为 JSON


 # ═══════════════════════════════════════════════════════════════════════
@@ -99,7 +98,7 @@ class TestQualityAssessment:
        sample_summary_dict["motivation"]["goal"] = ""
        sample_summary_dict["motivation"]["gap"] = ""
        sample_summary_dict["method"]["overview"] = ""
-        sample_summary_dict["results"]["main_findings"] = []
+        sample_summary_dict["results"]["main_findings"] = ""
        schema = SummarySchema.model_validate(sample_summary_dict)
        assert assess_quality(schema) == "degraded"

@@ -182,7 +182,7 @@ class TestSummarizeOneFlow:
            patch(
                "app.services.summarizer.call_pi",
                new_callable=AsyncMock,
-                return_value=mock_pi_output,
+                return_value=(mock_pi_output, "test-session-id"),
            ),
        ):
            result = await summarize_one(db_session, sample_paper)
@@ -246,27 +246,28 @@ class TestSummarizeOneFlow:

    @pytest.mark.asyncio
    async def test_json_not_found(self, db_session, sample_paper, _patch_paths):
-        """pi 输出无 JSON → json_not_found。"""
+        """pi 输出无 JSON → 验证循环重试 4 次后 ValueError (unknown)。"""
        with (
            patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
            patch(
                "app.services.summarizer.call_pi",
                new_callable=AsyncMock,
-                return_value="No JSON in this output at all.",
+                return_value=("No JSON in this output at all.", "test-session-id"),
            ),
        ):
            result = await summarize_one(db_session, sample_paper)

        assert result["status"] == "failed"
-        assert result["error_type"] == "json_not_found"
+        assert result["error_type"] == "unknown"

    @pytest.mark.asyncio
-    async def test_field_missing_and_retry(
+    async def test_validation_fails_and_retries(
        self, db_session, sample_paper, _patch_paths
    ):
-        """必填字段缺失 → field_missing → retry → permanent_failure。"""
+        """验证失败（字段不符合要求）→ 重试多次后失败。"""
        bad_json = json.dumps(
            {
+                "arxiv_id": sample_paper.arxiv_id,
                "title_zh": "",  # 空的必填字段
                "one_line": "valid line",
                "tags": ["tag1"],
@@ -282,23 +283,14 @@ class TestSummarizeOneFlow:
            patch(
                "app.services.summarizer.call_pi",
                new_callable=AsyncMock,
-                return_value=bad_output,
+                return_value=(bad_output, "test-session-id"),
            ),
        ):
-            # 第一次失败 → pending (retry)
-            result1 = await summarize_one(db_session, sample_paper)
-            assert result1["status"] == "failed"
-            assert result1["error_type"] == "field_missing"
-            assert result1["retry_count"] == 1
-
-            # 第二次失败 → permanent_failure (SUMMARY_MAX_RETRIES=1, 所以 2 次 > 1+1)
-            db_session.refresh(sample_paper)
-            result2 = await summarize_one(db_session, sample_paper)
-            assert result2["status"] == "failed"
-            assert result2["retry_count"] == 2
-
-            db_session.refresh(sample_paper)
-            assert sample_paper.summary_status.status == "permanent_failure"
+            # _validate_summary 先拦截，4 轮都失败后 ValueError → unknown
+            result = await summarize_one(db_session, sample_paper)
+            assert result["status"] == "failed"
+            assert result["error_type"] == "unknown"
+            assert result["retry_count"] == 1

    @pytest.mark.asyncio
    async def test_raw_output_saved_on_failure(
@@ -310,7 +302,7 @@ class TestSummarizeOneFlow:
            patch(
                "app.services.summarizer.call_pi",
                new_callable=AsyncMock,
-                return_value="Some output without JSON",
+                return_value=("Some output without JSON", "test-session-id"),
            ),
        ):
            await summarize_one(db_session, sample_paper)
@@ -329,7 +321,7 @@ class TestSummarizeOneFlow:
            patch(
                "app.services.summarizer.call_pi",
                new_callable=AsyncMock,
-                return_value=mock_pi_output,
+                return_value=(mock_pi_output, "test-session-id"),
            ),
        ):
            await summarize_one(db_session, sample_paper)
@@ -417,7 +409,7 @@ class TestBatchSummarize:
            patch(
                "app.services.summarizer.call_pi",
                new_callable=AsyncMock,
-                return_value=mock_pi_output,
+                return_value=(mock_pi_output, "test-session-id"),
            ),
        ):
            result = await summarize_batch(db_session, _session_factory=_TestSession)
@@ -464,7 +456,7 @@ class TestBatchSummarize:
            call_count += 1
            if call_count == 1:
                raise PiTimeoutError("timeout")
-            return mock_pi_output
+            return mock_pi_output, "test-session-id"

        with (
            patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
@@ -506,7 +498,7 @@ class TestBatchSummarize:
            patch(
                "app.services.summarizer.call_pi",
                new_callable=AsyncMock,
-                return_value=mock_pi_output,
+                return_value=(mock_pi_output, "test-session-id"),
            ),
        ):
            await summarize_batch(db_session, _session_factory=_TestSession)
@@ -672,9 +672,11 @@ dependencies = [
    { name = "chromadb" },
    { name = "fastapi" },
    { name = "httpx" },
+    { name = "itsdangerous" },
    { name = "jinja2" },
    { name = "pydantic" },
    { name = "pydantic-settings" },
+    { name = "pymupdf" },
    { name = "python-dotenv" },
    { name = "python-multipart" },
    { name = "sqlalchemy" },
@@ -694,9 +696,11 @@ requires-dist = [
    { name = "chromadb", specifier = ">=1.0" },
    { name = "fastapi", specifier = ">=0.115" },
    { name = "httpx", specifier = ">=0.28" },
+    { name = "itsdangerous", specifier = ">=2.2.0" },
    { name = "jinja2", specifier = ">=3.1" },
    { name = "pydantic", specifier = ">=2.0" },
    { name = "pydantic-settings", specifier = ">=2.0" },
+    { name = "pymupdf", specifier = ">=1.25" },
    { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" },
    { name = "python-dotenv", specifier = ">=1.0" },
@@ -850,6 +854,15 @@ wheels = [
    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
 ]

+[[package]]
+name = "itsdangerous"
+version = "2.2.0"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
+]
+
 [[package]]
 name = "jinja2"
 version = "3.1.6"
@@ -1778,6 +1791,22 @@ wheels = [
    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
 ]

+[[package]]
+name = "pymupdf"
+version = "1.27.2.3"
+source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
+sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/22/32/708bedc9dde7b328d45abbc076091769d44f2f24ad151ad92d56a6ec142b/pymupdf-1.27.2.3.tar.gz", hash = "sha256:7a92faa25129e8bbec5e50eeb9214f187665428c31b05c4ef6e36c58c0b1c6d2", size = 85759618, upload-time = "2026-04-24T14:13:14.42Z" }
+wheels = [
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/dc/09/ddbdfa7ee91fbabd6f63d7d744884cbdfe3e7ff9b8604749fb38bddf5c5d/pymupdf-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc1bc3cae6e9e150b0dbb0a9221bdfd411d65f0db2fe359eaa22467d7cc2a05f", size = 24002636, upload-time = "2026-04-24T14:09:17.459Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/01/89/3f8edd6c4f50ca370e2a2f2a3011face36f3760728ffe76dffec91c0fca0/pymupdf-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:660d93cb6da5bbddf11d3982ae27745dd3a9902d9f24cdb69adab83962294b5a", size = 23278238, upload-time = "2026-04-24T14:09:32.882Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/c3/26/b7e5a70eb83bd189f8b5df87ec442746b992f2f632662839b288170d357d/pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1dd460a3ae4597a755f00a3bd9771f5ebf1531dc111f6a36bf05dd00a6b84425", size = 24333923, upload-time = "2026-04-24T14:09:47.341Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/e4/a0/aa1ee2240f29481a04a827c313333b4ecd8a14d6ac3e15d3f41a30574781/pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:857842b4888827bd6155a1131341b2822a7ebe9a8c15a975fd7d490d7a64a30c", size = 24963198, upload-time = "2026-04-24T14:10:07.408Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/69/49/4f742451f980840829fc00ba158bebb25d389c846d8f4f8c65936ee55de8/pymupdf-1.27.2.3-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:580983849c64a08d08344ca3d1580e87c01f046a8392421797bc850efd72a5b6", size = 25184609, upload-time = "2026-04-24T14:10:22.911Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/f6/3f/3853d6608f394faf6eec2bd4e8ea9f6a00beea329b071abdb29f4164cc3d/pymupdf-1.27.2.3-cp310-abi3-win32.whl", hash = "sha256:a5c1088a87189891a4946ab314a14b7934ac4c5b6077f7e74ebee956f8906d0e", size = 18019286, upload-time = "2026-04-24T14:10:34.239Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/44/47/5fb10fe73f96b31253a41647c362ea9e0380920bddf16028414a051247fc/pymupdf-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:d20f68ef15195e073071dbc4ae7455257c7889af7584e39df490c0a92728526e", size = 19249102, upload-time = "2026-04-24T14:10:46.72Z" },
+    { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/53/a4/b9e91aac82293f9c954654c85581ee8212b5b05efadc534b581141241e6f/pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2", size = 25000393, upload-time = "2026-04-24T14:11:01.669Z" },
+]
+
 [[package]]
 name = "pypika"
 version = "0.51.1"