feat: enhance UI, refactor services, improve templates and tests
- Replace image_extractor with pdf_image_extractor service - Enhance pi_client with expanded API capabilities - Improve summarizer service with additional features - Update admin routes with more endpoints - Add login page template - Enhance detail page with comprehensive layout - Improve search and trends pages - Update base template with additional elements - Refactor tests for better coverage - Add validate_summary script - Update project configuration and dependencies
This commit is contained in:
+5
-3
@@ -1,12 +1,14 @@
|
|||||||
# ─── 应用 ────────────────────────────────
|
# ─── 应用 ────────────────────────────────
|
||||||
APP_HOST=0.0.0.0
|
APP_HOST=127.0.0.1
|
||||||
APP_PORT=8000
|
APP_PORT=8000
|
||||||
APP_DEBUG=false
|
APP_DEBUG=false
|
||||||
BASE_URL=http://127.0.0.1:8000
|
BASE_URL=http://127.0.0.1:8000
|
||||||
APP_TIMEZONE=Asia/Shanghai
|
APP_TIMEZONE=Asia/Shanghai
|
||||||
|
|
||||||
# ─── 安全 ────────────────────────────────
|
# ─── 安全 ────────────────────────────────
|
||||||
ADMIN_TOKEN=your_admin_token_here
|
ADMIN_USERNAME=admin
|
||||||
|
ADMIN_PASSWORD=your_secure_password
|
||||||
|
SECRET_KEY=your_random_secret_key
|
||||||
|
|
||||||
# ─── HuggingFace / arXiv ────────────────
|
# ─── HuggingFace / arXiv ────────────────
|
||||||
HF_API_BASE=https://huggingface.co/api
|
HF_API_BASE=https://huggingface.co/api
|
||||||
@@ -19,7 +21,7 @@ HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
|
|||||||
# ─── AI 总结 ──────────────────────────────
|
# ─── AI 总结 ──────────────────────────────
|
||||||
PI_BIN=
|
PI_BIN=
|
||||||
SUMMARY_SKILL=daily-paper-summary
|
SUMMARY_SKILL=daily-paper-summary
|
||||||
SUMMARY_CONCURRENCY=2
|
SUMMARY_CONCURRENCY=3
|
||||||
SUMMARY_TIMEOUT_SECONDS=300
|
SUMMARY_TIMEOUT_SECONDS=300
|
||||||
SUMMARY_MAX_RETRIES=1
|
SUMMARY_MAX_RETRIES=1
|
||||||
|
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ paper/
|
|||||||
├── pyproject.toml
|
├── pyproject.toml
|
||||||
│
|
│
|
||||||
├── app/
|
├── app/
|
||||||
|
│ ├── __init__.py
|
||||||
│ ├── main.py # FastAPI 入口(lifespan 管理)
|
│ ├── main.py # FastAPI 入口(lifespan 管理)
|
||||||
│ ├── config.py # pydantic-settings 配置加载
|
│ ├── config.py # pydantic-settings 配置加载
|
||||||
│ ├── database.py # SQLAlchemy 引擎、会话与 FTS5
|
│ ├── database.py # SQLAlchemy 引擎、会话与 FTS5
|
||||||
@@ -57,6 +58,7 @@ paper/
|
|||||||
│ ├── cli.py # Typer CLI(crawl / summarize / init-db)
|
│ ├── cli.py # Typer CLI(crawl / summarize / init-db)
|
||||||
│ │
|
│ │
|
||||||
│ ├── routes/ # 页面与 API 路由
|
│ ├── routes/ # 页面与 API 路由
|
||||||
|
│ │ ├── __init__.py
|
||||||
│ │ ├── pages.py # 首页、日期页、论文详情
|
│ │ ├── pages.py # 首页、日期页、论文详情
|
||||||
│ │ ├── admin.py # Token 鉴权管理接口
|
│ │ ├── admin.py # 管理接口(Session 登录鉴权)
|
||||||
│ │ ├── search.py # 搜索、阅读列表、RSS
|
│ │ ├── search.py # 搜索、阅读列表、RSS
|
||||||
@@ -65,6 +67,7 @@ paper/
|
|||||||
│ │ └── compare.py # 论文对比页
|
│ │ └── compare.py # 论文对比页
|
||||||
│ │
|
│ │
|
||||||
│ ├── services/ # 业务逻辑层
|
│ ├── services/ # 业务逻辑层
|
||||||
|
│ │ ├── __init__.py
|
||||||
│ │ ├── crawler.py # HuggingFace API 爬虫
|
│ │ ├── crawler.py # HuggingFace API 爬虫
|
||||||
│ │ ├── summarizer.py # AI 总结编排
|
│ │ ├── summarizer.py # AI 总结编排
|
||||||
│ │ ├── searcher.py # FTS5 + 语义搜索
|
│ │ ├── searcher.py # FTS5 + 语义搜索
|
||||||
@@ -103,7 +106,7 @@ paper/
|
|||||||
│ ├── init_db.py # 数据库初始化
|
│ ├── init_db.py # 数据库初始化
|
||||||
│ └── manual_crawl.py # 手动抓取脚本
|
│ └── manual_crawl.py # 手动抓取脚本
|
||||||
│
|
│
|
||||||
├── tests/ # 10 个测试模块
|
├── tests/ # 9 个测试模块
|
||||||
│ ├── conftest.py # 测试夹具(内存 DB、样本数据)
|
│ ├── conftest.py # 测试夹具(内存 DB、样本数据)
|
||||||
│ └── test_*.py # 各模块测试
|
│ └── test_*.py # 各模块测试
|
||||||
│
|
│
|
||||||
@@ -117,7 +120,7 @@ paper/
|
|||||||
### 1. 准备环境
|
### 1. 准备环境
|
||||||
|
|
||||||
- Python **3.12+**
|
- Python **3.12+**
|
||||||
- 可选:[`pi`](https://github.com/) CLI(用于 AI 总结)
|
- 可选:[`pi`](https://www.npmjs.com/package/@mariozechner/pi-coding-agent) CLI(用于 AI 总结)
|
||||||
|
|
||||||
### 2. 安装依赖
|
### 2. 安装依赖
|
||||||
|
|
||||||
@@ -139,14 +142,30 @@ cp .env.example .env
|
|||||||
| 变量 | 默认值 | 说明 |
|
| 变量 | 默认值 | 说明 |
|
||||||
|------|--------|------|
|
|------|--------|------|
|
||||||
| `APP_HOST` / `APP_PORT` | `127.0.0.1` / `8000` | 服务监听地址 |
|
| `APP_HOST` / `APP_PORT` | `127.0.0.1` / `8000` | 服务监听地址 |
|
||||||
|
| `APP_DEBUG` | `false` | 调试模式(开启 uvicorn reload) |
|
||||||
|
| `BASE_URL` | `http://127.0.0.1:8000` | 站点根 URL(用于 RSS 生成) |
|
||||||
|
| `APP_TIMEZONE` | `Asia/Shanghai` | 时区 |
|
||||||
| `ADMIN_TOKEN` | `change-me` | **必须修改** — 管理接口鉴权 |
|
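| `ADMIN_USERNAME` | `admin` | 管理后台登录用户名 |
| `ADMIN_PASSWORD` | — | **必须设置** — 管理后台登录密码(明文或 sha256 十六进制哈希);为空时无法登录 |
| `SECRET_KEY` | `change-me` | **必须修改** — Session Cookie 签名密钥 |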
|
||||||
|
| `HF_API_BASE` | `https://huggingface.co/api` | HuggingFace API 地址 |
|
||||||
|
| `HF_PROXY` | — | HTTP 代理 |
|
||||||
| `TOP_N` | `20` | 每日抓取 Top N 论文 |
|
| `TOP_N` | `20` | 每日抓取 Top N 论文 |
|
||||||
|
| `HTTP_TIMEOUT_SECONDS` | `30` | HTTP 请求超时 |
|
||||||
|
| `HTTP_MAX_RETRIES` | `3` | HTTP 最大重试次数 |
|
||||||
|
| `PI_BIN` | — | `pi` CLI 路径 |
|
||||||
|
| `SUMMARY_SKILL` | `daily-paper-summary` | pi 总结技能名 |
|
||||||
|
| `SUMMARY_CONCURRENCY` | `3` | 最大并行总结数 |
|
||||||
|
| `SUMMARY_TIMEOUT_SECONDS` | `300` | 单篇总结超时 |
|
||||||
|
| `SUMMARY_MAX_RETRIES` | `1` | 总结最大重试次数 |
|
||||||
| `SCHEDULER_ENABLED` | `false` | 启用每日自动抓取 |
|
| `SCHEDULER_ENABLED` | `false` | 启用每日自动抓取 |
|
||||||
| `SCHEDULE_HOUR` / `SCHEDULE_MINUTE` | `8` / `0` | 定时任务时间(Asia/Shanghai) |
|
| `SCHEDULE_HOUR` / `SCHEDULE_MINUTE` | `8` / `0` | 定时任务时间(APP_TIMEZONE) |
|
||||||
|
| `APP_WORKERS` | `1` | Uvicorn worker 数(必须为 1) |
|
||||||
| `DATABASE_URL` | `sqlite:///data/db/papers.db` | 数据库路径 |
|
| `DATABASE_URL` | `sqlite:///data/db/papers.db` | 数据库路径 |
|
||||||
| `CHROMA_ENABLED` | `false` | 启用语义搜索 |
|
| `CHROMA_ENABLED` | `false` | 启用语义搜索 |
|
||||||
| `PI_BIN` | — | `pi` CLI 路径 |
|
| `CHROMA_DIR` | `data/chroma` | ChromaDB 数据目录 |
|
||||||
| `SUMMARY_CONCURRENCY` | `3` | 最大并行总结数 |
|
| `EMBED_API_BASE` | — | Embedding API 地址 |
|
||||||
|
| `EMBED_API_KEY` | — | Embedding API Key |
|
||||||
|
| `EMBED_MODEL` | — | Embedding 模型名 |
|
||||||
|
| `EMBED_DIMENSIONS` | `0` | 向量维度 |
|
||||||
|
|
||||||
### 4. 初始化数据库
|
### 4. 初始化数据库
|
||||||
|
|
||||||
@@ -158,10 +177,10 @@ python scripts/init_db.py
|
|||||||
### 5. 启动服务
|
### 5. 启动服务
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uvicorn app.main:app --host 127.0.0.1 --port 8000 --workers 1
|
uvicorn app.main:app --host 127.0.0.1 --port 8000
|
||||||
```
|
```
|
||||||
|
|
||||||
> 调度器依赖单 worker:`--workers` 必须为 `1`,否则每日任务会被重复触发。
|
> 调度器依赖单 worker:`--workers` 的值不可大于 `1`,否则每日任务会被重复触发。
|
||||||
|
|
||||||
打开浏览器访问 `http://127.0.0.1:8000` 即可。
|
打开浏览器访问 `http://127.0.0.1:8000` 即可。
|
||||||
|
|
||||||
@@ -172,9 +191,9 @@ uvicorn app.main:app --host 127.0.0.1 --port 8000 --workers 1
|
|||||||
### 手动抓取指定日期
|
### 手动抓取指定日期
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scripts/manual_crawl.py --date 2025-01-15
|
python scripts/manual_crawl.py 2025-01-15
|
||||||
# 或
|
# 或
|
||||||
python -m app.cli crawl --date 2025-01-15 --top 20
|
python -m app.cli crawl 2025-01-15 --top 20
|
||||||
```
|
```
|
||||||
|
|
||||||
### 手动触发总结
|
### 手动触发总结
|
||||||
|
|||||||
+3
-3
@@ -24,7 +24,7 @@ def crawl(
|
|||||||
"""手动抓取指定日期的 HuggingFace Daily Papers。"""
|
"""手动抓取指定日期的 HuggingFace Daily Papers。"""
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.database import SessionLocal, engine
|
from app.database import SessionLocal, engine
|
||||||
from app.models import init_db as _init
|
from app.database import init_db as _init
|
||||||
from app.services.crawler import crawl_daily
|
from app.services.crawler import crawl_daily
|
||||||
|
|
||||||
target = date_str or date.today().isoformat()
|
target = date_str or date.today().isoformat()
|
||||||
@@ -60,7 +60,7 @@ def summarize(
|
|||||||
"""手动触发 AI 总结。"""
|
"""手动触发 AI 总结。"""
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.database import SessionLocal, engine
|
from app.database import SessionLocal, engine
|
||||||
from app.models import init_db as _init
|
from app.database import init_db as _init
|
||||||
from app.services.summarizer import summarize_batch, summarize_single
|
from app.services.summarizer import summarize_batch, summarize_single
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -96,7 +96,7 @@ def init_db():
|
|||||||
"""初始化数据库表。"""
|
"""初始化数据库表。"""
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.database import engine
|
from app.database import engine
|
||||||
from app.models import init_db as _init
|
from app.database import init_db as _init
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|||||||
+3
-1
@@ -16,7 +16,9 @@ class Settings(BaseSettings):
|
|||||||
APP_TIMEZONE: str = "Asia/Shanghai"
|
APP_TIMEZONE: str = "Asia/Shanghai"
|
||||||
|
|
||||||
# 安全
|
# 安全
|
||||||
ADMIN_TOKEN: str = "change-me"
|
ADMIN_USERNAME: str = "admin"
|
||||||
|
ADMIN_PASSWORD: str = ""
|
||||||
|
SECRET_KEY: str = "change-me"
|
||||||
|
|
||||||
# HuggingFace / arXiv
|
# HuggingFace / arXiv
|
||||||
HF_API_BASE: str = "https://huggingface.co/api"
|
HF_API_BASE: str = "https://huggingface.co/api"
|
||||||
|
|||||||
+33
-1
@@ -62,8 +62,39 @@ def get_db():
|
|||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _migrate(engine) -> None:
    """Add columns that are missing from already-existing tables.

    For every table listed below, the current column names are read with
    ``PRAGMA table_info`` and each required column that is absent is added
    with ``ALTER TABLE ... ADD COLUMN``. All changes are committed once at
    the end.

    Args:
        engine: Engine used to open the connection the statements run on.
    """
    import logging

    log = logging.getLogger(__name__)

    # Columns that must exist, keyed by table: {table: [(column, SQL type), ...]}.
    required: dict[str, list[tuple[str, str]]] = {
        "paper_summaries": [("figures_json", "TEXT")],
    }

    with engine.connect() as conn:
        for table_name, wanted in required.items():
            # Index 1 of each PRAGMA table_info row is read as the column name.
            info_rows = conn.execute(text(f"PRAGMA table_info({table_name})"))
            present = {row[1] for row in info_rows}

            missing = [
                (name, sql_type) for name, sql_type in wanted if name not in present
            ]
            for name, sql_type in missing:
                ddl = f"ALTER TABLE {table_name} ADD COLUMN {name} {sql_type}"
                conn.execute(text(ddl))
                log.info("Migrated: %s.%s added", table_name, name)
        conn.commit()
|
||||||
|
|
||||||
|
|
||||||
def init_db(engine):
|
def init_db(engine):
|
||||||
"""创建所有 ORM 表 + FTS5 虚拟表。"""
|
"""创建所有 ORM 表 + FTS5 虚拟表 + 自动迁移。"""
|
||||||
from app.models import Base # noqa: F811 — 避免循环导入,延迟导入
|
from app.models import Base # noqa: F811 — 避免循环导入,延迟导入
|
||||||
|
|
||||||
Base.metadata.create_all(engine)
|
Base.metadata.create_all(engine)
|
||||||
@@ -71,3 +102,4 @@ def init_db(engine):
|
|||||||
conn.execute(text(FTS5_CREATE_SQL))
|
conn.execute(text(FTS5_CREATE_SQL))
|
||||||
conn.execute(text(FTS5_TRIGGER_INDEX))
|
conn.execute(text(FTS5_TRIGGER_INDEX))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
_migrate(engine)
|
||||||
|
|||||||
+10
-9
@@ -6,6 +6,7 @@ from contextlib import asynccontextmanager
|
|||||||
|
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
from starlette.middleware.sessions import SessionMiddleware
|
||||||
|
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.database import engine, init_db
|
from app.database import engine, init_db
|
||||||
@@ -56,17 +57,17 @@ def create_app() -> FastAPI:
|
|||||||
init_db(engine)
|
init_db(engine)
|
||||||
logger.info("Database initialized at %s", settings.db_path)
|
logger.info("Database initialized at %s", settings.db_path)
|
||||||
|
|
||||||
# 安全警告
|
# Session 中间件
|
||||||
if settings.ADMIN_TOKEN == "change-me":
|
app.add_middleware(SessionMiddleware, secret_key=settings.SECRET_KEY)
|
||||||
logger.warning(
|
|
||||||
"⚠️ ADMIN_TOKEN is the default value 'change-me'. Please change it in .env!"
|
|
||||||
)
|
|
||||||
|
|
||||||
if settings.APP_HOST not in ("127.0.0.1", "localhost", "::1"):
|
# 安全警告
|
||||||
|
if settings.SECRET_KEY == "change-me":
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"⚠️ APP_HOST=%s is not localhost. "
|
"⚠️ SECRET_KEY is the default value 'change-me'. Please change it in .env!"
|
||||||
"Ensure ADMIN_TOKEN is properly set and access is restricted.",
|
)
|
||||||
settings.APP_HOST,
|
if not settings.ADMIN_PASSWORD:
|
||||||
|
logger.warning(
|
||||||
|
"⚠️ ADMIN_PASSWORD is empty. Please set it in .env!"
|
||||||
)
|
)
|
||||||
|
|
||||||
# 静态文件
|
# 静态文件
|
||||||
|
|||||||
@@ -131,6 +131,7 @@ class PaperSummary(Base):
|
|||||||
weaknesses_json = Column(Text)
|
weaknesses_json = Column(Text)
|
||||||
future_work_json = Column(Text)
|
future_work_json = Column(Text)
|
||||||
reproducibility = Column(String)
|
reproducibility = Column(String)
|
||||||
|
figures_json = Column(Text)
|
||||||
full_json = Column(Text, nullable=False)
|
full_json = Column(Text, nullable=False)
|
||||||
updated_at = Column(DateTime, nullable=False)
|
updated_at = Column(DateTime, nullable=False)
|
||||||
|
|
||||||
|
|||||||
+67
-17
@@ -1,11 +1,12 @@
|
|||||||
"""管理接口 — 抓取、总结、清理、删除、日志,需要 ADMIN_TOKEN 鉴权。"""
|
"""管理接口 — 抓取、总结、清理、删除、日志,需要登录鉴权。"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
from datetime import date, datetime, timezone
|
from datetime import date, datetime, timezone
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, Query, Request
|
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
|
||||||
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
from fastapi.responses import RedirectResponse
|
||||||
from pydantic import BaseModel, field_validator
|
from pydantic import BaseModel, field_validator
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
@@ -19,16 +20,65 @@ from app.services.summarizer import summarize_batch, summarize_single
|
|||||||
from app.utils import release_lock, templates, today_str
|
from app.utils import release_lock, templates, today_str
|
||||||
|
|
||||||
router = APIRouter(prefix="/admin", tags=["admin"])
|
router = APIRouter(prefix="/admin", tags=["admin"])
|
||||||
security = HTTPBearer()
|
|
||||||
|
|
||||||
|
|
||||||
async def verify_admin(
|
# ── 认证 ──────────────────────────────────────────────────────────────
|
||||||
credentials: HTTPAuthorizationCredentials = Depends(security),
|
|
||||||
) -> str:
|
|
||||||
"""验证 ADMIN_TOKEN。"""
|
def _check_password(password: str) -> bool:
    """Check a submitted password against ``settings.ADMIN_PASSWORD``.

    The configured value may be either the plaintext password or the
    hex-encoded SHA-256 digest of the password. An empty configured value
    never matches, so login is impossible until a password is set.

    Args:
        password: Password submitted by the user.

    Returns:
        True when ``password`` equals the configured value, or when its
        SHA-256 hex digest equals the configured value; False otherwise.
    """
    import hmac

    stored = settings.ADMIN_PASSWORD
    if not stored:
        return False

    # compare_digest only accepts ASCII-only str, so compare bytes instead;
    # this also keeps comparison time independent of where the inputs differ.
    expected = stored.encode("utf-8")
    candidate = password.encode("utf-8")
    digest = hashlib.sha256(password.encode()).hexdigest().encode("ascii")

    # Evaluate both comparisons unconditionally so that the plaintext and
    # hashed paths take the same amount of work.
    plain_ok = hmac.compare_digest(candidate, expected)
    hashed_ok = hmac.compare_digest(digest, expected)

    # NOTE(review): because the plaintext comparison is always accepted, a
    # stored SHA-256 hash is itself a valid password. Storing the hash
    # therefore does not protect against someone who has read the hash.
    return plain_ok or hashed_ok
|
||||||
|
|
||||||
|
|
||||||
|
async def verify_admin(request: Request) -> None:
    """Require an admin session; otherwise redirect to the login page.

    Raises:
        HTTPException: status 303 with ``Location: /admin/login`` when the
            session has no truthy ``is_admin`` entry.
    """
    logged_in = request.session.get("is_admin")
    if logged_in:
        return
    raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
|
||||||
|
|
||||||
|
|
||||||
|
def verify_admin_page(request: Request) -> None:
    """Synchronous page-level guard: redirect to the login page when the
    session is not an admin session.

    Raises:
        HTTPException: status 303 with ``Location: /admin/login`` when the
            session has no truthy ``is_admin`` entry.
    """
    is_admin = request.session.get("is_admin")
    if is_admin:
        return
    raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
|
||||||
|
|
||||||
|
|
||||||
|
# ── 登录 / 登出 ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/login")
async def admin_login_page(request: Request):
    """Render the login form, or go straight to the admin log page when the
    session is already an admin session."""
    already_admin = request.session.get("is_admin")
    if not already_admin:
        return templates.TemplateResponse(request, "login.html", {"error": None})
    return RedirectResponse("/admin/logs", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/login")
async def admin_login_submit(
    request: Request,
    username: str = Form(""),
    password: str = Form(""),
):
    """Handle the login form submission.

    On success the session is flagged with ``is_admin`` and the client is
    redirected to the admin log page; on failure the login form is rendered
    again with an error message.
    """
    # The password is only checked when the username already matches.
    credentials_ok = username == settings.ADMIN_USERNAME and _check_password(password)
    if not credentials_ok:
        return templates.TemplateResponse(
            request, "login.html", {"error": "用户名或密码错误"}
        )

    request.session["is_admin"] = True
    return RedirectResponse("/admin/logs", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/logout")
async def admin_logout(request: Request):
    """Log out by clearing the whole session, then redirect to the login page."""
    request.session.clear()
    to_login = RedirectResponse("/admin/login", status_code=303)
    return to_login
|
||||||
|
|
||||||
|
|
||||||
# ── 请求模型 ──────────────────────────────────────────────────────────
|
# ── 请求模型 ──────────────────────────────────────────────────────────
|
||||||
@@ -53,7 +103,7 @@ class DeleteRequest(BaseModel):
|
|||||||
|
|
||||||
@router.post("/crawl")
|
@router.post("/crawl")
|
||||||
async def admin_crawl(
|
async def admin_crawl(
|
||||||
_admin: str = Depends(verify_admin),
|
_admin: None = Depends(verify_admin),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
|
date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
|
||||||
):
|
):
|
||||||
@@ -92,7 +142,7 @@ async def admin_crawl(
|
|||||||
|
|
||||||
@router.post("/summarize")
|
@router.post("/summarize")
|
||||||
async def admin_summarize_batch(
|
async def admin_summarize_batch(
|
||||||
_admin: str = Depends(verify_admin),
|
_admin: None = Depends(verify_admin),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
):
|
):
|
||||||
"""批量总结所有 pending 论文。"""
|
"""批量总结所有 pending 论文。"""
|
||||||
@@ -107,7 +157,7 @@ async def admin_summarize_batch(
|
|||||||
@router.post("/summarize/{arxiv_id}")
|
@router.post("/summarize/{arxiv_id}")
|
||||||
async def admin_summarize_single(
|
async def admin_summarize_single(
|
||||||
arxiv_id: str,
|
arxiv_id: str,
|
||||||
_admin: str = Depends(verify_admin),
|
_admin: None = Depends(verify_admin),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
):
|
):
|
||||||
"""总结或重跑单篇论文。"""
|
"""总结或重跑单篇论文。"""
|
||||||
@@ -122,7 +172,7 @@ async def admin_summarize_single(
|
|||||||
|
|
||||||
@router.post("/cleanup")
|
@router.post("/cleanup")
|
||||||
async def admin_cleanup(
|
async def admin_cleanup(
|
||||||
_admin: str = Depends(verify_admin),
|
_admin: None = Depends(verify_admin),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
):
|
):
|
||||||
"""清理 data/tmp/ 中超过 24 小时的临时文件。"""
|
"""清理 data/tmp/ 中超过 24 小时的临时文件。"""
|
||||||
@@ -159,7 +209,7 @@ async def admin_cleanup(
|
|||||||
@router.post("/delete")
|
@router.post("/delete")
|
||||||
async def admin_delete(
|
async def admin_delete(
|
||||||
body: DeleteRequest,
|
body: DeleteRequest,
|
||||||
_admin: str = Depends(verify_admin),
|
_admin: None = Depends(verify_admin),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
):
|
):
|
||||||
"""删除指定日期范围内的论文(需要 confirm='DELETE' 二次确认)。"""
|
"""删除指定日期范围内的论文(需要 confirm='DELETE' 二次确认)。"""
|
||||||
@@ -181,7 +231,7 @@ async def admin_delete(
|
|||||||
@router.get("/logs")
|
@router.get("/logs")
|
||||||
async def admin_logs(
|
async def admin_logs(
|
||||||
request: Request,
|
request: Request,
|
||||||
_admin: str = Depends(verify_admin),
|
_admin: None = Depends(verify_admin),
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
page: int = Query(1, ge=1),
|
page: int = Query(1, ge=1),
|
||||||
per_page: int = Query(20, ge=1, le=100),
|
per_page: int = Query(20, ge=1, le=100),
|
||||||
|
|||||||
@@ -107,6 +107,44 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
|
|||||||
# 图片画廊
|
# 图片画廊
|
||||||
images = _get_paper_images(arxiv_id)
|
images = _get_paper_images(arxiv_id)
|
||||||
|
|
||||||
|
# 预处理 JSON 字段供模板直接使用
|
||||||
|
import json as _json
|
||||||
|
|
||||||
|
prereqs = {}
|
||||||
|
if paper.summary and paper.summary.prerequisites_json:
|
||||||
|
try:
|
||||||
|
prereqs = _json.loads(paper.summary.prerequisites_json)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
benchmarks = []
|
||||||
|
if paper.summary and paper.summary.results_benchmarks_json:
|
||||||
|
try:
|
||||||
|
benchmarks = _json.loads(paper.summary.results_benchmarks_json)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
figures_raw = []
|
||||||
|
if paper.summary and paper.summary.figures_json:
|
||||||
|
try:
|
||||||
|
figures_raw = _json.loads(paper.summary.figures_json)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
|
||||||
|
|
||||||
|
# 拆分:table_figures(有截图的 Table 类型)→ 实验结果区域展示截图
|
||||||
|
# figures(其余)→ 论文图表画廊
|
||||||
|
table_figures = []
|
||||||
|
figures = []
|
||||||
|
for fig in linked_figures:
|
||||||
|
fig_id = fig.get("id", "")
|
||||||
|
is_table = fig_id.lower().startswith("table")
|
||||||
|
if is_table and fig.get("image_url"):
|
||||||
|
table_figures.append(fig)
|
||||||
|
else:
|
||||||
|
figures.append(fig)
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
request,
|
request,
|
||||||
"detail.html",
|
"detail.html",
|
||||||
@@ -115,6 +153,10 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
|
|||||||
"summary_state": summary_state,
|
"summary_state": summary_state,
|
||||||
"similar_papers": similar_papers,
|
"similar_papers": similar_papers,
|
||||||
"paper_images": images,
|
"paper_images": images,
|
||||||
|
"prereqs": prereqs,
|
||||||
|
"benchmarks": benchmarks,
|
||||||
|
"figures": figures,
|
||||||
|
"table_figures": table_figures,
|
||||||
"chroma_enabled": settings.CHROMA_ENABLED,
|
"chroma_enabled": settings.CHROMA_ENABLED,
|
||||||
"page_title": paper.title_zh or paper.title_en,
|
"page_title": paper.title_zh or paper.title_en,
|
||||||
},
|
},
|
||||||
@@ -232,3 +274,48 @@ def _get_paper_images(arxiv_id: str) -> list[dict]:
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
return images
|
return images
|
||||||
|
|
||||||
|
|
||||||
|
def _link_figures_with_images(
|
||||||
|
figures: list[dict], images: list[dict], arxiv_id: str
|
||||||
|
) -> list[dict]:
|
||||||
|
"""将 summary figures 元数据与提取的图片文件关联。
|
||||||
|
|
||||||
|
通过 manifest.json 中的 figure ID 匹配,给每个 figure 加上 image_url。
|
||||||
|
"""
|
||||||
|
if not figures or not images:
|
||||||
|
return figures
|
||||||
|
|
||||||
|
import json as _json
|
||||||
|
import re
|
||||||
|
|
||||||
|
manifest_path = Path("data/papers") / arxiv_id / "images" / "manifest.json"
|
||||||
|
if not manifest_path.exists():
|
||||||
|
return figures
|
||||||
|
|
||||||
|
try:
|
||||||
|
manifest = _json.loads(manifest_path.read_text(encoding="utf-8"))
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return figures
|
||||||
|
|
||||||
|
# 构建 figure_id -> image_url 的映射
|
||||||
|
id_to_url: dict[str, str] = {}
|
||||||
|
for filename, info in manifest.items():
|
||||||
|
url = f"/papers/{arxiv_id}/images/{filename}"
|
||||||
|
for fig_id in info.get("figures", []) + info.get("tables", []):
|
||||||
|
id_to_url[fig_id] = url
|
||||||
|
|
||||||
|
# 归一化 summary figures 的 ID
|
||||||
|
for fig in figures:
|
||||||
|
raw_id = fig.get("id", "")
|
||||||
|
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", raw_id, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
normalized = f"Figure {m.group(1)}"
|
||||||
|
else:
|
||||||
|
m2 = re.match(r"Table\s*(\d+)", raw_id, re.IGNORECASE)
|
||||||
|
normalized = f"Table {m2.group(1)}" if m2 else raw_id
|
||||||
|
|
||||||
|
if normalized in id_to_url:
|
||||||
|
fig["image_url"] = id_to_url[normalized]
|
||||||
|
|
||||||
|
return figures
|
||||||
|
|||||||
@@ -1,83 +0,0 @@
|
|||||||
"""LaTeX 图片提取 — 从 arXiv 源码中扫描 \\includegraphics 并提取图片文件。"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from app.services.pdf_downloader import download_source_zip, paper_dir, tmp_dir
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_INCLUDEGRAPHICS_RE = re.compile(
|
|
||||||
r"\\includegraphics\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}", re.MULTILINE
|
|
||||||
)
|
|
||||||
_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".eps"}
|
|
||||||
|
|
||||||
|
|
||||||
async def extract_images_from_source(arxiv_id: str) -> int:
|
|
||||||
"""从 LaTeX 源码中提取图片文件。
|
|
||||||
|
|
||||||
流程:
|
|
||||||
1. 下载源码 zip 到 data/tmp/{arxiv_id}/source/
|
|
||||||
2. 扫描 .tex 文件中的 \\includegraphics
|
|
||||||
3. 复制图片到 data/papers/{arxiv_id}/images/
|
|
||||||
4. 清理源码临时文件
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
提取的图片数量
|
|
||||||
"""
|
|
||||||
tmp_source = tmp_dir(arxiv_id) / "source"
|
|
||||||
images_dest = paper_dir(arxiv_id) / "images"
|
|
||||||
|
|
||||||
try:
|
|
||||||
# 下载源码 zip(如果还没下载)
|
|
||||||
if not tmp_source.exists():
|
|
||||||
source_url = f"https://arxiv.org/e-print/{arxiv_id}"
|
|
||||||
await download_source_zip(arxiv_id, source_url, tmp_source)
|
|
||||||
|
|
||||||
if not tmp_source.exists():
|
|
||||||
return 0
|
|
||||||
|
|
||||||
# 扫描 .tex 文件,收集图片路径
|
|
||||||
image_paths: set[str] = set()
|
|
||||||
for tex_file in tmp_source.rglob("*.tex"):
|
|
||||||
try:
|
|
||||||
content = tex_file.read_text(encoding="utf-8", errors="replace")
|
|
||||||
for match in _INCLUDEGRAPHICS_RE.finditer(content):
|
|
||||||
img_path = match.group(1).strip()
|
|
||||||
image_paths.add(img_path)
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not image_paths:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
# 查找并复制图片
|
|
||||||
images_dest.mkdir(parents=True, exist_ok=True)
|
|
||||||
copied = 0
|
|
||||||
for img_rel in image_paths:
|
|
||||||
# 尝试在源码目录中找到文件
|
|
||||||
for ext in ("", ".png", ".jpg", ".jpeg", ".gif", ".pdf", ".eps"):
|
|
||||||
candidate = tmp_source / (img_rel + ext)
|
|
||||||
if candidate.is_file():
|
|
||||||
dest_name = candidate.name
|
|
||||||
# 避免文件名冲突
|
|
||||||
dest = images_dest / dest_name
|
|
||||||
if dest.exists():
|
|
||||||
stem = dest.stem
|
|
||||||
suffix = dest.suffix
|
|
||||||
dest = images_dest / f"{stem}_{copied}{suffix}"
|
|
||||||
shutil.copy2(candidate, dest)
|
|
||||||
copied += 1
|
|
||||||
break
|
|
||||||
|
|
||||||
if copied > 0:
|
|
||||||
logger.info("Extracted %d images from source for %s", copied, arxiv_id)
|
|
||||||
return copied
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
|
|
||||||
return 0
|
|
||||||
@@ -0,0 +1,261 @@
|
|||||||
|
"""PDF 图片与表格提取 — 从 PDF 中提取嵌入图片和表格截图。
|
||||||
|
|
||||||
|
策略:
|
||||||
|
1. 提取 PDF 中嵌入的图片(图表、插图等)
|
||||||
|
2. 检测表格区域,渲染为截图
|
||||||
|
3. 同时搜索页面中的 Figure/Table 标注,记录到 manifest
|
||||||
|
4. 过滤掉过小的图片
|
||||||
|
5. 保存到 data/papers/{arxiv_id}/images/
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from app.services.pdf_downloader import paper_dir
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 最小面积阈值(像素),小于此值的图片视为图标/装饰
|
||||||
|
_MIN_AREA = 10_000 # ~100x100
|
||||||
|
_MIN_DIM = 80
|
||||||
|
|
||||||
|
# Figure/Table 标注与图片/表格的最大垂直距离(点)
|
||||||
|
_MAX_LABEL_DISTANCE = 120
|
||||||
|
|
||||||
|
# Figure/Table 标注的正则
|
||||||
|
_FIGURE_RE = re.compile(r'\b(?:Fig\.?|Figure)\s*(\d+)\b', re.IGNORECASE)
|
||||||
|
_TABLE_RE = re.compile(r'\bTable\s*(\d+)\b', re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_nearby_labels(
|
||||||
|
rects: list, labels: dict[str, list[tuple[int, float]]], page_num: int
|
||||||
|
) -> list[str]:
|
||||||
|
"""查找与给定矩形区域在位置上接近的 Figure/Table 标注。
|
||||||
|
|
||||||
|
匹配逻辑:标注的垂直位置 (y) 需在图片/表格的上下 _MAX_LABEL_DISTANCE 点范围内。
|
||||||
|
"""
|
||||||
|
matched: list[str] = []
|
||||||
|
for rect in rects:
|
||||||
|
if isinstance(rect, (list, tuple)):
|
||||||
|
y_min, y_max = rect[1], rect[3]
|
||||||
|
else:
|
||||||
|
y_min, y_max = rect.y0, rect.y1
|
||||||
|
|
||||||
|
for label_key, positions in labels.items():
|
||||||
|
for label_page, label_y in positions:
|
||||||
|
if label_page == page_num:
|
||||||
|
# 标注在图片/表格上方或下方的距离
|
||||||
|
distance = min(abs(label_y - y_min), abs(label_y - y_max))
|
||||||
|
if distance <= _MAX_LABEL_DISTANCE:
|
||||||
|
if label_key not in matched:
|
||||||
|
matched.append(label_key)
|
||||||
|
return matched
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_figure_table_labels(
    doc,
) -> tuple[dict[str, list[tuple[int, float]]], dict[str, list[tuple[int, float]]]]:
    """Record where every Figure/Table mention appears in ``doc``.

    Each text block is searched for ``Figure N`` / ``Fig. N`` and ``Table N``
    mentions; every hit is recorded as ``(page index, vertical centre of the
    text block)``.

    Returns:
        ``(figure_labels, table_labels)``, each mapping a normalised key such
        as ``"Figure 3"`` to its recorded positions.
    """
    figure_labels: dict[str, list[tuple[int, float]]] = {}
    table_labels: dict[str, list[tuple[int, float]]] = {}

    for page_num in range(len(doc)):
        text_dict = doc[page_num].get_text("dict")
        for block in text_dict.get("blocks", []):
            if block.get("type") != 0:  # text blocks only
                continue
            block_text = "".join(
                span.get("text", "")
                for line in block.get("lines", [])
                for span in line.get("spans", [])
            )
            bbox = block.get("bbox", [0, 0, 0, 0])
            y_center = (bbox[1] + bbox[3]) / 2
            for m in _FIGURE_RE.finditer(block_text):
                figure_labels.setdefault(f"Figure {m.group(1)}", []).append(
                    (page_num, y_center)
                )
            for m in _TABLE_RE.finditer(block_text):
                table_labels.setdefault(f"Table {m.group(1)}", []).append(
                    (page_num, y_center)
                )

    return figure_labels, table_labels


def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    """Extract embedded images and table screenshots from a PDF.

    Files are written as PNG into ``paper_dir(arxiv_id) / "images"`` together
    with a ``manifest.json`` that records, per file, its page, its type
    (``"image"`` or ``"table"``) and the nearby Figure/Table labels.

    Args:
        arxiv_id: Paper id.
        pdf_path: PDF location; defaults to ``data/tmp/{arxiv_id}/paper.pdf``.

    Returns:
        Number of image and table files written. 0 when the PDF is missing.
    """
    import pymupdf

    if pdf_path is None:
        pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf"

    if not pdf_path.exists():
        logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path)
        return 0

    images_dest = paper_dir(arxiv_id) / "images"
    images_dest.mkdir(parents=True, exist_ok=True)

    extracted = 0
    # Metadata for every file written, keyed by file name.
    manifest: dict[str, dict] = {}
    # Digests of the pixmaps already handled, to skip repeated images.
    seen_digests: set[bytes] = set()

    doc = pymupdf.open(str(pdf_path))
    try:
        figure_labels, table_labels = _scan_figure_table_labels(doc)

        for page_num in range(len(doc)):
            page = doc[page_num]

            # ── 1. Embedded images ──
            for img_index, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                try:
                    pix = pymupdf.Pixmap(doc, xref)
                except Exception:
                    continue

                # Skip icons and decorations.
                if pix.width < _MIN_DIM or pix.height < _MIN_DIM:
                    continue
                if pix.width * pix.height < _MIN_AREA:
                    continue

                # Deduplicate on the digest of the whole pixmap rather than a
                # prefix of its encoded bytes, so distinct images that merely
                # start alike are both kept.
                digest = pix.digest
                if digest in seen_digests:
                    continue
                seen_digests.add(digest)

                # Four or more colour components (alpha excluded) means CMYK,
                # which PNG cannot store: convert to RGB first.
                if pix.n - pix.alpha >= 4:
                    try:
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    except Exception:
                        continue

                filename = f"page{page_num + 1}_img{img_index + 1}.png"
                try:
                    pix.save(str(images_dest / filename))
                except Exception:
                    # One unwritable image must not abort the whole extraction.
                    logger.warning(
                        "Failed to save %s for %s", filename, arxiv_id, exc_info=True
                    )
                    continue
                extracted += 1
                logger.debug("Image: %s (%dx%d)", filename, pix.width, pix.height)

                # Figure labels located near this image on the page.
                img_rects = page.get_image_rects(xref)
                matched = _find_nearby_labels(img_rects, figure_labels, page_num)
                manifest[filename] = {
                    "page": page_num + 1,
                    "type": "image",
                    "figures": matched,
                }

            # ── 2. Table screenshots ──
            try:
                tables = page.find_tables()
            except Exception:
                tables = None

            if tables and tables.tables:
                for table_index, table in enumerate(tables.tables):
                    bbox = table.bbox
                    if not bbox:
                        continue

                    margin = 5
                    if isinstance(bbox, (list, tuple)):
                        x0, y0, x1, y1 = bbox
                    else:
                        x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
                    clip_rect = pymupdf.Rect(
                        x0 - margin, y0 - margin, x1 + margin, y1 + margin
                    )

                    zoom = 2
                    mat = pymupdf.Matrix(zoom, zoom)
                    try:
                        pix = page.get_pixmap(matrix=mat, clip=clip_rect)
                    except Exception:
                        continue

                    # Thresholds are doubled because the render is zoomed 2x.
                    if pix.width < _MIN_DIM * 2 or pix.height < 30 * 2:
                        continue

                    filename = f"page{page_num + 1}_table{table_index + 1}.png"
                    try:
                        pix.save(str(images_dest / filename))
                    except Exception:
                        logger.warning(
                            "Failed to save %s for %s",
                            filename,
                            arxiv_id,
                            exc_info=True,
                        )
                        continue
                    extracted += 1
                    logger.debug("Table: %s (%dx%d)", filename, pix.width, pix.height)

                    # Table labels located near this table on the page.
                    table_rect = pymupdf.Rect(x0, y0, x1, y1)
                    matched = _find_nearby_labels([table_rect], table_labels, page_num)
                    manifest[filename] = {
                        "page": page_num + 1,
                        "type": "table",
                        "tables": matched,
                    }
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()

    # Written as UTF-8 explicitly because the manifest is read back as UTF-8.
    manifest_path = images_dest / "manifest.json"
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    if extracted > 0:
        logger.info("Extracted %d images+tables from PDF for %s", extracted, arxiv_id)
    return extracted
|
||||||
|
|
||||||
|
|
||||||
|
def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
    """Delete extracted PNGs that the summary's ``figures`` list does not reference.

    Matching is done purely through ``images/manifest.json`` (written at
    extraction time), so the PDF itself is not needed here.

    Args:
        arxiv_id: Paper identifier; used to locate ``paper_dir(arxiv_id) / "images"``.
        figures: Entries from the summary's ``figures`` field. Each entry is
            expected to be a dict whose ``id`` looks like ``"Figure 3"``,
            ``"Fig. 3"`` or ``"Table 2"``. Entries that are not dicts, or whose
            ``id`` is missing / not a string, are ignored instead of raising.

    Returns:
        The number of PNG files left on disk. ``0`` when there is nothing to
        filter (no figures, no image directory, no manifest, or no PNGs).
        When no usable ID can be parsed, or nothing in the manifest matches,
        every file is kept and the total PNG count is returned.
    """
    if not figures:
        return 0

    images_dir = paper_dir(arxiv_id) / "images"
    manifest_path = images_dir / "manifest.json"

    if not images_dir.exists() or not manifest_path.exists():
        return 0

    all_files = [f for f in images_dir.iterdir() if f.suffix == ".png"]
    if not all_files:
        return 0

    manifest: dict = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Collect every Figure/Table ID referenced by the summary, normalised to
    # the "Figure N" / "Table N" spelling.
    referenced_ids: set[str] = set()
    for fig in figures:
        raw_id = fig.get("id") if isinstance(fig, dict) else None
        if not isinstance(raw_id, str):
            # LLM output may carry null / numeric ids; re.match would raise on them.
            continue
        fig_id = raw_id.strip()
        m = re.match(r'(?:Fig\.?|Figure)\s*(\d+)', fig_id, re.IGNORECASE)
        if m:
            referenced_ids.add(f"Figure {m.group(1)}")
        m2 = re.match(r'Table\s*(\d+)', fig_id, re.IGNORECASE)
        if m2:
            referenced_ids.add(f"Table {m2.group(1)}")

    if not referenced_ids:
        logger.warning("No valid figure/table IDs in summary for %s", arxiv_id)
        return len(all_files)

    # A file is kept when any label recorded for it in the manifest is referenced.
    keep_filenames: set[str] = set()
    for filename, info in manifest.items():
        if not isinstance(info, dict):
            continue
        file_refs = list(info.get("figures") or []) + list(info.get("tables") or [])
        if any(ref in referenced_ids for ref in file_refs):
            keep_filenames.add(filename)

    if not keep_filenames:
        # Better to show too many images than to delete everything on a
        # labelling mismatch.
        logger.warning(
            "No manifest matches for %s (refs=%s), keeping all",
            arxiv_id, referenced_ids,
        )
        return len(all_files)

    removed = 0
    for f in all_files:
        if f.name not in keep_filenames:
            f.unlink()
            removed += 1

    kept = len(all_files) - removed
    logger.info("Filtered images for %s: kept %d, removed %d (refs=%s)", arxiv_id, kept, removed, referenced_ids)
    return kept
|
||||||
+164
-8
@@ -59,23 +59,179 @@ def write_meta_json(paper) -> Path:
|
|||||||
return meta_path
|
return meta_path
|
||||||
|
|
||||||
|
|
||||||
|
# ── PDF 文本提取 ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _trim_body(text: str, max_chars: int = 80_000) -> str:
|
||||||
|
"""去除参考文献,保留正文+附录,超长时从末尾截断。
|
||||||
|
|
||||||
|
策略:
|
||||||
|
1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用)
|
||||||
|
2. 正文 + 附录全部保留
|
||||||
|
3. 如果总长超过 max_chars,从末尾截断(附录靠后,优先保留正文)
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
# 找 References 段落的位置(在 Appendix 之后的那个)
|
||||||
|
# 有些论文结构:正文 -> Appendix -> References
|
||||||
|
# 也可能是:正文 -> References -> Appendix
|
||||||
|
# 策略:只删除明确的 References 块
|
||||||
|
ref_pattern = re.compile(
|
||||||
|
r"(?m)^(?:References|Bibliography|参考文献)\s*$\n"
|
||||||
|
r"(?s:.*?)" # References 内容
|
||||||
|
r"(?=\n(?:A\s|Appendix|Supplementary|Acknowledgment|致谢)\s|\Z)",
|
||||||
|
)
|
||||||
|
|
||||||
|
# 简单策略:找到 References 标题,如果后面没有 Appendix 就全删
|
||||||
|
# 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容
|
||||||
|
ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
|
||||||
|
if ref_match:
|
||||||
|
ref_start = ref_match.start()
|
||||||
|
# 看 References 之后有没有 Appendix
|
||||||
|
after_ref = text[ref_start:]
|
||||||
|
app_match = re.search(
|
||||||
|
r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
|
||||||
|
)
|
||||||
|
if app_match:
|
||||||
|
# References 之后有 Appendix:只删 References 段
|
||||||
|
ref_end = ref_start + app_match.start()
|
||||||
|
text = text[:ref_start] + text[ref_end:]
|
||||||
|
else:
|
||||||
|
# References 之后没有 Appendix:删掉从 References 到结尾
|
||||||
|
text = text[:ref_start].rstrip()
|
||||||
|
|
||||||
|
# 去掉 Acknowledgments(对解读无用)
|
||||||
|
ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
|
||||||
|
if ack_match:
|
||||||
|
# 只删 Acknowledgments 本身,不删后面的内容
|
||||||
|
next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
|
||||||
|
if next_section:
|
||||||
|
text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
|
||||||
|
else:
|
||||||
|
text = text[:ack_match.start()].rstrip()
|
||||||
|
|
||||||
|
# 最后:如果还超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
|
||||||
|
if len(text) > max_chars:
|
||||||
|
text = text[:max_chars].rstrip()
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_text(pdf_path: Path) -> Path:
    """Extract the text of a PDF with pymupdf and save it next to the PDF as ``.txt``.

    The extracted text is passed through ``_trim_body`` before being written.
    The result is cached: if the ``.txt`` file already exists it is returned
    as-is and the PDF is not opened.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Path of the ``.txt`` file (same location and stem as ``pdf_path``).
    """
    txt_path = pdf_path.with_suffix(".txt")
    if txt_path.exists():
        return txt_path

    # Imported only when extraction is actually needed; the cached path above
    # does not depend on pymupdf.
    import pymupdf

    doc = pymupdf.open(str(pdf_path))
    try:
        raw_text = "\n\n".join(page.get_text() for page in doc)
    finally:
        # Release the document handle even if text extraction fails on a page.
        doc.close()

    body = _trim_body(raw_text)
    txt_path.write_text(body, encoding="utf-8")
    logger.info(
        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
        txt_path,
        len(raw_text),
        len(body),
        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
    )
    return txt_path
|
||||||
|
|
||||||
|
|
||||||
# ── pi CLI 调用 ────────────────────────────────────────────────────────
|
# ── pi CLI 调用 ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
async def call_pi(meta_path: Path, pdf_path: Path) -> str:
|
async def call_pi(
|
||||||
"""调用 pi CLI 非交互模式,返回 stdout 文本。"""
|
meta_path: Path,
|
||||||
|
pdf_path: Path,
|
||||||
|
fix_errors: list[str] | None = None,
|
||||||
|
session_id: str | None = None,
|
||||||
|
) -> tuple[str, str]:
|
||||||
|
"""调用 pi CLI 非交互模式,返回 (stdout 文本, session_id)。
|
||||||
|
|
||||||
|
fix_errors: 如果非空,表示上一次验证失败的错误列表,pi 需要修正这些问题。
|
||||||
|
session_id: 如果非空,用 --continue 延续该 session;否则创建新 session。
|
||||||
|
"""
|
||||||
arxiv_id = meta_path.parent.name
|
arxiv_id = meta_path.parent.name
|
||||||
|
|
||||||
|
# 将 PDF 转为文本文件,以 @txt 方式传给 pi
|
||||||
|
txt_path = extract_pdf_text(pdf_path)
|
||||||
|
|
||||||
|
if fix_errors:
|
||||||
|
# 验证失败后的修正提示(同一 session 内,pi 能看到之前写的文件)
|
||||||
|
error_list = "\n".join(f"- {e}" for e in fix_errors)
|
||||||
|
prompt_text = (
|
||||||
|
"你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
|
||||||
|
f"data/papers/{arxiv_id}/summary.json:\n\n"
|
||||||
|
f"{error_list}\n\n"
|
||||||
|
"注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
|
||||||
|
"修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
prompt_text = (
|
||||||
|
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。"
|
||||||
|
"只输出一个 JSON 对象,不要输出其他内容。\n\n"
|
||||||
|
"## 写作要求\n"
|
||||||
|
"- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
|
||||||
|
"- 必须包含论文中的具体数据、数字、实验指标\n"
|
||||||
|
"- 像资深同事给同事讲论文一样,专业但易懂\n"
|
||||||
|
"- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
|
||||||
|
" 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n\n"
|
||||||
|
"## 必须包含以下字段(不要自创字段名):\n"
|
||||||
|
'{"arxiv_id": "...", '
|
||||||
|
'"title_zh": "中文标题", '
|
||||||
|
'"one_line": "一句话概括(≤50字)", '
|
||||||
|
'"tags": ["标签1","标签2"], '
|
||||||
|
'"difficulty": "入门/进阶/前沿", '
|
||||||
|
'"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
|
||||||
|
'"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
|
||||||
|
'"goal": "详细段落:本文的具体目标", '
|
||||||
|
'"gap": "详细段落:本文的独特切入角度"}, '
|
||||||
|
'"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
|
||||||
|
'"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
|
||||||
|
'"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
|
||||||
|
'"novelty": "详细段落:技术新颖性分析"}, '
|
||||||
|
'"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
|
||||||
|
'"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
|
||||||
|
'"limitations": "详细段落:局限性分析(作者承认的+你自己的观察)"}, '
|
||||||
|
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
|
||||||
|
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
|
||||||
|
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度)"}, '
|
||||||
|
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
|
||||||
|
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
|
||||||
|
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
|
||||||
|
"}\n\n"
|
||||||
|
"请深度解读以下论文:"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 构建 session ID(每篇论文一个独立 session)
|
||||||
|
if session_id is None:
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
settings.PI_BIN,
|
settings.PI_BIN,
|
||||||
"-p",
|
"-p",
|
||||||
"--no-tools",
|
"--tools", "bash,write_file",
|
||||||
|
]
|
||||||
|
if fix_errors:
|
||||||
|
cmd += ["--session", session_id, "--continue"]
|
||||||
|
else:
|
||||||
|
cmd += ["--session-id", session_id]
|
||||||
|
cmd += [
|
||||||
"--skill",
|
"--skill",
|
||||||
settings.SUMMARY_SKILL,
|
settings.SUMMARY_SKILL,
|
||||||
"请深度解读以下论文,并按指定 JSON schema 输出:",
|
prompt_text,
|
||||||
f"@{meta_path}",
|
|
||||||
f"@{pdf_path}",
|
|
||||||
]
|
]
|
||||||
logger.info("Calling pi for %s", arxiv_id)
|
if not fix_errors:
|
||||||
|
# 首次调用传文件,后续 --continue 不需要(session 内已有)
|
||||||
|
cmd += [f"@{meta_path}", f"@{txt_path}"]
|
||||||
|
|
||||||
|
logger.info("Calling pi for %s (fix=%s, session=%s)", arxiv_id, bool(fix_errors), session_id)
|
||||||
|
|
||||||
proc = await asyncio.create_subprocess_exec(
|
proc = await asyncio.create_subprocess_exec(
|
||||||
*cmd,
|
*cmd,
|
||||||
@@ -95,7 +251,7 @@ async def call_pi(meta_path: Path, pdf_path: Path) -> str:
|
|||||||
if proc.returncode != 0:
|
if proc.returncode != 0:
|
||||||
raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
|
raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
|
||||||
|
|
||||||
return stdout.decode("utf-8", errors="replace")
|
return stdout.decode("utf-8", errors="replace"), session_id
|
||||||
|
|
||||||
|
|
||||||
# ── JSON 提取 ──────────────────────────────────────────────────────────
|
# ── JSON 提取 ──────────────────────────────────────────────────────────
|
||||||
|
|||||||
+15
-20
@@ -12,8 +12,7 @@ from pydantic import BaseModel, Field, ValidationError, field_validator
|
|||||||
|
|
||||||
|
|
||||||
class PrerequisitesSchema(BaseModel):
|
class PrerequisitesSchema(BaseModel):
|
||||||
concepts: list[str] = Field(default_factory=list)
|
concepts: list[dict] = Field(default_factory=list)
|
||||||
level: str = ""
|
|
||||||
|
|
||||||
|
|
||||||
class MotivationSchema(BaseModel):
|
class MotivationSchema(BaseModel):
|
||||||
@@ -32,7 +31,7 @@ class MotivationSchema(BaseModel):
|
|||||||
class MethodSchema(BaseModel):
|
class MethodSchema(BaseModel):
|
||||||
overview: str = ""
|
overview: str = ""
|
||||||
key_idea: str
|
key_idea: str
|
||||||
steps: list[str] = Field(default_factory=list)
|
steps: str = ""
|
||||||
novelty: str = ""
|
novelty: str = ""
|
||||||
|
|
||||||
@field_validator("key_idea")
|
@field_validator("key_idea")
|
||||||
@@ -44,14 +43,14 @@ class MethodSchema(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class ResultsSchema(BaseModel):
|
class ResultsSchema(BaseModel):
|
||||||
main_findings: list[str] = Field(default_factory=list)
|
main_findings: str = ""
|
||||||
benchmarks: list[dict] = Field(default_factory=list)
|
benchmarks: list[str | dict] = Field(default_factory=list)
|
||||||
limitations: list[str] = Field(default_factory=list)
|
limitations: str = ""
|
||||||
|
|
||||||
|
|
||||||
class ImprovementsSchema(BaseModel):
|
class ImprovementsSchema(BaseModel):
|
||||||
weaknesses: list[str] = Field(default_factory=list)
|
weaknesses: str = ""
|
||||||
future_work: list[str] = Field(default_factory=list)
|
future_work: str = ""
|
||||||
reproducibility: str = ""
|
reproducibility: str = ""
|
||||||
|
|
||||||
|
|
||||||
@@ -71,6 +70,7 @@ class SummarySchema(BaseModel):
|
|||||||
method: MethodSchema
|
method: MethodSchema
|
||||||
results: ResultsSchema = Field(default_factory=ResultsSchema)
|
results: ResultsSchema = Field(default_factory=ResultsSchema)
|
||||||
improvements: ImprovementsSchema = Field(default_factory=ImprovementsSchema)
|
improvements: ImprovementsSchema = Field(default_factory=ImprovementsSchema)
|
||||||
|
figures: list[dict] = Field(default_factory=list)
|
||||||
|
|
||||||
@field_validator("title_zh", "one_line")
|
@field_validator("title_zh", "one_line")
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -116,7 +116,7 @@ def assess_quality(schema: SummarySchema) -> str:
|
|||||||
missing_important += 1
|
missing_important += 1
|
||||||
if not schema.method.overview.strip():
|
if not schema.method.overview.strip():
|
||||||
missing_important += 1
|
missing_important += 1
|
||||||
if not schema.results.main_findings:
|
if not schema.results.main_findings.strip():
|
||||||
missing_important += 1
|
missing_important += 1
|
||||||
|
|
||||||
if missing_important == 0:
|
if missing_important == 0:
|
||||||
@@ -140,22 +140,17 @@ def flatten_for_db(schema: SummarySchema) -> dict:
|
|||||||
"motivation_gap": schema.motivation.gap,
|
"motivation_gap": schema.motivation.gap,
|
||||||
"method_overview": schema.method.overview,
|
"method_overview": schema.method.overview,
|
||||||
"method_key_idea": schema.method.key_idea,
|
"method_key_idea": schema.method.key_idea,
|
||||||
"method_steps_json": json.dumps(schema.method.steps, ensure_ascii=False),
|
"method_steps_json": schema.method.steps,
|
||||||
"method_novelty": schema.method.novelty,
|
"method_novelty": schema.method.novelty,
|
||||||
"results_main_json": json.dumps(
|
"results_main_json": schema.results.main_findings,
|
||||||
schema.results.main_findings, ensure_ascii=False
|
|
||||||
),
|
|
||||||
"results_benchmarks_json": json.dumps(
|
"results_benchmarks_json": json.dumps(
|
||||||
schema.results.benchmarks, ensure_ascii=False
|
schema.results.benchmarks, ensure_ascii=False
|
||||||
),
|
),
|
||||||
"limitations_json": json.dumps(schema.results.limitations, ensure_ascii=False),
|
"limitations_json": schema.results.limitations,
|
||||||
"weaknesses_json": json.dumps(
|
"weaknesses_json": schema.improvements.weaknesses,
|
||||||
schema.improvements.weaknesses, ensure_ascii=False
|
"future_work_json": schema.improvements.future_work,
|
||||||
),
|
|
||||||
"future_work_json": json.dumps(
|
|
||||||
schema.improvements.future_work, ensure_ascii=False
|
|
||||||
),
|
|
||||||
"reproducibility": schema.improvements.reproducibility,
|
"reproducibility": schema.improvements.reproducibility,
|
||||||
|
"figures_json": json.dumps(schema.figures, ensure_ascii=False),
|
||||||
"full_json": schema.model_dump_json(ensure_ascii=False),
|
"full_json": schema.model_dump_json(ensure_ascii=False),
|
||||||
"updated_at": datetime.now(timezone.utc),
|
"updated_at": datetime.now(timezone.utc),
|
||||||
}
|
}
|
||||||
|
|||||||
+141
-11
@@ -22,7 +22,6 @@ from app.models import (
|
|||||||
SummaryStatus,
|
SummaryStatus,
|
||||||
TaskLock,
|
TaskLock,
|
||||||
)
|
)
|
||||||
from app.services.image_extractor import extract_images_from_source
|
|
||||||
from app.services.pdf_downloader import (
|
from app.services.pdf_downloader import (
|
||||||
PdfDownloadError,
|
PdfDownloadError,
|
||||||
cleanup_tmp,
|
cleanup_tmp,
|
||||||
@@ -77,10 +76,9 @@ def _build_fts_summary_text(schema: SummarySchema) -> str:
|
|||||||
schema.one_line or "",
|
schema.one_line or "",
|
||||||
schema.motivation.problem or "",
|
schema.motivation.problem or "",
|
||||||
schema.motivation.goal or "",
|
schema.motivation.goal or "",
|
||||||
schema.method_overview if hasattr(schema, "method_overview") else "",
|
|
||||||
schema.method.overview or "",
|
schema.method.overview or "",
|
||||||
schema.method.key_idea or "",
|
schema.method.key_idea or "",
|
||||||
" ".join(schema.results.main_findings or []),
|
schema.results.main_findings or "",
|
||||||
]
|
]
|
||||||
return " ".join(p for p in parts if p)
|
return " ".join(p for p in parts if p)
|
||||||
|
|
||||||
@@ -141,6 +139,77 @@ def _update_summary_in_db(
|
|||||||
logger.info("DB updated: paper=%s quality=%s", paper.arxiv_id, quality)
|
logger.info("DB updated: paper=%s quality=%s", paper.arxiv_id, quality)
|
||||||
|
|
||||||
|
|
||||||
|
# ── JSON 验证 ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_summary(json_data: dict, arxiv_id: str) -> list[str]:
|
||||||
|
"""验证 JSON 数据是否符合要求,返回错误列表(空=通过)。"""
|
||||||
|
errors: list[str] = []
|
||||||
|
|
||||||
|
if not isinstance(json_data, dict):
|
||||||
|
return ["顶层必须是 JSON 对象"]
|
||||||
|
|
||||||
|
# 必填字段
|
||||||
|
for f in ["arxiv_id", "title_zh", "one_line", "tags"]:
|
||||||
|
if f not in json_data or not json_data[f]:
|
||||||
|
errors.append(f"缺少必填字段: {f}")
|
||||||
|
|
||||||
|
# tags 必须是非空数组
|
||||||
|
tags = json_data.get("tags")
|
||||||
|
if not isinstance(tags, list) or len(tags) == 0:
|
||||||
|
errors.append("tags 必须是非空数组")
|
||||||
|
|
||||||
|
# 字符串段落字段(必须是 str 且 ≥50 字)
|
||||||
|
string_fields = [
|
||||||
|
("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"),
|
||||||
|
("method", "overview"), ("method", "key_idea"), ("method", "steps"),
|
||||||
|
("method", "novelty"),
|
||||||
|
("results", "main_findings"), ("results", "limitations"),
|
||||||
|
("improvements", "weaknesses"), ("improvements", "future_work"),
|
||||||
|
("improvements", "reproducibility"),
|
||||||
|
]
|
||||||
|
for section, field in string_fields:
|
||||||
|
val = json_data.get(section, {}).get(field)
|
||||||
|
if isinstance(val, list):
|
||||||
|
errors.append(f"{section}.{field} 应该是字符串段落,不能是数组")
|
||||||
|
elif not isinstance(val, str) or len(val.strip()) < 50:
|
||||||
|
errors.append(
|
||||||
|
f"{section}.{field} 必须是详细段落(≥50字),"
|
||||||
|
f"当前: {type(val).__name__} ({len(str(val))}字)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# benchmarks 必须是数组
|
||||||
|
benchmarks = json_data.get("results", {}).get("benchmarks")
|
||||||
|
if benchmarks is not None and not isinstance(benchmarks, list):
|
||||||
|
errors.append("results.benchmarks 必须是数组")
|
||||||
|
|
||||||
|
# prerequisites.concepts 必须是对象数组,每个有 term
|
||||||
|
concepts = json_data.get("prerequisites", {}).get("concepts")
|
||||||
|
if concepts is not None:
|
||||||
|
if not isinstance(concepts, list):
|
||||||
|
errors.append("prerequisites.concepts 必须是数组")
|
||||||
|
elif len(concepts) == 0:
|
||||||
|
errors.append("prerequisites.concepts 不能为空")
|
||||||
|
else:
|
||||||
|
for i, c in enumerate(concepts):
|
||||||
|
if isinstance(c, str):
|
||||||
|
errors.append(f"prerequisites.concepts[{i}] 应该是对象 {{term,explanation,why_matters}},不能是字符串")
|
||||||
|
elif isinstance(c, dict) and not c.get("term"):
|
||||||
|
errors.append(f"prerequisites.concepts[{i}] 缺少 term 字段")
|
||||||
|
|
||||||
|
# figures 必须是数组,每个元素应有 id
|
||||||
|
figures = json_data.get("figures")
|
||||||
|
if figures is not None:
|
||||||
|
if not isinstance(figures, list):
|
||||||
|
errors.append("figures 必须是数组")
|
||||||
|
else:
|
||||||
|
for i, fig in enumerate(figures):
|
||||||
|
if isinstance(fig, dict) and not fig.get("id"):
|
||||||
|
errors.append(f"figures[{i}] 缺少 id 字段")
|
||||||
|
|
||||||
|
return errors
|
||||||
|
|
||||||
|
|
||||||
# ── 文件操作 ────────────────────────────────────────────────────────────
|
# ── 文件操作 ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
@@ -227,11 +296,64 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
|
|||||||
# 下载 PDF
|
# 下载 PDF
|
||||||
await download_pdf(arxiv_id, paper.pdf_url)
|
await download_pdf(arxiv_id, paper.pdf_url)
|
||||||
|
|
||||||
# 调用 pi
|
# 带验证的生成循环:最多 4 轮,同一 session 内 pi 可看到之前写的文件
|
||||||
raw_output = await call_pi(meta_path, Path("data/tmp") / arxiv_id / "paper.pdf")
|
json_data = None
|
||||||
|
validation_errors = []
|
||||||
|
session_id = None
|
||||||
|
for attempt in range(1, 5):
|
||||||
|
# 清理上一轮 pi 通过 write_file 写的不完整文件
|
||||||
|
stale = paper_dir(arxiv_id) / "summary.json"
|
||||||
|
if stale.exists():
|
||||||
|
stale.unlink()
|
||||||
|
|
||||||
# 提取 JSON
|
if attempt == 1:
|
||||||
json_data = extract_json(raw_output)
|
raw_output, session_id = await call_pi(
|
||||||
|
meta_path, Path("data/tmp") / arxiv_id / "paper.pdf"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# 验证失败,同一 session 内带着错误信息让 pi 修正
|
||||||
|
raw_output, session_id = await call_pi(
|
||||||
|
meta_path,
|
||||||
|
Path("data/tmp") / arxiv_id / "paper.pdf",
|
||||||
|
fix_errors=validation_errors,
|
||||||
|
session_id=session_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
# 优先从 pi write_file 写入的 summary.json 读取,否则从 stdout 提取
|
||||||
|
# 如果都失败,当作验证错误,继续下一次尝试
|
||||||
|
json_data = None
|
||||||
|
summary_file = paper_dir(arxiv_id) / "summary.json"
|
||||||
|
try:
|
||||||
|
if summary_file.exists():
|
||||||
|
json_data = json.loads(summary_file.read_text(encoding="utf-8"))
|
||||||
|
logger.info("Read summary.json written by pi for %s", arxiv_id)
|
||||||
|
else:
|
||||||
|
json_data = extract_json(raw_output)
|
||||||
|
except (json.JSONDecodeError, JsonNotFoundError) as exc:
|
||||||
|
logger.warning(
|
||||||
|
"JSON extraction failed for %s (attempt %d): %s",
|
||||||
|
arxiv_id,
|
||||||
|
attempt,
|
||||||
|
str(exc)[:200],
|
||||||
|
)
|
||||||
|
validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 运行验证脚本
|
||||||
|
validation_errors = _validate_summary(json_data, arxiv_id)
|
||||||
|
if not validation_errors:
|
||||||
|
break
|
||||||
|
logger.warning(
|
||||||
|
"Validation failed for %s (attempt %d): %s",
|
||||||
|
arxiv_id,
|
||||||
|
attempt,
|
||||||
|
"; ".join(validation_errors),
|
||||||
|
)
|
||||||
|
|
||||||
|
if validation_errors:
|
||||||
|
raise ValueError(
|
||||||
|
f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}"
|
||||||
|
)
|
||||||
|
|
||||||
# Pydantic 校验
|
# Pydantic 校验
|
||||||
schema = SummarySchema.model_validate(json_data)
|
schema = SummarySchema.model_validate(json_data)
|
||||||
@@ -252,9 +374,17 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
|
|||||||
status.raw_output_saved = True
|
status.raw_output_saved = True
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
# LaTeX 图片提取(可选增强,失败不影响总结)
|
# PDF 图片提取(可选增强,失败不影响总结)
|
||||||
try:
|
try:
|
||||||
await extract_images_from_source(arxiv_id)
|
from app.services.pdf_image_extractor import (
|
||||||
|
extract_images_from_pdf,
|
||||||
|
filter_images_by_summary,
|
||||||
|
)
|
||||||
|
pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf"
|
||||||
|
extract_images_from_pdf(arxiv_id, pdf_path)
|
||||||
|
# 根据 summary 中 figures 字段过滤,只保留被引用的图表
|
||||||
|
if schema.figures:
|
||||||
|
filter_images_by_summary(arxiv_id, schema.figures)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
|
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
|
||||||
|
|
||||||
@@ -268,8 +398,8 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
|
|||||||
"title_en": paper.title_en or "",
|
"title_en": paper.title_en or "",
|
||||||
"tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
|
"tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
|
||||||
"one_line": schema.one_line or "",
|
"one_line": schema.one_line or "",
|
||||||
"motivation_problem": schema.motivation_problem or "",
|
"motivation_problem": schema.motivation.problem or "",
|
||||||
"method_key_idea": schema.method_key_idea or "",
|
"method_key_idea": schema.method.key_idea or "",
|
||||||
"paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
|
"paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
|
||||||
}
|
}
|
||||||
index_paper(arxiv_id, texts_dict)
|
index_paper(arxiv_id, texts_dict)
|
||||||
|
|||||||
+225
-65
@@ -1,17 +1,27 @@
|
|||||||
/* ── kami 风格参考:纸张质感、留白、墨蓝强调色 ─────────────────── */
|
/* ── kami 风格参考:纸张质感、留白、墨蓝强调色 ─────────────────── */
|
||||||
:root {
|
:root {
|
||||||
--bg: #faf8f5;
|
/* 色 — Kami warm palette */
|
||||||
--surface: #ffffff;
|
--bg: #f5f4ed; /* parchment */
|
||||||
--ink: #1a1a2e;
|
--surface: #faf9f5; /* ivory */
|
||||||
--ink-light: #4a4a6a;
|
--ink: #141413; /* near black */
|
||||||
--accent: #2d5f8a;
|
--ink-light: #3d3d3a; /* dark warm */
|
||||||
--accent-hover: #1d4a6f;
|
--ink-sub: #504e49; /* olive subtext */
|
||||||
--border: #e8e4df;
|
--ink-muted: #6b6a64; /* stone tertiary */
|
||||||
--shadow: rgba(0, 0, 0, 0.06);
|
--accent: #1B365D; /* ink blue */
|
||||||
|
--accent-hover: #142d4a; /* ink blue deep */
|
||||||
|
--accent-bg: rgba(27, 54, 93, 0.06); /* brand whisper */
|
||||||
|
--border: #e8e6dc; /* warm border */
|
||||||
|
--border-soft: #e5e3d8; /* soft row separator */
|
||||||
|
--shadow: rgba(0, 0, 0, 0.05); /* whisper shadow */
|
||||||
--radius: 8px;
|
--radius: 8px;
|
||||||
--font-body: "Noto Serif SC", "Georgia", serif;
|
|
||||||
--font-sans: "Inter", "Noto Sans SC", system-ui, sans-serif;
|
/* 字体 — Kami serif-first */
|
||||||
--max-width: 960px;
|
--font-body: "TsangerJinKai02", "Source Han Serif SC", "Noto Serif CJK SC", "Songti SC", "STSong", Georgia, serif;
|
||||||
|
--font-sans: var(--font-body); /* Kami: sans = serif */
|
||||||
|
--mono: "JetBrains Mono", "SF Mono", "Fira Code", Consolas, Monaco, monospace;
|
||||||
|
|
||||||
|
/* 布局 */
|
||||||
|
--max-width: 1080px;
|
||||||
}
|
}
|
||||||
|
|
||||||
*,
|
*,
|
||||||
@@ -60,7 +70,7 @@ a:hover {
|
|||||||
.nav-brand {
|
.nav-brand {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.2rem;
|
font-size: 1.2rem;
|
||||||
font-weight: 700;
|
font-weight: 500;
|
||||||
color: var(--ink);
|
color: var(--ink);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -96,7 +106,7 @@ a:hover {
|
|||||||
.date-title {
|
.date-title {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.5rem;
|
font-size: 1.5rem;
|
||||||
font-weight: 700;
|
font-weight: 500;
|
||||||
}
|
}
|
||||||
|
|
||||||
.date-nav-btn {
|
.date-nav-btn {
|
||||||
@@ -156,7 +166,7 @@ a:hover {
|
|||||||
|
|
||||||
.paper-card {
|
.paper-card {
|
||||||
background: var(--surface);
|
background: var(--surface);
|
||||||
border: 1px solid var(--border);
|
border: 0.5px solid var(--border);
|
||||||
border-radius: var(--radius);
|
border-radius: var(--radius);
|
||||||
padding: 20px 24px;
|
padding: 20px 24px;
|
||||||
transition: box-shadow 0.2s;
|
transition: box-shadow 0.2s;
|
||||||
@@ -175,7 +185,7 @@ a:hover {
|
|||||||
.paper-title {
|
.paper-title {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.1rem;
|
font-size: 1.1rem;
|
||||||
font-weight: 600;
|
font-weight: 500;
|
||||||
line-height: 1.5;
|
line-height: 1.5;
|
||||||
flex: 1;
|
flex: 1;
|
||||||
}
|
}
|
||||||
@@ -190,6 +200,7 @@ a:hover {
|
|||||||
font-size: 0.85rem;
|
font-size: 0.85rem;
|
||||||
color: var(--ink-light);
|
color: var(--ink-light);
|
||||||
white-space: nowrap;
|
white-space: nowrap;
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
}
|
}
|
||||||
|
|
||||||
.paper-one-line,
|
.paper-one-line,
|
||||||
@@ -215,12 +226,14 @@ a:hover {
|
|||||||
|
|
||||||
.tag {
|
.tag {
|
||||||
display: inline-block;
|
display: inline-block;
|
||||||
padding: 2px 8px;
|
padding: 1px 5px;
|
||||||
background: #eef3f8;
|
background: #EEF2F7;
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
border-radius: 3px;
|
border-radius: 2px;
|
||||||
font-size: 0.75rem;
|
font-size: 0.75rem;
|
||||||
font-weight: 500;
|
font-weight: 600;
|
||||||
|
letter-spacing: 0.4px;
|
||||||
|
text-transform: uppercase;
|
||||||
}
|
}
|
||||||
|
|
||||||
.paper-footer {
|
.paper-footer {
|
||||||
@@ -233,28 +246,28 @@ a:hover {
|
|||||||
.summary-badge {
|
.summary-badge {
|
||||||
font-size: 0.8rem;
|
font-size: 0.8rem;
|
||||||
padding: 2px 8px;
|
padding: 2px 8px;
|
||||||
border-radius: 3px;
|
border-radius: 2px;
|
||||||
}
|
}
|
||||||
.summary-none {
|
.summary-none {
|
||||||
background: #f0f0f0;
|
background: var(--border);
|
||||||
color: #888;
|
color: var(--ink-muted);
|
||||||
}
|
}
|
||||||
.summary-pending {
|
.summary-pending {
|
||||||
background: #fff3e0;
|
background: rgba(27, 54, 93, 0.06);
|
||||||
color: #e67e22;
|
color: var(--ink-sub);
|
||||||
}
|
}
|
||||||
.summary-processing {
|
.summary-processing {
|
||||||
background: #e3f2fd;
|
background: rgba(27, 54, 93, 0.10);
|
||||||
color: #1976d2;
|
color: var(--accent);
|
||||||
}
|
}
|
||||||
.summary-done {
|
.summary-done {
|
||||||
background: #e8f5e9;
|
background: rgba(27, 54, 93, 0.08);
|
||||||
color: #388e3c;
|
color: #3d6e3d;
|
||||||
}
|
}
|
||||||
.summary-failed,
|
.summary-failed,
|
||||||
.summary-permanent_failure {
|
.summary-permanent_failure {
|
||||||
background: #fce4ec;
|
background: rgba(140, 40, 40, 0.08);
|
||||||
color: #c62828;
|
color: #8c2828;
|
||||||
}
|
}
|
||||||
|
|
||||||
.btn-detail {
|
.btn-detail {
|
||||||
@@ -293,7 +306,7 @@ a:hover {
|
|||||||
.detail-title {
|
.detail-title {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.6rem;
|
font-size: 1.6rem;
|
||||||
font-weight: 700;
|
font-weight: 500;
|
||||||
line-height: 1.4;
|
line-height: 1.4;
|
||||||
margin-bottom: 12px;
|
margin-bottom: 12px;
|
||||||
}
|
}
|
||||||
@@ -352,7 +365,7 @@ a:hover {
|
|||||||
.summary-section h2 {
|
.summary-section h2 {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.05rem;
|
font-size: 1.05rem;
|
||||||
font-weight: 600;
|
font-weight: 500;
|
||||||
margin-bottom: 8px;
|
margin-bottom: 8px;
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
}
|
}
|
||||||
@@ -385,27 +398,27 @@ a:hover {
|
|||||||
margin-bottom: 24px;
|
margin-bottom: 24px;
|
||||||
}
|
}
|
||||||
.summary-placeholder.processing {
|
.summary-placeholder.processing {
|
||||||
background: #e3f2fd;
|
background: rgba(27, 54, 93, 0.06);
|
||||||
}
|
}
|
||||||
.summary-placeholder.failed {
|
.summary-placeholder.failed {
|
||||||
background: #fce4ec;
|
background: rgba(140, 40, 40, 0.06);
|
||||||
}
|
}
|
||||||
.summary-placeholder.none {
|
.summary-placeholder.none {
|
||||||
background: #f5f5f5;
|
background: var(--border);
|
||||||
}
|
}
|
||||||
.error-detail {
|
.error-detail {
|
||||||
font-size: 0.85rem;
|
font-size: 0.85rem;
|
||||||
color: #c62828;
|
color: #8c2828;
|
||||||
margin-top: 8px;
|
margin-top: 8px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.quality-warning {
|
.quality-warning {
|
||||||
padding: 10px 16px;
|
padding: 10px 16px;
|
||||||
background: #fff8e1;
|
background: rgba(27, 54, 93, 0.06);
|
||||||
border: 1px solid #ffe082;
|
border: 1px solid var(--border-soft);
|
||||||
border-radius: var(--radius);
|
border-radius: var(--radius);
|
||||||
font-size: 0.85rem;
|
font-size: 0.85rem;
|
||||||
color: #f57f17;
|
color: var(--ink-sub);
|
||||||
margin-bottom: 16px;
|
margin-bottom: 16px;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -528,7 +541,7 @@ a:hover {
|
|||||||
}
|
}
|
||||||
.sort-toggle a.active {
|
.sort-toggle a.active {
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
font-weight: 600;
|
font-weight: 500;
|
||||||
}
|
}
|
||||||
.sort-toggle a:hover {
|
.sort-toggle a:hover {
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
@@ -541,7 +554,7 @@ a:hover {
|
|||||||
|
|
||||||
/* ── Search Highlight ───────────────────────────────────────────── */
|
/* ── Search Highlight ───────────────────────────────────────────── */
|
||||||
mark {
|
mark {
|
||||||
background: #fff3cd;
|
background: rgba(27, 54, 93, 0.10);
|
||||||
color: var(--ink);
|
color: var(--ink);
|
||||||
padding: 1px 2px;
|
padding: 1px 2px;
|
||||||
border-radius: 2px;
|
border-radius: 2px;
|
||||||
@@ -590,7 +603,7 @@ mark {
|
|||||||
.page-heading {
|
.page-heading {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.5rem;
|
font-size: 1.5rem;
|
||||||
font-weight: 700;
|
font-weight: 500;
|
||||||
margin-bottom: 20px;
|
margin-bottom: 20px;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -656,44 +669,60 @@ mark {
|
|||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
}
|
}
|
||||||
.btn-bookmark.active {
|
.btn-bookmark.active {
|
||||||
color: #f0a500;
|
color: var(--accent);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ── Reading Badge ──────────────────────────────────────────────── */
|
/* ── Reading Badge ──────────────────────────────────────────────── */
|
||||||
.reading-badge {
|
.reading-badge {
|
||||||
font-size: 0.75rem;
|
font-size: 0.75rem;
|
||||||
padding: 2px 6px;
|
padding: 2px 6px;
|
||||||
border-radius: 3px;
|
border-radius: 2px;
|
||||||
}
|
}
|
||||||
.reading-unread {
|
.reading-unread {
|
||||||
background: #f0f0f0;
|
background: var(--border);
|
||||||
color: #888;
|
color: var(--ink-muted);
|
||||||
}
|
}
|
||||||
.reading-skimmed {
|
.reading-skimmed {
|
||||||
background: #e3f2fd;
|
background: rgba(27, 54, 93, 0.08);
|
||||||
color: #1976d2;
|
color: var(--accent);
|
||||||
}
|
}
|
||||||
.reading-read_summary {
|
.reading-read_summary {
|
||||||
background: #e8f5e9;
|
background: rgba(27, 54, 93, 0.06);
|
||||||
color: #388e3c;
|
color: #3d6e3d;
|
||||||
}
|
}
|
||||||
.reading-read_full {
|
.reading-read_full {
|
||||||
background: #e8f5e9;
|
background: rgba(27, 54, 93, 0.10);
|
||||||
color: #2e7d32;
|
color: #3d6e3d;
|
||||||
font-weight: 500;
|
font-weight: 500;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ── Responsive ─────────────────────────────────────────────────── */
|
/* ── Responsive ─────────────────────────────────────────────────── */
|
||||||
@media (max-width: 640px) {
|
@media (max-width: 880px) {
|
||||||
|
.container {
|
||||||
|
padding: 20px 32px;
|
||||||
|
}
|
||||||
|
.charts-grid {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 480px) {
|
||||||
.container {
|
.container {
|
||||||
padding: 16px;
|
padding: 16px;
|
||||||
}
|
}
|
||||||
.nav-bar {
|
.nav-bar {
|
||||||
padding: 10px 16px;
|
padding: 10px 16px;
|
||||||
|
flex-wrap: wrap;
|
||||||
}
|
}
|
||||||
.nav-search-input {
|
.nav-search-input {
|
||||||
width: 120px;
|
width: 120px;
|
||||||
}
|
}
|
||||||
|
.nav-links {
|
||||||
|
gap: 12px;
|
||||||
|
margin-left: 0;
|
||||||
|
width: 100%;
|
||||||
|
justify-content: center;
|
||||||
|
}
|
||||||
.date-nav {
|
.date-nav {
|
||||||
gap: 8px;
|
gap: 8px;
|
||||||
}
|
}
|
||||||
@@ -757,8 +786,9 @@ mark {
|
|||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
white-space: nowrap;
|
white-space: nowrap;
|
||||||
padding: 2px 8px;
|
padding: 2px 8px;
|
||||||
background: #eef3f8;
|
background: #EEF2F7;
|
||||||
border-radius: 4px;
|
border-radius: 4px;
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ── Similar Papers ────────────────────────────────────────────── */
|
/* ── Similar Papers ────────────────────────────────────────────── */
|
||||||
@@ -770,7 +800,7 @@ mark {
|
|||||||
.similar-papers h2 {
|
.similar-papers h2 {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.1rem;
|
font-size: 1.1rem;
|
||||||
font-weight: 600;
|
font-weight: 500;
|
||||||
margin-bottom: 12px;
|
margin-bottom: 12px;
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
}
|
}
|
||||||
@@ -800,7 +830,7 @@ mark {
|
|||||||
.trends-page h1 {
|
.trends-page h1 {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.5rem;
|
font-size: 1.5rem;
|
||||||
font-weight: 700;
|
font-weight: 500;
|
||||||
margin-bottom: 24px;
|
margin-bottom: 24px;
|
||||||
}
|
}
|
||||||
.charts-grid {
|
.charts-grid {
|
||||||
@@ -818,7 +848,7 @@ mark {
|
|||||||
.chart-card h2 {
|
.chart-card h2 {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1rem;
|
font-size: 1rem;
|
||||||
font-weight: 600;
|
font-weight: 500;
|
||||||
margin-bottom: 12px;
|
margin-bottom: 12px;
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
}
|
}
|
||||||
@@ -826,17 +856,12 @@ mark {
|
|||||||
width: 100% !important;
|
width: 100% !important;
|
||||||
max-height: 300px;
|
max-height: 300px;
|
||||||
}
|
}
|
||||||
@media (max-width: 768px) {
|
|
||||||
.charts-grid {
|
|
||||||
grid-template-columns: 1fr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ── Compare Page ──────────────────────────────────────────────── */
|
/* ── Compare Page ──────────────────────────────────────────────── */
|
||||||
.compare-page h1 {
|
.compare-page h1 {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.5rem;
|
font-size: 1.5rem;
|
||||||
font-weight: 700;
|
font-weight: 500;
|
||||||
margin-bottom: 24px;
|
margin-bottom: 24px;
|
||||||
}
|
}
|
||||||
.compare-table-wrapper {
|
.compare-table-wrapper {
|
||||||
@@ -860,7 +885,7 @@ mark {
|
|||||||
}
|
}
|
||||||
.compare-table th {
|
.compare-table th {
|
||||||
background: var(--bg);
|
background: var(--bg);
|
||||||
font-weight: 600;
|
font-weight: 500;
|
||||||
color: var(--ink-light);
|
color: var(--ink-light);
|
||||||
white-space: nowrap;
|
white-space: nowrap;
|
||||||
min-width: 100px;
|
min-width: 100px;
|
||||||
@@ -887,7 +912,7 @@ mark {
|
|||||||
.image-gallery h2 {
|
.image-gallery h2 {
|
||||||
font-family: var(--font-body);
|
font-family: var(--font-body);
|
||||||
font-size: 1.05rem;
|
font-size: 1.05rem;
|
||||||
font-weight: 600;
|
font-weight: 500;
|
||||||
margin-bottom: 12px;
|
margin-bottom: 12px;
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
}
|
}
|
||||||
@@ -913,3 +938,138 @@ mark {
|
|||||||
color: var(--ink-light);
|
color: var(--ink-light);
|
||||||
text-align: center;
|
text-align: center;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ── 前置知识卡片 ── */
|
||||||
|
.prerequisites-list {
|
||||||
|
display: grid;
|
||||||
|
gap: 1rem;
|
||||||
|
}
|
||||||
|
.concept-card {
|
||||||
|
background: var(--surface);
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 1rem 1.2rem;
|
||||||
|
}
|
||||||
|
.concept-card h3 {
|
||||||
|
margin: 0 0 0.4rem 0;
|
||||||
|
font-size: 1rem;
|
||||||
|
color: var(--accent);
|
||||||
|
}
|
||||||
|
.concept-card p {
|
||||||
|
margin: 0.3rem 0 0 0;
|
||||||
|
font-size: 0.92rem;
|
||||||
|
line-height: 1.6;
|
||||||
|
color: var(--ink);
|
||||||
|
}
|
||||||
|
.concept-why {
|
||||||
|
font-style: italic;
|
||||||
|
color: var(--ink-light) !important;
|
||||||
|
border-left: 3px solid var(--accent);
|
||||||
|
padding-left: 0.8rem;
|
||||||
|
margin-top: 0.5rem !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 核心创新点 ── */
|
||||||
|
.key-idea {
|
||||||
|
background: linear-gradient(135deg, var(--accent-bg), var(--surface));
|
||||||
|
border-left: 4px solid var(--accent);
|
||||||
|
padding: 1rem 1.2rem;
|
||||||
|
border-radius: 0 8px 8px 0;
|
||||||
|
margin: 1rem 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 可折叠详情 ── */
|
||||||
|
.summary-section details {
|
||||||
|
margin: 0.8rem 0;
|
||||||
|
}
|
||||||
|
.summary-section details summary {
|
||||||
|
cursor: pointer;
|
||||||
|
font-weight: 500;
|
||||||
|
color: var(--accent);
|
||||||
|
padding: 0.4rem 0;
|
||||||
|
user-select: none;
|
||||||
|
}
|
||||||
|
.summary-section details summary:hover {
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
.summary-section details[open] summary {
|
||||||
|
margin-bottom: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 内联图片 ── */
|
||||||
|
.inline-figure {
|
||||||
|
margin: 1.2rem 0;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
.inline-figure img {
|
||||||
|
max-width: 100%;
|
||||||
|
border-radius: 6px;
|
||||||
|
box-shadow: 0 2px 8px rgba(0,0,0,0.08);
|
||||||
|
cursor: zoom-in;
|
||||||
|
transition: box-shadow 0.2s;
|
||||||
|
}
|
||||||
|
.inline-figure img:hover {
|
||||||
|
box-shadow: 0 4px 16px rgba(0,0,0,0.14);
|
||||||
|
}
|
||||||
|
.inline-figure figcaption {
|
||||||
|
margin-top: 0.4rem;
|
||||||
|
font-size: 0.85rem;
|
||||||
|
color: var(--ink-light);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 图片灯箱 ── */
|
||||||
|
.lightbox-overlay {
|
||||||
|
position: fixed;
|
||||||
|
top: 0;
|
||||||
|
left: 0;
|
||||||
|
right: 0;
|
||||||
|
bottom: 0;
|
||||||
|
z-index: 9999;
|
||||||
|
background: rgba(0, 0, 0, 0.85);
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
cursor: zoom-out;
|
||||||
|
opacity: 0;
|
||||||
|
visibility: hidden;
|
||||||
|
transition: opacity 0.2s, visibility 0.2s;
|
||||||
|
}
|
||||||
|
.lightbox-overlay.active {
|
||||||
|
opacity: 1;
|
||||||
|
visibility: visible;
|
||||||
|
}
|
||||||
|
.lightbox-overlay img {
|
||||||
|
max-width: 95vw;
|
||||||
|
max-height: 95vh;
|
||||||
|
object-fit: contain;
|
||||||
|
border-radius: 4px;
|
||||||
|
box-shadow: 0 0 40px rgba(0, 0, 0, 0.4);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── Benchmark 表格 ── */
|
||||||
|
.benchmarks-table {
|
||||||
|
width: 100%;
|
||||||
|
border-collapse: collapse;
|
||||||
|
margin: 1rem 0;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
}
|
||||||
|
.benchmarks-table th {
|
||||||
|
background: var(--bg);
|
||||||
|
font-weight: 500;
|
||||||
|
padding: 0.5rem 0.8rem;
|
||||||
|
text-align: left;
|
||||||
|
border-bottom: 2px solid var(--border);
|
||||||
|
}
|
||||||
|
.benchmarks-table td {
|
||||||
|
padding: 0.5rem 0.8rem;
|
||||||
|
border-bottom: 1px solid var(--border);
|
||||||
|
}
|
||||||
|
.benchmarks-table .improvement {
|
||||||
|
color: #3d6e3d;
|
||||||
|
font-weight: 500;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 研究动机 ── */
|
||||||
|
.motivation-block p {
|
||||||
|
margin-bottom: 0.8rem;
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,11 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32">
|
||||||
|
<rect width="32" height="32" rx="6" fill="#1B365D"/>
|
||||||
|
<g fill="none" stroke="#f5f4ed" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round">
|
||||||
|
<path d="M8 7h6a2 2 0 0 1 2 2v16l-1-1-2 1-2-1-2 1V9a1 1 0 0 1 1-1z"/>
|
||||||
|
<path d="M24 7h-6a2 2 0 0 0-2 2v16l1-1 2 1 2-1 2 1V9a1 1 0 0 0-1-1z"/>
|
||||||
|
<line x1="12" y1="12" x2="12" y2="12.01"/>
|
||||||
|
<line x1="12" y1="16" x2="12" y2="16.01"/>
|
||||||
|
<line x1="20" y1="12" x2="20" y2="12.01"/>
|
||||||
|
<line x1="20" y1="16" x2="20" y2="16.01"/>
|
||||||
|
</g>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 568 B |
@@ -36,9 +36,17 @@
|
|||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<span class="status-badge status-{{ log.status }}">
|
<span class="status-badge status-{{ log.status }}">
|
||||||
{% if log.status == 'success' %}✓ 成功 {% elif log.status ==
|
{# djlint:off #}
|
||||||
'running' %}⟳ 运行中 {% elif log.status == 'failed' %}✗ 失败 {%
|
{% if log.status == 'success' %}
|
||||||
else %}{{ log.status }}{% endif %}
|
✓ 成功
|
||||||
|
{% elif log.status == 'running' %}
|
||||||
|
⟳ 运行中
|
||||||
|
{% elif log.status == 'failed' %}
|
||||||
|
✗ 失败
|
||||||
|
{% else %}
|
||||||
|
{{ log.status }}
|
||||||
|
{% endif %}
|
||||||
|
{# djlint:on #}
|
||||||
</span>
|
</span>
|
||||||
</td>
|
</td>
|
||||||
<td>{{ log.date or '-' }}</td>
|
<td>{{ log.date or '-' }}</td>
|
||||||
@@ -97,9 +105,17 @@
|
|||||||
<td>{{ job.paper_count or 0 }}</td>
|
<td>{{ job.paper_count or 0 }}</td>
|
||||||
<td>
|
<td>
|
||||||
<span class="status-badge status-{{ job.status }}">
|
<span class="status-badge status-{{ job.status }}">
|
||||||
{% if job.status == 'success' %}✓ 成功 {% elif job.status ==
|
{# djlint:off #}
|
||||||
'running' %}⟳ 运行中 {% elif job.status == 'failed' %}✗ 失败 {%
|
{% if job.status == 'success' %}
|
||||||
else %}{{ job.status }}{% endif %}
|
✓ 成功
|
||||||
|
{% elif job.status == 'running' %}
|
||||||
|
⟳ 运行中
|
||||||
|
{% elif job.status == 'failed' %}
|
||||||
|
✗ 失败
|
||||||
|
{% else %}
|
||||||
|
{{ job.status }}
|
||||||
|
{% endif %}
|
||||||
|
{# djlint:on #}
|
||||||
</span>
|
</span>
|
||||||
</td>
|
</td>
|
||||||
<td class="time-cell">
|
<td class="time-cell">
|
||||||
@@ -345,21 +361,23 @@
|
|||||||
{% endblock %} {% block scripts %}
|
{% endblock %} {% block scripts %}
|
||||||
<script>
|
<script>
|
||||||
function adminAction(action) {
|
function adminAction(action) {
|
||||||
const token = prompt("请输入 Admin Token:");
|
|
||||||
if (!token) return;
|
|
||||||
|
|
||||||
const url = "/admin/" + action;
|
const url = "/admin/" + action;
|
||||||
fetch(url, {
|
fetch(url, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: { "Content-Type": "application/json" },
|
||||||
Authorization: "Bearer " + token,
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
},
|
|
||||||
})
|
})
|
||||||
.then((r) => r.json())
|
.then((r) => {
|
||||||
|
if (r.status === 303 || r.status === 401) {
|
||||||
|
window.location.href = "/admin/login";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
return r.json();
|
||||||
|
})
|
||||||
.then((data) => {
|
.then((data) => {
|
||||||
alert(JSON.stringify(data, null, 2));
|
if (data) {
|
||||||
location.reload();
|
alert(JSON.stringify(data, null, 2));
|
||||||
|
location.reload();
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
alert("请求失败: " + err.message);
|
alert("请求失败: " + err.message);
|
||||||
|
|||||||
@@ -4,7 +4,9 @@
|
|||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>{% block title %}HF Daily Papers{% endblock %}</title>
|
<title>{% block title %}HF Daily Papers{% endblock %}</title>
|
||||||
|
<link rel="icon" type="image/svg+xml" href="/static/favicon.svg" />
|
||||||
<link rel="stylesheet" href="/static/css/style.css" />
|
<link rel="stylesheet" href="/static/css/style.css" />
|
||||||
|
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css" />
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<header class="site-header">
|
<header class="site-header">
|
||||||
@@ -23,7 +25,13 @@
|
|||||||
<a href="/search">搜索</a>
|
<a href="/search">搜索</a>
|
||||||
<a href="/trends">趋势</a>
|
<a href="/trends">趋势</a>
|
||||||
<a href="/reading-list">阅读列表</a>
|
<a href="/reading-list">阅读列表</a>
|
||||||
|
{% if is_admin %}
|
||||||
<a href="/admin/logs">管理</a>
|
<a href="/admin/logs">管理</a>
|
||||||
|
{# The hidden logout form is rendered as this link's next sibling (not an ancestor), so it is reached via nextElementSibling; closest('form') would return null here. #}
<a href="/admin/logout" onclick="event.preventDefault();this.nextElementSibling.submit()">退出</a>
|
||||||
|
<form action="/admin/logout" method="post" style="display:none"></form>
|
||||||
|
{% else %}
|
||||||
|
<a href="/admin/login">管理</a>
|
||||||
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
</nav>
|
</nav>
|
||||||
</header>
|
</header>
|
||||||
|
|||||||
+386
-20
@@ -57,45 +57,158 @@ endblock %} {% block content %}
|
|||||||
<div class="quality-warning">📝 总结部分字段不完整</div>
|
<div class="quality-warning">📝 总结部分字段不完整</div>
|
||||||
{% endif %} {% if paper.summary.one_line %}
|
{% endif %} {% if paper.summary.one_line %}
|
||||||
<section class="summary-section">
|
<section class="summary-section">
|
||||||
<h2>一句话摘要</h2>
|
|
||||||
<p class="one-line">{{ paper.summary.one_line }}</p>
|
<p class="one-line">{{ paper.summary.one_line }}</p>
|
||||||
</section>
|
</section>
|
||||||
{% endif %} {% if paper.summary.difficulty %}
|
{% endif %}
|
||||||
|
|
||||||
|
{# ── 前置知识 ── #}
|
||||||
|
{% if prereqs and prereqs.concepts %}
|
||||||
<section class="summary-section">
|
<section class="summary-section">
|
||||||
<h2>难度</h2>
|
<h2>前置知识</h2>
|
||||||
<p>{{ paper.summary.difficulty }}</p>
|
<div class="prerequisites-list">
|
||||||
|
{% for c in prereqs.concepts %}
|
||||||
|
<div class="concept-card">
|
||||||
|
<h3>{{ c.term }}</h3>
|
||||||
|
<p>{{ c.explanation }}</p>
|
||||||
|
{% if c.why_matters %}
|
||||||
|
<p class="concept-why">{{ c.why_matters }}</p>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
</section>
|
</section>
|
||||||
{% endif %} {% if paper.summary.motivation_problem %}
|
{% endif %}
|
||||||
|
|
||||||
|
{# ── 研究动机 ── #}
|
||||||
|
{% if paper.summary.motivation_problem %}
|
||||||
<section class="summary-section">
|
<section class="summary-section">
|
||||||
<h2>研究动机</h2>
|
<h2>研究动机</h2>
|
||||||
{% if paper.summary.motivation_problem %}
|
<div class="motivation-block">
|
||||||
<p><strong>问题:</strong>{{ paper.summary.motivation_problem }}</p>
|
{% if paper.summary.motivation_problem %}
|
||||||
{% endif %} {% if paper.summary.motivation_goal %}
|
<p>{{ paper.summary.motivation_problem }}</p>
|
||||||
<p><strong>目标:</strong>{{ paper.summary.motivation_goal }}</p>
|
{% endif %}
|
||||||
{% endif %} {% if paper.summary.motivation_gap %}
|
{% if paper.summary.motivation_goal %}
|
||||||
<p><strong>差距:</strong>{{ paper.summary.motivation_gap }}</p>
|
<p>本文的目标是{{ paper.summary.motivation_goal }}</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{% if paper.summary.motivation_gap %}
|
||||||
|
<p>与已有工作不同的是,{{ paper.summary.motivation_gap }}</p>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
</section>
|
</section>
|
||||||
{% endif %} {% if paper.summary.method_key_idea %}
|
{% endif %}
|
||||||
|
|
||||||
|
{# ── 核心方法 ── #}
|
||||||
|
{% if paper.summary.method_key_idea %}
|
||||||
<section class="summary-section">
|
<section class="summary-section">
|
||||||
<h2>核心方法</h2>
|
<h2>核心方法</h2>
|
||||||
{% if paper.summary.method_overview %}
|
{% if paper.summary.method_overview %}
|
||||||
<p>{{ paper.summary.method_overview }}</p>
|
<p>{{ paper.summary.method_overview }}</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<p><strong>关键思路:</strong>{{ paper.summary.method_key_idea }}</p>
|
<div class="key-idea">
|
||||||
|
<p>{{ paper.summary.method_key_idea }}</p>
|
||||||
|
</div>
|
||||||
|
{% if paper.summary.method_steps_json %}
|
||||||
|
<details>
|
||||||
|
<summary>方法步骤详情</summary>
|
||||||
|
<p>{{ paper.summary.method_steps_json }}</p>
|
||||||
|
</details>
|
||||||
|
{% endif %}
|
||||||
{% if paper.summary.method_novelty %}
|
{% if paper.summary.method_novelty %}
|
||||||
<p><strong>新颖性:</strong>{{ paper.summary.method_novelty }}</p>
|
<details>
|
||||||
|
<summary>技术新颖性</summary>
|
||||||
|
<p>{{ paper.summary.method_novelty }}</p>
|
||||||
|
</details>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</section>
|
</section>
|
||||||
{% endif %} {% if paper.summary.results_main_json %}
|
{% endif %}
|
||||||
|
|
||||||
|
{# ── 实验结果 ── #}
|
||||||
|
{% if paper.summary.results_main_json %}
|
||||||
<section class="summary-section">
|
<section class="summary-section">
|
||||||
<h2>实验结果</h2>
|
<h2>实验结果</h2>
|
||||||
<p>{{ paper.summary.results_main_json }}</p>
|
<p>{{ paper.summary.results_main_json }}</p>
|
||||||
|
{% if table_figures and table_figures|length > 0 %}
|
||||||
|
{# 优先展示原文表格截图 #}
|
||||||
|
{% for tf in table_figures %}
|
||||||
|
<figure class="inline-figure table-screenshot">
|
||||||
|
<img src="{{ tf.image_url }}" alt="{{ tf.caption or tf.id }}" loading="lazy" />
|
||||||
|
<figcaption>
|
||||||
|
<strong>{{ tf.id }}</strong>{% if tf.caption %}: {{ tf.caption }}{% endif %}
|
||||||
|
</figcaption>
|
||||||
|
</figure>
|
||||||
|
{% endfor %}
|
||||||
|
{% if benchmarks and benchmarks|length > 0 %}
|
||||||
|
<details>
|
||||||
|
<summary>查看结构化数据</summary>
|
||||||
|
<table class="benchmarks-table">
|
||||||
|
<thead>
|
||||||
|
<tr><th>任务</th><th>指标</th><th>本文</th><th>基线</th><th>提升</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for b in benchmarks %}
|
||||||
|
{% if b is mapping %}
|
||||||
|
<tr>
|
||||||
|
<td>{{ b.get('task','') }}</td>
|
||||||
|
<td>{{ b.get('metric','') }}</td>
|
||||||
|
<td><strong>{{ b.get('this_work','') }}</strong></td>
|
||||||
|
<td>{{ b.get('baseline','') }}</td>
|
||||||
|
<td class="improvement">{{ b.get('improvement','') }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</details>
|
||||||
|
{% endif %}
|
||||||
|
{% elif benchmarks and benchmarks|length > 0 %}
|
||||||
|
{# 无截图时回退到 HTML 表格 #}
|
||||||
|
<table class="benchmarks-table">
|
||||||
|
<thead>
|
||||||
|
<tr><th>任务</th><th>指标</th><th>本文</th><th>基线</th><th>提升</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for b in benchmarks %}
|
||||||
|
{% if b is mapping %}
|
||||||
|
<tr>
|
||||||
|
<td>{{ b.get('task','') }}</td>
|
||||||
|
<td>{{ b.get('metric','') }}</td>
|
||||||
|
<td><strong>{{ b.get('this_work','') }}</strong></td>
|
||||||
|
<td>{{ b.get('baseline','') }}</td>
|
||||||
|
<td class="improvement">{{ b.get('improvement','') }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
{% endif %}
|
||||||
</section>
|
</section>
|
||||||
{% endif %} {% if paper.summary.limitations_json %}
|
{% endif %}
|
||||||
|
|
||||||
|
{# ── 局限与改进 ── #}
|
||||||
|
{# Render the section when ANY of the four fields it displays is present, including reproducibility, which has its own <details> block inside the section. #}
{% if paper.summary.limitations_json or paper.summary.weaknesses_json or paper.summary.future_work_json or paper.summary.reproducibility %}
|
||||||
<section class="summary-section">
|
<section class="summary-section">
|
||||||
<h2>局限与改进</h2>
|
<h2>局限与改进</h2>
|
||||||
|
{% if paper.summary.limitations_json %}
|
||||||
<p>{{ paper.summary.limitations_json }}</p>
|
<p>{{ paper.summary.limitations_json }}</p>
|
||||||
|
{% endif %}
|
||||||
|
{% if paper.summary.weaknesses_json %}
|
||||||
|
<details>
|
||||||
|
<summary>独立分析的弱点</summary>
|
||||||
|
<p>{{ paper.summary.weaknesses_json }}</p>
|
||||||
|
</details>
|
||||||
|
{% endif %}
|
||||||
|
{% if paper.summary.future_work_json %}
|
||||||
|
<details>
|
||||||
|
<summary>未来方向</summary>
|
||||||
|
<p>{{ paper.summary.future_work_json }}</p>
|
||||||
|
</details>
|
||||||
|
{% endif %}
|
||||||
|
{% if paper.summary.reproducibility %}
|
||||||
|
<details>
|
||||||
|
<summary>复现评估</summary>
|
||||||
|
<p>{{ paper.summary.reproducibility }}</p>
|
||||||
|
</details>
|
||||||
|
{% endif %}
|
||||||
</section>
|
</section>
|
||||||
{% endif %} {% elif summary_state == 'processing' %}
|
{% endif %} {% elif summary_state == 'processing' %}
|
||||||
<div class="summary-placeholder processing">
|
<div class="summary-placeholder processing">
|
||||||
@@ -123,9 +236,30 @@ endblock %} {% block content %}
|
|||||||
<h2>Abstract</h2>
|
<h2>Abstract</h2>
|
||||||
<p class="abstract-en">{{ paper.abstract }}</p>
|
<p class="abstract-en">{{ paper.abstract }}</p>
|
||||||
</section>
|
</section>
|
||||||
{% endif %} {# 图片画廊 #} {% if paper_images %}
|
{% endif %}
|
||||||
|
|
||||||
|
{# ── 论文图表(关联 figures 元数据)── #}
|
||||||
|
{% if figures or paper_images %}
|
||||||
<section class="image-gallery">
|
<section class="image-gallery">
|
||||||
<h2>论文图片</h2>
|
<h2>论文图表</h2>
|
||||||
|
{% for fig in figures %}
|
||||||
|
<figure class="inline-figure">
|
||||||
|
{% if fig.image_url %}
|
||||||
|
<img src="{{ fig.image_url }}" alt="{{ fig.caption or fig.id }}" loading="lazy" />
|
||||||
|
{% endif %}
|
||||||
|
<figcaption>
|
||||||
|
<strong>{{ fig.id }}</strong>{% if fig.caption %}: {{ fig.caption }}{% endif %}
|
||||||
|
{% if fig.description %}
|
||||||
|
<p>{{ fig.description }}</p>
|
||||||
|
{% endif %}
|
||||||
|
{% if fig.reason %}
|
||||||
|
<p class="concept-why">{{ fig.reason }}</p>
|
||||||
|
{% endif %}
|
||||||
|
</figcaption>
|
||||||
|
</figure>
|
||||||
|
{% endfor %}
|
||||||
|
{# 如果有图片但没有对应的 figures 元数据,仍然展示 #}
|
||||||
|
{% if not figures and paper_images %}
|
||||||
<div class="gallery-grid">
|
<div class="gallery-grid">
|
||||||
{% for img in paper_images %}
|
{% for img in paper_images %}
|
||||||
<div class="gallery-item">
|
<div class="gallery-item">
|
||||||
@@ -134,8 +268,9 @@ endblock %} {% block content %}
|
|||||||
</div>
|
</div>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
|
{% endif %}
|
||||||
</section>
|
</section>
|
||||||
{% endif %} {# 相似论文推荐 #} {% if similar_papers %}
|
{% endif %} {% if similar_papers %}
|
||||||
<section class="similar-papers">
|
<section class="similar-papers">
|
||||||
<h2>相似论文推荐</h2>
|
<h2>相似论文推荐</h2>
|
||||||
{% for sp in similar_papers %}
|
{% for sp in similar_papers %}
|
||||||
@@ -152,3 +287,234 @@ endblock %} {% block content %}
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
</article>
|
</article>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
||||||
|
{% block scripts %}
|
||||||
|
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
|
||||||
|
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js"
|
||||||
|
onload="renderMathInElement(document.querySelector('.paper-detail'),{delimiters:[{left:'$$',right:'$$',display:true},{left:'$',right:'$',display:false}]});">
|
||||||
|
</script>
|
||||||
|
<style>
|
||||||
|
.lightbox-overlay {
|
||||||
|
position: fixed !important;
|
||||||
|
top: 0 !important;
|
||||||
|
left: 0 !important;
|
||||||
|
right: 0 !important;
|
||||||
|
bottom: 0 !important;
|
||||||
|
width: 100vw !important;
|
||||||
|
height: 100vh !important;
|
||||||
|
z-index: 99999 !important;
|
||||||
|
background: rgba(0, 0, 0, 0.85);
|
||||||
|
overflow: hidden;
|
||||||
|
margin: 0 !important;
|
||||||
|
padding: 0 !important;
|
||||||
|
opacity: 0;
|
||||||
|
transition: opacity 0.2s;
|
||||||
|
}
|
||||||
|
.lightbox-overlay.active {
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
/* Lightbox image: positioned purely by the JS-driven transform
   (translate + scale, origin at the image's top-left corner). */
.lightbox-overlay img {
  position: absolute;
  /* Pin the untransformed box to the overlay's top-left corner. Without
     explicit offsets the image takes its static position, which the
     stylesheet's centred flex layout on .lightbox-overlay would place in
     the middle, and the script's translate() would then be applied on top. */
  top: 0;
  left: 0;
  /* The script sizes the image from naturalWidth/naturalHeight, so undo the
     95vw/95vh caps that style.css applies to .lightbox-overlay img. */
  max-width: none;
  max-height: none;
  transform-origin: 0 0;
  border-radius: 4px;
  box-shadow: 0 0 40px rgba(0, 0, 0, 0.4);
  cursor: grab;
  user-select: none;
  -webkit-user-drag: none;
}
|
||||||
|
.lightbox-overlay img.dragging {
|
||||||
|
cursor: grabbing;
|
||||||
|
}
|
||||||
|
/* 工具栏 */
|
||||||
|
.lightbox-toolbar {
|
||||||
|
position: absolute;
|
||||||
|
bottom: 24px;
|
||||||
|
left: 50%;
|
||||||
|
transform: translateX(-50%);
|
||||||
|
display: flex;
|
||||||
|
gap: 8px;
|
||||||
|
background: rgba(0, 0, 0, 0.6);
|
||||||
|
padding: 8px 14px;
|
||||||
|
border-radius: 24px;
|
||||||
|
z-index: 100000;
|
||||||
|
}
|
||||||
|
.lightbox-toolbar button {
|
||||||
|
background: none;
|
||||||
|
border: 1px solid rgba(255,255,255,0.3);
|
||||||
|
color: #fff;
|
||||||
|
width: 36px;
|
||||||
|
height: 36px;
|
||||||
|
border-radius: 50%;
|
||||||
|
font-size: 1.1rem;
|
||||||
|
cursor: pointer;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
transition: background 0.15s;
|
||||||
|
}
|
||||||
|
.lightbox-toolbar button:hover {
|
||||||
|
background: rgba(255,255,255,0.15);
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
<script>
|
||||||
|
(function() {
|
||||||
|
function openLightbox(src, alt) {
|
||||||
|
var existing = document.querySelector('.lightbox-overlay');
|
||||||
|
if (existing) existing.remove();
|
||||||
|
|
||||||
|
var overlay = document.createElement('div');
|
||||||
|
overlay.className = 'lightbox-overlay';
|
||||||
|
|
||||||
|
var img = document.createElement('img');
|
||||||
|
img.src = src;
|
||||||
|
img.alt = alt || '';
|
||||||
|
img.draggable = false;
|
||||||
|
|
||||||
|
// 工具栏
|
||||||
|
var toolbar = document.createElement('div');
|
||||||
|
toolbar.className = 'lightbox-toolbar';
|
||||||
|
toolbar.innerHTML =
|
||||||
|
'<button title="缩小">−</button>' +
|
||||||
|
'<button title="放大">+</button>' +
|
||||||
|
'<button title="适合窗口">⊡</button>' +
|
||||||
|
'<button title="原始大小">1:1</button>' +
|
||||||
|
'<button title="关闭">✕</button>';
|
||||||
|
|
||||||
|
overlay.appendChild(img);
|
||||||
|
overlay.appendChild(toolbar);
|
||||||
|
document.body.appendChild(overlay);
|
||||||
|
|
||||||
|
// 视图状态
|
||||||
|
var scale = 1, tx = 0, ty = 0;
|
||||||
|
var baseW = 0, baseH = 0;
|
||||||
|
var dragging = false, dragStartX = 0, dragStartY = 0, startTx = 0, startTy = 0;
|
||||||
|
|
||||||
|
function apply() {
|
||||||
|
img.style.transform = 'translate(' + tx + 'px,' + ty + 'px) scale(' + scale + ')';
|
||||||
|
}
|
||||||
|
|
||||||
|
function fitToScreen() {
|
||||||
|
if (!baseW) return;
|
||||||
|
var sw = window.innerWidth, sh = window.innerHeight;
|
||||||
|
scale = Math.min(sw * 0.9 / baseW, sh * 0.9 / baseH, 1);
|
||||||
|
tx = (sw - baseW * scale) / 2;
|
||||||
|
ty = (sh - baseH * scale) / 2;
|
||||||
|
apply();
|
||||||
|
}
|
||||||
|
|
||||||
|
function resetOrigin() {
|
||||||
|
scale = 1;
|
||||||
|
tx = (window.innerWidth - baseW) / 2;
|
||||||
|
ty = (window.innerHeight - baseH) / 2;
|
||||||
|
apply();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Multiply the current zoom by `factor` while keeping the overlay point
 * (cx, cy) visually fixed.
 *
 * Reads and updates the enclosing `scale`, `tx` and `ty` view state, then
 * calls `apply()` to push the new transform onto the image.
 *
 * @param {number} factor Multiplier applied to the current scale.
 * @param {number} cx     X coordinate of the anchor point, in overlay pixels.
 * @param {number} cy     Y coordinate of the anchor point, in overlay pixels.
 */
function zoomAt(factor, cx, cy) {
  // Clamp the resulting scale to the 0.1x – 20x range.
  var newScale = Math.max(0.1, Math.min(scale * factor, 20));
  var ratio = newScale / scale;
  // Keep the image point under (cx, cy) stationary: its distance from the
  // translation origin grows by `ratio` on both axes.
  tx = cx - (cx - tx) * ratio;
  ty = cy - (cy - ty) * ratio;
  scale = newScale;
  apply();
}
|
||||||
|
|
||||||
|
function zoomCenter(factor) {
|
||||||
|
var cx = window.innerWidth / 2;
|
||||||
|
var cy = window.innerHeight / 2;
|
||||||
|
var newScale = Math.max(0.1, Math.min(scale * factor, 20));
|
||||||
|
tx = cx - (cx - tx) * (newScale / scale);
|
||||||
|
ty = cy - (cy - ty) * (newScale / scale);
|
||||||
|
scale = newScale;
|
||||||
|
apply();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 图片加载后初始化
|
||||||
|
img.onload = function() {
|
||||||
|
baseW = img.naturalWidth;
|
||||||
|
baseH = img.naturalHeight;
|
||||||
|
fitToScreen();
|
||||||
|
};
|
||||||
|
// 如果已缓存
|
||||||
|
if (img.complete && img.naturalWidth) {
|
||||||
|
baseW = img.naturalWidth;
|
||||||
|
baseH = img.naturalHeight;
|
||||||
|
fitToScreen();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 工具栏按钮
|
||||||
|
var btns = toolbar.querySelectorAll('button');
|
||||||
|
// 缩小 / 放大 / 适合 / 原始 / 关闭
|
||||||
|
btns[0].onclick = function(e) { e.stopPropagation(); zoomCenter(0.7); };
|
||||||
|
btns[1].onclick = function(e) { e.stopPropagation(); zoomCenter(1.4); };
|
||||||
|
btns[2].onclick = function(e) { e.stopPropagation(); fitToScreen(); };
|
||||||
|
btns[3].onclick = function(e) { e.stopPropagation(); resetOrigin(); };
|
||||||
|
btns[4].onclick = function(e) { e.stopPropagation(); close(); };
|
||||||
|
|
||||||
|
// 滚轮缩放(以鼠标为中心)
|
||||||
|
overlay.addEventListener('wheel', function(e) {
|
||||||
|
e.preventDefault();
|
||||||
|
var factor = e.deltaY < 0 ? 1.15 : 0.87;
|
||||||
|
var rect = overlay.getBoundingClientRect();
|
||||||
|
var cx = e.clientX - rect.left;
|
||||||
|
var cy = e.clientY - rect.top;
|
||||||
|
var newScale = Math.max(0.1, Math.min(scale * factor, 20));
|
||||||
|
tx = cx - (cx - tx) * (newScale / scale);
|
||||||
|
ty = cy - (cy - ty) * (newScale / scale);
|
||||||
|
scale = newScale;
|
||||||
|
apply();
|
||||||
|
}, { passive: false });
|
||||||
|
|
||||||
|
// 拖拽平移
|
||||||
|
overlay.addEventListener('pointerdown', function(e) {
|
||||||
|
if (e.target.closest('.lightbox-toolbar')) return;
|
||||||
|
dragging = true;
|
||||||
|
dragStartX = e.clientX;
|
||||||
|
dragStartY = e.clientY;
|
||||||
|
startTx = tx;
|
||||||
|
startTy = ty;
|
||||||
|
img.classList.add('dragging');
|
||||||
|
overlay.setPointerCapture(e.pointerId);
|
||||||
|
});
|
||||||
|
overlay.addEventListener('pointermove', function(e) {
|
||||||
|
if (!dragging) return;
|
||||||
|
tx = startTx + (e.clientX - dragStartX);
|
||||||
|
ty = startTy + (e.clientY - dragStartY);
|
||||||
|
apply();
|
||||||
|
});
|
||||||
|
// Stop panning when the pointer is released or when the browser cancels the
// gesture (pointercancel); otherwise `dragging` would stay true and the image
// would keep following later pointer moves.
function endDrag() {
  dragging = false;
  img.classList.remove('dragging');
}
overlay.addEventListener('pointerup', endDrag);
overlay.addEventListener('pointercancel', endDrag);
|
||||||
|
|
||||||
|
// ESC 关闭
|
||||||
|
function onKey(e) {
|
||||||
|
if (e.key === 'Escape') { close(); }
|
||||||
|
else if (e.key === '+' || e.key === '=') { zoomCenter(1.4); }
|
||||||
|
else if (e.key === '-') { zoomCenter(0.7); }
|
||||||
|
else if (e.key === '0') { fitToScreen(); }
|
||||||
|
}
|
||||||
|
|
||||||
|
function close() {
|
||||||
|
overlay.remove();
|
||||||
|
document.removeEventListener('keydown', onKey);
|
||||||
|
}
|
||||||
|
|
||||||
|
document.addEventListener('keydown', onKey);
|
||||||
|
|
||||||
|
// 激活动画
|
||||||
|
requestAnimationFrame(function() {
|
||||||
|
overlay.classList.add('active');
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
document.addEventListener('click', function(e) {
|
||||||
|
var img = e.target;
|
||||||
|
if (img.tagName !== 'IMG') return;
|
||||||
|
if (!img.closest('.inline-figure') && !img.closest('.gallery-item')) return;
|
||||||
|
if (img.closest('.lightbox-overlay')) return;
|
||||||
|
e.preventDefault();
|
||||||
|
openLightbox(img.src, img.alt);
|
||||||
|
});
|
||||||
|
})();
|
||||||
|
</script>
|
||||||
|
{% endblock %}
|
||||||
|
|||||||
@@ -0,0 +1,150 @@
|
|||||||
|
{% extends "base.html" %}
|
||||||
|
{% block title %}登录 — HF Daily Papers{% endblock %}
|
||||||
|
{% block content %}
|
||||||
|
<div class="login-page">
|
||||||
|
<div class="login-card">
|
||||||
|
<div class="login-header">
|
||||||
|
<h1 class="login-title">🔑 管理员登录</h1>
|
||||||
|
<p class="login-subtitle">请输入管理员账号和密码</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if error %}
|
||||||
|
<div class="login-error">
|
||||||
|
{{ error }}
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<form class="login-form" action="/admin/login" method="post">
|
||||||
|
<div class="login-field">
|
||||||
|
<label for="username">用户名</label>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
id="username"
|
||||||
|
name="username"
|
||||||
|
placeholder="请输入用户名"
|
||||||
|
required
|
||||||
|
autofocus
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<div class="login-field">
|
||||||
|
<label for="password">密码</label>
|
||||||
|
<input
|
||||||
|
type="password"
|
||||||
|
id="password"
|
||||||
|
name="password"
|
||||||
|
placeholder="请输入密码"
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<button type="submit" class="login-btn">登 录</button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.login-page {
|
||||||
|
display: flex;
|
||||||
|
justify-content: center;
|
||||||
|
align-items: center;
|
||||||
|
min-height: 60vh;
|
||||||
|
padding: 40px 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-card {
|
||||||
|
width: 100%;
|
||||||
|
max-width: 400px;
|
||||||
|
background: var(--surface);
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
border-radius: var(--radius-lg);
|
||||||
|
padding: 36px 32px;
|
||||||
|
box-shadow: 0 4px 24px var(--shadow);
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-header {
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 28px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-title {
|
||||||
|
font-family: var(--font-body);
|
||||||
|
font-size: 1.4rem;
|
||||||
|
font-weight: 700;
|
||||||
|
color: var(--ink);
|
||||||
|
margin: 0 0 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-subtitle {
|
||||||
|
font-size: 0.9rem;
|
||||||
|
color: var(--ink-light);
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-error {
|
||||||
|
background: #fce4ec;
|
||||||
|
color: #c62828;
|
||||||
|
padding: 10px 14px;
|
||||||
|
border-radius: var(--radius);
|
||||||
|
font-size: 0.85rem;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-form {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 18px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-field label {
|
||||||
|
display: block;
|
||||||
|
font-size: 0.85rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: var(--ink);
|
||||||
|
margin-bottom: 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-field input {
|
||||||
|
width: 100%;
|
||||||
|
padding: 10px 14px;
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
border-radius: var(--radius);
|
||||||
|
font-size: 0.9rem;
|
||||||
|
font-family: var(--font-sans);
|
||||||
|
background: var(--bg);
|
||||||
|
color: var(--ink);
|
||||||
|
transition: border-color 0.2s;
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-field input:focus {
|
||||||
|
outline: none;
|
||||||
|
border-color: var(--accent);
|
||||||
|
box-shadow: 0 0 0 3px rgba(27, 54, 93, 0.1);
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-btn {
|
||||||
|
width: 100%;
|
||||||
|
padding: 12px;
|
||||||
|
background: var(--accent);
|
||||||
|
color: #fff;
|
||||||
|
border: none;
|
||||||
|
border-radius: var(--radius);
|
||||||
|
font-size: 0.95rem;
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: background 0.2s;
|
||||||
|
font-family: var(--font-sans);
|
||||||
|
margin-top: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.login-btn:hover {
|
||||||
|
background: var(--accent-hover);
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 480px) {
|
||||||
|
.login-card {
|
||||||
|
padding: 28px 20px;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
{% endblock %}
|
||||||
@@ -34,18 +34,31 @@
|
|||||||
<span
|
<span
|
||||||
class="summary-badge summary-{{ paper.summary_status.status if paper.summary_status else 'none' }}"
|
class="summary-badge summary-{{ paper.summary_status.status if paper.summary_status else 'none' }}"
|
||||||
>
|
>
|
||||||
{% if not paper.summary_status or paper.summary_status.status ==
|
{# djlint:off #}
|
||||||
'pending' %} 未总结 {% elif paper.summary_status.status == 'processing'
|
{% if not paper.summary_status or paper.summary_status.status == 'pending' %}
|
||||||
%} 🔄 总结中 {% elif paper.summary_status.status == 'failed' or
|
未总结
|
||||||
paper.summary_status.status == 'permanent_failure' %} ❌ 总结失败 {%
|
{% elif paper.summary_status.status == 'processing' %}
|
||||||
elif paper.summary_status.status == 'done' %} ✅ 已总结 {% endif %}
|
🔄 总结中
|
||||||
|
{% elif paper.summary_status.status == 'failed' or paper.summary_status.status == 'permanent_failure' %}
|
||||||
|
❌ 总结失败
|
||||||
|
{% elif paper.summary_status.status == 'done' %}
|
||||||
|
✅ 已总结
|
||||||
|
{% endif %}
|
||||||
|
{# djlint:on #}
|
||||||
</span>
|
</span>
|
||||||
{% if paper.reading_status %}
|
{% if paper.reading_status %}
|
||||||
<span class="reading-badge reading-{{ paper.reading_status.status }}">
|
<span class="reading-badge reading-{{ paper.reading_status.status }}">
|
||||||
{% if paper.reading_status.status == 'unread' %}未读 {% elif
|
{# djlint:off #}
|
||||||
paper.reading_status.status == 'skimmed' %}已浏览 {% elif
|
{% if paper.reading_status.status == 'unread' %}
|
||||||
paper.reading_status.status == 'read_summary' %}已读摘要 {% elif
|
未读
|
||||||
paper.reading_status.status == 'read_full' %}已读原文 {% endif %}
|
{% elif paper.reading_status.status == 'skimmed' %}
|
||||||
|
已浏览
|
||||||
|
{% elif paper.reading_status.status == 'read_summary' %}
|
||||||
|
已读摘要
|
||||||
|
{% elif paper.reading_status.status == 'read_full' %}
|
||||||
|
已读原文
|
||||||
|
{% endif %}
|
||||||
|
{# djlint:on #}
|
||||||
</span>
|
</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
+13
-22
@@ -22,16 +22,7 @@ endblock %} {% block content %}
|
|||||||
type="radio"
|
type="radio"
|
||||||
name="mode"
|
name="mode"
|
||||||
value="keyword"
|
value="keyword"
|
||||||
{%
|
{% if mode == "keyword" or not mode %}checked{% endif %}
|
||||||
if
|
|
||||||
mode=""
|
|
||||||
="keyword"
|
|
||||||
or
|
|
||||||
not
|
|
||||||
mode
|
|
||||||
%}checked{%
|
|
||||||
endif
|
|
||||||
%}
|
|
||||||
/>
|
/>
|
||||||
关键词
|
关键词
|
||||||
</label>
|
</label>
|
||||||
@@ -40,13 +31,7 @@ endblock %} {% block content %}
|
|||||||
type="radio"
|
type="radio"
|
||||||
name="mode"
|
name="mode"
|
||||||
value="semantic"
|
value="semantic"
|
||||||
{%
|
{% if mode == "semantic" %}checked{% endif %}
|
||||||
if
|
|
||||||
mode=""
|
|
||||||
="semantic"
|
|
||||||
%}checked{%
|
|
||||||
endif
|
|
||||||
%}
|
|
||||||
/>
|
/>
|
||||||
语义搜索
|
语义搜索
|
||||||
</label>
|
</label>
|
||||||
@@ -142,11 +127,17 @@ endblock %} {% block content %}
|
|||||||
<span
|
<span
|
||||||
class="summary-badge summary-{{ paper.summary_status.status if paper.summary_status else 'none' }}"
|
class="summary-badge summary-{{ paper.summary_status.status if paper.summary_status else 'none' }}"
|
||||||
>
|
>
|
||||||
{% if not paper.summary_status or paper.summary_status.status ==
|
{# djlint:off #}
|
||||||
'pending' %} 未总结 {% elif paper.summary_status.status ==
|
{% if not paper.summary_status or paper.summary_status.status == 'pending' %}
|
||||||
'processing' %} 🔄 总结中 {% elif paper.summary_status.status in
|
未总结
|
||||||
('failed', 'permanent_failure') %} ❌ 总结失败 {% elif
|
{% elif paper.summary_status.status == 'processing' %}
|
||||||
paper.summary_status.status == 'done' %} ✅ 已总结 {% endif %}
|
🔄 总结中
|
||||||
|
{% elif paper.summary_status.status in ('failed', 'permanent_failure') %}
|
||||||
|
❌ 总结失败
|
||||||
|
{% elif paper.summary_status.status == 'done' %}
|
||||||
|
✅ 已总结
|
||||||
|
{% endif %}
|
||||||
|
{# djlint:on #}
|
||||||
</span>
|
</span>
|
||||||
<a href="/paper/{{ paper.arxiv_id }}" class="btn-detail">详情 →</a>
|
<a href="/paper/{{ paper.arxiv_id }}" class="btn-detail">详情 →</a>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
+12
-12
@@ -32,20 +32,20 @@ endblock %} {% block content %}
|
|||||||
{% endblock %} {% block scripts %}
|
{% endblock %} {% block scripts %}
|
||||||
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.7/dist/chart.umd.min.js"></script>
|
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.7/dist/chart.umd.min.js"></script>
|
||||||
<script>
|
<script>
|
||||||
// 颜色配置(kami 风格墨蓝色系)
|
// 颜色配置(Kami ink-blue 暖调色系)
|
||||||
const COLORS = {
|
const COLORS = {
|
||||||
primary: '#2d5f8a',
|
primary: '#1B365D',
|
||||||
primaryLight: 'rgba(45, 95, 138, 0.2)',
|
primaryLight: 'rgba(27, 54, 93, 0.12)',
|
||||||
accent: '#5a9bc7',
|
accent: '#2a4d7a',
|
||||||
success: '#388e3c',
|
success: '#3d6e3d',
|
||||||
warning: '#f57f17',
|
warning: '#7a6430',
|
||||||
danger: '#c62828',
|
danger: '#8c2828',
|
||||||
muted: '#4a4a6a',
|
muted: '#6b6a64',
|
||||||
palette: [
|
palette: [
|
||||||
'#2d5f8a', '#5a9bc7', '#388e3c', '#f57f17', '#c62828',
|
'#1B365D', '#2a4d7a', '#3d6e3d', '#7a6430', '#8c2828',
|
||||||
'#7b1fa2', '#00838f', '#ef6c00', '#455a64', '#827717',
|
'#4a4070', '#2d6b6e', '#8a5a2a', '#504e49', '#5c6030',
|
||||||
'#1565c0', '#ad1457', '#00695c', '#e65100', '#283593',
|
'#2b4a80', '#70304a', '#2d5e56', '#7a4a10', '#353a60',
|
||||||
'#9e9d24', '#6a1b9a', '#00838f', '#4e342e', '#37474f',
|
'#6a6a28', '#552a5a', '#2d6b6e', '#4a3828', '#3d4450',
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
+11
-1
@@ -19,7 +19,17 @@ TMP_DIR = DATA_DIR / "tmp"
|
|||||||
|
|
||||||
# ── 模板单例 ──────────────────────────────────────────────────────────
|
# ── 模板单例 ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
templates = Jinja2Templates(directory="app/templates")
|
|
||||||
|
class _Templates(Jinja2Templates):
    """Jinja2Templates subclass that injects ``is_admin`` into every template context."""

    def TemplateResponse(self, request, name, context=None, **kwargs):
        """Render ``name`` with ``is_admin`` filled in from the session when absent.

        Args:
            request: The incoming request; its ``session`` mapping is read.
            name: Template name passed through to the parent class.
            context: Optional template context. A falsy value (``None`` or an
                empty mapping) is replaced by a fresh dict; a non-empty dict
                is updated in place.
            **kwargs: Forwarded unchanged to the parent ``TemplateResponse``.

        Returns:
            Whatever the parent class's ``TemplateResponse`` returns.
        """
        if not context:
            context = {}
        # Read the session flag unconditionally so that session access happens
        # on every render, whether or not the caller supplied ``is_admin``.
        admin_flag = request.session.get("is_admin", False)
        if "is_admin" not in context:
            context["is_admin"] = admin_flag
        return super().TemplateResponse(request, name, context, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
templates = _Templates(directory="app/templates")
|
||||||
|
|
||||||
|
|
||||||
# ── 时区工具 ──────────────────────────────────────────────────────────
|
# ── 时区工具 ──────────────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ dependencies = [
|
|||||||
"python-dotenv>=1.0",
|
"python-dotenv>=1.0",
|
||||||
"apscheduler>=3.10",
|
"apscheduler>=3.10",
|
||||||
"chromadb>=1.0",
|
"chromadb>=1.0",
|
||||||
|
"pymupdf>=1.25",
|
||||||
|
"itsdangerous>=2.2.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
@@ -0,0 +1,117 @@
|
|||||||
|
"""验证 summary JSON 是否符合 SummarySchema 要求。
|
||||||
|
|
||||||
|
用法:python scripts/validate_summary.py <json_file>
|
||||||
|
返回:exit 0 = 通过,exit 1 = 失败(错误信息输出到 stdout)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def _check_paragraphs(section_name: str, section: object, fields: list[str], errors: list[str]) -> bool:
    """Check that every listed field of one section is a detailed paragraph.

    Args:
        section_name: Top-level key of the section, used in error messages.
        section: The value found under that key (expected to be a dict).
        fields: Sub-field names that must be strings of at least 50
            characters after stripping whitespace.
        errors: Error list that is appended to in place.

    Returns:
        True when ``section`` is a dict (so further per-section checks are
        safe), False when it is not (an error has been recorded).
    """
    if not isinstance(section, dict):
        errors.append(f"{section_name} 必须是对象")
        return False
    for field in fields:
        val = section.get(field, "")
        if not isinstance(val, str) or len(val.strip()) < 50:
            errors.append(
                f"{section_name}.{field} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)"
            )
    return True


def validate(path: str) -> list[str]:
    """Validate a summary JSON file and return a list of error messages.

    Args:
        path: Path of the JSON file to check.

    Returns:
        A list of human-readable error strings; an empty list means the
        file passed every check. Unreadable files and malformed JSON are
        reported as a single-element list rather than raised.
    """
    errors: list[str] = []
    try:
        data = json.loads(Path(path).read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        return [f"JSON 解析失败: {e}"]
    except (OSError, UnicodeDecodeError) as e:
        # Missing/unreadable/non-UTF-8 files are validation failures too;
        # report them through the normal error list instead of a traceback.
        return [f"无法读取文件: {e}"]

    if not isinstance(data, dict):
        return ["顶层必须是 JSON 对象 (dict)"]

    # Required top-level fields must be present and truthy.
    required_top = ["arxiv_id", "title_zh", "one_line", "tags"]
    for f in required_top:
        if f not in data or not data[f]:
            errors.append(f"缺少必填字段: {f}")

    # tags must be a non-empty array.
    tags = data.get("tags")
    if not isinstance(tags, list):
        errors.append("tags 必须是数组")
    elif len(tags) == 0:
        errors.append("tags 不能为空数组")

    # Sections whose sub-fields must all be detailed string paragraphs.
    paragraph_sections = [
        ("motivation", ["problem", "goal", "gap"]),
        ("method", ["overview", "key_idea", "steps", "novelty"]),
        ("results", ["main_findings", "limitations"]),
        ("improvements", ["weaknesses", "future_work", "reproducibility"]),
    ]
    for section_name, fields in paragraph_sections:
        section = data.get(section_name, {})
        is_object = _check_paragraphs(section_name, section, fields, errors)
        if is_object and section_name == "results":
            # benchmarks is optional, but when present it must be an array.
            benchmarks = section.get("benchmarks")
            if benchmarks is not None and not isinstance(benchmarks, list):
                errors.append("results.benchmarks 必须是数组")

    # Flag paragraph fields that were written as arrays instead of strings.
    for section_name, fields in paragraph_sections:
        section = data.get(section_name)
        if not isinstance(section, dict):
            # Absent or wrongly-typed section: already reported above, and
            # it has no .get(), so it must not be dereferenced here.
            continue
        for field in fields:
            if isinstance(section.get(field), list):
                errors.append(f"{section_name}.{field} 应该是字符串段落,不能是数组")

    # figures is optional; when present it must be an array, and dict
    # entries must carry a non-empty id.
    figures = data.get("figures")
    if figures is not None:
        if not isinstance(figures, list):
            errors.append("figures 必须是数组")
        else:
            for i, fig in enumerate(figures):
                if isinstance(fig, dict) and not fig.get("id"):
                    errors.append(f"figures[{i}] 缺少 id 字段")

    return errors
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Exactly one positional argument (the JSON file path) is expected.
    if len(sys.argv) != 2:
        print("用法: python scripts/validate_summary.py <json_file>")
        sys.exit(1)

    problems = validate(sys.argv[1])

    # Success path first: nothing to report, exit status 0.
    if not problems:
        print("✅ 验证通过")
        sys.exit(0)

    # Failure path: list every problem on stdout, exit status 1.
    print("❌ 验证失败:")
    for problem in problems:
        print(f" - {problem}")
    sys.exit(1)
|
||||||
+50
-39
@@ -87,7 +87,8 @@ def client(db_engine, db_session):
|
|||||||
# ── 样例数据 ────────────────────────────────────────────────────────────
|
# ── 样例数据 ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
SAMPLE_ARXIV_ID = "2401.12345"
|
SAMPLE_ARXIV_ID = "2401.12345"
|
||||||
ADMIN_TOKEN = "test-admin-token-12345"
|
_TEST_ADMIN_USERNAME = "admin"
|
||||||
|
_TEST_ADMIN_PASSWORD = "test-password-12345"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@@ -138,46 +139,56 @@ def sample_paper(db_session):
|
|||||||
def sample_summary_dict() -> dict:
|
def sample_summary_dict() -> dict:
|
||||||
"""完整合法的 summary dict。"""
|
"""完整合法的 summary dict。"""
|
||||||
return {
|
return {
|
||||||
|
"arxiv_id": "2401.12345",
|
||||||
"title_zh": "测试论文中文标题",
|
"title_zh": "测试论文中文标题",
|
||||||
"one_line": "这是一篇关于自然语言处理的测试论文的一句话总结。",
|
"one_line": "这是一篇关于自然语言处理的测试论文的一句话总结。",
|
||||||
"tags": ["自然语言处理", "大语言模型", "Transformer"],
|
"tags": ["自然语言处理", "大语言模型", "Transformer"],
|
||||||
"difficulty": "中级",
|
"difficulty": "中级",
|
||||||
"prerequisites": {
|
"prerequisites": {
|
||||||
"concepts": ["Transformer", "注意力机制"],
|
"concepts": [
|
||||||
"level": "中级",
|
{
|
||||||
|
"term": "Transformer",
|
||||||
|
"explanation": "一种基于自注意力机制的序列到序列模型架构,广泛用于NLP任务。",
|
||||||
|
"why_matters": "本文方法基于 Transformer 架构进行改进。",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"term": "注意力机制",
|
||||||
|
"explanation": "允许模型在处理序列时动态关注不同位置的信息的机制。",
|
||||||
|
"why_matters": "理解注意力机制是理解本文方法的基础。",
|
||||||
|
},
|
||||||
|
],
|
||||||
},
|
},
|
||||||
"motivation": {
|
"motivation": {
|
||||||
"problem": "现有模型在长文本理解上存在不足。",
|
"problem": "现有模型在长文本理解上存在不足,主要体现在注意力计算复杂度随序列长度二次增长,导致实际应用中无法处理超长文本输入。",
|
||||||
"goal": "提出一种新的注意力机制来提升长文本建模能力。",
|
"goal": "提出一种新的稀疏注意力机制来有效提升长文本建模能力,在保持模型整体性能的同时大幅降低计算开销和显存占用。",
|
||||||
"gap": "当前方法计算复杂度过高。",
|
"gap": "当前方法计算复杂度过高,已有的稀疏注意力方案在保留全局信息方面存在明显不足,导致长距离依赖建模效果不佳。",
|
||||||
},
|
},
|
||||||
"method": {
|
"method": {
|
||||||
"overview": "提出了一种高效的稀疏注意力机制。",
|
"overview": "提出了一种高效的稀疏注意力机制,通过局部-全局混合的注意力模式,在降低计算复杂度的同时保留了关键的全局信息流动。",
|
||||||
"key_idea": "使用局部-全局混合的注意力模式来降低计算复杂度。",
|
"key_idea": "使用局部-全局混合的注意力模式来降低计算复杂度,局部窗口捕获短距离依赖,全局采样点维护长距离信息传递。",
|
||||||
"steps": [
|
"steps": "首先分析现有注意力机制的计算瓶颈,发现全连接注意力中大部分注意力权重接近于零。然后设计了一种混合稀疏注意力模式,包含局部滑动窗口和全局随机采样两条路径。最后在多个长文本基准数据集上进行了全面的实验验证。",
|
||||||
"分析现有注意力机制的瓶颈",
|
"novelty": "首次将局部-全局注意力模式结合应用于长文本建模,通过可学习的采样策略动态调整全局注意力点的位置,而非固定模式。",
|
||||||
"设计稀疏注意力模式",
|
|
||||||
"在多个基准上验证效果",
|
|
||||||
],
|
|
||||||
"novelty": "首次将局部-全局注意力模式结合应用于长文本建模。",
|
|
||||||
},
|
},
|
||||||
"results": {
|
"results": {
|
||||||
"main_findings": [
|
"main_findings": "在长文本基准 LongBench 上取得了 SOTA 结果,平均得分提升 3.2 个百分点。推理速度相比全注意力提升了 2 倍,显存占用降低 60%。在 32k 序列长度下仍保持与全注意力相当的生成质量。",
|
||||||
"在长文本基准上取得了 SOTA 结果",
|
|
||||||
"推理速度提升了 2 倍",
|
|
||||||
],
|
|
||||||
"benchmarks": [
|
"benchmarks": [
|
||||||
{"dataset": "LongBench", "score": 85.3},
|
{"task": "长文本摘要", "metric": "ROUGE-L", "this_work": "42.1", "baseline": "38.9", "improvement": "+3.2"},
|
||||||
],
|
|
||||||
"limitations": [
|
|
||||||
"在超长文本(>100k tokens)上效果有所下降",
|
|
||||||
],
|
],
|
||||||
|
"limitations": "在超长文本(>100k tokens)上效果有所下降,主要原因是全局采样点数量不足以覆盖所有关键信息。此外,在小规模数据集上的优势不如大规模数据集明显。",
|
||||||
},
|
},
|
||||||
"improvements": {
|
"improvements": {
|
||||||
"weaknesses": ["仅验证了英文数据"],
|
"weaknesses": "仅验证了英文数据,未在中文等多语言场景下测试。全局采样策略在极端长度的文本上可能需要增加采样点数量,增加了工程复杂度。",
|
||||||
"future_work": ["扩展到多语言场景"],
|
"future_work": "扩展到多语言场景,研究自适应采样策略,使模型能根据输入内容动态调整全局注意力点的分配。同时探索与 Flash Attention 等底层优化的兼容性。",
|
||||||
"reproducibility": "代码已开源,模型权重可下载。",
|
"reproducibility": "代码已在 GitHub 开源,提供了完整的训练脚本和预训练模型权重。实验使用了公开数据集,硬件需求为 8×A100 GPU。",
|
||||||
},
|
},
|
||||||
|
"figures": [
|
||||||
|
{
|
||||||
|
"id": "Figure 1",
|
||||||
|
"caption": "稀疏注意力机制的整体架构图",
|
||||||
|
"description": "展示了局部窗口注意力和全局采样注意力的组合方式,以及信息如何在两种路径间流动。",
|
||||||
|
"reason": "帮助理解本文方法的核心设计思想,直观展示了局部-全局混合模式的工作原理。",
|
||||||
|
},
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -200,21 +211,21 @@ def mock_pi_output(sample_summary_json) -> str:
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def admin_token():
|
def auth_client(client, monkeypatch):
|
||||||
"""返回测试用的 ADMIN_TOKEN(需要配合 monkeypatch 使用)。"""
|
"""已登录的 TestClient(session cookie 自动携带)。"""
|
||||||
return ADMIN_TOKEN
|
from app.config import settings
|
||||||
|
|
||||||
|
monkeypatch.setattr(settings, "ADMIN_USERNAME", _TEST_ADMIN_USERNAME)
|
||||||
@pytest.fixture
|
monkeypatch.setattr(settings, "ADMIN_PASSWORD", _TEST_ADMIN_PASSWORD)
|
||||||
def admin_headers(admin_token):
|
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
|
||||||
"""带 Bearer token 的请求头。"""
|
# 登录获取 session cookie
|
||||||
return {"Authorization": f"Bearer {admin_token}"}
|
resp = client.post(
|
||||||
|
"/admin/login",
|
||||||
|
data={"username": _TEST_ADMIN_USERNAME, "password": _TEST_ADMIN_PASSWORD},
|
||||||
@pytest.fixture
|
follow_redirects=False,
|
||||||
def wrong_admin_headers():
|
)
|
||||||
"""错误的 Authorization 请求头。"""
|
assert resp.status_code == 303
|
||||||
return {"Authorization": "Bearer wrong-token"}
|
return client
|
||||||
|
|
||||||
|
|
||||||
# ── 多样例数据 ────────────────────────────────────────────────────────────
|
# ── 多样例数据 ────────────────────────────────────────────────────────────
|
||||||
|
|||||||
+94
-100
@@ -16,19 +16,6 @@ from app.models import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# ── Fixtures ────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
ADMIN_TOKEN = "test-admin-token-12345"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def auth_client(client, monkeypatch):
|
|
||||||
"""带 admin token monkeypatch 的 TestClient。"""
|
|
||||||
monkeypatch.setattr(settings, "ADMIN_TOKEN", ADMIN_TOKEN)
|
|
||||||
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
|
|
||||||
return client
|
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
# Admin Routes — 鉴权测试
|
# Admin Routes — 鉴权测试
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
@@ -37,80 +24,92 @@ def auth_client(client, monkeypatch):
|
|||||||
class TestAdminAuth:
|
class TestAdminAuth:
|
||||||
"""管理接口鉴权测试。"""
|
"""管理接口鉴权测试。"""
|
||||||
|
|
||||||
def test_no_token_returns_403(self, auth_client):
|
def test_unauthenticated_redirects_to_login(self, auth_client):
|
||||||
"""无 token 时请求管理接口应返回 403。"""
|
"""未登录时请求管理接口应重定向到登录页。"""
|
||||||
resp = auth_client.post("/admin/crawl")
|
# 用未登录的 client(auth_client 已登录,这里直接用 client)
|
||||||
assert resp.status_code in (403, 401)
|
pass # 见下方 test_no_session_returns_303
|
||||||
|
|
||||||
def test_wrong_token_returns_401(self, auth_client, wrong_admin_headers):
|
def test_no_session_returns_303(self, client, monkeypatch):
|
||||||
"""错误 token 应返回 401。"""
|
"""无 session 时请求管理接口应返回 303 重定向。"""
|
||||||
resp = auth_client.post("/admin/crawl", headers=wrong_admin_headers)
|
monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password")
|
||||||
assert resp.status_code == 401
|
resp = client.post("/admin/crawl", follow_redirects=False)
|
||||||
|
assert resp.status_code == 303
|
||||||
|
assert "/admin/login" in resp.headers.get("location", "")
|
||||||
|
|
||||||
def test_correct_token_accepted(self, auth_client, admin_headers):
|
def test_wrong_password_shows_error(self, client, monkeypatch):
|
||||||
"""正确 token 应被接受(crawl 可能会失败但不是 401)。"""
|
"""错误密码应返回登录页并显示错误。"""
|
||||||
|
monkeypatch.setattr(settings, "ADMIN_USERNAME", "admin")
|
||||||
|
monkeypatch.setattr(settings, "ADMIN_PASSWORD", "correct-pass")
|
||||||
|
resp = client.post(
|
||||||
|
"/admin/login",
|
||||||
|
data={"username": "admin", "password": "wrong-pass"},
|
||||||
|
follow_redirects=False,
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert "错误" in resp.text or "error" in resp.text.lower()
|
||||||
|
|
||||||
|
def test_correct_login_redirects_to_logs(self, client, monkeypatch):
|
||||||
|
"""正确登录应重定向到 /admin/logs。"""
|
||||||
|
monkeypatch.setattr(settings, "ADMIN_USERNAME", "admin")
|
||||||
|
monkeypatch.setattr(settings, "ADMIN_PASSWORD", "test-pass")
|
||||||
|
resp = client.post(
|
||||||
|
"/admin/login",
|
||||||
|
data={"username": "admin", "password": "test-pass"},
|
||||||
|
follow_redirects=False,
|
||||||
|
)
|
||||||
|
assert resp.status_code == 303
|
||||||
|
assert "/admin/logs" in resp.headers.get("location", "")
|
||||||
|
|
||||||
|
def test_logout_clears_session(self, auth_client, monkeypatch):
|
||||||
|
"""退出登录后应清除 session。"""
|
||||||
|
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
|
||||||
|
resp = auth_client.post("/admin/logout", follow_redirects=False)
|
||||||
|
assert resp.status_code == 303
|
||||||
|
# 退出后访问管理页应被重定向
|
||||||
|
resp = auth_client.get("/admin/logs", follow_redirects=False)
|
||||||
|
assert resp.status_code == 303
|
||||||
|
|
||||||
|
def test_correct_session_accepted(self, auth_client):
|
||||||
|
"""已登录 session 应被接受(crawl 可能会失败但不是 303)。"""
|
||||||
with patch(
|
with patch(
|
||||||
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
||||||
) as mock_crawl:
|
) as mock_crawl:
|
||||||
mock_crawl.return_value = {"found": 0, "new": 0, "status": "success"}
|
mock_crawl.return_value = {"found": 0, "new": 0, "status": "success"}
|
||||||
resp = auth_client.post("/admin/crawl", headers=admin_headers)
|
resp = auth_client.post("/admin/crawl")
|
||||||
assert resp.status_code != 401
|
assert resp.status_code != 303
|
||||||
|
|
||||||
# ── summarize route auth ────────────────────────────────────────
|
# ── summarize route auth ────────────────────────────────────────
|
||||||
|
|
||||||
def test_no_token_returns_401_for_summarize(self, client):
|
def test_no_session_returns_303_for_summarize(self, client, monkeypatch):
|
||||||
"""无 Bearer token 返回 401。"""
|
"""无 session 返回 303。"""
|
||||||
resp = client.post("/admin/summarize")
|
monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password")
|
||||||
assert resp.status_code in (401, 403)
|
resp = client.post("/admin/summarize", follow_redirects=False)
|
||||||
|
assert resp.status_code == 303
|
||||||
|
|
||||||
def test_wrong_token_returns_401_for_summarize(self, client):
|
def test_correct_session_batch_summarize(self, auth_client):
|
||||||
resp = client.post(
|
"""已登录调用 batch summarize,mock 掉服务层。"""
|
||||||
"/admin/summarize",
|
with patch(
|
||||||
headers={"Authorization": "Bearer wrong-token"},
|
"app.routes.admin.summarize_batch", new_callable=AsyncMock
|
||||||
)
|
) as mock:
|
||||||
assert resp.status_code == 401
|
mock.return_value = {
|
||||||
|
"status": "success",
|
||||||
|
"done": 0,
|
||||||
|
"failed": 0,
|
||||||
|
"total": 0,
|
||||||
|
}
|
||||||
|
resp = auth_client.post("/admin/summarize")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["status"] == "success"
|
||||||
|
|
||||||
def test_correct_token_batch_summarize(self, client, admin_headers):
|
def test_single_paper_not_found(self, auth_client):
|
||||||
"""正确 token 调用 batch summarize,mock 掉服务层。"""
|
|
||||||
import app.config as config_mod
|
|
||||||
|
|
||||||
original = config_mod.settings.ADMIN_TOKEN
|
|
||||||
config_mod.settings.ADMIN_TOKEN = ADMIN_TOKEN
|
|
||||||
try:
|
|
||||||
with patch(
|
|
||||||
"app.routes.admin.summarize_batch", new_callable=AsyncMock
|
|
||||||
) as mock:
|
|
||||||
mock.return_value = {
|
|
||||||
"status": "success",
|
|
||||||
"done": 0,
|
|
||||||
"failed": 0,
|
|
||||||
"total": 0,
|
|
||||||
}
|
|
||||||
resp = client.post("/admin/summarize", headers=admin_headers)
|
|
||||||
assert resp.status_code == 200
|
|
||||||
assert resp.json()["status"] == "success"
|
|
||||||
finally:
|
|
||||||
config_mod.settings.ADMIN_TOKEN = original
|
|
||||||
|
|
||||||
def test_single_paper_not_found(self, client, admin_headers):
|
|
||||||
"""单篇总结不存在的论文返回 404。"""
|
"""单篇总结不存在的论文返回 404。"""
|
||||||
import app.config as config_mod
|
with patch(
|
||||||
|
"app.routes.admin.summarize_single",
|
||||||
original = config_mod.settings.ADMIN_TOKEN
|
new_callable=AsyncMock,
|
||||||
config_mod.settings.ADMIN_TOKEN = ADMIN_TOKEN
|
return_value={"status": "not_found", "arxiv_id": "nonexistent.99999"},
|
||||||
try:
|
):
|
||||||
with patch(
|
resp = auth_client.post("/admin/summarize/nonexistent.99999")
|
||||||
"app.routes.admin.summarize_single",
|
assert resp.status_code == 404
|
||||||
new_callable=AsyncMock,
|
|
||||||
return_value={"status": "not_found", "arxiv_id": "nonexistent.99999"},
|
|
||||||
):
|
|
||||||
resp = client.post(
|
|
||||||
"/admin/summarize/nonexistent.99999",
|
|
||||||
headers=admin_headers,
|
|
||||||
)
|
|
||||||
assert resp.status_code == 404
|
|
||||||
finally:
|
|
||||||
config_mod.settings.ADMIN_TOKEN = original
|
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
@@ -121,27 +120,25 @@ class TestAdminAuth:
|
|||||||
class TestAdminCrawl:
|
class TestAdminCrawl:
|
||||||
"""POST /admin/crawl 测试。"""
|
"""POST /admin/crawl 测试。"""
|
||||||
|
|
||||||
def test_crawl_default_today(self, auth_client, admin_headers):
|
def test_crawl_default_today(self, auth_client):
|
||||||
"""不指定日期时默认抓取今天。"""
|
"""不指定日期时默认抓取今天。"""
|
||||||
with patch(
|
with patch(
|
||||||
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
||||||
) as mock_crawl:
|
) as mock_crawl:
|
||||||
mock_crawl.return_value = {"found": 5, "new": 3, "status": "success"}
|
mock_crawl.return_value = {"found": 5, "new": 3, "status": "success"}
|
||||||
resp = auth_client.post("/admin/crawl", headers=admin_headers)
|
resp = auth_client.post("/admin/crawl")
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
assert data["status"] == "success"
|
assert data["status"] == "success"
|
||||||
mock_crawl.assert_called_once()
|
mock_crawl.assert_called_once()
|
||||||
|
|
||||||
def test_crawl_specific_date(self, auth_client, admin_headers):
|
def test_crawl_specific_date(self, auth_client):
|
||||||
"""指定日期抓取。"""
|
"""指定日期抓取。"""
|
||||||
with patch(
|
with patch(
|
||||||
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
||||||
) as mock_crawl:
|
) as mock_crawl:
|
||||||
mock_crawl.return_value = {"found": 2, "new": 1, "status": "success"}
|
mock_crawl.return_value = {"found": 2, "new": 1, "status": "success"}
|
||||||
resp = auth_client.post(
|
resp = auth_client.post("/admin/crawl?date=2024-01-15")
|
||||||
"/admin/crawl?date=2024-01-15", headers=admin_headers
|
|
||||||
)
|
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
mock_crawl.assert_called_once()
|
mock_crawl.assert_called_once()
|
||||||
call_args = mock_crawl.call_args
|
call_args = mock_crawl.call_args
|
||||||
@@ -156,21 +153,21 @@ class TestAdminCrawl:
|
|||||||
class TestAdminCleanup:
|
class TestAdminCleanup:
|
||||||
"""POST /admin/cleanup 测试。"""
|
"""POST /admin/cleanup 测试。"""
|
||||||
|
|
||||||
def test_cleanup_returns_stats(self, auth_client, admin_headers):
|
def test_cleanup_returns_stats(self, auth_client):
|
||||||
"""清理应返回统计信息。"""
|
"""清理应返回统计信息。"""
|
||||||
with patch("app.routes.admin.cleanup_tmp") as mock_cleanup:
|
with patch("app.routes.admin.cleanup_tmp") as mock_cleanup:
|
||||||
mock_cleanup.return_value = {"scanned": 3, "removed": 1, "errors": []}
|
mock_cleanup.return_value = {"scanned": 3, "removed": 1, "errors": []}
|
||||||
resp = auth_client.post("/admin/cleanup", headers=admin_headers)
|
resp = auth_client.post("/admin/cleanup")
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
assert data["scanned"] == 3
|
assert data["scanned"] == 3
|
||||||
assert data["removed"] == 1
|
assert data["removed"] == 1
|
||||||
|
|
||||||
def test_cleanup_writes_log(self, auth_client, admin_headers, db_session):
|
def test_cleanup_writes_log(self, auth_client, db_session):
|
||||||
"""清理应写入 crawl_logs。"""
|
"""清理应写入 crawl_logs。"""
|
||||||
with patch("app.routes.admin.cleanup_tmp") as mock_cleanup:
|
with patch("app.routes.admin.cleanup_tmp") as mock_cleanup:
|
||||||
mock_cleanup.return_value = {"scanned": 0, "removed": 0, "errors": []}
|
mock_cleanup.return_value = {"scanned": 0, "removed": 0, "errors": []}
|
||||||
auth_client.post("/admin/cleanup", headers=admin_headers)
|
auth_client.post("/admin/cleanup")
|
||||||
|
|
||||||
logs = (
|
logs = (
|
||||||
db_session.execute(select(CrawlLog).where(CrawlLog.task == "cleanup"))
|
db_session.execute(select(CrawlLog).where(CrawlLog.task == "cleanup"))
|
||||||
@@ -189,7 +186,7 @@ class TestAdminCleanup:
|
|||||||
class TestAdminDelete:
|
class TestAdminDelete:
|
||||||
"""POST /admin/delete 测试。"""
|
"""POST /admin/delete 测试。"""
|
||||||
|
|
||||||
def test_delete_requires_confirm(self, auth_client, admin_headers):
|
def test_delete_requires_confirm(self, auth_client):
|
||||||
"""confirm 不是 'DELETE' 时应返回 422。"""
|
"""confirm 不是 'DELETE' 时应返回 422。"""
|
||||||
resp = auth_client.post(
|
resp = auth_client.post(
|
||||||
"/admin/delete",
|
"/admin/delete",
|
||||||
@@ -199,12 +196,11 @@ class TestAdminDelete:
|
|||||||
"include_notes": True,
|
"include_notes": True,
|
||||||
"confirm": "WRONG",
|
"confirm": "WRONG",
|
||||||
},
|
},
|
||||||
headers=admin_headers,
|
|
||||||
)
|
)
|
||||||
assert resp.status_code == 422
|
assert resp.status_code == 422
|
||||||
|
|
||||||
def test_delete_with_confirm(
|
def test_delete_with_confirm(
|
||||||
self, auth_client, admin_headers, db_session, sample_papers_range
|
self, auth_client, db_session, sample_papers_range
|
||||||
):
|
):
|
||||||
"""confirm='DELETE' 时应执行删除。"""
|
"""confirm='DELETE' 时应执行删除。"""
|
||||||
resp = auth_client.post(
|
resp = auth_client.post(
|
||||||
@@ -215,13 +211,12 @@ class TestAdminDelete:
|
|||||||
"include_notes": True,
|
"include_notes": True,
|
||||||
"confirm": "DELETE",
|
"confirm": "DELETE",
|
||||||
},
|
},
|
||||||
headers=admin_headers,
|
|
||||||
)
|
)
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
assert data["deleted"] == 3
|
assert data["deleted"] == 3
|
||||||
|
|
||||||
def test_delete_invalid_date_range(self, auth_client, admin_headers):
|
def test_delete_invalid_date_range(self, auth_client):
|
||||||
"""date_start > date_end 应返回 400。"""
|
"""date_start > date_end 应返回 400。"""
|
||||||
resp = auth_client.post(
|
resp = auth_client.post(
|
||||||
"/admin/delete",
|
"/admin/delete",
|
||||||
@@ -230,11 +225,10 @@ class TestAdminDelete:
|
|||||||
"date_end": "2024-01-10",
|
"date_end": "2024-01-10",
|
||||||
"confirm": "DELETE",
|
"confirm": "DELETE",
|
||||||
},
|
},
|
||||||
headers=admin_headers,
|
|
||||||
)
|
)
|
||||||
assert resp.status_code == 400
|
assert resp.status_code == 400
|
||||||
|
|
||||||
def test_delete_without_confirm_field(self, auth_client, admin_headers):
|
def test_delete_without_confirm_field(self, auth_client):
|
||||||
"""缺少 confirm 字段应返回 422。"""
|
"""缺少 confirm 字段应返回 422。"""
|
||||||
resp = auth_client.post(
|
resp = auth_client.post(
|
||||||
"/admin/delete",
|
"/admin/delete",
|
||||||
@@ -242,7 +236,6 @@ class TestAdminDelete:
|
|||||||
"date_start": "2024-01-10",
|
"date_start": "2024-01-10",
|
||||||
"date_end": "2024-01-12",
|
"date_end": "2024-01-12",
|
||||||
},
|
},
|
||||||
headers=admin_headers,
|
|
||||||
)
|
)
|
||||||
assert resp.status_code == 422
|
assert resp.status_code == 422
|
||||||
|
|
||||||
@@ -255,19 +248,20 @@ class TestAdminDelete:
|
|||||||
class TestAdminLogs:
|
class TestAdminLogs:
|
||||||
"""GET /admin/logs 测试。"""
|
"""GET /admin/logs 测试。"""
|
||||||
|
|
||||||
def test_logs_returns_page(self, auth_client, admin_headers):
|
def test_logs_returns_page(self, auth_client):
|
||||||
"""应返回管理日志页面。"""
|
"""应返回管理日志页面。"""
|
||||||
resp = auth_client.get("/admin/logs", headers=admin_headers)
|
resp = auth_client.get("/admin/logs")
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert "text/html" in resp.headers.get("content-type", "")
|
assert "text/html" in resp.headers.get("content-type", "")
|
||||||
|
|
||||||
def test_logs_requires_auth(self, auth_client):
|
def test_logs_requires_auth(self, client, monkeypatch):
|
||||||
"""日志页面需要鉴权。"""
|
"""日志页面需要鉴权。"""
|
||||||
resp = auth_client.get("/admin/logs")
|
monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password")
|
||||||
assert resp.status_code in (403, 401)
|
resp = client.get("/admin/logs", follow_redirects=False)
|
||||||
|
assert resp.status_code == 303
|
||||||
|
|
||||||
def test_logs_contains_data(
|
def test_logs_contains_data(
|
||||||
self, auth_client, admin_headers, db_session, sample_papers_range
|
self, auth_client, db_session, sample_papers_range
|
||||||
):
|
):
|
||||||
"""日志页面应包含日志数据。"""
|
"""日志页面应包含日志数据。"""
|
||||||
# 先创建一条日志
|
# 先创建一条日志
|
||||||
@@ -282,7 +276,7 @@ class TestAdminLogs:
|
|||||||
)
|
)
|
||||||
db_session.commit()
|
db_session.commit()
|
||||||
|
|
||||||
resp = auth_client.get("/admin/logs", headers=admin_headers)
|
resp = auth_client.get("/admin/logs")
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert "crawl" in resp.text.lower() or "日志" in resp.text
|
assert "crawl" in resp.text.lower() or "日志" in resp.text
|
||||||
|
|
||||||
|
|||||||
@@ -1,107 +0,0 @@
|
|||||||
"""LaTeX 图片提取测试 — 从 .tex 源码中提取图片文件。"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
|
||||||
# Image Extraction
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
|
|
||||||
class TestImageExtraction:
|
|
||||||
"""LaTeX 图片提取测试。"""
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_extract_images_from_source_no_dir(self, monkeypatch, tmp_path):
|
|
||||||
"""源码目录不存在时返回 0。"""
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.services.pdf_downloader.tmp_dir", lambda x: tmp_path / "tmp" / x
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.services.pdf_downloader.paper_dir", lambda x: tmp_path / "papers" / x
|
|
||||||
)
|
|
||||||
from app.services.image_extractor import extract_images_from_source
|
|
||||||
|
|
||||||
result = await extract_images_from_source("2401.99999")
|
|
||||||
assert result == 0
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_extract_images_from_tex(self, monkeypatch, tmp_path):
|
|
||||||
"""从 .tex 文件中提取图片。"""
|
|
||||||
from app.services.image_extractor import extract_images_from_source
|
|
||||||
|
|
||||||
tmp_source = tmp_path / "tmp" / "2401.00001" / "source"
|
|
||||||
tmp_source.mkdir(parents=True)
|
|
||||||
|
|
||||||
images_dir = tmp_source / "figs"
|
|
||||||
images_dir.mkdir()
|
|
||||||
(images_dir / "figure1.png").write_bytes(b"\x89PNG\r\n")
|
|
||||||
(images_dir / "figure2.jpg").write_bytes(b"\xff\xd8\xff\xe0")
|
|
||||||
|
|
||||||
# 创建 .tex 文件
|
|
||||||
tex_content = r"""
|
|
||||||
\documentclass{article}
|
|
||||||
\begin{document}
|
|
||||||
\begin{figure}
|
|
||||||
\includegraphics[width=0.8\textwidth]{figs/figure1.png}
|
|
||||||
\includegraphics{figs/figure2.jpg}
|
|
||||||
\includegraphics[angle=90]{figs/nonexistent.pdf}
|
|
||||||
\end{figure}
|
|
||||||
\end{document}
|
|
||||||
"""
|
|
||||||
(tmp_source / "main.tex").write_text(tex_content)
|
|
||||||
|
|
||||||
papers_dir = tmp_path / "papers" / "2401.00001"
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.services.image_extractor.tmp_dir", lambda x: tmp_path / "tmp" / x
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.services.image_extractor.paper_dir", lambda x: tmp_path / "papers" / x
|
|
||||||
)
|
|
||||||
|
|
||||||
# Mock download_source_zip to avoid real network call (source dir already exists)
|
|
||||||
async def _noop_download(*args, **kwargs):
|
|
||||||
pass
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.services.image_extractor.download_source_zip", _noop_download
|
|
||||||
)
|
|
||||||
|
|
||||||
result = await extract_images_from_source("2401.00001")
|
|
||||||
|
|
||||||
assert result == 2
|
|
||||||
dest_images = papers_dir / "images"
|
|
||||||
assert dest_images.exists()
|
|
||||||
assert (dest_images / "figure1.png").exists()
|
|
||||||
assert (dest_images / "figure2.jpg").exists()
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_extract_images_empty_tex(self, monkeypatch, tmp_path):
|
|
||||||
""".tex 文件无图片时返回 0。"""
|
|
||||||
from app.services.image_extractor import extract_images_from_source
|
|
||||||
|
|
||||||
tmp_source = tmp_path / "tmp" / "2401.00002" / "source"
|
|
||||||
tmp_source.mkdir(parents=True)
|
|
||||||
(tmp_source / "main.tex").write_text(
|
|
||||||
r"\documentclass{article}\begin{document}Hello\end{document}"
|
|
||||||
)
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.services.image_extractor.tmp_dir", lambda x: tmp_path / "tmp" / x
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.services.image_extractor.paper_dir", lambda x: tmp_path / "papers" / x
|
|
||||||
)
|
|
||||||
|
|
||||||
# Mock download_source_zip to avoid real network call
|
|
||||||
async def _noop_download(*args, **kwargs):
|
|
||||||
pass
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.services.image_extractor.download_source_zip", _noop_download
|
|
||||||
)
|
|
||||||
|
|
||||||
result = await extract_images_from_source("2401.00002")
|
|
||||||
assert result == 0
|
|
||||||
@@ -64,10 +64,9 @@ class TestSummarySchema:
|
|||||||
SummarySchema.model_validate(sample_summary_dict)
|
SummarySchema.model_validate(sample_summary_dict)
|
||||||
|
|
||||||
def test_extra_fields_ignored(self, sample_summary_dict):
|
def test_extra_fields_ignored(self, sample_summary_dict):
|
||||||
sample_summary_dict["figures"] = ["fig1.png"]
|
|
||||||
sample_summary_dict["takeaway"] = "important paper"
|
sample_summary_dict["takeaway"] = "important paper"
|
||||||
schema = SummarySchema.model_validate(sample_summary_dict)
|
schema = SummarySchema.model_validate(sample_summary_dict)
|
||||||
assert not hasattr(schema, "figures")
|
assert not hasattr(schema, "takeaway")
|
||||||
assert schema.title_zh # 正常解析
|
assert schema.title_zh # 正常解析
|
||||||
|
|
||||||
def test_flatten_for_db(self, sample_summary_dict):
|
def test_flatten_for_db(self, sample_summary_dict):
|
||||||
@@ -80,7 +79,7 @@ class TestSummarySchema:
|
|||||||
assert "updated_at" in flat
|
assert "updated_at" in flat
|
||||||
# JSON 字段可解析
|
# JSON 字段可解析
|
||||||
assert isinstance(json.loads(flat["prerequisites_json"]), dict)
|
assert isinstance(json.loads(flat["prerequisites_json"]), dict)
|
||||||
assert isinstance(json.loads(flat["method_steps_json"]), list)
|
assert isinstance(flat["figures_json"], str) # figures 序列化为 JSON
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
@@ -99,7 +98,7 @@ class TestQualityAssessment:
|
|||||||
sample_summary_dict["motivation"]["goal"] = ""
|
sample_summary_dict["motivation"]["goal"] = ""
|
||||||
sample_summary_dict["motivation"]["gap"] = ""
|
sample_summary_dict["motivation"]["gap"] = ""
|
||||||
sample_summary_dict["method"]["overview"] = ""
|
sample_summary_dict["method"]["overview"] = ""
|
||||||
sample_summary_dict["results"]["main_findings"] = []
|
sample_summary_dict["results"]["main_findings"] = ""
|
||||||
schema = SummarySchema.model_validate(sample_summary_dict)
|
schema = SummarySchema.model_validate(sample_summary_dict)
|
||||||
assert assess_quality(schema) == "degraded"
|
assert assess_quality(schema) == "degraded"
|
||||||
|
|
||||||
|
|||||||
+18
-26
@@ -182,7 +182,7 @@ class TestSummarizeOneFlow:
|
|||||||
patch(
|
patch(
|
||||||
"app.services.summarizer.call_pi",
|
"app.services.summarizer.call_pi",
|
||||||
new_callable=AsyncMock,
|
new_callable=AsyncMock,
|
||||||
return_value=mock_pi_output,
|
return_value=(mock_pi_output, "test-session-id"),
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
result = await summarize_one(db_session, sample_paper)
|
result = await summarize_one(db_session, sample_paper)
|
||||||
@@ -246,27 +246,28 @@ class TestSummarizeOneFlow:
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_json_not_found(self, db_session, sample_paper, _patch_paths):
|
async def test_json_not_found(self, db_session, sample_paper, _patch_paths):
|
||||||
"""pi 输出无 JSON → json_not_found。"""
|
"""pi 输出无 JSON → 验证循环重试 4 次后 ValueError (unknown)。"""
|
||||||
with (
|
with (
|
||||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||||
patch(
|
patch(
|
||||||
"app.services.summarizer.call_pi",
|
"app.services.summarizer.call_pi",
|
||||||
new_callable=AsyncMock,
|
new_callable=AsyncMock,
|
||||||
return_value="No JSON in this output at all.",
|
return_value=("No JSON in this output at all.", "test-session-id"),
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
result = await summarize_one(db_session, sample_paper)
|
result = await summarize_one(db_session, sample_paper)
|
||||||
|
|
||||||
assert result["status"] == "failed"
|
assert result["status"] == "failed"
|
||||||
assert result["error_type"] == "json_not_found"
|
assert result["error_type"] == "unknown"
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_field_missing_and_retry(
|
async def test_validation_fails_and_retries(
|
||||||
self, db_session, sample_paper, _patch_paths
|
self, db_session, sample_paper, _patch_paths
|
||||||
):
|
):
|
||||||
"""必填字段缺失 → field_missing → retry → permanent_failure。"""
|
"""验证失败(字段不符合要求)→ 重试多次后失败。"""
|
||||||
bad_json = json.dumps(
|
bad_json = json.dumps(
|
||||||
{
|
{
|
||||||
|
"arxiv_id": sample_paper.arxiv_id,
|
||||||
"title_zh": "", # 空的必填字段
|
"title_zh": "", # 空的必填字段
|
||||||
"one_line": "valid line",
|
"one_line": "valid line",
|
||||||
"tags": ["tag1"],
|
"tags": ["tag1"],
|
||||||
@@ -282,23 +283,14 @@ class TestSummarizeOneFlow:
|
|||||||
patch(
|
patch(
|
||||||
"app.services.summarizer.call_pi",
|
"app.services.summarizer.call_pi",
|
||||||
new_callable=AsyncMock,
|
new_callable=AsyncMock,
|
||||||
return_value=bad_output,
|
return_value=(bad_output, "test-session-id"),
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
# 第一次失败 → pending (retry)
|
# _validate_summary 先拦截,4 轮都失败后 ValueError → unknown
|
||||||
result1 = await summarize_one(db_session, sample_paper)
|
result = await summarize_one(db_session, sample_paper)
|
||||||
assert result1["status"] == "failed"
|
assert result["status"] == "failed"
|
||||||
assert result1["error_type"] == "field_missing"
|
assert result["error_type"] == "unknown"
|
||||||
assert result1["retry_count"] == 1
|
assert result["retry_count"] == 1
|
||||||
|
|
||||||
# 第二次失败 → permanent_failure (SUMMARY_MAX_RETRIES=1, 所以 2 次 > 1+1)
|
|
||||||
db_session.refresh(sample_paper)
|
|
||||||
result2 = await summarize_one(db_session, sample_paper)
|
|
||||||
assert result2["status"] == "failed"
|
|
||||||
assert result2["retry_count"] == 2
|
|
||||||
|
|
||||||
db_session.refresh(sample_paper)
|
|
||||||
assert sample_paper.summary_status.status == "permanent_failure"
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_raw_output_saved_on_failure(
|
async def test_raw_output_saved_on_failure(
|
||||||
@@ -310,7 +302,7 @@ class TestSummarizeOneFlow:
|
|||||||
patch(
|
patch(
|
||||||
"app.services.summarizer.call_pi",
|
"app.services.summarizer.call_pi",
|
||||||
new_callable=AsyncMock,
|
new_callable=AsyncMock,
|
||||||
return_value="Some output without JSON",
|
return_value=("Some output without JSON", "test-session-id"),
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
await summarize_one(db_session, sample_paper)
|
await summarize_one(db_session, sample_paper)
|
||||||
@@ -329,7 +321,7 @@ class TestSummarizeOneFlow:
|
|||||||
patch(
|
patch(
|
||||||
"app.services.summarizer.call_pi",
|
"app.services.summarizer.call_pi",
|
||||||
new_callable=AsyncMock,
|
new_callable=AsyncMock,
|
||||||
return_value=mock_pi_output,
|
return_value=(mock_pi_output, "test-session-id"),
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
await summarize_one(db_session, sample_paper)
|
await summarize_one(db_session, sample_paper)
|
||||||
@@ -417,7 +409,7 @@ class TestBatchSummarize:
|
|||||||
patch(
|
patch(
|
||||||
"app.services.summarizer.call_pi",
|
"app.services.summarizer.call_pi",
|
||||||
new_callable=AsyncMock,
|
new_callable=AsyncMock,
|
||||||
return_value=mock_pi_output,
|
return_value=(mock_pi_output, "test-session-id"),
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
result = await summarize_batch(db_session, _session_factory=_TestSession)
|
result = await summarize_batch(db_session, _session_factory=_TestSession)
|
||||||
@@ -464,7 +456,7 @@ class TestBatchSummarize:
|
|||||||
call_count += 1
|
call_count += 1
|
||||||
if call_count == 1:
|
if call_count == 1:
|
||||||
raise PiTimeoutError("timeout")
|
raise PiTimeoutError("timeout")
|
||||||
return mock_pi_output
|
return mock_pi_output, "test-session-id"
|
||||||
|
|
||||||
with (
|
with (
|
||||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||||
@@ -506,7 +498,7 @@ class TestBatchSummarize:
|
|||||||
patch(
|
patch(
|
||||||
"app.services.summarizer.call_pi",
|
"app.services.summarizer.call_pi",
|
||||||
new_callable=AsyncMock,
|
new_callable=AsyncMock,
|
||||||
return_value=mock_pi_output,
|
return_value=(mock_pi_output, "test-session-id"),
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
await summarize_batch(db_session, _session_factory=_TestSession)
|
await summarize_batch(db_session, _session_factory=_TestSession)
|
||||||
|
|||||||
@@ -672,9 +672,11 @@ dependencies = [
|
|||||||
{ name = "chromadb" },
|
{ name = "chromadb" },
|
||||||
{ name = "fastapi" },
|
{ name = "fastapi" },
|
||||||
{ name = "httpx" },
|
{ name = "httpx" },
|
||||||
|
{ name = "itsdangerous" },
|
||||||
{ name = "jinja2" },
|
{ name = "jinja2" },
|
||||||
{ name = "pydantic" },
|
{ name = "pydantic" },
|
||||||
{ name = "pydantic-settings" },
|
{ name = "pydantic-settings" },
|
||||||
|
{ name = "pymupdf" },
|
||||||
{ name = "python-dotenv" },
|
{ name = "python-dotenv" },
|
||||||
{ name = "python-multipart" },
|
{ name = "python-multipart" },
|
||||||
{ name = "sqlalchemy" },
|
{ name = "sqlalchemy" },
|
||||||
@@ -694,9 +696,11 @@ requires-dist = [
|
|||||||
{ name = "chromadb", specifier = ">=1.0" },
|
{ name = "chromadb", specifier = ">=1.0" },
|
||||||
{ name = "fastapi", specifier = ">=0.115" },
|
{ name = "fastapi", specifier = ">=0.115" },
|
||||||
{ name = "httpx", specifier = ">=0.28" },
|
{ name = "httpx", specifier = ">=0.28" },
|
||||||
|
{ name = "itsdangerous", specifier = ">=2.2.0" },
|
||||||
{ name = "jinja2", specifier = ">=3.1" },
|
{ name = "jinja2", specifier = ">=3.1" },
|
||||||
{ name = "pydantic", specifier = ">=2.0" },
|
{ name = "pydantic", specifier = ">=2.0" },
|
||||||
{ name = "pydantic-settings", specifier = ">=2.0" },
|
{ name = "pydantic-settings", specifier = ">=2.0" },
|
||||||
|
{ name = "pymupdf", specifier = ">=1.25" },
|
||||||
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
|
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
|
||||||
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" },
|
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" },
|
||||||
{ name = "python-dotenv", specifier = ">=1.0" },
|
{ name = "python-dotenv", specifier = ">=1.0" },
|
||||||
@@ -850,6 +854,15 @@ wheels = [
|
|||||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itsdangerous"
|
||||||
|
version = "2.2.0"
|
||||||
|
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||||
|
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "jinja2"
|
name = "jinja2"
|
||||||
version = "3.1.6"
|
version = "3.1.6"
|
||||||
@@ -1778,6 +1791,22 @@ wheels = [
|
|||||||
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pymupdf"
|
||||||
|
version = "1.27.2.3"
|
||||||
|
source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }
|
||||||
|
sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/22/32/708bedc9dde7b328d45abbc076091769d44f2f24ad151ad92d56a6ec142b/pymupdf-1.27.2.3.tar.gz", hash = "sha256:7a92faa25129e8bbec5e50eeb9214f187665428c31b05c4ef6e36c58c0b1c6d2", size = 85759618, upload-time = "2026-04-24T14:13:14.42Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/dc/09/ddbdfa7ee91fbabd6f63d7d744884cbdfe3e7ff9b8604749fb38bddf5c5d/pymupdf-1.27.2.3-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc1bc3cae6e9e150b0dbb0a9221bdfd411d65f0db2fe359eaa22467d7cc2a05f", size = 24002636, upload-time = "2026-04-24T14:09:17.459Z" },
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/01/89/3f8edd6c4f50ca370e2a2f2a3011face36f3760728ffe76dffec91c0fca0/pymupdf-1.27.2.3-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:660d93cb6da5bbddf11d3982ae27745dd3a9902d9f24cdb69adab83962294b5a", size = 23278238, upload-time = "2026-04-24T14:09:32.882Z" },
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/c3/26/b7e5a70eb83bd189f8b5df87ec442746b992f2f632662839b288170d357d/pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1dd460a3ae4597a755f00a3bd9771f5ebf1531dc111f6a36bf05dd00a6b84425", size = 24333923, upload-time = "2026-04-24T14:09:47.341Z" },
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/e4/a0/aa1ee2240f29481a04a827c313333b4ecd8a14d6ac3e15d3f41a30574781/pymupdf-1.27.2.3-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:857842b4888827bd6155a1131341b2822a7ebe9a8c15a975fd7d490d7a64a30c", size = 24963198, upload-time = "2026-04-24T14:10:07.408Z" },
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/69/49/4f742451f980840829fc00ba158bebb25d389c846d8f4f8c65936ee55de8/pymupdf-1.27.2.3-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:580983849c64a08d08344ca3d1580e87c01f046a8392421797bc850efd72a5b6", size = 25184609, upload-time = "2026-04-24T14:10:22.911Z" },
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/f6/3f/3853d6608f394faf6eec2bd4e8ea9f6a00beea329b071abdb29f4164cc3d/pymupdf-1.27.2.3-cp310-abi3-win32.whl", hash = "sha256:a5c1088a87189891a4946ab314a14b7934ac4c5b6077f7e74ebee956f8906d0e", size = 18019286, upload-time = "2026-04-24T14:10:34.239Z" },
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/44/47/5fb10fe73f96b31253a41647c362ea9e0380920bddf16028414a051247fc/pymupdf-1.27.2.3-cp310-abi3-win_amd64.whl", hash = "sha256:d20f68ef15195e073071dbc4ae7455257c7889af7584e39df490c0a92728526e", size = 19249102, upload-time = "2026-04-24T14:10:46.72Z" },
|
||||||
|
{ url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/53/a4/b9e91aac82293f9c954654c85581ee8212b5b05efadc534b581141241e6f/pymupdf-1.27.2.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:77691604c5d1d0233827139bbcdea61fd57879c84712b8e49b1f45520f7ab9c2", size = 25000393, upload-time = "2026-04-24T14:11:01.669Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pypika"
|
name = "pypika"
|
||||||
version = "0.51.1"
|
version = "0.51.1"
|
||||||
|
|||||||
Reference in New Issue
Block a user