diff --git a/.env.example b/.env.example
index 7a57b9d..1b1e082 100644
--- a/.env.example
+++ b/.env.example
@@ -22,13 +22,15 @@ HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
PI_BIN=
SUMMARY_SKILL=daily-paper-summary
SUMMARY_CONCURRENCY=3
-SUMMARY_TIMEOUT_SECONDS=300
+SUMMARY_TIMEOUT_SECONDS=900
SUMMARY_MAX_RETRIES=1
+SUMMARY_PDF_MODE=auto
# ─── 调度 ─────────────────────────────────
SCHEDULER_ENABLED=false
-SCHEDULE_HOUR=8
+SCHEDULE_HOUR=4
SCHEDULE_MINUTE=0
+# 抓取时自动探测:先试今天,无数据则回退昨天(无需手动配置偏移)
APP_WORKERS=1
# ─── 数据库 ─────────────────────────────
diff --git a/.gitignore b/.gitignore
index 7c658a8..f67af4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ venv/
dist/
build/
.DS_Store
+CLAUDE.md
diff --git a/app/cli.py b/app/cli.py
index 09471d5..ea163df 100644
--- a/app/cli.py
+++ b/app/cli.py
@@ -1,8 +1,6 @@
"""CLI 工具 — 手动抓取论文。"""
import asyncio
-import sys
-from datetime import date
import typer
from dotenv import load_dotenv
@@ -17,28 +15,53 @@ cli_app = typer.Typer(help="HF Daily Papers 管理 CLI")
def crawl(
date_str: str = typer.Argument(
None,
- help="抓取日期 (YYYY-MM-DD),默认今天",
+ help="抓取日期 (YYYY-MM-DD),留空则自动探测",
),
top_n: int = typer.Option(None, "--top", "-n", help="取前 N 篇"),
+ force: bool = typer.Option(False, "--force", "-f", help="强制重抓(即使已抓取过)"),
):
"""手动抓取指定日期的 HuggingFace Daily Papers。"""
from app.config import settings
from app.database import SessionLocal, engine
from app.database import init_db as _init
+ from app.models import Paper
from app.services.crawler import crawl_daily
+ from app.utils import today_str, yesterday_str
+ from sqlalchemy import func, select
- target = date_str or date.today().isoformat()
+ target = date_str or today_str()
# 确保数据库和表存在
import os
os.makedirs(settings.db_path.parent, exist_ok=True)
_init(engine)
- typer.echo(f"📡 开始抓取 {target} ...")
db = SessionLocal()
try:
+ # 检查是否已抓取过(非 force 模式)
+ if not force and not date_str:
+ existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == target)) or 0
+ if existing > 0:
+ typer.echo(f"⏭️ {target} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)")
+ return
+
+ typer.echo(f"📡 开始抓取 {target} ...")
result = asyncio.run(crawl_daily(db, target, top_n))
+
+ # 未指定日期且今天无数据时,自动回退到昨天
+ if not date_str and result["status"] == "success" and result["found"] == 0:
+ fallback = yesterday_str()
+ existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
+ if existing > 0:
+ typer.echo(
+ f"⏭️ {fallback} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)"
+ )
+ else:
+ typer.echo(f"🔄 {target} 无数据,尝试 {fallback} ...")
+ target = fallback
+ result = asyncio.run(crawl_daily(db, target, top_n))
+
if result["status"] == "success":
typer.echo(
f"✅ 抓取完成:发现 {result['found']} 篇,新增 {result['new']} 篇"
@@ -56,6 +79,11 @@ def summarize(
None,
help="指定论文 arXiv ID;留空则批量处理所有 pending",
),
+ pdf_mode: str = typer.Option(
+ "auto",
+ "--pdf-mode",
+ help="PDF 传递方式:auto(自动选择)| inject(全量注入)| search(pi 自主搜索)",
+ ),
):
"""手动触发 AI 总结。"""
from app.config import settings
@@ -65,17 +93,21 @@ def summarize(
import os
+ if pdf_mode not in ("auto", "inject", "search"):
+ typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True)
+ raise typer.Exit(code=1)
+
os.makedirs(settings.db_path.parent, exist_ok=True)
_init(engine)
db = SessionLocal()
try:
if arxiv_id:
- typer.echo(f"🤖 开始总结 {arxiv_id} ...")
- result = asyncio.run(summarize_single(db, arxiv_id))
+ typer.echo(f"🤖 开始总结 {arxiv_id} (mode={pdf_mode}) ...")
+ result = asyncio.run(summarize_single(db, arxiv_id, pdf_mode=pdf_mode))
else:
- typer.echo("🤖 开始批量总结 pending 论文 ...")
- result = asyncio.run(summarize_batch(db))
+ typer.echo(f"🤖 开始批量总结 pending 论文 (mode={pdf_mode}) ...")
+ result = asyncio.run(summarize_batch(db, pdf_mode=pdf_mode))
if result.get("status") in ("success", "done"):
typer.echo(f"✅ 总结完成:{result}")
diff --git a/app/config.py b/app/config.py
index b94359f..66987e6 100644
--- a/app/config.py
+++ b/app/config.py
@@ -32,12 +32,13 @@ class Settings(BaseSettings):
PI_BIN: str = ""
SUMMARY_SKILL: str = "daily-paper-summary"
SUMMARY_CONCURRENCY: int = 3
- SUMMARY_TIMEOUT_SECONDS: int = 300
+ SUMMARY_TIMEOUT_SECONDS: int = 900
SUMMARY_MAX_RETRIES: int = 1
+ SUMMARY_PDF_MODE: str = "auto" # "auto" = ≤80k 用 inject,>80k 用 search;也可强制 "inject" / "search"
# 调度
SCHEDULER_ENABLED: bool = False
- SCHEDULE_HOUR: int = 8
+ SCHEDULE_HOUR: int = 4
SCHEDULE_MINUTE: int = 0
APP_WORKERS: int = 1
diff --git a/app/database.py b/app/database.py
index 8f68655..3817c76 100644
--- a/app/database.py
+++ b/app/database.py
@@ -73,6 +73,9 @@ def _migrate(engine) -> None:
"paper_summaries": [
("figures_json", "TEXT"),
],
+ "crawl_logs": [
+ ("details_json", "TEXT"),
+ ],
}
with engine.connect() as conn:
diff --git a/app/models.py b/app/models.py
index b4426b8..6a0b9eb 100644
--- a/app/models.py
+++ b/app/models.py
@@ -1,6 +1,6 @@
"""SQLAlchemy ORM 模型 — papers, authors, tags, summaries, user data, logs, locks。"""
-from datetime import date, datetime
+from enum import StrEnum
from sqlalchemy import (
Boolean,
@@ -8,17 +8,29 @@ from sqlalchemy import (
Date,
DateTime,
ForeignKey,
- Index,
Integer,
String,
Text,
UniqueConstraint,
)
-from sqlalchemy.orm import relationship
+from sqlalchemy.orm import joinedload, relationship
from app.database import Base
+# ── 枚举 ────────────────────────────────────────────────────────────────
+
+
+class SummaryState(StrEnum):
+ """总结状态枚举 — 对应 summary_status.status 列。"""
+
+ PENDING = "pending"
+ PROCESSING = "processing"
+ DONE = "done"
+ FAILED = "failed"
+ PERMANENT_FAILURE = "permanent_failure"
+
+
# ── papers ──────────────────────────────────────────────────────────────
class Paper(Base):
__tablename__ = "papers"
@@ -35,10 +47,6 @@ class Paper(Base):
hf_url = Column(String)
arxiv_url = Column(String)
pdf_url = Column(String)
- source_url = Column(String)
- asset_status = Column(String, default="not_downloaded")
- asset_error = Column(String)
- meta_path = Column(String)
summary_path = Column(String)
raw_output_path = Column(String)
summary_quality = Column(String)
@@ -170,6 +178,7 @@ class CrawlLog(Base):
papers_found = Column(Integer)
papers_new = Column(Integer)
error = Column(Text)
+ details_json = Column(Text) # 任务专用元数据 JSON(如 cleanup: {scanned, removed})
started_at = Column(DateTime, nullable=False)
completed_at = Column(DateTime)
@@ -244,3 +253,21 @@ class DataDeleteJob(Base):
error = Column(Text)
started_at = Column(DateTime, nullable=False)
completed_at = Column(DateTime)
+
+
+# ── 常用 joinedload 选项集 ──────────────────────────────────────────────
+# 避免在各路由/服务中重复写 .options(joinedload(Paper.authors), ...)
+
+PAPER_DEFAULT_LOAD = (
+ joinedload(Paper.authors),
+ joinedload(Paper.tags),
+ joinedload(Paper.summary_status),
+)
+
+PAPER_FULL_LOAD = (
+ joinedload(Paper.authors),
+ joinedload(Paper.tags),
+ joinedload(Paper.summary_status),
+ joinedload(Paper.bookmark),
+ joinedload(Paper.reading_status),
+)
diff --git a/app/routes/admin.py b/app/routes/admin.py
index 7b56fdd..1587bde 100644
--- a/app/routes/admin.py
+++ b/app/routes/admin.py
@@ -1,23 +1,38 @@
-"""管理接口 — 抓取、总结、清理、删除、日志,需要登录鉴权。"""
+"""管理接口 — 仪表盘、抓取、总结、清理、删除、日志,需要登录鉴权。"""
from __future__ import annotations
import hashlib
-from datetime import date, datetime, timezone
+import json
+import logging
+from datetime import date
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
from fastapi.responses import RedirectResponse
from pydantic import BaseModel, field_validator
-from sqlalchemy import select
+from sqlalchemy import func, select, text
from sqlalchemy.orm import Session
from app.config import settings
from app.database import get_db
-from app.models import CrawlLog, DataDeleteJob, TaskLock
+from app.models import (
+ CrawlLog,
+ DataDeleteJob,
+ Paper,
+ PaperTag,
+ SummaryState,
+ SummaryStatus,
+ TaskLock,
+)
+from app.services.admin import get_admin_stats
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily
+from app.services.pipeline import run_pipeline
+from app.services.scheduler import get_scheduler
from app.services.summarizer import summarize_batch, summarize_single
-from app.utils import release_lock, templates, today_str
+from app.utils import release_lock, templates, today_str, utc_now
+
+logger = logging.getLogger(__name__)
router = APIRouter(prefix="/admin", tags=["admin"])
@@ -42,12 +57,6 @@ async def verify_admin(request: Request) -> None:
raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
-def verify_admin_page(request: Request) -> None:
- """页面级认证:未登录重定向到登录页(同步版本,用于模板路由)。"""
- if not request.session.get("is_admin"):
- raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
-
-
# ── 登录 / 登出 ──────────────────────────────────────────────────────
@@ -55,7 +64,7 @@ def verify_admin_page(request: Request) -> None:
async def admin_login_page(request: Request):
"""显示登录页面。已登录则直接跳转管理页。"""
if request.session.get("is_admin"):
- return RedirectResponse("/admin/logs", status_code=303)
+ return RedirectResponse("/admin/", status_code=303)
return templates.TemplateResponse(request, "login.html", {"error": None})
@@ -68,7 +77,7 @@ async def admin_login_submit(
"""处理登录表单提交。"""
if username == settings.ADMIN_USERNAME and _check_password(password):
request.session["is_admin"] = True
- return RedirectResponse("/admin/logs", status_code=303)
+ return RedirectResponse("/admin/", status_code=303)
return templates.TemplateResponse(
request, "login.html", {"error": "用户名或密码错误"}
)
@@ -81,6 +90,75 @@ async def admin_logout(request: Request):
return RedirectResponse("/admin/login", status_code=303)
+# ── 仪表盘 ──────────────────────────────────────────────────────────
+
+
+@router.get("/")
+async def admin_dashboard(
+ request: Request,
+ _admin: None = Depends(verify_admin),
+ db: Session = Depends(get_db),
+):
+ """管理仪表盘 — 系统状态总览。"""
+ stats = get_admin_stats(db)
+
+ # 调度器历史(最近 10 条 task=scheduler 日志)
+ scheduler_history = (
+ db.execute(
+ select(CrawlLog)
+ .where(CrawlLog.task == "scheduler")
+ .order_by(CrawlLog.started_at.desc())
+ .limit(10)
+ )
+ .scalars()
+ .all()
+ )
+
+ return templates.TemplateResponse(
+ request,
+ "admin_dashboard.html",
+ {"stats": stats, "scheduler_history": scheduler_history},
+ )
+
+
+# ── 调度器 ──────────────────────────────────────────────────────────
+
+
+@router.get("/scheduler-status")
+async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
+ """调度器运行状态(JSON)。"""
+ scheduler = get_scheduler()
+ next_run = None
+ if scheduler:
+ for job in scheduler.get_jobs():
+ if job.id == "daily_pipeline":
+ next_run = job.next_run_time
+ break
+ return {
+ "enabled": scheduler is not None,
+ "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
+ "timezone": settings.APP_TIMEZONE,
+ "next_run": next_run.isoformat() if next_run else None,
+ }
+
+
+@router.post("/trigger-pipeline")
+async def admin_trigger_pipeline(
+ _admin: None = Depends(verify_admin),
+ db: Session = Depends(get_db),
+):
+ """手动触发一次完整流水线(crawl → summarize → cleanup)。"""
+ today = today_str()
+ try:
+ result = await run_pipeline(db, today, owner="admin_trigger")
+ except RuntimeError as exc:
+ raise HTTPException(status_code=409, detail=str(exc))
+
+ if result["status"] == "failed":
+ raise HTTPException(status_code=500, detail=result.get("error"))
+ return {"status": "success", "message": "流水线执行完成"}
+
+
# ── 请求模型 ──────────────────────────────────────────────────────────
@@ -111,7 +189,7 @@ async def admin_crawl(
target_date = date or today_str()
# TaskLock 防重入
- now = datetime.now(timezone.utc)
+ now = utc_now()
lock = TaskLock(
task="crawl",
lock_key=target_date,
@@ -146,7 +224,7 @@ async def admin_summarize_batch(
db: Session = Depends(get_db),
):
"""批量总结所有 pending 论文。"""
- result = await summarize_batch(db)
+ result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
if result.get("status") == "conflict":
raise HTTPException(
status_code=409, detail=result.get("error", "batch already running")
@@ -161,7 +239,7 @@ async def admin_summarize_single(
db: Session = Depends(get_db),
):
"""总结或重跑单篇论文。"""
- result = await summarize_single(db, arxiv_id, force=True)
+ result = await summarize_single(db, arxiv_id, force=True, pdf_mode=settings.SUMMARY_PDF_MODE)
if result.get("status") == "not_found":
raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
return result
@@ -176,7 +254,7 @@ async def admin_cleanup(
db: Session = Depends(get_db),
):
"""清理 data/tmp/ 中超过 24 小时的临时文件。"""
- now = datetime.now(timezone.utc)
+ now = utc_now()
log_entry = CrawlLog(
task="cleanup",
status="running",
@@ -188,9 +266,11 @@ async def admin_cleanup(
try:
result = cleanup_tmp()
log_entry.status = "success"
- log_entry.completed_at = datetime.now(timezone.utc)
- log_entry.papers_found = result.get("scanned", 0)
- log_entry.papers_new = result.get("removed", 0)
+ log_entry.completed_at = utc_now()
+ log_entry.details_json = json.dumps({
+ "scanned": result.get("scanned", 0),
+ "removed": result.get("removed", 0),
+ }, ensure_ascii=False)
if result.get("errors"):
log_entry.error = "; ".join(result["errors"])[:2000]
db.commit()
@@ -198,7 +278,7 @@ async def admin_cleanup(
except Exception as exc:
log_entry.status = "failed"
log_entry.error = str(exc)[:2000]
- log_entry.completed_at = datetime.now(timezone.utc)
+ log_entry.completed_at = utc_now()
db.commit()
raise HTTPException(status_code=500, detail=str(exc))
@@ -236,7 +316,7 @@ async def admin_logs(
page: int = Query(1, ge=1),
per_page: int = Query(20, ge=1, le=100),
):
- """查看任务日志(CrawlLog + DataDeleteJob)。"""
+ """查看任务日志(CrawlLog + DataDeleteJob)+ 总结状态统计。"""
crawl_logs = (
db.execute(
select(CrawlLog)
@@ -259,6 +339,22 @@ async def admin_logs(
.all()
)
+ # 总结状态统计概要
+ summary_total = db.scalar(select(func.count(Paper.id))) or 0
+ summary_done = db.scalar(
+ select(func.count(SummaryStatus.id)).where(SummaryStatus.status == SummaryState.DONE)
+ ) or 0
+ summary_pending = db.scalar(
+ select(func.count(SummaryStatus.id)).where(
+ SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.PROCESSING])
+ )
+ ) or 0
+ summary_failed = db.scalar(
+ select(func.count(SummaryStatus.id)).where(
+ SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE])
+ )
+ ) or 0
+
return templates.TemplateResponse(
request,
"admin_logs.html",
@@ -267,5 +363,311 @@ async def admin_logs(
"delete_jobs": delete_jobs,
"page": page,
"per_page": per_page,
+ "summary_total": summary_total,
+ "summary_done": summary_done,
+ "summary_pending": summary_pending,
+ "summary_failed": summary_failed,
},
)
+
+
+# ── 总结状态管理 ────────────────────────────────────────────────────
+
+
+@router.get("/summary-status")
+async def admin_summary_status(
+ request: Request,
+ _admin: None = Depends(verify_admin),
+ db: Session = Depends(get_db),
+ status: str = Query("all"),
+ page: int = Query(1, ge=1),
+ per_page: int = Query(20, ge=1, le=100),
+):
+ """总结状态列表(HTMX 片段或 JSON)。"""
+
+ query = (
+ select(Paper, SummaryStatus)
+ .outerjoin(SummaryStatus, SummaryStatus.paper_id == Paper.id)
+ .order_by(Paper.paper_date.desc())
+ )
+
+ if status != "all":
+ if status == "none":
+ query = query.where(SummaryStatus.paper_id == None) # noqa: E711
+ else:
+ query = query.where(SummaryStatus.status == status)
+
+ total = db.scalar(
+ select(func.count()).select_from(query.subquery())
+ )
+ results = (
+ db.execute(query.offset((page - 1) * per_page).limit(per_page))
+ .all()
+ )
+
+ # 判断是否 HTMX 请求
+ is_htmx = request.headers.get("HX-Request") == "true"
+
+ if is_htmx:
+ # 返回 HTML 片段
+ return templates.TemplateResponse(
+ request,
+ "partials/summary_list.html",
+ {
+ "results": results,
+ "total": total or 0,
+ "page": page,
+ "per_page": per_page,
+ "current_status": status,
+ },
+ )
+
+ # 非 HTMX 返回 JSON
+ items = []
+ for paper, ss in results:
+ item = {
+ "arxiv_id": paper.arxiv_id,
+ "title": paper.title_zh or paper.title_en,
+ "paper_date": str(paper.paper_date),
+ "summary_status": ss.status if ss else "none",
+ "retry_count": ss.retry_count if ss else 0,
+ "error_type": ss.error_type if ss else None,
+ "error": ss.error if ss else None,
+ }
+ items.append(item)
+ return {"items": items, "total": total or 0, "page": page, "per_page": per_page}
+
+
+@router.post("/summary-retry-failed")
+async def admin_summary_retry_failed(
+ _admin: None = Depends(verify_admin),
+ db: Session = Depends(get_db),
+):
+ """重试所有失败状态的总结任务。"""
+ failed_ids = (
+ db.execute(
+ select(Paper.arxiv_id)
+ .join(SummaryStatus, SummaryStatus.paper_id == Paper.id)
+ .where(SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]))
+ )
+ .scalars()
+ .all()
+ )
+
+ if not failed_ids:
+ return {"status": "success", "message": "没有失败的任务需要重试", "count": 0}
+
+ # 重置失败任务的状态为 pending
+ db.execute(
+ SummaryStatus.__table__.update()
+ .where(SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]))
+ .values(status=SummaryState.PENDING, error=None, error_type=None)
+ )
+ db.commit()
+
+ return {
+ "status": "success",
+ "message": f"已重置 {len(failed_ids)} 个失败任务为待总结状态",
+ "count": len(failed_ids),
+ }
+
+
+# ── 论文管理 ────────────────────────────────────────────────────────
+
+
+# 排序映射
+_SORT_MAP = {
+ "date_desc": Paper.paper_date.desc(),
+ "date_asc": Paper.paper_date.asc(),
+ "upvotes_desc": Paper.upvotes.desc(),
+ "title_asc": Paper.title_en.asc(),
+}
+
+
+@router.get("/papers")
+async def admin_papers(
+ request: Request,
+ _admin: None = Depends(verify_admin),
+ db: Session = Depends(get_db),
+ q: str = Query("", description="搜索标题/摘要"),
+ date_from: str | None = Query(None),
+ date_to: str | None = Query(None),
+ tag: str = Query(""),
+ summary_status: str = Query("all"),
+ sort: str = Query("date_desc"),
+ page: int = Query(1, ge=1),
+ per_page: int = Query(20, ge=1, le=100),
+):
+ """论文管理列表页面。"""
+ query = select(Paper)
+
+ # 搜索
+ if q.strip():
+ query = query.where(
+ Paper.title_en.ilike(f"%{q}%")
+ | Paper.title_zh.ilike(f"%{q}%")
+ | Paper.abstract.ilike(f"%{q}%")
+ )
+
+ # 日期范围
+ if date_from:
+ query = query.where(Paper.paper_date >= date_from)
+ if date_to:
+ query = query.where(Paper.paper_date <= date_to)
+
+ # 标签筛选
+ if tag:
+ query = query.join(PaperTag, PaperTag.paper_id == Paper.id).where(
+ PaperTag.tag == tag
+ )
+
+ # 总结状态筛选
+ if summary_status != "all":
+ if summary_status == "none":
+ query = query.outerjoin(
+ SummaryStatus, SummaryStatus.paper_id == Paper.id
+ ).where(SummaryStatus.paper_id == None) # noqa: E711
+ else:
+ query = query.join(
+ SummaryStatus, SummaryStatus.paper_id == Paper.id
+ ).where(SummaryStatus.status == summary_status)
+
+ # 排序
+ order = _SORT_MAP.get(sort, Paper.paper_date.desc())
+ query = query.order_by(order)
+
+ # 计数
+ total = db.scalar(select(func.count()).select_from(query.subquery()))
+
+ # 分页
+ papers = (
+ db.execute(query.offset((page - 1) * per_page).limit(per_page))
+ .scalars()
+ .all()
+ )
+
+ # 获取每篇论文的总结状态
+ paper_ids = [p.id for p in papers]
+ statuses = {}
+ if paper_ids:
+ rows = db.execute(
+ select(SummaryStatus.paper_id, SummaryStatus.status).where(
+ SummaryStatus.paper_id.in_(paper_ids)
+ )
+ ).all()
+ paper_id_to_arxiv = {p.id: p.arxiv_id for p in papers}
+ for pid, st in rows:
+ statuses[paper_id_to_arxiv.get(pid, "")] = st
+
+ # 构建分页 URL 辅助函数
+ def pagination_url(p: int) -> str:
+ params = dict(request.query_params)
+ params["page"] = str(p)
+ return "/admin/papers?" + "&".join(f"{k}={v}" for k, v in params.items())
+
+ return templates.TemplateResponse(
+ request,
+ "admin_papers.html",
+ {
+ "papers": papers,
+ "paper_summary_statuses": statuses,
+ "total": total or 0,
+ "page": page,
+ "per_page": per_page,
+ "current_status": summary_status,
+ "current_sort": sort,
+ "pagination_url": pagination_url,
+ },
+ )
+
+
+@router.post("/paper-delete/{arxiv_id}")
+async def admin_paper_delete(
+ arxiv_id: str,
+ _admin: None = Depends(verify_admin),
+ db: Session = Depends(get_db),
+):
+ """删除单篇论文。"""
+ paper = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id))
+ if not paper:
+ raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
+
+ # 删除相关数据(ORM cascade 自动处理关联表)
+ db.delete(paper)
+ db.commit()
+
+ # 清理 FTS 索引
+ try:
+ db.execute(text("DELETE FROM papers_fts WHERE arxiv_id = :aid"), {"aid": arxiv_id})
+ db.commit()
+ except Exception:
+ logger.warning("Failed to clean FTS index for %s", arxiv_id, exc_info=True)
+
+ return {"status": "success", "message": f"已删除 {arxiv_id}"}
+
+
+class BatchActionRequest(BaseModel):
+ action: str # "delete" or "summarize"
+ arxiv_ids: list[str]
+
+ @field_validator("action")
+ @classmethod
+ def action_must_be_valid(cls, v: str) -> str:
+ if v not in ("delete", "summarize"):
+ raise ValueError("action must be 'delete' or 'summarize'")
+ return v
+
+
+@router.post("/papers-batch-action")
+async def admin_papers_batch_action(
+ body: BatchActionRequest,
+ _admin: None = Depends(verify_admin),
+ db: Session = Depends(get_db),
+):
+ """批量操作论文(删除或总结)。"""
+ if not body.arxiv_ids:
+ raise HTTPException(status_code=400, detail="arxiv_ids 不能为空")
+
+ if body.action == "delete":
+ papers = db.execute(
+ select(Paper).where(Paper.arxiv_id.in_(body.arxiv_ids))
+ ).scalars().all()
+
+ count = 0
+ for paper in papers:
+ db.delete(paper)
+ count += 1
+ db.commit()
+
+ # 清理 FTS 索引
+ try:
+ db.execute(
+ text("DELETE FROM papers_fts WHERE arxiv_id IN :ids"),
+ {"ids": tuple(body.arxiv_ids)},
+ )
+ db.commit()
+ except Exception:
+ logger.warning("Failed to clean FTS index for batch delete", exc_info=True)
+
+ return {"status": "success", "message": f"已删除 {count} 篇论文", "count": count}
+
+ elif body.action == "summarize":
+ # 将选中论文的总结状态重置为 pending
+ paper_ids = db.execute(
+ select(Paper.id).where(Paper.arxiv_id.in_(body.arxiv_ids))
+ ).scalars().all()
+
+ if paper_ids:
+ # 删除旧的 status 记录让其重新进入 pipeline
+ db.execute(
+ SummaryStatus.__table__.delete().where(
+ SummaryStatus.paper_id.in_(paper_ids)
+ )
+ )
+ db.commit()
+
+ return {
+ "status": "success",
+ "message": f"已将 {len(paper_ids)} 篇论文重置为待总结",
+ "count": len(paper_ids),
+ }
diff --git a/app/routes/compare.py b/app/routes/compare.py
index dbca7f6..b3f9c1b 100644
--- a/app/routes/compare.py
+++ b/app/routes/compare.py
@@ -2,11 +2,12 @@
from __future__ import annotations
-from fastapi import APIRouter, Depends, HTTPException, Query, Request
+from fastapi import APIRouter, Depends, Query, Request
+from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload
from app.database import get_db
-from app.models import Paper
+from app.models import PAPER_DEFAULT_LOAD, Paper
from app.utils import templates
router = APIRouter()
@@ -48,14 +49,16 @@ def compare_page(
)
papers = (
- db.query(Paper)
- .filter(Paper.arxiv_id.in_(arxiv_ids))
- .options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary),
- joinedload(Paper.summary_status),
+ db.execute(
+ select(Paper)
+ .where(Paper.arxiv_id.in_(arxiv_ids))
+ .options(
+ joinedload(Paper.summary),
+ *PAPER_DEFAULT_LOAD,
+ )
)
+ .unique()
+ .scalars()
.all()
)
diff --git a/app/routes/pages.py b/app/routes/pages.py
index 3993e58..474bb08 100644
--- a/app/routes/pages.py
+++ b/app/routes/pages.py
@@ -2,18 +2,20 @@
from __future__ import annotations
+import json
import logging
+import re
from datetime import date, timedelta
-from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.responses import RedirectResponse
+from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload
from app.config import settings
from app.database import get_db
-from app.models import Paper
-from app.utils import templates, today_str
+from app.models import PAPER_FULL_LOAD, Paper
+from app.utils import PAPERS_DIR, safe_json_loads, templates, today_str, latest_paper_date
logger = logging.getLogger(__name__)
@@ -21,9 +23,9 @@ router = APIRouter()
@router.get("/")
-def index(request: Request):
- """重定向到 /day/{today}。"""
- return RedirectResponse(url=f"/day/{today_str()}")
+def index(request: Request, db: Session = Depends(get_db)):
+ """重定向到最新有论文的日期页。"""
+ return RedirectResponse(url=f"/day/{latest_paper_date(db)}")
@router.get("/day/{date_str}")
@@ -39,23 +41,24 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
today = today_str()
papers = (
- db.query(Paper)
- .filter(Paper.paper_date == date_str)
- .options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary_status),
- joinedload(Paper.bookmark),
+ db.execute(
+ select(Paper)
+ .where(Paper.paper_date == date_str)
+ .options(*PAPER_FULL_LOAD)
+ .order_by(Paper.upvotes.desc())
)
- .order_by(Paper.upvotes.desc())
+ .scalars()
+ .unique()
.all()
)
dates_raw = (
- db.query(Paper.paper_date)
- .distinct()
- .order_by(Paper.paper_date.desc())
- .limit(30)
+ db.execute(
+ select(Paper.paper_date)
+ .distinct()
+ .order_by(Paper.paper_date.desc())
+ .limit(30)
+ )
.all()
)
available_dates = [
@@ -81,18 +84,17 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db)):
"""论文详情页。"""
paper = (
- db.query(Paper)
- .filter(Paper.arxiv_id == arxiv_id)
- .options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary),
- joinedload(Paper.summary_status),
- joinedload(Paper.bookmark),
- joinedload(Paper.reading_status),
- joinedload(Paper.note),
+ db.execute(
+ select(Paper)
+ .where(Paper.arxiv_id == arxiv_id)
+ .options(
+ joinedload(Paper.summary),
+ joinedload(Paper.note),
+ *PAPER_FULL_LOAD,
+ )
)
- .first()
+ .unique()
+ .scalar_one_or_none()
)
if not paper:
raise HTTPException(status_code=404, detail="Paper not found")
@@ -108,28 +110,15 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
images = _get_paper_images(arxiv_id)
# 预处理 JSON 字段供模板直接使用
- import json as _json
-
- prereqs = {}
- if paper.summary and paper.summary.prerequisites_json:
- try:
- prereqs = _json.loads(paper.summary.prerequisites_json)
- except (ValueError, TypeError):
- pass
-
- benchmarks = []
- if paper.summary and paper.summary.results_benchmarks_json:
- try:
- benchmarks = _json.loads(paper.summary.results_benchmarks_json)
- except (ValueError, TypeError):
- pass
-
- figures_raw = []
- if paper.summary and paper.summary.figures_json:
- try:
- figures_raw = _json.loads(paper.summary.figures_json)
- except (ValueError, TypeError):
- pass
+ prereqs = safe_json_loads(
+ paper.summary.prerequisites_json if paper.summary else None, default={}
+ )
+ benchmarks = safe_json_loads(
+ paper.summary.results_benchmarks_json if paper.summary else None, default=[]
+ )
+ figures_raw = safe_json_loads(
+ paper.summary.figures_json if paper.summary else None, default=[]
+ )
linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
@@ -228,9 +217,12 @@ def _get_similar_papers(db: Session, arxiv_id: str, top_k: int = 6) -> list[dict
return []
papers = (
- db.query(Paper)
- .filter(Paper.arxiv_id.in_(list(papers_info.keys())))
- .options(joinedload(Paper.tags))
+ db.execute(
+ select(Paper)
+ .where(Paper.arxiv_id.in_(list(papers_info.keys())))
+ .options(joinedload(Paper.tags))
+ )
+ .scalars()
.all()
)
@@ -260,7 +252,7 @@ def _get_similar_papers(db: Session, arxiv_id: str, top_k: int = 6) -> list[dict
def _get_paper_images(arxiv_id: str) -> list[dict]:
"""获取论文提取的图片列表。"""
- images_dir = Path("data/papers") / arxiv_id / "images"
+ images_dir = PAPERS_DIR / arxiv_id / "images"
if not images_dir.exists():
return []
@@ -286,15 +278,12 @@ def _link_figures_with_images(
if not figures or not images:
return figures
- import json as _json
- import re
-
- manifest_path = Path("data/papers") / arxiv_id / "images" / "manifest.json"
+ manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
if not manifest_path.exists():
return figures
try:
- manifest = _json.loads(manifest_path.read_text(encoding="utf-8"))
+ manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
except (ValueError, TypeError):
return figures
diff --git a/app/routes/search.py b/app/routes/search.py
index df4bbf4..76e1c4b 100644
--- a/app/routes/search.py
+++ b/app/routes/search.py
@@ -7,12 +7,12 @@ from xml.sax.saxutils import escape
from fastapi import APIRouter, Depends, Query, Request
from fastapi.responses import Response
-from sqlalchemy import text
+from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload
from app.config import settings
from app.database import get_db
-from app.models import Paper, PaperTag, UserReadingStatus
+from app.models import Paper, PaperTag
from app.services.searcher import get_all_tags, search_papers
from app.services.user_data import query_reading_list
from app.utils import templates, today_str
@@ -144,9 +144,9 @@ def rss_feed(
"""RSS 2.0 Feed — 最近 7 天论文。"""
seven_days_ago = date.today() - timedelta(days=7)
- query = (
- db.query(Paper)
- .filter(Paper.paper_date >= seven_days_ago)
+ stmt = (
+ select(Paper)
+ .where(Paper.paper_date >= seven_days_ago)
.options(
joinedload(Paper.authors),
joinedload(Paper.tags),
@@ -156,9 +156,9 @@ def rss_feed(
)
if tag:
- query = query.filter(Paper.tags.any(PaperTag.tag == tag))
+ stmt = stmt.where(Paper.tags.any(PaperTag.tag == tag))
- papers = query.all()
+ papers = db.execute(stmt).unique().scalars().all()
xml = _generate_rss_xml(papers, settings.BASE_URL, tag or None)
return Response(content=xml, media_type="application/xml")
diff --git a/app/services/admin.py b/app/services/admin.py
new file mode 100644
index 0000000..8346322
--- /dev/null
+++ b/app/services/admin.py
@@ -0,0 +1,109 @@
+"""管理后台服务 — 统计聚合、系统状态。"""
+
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+
+from sqlalchemy import func, select, text
+from sqlalchemy.orm import Session
+
+from app.config import settings
+from app.models import CrawlLog, Paper, SummaryState, TaskLock
+from app.services.scheduler import get_scheduler
+from app.utils import PAPERS_DIR, TMP_DIR
+
+
+def _dir_size(path: Path) -> int:
+ """递归计算目录总字节数。"""
+ if not path.exists():
+ return 0
+ return sum(f.stat().st_size for f in path.rglob("*") if f.is_file())
+
+
+def _fmt_size(nbytes: int) -> str:
+ """字节数 → 人类可读字符串。"""
+ for unit in ("B", "KB", "MB", "GB"):
+ if nbytes < 1024:
+ return f"{nbytes:.1f} {unit}"
+ nbytes /= 1024
+ return f"{nbytes:.1f} TB"
+
+
+def get_admin_stats(db: Session) -> dict:
+ """管理仪表盘统计数据。"""
+ today = date.today()
+
+ # ── 论文统计 ──────────────────────────────────────────────────────
+ total_papers = db.scalar(select(func.count(Paper.id)))
+ today_papers = db.scalar(
+ select(func.count(Paper.id)).where(Paper.paper_date == today)
+ )
+
+ # ── 总结状态分布 ──────────────────────────────────────────────────
+ summary_rows = db.execute(
+ text("""
+ SELECT COALESCE(ss.status, 'none') AS status, COUNT(*) AS cnt
+ FROM papers p
+ LEFT JOIN summary_status ss ON ss.paper_id = p.id
+ GROUP BY status
+ """)
+ ).fetchall()
+ status_counts = {row[0]: row[1] for row in summary_rows}
+
+ # ── 存储概况 ──────────────────────────────────────────────────────
+ db_size = _fmt_size(settings.db_path.stat().st_size) if settings.db_path.exists() else "0 B"
+ papers_size = _fmt_size(_dir_size(PAPERS_DIR))
+ tmp_size = _fmt_size(_dir_size(TMP_DIR))
+
+ # ── 调度器状态 ────────────────────────────────────────────────────
+ scheduler = get_scheduler()
+ scheduler_enabled = scheduler is not None
+ next_run = None
+ if scheduler_enabled:
+ for job in scheduler.get_jobs():
+ if job.id == "daily_pipeline":
+ next_run = job.next_run_time
+ break
+
+ # ── 最近日志(5 条) ──────────────────────────────────────────────
+ recent_logs = (
+ db.execute(
+ select(CrawlLog)
+ .order_by(CrawlLog.started_at.desc())
+ .limit(5)
+ )
+ .scalars()
+ .all()
+ )
+
+ # ── 活跃锁 ────────────────────────────────────────────────────────
+ active_locks = (
+ db.execute(
+ select(TaskLock).where(TaskLock.status == "running")
+ )
+ .scalars()
+ .all()
+ )
+
+ return {
+ "total_papers": total_papers or 0,
+ "today_papers": today_papers or 0,
+ "pending_count": status_counts.get(SummaryState.PENDING, 0),
+ "failed_count": status_counts.get(SummaryState.FAILED, 0)
+ + status_counts.get(SummaryState.PERMANENT_FAILURE, 0),
+ "done_count": status_counts.get(SummaryState.DONE, 0),
+ "running_count": status_counts.get("running", 0)
+ + status_counts.get(SummaryState.PROCESSING, 0),
+ "none_count": status_counts.get("none", 0),
+ "status_counts": status_counts,
+ "db_size": db_size,
+ "papers_size": papers_size,
+ "tmp_size": tmp_size,
+ "scheduler_enabled": scheduler_enabled,
+ "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
+ "timezone": settings.APP_TIMEZONE,
+ "next_run": next_run.isoformat() if next_run else None,
+ "recent_logs": recent_logs,
+ "active_locks": active_locks,
+ }
diff --git a/app/services/cleaner.py b/app/services/cleaner.py
index 9a21b99..7d29cfd 100644
--- a/app/services/cleaner.py
+++ b/app/services/cleaner.py
@@ -2,21 +2,20 @@
from __future__ import annotations
+import json
import logging
import shutil
-from datetime import date, datetime, timezone
-from pathlib import Path
+from datetime import date
-from sqlalchemy import delete, select, text
+from sqlalchemy import select, text
from sqlalchemy.orm import Session
from app.models import (
CrawlLog,
DataDeleteJob,
Paper,
- TaskLock,
)
-from app.utils import PAPERS_DIR, TMP_DIR
+from app.utils import PAPERS_DIR, TMP_DIR, utc_now
logger = logging.getLogger(__name__)
@@ -39,7 +38,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
if not TMP_DIR.exists():
return {"scanned": 0, "removed": 0, "errors": []}
- now = datetime.now(timezone.utc)
+ now = utc_now()
cutoff = now.timestamp() - (max_age_hours * 3600)
scanned = 0
removed = 0
@@ -96,7 +95,7 @@ async def delete_papers_by_date_range(
Returns:
删除结果统计
"""
- now = datetime.now(timezone.utc)
+ now = utc_now()
# 查询目标论文
papers = (
@@ -195,7 +194,7 @@ async def delete_papers_by_date_range(
job.status = job_status
job.paper_count = deleted
- job.completed_at = datetime.now(timezone.utc)
+ job.completed_at = utc_now()
if job_error:
job.error = job_error[:4000]
db.commit()
@@ -205,9 +204,14 @@ async def delete_papers_by_date_range(
task="delete",
status=job_status,
started_at=now,
- completed_at=datetime.now(timezone.utc),
+ completed_at=utc_now(),
papers_found=total,
papers_new=deleted,
+ details_json=json.dumps({
+ "total_before": total,
+ "deleted": deleted,
+ "failed": len(failed_items),
+ }, ensure_ascii=False),
error=job_error,
)
db.add(log_entry)
diff --git a/app/services/crawler.py b/app/services/crawler.py
index 33897e2..8943a4b 100644
--- a/app/services/crawler.py
+++ b/app/services/crawler.py
@@ -1,8 +1,7 @@
"""爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。"""
import logging
-from datetime import date as date_type
-from datetime import datetime, timezone
+from datetime import date as date_type, datetime, timezone
import httpx
from sqlalchemy import select, text
@@ -14,9 +13,10 @@ from app.models import (
Paper,
PaperAuthor,
PaperTag,
+ SummaryState,
SummaryStatus,
)
-from app.utils import make_http_client
+from app.utils import make_http_client, utc_now
logger = logging.getLogger(__name__)
@@ -131,15 +131,17 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
db.add(paper)
db.flush()
+ seen_authors: set[str] = set()
for idx, name in enumerate(meta["authors"]):
- if name:
+ if name and name not in seen_authors:
+ seen_authors.add(name)
db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx))
for tag_name in meta["tags"]:
if tag_name:
db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))
- db.add(SummaryStatus(paper_id=paper.id, status="pending"))
+ db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
authors_text = ", ".join(meta["authors"])
tags_text = ", ".join(meta["tags"])
@@ -172,7 +174,7 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict:
"""完整的抓取流程:获取 + 入库 + 写日志。"""
- now = datetime.now(timezone.utc)
+ now = utc_now()
log_entry = CrawlLog(
task="crawl",
status="running",
@@ -188,7 +190,7 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
log_entry.status = "success"
log_entry.papers_found = len(raw_papers)
log_entry.papers_new = len(new_papers)
- log_entry.completed_at = datetime.now(timezone.utc)
+ log_entry.completed_at = utc_now()
db.commit()
return {
"found": len(raw_papers),
@@ -200,6 +202,6 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
logger.exception("Crawl failed for %s", target_date)
log_entry.status = "failed"
log_entry.error = str(exc)
- log_entry.completed_at = datetime.now(timezone.utc)
+ log_entry.completed_at = utc_now()
db.commit()
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
diff --git a/app/services/embedder.py b/app/services/embedder.py
index 698931b..b30b729 100644
--- a/app/services/embedder.py
+++ b/app/services/embedder.py
@@ -5,7 +5,8 @@ from __future__ import annotations
import logging
from pathlib import Path
-from sqlalchemy.orm import Session, joinedload
+from sqlalchemy import select
+from sqlalchemy.orm import joinedload
from app.config import settings
from app.models import Paper
@@ -188,12 +189,11 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool:
db = SessionLocal()
try:
- paper = (
- db.query(Paper)
- .filter(Paper.arxiv_id == paper_id)
+ paper = db.execute(
+ select(Paper)
+ .where(Paper.arxiv_id == paper_id)
.options(joinedload(Paper.tags), joinedload(Paper.summary))
- .first()
- )
+ ).unique().scalar_one_or_none()
if not paper:
logger.warning("Paper %s not found for indexing", paper_id)
return False
@@ -242,36 +242,6 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool:
return False
-# ── 批量索引 ────────────────────────────────────────────────────────────
-
-
-def index_batch(paper_ids: list[str]) -> dict:
- """批量索引论文,单篇失败不影响其他。
-
- Returns:
- {"total": int, "success": int, "failed": int}
- """
- if not paper_ids:
- return {"total": 0, "success": 0, "failed": 0}
-
- col = get_collection()
- if col is None:
- return {"total": len(paper_ids), "success": 0, "failed": len(paper_ids)}
-
- success = 0
- failed = 0
- for pid in paper_ids:
- if index_paper(pid):
- success += 1
- else:
- failed += 1
-
- logger.info(
- "Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed
- )
- return {"total": len(paper_ids), "success": success, "failed": failed}
-
-
# ── 删除 ────────────────────────────────────────────────────────────────
diff --git a/app/services/pdf_downloader.py b/app/services/pdf_downloader.py
index 6db50d0..96da241 100644
--- a/app/services/pdf_downloader.py
+++ b/app/services/pdf_downloader.py
@@ -1,10 +1,9 @@
-"""PDF 下载与源码下载 — 从 arXiv 下载论文 PDF 和 LaTeX 源码包。"""
+"""PDF 下载 — 从 arXiv 下载论文 PDF。"""
from __future__ import annotations
import logging
import shutil
-import zipfile
from pathlib import Path
from app.utils import PAPERS_DIR, TMP_DIR, make_http_client
@@ -54,44 +53,6 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
return dest
-# ── 源码下载 ────────────────────────────────────────────────────────────
-
-
-async def download_source_zip(arxiv_id: str, source_url: str, dest_dir: Path) -> None:
- """下载 arXiv 源码并解压。"""
- dest_dir.mkdir(parents=True, exist_ok=True)
- zip_path = tmp_dir(arxiv_id) / "source.zip"
-
- try:
- async with make_http_client(follow_redirects=True) as client:
- resp = await client.get(source_url)
- resp.raise_for_status()
- zip_path.write_bytes(resp.content)
- except Exception as exc:
- logger.debug("Failed to download source for %s: %s", arxiv_id, exc)
- return
-
- try:
- with zipfile.ZipFile(zip_path, "r") as zf:
- zf.extractall(dest_dir)
- logger.debug("Extracted source for %s", arxiv_id)
- except zipfile.BadZipFile:
- # 可能是 tar.gz
- import tarfile
-
- try:
- with tarfile.open(zip_path, "r:*") as tf:
- tf.extractall(dest_dir, filter="data")
- logger.debug("Extracted source (tar) for %s", arxiv_id)
- except Exception:
- logger.warning("Cannot extract source for %s", arxiv_id)
- except Exception:
- logger.warning("Cannot extract source for %s", arxiv_id, exc_info=True)
- finally:
- if zip_path.exists():
- zip_path.unlink()
-
-
# ── 临时文件清理 ────────────────────────────────────────────────────────
diff --git a/app/services/pdf_image_extractor.py b/app/services/pdf_image_extractor.py
index f79908d..bf76a21 100644
--- a/app/services/pdf_image_extractor.py
+++ b/app/services/pdf_image_extractor.py
@@ -16,6 +16,7 @@ import re
from pathlib import Path
from app.services.pdf_downloader import paper_dir
+from app.utils import TMP_DIR
logger = logging.getLogger(__name__)
@@ -40,10 +41,7 @@ def _find_nearby_labels(
"""
matched: list[str] = []
for rect in rects:
- if isinstance(rect, (list, tuple)):
- y_min, y_max = rect[1], rect[3]
- else:
- y_min, y_max = rect.y0, rect.y1
+ y_min, y_max = rect.y0, rect.y1
for label_key, positions in labels.items():
for label_page, label_y in positions:
@@ -69,7 +67,7 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
import pymupdf
if pdf_path is None:
- pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf"
+ pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
if not pdf_path.exists():
logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path)
@@ -162,10 +160,7 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
continue
margin = 5
- if isinstance(bbox, (list, tuple)):
- x0, y0, x1, y1 = bbox
- else:
- x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
+ x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
clip_rect = pymupdf.Rect(x0 - margin, y0 - margin, x1 + margin, y1 + margin)
zoom = 2
diff --git a/app/services/pi_client.py b/app/services/pi_client.py
index a6df419..75a6e28 100644
--- a/app/services/pi_client.py
+++ b/app/services/pi_client.py
@@ -62,26 +62,17 @@ def write_meta_json(paper) -> Path:
# ── PDF 文本提取 ────────────────────────────────────────────────────────
-def _trim_body(text: str, max_chars: int = 80_000) -> str:
+def _trim_body(text: str, max_chars: int | None = None) -> str:
"""去除参考文献,保留正文+附录,超长时从末尾截断。
策略:
1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用)
2. 正文 + 附录全部保留
- 3. 如果总长超过 max_chars,从末尾截断(附录靠后,优先保留正文)
+ 3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文)
"""
import re
# 找 References 段落的位置(在 Appendix 之后的那个)
- # 有些论文结构:正文 -> Appendix -> References
- # 也可能是:正文 -> References -> Appendix
- # 策略:只删除明确的 References 块
- ref_pattern = re.compile(
- r"(?m)^(?:References|Bibliography|参考文献)\s*$\n"
- r"(?s:.*?)" # References 内容
- r"(?=\n(?:A\s|Appendix|Supplementary|Acknowledgment|致谢)\s|\Z)",
- )
-
# 简单策略:找到 References 标题,如果后面没有 Appendix 就全删
# 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容
ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
@@ -110,26 +101,30 @@ def _trim_body(text: str, max_chars: int = 80_000) -> str:
else:
text = text[:ack_match.start()].rstrip()
- # 最后:如果还超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
- if len(text) > max_chars:
+ # 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
+ if max_chars is not None and len(text) > max_chars:
text = text[:max_chars].rstrip()
return text
-def extract_pdf_text(pdf_path: Path) -> Path:
- """用 pymupdf 提取 PDF 正文文本(自动截断参考文献和附录),保存为 .txt。"""
+def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
+ """用 pymupdf 提取 PDF 正文文本,保存为 .txt。
+
+ max_chars=None 时不截断,给 search/auto 模式保留完整内容。
+ """
import pymupdf
txt_path = pdf_path.with_suffix(".txt")
if txt_path.exists():
+ # 缓存优先;如果需重新提取(不同 max_chars),先删旧文件
return txt_path
doc = pymupdf.open(str(pdf_path))
raw_text = "\n\n".join(page.get_text() for page in doc)
doc.close()
- body = _trim_body(raw_text)
+ body = _trim_body(raw_text, max_chars=max_chars)
txt_path.write_text(body, encoding="utf-8")
logger.info(
"Extracted PDF text: %s (%d -> %d chars, -%d%%)",
@@ -141,6 +136,91 @@ def extract_pdf_text(pdf_path: Path) -> Path:
return txt_path
+# ── Prompt 构建 ─────────────────────────────────────────────────────────
+
+
+def _build_prompt(
+ arxiv_id: str,
+ meta_path: Path,
+ txt_path: Path,
+ pdf_mode: str,
+ fix_errors: list[str] | None = None,
+) -> str:
+ """根据模式构建 pi prompt。
+
+ inject: 全量注入,prompt 末尾包含论文全文内容
+ search: pi 自主 read 文件,prompt 只包含工作流指令
+ """
+ json_schema = (
+ "## 必须包含以下字段(不要自创字段名):\n"
+ '{"arxiv_id": "...", '
+ '"title_zh": "中文标题", '
+ '"one_line": "一句话概括(≤50字)", '
+ '"tags": ["标签1","标签2"], '
+ '"difficulty": "入门/进阶/前沿", '
+ '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
+ '"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
+ '"goal": "详细段落:本文的具体目标", '
+ '"gap": "详细段落:本文的独特切入角度"}, '
+ '"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
+ '"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
+ '"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
+ '"novelty": "详细段落:技术新颖性分析"}, '
+ '"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
+ '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
+ '"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, '
+ '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
+ '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
+ '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
+ '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
+ '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
+ "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
+ "}"
+ )
+
+ writing_requirements = (
+ "## 写作要求\n"
+ "- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
+ "- 必须包含论文中的具体数据、数字、实验指标\n"
+ "- 像资深同事给同事讲论文一样,专业但易懂\n"
+ "- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
+ " 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n"
+ )
+
+ if fix_errors:
+ error_list = "\n".join(f"- {e}" for e in fix_errors)
+ return (
+ "你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
+ f"data/papers/{arxiv_id}/summary.json:\n\n"
+ f"{error_list}\n\n"
+ "注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
+ "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
+ )
+
+ if pdf_mode == "search":
+ return (
+ "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
+ "## 工作流程\n"
+ f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n"
+ f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n"
+ f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
+ + writing_requirements
+ + "\n"
+ + json_schema
+ )
+ else:
+ return (
+ "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
+ "## 工作流程\n"
+ "论文元信息和正文全文已在上文提供,请仔细阅读。\n"
+ f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
+ "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
+ + writing_requirements
+ + "\n"
+ + json_schema
+ )
+
+
# ── pi CLI 调用 ────────────────────────────────────────────────────────
@@ -149,63 +229,41 @@ async def call_pi(
pdf_path: Path,
fix_errors: list[str] | None = None,
session_id: str | None = None,
+ pdf_mode: str = "inject",
) -> tuple[str, str]:
"""调用 pi CLI 非交互模式,返回 (stdout 文本, session_id)。
fix_errors: 如果非空,表示上一次验证失败的错误列表,pi 需要修正这些问题。
session_id: 如果非空,用 --continue 延续该 session;否则创建新 session。
+ pdf_mode: "inject" = 全量注入 prompt(@file),"search" = pi 自主 read 文件。
"""
arxiv_id = meta_path.parent.name
- # 将 PDF 转为文本文件,以 @txt 方式传给 pi
- txt_path = extract_pdf_text(pdf_path)
+ # 提取 PDF 全文(不截断),根据实际大小自动选择模式
+ txt_path = extract_pdf_text(pdf_path, max_chars=None)
+ txt_size = len(txt_path.read_text(encoding="utf-8"))
- if fix_errors:
- # 验证失败后的修正提示(同一 session 内,pi 能看到之前写的文件)
- error_list = "\n".join(f"- {e}" for e in fix_errors)
- prompt_text = (
- "你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
- f"data/papers/{arxiv_id}/summary.json:\n\n"
- f"{error_list}\n\n"
- "注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
- "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
- )
- else:
- prompt_text = (
- "请深度解读以下论文,严格按下面的 JSON schema 输出结果。"
- "只输出一个 JSON 对象,不要输出其他内容。\n\n"
- "## 写作要求\n"
- "- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
- "- 必须包含论文中的具体数据、数字、实验指标\n"
- "- 像资深同事给同事讲论文一样,专业但易懂\n"
- "- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
- " 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n\n"
- "## 必须包含以下字段(不要自创字段名):\n"
- '{"arxiv_id": "...", '
- '"title_zh": "中文标题", '
- '"one_line": "一句话概括(≤50字)", '
- '"tags": ["标签1","标签2"], '
- '"difficulty": "入门/进阶/前沿", '
- '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
- '"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
- '"goal": "详细段落:本文的具体目标", '
- '"gap": "详细段落:本文的独特切入角度"}, '
- '"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
- '"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
- '"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
- '"novelty": "详细段落:技术新颖性分析"}, '
- '"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
- '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
- '"limitations": "详细段落:局限性分析(作者承认的+你自己的观察)"}, '
- '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
- '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
- '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度)"}, '
- '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
- '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
- "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
- "}\n\n"
- "请深度解读以下论文:"
- )
+ actual_mode = pdf_mode
+ if pdf_mode == "auto":
+ if txt_size > 80_000:
+ actual_mode = "search"
+ logger.info(
+ "Auto mode: %s text=%d chars > 80k → search", arxiv_id, txt_size
+ )
+ else:
+ actual_mode = "inject"
+ logger.info(
+ "Auto mode: %s text=%d chars ≤ 80k → inject", arxiv_id, txt_size
+ )
+
+ # inject 模式需要截断过长的文本(避免撑爆 context)
+ if actual_mode == "inject" and txt_size > 80_000:
+ body = txt_path.read_text(encoding="utf-8")
+ trimmed = body[:80_000].rstrip()
+ txt_path.write_text(trimmed, encoding="utf-8")
+ logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed))
+
+ prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
# 构建 session ID(每篇论文一个独立 session)
if session_id is None:
@@ -213,10 +271,12 @@ async def call_pi(
session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
+ # 工具列表:search 模式需要 read 工具
+ tools = "bash,write_file" if actual_mode != "search" else "bash,write_file,read"
cmd = [
settings.PI_BIN,
"-p",
- "--tools", "bash,write_file",
+ "--tools", tools,
]
if fix_errors:
cmd += ["--session", session_id, "--continue"]
@@ -227,11 +287,14 @@ async def call_pi(
settings.SUMMARY_SKILL,
prompt_text,
]
- if not fix_errors:
- # 首次调用传文件,后续 --continue 不需要(session 内已有)
+ if not fix_errors and actual_mode != "search":
+ # inject 模式:首次调用传 @file;search 模式 pi 自己 read,不注入
cmd += [f"@{meta_path}", f"@{txt_path}"]
- logger.info("Calling pi for %s (fix=%s, session=%s)", arxiv_id, bool(fix_errors), session_id)
+ logger.info(
+ "Calling pi for %s (fix=%s, session=%s, mode=%s)",
+ arxiv_id, bool(fix_errors), session_id, actual_mode,
+ )
proc = await asyncio.create_subprocess_exec(
*cmd,
diff --git a/app/services/pipeline.py b/app/services/pipeline.py
new file mode 100644
index 0000000..37ffbf4
--- /dev/null
+++ b/app/services/pipeline.py
@@ -0,0 +1,108 @@
+"""流水线服务 — crawl → summarize → cleanup 的共享编排逻辑。
+
+供 admin 手动触发和 scheduler 定时调度共用。
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import date as date_type
+
+from sqlalchemy.orm import Session
+
+from app.config import settings
+from app.models import CrawlLog, TaskLock
+from app.services.cleaner import cleanup_tmp
+from app.services.crawler import crawl_daily
+from app.services.summarizer import summarize_batch
+from app.utils import utc_now, yesterday_str
+
+logger = logging.getLogger(__name__)
+
+
+async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
+ """执行完整流水线:crawl → summarize → cleanup。
+
+ 使用 task_locks 防重入,写入 CrawlLog 记录。
+
+ Args:
+ db: 数据库 session
+ target_date: 目标日期 YYYY-MM-DD
+ owner: 调用者标识(如 "admin_trigger" / "daily_pipeline")
+
+ Returns:
+ {"status": "success"|"failed", "error": str|None, ...}
+ """
+ now = utc_now()
+ lock_key = f"pipeline-{target_date}"
+
+ # ── 获取锁 ──────────────────────────────────────────────────────────
+ lock = TaskLock(
+ task="scheduler",
+ lock_key=lock_key,
+ status="running",
+ owner=owner,
+ acquired_at=now,
+ )
+ try:
+ db.add(lock)
+ db.commit()
+ except Exception:
+ db.rollback()
+ raise RuntimeError(f"Pipeline already running for {target_date}")
+
+ # ── 写调度日志 ──────────────────────────────────────────────────────
+ log_entry = CrawlLog(
+ task="scheduler",
+ status="running",
+ date=date_type.fromisoformat(target_date),
+ started_at=now,
+ )
+ db.add(log_entry)
+ db.commit()
+
+ error_msg = None
+ crawl_result: dict = {}
+ try:
+ # Step 1: 抓取(先试今天,无数据则回退昨天)
+ crawl_result = await crawl_daily(db, target_date)
+ logger.info("Pipeline [%s]: crawl %s, found=%d new=%d",
+ owner, target_date,
+ crawl_result.get("found", 0), crawl_result.get("new", 0))
+
+ if crawl_result.get("status") == "success" and crawl_result.get("found") == 0:
+ yesterday = yesterday_str()
+ logger.info("Pipeline [%s]: falling back to %s", owner, yesterday)
+ crawl_result = await crawl_daily(db, yesterday)
+
+ # Step 2: 总结
+ summarize_result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
+ logger.info("Pipeline [%s]: summarize done, result=%s", owner, summarize_result)
+
+ # Step 3: 清理
+ cleanup_result = cleanup_tmp()
+ logger.info("Pipeline [%s]: cleanup done, removed=%d",
+ owner, cleanup_result.get("removed", 0))
+
+ log_entry.status = "success"
+ log_entry.papers_found = crawl_result.get("found", 0)
+ log_entry.papers_new = crawl_result.get("new", 0)
+
+ except Exception as exc:
+ logger.exception("Pipeline [%s] failed", owner)
+ log_entry.status = "failed"
+ error_msg = str(exc)[:2000]
+
+ finally:
+ log_entry.completed_at = utc_now()
+ if error_msg:
+ log_entry.error = error_msg
+ db.commit()
+
+ lock.status = "finished"
+ lock.released_at = utc_now()
+ db.commit()
+
+ if error_msg:
+ return {"status": "failed", "error": error_msg}
+ return {"status": "success", "message": "Pipeline completed"}
diff --git a/app/services/scheduler.py b/app/services/scheduler.py
index f0915ff..ea2c6f5 100644
--- a/app/services/scheduler.py
+++ b/app/services/scheduler.py
@@ -3,7 +3,6 @@
from __future__ import annotations
import logging
-from datetime import datetime, timezone
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
@@ -12,10 +11,8 @@ from zoneinfo import ZoneInfo
from app.config import settings
from app.database import SessionLocal
-from app.models import CrawlLog, TaskLock
-from app.services.cleaner import cleanup_tmp
-from app.services.crawler import crawl_daily
-from app.services.summarizer import summarize_batch
+from app.services.pipeline import run_pipeline
+from app.utils import today_str
logger = logging.getLogger(__name__)
@@ -92,85 +89,15 @@ def stop_scheduler() -> None:
async def _daily_pipeline() -> None:
"""每日流水线:抓取 → 总结 → 清理。
- 使用 task_locks 表防止重入:同一天的 pipeline 任务只有一个能运行。
+ 委托给 pipeline.run_pipeline 执行,使用 task_locks 防重入。
"""
- tz = ZoneInfo(settings.APP_TIMEZONE)
- today = datetime.now(tz).strftime("%Y-%m-%d")
- now = datetime.now(timezone.utc)
- lock_key = f"pipeline-{today}"
+ today = today_str()
db: Session = SessionLocal()
try:
- # 尝试获取锁
- lock = TaskLock(
- task="scheduler",
- lock_key=lock_key,
- status="running",
- owner="daily_pipeline",
- acquired_at=now,
- )
- try:
- db.add(lock)
- db.commit()
- except Exception:
- db.rollback()
- logger.warning("Daily pipeline already running for %s, skipping", today)
- return
-
- # 写调度日志
- log_entry = CrawlLog(
- task="scheduler",
- status="running",
- date=datetime.now(tz).date(),
- started_at=now,
- )
- db.add(log_entry)
- db.commit()
-
- error_msg = None
- try:
- # Step 1: 抓取
- logger.info("Scheduler pipeline: crawl %s", today)
- crawl_result = await crawl_daily(db, today)
- logger.info(
- "Scheduler pipeline: crawl done, found=%d new=%d",
- crawl_result.get("found", 0),
- crawl_result.get("new", 0),
- )
-
- # Step 2: 总结 pending 论文
- logger.info("Scheduler pipeline: summarize batch")
- summarize_result = await summarize_batch(db)
- logger.info(
- "Scheduler pipeline: summarize done, result=%s", summarize_result
- )
-
- # Step 3: 清理临时文件
- logger.info("Scheduler pipeline: cleanup tmp")
- cleanup_result = cleanup_tmp()
- logger.info(
- "Scheduler pipeline: cleanup done, removed=%d",
- cleanup_result.get("removed", 0),
- )
-
- log_entry.status = "success"
-
- except Exception as exc:
- logger.exception("Scheduler pipeline failed for %s", today)
- log_entry.status = "failed"
- error_msg = str(exc)[:2000]
-
- finally:
- log_entry.completed_at = datetime.now(timezone.utc)
- if error_msg:
- log_entry.error = error_msg
- db.commit()
-
- # 释放锁
- lock.status = "finished"
- lock.released_at = datetime.now(timezone.utc)
- db.commit()
-
+ await run_pipeline(db, today, owner="daily_pipeline")
+ except RuntimeError:
+ logger.warning("Daily pipeline already running for %s, skipping", today)
except Exception:
logger.exception("Unexpected error in daily pipeline")
finally:
diff --git a/app/services/schemas.py b/app/services/schemas.py
index 4476beb..5b3f487 100644
--- a/app/services/schemas.py
+++ b/app/services/schemas.py
@@ -3,10 +3,10 @@
from __future__ import annotations
import json
-from datetime import datetime, timezone
-
from pydantic import BaseModel, Field, ValidationError, field_validator
+from app.utils import sanitize_html, utc_now
+
# ── 子模型 ──────────────────────────────────────────────────────────────
@@ -90,18 +90,6 @@ class SummarySchema(BaseModel):
# ── 质量评估 ────────────────────────────────────────────────────────────
-# 必填字段:title_zh, one_line, tags, motivation.problem, method.key_idea
-# — 缺失时 Pydantic 校验就会报错,不会走到 assess_quality
-# 重要字段:motivation.goal, motivation.gap, method.overview, results.main_findings
-# — 缺失可入库,标记 degraded
-_OPTIONAL_BUT_IMPORTANT_FIELDS = [
- "motivation.goal",
- "motivation.gap",
- "method.overview",
- "results.main_findings",
-]
-
-
def assess_quality(schema: SummarySchema) -> str:
"""评估总结质量:normal / degraded / low。"""
# low:内容空洞的启发式判断
@@ -128,31 +116,40 @@ def assess_quality(schema: SummarySchema) -> str:
def flatten_for_db(schema: SummarySchema) -> dict:
- """将 SummarySchema 展平为 paper_summaries 表的列值 dict。"""
+ """将 SummarySchema 展平为 paper_summaries 表的列值 dict。
+
+ 所有供前端用 |safe 渲染的文本字段均经过 HTML 清洗。
+ """
+ # 清洗 prerequisites 嵌套文本
+ prereqs = schema.prerequisites.model_dump()
+ for c in prereqs.get("concepts", []):
+ if isinstance(c, dict):
+ for key in ("explanation", "why_matters"):
+ if key in c and c[key]:
+ c[key] = sanitize_html(c[key])
+
return {
- "one_line": schema.one_line,
+ "one_line": sanitize_html(schema.one_line),
"difficulty": schema.difficulty,
- "prerequisites_json": json.dumps(
- schema.prerequisites.model_dump(), ensure_ascii=False
- ),
- "motivation_problem": schema.motivation.problem,
- "motivation_goal": schema.motivation.goal,
- "motivation_gap": schema.motivation.gap,
- "method_overview": schema.method.overview,
- "method_key_idea": schema.method.key_idea,
- "method_steps_json": schema.method.steps,
- "method_novelty": schema.method.novelty,
- "results_main_json": schema.results.main_findings,
+ "prerequisites_json": json.dumps(prereqs, ensure_ascii=False),
+ "motivation_problem": sanitize_html(schema.motivation.problem),
+ "motivation_goal": sanitize_html(schema.motivation.goal),
+ "motivation_gap": sanitize_html(schema.motivation.gap),
+ "method_overview": sanitize_html(schema.method.overview),
+ "method_key_idea": sanitize_html(schema.method.key_idea),
+ "method_steps_json": sanitize_html(schema.method.steps),
+ "method_novelty": sanitize_html(schema.method.novelty),
+ "results_main_json": sanitize_html(schema.results.main_findings),
"results_benchmarks_json": json.dumps(
schema.results.benchmarks, ensure_ascii=False
),
- "limitations_json": schema.results.limitations,
- "weaknesses_json": schema.improvements.weaknesses,
- "future_work_json": schema.improvements.future_work,
- "reproducibility": schema.improvements.reproducibility,
+ "limitations_json": sanitize_html(schema.results.limitations),
+ "weaknesses_json": sanitize_html(schema.improvements.weaknesses),
+ "future_work_json": sanitize_html(schema.improvements.future_work),
+ "reproducibility": sanitize_html(schema.improvements.reproducibility),
"figures_json": json.dumps(schema.figures, ensure_ascii=False),
"full_json": schema.model_dump_json(ensure_ascii=False),
- "updated_at": datetime.now(timezone.utc),
+ "updated_at": utc_now(),
}
diff --git a/app/services/searcher.py b/app/services/searcher.py
index 7cf1385..c79e604 100644
--- a/app/services/searcher.py
+++ b/app/services/searcher.py
@@ -6,11 +6,11 @@ import logging
import math
import re
-from sqlalchemy import text
-from sqlalchemy.orm import Session, joinedload
+from sqlalchemy import select, text
+from sqlalchemy.orm import Session
from app.config import settings
-from app.models import Paper
+from app.models import PAPER_FULL_LOAD, Paper
logger = logging.getLogger(__name__)
@@ -213,21 +213,15 @@ def _search_semantic(
arxiv_ids = [c["arxiv_id"] for c in candidates]
distance_map = {c["arxiv_id"]: c["distance"] for c in candidates}
- papers_query = (
- db.query(Paper)
- .filter(Paper.arxiv_id.in_(arxiv_ids))
- .options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary_status),
- joinedload(Paper.bookmark),
- joinedload(Paper.reading_status),
- )
+ stmt = (
+ select(Paper)
+ .where(Paper.arxiv_id.in_(arxiv_ids))
+ .options(*PAPER_FULL_LOAD)
)
if tag:
- papers_query = papers_query.filter(Paper.tags.any(tag=tag))
+ stmt = stmt.where(Paper.tags.any(tag=tag))
- papers = papers_query.all()
+ papers = db.execute(stmt).unique().scalars().all()
# 按语义距离排序
id_order = {aid: idx for idx, aid in enumerate(arxiv_ids)}
@@ -257,11 +251,7 @@ def _search_tag_only(
offset: int,
) -> dict:
"""只有标签筛选,无关键词。"""
- order = (
- "p.paper_date DESC, p.upvotes DESC"
- if sort == "date"
- else "p.paper_date DESC, p.upvotes DESC"
- )
+ order = "p.paper_date DESC, p.upvotes DESC"
rows_sql = text(f"""
SELECT p.id
@@ -307,15 +297,13 @@ def _load_papers_by_ids(
return []
papers = (
- db.query(Paper)
- .filter(Paper.id.in_(paper_ids))
- .options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary_status),
- joinedload(Paper.bookmark),
- joinedload(Paper.reading_status),
+ db.execute(
+ select(Paper)
+ .where(Paper.id.in_(paper_ids))
+ .options(*PAPER_FULL_LOAD)
)
+ .unique()
+ .scalars()
.all()
)
diff --git a/app/services/summarizer.py b/app/services/summarizer.py
index 1f31ec1..ed01e81 100644
--- a/app/services/summarizer.py
+++ b/app/services/summarizer.py
@@ -2,23 +2,24 @@
from __future__ import annotations
+import asyncio
import json
import logging
-import shutil
-from datetime import datetime, timezone
from pathlib import Path
from pydantic import ValidationError
from sqlalchemy import select
-from sqlalchemy.orm import Session, joinedload
+from sqlalchemy.orm import Session
from app.config import settings
from app.database import SessionLocal
from app.models import (
+ PAPER_DEFAULT_LOAD,
CrawlLog,
Paper,
PaperSummary,
PaperTag,
+ SummaryState,
SummaryStatus,
TaskLock,
)
@@ -42,7 +43,7 @@ from app.services.schemas import (
classify_validation_error,
flatten_for_db,
)
-from app.utils import PAPERS_DIR, release_lock
+from app.utils import TMP_DIR, release_lock, utc_now
logger = logging.getLogger(__name__)
@@ -96,8 +97,6 @@ def _update_summary_in_db(
"""将校验后的总结写入 DB:paper_summaries + papers + paper_tags + FTS5。"""
from sqlalchemy import text
- now = datetime.now(timezone.utc)
-
# 1. paper_summaries:upsert
existing = db.get(PaperSummary, paper.id)
flat = flatten_for_db(schema)
@@ -213,21 +212,14 @@ def _validate_summary(json_data: dict, arxiv_id: str) -> list[str]:
# ── 文件操作 ────────────────────────────────────────────────────────────
-def _save_files(arxiv_id: str, schema: SummarySchema, raw_output: str) -> None:
- """保存 summary.json 和 raw_output.txt。"""
- d = paper_dir(arxiv_id)
- d.mkdir(parents=True, exist_ok=True)
- (d / "summary.json").write_text(
- schema.model_dump_json(ensure_ascii=False, indent=2),
- encoding="utf-8",
- )
- (d / "raw_output.txt").write_text(raw_output, encoding="utf-8")
-
-
-def _save_raw_output_only(arxiv_id: str, raw_output: str) -> None:
- """仅保存 raw_output.txt(失败时)。"""
+def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) -> None:
d = paper_dir(arxiv_id)
d.mkdir(parents=True, exist_ok=True)
+ if schema:
+ (d / "summary.json").write_text(
+ schema.model_dump_json(ensure_ascii=False, indent=2),
+ encoding="utf-8",
+ )
(d / "raw_output.txt").write_text(raw_output, encoding="utf-8")
@@ -240,26 +232,25 @@ async def summarize_one(
semaphore: asyncio.Semaphore | None = None,
*,
force: bool = False,
+ pdf_mode: str = "auto",
) -> dict:
"""总结单篇论文的完整流程。"""
- import asyncio
-
arxiv_id = paper.arxiv_id
# 获取或创建 summary_status
if not paper.summary_status:
- db.add(SummaryStatus(paper_id=paper.id, status="pending"))
+ db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
db.commit()
db.refresh(paper)
status = paper.summary_status
# 跳过已完成的(除非 force)
- if status.status == "done" and not force:
+ if status.status == SummaryState.DONE and not force:
return {"arxiv_id": arxiv_id, "status": "skipped", "reason": "already_done"}
# 跳过 permanent_failure(除非 force)
- if status.status == "permanent_failure" and not force:
+ if status.status == SummaryState.PERMANENT_FAILURE and not force:
return {
"arxiv_id": arxiv_id,
"status": "skipped",
@@ -269,182 +260,202 @@ async def summarize_one(
if semaphore:
await semaphore.acquire()
try:
- return await _do_summarize_one(db, paper)
+ return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
finally:
if semaphore:
semaphore.release()
-async def _do_summarize_one(db: Session, paper: Paper) -> dict:
- """实际的单篇总结执行(在 semaphore 保护下)。"""
- import asyncio
+async def _generate_with_retry(
+ arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto"
+) -> tuple[dict, str]:
+ """调用 pi CLI 生成总结,最多 4 轮验证循环。
+
+ Returns:
+ (json_data, raw_output)
+ Raises:
+ ValueError: 4 轮验证仍未通过
+ """
+ validation_errors: list[str] = []
+ json_data: dict | None = None
+ raw_output = ""
+ session_id = None
+
+ for attempt in range(1, 5):
+ # 清理上一轮 pi 写的不完整文件
+ stale = paper_dir(arxiv_id) / "summary.json"
+ if stale.exists():
+ stale.unlink()
+
+ if attempt == 1:
+ raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
+ else:
+ raw_output, session_id = await call_pi(
+ meta_path, pdf_path,
+ fix_errors=validation_errors,
+ session_id=session_id,
+ pdf_mode=pdf_mode,
+ )
+
+ # 优先读取 pi 写入的 summary.json,否则从 stdout 提取
+ summary_file = paper_dir(arxiv_id) / "summary.json"
+ try:
+ if summary_file.exists():
+ json_data = json.loads(summary_file.read_text(encoding="utf-8"))
+ logger.info("Read summary.json written by pi for %s", arxiv_id)
+ else:
+ json_data = extract_json(raw_output)
+ except (json.JSONDecodeError, JsonNotFoundError) as exc:
+ logger.warning(
+ "JSON extraction failed for %s (attempt %d): %s",
+ arxiv_id, attempt, str(exc)[:200],
+ )
+ validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
+ continue
+
+ validation_errors = _validate_summary(json_data, arxiv_id)
+ if not validation_errors:
+ break
+ logger.warning(
+ "Validation failed for %s (attempt %d): %s",
+ arxiv_id, attempt, "; ".join(validation_errors),
+ )
+
+ if validation_errors:
+ exc = ValueError(
+ f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}"
+ )
+ exc.raw_output = raw_output # 供上层 _handle_summary_failure 使用
+ raise exc
+
+ return json_data, raw_output
+
+
+def _persist_summary(
+ db: Session, paper: Paper, json_data: dict, raw_output: str
+) -> str:
+ """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
+ schema = SummarySchema.model_validate(json_data)
+ quality = assess_quality(schema)
+
+ _save_files(paper.arxiv_id, schema, raw_output)
+ _update_summary_in_db(db, paper, schema, quality, raw_output)
+
+ # 状态 → done
+ paper.summary_status.status = SummaryState.DONE
+ paper.summary_status.quality = quality
+ paper.summary_status.completed_at = utc_now()
+ paper.summary_status.raw_output_saved = True
+ db.commit()
+
+ # 触发性增强(失败不影响总结)
+ _maybe_extract_images(paper.arxiv_id, schema)
+ _maybe_index_chroma(paper.arxiv_id, paper, schema)
+
+ return quality
+
+
+def _handle_summary_failure(
+ db: Session, paper: Paper, exc: Exception, raw_output: str,
+) -> dict:
+ """记录失败:保存 raw_output、重试计数、错误分类。"""
+ error_type = _classify_error(exc)
+ logger.error(
+ "Summarize failed: %s error_type=%s %s",
+ paper.arxiv_id, error_type, str(exc)[:200],
+ )
- arxiv_id = paper.arxiv_id
status = paper.summary_status
- now = datetime.now(timezone.utc)
+ if raw_output:
+ _save_files(paper.arxiv_id, None, raw_output)
+ status.raw_output_saved = True
+
+ status.retry_count = (status.retry_count or 0) + 1
+ status.error_type = error_type
+ status.error = str(exc)[:2000]
+
+ if status.retry_count >= settings.SUMMARY_MAX_RETRIES + 1:
+ status.status = SummaryState.PERMANENT_FAILURE
+ else:
+ status.status = SummaryState.PENDING
+
+ status.completed_at = utc_now()
+ db.commit()
+
+ return {
+ "arxiv_id": paper.arxiv_id,
+ "status": "failed",
+ "error_type": error_type,
+ "error": str(exc)[:200],
+ "retry_count": status.retry_count,
+ }
+
+
+def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
+ """从 PDF 提取图片和表格(失败不影响总结)。"""
+ try:
+ from app.services.pdf_image_extractor import (
+ extract_images_from_pdf,
+ filter_images_by_summary,
+ )
+ pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
+ extract_images_from_pdf(arxiv_id, pdf_path)
+ if schema.figures:
+ filter_images_by_summary(arxiv_id, schema.figures)
+ except Exception:
+ logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
+
+
+def _maybe_index_chroma(arxiv_id: str, paper: Paper, schema: SummarySchema) -> None:
+ """写入 ChromaDB 语义索引(失败不影响总结)。"""
+ try:
+ from app.services.embedder import index_paper
+
+ texts_dict = {
+ "arxiv_id": arxiv_id,
+ "title_zh": schema.title_zh or "",
+ "title_en": paper.title_en or "",
+ "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
+ "one_line": schema.one_line or "",
+ "motivation_problem": schema.motivation.problem or "",
+ "method_key_idea": schema.method.key_idea or "",
+ "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
+ }
+ index_paper(arxiv_id, texts_dict)
+ except Exception:
+ logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
+
+
+async def _do_summarize_one(
+ db: Session, paper: Paper, pdf_mode: str = "auto"
+) -> dict:
+ """实际的单篇总结执行(在 semaphore 保护下)。"""
+ arxiv_id = paper.arxiv_id
# 状态 → processing
- status.status = "processing"
- status.started_at = now
+ paper.summary_status.status = SummaryState.PROCESSING
+ paper.summary_status.started_at = utc_now()
db.commit()
raw_output = ""
try:
- # 写 meta.json
meta_path = write_meta_json(paper)
-
- # 下载 PDF
await download_pdf(arxiv_id, paper.pdf_url)
- # 带验证的生成循环:最多 4 轮,同一 session 内 pi 可看到之前写的文件
- json_data = None
- validation_errors = []
- session_id = None
- for attempt in range(1, 5):
- # 清理上一轮 pi 通过 write_file 写的不完整文件
- stale = paper_dir(arxiv_id) / "summary.json"
- if stale.exists():
- stale.unlink()
+ json_data, raw_output = await _generate_with_retry(
+ arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
+ pdf_mode=pdf_mode,
+ )
- if attempt == 1:
- raw_output, session_id = await call_pi(
- meta_path, Path("data/tmp") / arxiv_id / "paper.pdf"
- )
- else:
- # 验证失败,同一 session 内带着错误信息让 pi 修正
- raw_output, session_id = await call_pi(
- meta_path,
- Path("data/tmp") / arxiv_id / "paper.pdf",
- fix_errors=validation_errors,
- session_id=session_id,
- )
-
- # 优先从 pi write_file 写入的 summary.json 读取,否则从 stdout 提取
- # 如果都失败,当作验证错误,继续下一次尝试
- json_data = None
- summary_file = paper_dir(arxiv_id) / "summary.json"
- try:
- if summary_file.exists():
- json_data = json.loads(summary_file.read_text(encoding="utf-8"))
- logger.info("Read summary.json written by pi for %s", arxiv_id)
- else:
- json_data = extract_json(raw_output)
- except (json.JSONDecodeError, JsonNotFoundError) as exc:
- logger.warning(
- "JSON extraction failed for %s (attempt %d): %s",
- arxiv_id,
- attempt,
- str(exc)[:200],
- )
- validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
- continue
-
- # 运行验证脚本
- validation_errors = _validate_summary(json_data, arxiv_id)
- if not validation_errors:
- break
- logger.warning(
- "Validation failed for %s (attempt %d): %s",
- arxiv_id,
- attempt,
- "; ".join(validation_errors),
- )
-
- if validation_errors:
- raise ValueError(
- f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}"
- )
-
- # Pydantic 校验
- schema = SummarySchema.model_validate(json_data)
-
- # 质量评估
- quality = assess_quality(schema)
-
- # 保存文件
- _save_files(arxiv_id, schema, raw_output)
-
- # 更新 DB
- _update_summary_in_db(db, paper, schema, quality, raw_output)
-
- # 状态 → done
- status.status = "done"
- status.quality = quality
- status.completed_at = datetime.now(timezone.utc)
- status.raw_output_saved = True
- db.commit()
-
- # PDF 图片提取(可选增强,失败不影响总结)
- try:
- from app.services.pdf_image_extractor import (
- extract_images_from_pdf,
- filter_images_by_summary,
- )
- pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf"
- extract_images_from_pdf(arxiv_id, pdf_path)
- # 根据 summary 中 figures 字段过滤,只保留被引用的图表
- if schema.figures:
- filter_images_by_summary(arxiv_id, schema.figures)
- except Exception:
- logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
-
- # 同步写入语义索引(失败仅 log)
- try:
- from app.services.embedder import index_paper
-
- texts_dict = {
- "arxiv_id": arxiv_id,
- "title_zh": schema.title_zh or "",
- "title_en": paper.title_en or "",
- "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
- "one_line": schema.one_line or "",
- "motivation_problem": schema.motivation.problem or "",
- "method_key_idea": schema.method.key_idea or "",
- "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
- }
- index_paper(arxiv_id, texts_dict)
- except Exception:
- logger.warning(
- "Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True
- )
+ quality = _persist_summary(db, paper, json_data, raw_output)
logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
except Exception as exc:
- error_type = _classify_error(exc)
- logger.error(
- "Summarize failed: %s error_type=%s %s",
- arxiv_id,
- error_type,
- str(exc)[:200],
- )
-
- # 保存 raw_output(如果有)
- if raw_output:
- _save_raw_output_only(arxiv_id, raw_output)
- status.raw_output_saved = True
-
- # 重试逻辑
- status.retry_count = (status.retry_count or 0) + 1
- status.error_type = error_type
- status.error = str(exc)[:2000]
-
- if status.retry_count >= settings.SUMMARY_MAX_RETRIES + 1:
- status.status = "permanent_failure"
- else:
- status.status = "pending"
-
- status.completed_at = datetime.now(timezone.utc)
- db.commit()
-
- return {
- "arxiv_id": arxiv_id,
- "status": "failed",
- "error_type": error_type,
- "error": str(exc)[:200],
- "retry_count": status.retry_count,
- }
+ # 从异常对象获取 raw_output(_generate_with_retry 失败时仍有输出)
+ fail_output = getattr(exc, "raw_output", raw_output)
+ return _handle_summary_failure(db, paper, exc, fail_output)
finally:
cleanup_tmp(arxiv_id)
@@ -458,22 +469,18 @@ async def summarize_single(
arxiv_id: str,
*,
force: bool = True,
+ pdf_mode: str = "auto",
_session_factory=None,
) -> dict:
"""单篇总结入口(供 admin 路由和 CLI 调用)。
_session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。
"""
- paper = (
- db.query(Paper)
- .filter(Paper.arxiv_id == arxiv_id)
- .options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary_status),
- )
- .first()
- )
+ paper = db.execute(
+ select(Paper)
+ .where(Paper.arxiv_id == arxiv_id)
+ .options(*PAPER_DEFAULT_LOAD)
+ ).unique().scalar_one_or_none()
if not paper:
return {"status": "not_found", "arxiv_id": arxiv_id}
@@ -482,17 +489,12 @@ async def summarize_single(
# 每篇用独立 session 避免并发问题
paper_db = make_session()
try:
- paper_in_new_session = (
- paper_db.query(Paper)
- .filter(Paper.arxiv_id == arxiv_id)
- .options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary_status),
- )
- .first()
- )
- result = await summarize_one(paper_db, paper_in_new_session, force=force)
+ paper_in_new_session = paper_db.execute(
+ select(Paper)
+ .where(Paper.arxiv_id == arxiv_id)
+ .options(*PAPER_DEFAULT_LOAD)
+ ).unique().scalar_one_or_none()
+ result = await summarize_one(paper_db, paper_in_new_session, force=force, pdf_mode=pdf_mode)
finally:
paper_db.close()
@@ -506,15 +508,14 @@ async def summarize_batch(
db: Session,
arxiv_ids: list[str] | None = None,
*,
+ pdf_mode: str = "auto",
_session_factory=None,
) -> dict:
"""批量总结入口。arxiv_ids=None 时处理所有 pending 论文。
_session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。
"""
- import asyncio
-
- now = datetime.now(timezone.utc)
+ now = utc_now()
# TaskLock 防重入
lock = TaskLock(
@@ -543,20 +544,16 @@ async def summarize_batch(
try:
# 查询待总结论文
- query = db.query(Paper).options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary_status),
- )
+ stmt = select(Paper).options(*PAPER_DEFAULT_LOAD)
if arxiv_ids:
- query = query.filter(Paper.arxiv_id.in_(arxiv_ids))
+ stmt = stmt.where(Paper.arxiv_id.in_(arxiv_ids))
else:
# 只处理 pending 或 failed(可重试的)
- query = query.join(SummaryStatus).filter(
- SummaryStatus.status.in_(["pending", "failed"])
+ stmt = stmt.join(SummaryStatus).where(
+ SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.FAILED])
)
- papers = query.all()
+ papers = db.execute(stmt).unique().scalars().all()
total = len(papers)
logger.info("Summarize batch: %d papers to process", total)
@@ -564,7 +561,7 @@ async def summarize_batch(
log_entry.status = "success"
log_entry.papers_found = 0
log_entry.papers_new = 0
- log_entry.completed_at = datetime.now(timezone.utc)
+ log_entry.completed_at = utc_now()
release_lock(db, lock)
return {
"status": "success",
@@ -581,17 +578,12 @@ async def summarize_batch(
async def _process_paper(paper: Paper) -> dict:
paper_db = make_session()
try:
- p = (
- paper_db.query(Paper)
- .filter(Paper.id == paper.id)
- .options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary_status),
- )
- .first()
- )
- return await summarize_one(paper_db, p, semaphore)
+ p = paper_db.execute(
+ select(Paper)
+ .where(Paper.id == paper.id)
+ .options(*PAPER_DEFAULT_LOAD)
+ ).unique().scalar_one_or_none()
+ return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode)
finally:
paper_db.close()
@@ -619,7 +611,7 @@ async def summarize_batch(
log_entry.status = "success" if failed == 0 else "failed"
log_entry.papers_found = total
log_entry.papers_new = done
- log_entry.completed_at = datetime.now(timezone.utc)
+ log_entry.completed_at = utc_now()
db.commit()
logger.info(
@@ -641,7 +633,7 @@ async def summarize_batch(
logger.exception("Summarize batch failed")
log_entry.status = "failed"
log_entry.error = str(exc)[:2000]
- log_entry.completed_at = datetime.now(timezone.utc)
+ log_entry.completed_at = utc_now()
db.commit()
return {"status": "failed", "error": str(exc)}
diff --git a/app/services/user_data.py b/app/services/user_data.py
index fd93b18..ffac3b4 100644
--- a/app/services/user_data.py
+++ b/app/services/user_data.py
@@ -2,23 +2,24 @@
from __future__ import annotations
-from datetime import datetime, timezone
-
-from sqlalchemy import or_
+from sqlalchemy import or_, select
from sqlalchemy.orm import Session, joinedload
-from app.models import Paper, PaperTag, UserBookmark, UserNote, UserReadingStatus
+from app.models import PAPER_FULL_LOAD, Paper, PaperTag, UserBookmark, UserNote, UserReadingStatus
+from app.utils import utc_now
# ── 收藏 ──────────────────────────────────────────────────────────────
def toggle_bookmark(db: Session, arxiv_id: str) -> dict:
"""切换收藏状态。返回 {"bookmarked": bool, "arxiv_id": str}。"""
- paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first()
+ paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
if not paper:
return {"error": "not_found"}
- existing = db.query(UserBookmark).filter(UserBookmark.paper_id == paper.id).first()
+ existing = db.execute(
+ select(UserBookmark).where(UserBookmark.paper_id == paper.id)
+ ).scalar_one_or_none()
if existing:
db.delete(existing)
db.commit()
@@ -26,7 +27,7 @@ def toggle_bookmark(db: Session, arxiv_id: str) -> dict:
else:
bookmark = UserBookmark(
paper_id=paper.id,
- created_at=datetime.now(timezone.utc),
+ created_at=utc_now(),
)
db.add(bookmark)
db.commit()
@@ -43,16 +44,14 @@ def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict:
if status not in VALID_STATUSES:
return {"error": "invalid_status", "valid": sorted(VALID_STATUSES)}
- paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first()
+ paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
if not paper:
return {"error": "not_found"}
- now = datetime.now(timezone.utc)
- existing = (
- db.query(UserReadingStatus)
- .filter(UserReadingStatus.paper_id == paper.id)
- .first()
- )
+ now = utc_now()
+ existing = db.execute(
+ select(UserReadingStatus).where(UserReadingStatus.paper_id == paper.id)
+ ).scalar_one_or_none()
if existing:
existing.status = status
existing.updated_at = now
@@ -73,11 +72,13 @@ def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict:
def get_note(db: Session, arxiv_id: str) -> dict | None:
"""获取笔记。返回 {"arxiv_id", "content", "updated_at"} 或 None(论文不存在时)。"""
- paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first()
+ paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
if not paper:
return None
- note = db.query(UserNote).filter(UserNote.paper_id == paper.id).first()
+ note = db.execute(
+ select(UserNote).where(UserNote.paper_id == paper.id)
+ ).scalar_one_or_none()
if not note:
return {"arxiv_id": arxiv_id, "content": "", "updated_at": None}
@@ -90,12 +91,14 @@ def get_note(db: Session, arxiv_id: str) -> dict | None:
def save_note(db: Session, arxiv_id: str, content: str) -> dict:
"""创建或更新笔记。返回 {"arxiv_id", "content", "updated_at"}。"""
- paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first()
+ paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
if not paper:
return {"error": "not_found"}
- now = datetime.now(timezone.utc)
- existing = db.query(UserNote).filter(UserNote.paper_id == paper.id).first()
+ now = utc_now()
+ existing = db.execute(
+ select(UserNote).where(UserNote.paper_id == paper.id)
+ ).scalar_one_or_none()
if existing:
existing.content = content
existing.updated_at = now
@@ -126,7 +129,7 @@ def query_reading_list(
) -> list[Paper]:
"""根据筛选条件查询阅读列表。"""
# 基础:有任意用户数据的论文
- base = db.query(Paper).filter(
+ stmt = select(Paper).where(
or_(
Paper.bookmark.has(),
Paper.reading_status.has(),
@@ -136,25 +139,25 @@ def query_reading_list(
# 应用筛选
if filter_type == "has_note":
- base = base.filter(Paper.note.has())
+ stmt = stmt.where(Paper.note.has())
elif filter_type in ("unread", "skimmed", "read_summary", "read_full"):
- base = base.filter(
+ stmt = stmt.where(
Paper.reading_status.has(UserReadingStatus.status == filter_type)
)
# 应用标签
if tag:
- base = base.filter(Paper.tags.any(PaperTag.tag == tag))
+ stmt = stmt.where(Paper.tags.any(PaperTag.tag == tag))
return (
- base.options(
- joinedload(Paper.authors),
- joinedload(Paper.tags),
- joinedload(Paper.summary_status),
- joinedload(Paper.bookmark),
- joinedload(Paper.reading_status),
- joinedload(Paper.note),
+ db.execute(
+ stmt.options(
+ joinedload(Paper.note),
+ *PAPER_FULL_LOAD,
+ )
+ .order_by(Paper.paper_date.desc(), Paper.upvotes.desc())
)
- .order_by(Paper.paper_date.desc(), Paper.upvotes.desc())
+ .unique()
+ .scalars()
.all()
)
diff --git a/app/static/css/admin.css b/app/static/css/admin.css
new file mode 100644
index 0000000..eda6411
--- /dev/null
+++ b/app/static/css/admin.css
@@ -0,0 +1,156 @@
+/* 管理后台公共样式 — 全局链接,可被浏览器缓存 */
+/* 原 admin_styles.html 内容,改为独立 CSS 文件 */
+
+/* ── Admin Shared ─────────────────────────────────────────────── */
+.admin-page { max-width:100%; }
+
+/* subnav */
+.admin-subnav { display:flex; align-items:center; border-bottom:2px solid var(--border); margin-bottom:24px; }
+.admin-subnav-link { padding:10px 20px; font-size:.9rem; font-weight:500; color:var(--ink-light); border:none; border-bottom:2px solid transparent; margin-bottom:-2px; background:none; cursor:pointer; font-family:var(--font-sans); text-decoration:none; transition:color .2s,border-color .2s; }
+.admin-subnav-link:hover { color:var(--accent); text-decoration:none; }
+.admin-subnav-link.active { color:var(--accent); border-bottom-color:var(--accent); }
+.admin-subnav-spacer { flex:1; }
+.admin-subnav-form { margin:0; }
+.admin-subnav-logout { color:var(--ink-muted); font-weight:400; }
+.admin-subnav-logout:hover { color:#8c2828; }
+
+/* tabs */
+.admin-tabs { display:flex; border-bottom:2px solid var(--border); margin-bottom:20px; }
+.admin-tab { padding:10px 24px; border:none; background:none; font-size:.9rem; font-weight:500; color:var(--ink-light); cursor:pointer; border-bottom:2px solid transparent; margin-bottom:-2px; transition:color .2s,border-color .2s; font-family:var(--font-sans); }
+.admin-tab:hover { color:var(--accent); }
+.admin-tab.active { color:var(--accent); border-bottom-color:var(--accent); }
+.admin-tab-content { display:none; }
+.admin-tab-content.active { display:block; }
+
+/* table */
+.admin-table-wrap { overflow-x:auto; border:1px solid var(--border); border-radius:var(--radius); }
+.admin-table { width:100%; border-collapse:collapse; font-size:.85rem; background:var(--surface); }
+.admin-table th { padding:10px 12px; text-align:left; font-weight:600; color:var(--ink-light); background:var(--bg); border-bottom:1px solid var(--border); white-space:nowrap; }
+.admin-table td { padding:8px 12px; border-bottom:1px solid var(--border); color:var(--ink); vertical-align:middle; }
+.admin-table tbody tr:hover { background:var(--bg); }
+.admin-table tbody tr:last-child td { border-bottom:none; }
+.admin-table-compact { font-size:.8rem; }
+.admin-table-compact th, .admin-table-compact td { padding:6px 8px; }
+
+/* badges */
+.task-badge, .status-badge { display:inline-block; padding:2px 8px; border-radius:3px; font-size:.75rem; font-weight:500; white-space:nowrap; }
+.task-crawl { background:#e3f2fd; color:#1565c0; }
+.task-summarize { background:#f3e5f5; color:#7b1fa2; }
+.task-cleanup { background:#e8f5e9; color:#2e7d32; }
+.task-delete { background:#fce4ec; color:#c62828; }
+.task-scheduler { background:#fff3e0; color:#e65100; }
+.status-success { background:#e8f5e9; color:#388e3c; }
+.status-running { background:#e3f2fd; color:#1976d2; }
+.status-failed { background:#fce4ec; color:#c62828; }
+.time-cell { white-space:nowrap; color:var(--ink-light); }
+.error-cell { max-width:200px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; color:#c62828; font-size:.8rem; }
+
+/* action button */
+.admin-action-btn { display:inline-flex; align-items:center; gap:6px; padding:8px 18px; background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); font-size:.85rem; font-weight:500; color:var(--ink); cursor:pointer; transition:all .2s; font-family:var(--font-sans); line-height:1.4; }
+.admin-action-btn:hover { border-color:var(--accent); color:var(--accent); box-shadow:0 2px 8px var(--shadow); }
+.admin-action-btn:active { transform:translateY(1px); box-shadow:none; }
+.admin-action-btn-sm { padding:5px 12px; font-size:.8rem; }
+.admin-action-btn-danger:hover { border-color:#8c2828; color:#8c2828; }
+
+/* checkbox */
+.admin-check { appearance:none; -webkit-appearance:none; width:18px; height:18px; border:1.5px solid var(--border); border-radius:3px; background:var(--surface); cursor:pointer; vertical-align:middle; position:relative; transition:all .15s; }
+.admin-check:hover { border-color:var(--accent); }
+.admin-check:checked { background:var(--accent); border-color:var(--accent); }
+.admin-check:checked::after { content:''; position:absolute; top:2px; left:5px; width:5px; height:9px; border:solid #fff; border-width:0 2px 2px 0; transform:rotate(45deg); }
+
+/* toast */
+.admin-toast { position:fixed; bottom:24px; left:50%; transform:translateX(-50%) translateY(20px); background:var(--ink); color:var(--surface); padding:12px 24px; border-radius:var(--radius); font-size:.88rem; z-index:9999; opacity:0; transition:opacity .3s,transform .3s; max-width:400px; text-align:center; pointer-events:none; }
+.admin-toast.show { opacity:1; transform:translateX(-50%) translateY(0); }
+
+/* confirm dialog */
+.confirm-overlay { position:fixed; inset:0; background:rgba(0,0,0,.4); display:flex; align-items:center; justify-content:center; z-index:9999; }
+.confirm-dialog { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:24px; max-width:400px; width:90%; box-shadow:0 8px 32px rgba(0,0,0,.15); }
+.confirm-msg { font-size:.95rem; color:var(--ink); margin-bottom:20px; line-height:1.6; }
+.confirm-actions { display:flex; justify-content:flex-end; gap:10px; }
+.confirm-btn { padding:8px 18px; border-radius:var(--radius); font-size:.85rem; font-weight:500; cursor:pointer; border:1px solid var(--border); font-family:var(--font-sans); transition:all .15s; }
+.confirm-btn-cancel { background:var(--surface); color:var(--ink-light); }
+.confirm-btn-cancel:hover { border-color:var(--ink-light); }
+.confirm-btn-ok { background:#8c2828; color:#fff; border-color:#8c2828; }
+.confirm-btn-ok:hover { background:#a13030; }
+
+/* ── Dashboard ────────────────────────────────────────────────── */
+.stats-grid { display:grid; grid-template-columns:repeat(4,1fr); gap:16px; margin-bottom:24px; }
+.stat-card { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:20px; text-align:center; }
+.stat-value { font-family:var(--font-body); font-size:2rem; font-weight:500; color:var(--accent); line-height:1.2; }
+.stat-warn { color:#7a6430; }
+.stat-danger { color:#8c2828; }
+.stat-label { font-size:.82rem; color:var(--ink-light); margin-top:4px; }
+.admin-quick-actions { display:flex; gap:10px; flex-wrap:wrap; margin-bottom:24px; }
+.admin-info-grid { display:grid; grid-template-columns:1fr 1fr; gap:20px; margin-bottom:24px; }
+.admin-info-card { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:20px; }
+.admin-info-title { font-family:var(--font-body); font-size:1.05rem; font-weight:500; margin-bottom:16px; color:var(--ink); }
+.admin-info-body { display:flex; flex-direction:column; gap:10px; }
+.info-row { display:flex; align-items:center; gap:12px; }
+.info-label { font-size:.85rem; color:var(--ink-light); min-width:72px; flex-shrink:0; }
+.info-value { font-size:.88rem; color:var(--ink); display:flex; align-items:center; gap:6px; }
+.status-dot { display:inline-block; width:8px; height:8px; border-radius:50%; }
+.status-dot-on { background:#3d6e3d; }
+.status-dot-off { background:var(--ink-muted); }
+.scheduler-history { margin-top:20px; padding-top:16px; border-top:1px solid var(--border); }
+.section-subtitle { font-size:.9rem; font-weight:500; color:var(--ink-light); margin-bottom:10px; }
+.summary-dist { margin-top:20px; padding-top:16px; border-top:1px solid var(--border); }
+.summary-dist-bars { display:flex; flex-direction:column; gap:8px; }
+.dist-row { display:flex; align-items:center; gap:8px; }
+.dist-label { font-size:.8rem; color:var(--ink-light); min-width:60px; text-align:right; }
+.dist-bar-wrap { flex:1; height:16px; background:var(--bg); border-radius:4px; overflow:hidden; }
+.dist-bar { height:100%; border-radius:4px; min-width:2px; transition:width .3s; }
+.dist-bar-done { background:#3d6e3d; }
+.dist-bar-pending { background:#7a6430; }
+.dist-bar-running,.dist-bar-processing { background:var(--accent); }
+.dist-bar-failed,.dist-bar-permanent_failure { background:#8c2828; }
+.dist-bar-none { background:var(--ink-muted); }
+.dist-count { font-size:.8rem; color:var(--ink); font-variant-numeric:tabular-nums; min-width:28px; }
+.admin-section { margin-top:24px; }
+.admin-section-title { font-family:var(--font-body); font-size:1.1rem; font-weight:500; margin-bottom:12px; color:var(--ink); }
+
+/* ── Logs: Summary ────────────────────────────────────────────── */
+.summary-filters { display:flex; align-items:center; gap:6px; flex-wrap:wrap; margin-bottom:12px; }
+.summary-filter-label { font-size:.85rem; color:var(--ink-light); }
+.summary-filters .filter-chip { padding:4px 10px; font-size:.8rem; background:var(--surface); border:1px solid var(--border); border-radius:4px; color:var(--ink-light); cursor:pointer; transition:all .2s; font-family:var(--font-sans); }
+.summary-filters .filter-chip:hover { border-color:var(--accent); color:var(--accent); }
+.summary-filters .filter-chip.active { background:var(--accent); color:#fff; border-color:var(--accent); }
+.summary-stats-row { display:flex; gap:16px; margin-bottom:16px; flex-wrap:wrap; }
+.summary-stat { font-size:.85rem; color:var(--ink-light); }
+.summary-stat strong { font-variant-numeric:tabular-nums; }
+.summary-stat-pending strong { color:#7a6430; }
+.summary-stat-failed strong { color:#8c2828; }
+.summary-stat-done strong { color:#3d6e3d; }
+.summary-table td.title-cell { max-width:300px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
+.retry-btn { padding:3px 10px; font-size:.75rem; background:var(--surface); border:1px solid var(--border); border-radius:4px; color:var(--accent); cursor:pointer; transition:all .2s; font-family:var(--font-sans); }
+.retry-btn:hover { border-color:var(--accent); background:var(--accent); color:#fff; }
+.retry-btn:disabled { opacity:.5; cursor:not-allowed; }
+.summary-batch-actions { margin-top:16px; padding-top:16px; border-top:1px solid var(--border); }
+.admin-actions { margin-top:32px; padding-top:20px; border-top:1px solid var(--border); }
+.admin-actions-title { font-family:var(--font-body); font-size:1.1rem; font-weight:600; margin-bottom:12px; color:var(--ink); }
+.admin-action-buttons { display:flex; gap:10px; flex-wrap:wrap; }
+
+/* ── Papers ────────────────────────────────────────────────────── */
+.paper-search-form { margin-bottom:16px; }
+.paper-search-row { display:flex; gap:8px; flex-wrap:wrap; align-items:center; }
+.paper-search-input { flex:1; min-width:200px; padding:8px 14px; border:1px solid var(--border); border-radius:var(--radius); font-size:.85rem; font-family:var(--font-sans); background:var(--surface); color:var(--ink); }
+.paper-search-input:focus { outline:none; border-color:var(--accent); }
+.paper-filter-input { padding:8px 10px; border:1px solid var(--border); border-radius:var(--radius); font-size:.82rem; font-family:var(--font-sans); background:var(--surface); color:var(--ink); }
+.paper-filter-input:focus { outline:none; border-color:var(--accent); }
+.paper-search-btn { padding:8px 18px; background:var(--accent); color:#fff; border:none; border-radius:var(--radius); font-size:.85rem; font-weight:500; cursor:pointer; font-family:var(--font-sans); transition:background .2s; }
+.paper-search-btn:hover { background:var(--accent-hover); }
+.paper-batch-bar { display:flex; align-items:center; gap:12px; padding:10px 0; margin-bottom:8px; border-bottom:1px solid var(--border); }
+.paper-batch-label { font-size:.85rem; color:var(--ink-light); }
+.paper-selected-count { font-size:.82rem; color:var(--ink-muted); }
+.th-check { width:40px; text-align:center; }
+.title-cell { max-width:400px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
+.title-cell a { color:var(--ink); }
+.title-cell a:hover { color:var(--accent); }
+.action-cell { white-space:nowrap; }
+.action-btn-sm { display:inline-flex; align-items:center; justify-content:center; width:28px; height:28px; background:var(--surface); border:1px solid var(--border); border-radius:4px; font-size:.85rem; color:var(--ink-light); cursor:pointer; transition:all .15s; padding:0; vertical-align:middle; }
+.action-btn-sm:hover { border-color:var(--accent); color:var(--accent); }
+.action-btn-danger:hover { border-color:#8c2828; color:#8c2828; }
+
+/* ── Responsive ────────────────────────────────────────────────── */
+@media (max-width:880px) { .stats-grid{grid-template-columns:repeat(2,1fr);} .admin-info-grid{grid-template-columns:1fr;} }
+@media (max-width:640px) { .admin-table{font-size:.8rem;} .admin-table th,.admin-table td{padding:6px 8px;} .admin-action-buttons{flex-direction:column;} .admin-action-btn{width:100%;justify-content:center;} .paper-search-row{flex-direction:column;} .paper-search-input,.paper-filter-input,.paper-search-btn{width:100%;} .paper-batch-bar{flex-wrap:wrap;gap:8px;} }
+@media (max-width:480px) { .stats-grid{grid-template-columns:1fr 1fr;} .stat-value{font-size:1.5rem;} .admin-quick-actions{flex-direction:column;} }
diff --git a/app/static/css/style.css b/app/static/css/style.css
index fb57d22..5a5d2e2 100644
--- a/app/static/css/style.css
+++ b/app/static/css/style.css
@@ -1073,3 +1073,110 @@ mark {
.motivation-block p {
margin-bottom: 0.8rem;
}
+
+/* ── Login ──────────────────────────────────────────────────────── */
+
+.login-page {
+ display: flex;
+ justify-content: center;
+ align-items: center;
+ min-height: 60vh;
+ padding: 40px 16px;
+}
+
+.login-card {
+ width: 100%;
+ max-width: 400px;
+ background: var(--surface);
+ border: 1px solid var(--border);
+ border-radius: var(--radius-lg);
+ padding: 36px 32px;
+ box-shadow: 0 4px 24px var(--shadow);
+}
+
+.login-header {
+ text-align: center;
+ margin-bottom: 28px;
+}
+
+.login-title {
+ font-family: var(--font-body);
+ font-size: 1.4rem;
+ font-weight: 700;
+ color: var(--ink);
+ margin: 0 0 8px;
+}
+
+.login-subtitle {
+ font-size: 0.9rem;
+ color: var(--ink-light);
+ margin: 0;
+}
+
+.login-error {
+ background: #fce4ec;
+ color: #c62828;
+ padding: 10px 14px;
+ border-radius: var(--radius);
+ font-size: 0.85rem;
+ margin-bottom: 20px;
+ text-align: center;
+}
+
+.login-form {
+ display: flex;
+ flex-direction: column;
+ gap: 18px;
+}
+
+.login-field label {
+ display: block;
+ font-size: 0.85rem;
+ font-weight: 600;
+ color: var(--ink);
+ margin-bottom: 6px;
+}
+
+.login-field input {
+ width: 100%;
+ padding: 10px 14px;
+ border: 1px solid var(--border);
+ border-radius: var(--radius);
+ font-size: 0.9rem;
+ font-family: var(--font-sans);
+ background: var(--bg);
+ color: var(--ink);
+ transition: border-color 0.2s;
+ box-sizing: border-box;
+}
+
+.login-field input:focus {
+ outline: none;
+ border-color: var(--accent);
+ box-shadow: 0 0 0 3px rgba(27, 54, 93, 0.1);
+}
+
+.login-btn {
+ width: 100%;
+ padding: 12px;
+ background: var(--accent);
+ color: #fff;
+ border: none;
+ border-radius: var(--radius);
+ font-size: 0.95rem;
+ font-weight: 600;
+ cursor: pointer;
+ transition: background 0.2s;
+ font-family: var(--font-sans);
+ margin-top: 4px;
+}
+
+.login-btn:hover {
+ background: var(--accent-hover);
+}
+
+@media (max-width: 480px) {
+ .login-card {
+ padding: 28px 20px;
+ }
+}
diff --git a/app/static/js/app.js b/app/static/js/app.js
index 5eb7142..214362a 100644
--- a/app/static/js/app.js
+++ b/app/static/js/app.js
@@ -1,11 +1,10 @@
-/* app.js — 基础前端交互 */
+/* app.js — 基础前端交互 + 管理后台共享工具 */
// Ctrl+K 或 / 聚焦搜索框
document.addEventListener("keydown", function (e) {
var input = document.querySelector(".nav-search-input");
if (!input) return;
- // 忽略在输入框内的按键
if (e.target.tagName === "INPUT" || e.target.tagName === "TEXTAREA") return;
if ((e.ctrlKey || e.metaKey) && e.key === "k") {
@@ -16,3 +15,49 @@ document.addEventListener("keydown", function (e) {
input.focus();
}
});
+
+// ── Toast 通知(管理后台共享)──────────────────────────────────────────
+
+function showToast(msg, opts) {
+ opts = opts || {};
+ var duration = opts.duration || 2500;
+ var callback = opts.callback || null;
+
+ var t = document.createElement("div");
+ t.className = "admin-toast";
+ t.textContent = String(msg).substring(0, 200);
+ document.body.appendChild(t);
+ requestAnimationFrame(function () { t.classList.add("show"); });
+ setTimeout(function () {
+ t.classList.remove("show");
+ setTimeout(function () {
+ t.remove();
+ if (typeof callback === "function") callback();
+ }, 300);
+ }, duration);
+}
+
+// ── Admin 通用操作(管理后台共享)───────────────────────────────────────
+
+function adminAction(action, callback) {
+ fetch("/admin/" + action, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ })
+ .then(function (r) {
+ if (r.status === 303 || r.status === 401) {
+ window.location.href = "/admin/login";
+ return;
+ }
+ return r.json();
+ })
+ .then(function (data) {
+ if (data) {
+ showToast(data.error ? "❌ " + data.error.substring(0, 200) : "✅ 操作成功");
+ if (typeof callback === "function") callback(data);
+ }
+ })
+ .catch(function (err) {
+ showToast("❌ 请求失败");
+ });
+}
diff --git a/app/static/js/lightbox.js b/app/static/js/lightbox.js
new file mode 100644
index 0000000..ec77c6a
--- /dev/null
+++ b/app/static/js/lightbox.js
@@ -0,0 +1,159 @@
+/* lightbox.js — 图片查看器:缩放、拖拽、键盘操作 */
+
+(function() {
+ function openLightbox(src, alt) {
+ var existing = document.querySelector('.lightbox-overlay');
+ if (existing) existing.remove();
+
+ var overlay = document.createElement('div');
+ overlay.className = 'lightbox-overlay';
+
+ var img = document.createElement('img');
+ img.src = src;
+ img.alt = alt || '';
+ img.draggable = false;
+
+ // 工具栏
+ var toolbar = document.createElement('div');
+ toolbar.className = 'lightbox-toolbar';
+ toolbar.innerHTML =
+ '− ' +
+ '+ ' +
+ '⊡ ' +
+ '1:1 ' +
+ '✕ ';
+
+ overlay.appendChild(img);
+ overlay.appendChild(toolbar);
+ document.body.appendChild(overlay);
+
+ // 视图状态
+ var scale = 1, tx = 0, ty = 0;
+ var baseW = 0, baseH = 0;
+ var dragging = false, dragStartX = 0, dragStartY = 0, startTx = 0, startTy = 0;
+
+ function apply() {
+ img.style.transform = 'translate(' + tx + 'px,' + ty + 'px) scale(' + scale + ')';
+ }
+
+ function fitToScreen() {
+ if (!baseW) return;
+ var sw = window.innerWidth, sh = window.innerHeight;
+ scale = Math.min(sw * 0.9 / baseW, sh * 0.9 / baseH, 1);
+ tx = (sw - baseW * scale) / 2;
+ ty = (sh - baseH * scale) / 2;
+ apply();
+ }
+
+ function resetOrigin() {
+ scale = 1;
+ tx = (window.innerWidth - baseW) / 2;
+ ty = (window.innerHeight - baseH) / 2;
+ apply();
+ }
+
+ function zoomAt(factor, cx, cy) {
+ var newScale = Math.max(0.1, Math.min(scale * factor, 20));
+ tx = cx - (cx - tx) * (newScale / scale);
+ ty = cy - (cy - ty) * (newScale / scale);
+ scale = newScale;
+ apply();
+ }
+
+ function zoomCenter(factor) {
+ var cx = window.innerWidth / 2;
+ var cy = window.innerHeight / 2;
+ var newScale = Math.max(0.1, Math.min(scale * factor, 20));
+ tx = cx - (cx - tx) * (newScale / scale);
+ ty = cy - (cy - ty) * (newScale / scale);
+ scale = newScale;
+ apply();
+ }
+
+ // 图片加载后初始化
+ img.onload = function() {
+ baseW = img.naturalWidth;
+ baseH = img.naturalHeight;
+ fitToScreen();
+ };
+ // 如果已缓存
+ if (img.complete && img.naturalWidth) {
+ baseW = img.naturalWidth;
+ baseH = img.naturalHeight;
+ fitToScreen();
+ }
+
+ // 工具栏按钮(缩小 / 放大 / 适合 / 原始 / 关闭)
+ var btns = toolbar.querySelectorAll('button');
+ btns[0].onclick = function(e) { e.stopPropagation(); zoomCenter(0.7); };
+ btns[1].onclick = function(e) { e.stopPropagation(); zoomCenter(1.4); };
+ btns[2].onclick = function(e) { e.stopPropagation(); fitToScreen(); };
+ btns[3].onclick = function(e) { e.stopPropagation(); resetOrigin(); };
+ btns[4].onclick = function(e) { e.stopPropagation(); close(); };
+
+ // 滚轮缩放(以鼠标为中心)
+ overlay.addEventListener('wheel', function(e) {
+ e.preventDefault();
+ var factor = e.deltaY < 0 ? 1.15 : 0.87;
+ var rect = overlay.getBoundingClientRect();
+ var cx = e.clientX - rect.left;
+ var cy = e.clientY - rect.top;
+ var newScale = Math.max(0.1, Math.min(scale * factor, 20));
+ tx = cx - (cx - tx) * (newScale / scale);
+ ty = cy - (cy - ty) * (newScale / scale);
+ scale = newScale;
+ apply();
+ }, { passive: false });
+
+ // 拖拽平移
+ overlay.addEventListener('pointerdown', function(e) {
+ if (e.target.closest('.lightbox-toolbar')) return;
+ dragging = true;
+ dragStartX = e.clientX;
+ dragStartY = e.clientY;
+ startTx = tx;
+ startTy = ty;
+ img.classList.add('dragging');
+ overlay.setPointerCapture(e.pointerId);
+ });
+ overlay.addEventListener('pointermove', function(e) {
+ if (!dragging) return;
+ tx = startTx + (e.clientX - dragStartX);
+ ty = startTy + (e.clientY - dragStartY);
+ apply();
+ });
+ overlay.addEventListener('pointerup', function() {
+ dragging = false;
+ img.classList.remove('dragging');
+ });
+
+ // ESC 关闭
+ function onKey(e) {
+ if (e.key === 'Escape') { close(); }
+ else if (e.key === '+' || e.key === '=') { zoomCenter(1.4); }
+ else if (e.key === '-') { zoomCenter(0.7); }
+ else if (e.key === '0') { fitToScreen(); }
+ }
+
+ function close() {
+ overlay.remove();
+ document.removeEventListener('keydown', onKey);
+ }
+
+ document.addEventListener('keydown', onKey);
+
+ // 激活动画
+ requestAnimationFrame(function() {
+ overlay.classList.add('active');
+ });
+ }
+
+ document.addEventListener('click', function(e) {
+ var img = e.target;
+ if (img.tagName !== 'IMG') return;
+ if (!img.closest('.inline-figure') && !img.closest('.gallery-item')) return;
+ if (img.closest('.lightbox-overlay')) return;
+ e.preventDefault();
+ openLightbox(img.src, img.alt);
+ });
+})();
diff --git a/app/templates/admin_dashboard.html b/app/templates/admin_dashboard.html
new file mode 100644
index 0000000..862b13e
--- /dev/null
+++ b/app/templates/admin_dashboard.html
@@ -0,0 +1,185 @@
+{% extends "base.html" %}
+{% block title %}管理仪表盘 — HF Daily Papers{% endblock %}
+{% block content %}
+
+ {% set active = "dashboard" %}{% include "partials/admin_subnav.html" %}
+
+
📊 系统状态
+
+
+
+
{{ stats.total_papers }}
+
论文总数
+
+
+
{{ stats.today_papers }}
+
今日新增
+
+
+
+ {{ stats.pending_count + stats.none_count }}
+
+
待总结
+
+
+
+ {{ stats.failed_count }}
+
+
总结失败
+
+
+
+
+ 🔄 抓取今天
+ 📝 批量总结
+ 🧹 清理临时文件
+
+
+
+
+
🕐 调度器
+
+
+ 状态
+
+ {% if stats.scheduler_enabled %}
+ 运行中
+ {% else %}
+ 未启用
+ {% endif %}
+
+
+
+ 调度时间
+ {{ stats.schedule_time }}({{ stats.timezone }})
+
+ {% if stats.next_run %}
+
+ 下次执行
+ {{ stats.next_run[:19] | replace('T', ' ') }}
+
+ {% endif %}
+ {% if stats.active_locks %}
+
+ 活跃任务
+
+ {% for lock in stats.active_locks %}
+ {{ lock.task }}
+ {% endfor %}
+
+
+ {% endif %}
+
+
+
+ ▶ 立即执行流水线
+
+
+
+
+
执行历史
+ {% if scheduler_history %}
+
+
+
+ 时间 状态 发现 新增 错误
+
+
+ {% for log in scheduler_history %}
+
+ {{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}
+
+ {% if log.status == 'success' %}✓{% elif log.status == 'running' %}⟳{% elif log.status == 'failed' %}✗{% else %}{{ log.status }}{% endif %}
+
+ {{ log.papers_found or 0 }}
+ {{ log.papers_new or 0 }}
+
+ {{ (log.error[:50] + '...') if log.error and log.error|length > 50 else (log.error or '-') }}
+
+
+ {% endfor %}
+
+
+
+ {% else %}
+
暂无调度器执行记录。
+ {% endif %}
+
+
+
+
+
💾 存储概况
+
+
数据库 {{ stats.db_size }}
+
论文文件 {{ stats.papers_size }}
+
临时文件 {{ stats.tmp_size }}
+
+
+
总结状态分布
+
+ {% set total = stats.total_papers or 1 %}
+ {% set labels = {"done": "已完成", "pending": "待总结", "running": "运行中", "processing": "处理中", "failed": "失败", "permanent_failure": "永久失败", "none": "未开始"} %}
+ {% for st, cnt in stats.status_counts.items() %}
+ {% if cnt > 0 %}
+
+
{{ labels.get(st, st) }}
+
+
{{ cnt }}
+
+ {% endif %}
+ {% endfor %}
+
+
+
+
+
+
+
📋 最近活动
+ {% if stats.recent_logs %}
+
+
+
+ 任务 状态 日期 发现 新增 开始时间 完成时间 错误
+
+
+ {% for log in stats.recent_logs %}
+
+ {{ log.task }}
+
+ {# djlint:off #}
+ {% if log.status == 'success' %}✓ 成功{% elif log.status == 'running' %}⟳ 运行中{% elif log.status == 'failed' %}✗ 失败{% else %}{{ log.status }}{% endif %}
+ {# djlint:on #}
+
+ {{ log.date or '-' }}
+ {{ log.papers_found or 0 }}
+ {{ log.papers_new or 0 }}
+ {{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}
+ {{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at else '-' }}
+
+ {{ (log.error[:60] + '...') if log.error and log.error|length > 60 else (log.error or '-') }}
+
+
+ {% endfor %}
+
+
+
+ {% else %}
+
+
暂无活动日志
+
通过快捷操作触发任务后,日志将出现在这里。
+
+ {% endif %}
+
+
+{% endblock %}
+
+{% block scripts %}
+
+{% endblock %}
diff --git a/app/templates/admin_logs.html b/app/templates/admin_logs.html
index f7d0bf5..24ea1fd 100644
--- a/app/templates/admin_logs.html
+++ b/app/templates/admin_logs.html
@@ -1,68 +1,43 @@
-{% extends "base.html" %} {% block title %}管理日志 — HF Daily Papers{% endblock
-%} {% block content %}
-
+{% extends "base.html" %}
+{% block title %}管理日志 — HF Daily Papers{% endblock %}
+{% block content %}
+
+ {% set active = "logs" %}{% include "partials/admin_subnav.html" %}
+
📋 管理日志
抓取日志
删除记录
+ 总结状态
-
+
{% if crawl_logs %}
-
- ID
- 任务
- 状态
- 日期
- 发现
- 新增
- 开始时间
- 完成时间
- 错误
-
+ ID 任务 状态 日期 发现 新增 开始时间 完成时间 错误
{% for log in crawl_logs %}
{{ log.id }}
-
- {{ log.task }}
-
-
-
- {# djlint:off #}
- {% if log.status == 'success' %}
- ✓ 成功
- {% elif log.status == 'running' %}
- ⟳ 运行中
- {% elif log.status == 'failed' %}
- ✗ 失败
- {% else %}
- {{ log.status }}
- {% endif %}
- {# djlint:on #}
-
-
+ {{ log.task }}
+
+ {# djlint:off #}
+ {% if log.status == 'success' %}✓ 成功{% elif log.status == 'running' %}⟳ 运行中{% elif log.status == 'failed' %}✗ 失败{% else %}{{ log.status }}{% endif %}
+ {# djlint:on #}
+
{{ log.date or '-' }}
{{ log.papers_found or 0 }}
{{ log.papers_new or 0 }}
-
- {{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else
- '-' }}
-
-
- {{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at
- else '-' }}
-
+ {{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}
+ {{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at else '-' }}
- {{ log.error[:80] + '...' if log.error and log.error|length > 80
- else (log.error or '-') }}
+ {{ log.error[:80] + '...' if log.error and log.error|length > 80 else (log.error or '-') }}
{% endfor %}
@@ -77,23 +52,13 @@
{% endif %}
-
+
{% if delete_jobs %}
-
- ID
- 起始日期
- 结束日期
- 包含笔记
- 论文数
- 状态
- 开始时间
- 完成时间
- 错误
-
+ ID 起始日期 结束日期 包含笔记 论文数 状态 开始时间 完成时间 错误
{% for job in delete_jobs %}
@@ -103,32 +68,15 @@
{{ job.date_end }}
{{ '是' if job.include_notes else '否' }}
{{ job.paper_count or 0 }}
-
-
- {# djlint:off #}
- {% if job.status == 'success' %}
- ✓ 成功
- {% elif job.status == 'running' %}
- ⟳ 运行中
- {% elif job.status == 'failed' %}
- ✗ 失败
- {% else %}
- {{ job.status }}
- {% endif %}
- {# djlint:on #}
-
-
-
- {{ job.started_at.strftime('%m-%d %H:%M') if job.started_at else
- '-' }}
-
-
- {{ job.completed_at.strftime('%m-%d %H:%M') if job.completed_at
- else '-' }}
-
+
+ {# djlint:off #}
+ {% if job.status == 'success' %}✓ 成功{% elif job.status == 'running' %}⟳ 运行中{% elif job.status == 'failed' %}✗ 失败{% else %}{{ job.status }}{% endif %}
+ {# djlint:on #}
+
+ {{ job.started_at.strftime('%m-%d %H:%M') if job.started_at else '-' }}
+ {{ job.completed_at.strftime('%m-%d %H:%M') if job.completed_at else '-' }}
- {{ job.error[:80] + '...' if job.error and job.error|length > 80
- else (job.error or '-') }}
+ {{ job.error[:80] + '...' if job.error and job.error|length > 80 else (job.error or '-') }}
{% endfor %}
@@ -143,259 +91,81 @@
{% endif %}
+
+
+
+ 筛选:
+ 全部
+ 未开始
+ 待总结
+ 运行中
+ 失败
+ 永久失败
+ 已完成
+
+
+ 全部 {{ summary_total or 0 }}
+ 待总结 {{ summary_pending or 0 }}
+ 失败 {{ summary_failed or 0 }}
+ 已完成 {{ summary_done or 0 }}
+
+
+
+ 🔄 重试所有失败
+
+
+
管理操作
-
- 🔄 抓取今天
-
-
- 📝 批量总结
-
-
- 🧹 清理临时文件
-
+ 🔄 抓取今天
+ 📝 批量总结
+ 🧹 清理临时文件
+{% endblock %}
-
-{% endblock %} {% block scripts %}
+{% block scripts %}
{% endblock %}
diff --git a/app/templates/admin_papers.html b/app/templates/admin_papers.html
new file mode 100644
index 0000000..d1ac639
--- /dev/null
+++ b/app/templates/admin_papers.html
@@ -0,0 +1,171 @@
+{% extends "base.html" %}
+{% block title %}论文管理 — HF Daily Papers{% endblock %}
+{% block content %}
+
+ {% set active = "papers" %}{% include "partials/admin_subnav.html" %}
+
+
📄 论文管理
+
+
+
+
+
+
+ 批量操作
+ 已选 0 篇
+ 📝 批量总结
+ 🗑 批量删除
+
+
+ {% if papers %}
+
+
+ {% set total_pages = ((total + per_page - 1) // per_page) if total else 1 %}
+ {% if total_pages > 1 %}
+
+ {% endif %}
+ {% else %}
+
+
没有找到匹配的论文
+
调整搜索条件或清除筛选。
+
+ {% endif %}
+
+
+
+
+
+
确定删除?
+
+ 取消
+ 确定删除
+
+
+
+{% endblock %}
+
+{% block scripts %}
+
+{% endblock %}
diff --git a/app/templates/base.html b/app/templates/base.html
index 9ac7ee7..13ae633 100644
--- a/app/templates/base.html
+++ b/app/templates/base.html
@@ -6,7 +6,9 @@
{% block title %}HF Daily Papers{% endblock %}
+ {% if is_admin %} {% endif %}
+ {% block head_style %}{% endblock %}