feat: add admin dashboard, pipeline service, lightbox, and update dependencies

This commit is contained in:
2026-06-09 09:32:10 +08:00
parent 0d293422ac
commit 32978b3fc5
50 changed files with 4054 additions and 1618 deletions
+424 -22
View File
@@ -1,23 +1,38 @@
"""管理接口 — 抓取、总结、清理、删除、日志,需要登录鉴权。"""
"""管理接口 — 仪表盘、抓取、总结、清理、删除、日志,需要登录鉴权。"""
from __future__ import annotations
import hashlib
from datetime import date, datetime, timezone
import json
import logging
from datetime import date
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
from fastapi.responses import RedirectResponse
from pydantic import BaseModel, field_validator
from sqlalchemy import select
from sqlalchemy import func, select, text
from sqlalchemy.orm import Session
from app.config import settings
from app.database import get_db
from app.models import CrawlLog, DataDeleteJob, TaskLock
from app.models import (
CrawlLog,
DataDeleteJob,
Paper,
PaperTag,
SummaryState,
SummaryStatus,
TaskLock,
)
from app.services.admin import get_admin_stats
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily
from app.services.pipeline import run_pipeline
from app.services.scheduler import get_scheduler
from app.services.summarizer import summarize_batch, summarize_single
from app.utils import release_lock, templates, today_str
from app.utils import release_lock, templates, today_str, utc_now
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/admin", tags=["admin"])
@@ -42,12 +57,6 @@ async def verify_admin(request: Request) -> None:
raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
def verify_admin_page(request: Request) -> None:
    """Page-level auth guard (synchronous variant, for template routes).

    Returns silently when the session carries a truthy ``is_admin`` flag;
    otherwise raises an ``HTTPException`` with status 303 whose ``Location``
    header points at the login page.
    """
    logged_in = request.session.get("is_admin")
    if logged_in:
        return
    raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
# ── 登录 / 登出 ──────────────────────────────────────────────────────
@@ -55,7 +64,7 @@ def verify_admin_page(request: Request) -> None:
async def admin_login_page(request: Request):
"""显示登录页面。已登录则直接跳转管理页。"""
if request.session.get("is_admin"):
return RedirectResponse("/admin/logs", status_code=303)
return RedirectResponse("/admin/", status_code=303)
return templates.TemplateResponse(request, "login.html", {"error": None})
@@ -68,7 +77,7 @@ async def admin_login_submit(
"""处理登录表单提交。"""
if username == settings.ADMIN_USERNAME and _check_password(password):
request.session["is_admin"] = True
return RedirectResponse("/admin/logs", status_code=303)
return RedirectResponse("/admin/", status_code=303)
return templates.TemplateResponse(
request, "login.html", {"error": "用户名或密码错误"}
)
@@ -81,6 +90,75 @@ async def admin_logout(request: Request):
return RedirectResponse("/admin/login", status_code=303)
# ── 仪表盘 ──────────────────────────────────────────────────────────
@router.get("/")
async def admin_dashboard(
    request: Request,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Render the admin dashboard: a system status overview.

    Collects aggregate statistics through ``get_admin_stats`` and the ten
    most recently started ``CrawlLog`` rows whose ``task`` is ``"scheduler"``,
    then renders ``admin_dashboard.html`` with both.
    """
    # Scheduler history: newest first, capped at 10 rows.
    recent_scheduler_runs = (
        select(CrawlLog)
        .where(CrawlLog.task == "scheduler")
        .order_by(CrawlLog.started_at.desc())
        .limit(10)
    )
    context = {
        "stats": get_admin_stats(db),
        "scheduler_history": db.execute(recent_scheduler_runs).scalars().all(),
    }
    return templates.TemplateResponse(request, "admin_dashboard.html", context)
# ── 调度器 ──────────────────────────────────────────────────────────
@router.get("/scheduler-status")
async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
    """Report the scheduler's state as a JSON-serialisable dict.

    Returns:
        A dict with ``enabled`` (whether ``get_scheduler()`` returned an
        object), ``schedule_time`` (``HH:MM`` built from the configured hour
        and minute), ``timezone`` and ``next_run`` (ISO string of the
        ``daily_pipeline`` job's next run time, or ``None``).
    """
    scheduler = get_scheduler()
    jobs = scheduler.get_jobs() if scheduler else []
    # First job with the expected id wins; None when it is not registered.
    next_run = next(
        (job.next_run_time for job in jobs if job.id == "daily_pipeline"),
        None,
    )
    schedule_time = f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}"
    return {
        "enabled": scheduler is not None,
        "schedule_time": schedule_time,
        "timezone": settings.APP_TIMEZONE,
        "next_run": next_run.isoformat() if next_run else None,
    }
@router.post("/trigger-pipeline")
async def admin_trigger_pipeline(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Manually trigger one full pipeline run (crawl -> summarize -> cleanup).

    Runs ``run_pipeline`` for today's date with ``owner="admin_trigger"``.

    Raises:
        HTTPException: 409 when ``run_pipeline`` raises ``RuntimeError``;
            500 when the returned result has ``status == "failed"``.
    """
    today = today_str()
    try:
        result = await run_pipeline(db, today, owner="admin_trigger")
    except RuntimeError as exc:
        # Chain the original error so the traceback keeps the real cause.
        raise HTTPException(status_code=409, detail=str(exc)) from exc

    if result["status"] == "failed":
        raise HTTPException(status_code=500, detail=result.get("error"))

    return {"status": "success", "message": "流水线执行完成"}
# ── 请求模型 ──────────────────────────────────────────────────────────
@@ -111,7 +189,7 @@ async def admin_crawl(
target_date = date or today_str()
# TaskLock 防重入
now = datetime.now(timezone.utc)
now = utc_now()
lock = TaskLock(
task="crawl",
lock_key=target_date,
@@ -146,7 +224,7 @@ async def admin_summarize_batch(
db: Session = Depends(get_db),
):
"""批量总结所有 pending 论文。"""
result = await summarize_batch(db)
result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
if result.get("status") == "conflict":
raise HTTPException(
status_code=409, detail=result.get("error", "batch already running")
@@ -161,7 +239,7 @@ async def admin_summarize_single(
db: Session = Depends(get_db),
):
"""总结或重跑单篇论文。"""
result = await summarize_single(db, arxiv_id, force=True)
result = await summarize_single(db, arxiv_id, force=True, pdf_mode=settings.SUMMARY_PDF_MODE)
if result.get("status") == "not_found":
raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
return result
@@ -176,7 +254,7 @@ async def admin_cleanup(
db: Session = Depends(get_db),
):
"""清理 data/tmp/ 中超过 24 小时的临时文件。"""
now = datetime.now(timezone.utc)
now = utc_now()
log_entry = CrawlLog(
task="cleanup",
status="running",
@@ -188,9 +266,11 @@ async def admin_cleanup(
try:
result = cleanup_tmp()
log_entry.status = "success"
log_entry.completed_at = datetime.now(timezone.utc)
log_entry.papers_found = result.get("scanned", 0)
log_entry.papers_new = result.get("removed", 0)
log_entry.completed_at = utc_now()
log_entry.details_json = json.dumps({
"scanned": result.get("scanned", 0),
"removed": result.get("removed", 0),
}, ensure_ascii=False)
if result.get("errors"):
log_entry.error = "; ".join(result["errors"])[:2000]
db.commit()
@@ -198,7 +278,7 @@ async def admin_cleanup(
except Exception as exc:
log_entry.status = "failed"
log_entry.error = str(exc)[:2000]
log_entry.completed_at = datetime.now(timezone.utc)
log_entry.completed_at = utc_now()
db.commit()
raise HTTPException(status_code=500, detail=str(exc))
@@ -236,7 +316,7 @@ async def admin_logs(
page: int = Query(1, ge=1),
per_page: int = Query(20, ge=1, le=100),
):
"""查看任务日志(CrawlLog + DataDeleteJob)。"""
"""查看任务日志(CrawlLog + DataDeleteJob+ 总结状态统计"""
crawl_logs = (
db.execute(
select(CrawlLog)
@@ -259,6 +339,22 @@ async def admin_logs(
.all()
)
# 总结状态统计概要
summary_total = db.scalar(select(func.count(Paper.id))) or 0
summary_done = db.scalar(
select(func.count(SummaryStatus.id)).where(SummaryStatus.status == SummaryState.DONE)
) or 0
summary_pending = db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.PROCESSING])
)
) or 0
summary_failed = db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE])
)
) or 0
return templates.TemplateResponse(
request,
"admin_logs.html",
@@ -267,5 +363,311 @@ async def admin_logs(
"delete_jobs": delete_jobs,
"page": page,
"per_page": per_page,
"summary_total": summary_total,
"summary_done": summary_done,
"summary_pending": summary_pending,
"summary_failed": summary_failed,
},
)
# ── 总结状态管理 ────────────────────────────────────────────────────
@router.get("/summary-status")
async def admin_summary_status(
    request: Request,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    status: str = Query("all"),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """List papers with their summary status (HTMX fragment or JSON).

    ``status`` filters the list: ``"all"`` applies no filter, ``"none"``
    keeps papers that have no ``SummaryStatus`` row, and any other value is
    compared against ``SummaryStatus.status``. Requests carrying the header
    ``HX-Request: true`` get the ``partials/summary_list.html`` fragment;
    everything else gets a JSON-serialisable dict.
    """
    stmt = (
        select(Paper, SummaryStatus)
        .outerjoin(SummaryStatus, SummaryStatus.paper_id == Paper.id)
        .order_by(Paper.paper_date.desc())
    )
    if status == "none":
        # Outer join produced no status row for this paper.
        stmt = stmt.where(SummaryStatus.paper_id.is_(None))
    elif status != "all":
        stmt = stmt.where(SummaryStatus.status == status)

    total = db.scalar(select(func.count()).select_from(stmt.subquery())) or 0
    offset = (page - 1) * per_page
    rows = db.execute(stmt.offset(offset).limit(per_page)).all()

    # HTMX requests receive an HTML fragment.
    if request.headers.get("HX-Request") == "true":
        return templates.TemplateResponse(
            request,
            "partials/summary_list.html",
            {
                "results": rows,
                "total": total,
                "page": page,
                "per_page": per_page,
                "current_status": status,
            },
        )

    # Everything else receives JSON.
    items = [
        {
            "arxiv_id": paper.arxiv_id,
            "title": paper.title_zh or paper.title_en,
            "paper_date": str(paper.paper_date),
            "summary_status": ss.status if ss else "none",
            "retry_count": ss.retry_count if ss else 0,
            "error_type": ss.error_type if ss else None,
            "error": ss.error if ss else None,
        }
        for paper, ss in rows
    ]
    return {"items": items, "total": total, "page": page, "per_page": per_page}
@router.post("/summary-retry-failed")
async def admin_summary_retry_failed(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Reset every failed summary task back to the pending state.

    Looks up the arXiv ids of papers whose summary status is ``FAILED`` or
    ``PERMANENT_FAILURE``. When there are any, a bulk update sets those
    status rows to ``PENDING`` and clears ``error`` / ``error_type``.

    Returns:
        A dict with ``status``, a human-readable ``message`` and ``count``
        (number of ids found by the initial lookup).
    """
    failed_states = [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]

    lookup = (
        select(Paper.arxiv_id)
        .join(SummaryStatus, SummaryStatus.paper_id == Paper.id)
        .where(SummaryStatus.status.in_(failed_states))
    )
    failed_ids = db.execute(lookup).scalars().all()
    found = len(failed_ids)

    if found == 0:
        return {"status": "success", "message": "没有失败的任务需要重试", "count": 0}

    # Flip all failed rows to pending in a single statement.
    reset_stmt = (
        SummaryStatus.__table__.update()
        .where(SummaryStatus.status.in_(failed_states))
        .values(status=SummaryState.PENDING, error=None, error_type=None)
    )
    db.execute(reset_stmt)
    db.commit()

    return {
        "status": "success",
        "message": f"已重置 {found} 个失败任务为待总结状态",
        "count": found,
    }
# ── 论文管理 ────────────────────────────────────────────────────────
# Sort mapping: translates the ``sort`` query value accepted by the paper
# list endpoint into the ORDER BY expression applied to the Paper query.
_SORT_MAP = {
"date_desc": Paper.paper_date.desc(),
"date_asc": Paper.paper_date.asc(),
"upvotes_desc": Paper.upvotes.desc(),
"title_asc": Paper.title_en.asc(),
}
@router.get("/papers")
async def admin_papers(
    request: Request,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    q: str = Query("", description="搜索标题/摘要"),
    date_from: str | None = Query(None),
    date_to: str | None = Query(None),
    tag: str = Query(""),
    summary_status: str = Query("all"),
    sort: str = Query("date_desc"),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Render the paper management list page.

    Filters are applied in this order: free-text search over English title,
    Chinese title and abstract (case-insensitive substring match), date
    range, tag, and summary status (``"all"`` = no filter, ``"none"`` =
    papers without a ``SummaryStatus`` row, anything else is matched against
    ``SummaryStatus.status``). Results are sorted via ``_SORT_MAP`` (unknown
    keys fall back to newest first), counted, and paginated.

    The template context also receives ``paper_summary_statuses`` (arXiv id
    -> status for the papers on the current page) and ``pagination_url``, a
    callable that builds the URL for a given page number while preserving
    the current query parameters.
    """
    # Local import keeps this block self-contained; stdlib only.
    from urllib.parse import urlencode

    query = select(Paper)

    # Free-text search.
    if q.strip():
        query = query.where(
            Paper.title_en.ilike(f"%{q}%")
            | Paper.title_zh.ilike(f"%{q}%")
            | Paper.abstract.ilike(f"%{q}%")
        )

    # Date range.
    if date_from:
        query = query.where(Paper.paper_date >= date_from)
    if date_to:
        query = query.where(Paper.paper_date <= date_to)

    # Tag filter.
    if tag:
        query = query.join(PaperTag, PaperTag.paper_id == Paper.id).where(
            PaperTag.tag == tag
        )

    # Summary status filter.
    if summary_status != "all":
        if summary_status == "none":
            query = query.outerjoin(
                SummaryStatus, SummaryStatus.paper_id == Paper.id
            ).where(SummaryStatus.paper_id == None)  # noqa: E711
        else:
            query = query.join(
                SummaryStatus, SummaryStatus.paper_id == Paper.id
            ).where(SummaryStatus.status == summary_status)

    # Ordering.
    order = _SORT_MAP.get(sort, Paper.paper_date.desc())
    query = query.order_by(order)

    # Total row count for the pager.
    total = db.scalar(select(func.count()).select_from(query.subquery()))

    # Current page.
    papers = (
        db.execute(query.offset((page - 1) * per_page).limit(per_page))
        .scalars()
        .all()
    )

    # Summary status for each paper on this page, keyed by arXiv id.
    paper_ids = [p.id for p in papers]
    statuses = {}
    if paper_ids:
        rows = db.execute(
            select(SummaryStatus.paper_id, SummaryStatus.status).where(
                SummaryStatus.paper_id.in_(paper_ids)
            )
        ).all()
        paper_id_to_arxiv = {p.id: p.arxiv_id for p in papers}
        for pid, st in rows:
            statuses[paper_id_to_arxiv.get(pid, "")] = st

    def pagination_url(p: int) -> str:
        """Return the list URL for page *p*, keeping the current filters."""
        params = dict(request.query_params)
        params["page"] = str(p)
        # urlencode percent-escapes keys and values, so search terms with
        # spaces, '&', '=' or non-ASCII characters yield a valid link.
        return "/admin/papers?" + urlencode(params)

    return templates.TemplateResponse(
        request,
        "admin_papers.html",
        {
            "papers": papers,
            "paper_summary_statuses": statuses,
            "total": total or 0,
            "page": page,
            "per_page": per_page,
            "current_status": summary_status,
            "current_sort": sort,
            "pagination_url": pagination_url,
        },
    )
@router.post("/paper-delete/{arxiv_id}")
async def admin_paper_delete(
    arxiv_id: str,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Delete a single paper identified by its arXiv id.

    The paper row is deleted and committed first. Removing the matching row
    from ``papers_fts`` is then attempted as a separate, best-effort step:
    a failure there is logged and does not fail the request.

    Raises:
        HTTPException: 404 when no paper has the given ``arxiv_id``.
    """
    paper = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id))
    if not paper:
        raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")

    # Delete the paper; the original note says ORM cascade handles the
    # related tables (cascade configuration is not visible here).
    db.delete(paper)
    db.commit()

    # Clean up the FTS index (best effort).
    try:
        db.execute(text("DELETE FROM papers_fts WHERE arxiv_id = :aid"), {"aid": arxiv_id})
        db.commit()
    except Exception:
        # Discard the failed transaction so the session is left clean.
        db.rollback()
        logger.warning("Failed to clean FTS index for %s", arxiv_id, exc_info=True)

    return {"status": "success", "message": f"已删除 {arxiv_id}"}
class BatchActionRequest(BaseModel):
    """Request body for a batch operation over a list of papers."""

    # Operation to perform: "delete" or "summarize".
    action: str
    # arXiv ids of the papers the operation applies to.
    arxiv_ids: list[str]

    @field_validator("action")
    @classmethod
    def action_must_be_valid(cls, v: str) -> str:
        """Accept only the two supported actions; reject anything else."""
        supported = ("delete", "summarize")
        if v in supported:
            return v
        raise ValueError("action must be 'delete' or 'summarize'")
@router.post("/papers-batch-action")
async def admin_papers_batch_action(
    body: BatchActionRequest,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Apply a batch action (delete or summarize) to the given papers.

    * ``delete``: deletes every paper whose arXiv id is in the request, then
      makes a best-effort attempt to remove the matching ``papers_fts`` rows
      (a failure there is logged, not raised).
    * ``summarize``: deletes the ``SummaryStatus`` rows of the matching
      papers so that they can be picked up for summarisation again.

    Raises:
        HTTPException: 400 when ``arxiv_ids`` is empty.
    """
    # Local import: only needed for the expanding IN parameter below.
    from sqlalchemy import bindparam

    if not body.arxiv_ids:
        raise HTTPException(status_code=400, detail="arxiv_ids 不能为空")

    if body.action == "delete":
        papers = db.execute(
            select(Paper).where(Paper.arxiv_id.in_(body.arxiv_ids))
        ).scalars().all()
        for paper in papers:
            db.delete(paper)
        count = len(papers)
        db.commit()

        # Clean up the FTS index (best effort).
        try:
            # A textual "IN :ids" needs an *expanding* bind parameter so the
            # list is rendered as one placeholder per element; a plain tuple
            # parameter is not expanded and the statement fails.
            fts_delete = text(
                "DELETE FROM papers_fts WHERE arxiv_id IN :ids"
            ).bindparams(bindparam("ids", expanding=True))
            db.execute(fts_delete, {"ids": list(body.arxiv_ids)})
            db.commit()
        except Exception:
            # Discard the failed transaction so the session is left clean.
            db.rollback()
            logger.warning("Failed to clean FTS index for batch delete", exc_info=True)

        return {"status": "success", "message": f"已删除 {count} 篇论文", "count": count}

    elif body.action == "summarize":
        # Resolve the selected arXiv ids to primary keys.
        paper_ids = db.execute(
            select(Paper.id).where(Paper.arxiv_id.in_(body.arxiv_ids))
        ).scalars().all()

        if paper_ids:
            # Drop the old status rows so the papers re-enter the pipeline.
            db.execute(
                SummaryStatus.__table__.delete().where(
                    SummaryStatus.paper_id.in_(paper_ids)
                )
            )
            db.commit()

        return {
            "status": "success",
            "message": f"已将 {len(paper_ids)} 篇论文重置为待总结",
            "count": len(paper_ids),
        }
+12 -9
View File
@@ -2,11 +2,12 @@
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi import APIRouter, Depends, Query, Request
from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload
from app.database import get_db
from app.models import Paper
from app.models import PAPER_DEFAULT_LOAD, Paper
from app.utils import templates
router = APIRouter()
@@ -48,14 +49,16 @@ def compare_page(
)
papers = (
db.query(Paper)
.filter(Paper.arxiv_id.in_(arxiv_ids))
.options(
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary),
joinedload(Paper.summary_status),
db.execute(
select(Paper)
.where(Paper.arxiv_id.in_(arxiv_ids))
.options(
joinedload(Paper.summary),
*PAPER_DEFAULT_LOAD,
)
)
.unique()
.scalars()
.all()
)
+49 -60
View File
@@ -2,18 +2,20 @@
from __future__ import annotations
import json
import logging
import re
from datetime import date, timedelta
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.responses import RedirectResponse
from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload
from app.config import settings
from app.database import get_db
from app.models import Paper
from app.utils import templates, today_str
from app.models import PAPER_FULL_LOAD, Paper
from app.utils import PAPERS_DIR, safe_json_loads, templates, today_str, latest_paper_date
logger = logging.getLogger(__name__)
@@ -21,9 +23,9 @@ router = APIRouter()
@router.get("/")
def index(request: Request):
"""重定向到 /day/{today}"""
return RedirectResponse(url=f"/day/{today_str()}")
def index(request: Request, db: Session = Depends(get_db)):
"""重定向到最新有论文的日期页"""
return RedirectResponse(url=f"/day/{latest_paper_date(db)}")
@router.get("/day/{date_str}")
@@ -39,23 +41,24 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
today = today_str()
papers = (
db.query(Paper)
.filter(Paper.paper_date == date_str)
.options(
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary_status),
joinedload(Paper.bookmark),
db.execute(
select(Paper)
.where(Paper.paper_date == date_str)
.options(*PAPER_FULL_LOAD)
.order_by(Paper.upvotes.desc())
)
.order_by(Paper.upvotes.desc())
.scalars()
.unique()
.all()
)
dates_raw = (
db.query(Paper.paper_date)
.distinct()
.order_by(Paper.paper_date.desc())
.limit(30)
db.execute(
select(Paper.paper_date)
.distinct()
.order_by(Paper.paper_date.desc())
.limit(30)
)
.all()
)
available_dates = [
@@ -81,18 +84,17 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db)):
"""论文详情页。"""
paper = (
db.query(Paper)
.filter(Paper.arxiv_id == arxiv_id)
.options(
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary),
joinedload(Paper.summary_status),
joinedload(Paper.bookmark),
joinedload(Paper.reading_status),
joinedload(Paper.note),
db.execute(
select(Paper)
.where(Paper.arxiv_id == arxiv_id)
.options(
joinedload(Paper.summary),
joinedload(Paper.note),
*PAPER_FULL_LOAD,
)
)
.first()
.unique()
.scalar_one_or_none()
)
if not paper:
raise HTTPException(status_code=404, detail="Paper not found")
@@ -108,28 +110,15 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
images = _get_paper_images(arxiv_id)
# 预处理 JSON 字段供模板直接使用
import json as _json
prereqs = {}
if paper.summary and paper.summary.prerequisites_json:
try:
prereqs = _json.loads(paper.summary.prerequisites_json)
except (ValueError, TypeError):
pass
benchmarks = []
if paper.summary and paper.summary.results_benchmarks_json:
try:
benchmarks = _json.loads(paper.summary.results_benchmarks_json)
except (ValueError, TypeError):
pass
figures_raw = []
if paper.summary and paper.summary.figures_json:
try:
figures_raw = _json.loads(paper.summary.figures_json)
except (ValueError, TypeError):
pass
prereqs = safe_json_loads(
paper.summary.prerequisites_json if paper.summary else None, default={}
)
benchmarks = safe_json_loads(
paper.summary.results_benchmarks_json if paper.summary else None, default=[]
)
figures_raw = safe_json_loads(
paper.summary.figures_json if paper.summary else None, default=[]
)
linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
@@ -228,9 +217,12 @@ def _get_similar_papers(db: Session, arxiv_id: str, top_k: int = 6) -> list[dict
return []
papers = (
db.query(Paper)
.filter(Paper.arxiv_id.in_(list(papers_info.keys())))
.options(joinedload(Paper.tags))
db.execute(
select(Paper)
.where(Paper.arxiv_id.in_(list(papers_info.keys())))
.options(joinedload(Paper.tags))
)
.scalars()
.all()
)
@@ -260,7 +252,7 @@ def _get_similar_papers(db: Session, arxiv_id: str, top_k: int = 6) -> list[dict
def _get_paper_images(arxiv_id: str) -> list[dict]:
"""获取论文提取的图片列表。"""
images_dir = Path("data/papers") / arxiv_id / "images"
images_dir = PAPERS_DIR / arxiv_id / "images"
if not images_dir.exists():
return []
@@ -286,15 +278,12 @@ def _link_figures_with_images(
if not figures or not images:
return figures
import json as _json
import re
manifest_path = Path("data/papers") / arxiv_id / "images" / "manifest.json"
manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
if not manifest_path.exists():
return figures
try:
manifest = _json.loads(manifest_path.read_text(encoding="utf-8"))
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
except (ValueError, TypeError):
return figures
+7 -7
View File
@@ -7,12 +7,12 @@ from xml.sax.saxutils import escape
from fastapi import APIRouter, Depends, Query, Request
from fastapi.responses import Response
from sqlalchemy import text
from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload
from app.config import settings
from app.database import get_db
from app.models import Paper, PaperTag, UserReadingStatus
from app.models import Paper, PaperTag
from app.services.searcher import get_all_tags, search_papers
from app.services.user_data import query_reading_list
from app.utils import templates, today_str
@@ -144,9 +144,9 @@ def rss_feed(
"""RSS 2.0 Feed — 最近 7 天论文。"""
seven_days_ago = date.today() - timedelta(days=7)
query = (
db.query(Paper)
.filter(Paper.paper_date >= seven_days_ago)
stmt = (
select(Paper)
.where(Paper.paper_date >= seven_days_ago)
.options(
joinedload(Paper.authors),
joinedload(Paper.tags),
@@ -156,9 +156,9 @@ def rss_feed(
)
if tag:
query = query.filter(Paper.tags.any(PaperTag.tag == tag))
stmt = stmt.where(Paper.tags.any(PaperTag.tag == tag))
papers = query.all()
papers = db.execute(stmt).unique().scalars().all()
xml = _generate_rss_xml(papers, settings.BASE_URL, tag or None)
return Response(content=xml, media_type="application/xml")