Files
daily-paper/app/routes/admin.py
T
Rain-Bus 2cfd1a8a9f feat: add admin crawl, cleanup, delete, logs endpoints with scheduler and tests
- Add POST /admin/crawl with TaskLock-based reentrancy guard
- Add POST /admin/cleanup (tmp files older than 24h) with CrawlLog
- Add POST /admin/delete with date range and 'DELETE' confirm token
- Add GET /admin/logs (paginated CrawlLog + DataDeleteJob viewer)
- Add app/services/cleaner.py (cleanup_tmp, delete_papers_by_date_range)
- Add app/services/scheduler.py (APScheduler daily crawl/cleanup jobs)
- Wire scheduler startup/shutdown hooks in app/main.py
- Add admin nav link in base.html and APP_HOST security warning
- Add apscheduler>=3.10 dependency
- Add tests/test_admin_phase4.py covering the new endpoints
2026-06-05 23:07:45 +08:00

239 lines
7.6 KiB
Python

"""管理接口 — 抓取、总结、清理、删除、日志,需要 ADMIN_TOKEN 鉴权。"""
from __future__ import annotations
from datetime import date, datetime, timezone
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel, field_validator
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import settings
from app.database import get_db
from app.models import CrawlLog, DataDeleteJob, TaskLock
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch, summarize_single
# Router that groups every admin endpoint in this module under the /admin prefix.
router = APIRouter(prefix="/admin", tags=["admin"])
# Bearer-token scheme; verify_admin receives the credentials it extracts.
security = HTTPBearer()
# Template loader rooted at app/templates; admin_logs renders admin_logs.html with it.
templates = Jinja2Templates(directory="app/templates")
async def verify_admin(
    credentials: HTTPAuthorizationCredentials = Depends(security),
) -> str:
    """Authenticate an admin request against ``settings.ADMIN_TOKEN``.

    Args:
        credentials: Bearer credentials extracted by the ``security`` scheme.

    Returns:
        The presented token, once it has been accepted.

    Raises:
        HTTPException: 401 when the presented token does not match, or when
            no admin token is configured at all.
    """
    import secrets

    # Compare as bytes: compare_digest rejects non-ASCII str operands, and a
    # constant-time comparison does not leak how much of the token matched.
    presented = credentials.credentials.encode("utf-8")
    expected = (settings.ADMIN_TOKEN or "").encode("utf-8")

    # An unset or empty ADMIN_TOKEN must never authenticate anybody.
    if not expected or not secrets.compare_digest(presented, expected):
        raise HTTPException(status_code=401, detail="Invalid admin token")
    return credentials.credentials
# ── 请求模型 ──────────────────────────────────────────────────────────────
class DeleteRequest(BaseModel):
    # Request body for POST /admin/delete.
    # date_start / date_end: bounds of the date range handed to the delete service.
    # include_notes: forwarded to the delete service as-is; defaults to True.
    # confirm: safety token; validation fails unless it is exactly "DELETE".
    date_start: date
    date_end: date
    include_notes: bool = True
    confirm: str

    @field_validator("confirm")
    @classmethod
    def confirm_must_be_delete(cls, v: str) -> str:
        """Accept only the literal token ``DELETE``; reject anything else."""
        if v == "DELETE":
            return v
        raise ValueError("confirm must be 'DELETE' to proceed")
# ── 抓取 ──────────────────────────────────────────────────────────────────
@router.post("/crawl")
async def admin_crawl(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
    date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
):
    """Manually crawl one day's papers, defaulting to today.

    Args:
        _admin: Result of the admin-token check; only present to enforce auth.
        db: Database session used for the lock row and passed to the crawl.
        date: Target day as ``YYYY-MM-DD``. When omitted or empty, today in
            ``settings.APP_TIMEZONE`` is used. Inside this function the name
            shadows the module-level ``datetime.date`` import.

    Returns:
        Whatever ``crawl_daily`` returns for the target day.

    Raises:
        HTTPException: 400 if ``date`` is not a valid ``YYYY-MM-DD`` day,
            409 if inserting the crawl lock for that day violates a database
            constraint, 500 if the crawl itself fails.
    """
    from zoneinfo import ZoneInfo

    from sqlalchemy.exc import IntegrityError

    if date:
        try:
            # Round-trip through a real date so garbage never becomes a lock
            # key and equivalent spellings ("2026-6-5") share one key.
            target_date = datetime.strptime(date, "%Y-%m-%d").date().isoformat()
        except ValueError as exc:
            raise HTTPException(
                status_code=400,
                detail="date must be formatted as YYYY-MM-DD",
            ) from exc
    else:
        tz = ZoneInfo(settings.APP_TIMEZONE)
        target_date = datetime.now(tz).strftime("%Y-%m-%d")

    # Reentrancy guard: inserting the lock row is the acquisition step.
    # NOTE(review): presumably TaskLock enforces uniqueness over
    # (task, lock_key) — confirm. _release_lock only flips the status to
    # "finished", so also verify a finished row does not block later crawls.
    lock = TaskLock(
        task="crawl",
        lock_key=target_date,
        status="running",
        owner="admin_crawl",
        acquired_at=datetime.now(timezone.utc),
    )
    db.add(lock)
    try:
        db.commit()
    except IntegrityError as exc:
        # Only a constraint violation means "someone else holds the lock".
        db.rollback()
        raise HTTPException(
            status_code=409,
            detail=f"Crawl already running for {target_date}",
        ) from exc
    except Exception:
        # Any other database failure is not a conflict; do not mislabel it.
        db.rollback()
        raise

    try:
        return await crawl_daily(db, target_date)
    except Exception as exc:
        # Clear whatever half-finished transaction the crawl left behind;
        # otherwise the commit inside _release_lock can fail and the lock
        # would stay marked as "running".
        db.rollback()
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        _release_lock(db, lock)
# ── 总结 ──────────────────────────────────────────────────────────────────
@router.post("/summarize")
async def admin_summarize_batch(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Summarize all pending papers in one batch run.

    Returns the result of ``summarize_batch``. A result whose ``status`` is
    ``"conflict"`` is surfaced as HTTP 409 instead.
    """
    outcome = await summarize_batch(db)
    if outcome.get("status") != "conflict":
        return outcome
    raise HTTPException(
        status_code=409,
        detail=outcome.get("error", "batch already running"),
    )
@router.post("/summarize/{arxiv_id}")
async def admin_summarize_single(
    arxiv_id: str,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Summarize a single paper, or re-run its summary.

    The summarizer is always called with ``force=True``. A result whose
    ``status`` is ``"not_found"`` is surfaced as HTTP 404.
    """
    outcome = await summarize_single(db, arxiv_id, force=True)
    paper_missing = outcome.get("status") == "not_found"
    if paper_missing:
        raise HTTPException(
            status_code=404,
            detail=f"Paper not found: {arxiv_id}",
        )
    return outcome
# ── 清理 ──────────────────────────────────────────────────────────────────
@router.post("/cleanup")
async def admin_cleanup(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Clean up temporary files in data/tmp/ older than 24 hours.

    The actual removal is delegated to ``cleanup_tmp``; this endpoint records
    the run as a ``CrawlLog`` row with ``task="cleanup"``.

    Args:
        _admin: Result of the admin-token check; only present to enforce auth.
        db: Database session used to persist the log row.

    Returns:
        The result mapping produced by ``cleanup_tmp``.

    Raises:
        HTTPException: 500 when the cleanup or the bookkeeping commit fails.
    """
    log_entry = CrawlLog(
        task="cleanup",
        status="running",
        started_at=datetime.now(timezone.utc),
    )
    db.add(log_entry)
    db.commit()

    try:
        result = cleanup_tmp()
        log_entry.status = "success"
        log_entry.completed_at = datetime.now(timezone.utc)
        # The paper counters are reused for cleanup statistics:
        # scanned -> papers_found, removed -> papers_new.
        log_entry.papers_found = result.get("scanned", 0)
        log_entry.papers_new = result.get("removed", 0)
        if result.get("errors"):
            log_entry.error = "; ".join(result["errors"])[:2000]
        db.commit()
        return result
    except Exception as exc:
        # Discard the failed or half-applied transaction first. Without this,
        # a failure of the success-path commit leaves the session unusable
        # and the commit below would raise again, masking the real error and
        # leaving the log row stuck in "running".
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)[:2000]
        log_entry.completed_at = datetime.now(timezone.utc)
        db.commit()
        raise HTTPException(status_code=500, detail=str(exc)) from exc
# ── 删除 ──────────────────────────────────────────────────────────────────
@router.post("/delete")
async def admin_delete(
    body: DeleteRequest,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Delete the papers inside a date range.

    The request body must carry ``confirm='DELETE'`` as a second
    confirmation; that check is enforced by ``DeleteRequest`` itself. An
    inverted range (start after end) is rejected with HTTP 400.
    """
    range_is_inverted = body.date_end < body.date_start
    if range_is_inverted:
        raise HTTPException(status_code=400, detail="date_start must be <= date_end")

    return await delete_papers_by_date_range(
        db,
        body.date_start,
        body.date_end,
        include_notes=body.include_notes,
    )
# ── 日志 ──────────────────────────────────────────────────────────────────
@router.get("/logs")
async def admin_logs(
    request: Request,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Render one page of task logs (``CrawlLog`` and ``DataDeleteJob`` rows).

    Both tables are paged with the same ``page`` / ``per_page`` window and
    ordered by ``started_at`` descending, then handed to the
    ``admin_logs.html`` template.
    """
    skip = (page - 1) * per_page

    # Crawl log rows for the requested page.
    crawl_stmt = (
        select(CrawlLog)
        .order_by(CrawlLog.started_at.desc())
        .limit(per_page)
        .offset(skip)
    )
    crawl_logs = db.execute(crawl_stmt).scalars().all()

    # Delete job rows for the same page window.
    delete_stmt = (
        select(DataDeleteJob)
        .order_by(DataDeleteJob.started_at.desc())
        .limit(per_page)
        .offset(skip)
    )
    delete_jobs = db.execute(delete_stmt).scalars().all()

    context = {
        "crawl_logs": crawl_logs,
        "delete_jobs": delete_jobs,
        "page": page,
        "per_page": per_page,
    }
    return templates.TemplateResponse(request, "admin_logs.html", context)
# ── 工具函数 ──────────────────────────────────────────────────────────────
def _release_lock(db: Session, lock: TaskLock) -> None:
    """Mark ``lock`` as finished and persist it, on a best-effort basis.

    The session may arrive with a failed transaction still pending (for
    example when the guarded task raised during a flush). In that state the
    first commit fails, so after rolling back the release is retried once on
    a clean transaction. Commit failures are never propagated to the caller;
    if the retry fails as well, the error is logged so a stuck lock can be
    diagnosed instead of disappearing silently.

    Args:
        db: Session that owns ``lock``.
        lock: The lock row to release.
    """
    import logging

    for attempt in (1, 2):
        try:
            # Re-applied on every attempt: a rollback discards the changes.
            lock.status = "finished"
            lock.released_at = datetime.now(timezone.utc)
            db.commit()
            return
        except Exception:
            db.rollback()
            if attempt == 2:
                logging.getLogger(__name__).exception(
                    "Failed to release task lock; it may remain marked as running"
                )