"""管理接口 — 抓取、总结、清理、删除、日志,需要 ADMIN_TOKEN 鉴权。""" from __future__ import annotations from datetime import date, datetime, timezone from fastapi import APIRouter, Depends, HTTPException, Query, Request from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer from fastapi.templating import Jinja2Templates from pydantic import BaseModel, field_validator from sqlalchemy import select from sqlalchemy.orm import Session from app.config import settings from app.database import get_db from app.models import CrawlLog, DataDeleteJob, TaskLock from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range from app.services.crawler import crawl_daily from app.services.summarizer import summarize_batch, summarize_single router = APIRouter(prefix="/admin", tags=["admin"]) security = HTTPBearer() templates = Jinja2Templates(directory="app/templates") async def verify_admin( credentials: HTTPAuthorizationCredentials = Depends(security), ) -> str: """验证 ADMIN_TOKEN。""" if credentials.credentials != settings.ADMIN_TOKEN: raise HTTPException(status_code=401, detail="Invalid admin token") return credentials.credentials # ── 请求模型 ────────────────────────────────────────────────────────────── class DeleteRequest(BaseModel): date_start: date date_end: date include_notes: bool = True confirm: str @field_validator("confirm") @classmethod def confirm_must_be_delete(cls, v: str) -> str: if v != "DELETE": raise ValueError("confirm must be 'DELETE' to proceed") return v # ── 抓取 ────────────────────────────────────────────────────────────────── @router.post("/crawl") async def admin_crawl( _admin: str = Depends(verify_admin), db: Session = Depends(get_db), date: str | None = Query(None, description="YYYY-MM-DD,默认今天"), ): """手动抓取指定日期,默认今天。""" # 计算 target_date from zoneinfo import ZoneInfo tz = ZoneInfo(settings.APP_TIMEZONE) today = datetime.now(tz).strftime("%Y-%m-%d") target_date = date or today # TaskLock 防重入 now = datetime.now(timezone.utc) lock = TaskLock( task="crawl", lock_key=target_date, status="running", owner="admin_crawl", acquired_at=now, ) try: db.add(lock) db.commit() except Exception: db.rollback() raise HTTPException(status_code=409, detail=f"Crawl already running for {target_date}") try: result = await crawl_daily(db, target_date) return result except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) finally: _release_lock(db, lock) # ── 总结 ────────────────────────────────────────────────────────────────── @router.post("/summarize") async def admin_summarize_batch( _admin: str = Depends(verify_admin), db: Session = Depends(get_db), ): """批量总结所有 pending 论文。""" result = await summarize_batch(db) if result.get("status") == "conflict": raise HTTPException(status_code=409, detail=result.get("error", "batch already running")) return result @router.post("/summarize/{arxiv_id}") async def admin_summarize_single( arxiv_id: str, _admin: str = Depends(verify_admin), db: Session = Depends(get_db), ): """总结或重跑单篇论文。""" result = await summarize_single(db, arxiv_id, force=True) if result.get("status") == "not_found": raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}") return result # ── 清理 ────────────────────────────────────────────────────────────────── @router.post("/cleanup") async def admin_cleanup( _admin: str = Depends(verify_admin), db: Session = Depends(get_db), ): """清理 data/tmp/ 中超过 24 小时的临时文件。""" now = datetime.now(timezone.utc) log_entry = CrawlLog( task="cleanup", status="running", started_at=now, ) db.add(log_entry) db.commit() try: result = cleanup_tmp() log_entry.status = "success" log_entry.completed_at = datetime.now(timezone.utc) log_entry.papers_found = result.get("scanned", 0) log_entry.papers_new = result.get("removed", 0) if result.get("errors"): log_entry.error = "; ".join(result["errors"])[:2000] db.commit() return result except Exception as exc: log_entry.status = "failed" log_entry.error = str(exc)[:2000] log_entry.completed_at = datetime.now(timezone.utc) db.commit() raise HTTPException(status_code=500, detail=str(exc)) # ── 删除 ────────────────────────────────────────────────────────────────── @router.post("/delete") async def admin_delete( body: DeleteRequest, _admin: str = Depends(verify_admin), db: Session = Depends(get_db), ): """删除指定日期范围内的论文(需要 confirm='DELETE' 二次确认)。""" if body.date_start > body.date_end: raise HTTPException(status_code=400, detail="date_start must be <= date_end") result = await delete_papers_by_date_range( db, body.date_start, body.date_end, include_notes=body.include_notes, ) return result # ── 日志 ────────────────────────────────────────────────────────────────── @router.get("/logs") async def admin_logs( request: Request, _admin: str = Depends(verify_admin), db: Session = Depends(get_db), page: int = Query(1, ge=1), per_page: int = Query(20, ge=1, le=100), ): """查看任务日志(CrawlLog + DataDeleteJob)。""" # 查询 crawl_logs crawl_logs = ( db.execute( select(CrawlLog) .order_by(CrawlLog.started_at.desc()) .limit(per_page) .offset((page - 1) * per_page) ) .scalars() .all() ) # 查询 delete_jobs delete_jobs = ( db.execute( select(DataDeleteJob) .order_by(DataDeleteJob.started_at.desc()) .limit(per_page) .offset((page - 1) * per_page) ) .scalars() .all() ) return templates.TemplateResponse( request, "admin_logs.html", { "crawl_logs": crawl_logs, "delete_jobs": delete_jobs, "page": page, "per_page": per_page, }, ) # ── 工具函数 ────────────────────────────────────────────────────────────── def _release_lock(db: Session, lock: TaskLock) -> None: """释放 TaskLock。""" try: lock.status = "finished" lock.released_at = datetime.now(timezone.utc) db.commit() except Exception: db.rollback()