"""管理接口 — 抓取、总结、清理、删除、日志,需要 ADMIN_TOKEN 鉴权。""" from __future__ import annotations from datetime import date, datetime, timezone from fastapi import APIRouter, Depends, HTTPException, Query, Request from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer from pydantic import BaseModel, field_validator from sqlalchemy import select from sqlalchemy.orm import Session from app.config import settings from app.database import get_db from app.models import CrawlLog, DataDeleteJob, TaskLock from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range from app.services.crawler import crawl_daily from app.services.summarizer import summarize_batch, summarize_single from app.utils import release_lock, templates, today_str router = APIRouter(prefix="/admin", tags=["admin"]) security = HTTPBearer() async def verify_admin( credentials: HTTPAuthorizationCredentials = Depends(security), ) -> str: """验证 ADMIN_TOKEN。""" if credentials.credentials != settings.ADMIN_TOKEN: raise HTTPException(status_code=401, detail="Invalid admin token") return credentials.credentials # ── 请求模型 ────────────────────────────────────────────────────────── class DeleteRequest(BaseModel): date_start: date date_end: date include_notes: bool = True confirm: str @field_validator("confirm") @classmethod def confirm_must_be_delete(cls, v: str) -> str: if v != "DELETE": raise ValueError("confirm must be 'DELETE' to proceed") return v # ── 抓取 ────────────────────────────────────────────────────────────── @router.post("/crawl") async def admin_crawl( _admin: str = Depends(verify_admin), db: Session = Depends(get_db), date: str | None = Query(None, description="YYYY-MM-DD,默认今天"), ): """手动抓取指定日期,默认今天。""" target_date = date or today_str() # TaskLock 防重入 now = datetime.now(timezone.utc) lock = TaskLock( task="crawl", lock_key=target_date, status="running", owner="admin_crawl", acquired_at=now, ) try: db.add(lock) db.commit() except Exception: db.rollback() raise HTTPException(status_code=409, detail=f"Crawl already running for {target_date}") try: result = await crawl_daily(db, target_date) return result except Exception as exc: raise HTTPException(status_code=500, detail=str(exc)) finally: release_lock(db, lock) # ── 总结 ────────────────────────────────────────────────────────────── @router.post("/summarize") async def admin_summarize_batch( _admin: str = Depends(verify_admin), db: Session = Depends(get_db), ): """批量总结所有 pending 论文。""" result = await summarize_batch(db) if result.get("status") == "conflict": raise HTTPException(status_code=409, detail=result.get("error", "batch already running")) return result @router.post("/summarize/{arxiv_id}") async def admin_summarize_single( arxiv_id: str, _admin: str = Depends(verify_admin), db: Session = Depends(get_db), ): """总结或重跑单篇论文。""" result = await summarize_single(db, arxiv_id, force=True) if result.get("status") == "not_found": raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}") return result # ── 清理 ────────────────────────────────────────────────────────────── @router.post("/cleanup") async def admin_cleanup( _admin: str = Depends(verify_admin), db: Session = Depends(get_db), ): """清理 data/tmp/ 中超过 24 小时的临时文件。""" now = datetime.now(timezone.utc) log_entry = CrawlLog( task="cleanup", status="running", started_at=now, ) db.add(log_entry) db.commit() try: result = cleanup_tmp() log_entry.status = "success" log_entry.completed_at = datetime.now(timezone.utc) log_entry.papers_found = result.get("scanned", 0) log_entry.papers_new = result.get("removed", 0) if result.get("errors"): log_entry.error = "; ".join(result["errors"])[:2000] db.commit() return result except Exception as exc: log_entry.status = "failed" log_entry.error = str(exc)[:2000] log_entry.completed_at = datetime.now(timezone.utc) db.commit() raise HTTPException(status_code=500, detail=str(exc)) # ── 删除 ────────────────────────────────────────────────────────────── @router.post("/delete") async def admin_delete( body: DeleteRequest, _admin: str = Depends(verify_admin), db: Session = Depends(get_db), ): """删除指定日期范围内的论文(需要 confirm='DELETE' 二次确认)。""" if body.date_start > body.date_end: raise HTTPException(status_code=400, detail="date_start must be <= date_end") result = await delete_papers_by_date_range( db, body.date_start, body.date_end, include_notes=body.include_notes, ) return result # ── 日志 ────────────────────────────────────────────────────────────── @router.get("/logs") async def admin_logs( request: Request, _admin: str = Depends(verify_admin), db: Session = Depends(get_db), page: int = Query(1, ge=1), per_page: int = Query(20, ge=1, le=100), ): """查看任务日志(CrawlLog + DataDeleteJob)。""" crawl_logs = ( db.execute( select(CrawlLog) .order_by(CrawlLog.started_at.desc()) .limit(per_page) .offset((page - 1) * per_page) ) .scalars() .all() ) delete_jobs = ( db.execute( select(DataDeleteJob) .order_by(DataDeleteJob.started_at.desc()) .limit(per_page) .offset((page - 1) * per_page) ) .scalars() .all() ) return templates.TemplateResponse( request, "admin_logs.html", { "crawl_logs": crawl_logs, "delete_jobs": delete_jobs, "page": page, "per_page": per_page, }, )