Files
daily-paper/app/routes/admin.py
T

222 lines
6.9 KiB
Python

"""管理接口 — 抓取、总结、清理、删除、日志,需要 ADMIN_TOKEN 鉴权。"""
from __future__ import annotations

import secrets
from datetime import date, datetime, timezone

from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from pydantic import BaseModel, field_validator
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session

from app.config import settings
from app.database import get_db
from app.models import CrawlLog, DataDeleteJob, TaskLock
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch, summarize_single
from app.utils import release_lock, templates, today_str
# Every route registered on this router is served under the /admin prefix
# and tagged "admin".
router = APIRouter(prefix="/admin", tags=["admin"])
# Bearer-token scheme; verify_admin depends on it to obtain the credentials
# that are checked against settings.ADMIN_TOKEN.
security = HTTPBearer()
async def verify_admin(
    credentials: HTTPAuthorizationCredentials = Depends(security),
) -> str:
    """Validate the presented bearer token against ``settings.ADMIN_TOKEN``.

    Args:
        credentials: Bearer credentials supplied by the ``security`` scheme.

    Returns:
        The presented token, once it has been accepted.

    Raises:
        HTTPException: 401 when the token does not match, or when no usable
            admin token is configured.
    """
    expected = settings.ADMIN_TOKEN
    presented = credentials.credentials

    # Fail closed: an unset, empty, or non-string ADMIN_TOKEN must never
    # authenticate anyone (the original `!=` check also rejected these).
    token_configured = isinstance(expected, str) and bool(expected)

    # Compare in constant time so response timing does not reveal how many
    # leading characters of a guess were right. Bytes are used because
    # compare_digest only accepts ASCII-only str values.
    if not token_configured or not secrets.compare_digest(
        presented.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid admin token")
    return presented
# ── 请求模型 ──────────────────────────────────────────────────────────
class DeleteRequest(BaseModel):
    """Request body for the delete endpoint.

    ``confirm`` must be the literal string ``DELETE``; any other value is
    rejected during validation.
    """

    # Bounds of the date range to delete.
    date_start: date
    date_end: date
    # Forwarded to the delete service as its ``include_notes`` argument.
    include_notes: bool = True
    # Explicit confirmation token checked by the validator below.
    confirm: str

    @field_validator("confirm")
    @classmethod
    def confirm_must_be_delete(cls, v: str) -> str:
        """Accept only the exact confirmation string ``DELETE``."""
        if v == "DELETE":
            return v
        raise ValueError("confirm must be 'DELETE' to proceed")
# ── 抓取 ──────────────────────────────────────────────────────────────
@router.post("/crawl")
async def admin_crawl(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
    date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
):
    """Manually crawl a single date, defaulting to today.

    A ``TaskLock`` row keyed by the target date is inserted before the crawl
    starts and released afterwards, so a second request for the same date is
    rejected while the first is still running.

    Args:
        _admin: Result of the admin-token check (unused beyond enforcing auth).
        db: Database session.
        date: Target date as ``YYYY-MM-DD``; ``today_str()`` is used when it is
            omitted or empty.

    Returns:
        Whatever ``crawl_daily`` returns.

    Raises:
        HTTPException: 400 for a malformed ``date``; 409 when the lock row
            cannot be inserted because of an integrity violation; 500 when the
            crawl itself fails.
    """
    # NOTE: the ``date`` parameter shadows the imported ``datetime.date`` class
    # inside this function, so parsing goes through ``datetime`` instead.
    if not date:
        target_date = today_str()
    else:
        try:
            # Normalise to zero-padded ISO form so "2024-1-5" and "2024-01-05"
            # map to the same lock key.
            target_date = datetime.strptime(date, "%Y-%m-%d").date().isoformat()
        except ValueError as exc:
            raise HTTPException(
                status_code=400, detail="date must be in YYYY-MM-DD format"
            ) from exc

    # Re-entrancy guard. Presumably TaskLock has a uniqueness constraint over
    # (task, lock_key) -- TODO confirm against the model definition.
    lock = TaskLock(
        task="crawl",
        lock_key=target_date,
        status="running",
        owner="admin_crawl",
        acquired_at=datetime.now(timezone.utc),
    )
    try:
        db.add(lock)
        db.commit()
    except IntegrityError as exc:
        # Only a constraint violation means "someone else holds the lock".
        db.rollback()
        raise HTTPException(
            status_code=409, detail=f"Crawl already running for {target_date}"
        ) from exc
    except Exception:
        # Any other database failure is not a conflict; leave the session
        # usable and let the error surface as-is.
        db.rollback()
        raise

    try:
        return await crawl_daily(db, target_date)
    except HTTPException:
        # Preserve a status code chosen deliberately further down the stack.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        release_lock(db, lock)
# ── 总结 ──────────────────────────────────────────────────────────────
@router.post("/summarize")
async def admin_summarize_batch(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Run a batch summarization over all pending papers.

    Returns the result of ``summarize_batch`` unchanged, unless it reports a
    ``"conflict"`` status, in which case a 409 is raised with the reported
    error (or a default message).
    """
    outcome = await summarize_batch(db)
    if outcome.get("status") != "conflict":
        return outcome

    message = outcome.get("error", "batch already running")
    raise HTTPException(status_code=409, detail=message)
@router.post("/summarize/{arxiv_id}")
async def admin_summarize_single(
    arxiv_id: str,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Summarize, or re-run the summary for, a single paper.

    ``summarize_single`` is always called with ``force=True``. A result whose
    status is ``"not_found"`` is turned into a 404; any other result is
    returned unchanged.
    """
    outcome = await summarize_single(db, arxiv_id, force=True)
    paper_missing = outcome.get("status") == "not_found"
    if not paper_missing:
        return outcome
    raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
# ── 清理 ──────────────────────────────────────────────────────────────
@router.post("/cleanup")
async def admin_cleanup(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Run ``cleanup_tmp`` and record the outcome as a ``CrawlLog`` row.

    Per the original note, the cleanup targets temporary files under
    ``data/tmp/`` that are older than 24 hours; that policy lives in
    ``cleanup_tmp`` itself.

    A ``CrawlLog`` with ``task="cleanup"`` is committed as ``"running"`` before
    the cleanup starts and is then updated to ``"success"`` or ``"failed"``.
    The ``scanned`` / ``removed`` counts from the result are stored in
    ``papers_found`` / ``papers_new``.

    Returns:
        The dict returned by ``cleanup_tmp``.

    Raises:
        HTTPException: 500 when the cleanup or its bookkeeping fails.
    """
    log_entry = CrawlLog(
        task="cleanup",
        status="running",
        started_at=datetime.now(timezone.utc),
    )
    db.add(log_entry)
    db.commit()

    try:
        result = cleanup_tmp()
        log_entry.status = "success"
        log_entry.completed_at = datetime.now(timezone.utc)
        log_entry.papers_found = result.get("scanned", 0)
        log_entry.papers_new = result.get("removed", 0)
        errors = result.get("errors")
        if errors:
            # Coerce each item so a non-string error entry cannot turn a
            # successful cleanup into a reported failure.
            log_entry.error = "; ".join(str(item) for item in errors)[:2000]
        db.commit()
        return result
    except Exception as exc:
        # If the failure came from the commit above, the session must be
        # rolled back before it can be used again; on a clean session this
        # is a no-op.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)[:2000]
        log_entry.completed_at = datetime.now(timezone.utc)
        db.commit()
        raise HTTPException(status_code=500, detail=str(exc)) from exc
# ── 删除 ──────────────────────────────────────────────────────────────
@router.post("/delete")
async def admin_delete(
    body: DeleteRequest,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Delete papers within a date range.

    The request body must carry ``confirm='DELETE'`` (enforced by
    ``DeleteRequest``). A range whose start is after its end is rejected with
    a 400; otherwise the result of ``delete_papers_by_date_range`` is returned.
    """
    start, end = body.date_start, body.date_end
    if start > end:
        raise HTTPException(status_code=400, detail="date_start must be <= date_end")

    return await delete_papers_by_date_range(
        db, start, end, include_notes=body.include_notes
    )
# ── 日志 ──────────────────────────────────────────────────────────────
@router.get("/logs")
async def admin_logs(
    request: Request,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Render one page of task logs (``CrawlLog`` and ``DataDeleteJob``).

    Both tables are paged independently with the same ``page`` / ``per_page``
    window, newest ``started_at`` first, and handed to the
    ``admin_logs.html`` template.
    """
    skip = (page - 1) * per_page

    def _page_of(model):
        # One page of rows for `model`, most recently started first.
        stmt = (
            select(model)
            .order_by(model.started_at.desc())
            .limit(per_page)
            .offset(skip)
        )
        return db.execute(stmt).scalars().all()

    context = {
        "crawl_logs": _page_of(CrawlLog),
        "delete_jobs": _page_of(DataDeleteJob),
        "page": page,
        "per_page": per_page,
    }
    return templates.TemplateResponse(request, "admin_logs.html", context)