2cfd1a8a9f
- Add POST /admin/crawl with TaskLock-based reentrancy guard - Add POST /admin/cleanup (tmp files older than 24h) with CrawlLog - Add POST /admin/delete with date range and 'DELETE' confirm token - Add GET /admin/logs (paginated CrawlLog + DataDeleteJob viewer) - Add app/services/cleaner.py (cleanup_tmp, delete_papers_by_date_range) - Add app/services/scheduler.py (APScheduler daily crawl/cleanup jobs) - Wire scheduler startup/shutdown hooks in app/main.py - Add admin nav link in base.html and APP_HOST security warning - Add apscheduler>=3.10 dependency - Add tests/test_admin_phase4.py covering the new endpoints
239 lines
7.6 KiB
Python
239 lines
7.6 KiB
Python
"""管理接口 — 抓取、总结、清理、删除、日志,需要 ADMIN_TOKEN 鉴权。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date, datetime, timezone
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Query, Request
|
|
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
from fastapi.templating import Jinja2Templates
|
|
from pydantic import BaseModel, field_validator
|
|
from sqlalchemy import select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.database import get_db
|
|
from app.models import CrawlLog, DataDeleteJob, TaskLock
|
|
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
|
|
from app.services.crawler import crawl_daily
|
|
from app.services.summarizer import summarize_batch, summarize_single
|
|
|
|
# Every route declared in this module is registered under the /admin prefix.
router = APIRouter(tags=["admin"], prefix="/admin")

# Bearer-credentials extractor consumed by the admin authentication dependency.
security = HTTPBearer()

# Jinja2 template loader pointed at the application's template directory.
templates = Jinja2Templates(directory="app/templates")
|
|
|
|
|
|
async def verify_admin(
    credentials: HTTPAuthorizationCredentials = Depends(security),
) -> str:
    """Validate the presented bearer token against ``settings.ADMIN_TOKEN``.

    Args:
        credentials: Bearer credentials extracted by the ``security`` scheme.

    Returns:
        The presented token when it matches the configured admin token.

    Raises:
        HTTPException: 401 when the token does not match, or when no usable
            admin token is configured.
    """
    import secrets

    expected = settings.ADMIN_TOKEN
    supplied = credentials.credentials

    # An unset, empty, or non-string ADMIN_TOKEN must never authenticate anyone.
    configured = isinstance(expected, str) and bool(expected)

    # compare_digest runs in time independent of where the inputs differ, so
    # the comparison does not leak how much of a guessed token was correct.
    # Bytes are compared because the str form only accepts ASCII input.
    if not configured or not secrets.compare_digest(
        supplied.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid admin token")
    return supplied
|
|
|
|
|
|
# ── Request models ────────────────────────────────────────────────────────
|
|
|
|
|
|
class DeleteRequest(BaseModel):
    # Request body accepted by POST /admin/delete.
    #   date_start / date_end: bounds of the date range passed to the delete service.
    #   include_notes: forwarded to the delete service under the same name.
    #   confirm: safety token; validation fails unless it is exactly "DELETE".
    date_start: date
    date_end: date
    include_notes: bool = True
    confirm: str

    @field_validator("confirm")
    @classmethod
    def confirm_must_be_delete(cls, v: str) -> str:
        """Accept only the literal confirmation token ``"DELETE"``."""
        if v == "DELETE":
            return v
        raise ValueError("confirm must be 'DELETE' to proceed")
|
|
|
|
|
|
# ── Crawl ─────────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/crawl")
async def admin_crawl(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
    date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
):
    """Manually crawl a given date, defaulting to today.

    Args:
        _admin: Result of the admin-token check (unused beyond enforcing auth).
        db: Database session for the request.
        date: Target date as ``YYYY-MM-DD``. When omitted or empty, today's
            date in ``settings.APP_TIMEZONE`` is used.

    Returns:
        Whatever ``crawl_daily`` returns for the target date.

    Raises:
        HTTPException: 400 when ``date`` is not a valid ``YYYY-MM-DD`` date,
            409 when the lock row cannot be inserted because of an integrity
            violation (a crawl for that date is already recorded as running),
            500 when the crawl itself raises.
    """
    from zoneinfo import ZoneInfo

    from sqlalchemy.exc import IntegrityError

    # Resolve the target date. Caller-supplied values are parsed and
    # normalised so that malformed input never becomes a lock key or reaches
    # the crawler.
    if date:
        try:
            target_date = datetime.strptime(date, "%Y-%m-%d").date().isoformat()
        except ValueError as exc:
            raise HTTPException(
                status_code=400, detail="date must be in YYYY-MM-DD format"
            ) from exc
    else:
        tz = ZoneInfo(settings.APP_TIMEZONE)
        target_date = datetime.now(tz).strftime("%Y-%m-%d")

    # TaskLock reentrancy guard.
    # NOTE(review): this relies on a uniqueness constraint on TaskLock that is
    # defined outside this file — confirm against the model.
    lock = TaskLock(
        task="crawl",
        lock_key=target_date,
        status="running",
        owner="admin_crawl",
        acquired_at=datetime.now(timezone.utc),
    )
    db.add(lock)
    try:
        db.commit()
    except IntegrityError as exc:
        db.rollback()
        raise HTTPException(
            status_code=409, detail=f"Crawl already running for {target_date}"
        ) from exc
    except Exception:
        # Any other database failure is not a lock conflict; do not report it
        # as 409. Reset the session and let the error surface.
        db.rollback()
        raise

    try:
        return await crawl_daily(db, target_date)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        # Always release the lock, whether the crawl succeeded or failed.
        _release_lock(db, lock)
|
|
|
|
|
|
# ── Summarize ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/summarize")
async def admin_summarize_batch(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Batch-summarize all pending papers.

    Returns the result of ``summarize_batch``; responds with HTTP 409 when
    that result reports ``status == "conflict"``.
    """
    outcome = await summarize_batch(db)
    if outcome.get("status") != "conflict":
        return outcome

    conflict_detail = outcome.get("error", "batch already running")
    raise HTTPException(status_code=409, detail=conflict_detail)
|
|
|
|
|
|
@router.post("/summarize/{arxiv_id}")
async def admin_summarize_single(
    arxiv_id: str,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Summarize, or re-run the summary for, a single paper.

    Calls ``summarize_single`` with ``force=True``; responds with HTTP 404
    when the result reports ``status == "not_found"``.
    """
    outcome = await summarize_single(db, arxiv_id, force=True)
    paper_missing = outcome.get("status") == "not_found"
    if paper_missing:
        raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
    return outcome
|
|
|
|
|
|
# ── Cleanup ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/cleanup")
async def admin_cleanup(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Clean up temporary files in data/tmp/ that are older than 24 hours.

    A ``CrawlLog`` row with ``task="cleanup"`` is committed as "running"
    before the work starts and is then updated to "success" or "failed".

    Returns:
        The dict returned by ``cleanup_tmp``. Its ``scanned``, ``removed`` and
        ``errors`` entries are copied into the log row.

    Raises:
        HTTPException: 500 when the cleanup, or recording its result, fails.
    """
    import logging

    log_entry = CrawlLog(
        task="cleanup",
        status="running",
        started_at=datetime.now(timezone.utc),
    )
    db.add(log_entry)
    db.commit()

    try:
        result = cleanup_tmp()
        log_entry.status = "success"
        log_entry.completed_at = datetime.now(timezone.utc)
        # The generic CrawlLog counters are reused for scanned/removed counts.
        log_entry.papers_found = result.get("scanned", 0)
        log_entry.papers_new = result.get("removed", 0)
        errors = result.get("errors")
        if errors:
            # str() each item so a non-string error entry cannot break the join.
            log_entry.error = "; ".join(map(str, errors))[:2000]
        db.commit()
        return result
    except Exception as exc:
        # If the failure came from the commit above, the session must be
        # rolled back before it can be used again; otherwise the bookkeeping
        # below would raise and hide the original error.
        db.rollback()
        try:
            log_entry.status = "failed"
            log_entry.error = str(exc)[:2000]
            log_entry.completed_at = datetime.now(timezone.utc)
            db.commit()
        except Exception:
            # Recording the failure is best-effort; report the original error.
            db.rollback()
            logging.getLogger(__name__).exception(
                "Could not record cleanup failure in CrawlLog"
            )
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
|
|
|
|
# ── Delete ────────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/delete")
async def admin_delete(
    body: DeleteRequest,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Delete papers within a date range (requires confirm='DELETE' as a second confirmation).

    Responds with HTTP 400 when ``date_start`` is later than ``date_end``;
    otherwise returns the result of ``delete_papers_by_date_range``.
    """
    range_start, range_end = body.date_start, body.date_end
    if range_end < range_start:
        raise HTTPException(status_code=400, detail="date_start must be <= date_end")

    return await delete_papers_by_date_range(
        db, range_start, range_end, include_notes=body.include_notes
    )
|
|
|
|
|
|
# ── Logs ──────────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.get("/logs")
async def admin_logs(
    request: Request,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """View task logs (CrawlLog + DataDeleteJob).

    Both tables are read newest-first by ``started_at`` and share the same
    ``page`` / ``per_page`` window. The rows are rendered through the
    ``admin_logs.html`` template.
    """
    skip = (page - 1) * per_page

    # Build one page-sized query per table.
    crawl_stmt = (
        select(CrawlLog)
        .order_by(CrawlLog.started_at.desc())
        .limit(per_page)
        .offset(skip)
    )
    delete_stmt = (
        select(DataDeleteJob)
        .order_by(DataDeleteJob.started_at.desc())
        .limit(per_page)
        .offset(skip)
    )

    crawl_logs = db.execute(crawl_stmt).scalars().all()
    delete_jobs = db.execute(delete_stmt).scalars().all()

    context = {
        "crawl_logs": crawl_logs,
        "delete_jobs": delete_jobs,
        "page": page,
        "per_page": per_page,
    }
    return templates.TemplateResponse(request, "admin_logs.html", context)
|
|
|
|
|
|
# ── Helpers ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _release_lock(db: Session, lock: TaskLock) -> None:
    """Release a TaskLock by marking it finished and committing.

    The first commit can fail when the session was left in a failed state by
    the work the lock protected. In that case the session is rolled back and
    the update is attempted once more, so the lock does not stay marked as
    running. This function never raises for a failed commit; a second failure
    is logged.

    Args:
        db: Session the lock row belongs to.
        lock: Lock row to mark as finished.
    """
    import logging

    final_attempt = 1
    for attempt in range(final_attempt + 1):
        try:
            lock.status = "finished"
            lock.released_at = datetime.now(timezone.utc)
            db.commit()
        except Exception:
            # Reset the session so the retry starts from a usable state.
            db.rollback()
            if attempt == final_attempt:
                logging.getLogger(__name__).exception(
                    "Failed to release TaskLock; it may remain marked as running"
                )
        else:
            return
|