222 lines
6.9 KiB
Python
222 lines
6.9 KiB
Python
"""管理接口 — 抓取、总结、清理、删除、日志,需要 ADMIN_TOKEN 鉴权。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date, datetime, timezone
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Query, Request
|
|
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
from pydantic import BaseModel, field_validator
|
|
from sqlalchemy import select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.database import get_db
|
|
from app.models import CrawlLog, DataDeleteJob, TaskLock
|
|
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
|
|
from app.services.crawler import crawl_daily
|
|
from app.services.summarizer import summarize_batch, summarize_single
|
|
from app.utils import release_lock, templates, today_str
|
|
|
|
# Every route declared in this module is served under /admin and tagged "admin".
router = APIRouter(tags=["admin"], prefix="/admin")

# Bearer-token extractor; verify_admin depends on it to read the Authorization header.
security = HTTPBearer()
|
|
|
|
|
|
async def verify_admin(
    credentials: HTTPAuthorizationCredentials = Depends(security),
) -> str:
    """Validate the presented bearer token against ``settings.ADMIN_TOKEN``.

    Args:
        credentials: Bearer credentials extracted by the ``security`` dependency.

    Returns:
        The presented token, when it matches the configured admin token.

    Raises:
        HTTPException: 401 when the token does not match, or when the
            configured admin token is not a string.
    """
    # Function-scope import keeps this block self-contained.
    import secrets

    expected = settings.ADMIN_TOKEN
    supplied = credentials.credentials

    # Constant-time comparison: a plain ``!=`` returns as soon as the first
    # differing character is found, which lets response timing leak how much
    # of a guessed token is correct. Bytes are compared because
    # compare_digest raises TypeError for non-ASCII ``str`` arguments.
    token_ok = isinstance(expected, str) and secrets.compare_digest(
        supplied.encode("utf-8"), expected.encode("utf-8")
    )
    if not token_ok:
        raise HTTPException(
            status_code=401,
            detail="Invalid admin token",
            # A 401 response is expected to name the authentication scheme.
            headers={"WWW-Authenticate": "Bearer"},
        )
    return supplied
|
|
|
|
|
|
# ── 请求模型 ──────────────────────────────────────────────────────────
|
|
|
|
|
|
class DeleteRequest(BaseModel):
    """Request body accepted by the date-range delete endpoint."""

    # Bounds of the date range; the endpoint rejects date_start > date_end.
    date_start: date
    date_end: date
    # Passed through to the delete service as ``include_notes``.
    include_notes: bool = True
    # Safety latch: must be exactly the string "DELETE".
    confirm: str

    @field_validator("confirm")
    @classmethod
    def confirm_must_be_delete(cls, v: str) -> str:
        """Accept only the literal ``"DELETE"``; any other value fails validation."""
        if v == "DELETE":
            return v
        raise ValueError("confirm must be 'DELETE' to proceed")
|
|
|
|
|
|
# ── 抓取 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/crawl")
async def admin_crawl(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
    date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
):
    """Manually crawl one day, defaulting to today.

    A ``TaskLock`` row keyed by the target date is inserted before crawling so
    that a second request for the same date is rejected while one is running.
    The lock is released when the crawl finishes, whether it succeeded or not.

    Args:
        _admin: Result of the admin-token check (unused beyond enforcing auth).
        db: Database session.
        date: Target day as ``YYYY-MM-DD``; ``today_str()`` is used when omitted
            or empty.

    Returns:
        Whatever ``crawl_daily`` returns for the target date.

    Raises:
        HTTPException: 422 if ``date`` is not a valid ``YYYY-MM-DD`` date,
            409 if inserting the lock row violates an integrity constraint
            (a crawl for that date is already registered), 500 if the crawl
            itself fails with a non-HTTP error.
    """
    # Function-scope import: IntegrityError is not among the module-level imports.
    from sqlalchemy.exc import IntegrityError

    if date:
        # Validate and canonicalise user input so that variants such as
        # "2024-1-5" cannot produce a different lock key for the same day.
        # (``date`` the parameter shadows the datetime.date class here, hence
        # the use of datetime.strptime.)
        try:
            target_date = datetime.strptime(date, "%Y-%m-%d").date().isoformat()
        except ValueError as exc:
            raise HTTPException(
                status_code=422,
                detail=f"Invalid date {date!r}; expected YYYY-MM-DD",
            ) from exc
    else:
        target_date = today_str()

    # TaskLock row guards against re-entry for the same date.
    lock = TaskLock(
        task="crawl",
        lock_key=target_date,
        status="running",
        owner="admin_crawl",
        acquired_at=datetime.now(timezone.utc),
    )
    try:
        db.add(lock)
        db.commit()
    except IntegrityError as exc:
        # Only a constraint violation means "someone else holds the lock".
        db.rollback()
        raise HTTPException(
            status_code=409, detail=f"Crawl already running for {target_date}"
        ) from exc
    except Exception:
        # Any other database failure is not a conflict; leave the session
        # clean and let it surface as a server error instead of a bogus 409.
        db.rollback()
        raise

    try:
        return await crawl_daily(db, target_date)
    except HTTPException:
        # Preserve a deliberate HTTP status raised further down the stack.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        release_lock(db, lock)
|
|
|
|
|
|
# ── 总结 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/summarize")
async def admin_summarize_batch(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Run ``summarize_batch`` and return its result.

    Raises:
        HTTPException: 409 when the result's ``status`` is ``"conflict"``; the
            detail is the result's ``error`` value, or a default message.
    """
    outcome = await summarize_batch(db)
    if outcome.get("status") != "conflict":
        return outcome

    # The service reported a conflict: surface it as HTTP 409.
    message = outcome.get("error", "batch already running")
    raise HTTPException(status_code=409, detail=message)
|
|
|
|
|
|
@router.post("/summarize/{arxiv_id}")
async def admin_summarize_single(
    arxiv_id: str,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Summarize one paper, always passing ``force=True`` to the service.

    Raises:
        HTTPException: 404 when the service reports ``status == "not_found"``.
    """
    outcome = await summarize_single(db, arxiv_id, force=True)
    if outcome.get("status") != "not_found":
        return outcome
    raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
|
|
|
|
|
|
# ── 清理 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/cleanup")
async def admin_cleanup(
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Run ``cleanup_tmp`` and record the run as a ``CrawlLog`` row.

    A log row with status ``"running"`` is committed first; it is then updated
    to ``"success"`` (with the ``scanned`` / ``removed`` counts and any reported
    errors) or to ``"failed"`` (with the exception text).

    Returns:
        The dict returned by ``cleanup_tmp``.

    Raises:
        HTTPException: 500 when the cleanup, or recording its result, fails.
    """
    log_entry = CrawlLog(
        task="cleanup",
        status="running",
        started_at=datetime.now(timezone.utc),
    )
    db.add(log_entry)
    db.commit()

    try:
        result = cleanup_tmp()
        log_entry.status = "success"
        log_entry.completed_at = datetime.now(timezone.utc)
        log_entry.papers_found = result.get("scanned", 0)
        log_entry.papers_new = result.get("removed", 0)
        errors = result.get("errors")
        if errors:
            # Coerce to str so a non-string entry cannot turn a finished
            # cleanup into a reported failure; truncate to 2000 characters.
            log_entry.error = "; ".join(str(item) for item in errors)[:2000]
        db.commit()
        return result
    except Exception as exc:
        # If the commit above failed, the session must be rolled back before
        # it can be used again; rolling back is harmless otherwise.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)[:2000]
        log_entry.completed_at = datetime.now(timezone.utc)
        db.commit()
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
|
|
|
|
# ── 删除 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/delete")
async def admin_delete(
    body: DeleteRequest,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Delete papers in the requested date range.

    The request model already requires ``confirm == "DELETE"``; this handler
    additionally checks that the range is not reversed.

    Returns:
        Whatever ``delete_papers_by_date_range`` returns.

    Raises:
        HTTPException: 400 when ``date_start`` is later than ``date_end``.
    """
    start, end = body.date_start, body.date_end
    if end < start:
        raise HTTPException(status_code=400, detail="date_start must be <= date_end")

    return await delete_papers_by_date_range(
        db, start, end, include_notes=body.include_notes
    )
|
|
|
|
|
|
# ── 日志 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.get("/logs")
async def admin_logs(
    request: Request,
    _admin: str = Depends(verify_admin),
    db: Session = Depends(get_db),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Render one page of task logs (``CrawlLog`` and ``DataDeleteJob`` rows).

    Both tables are ordered by ``started_at`` descending and paginated with
    the same ``page`` / ``per_page`` values; the rows are passed to the
    ``admin_logs.html`` template.
    """
    skip = (page - 1) * per_page

    def _newest_rows(model):
        # Same query shape for either table: newest first, one page's worth.
        stmt = (
            select(model)
            .order_by(model.started_at.desc())
            .limit(per_page)
            .offset(skip)
        )
        return db.execute(stmt).scalars().all()

    context = {
        "crawl_logs": _newest_rows(CrawlLog),
        "delete_jobs": _newest_rows(DataDeleteJob),
        "page": page,
        "per_page": per_page,
    }
    return templates.TemplateResponse(request, "admin_logs.html", context)
|