0d293422ac
- Replace image_extractor with pdf_image_extractor service - Enhance pi_client with expanded API capabilities - Improve summarizer service with additional features - Update admin routes with more endpoints - Add login page template - Enhance detail page with comprehensive layout - Improve search and trends pages - Update base template with additional elements - Refactor tests for better coverage - Add validate_summary script - Update project configuration and dependencies
272 lines
8.8 KiB
Python
272 lines
8.8 KiB
Python
"""管理接口 — 抓取、总结、清理、删除、日志,需要登录鉴权。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
from datetime import date, datetime, timezone
|
|
|
|
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
|
|
from fastapi.responses import RedirectResponse
|
|
from pydantic import BaseModel, field_validator
|
|
from sqlalchemy import select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.database import get_db
|
|
from app.models import CrawlLog, DataDeleteJob, TaskLock
|
|
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
|
|
from app.services.crawler import crawl_daily
|
|
from app.services.summarizer import summarize_batch, summarize_single
|
|
from app.utils import release_lock, templates, today_str
|
|
|
|
router = APIRouter(prefix="/admin", tags=["admin"])
|
|
|
|
|
|
# ── 认证 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _check_password(password: str) -> bool:
    """Check a submitted password against ``settings.ADMIN_PASSWORD``.

    The stored value may hold either the plaintext password or the hex
    SHA-256 digest of it; a match against either form is accepted. An empty
    or unset stored value rejects every attempt.

    Comparisons go through ``hmac.compare_digest`` so the time taken does not
    depend on how many leading characters of a guess are correct.

    Args:
        password: The password submitted by the client.

    Returns:
        True when the password matches the stored value, False otherwise.
    """
    import hmac  # local import keeps this block self-contained

    stored = settings.ADMIN_PASSWORD
    if not stored:
        return False

    # Compare as bytes: compare_digest rejects non-ASCII ``str`` arguments.
    stored_bytes = stored.encode()
    candidate = password.encode()
    candidate_digest = hashlib.sha256(candidate).hexdigest().encode()

    # Evaluate both forms unconditionally so the plaintext and hashed paths
    # take the same amount of work.
    plain_ok = hmac.compare_digest(candidate, stored_bytes)
    hashed_ok = hmac.compare_digest(candidate_digest, stored_bytes)
    return plain_ok or hashed_ok
|
|
|
|
|
|
async def verify_admin(request: Request) -> None:
    """Dependency guard that lets only a logged-in admin session through.

    Args:
        request: The incoming request; its session is inspected.

    Raises:
        HTTPException: 303 with a ``Location: /admin/login`` header when the
            session carries no truthy ``is_admin`` flag.
    """
    logged_in = request.session.get("is_admin")
    if logged_in:
        return
    raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
|
|
|
|
|
|
def verify_admin_page(request: Request) -> None:
    """Synchronous login guard intended for template (page) routes.

    Args:
        request: The incoming request; its session is inspected.

    Raises:
        HTTPException: 303 pointing at ``/admin/login`` when the session has
            no truthy ``is_admin`` flag.
    """
    session = request.session
    if session.get("is_admin"):
        return None
    login_redirect = {"Location": "/admin/login"}
    raise HTTPException(status_code=303, headers=login_redirect)
|
|
|
|
|
|
# ── 登录 / 登出 ──────────────────────────────────────────────────────
|
|
|
|
|
|
@router.get("/login")
async def admin_login_page(request: Request):
    """Render the login form, or skip it for an already logged-in admin.

    Returns:
        A 303 redirect to ``/admin/logs`` when the session is already marked
        as admin; otherwise the ``login.html`` template with no error set.
    """
    already_admin = request.session.get("is_admin")
    if not already_admin:
        return templates.TemplateResponse(request, "login.html", {"error": None})
    return RedirectResponse("/admin/logs", status_code=303)
|
|
|
|
|
|
@router.post("/login")
async def admin_login_submit(
    request: Request,
    username: str = Form(""),
    password: str = Form(""),
):
    """Handle a login form submission.

    Args:
        request: The incoming request; its session is updated on success.
        username: Submitted user name (empty string when the field is absent).
        password: Submitted password (empty string when the field is absent).

    Returns:
        A 303 redirect to ``/admin/logs`` after marking the session as admin,
        or the ``login.html`` template with an error message on bad
        credentials.
    """
    # The password is only checked once the user name already matches.
    credentials_ok = (
        username == settings.ADMIN_USERNAME and _check_password(password)
    )
    if not credentials_ok:
        context = {"error": "用户名或密码错误"}
        return templates.TemplateResponse(request, "login.html", context)

    request.session["is_admin"] = True
    return RedirectResponse("/admin/logs", status_code=303)
|
|
|
|
|
|
@router.post("/logout")
async def admin_logout(request: Request):
    """Log out by emptying the session, then send the client to the login page.

    Returns:
        A 303 redirect to ``/admin/login``.
    """
    session = request.session
    session.clear()
    login_redirect = RedirectResponse("/admin/login", status_code=303)
    return login_redirect
|
|
|
|
|
|
# ── 请求模型 ──────────────────────────────────────────────────────────
|
|
|
|
|
|
class DeleteRequest(BaseModel):
    # Request body for the delete endpoint.
    #   date_start / date_end: bounds of the date range to delete.
    #   include_notes: forwarded to the deletion service; defaults to True.
    #   confirm: must be exactly "DELETE" (enforced by the validator below).
    # Documented with comments rather than a class docstring so the generated
    # schema description is left unchanged.

    date_start: date
    date_end: date
    include_notes: bool = True
    confirm: str

    @field_validator("confirm")
    @classmethod
    def confirm_must_be_delete(cls, value: str) -> str:
        """Accept only the literal confirmation token ``DELETE``.

        Raises:
            ValueError: When ``value`` is anything other than ``"DELETE"``.
        """
        if value == "DELETE":
            return value
        raise ValueError("confirm must be 'DELETE' to proceed")
|
|
|
|
|
|
# ── 抓取 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/crawl")
async def admin_crawl(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
):
    """Manually crawl one day, defaulting to today.

    A ``TaskLock`` row keyed by the target date is inserted first to prevent
    concurrent crawls of the same day, and is released when the crawl ends,
    whether it succeeded or failed.

    Args:
        _admin: Login guard dependency; carries no value.
        db: Database session.
        date: Target date string; falls back to ``today_str()`` when omitted.

    Returns:
        Whatever ``crawl_daily`` returns for the target date.

    Raises:
        HTTPException: 409 when the lock row cannot be inserted because of an
            integrity violation (a crawl for that date is already running);
            500 when the crawl itself fails.
    """
    # Local import: sqlalchemy is already a dependency of this module, and
    # only this handler needs the exception class.
    from sqlalchemy.exc import IntegrityError

    target_date = date or today_str()

    # TaskLock re-entrancy guard.
    # NOTE(review): relies on a uniqueness constraint on TaskLock covering
    # (task, lock_key) - confirm against the model definition.
    lock = TaskLock(
        task="crawl",
        lock_key=target_date,
        status="running",
        owner="admin_crawl",
        acquired_at=datetime.now(timezone.utc),
    )
    try:
        db.add(lock)
        db.commit()
    except IntegrityError as exc:
        db.rollback()
        raise HTTPException(
            status_code=409, detail=f"Crawl already running for {target_date}"
        ) from exc
    except Exception:
        # Any other failure (connection loss, schema problem, ...) is not a
        # conflict; leave the session clean and let the real error surface.
        db.rollback()
        raise

    try:
        return await crawl_daily(db, target_date)
    except HTTPException:
        # Keep a deliberate HTTP error from the crawl intact.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        release_lock(db, lock)
|
|
|
|
|
|
# ── 总结 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/summarize")
async def admin_summarize_batch(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Run the batch summarizer (per the original description: all pending papers).

    Returns:
        The result mapping produced by ``summarize_batch``.

    Raises:
        HTTPException: 409 when the result reports ``status == "conflict"``;
            the detail is the result's ``error`` value, or a default message
            when that key is missing.
    """
    outcome = await summarize_batch(db)
    if outcome.get("status") != "conflict":
        return outcome

    conflict_detail = outcome.get("error", "batch already running")
    raise HTTPException(status_code=409, detail=conflict_detail)
|
|
|
|
|
|
@router.post("/summarize/{arxiv_id}")
async def admin_summarize_single(
    arxiv_id: str,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Summarize, or re-run the summary of, a single paper.

    Args:
        arxiv_id: Identifier of the paper, taken from the URL path.
        _admin: Login guard dependency; carries no value.
        db: Database session.

    Returns:
        The result mapping produced by ``summarize_single`` (called with
        ``force=True``).

    Raises:
        HTTPException: 404 when the result reports ``status == "not_found"``.
    """
    outcome = await summarize_single(db, arxiv_id, force=True)
    paper_missing = outcome.get("status") == "not_found"
    if paper_missing:
        raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
    return outcome
|
|
|
|
|
|
# ── 清理 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/cleanup")
async def admin_cleanup(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Run the temp-file cleanup and record the run as a ``CrawlLog`` row.

    Per the original description, ``cleanup_tmp`` is expected to remove files
    older than 24 hours under ``data/tmp/``; the threshold and path live in
    that helper, not here.

    A log row with status ``running`` is committed before the cleanup starts
    and then updated to ``success`` or ``failed``. The row's ``papers_found``
    and ``papers_new`` columns are reused for the ``scanned`` and ``removed``
    counts from the cleanup result.

    Args:
        _admin: Login guard dependency; carries no value.
        db: Database session.

    Returns:
        The result mapping produced by ``cleanup_tmp``.

    Raises:
        HTTPException: 500 when the cleanup or the success bookkeeping fails.
    """
    log_entry = CrawlLog(
        task="cleanup",
        status="running",
        started_at=datetime.now(timezone.utc),
    )
    db.add(log_entry)
    db.commit()

    try:
        result = cleanup_tmp()
        log_entry.status = "success"
        log_entry.completed_at = datetime.now(timezone.utc)
        log_entry.papers_found = result.get("scanned", 0)
        log_entry.papers_new = result.get("removed", 0)
        if result.get("errors"):
            # Stored error text is capped at 2000 characters.
            log_entry.error = "; ".join(result["errors"])[:2000]
        db.commit()
        return result
    except Exception as exc:
        # Reset the session first: if the failure came from the commit above,
        # the session cannot be used again until it has been rolled back, and
        # the failure record below would otherwise be lost.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)[:2000]
        log_entry.completed_at = datetime.now(timezone.utc)
        db.commit()
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
|
|
|
|
# ── 删除 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.post("/delete")
async def admin_delete(
    body: DeleteRequest,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Delete papers within a date range.

    The request body must carry ``confirm == "DELETE"``; that check is
    enforced by ``DeleteRequest`` before this handler runs.

    Args:
        body: Validated delete request.
        _admin: Login guard dependency; carries no value.
        db: Database session.

    Returns:
        The result of ``delete_papers_by_date_range``.

    Raises:
        HTTPException: 400 when ``date_start`` is later than ``date_end``.
    """
    first_day, last_day = body.date_start, body.date_end
    if first_day > last_day:
        raise HTTPException(status_code=400, detail="date_start must be <= date_end")

    return await delete_papers_by_date_range(
        db, first_day, last_day, include_notes=body.include_notes
    )
|
|
|
|
|
|
# ── 日志 ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@router.get("/logs")
async def admin_logs(
    request: Request,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Render the task-log page (``CrawlLog`` rows plus ``DataDeleteJob`` rows).

    Both tables are ordered by ``started_at`` descending and paginated with
    the same page number and page size.

    Args:
        request: The incoming request, passed through to the template.
        _admin: Login guard dependency; carries no value.
        db: Database session.
        page: 1-based page number.
        per_page: Rows per page for each table (1-100).

    Returns:
        The rendered ``admin_logs.html`` template.
    """
    skip = (page - 1) * per_page

    crawl_stmt = (
        select(CrawlLog)
        .order_by(CrawlLog.started_at.desc())
        .limit(per_page)
        .offset(skip)
    )
    crawl_logs = db.execute(crawl_stmt).scalars().all()

    delete_stmt = (
        select(DataDeleteJob)
        .order_by(DataDeleteJob.started_at.desc())
        .limit(per_page)
        .offset(skip)
    )
    delete_jobs = db.execute(delete_stmt).scalars().all()

    context = {
        "crawl_logs": crawl_logs,
        "delete_jobs": delete_jobs,
        "page": page,
        "per_page": per_page,
    }
    return templates.TemplateResponse(request, "admin_logs.html", context)
|