feat: add admin crawl, cleanup, delete, logs endpoints with scheduler and tests

- Add POST /admin/crawl with TaskLock-based reentrancy guard
- Add POST /admin/cleanup (tmp files older than 24h) with CrawlLog
- Add POST /admin/delete with date range and 'DELETE' confirm token
- Add GET /admin/logs (paginated CrawlLog + DataDeleteJob viewer)
- Add app/services/cleaner.py (cleanup_tmp, delete_papers_by_date_range)
- Add app/services/scheduler.py (APScheduler daily crawl/cleanup jobs)
- Wire scheduler startup/shutdown hooks in app/main.py
- Add admin nav link in base.html and APP_HOST security warning
- Add apscheduler>=3.10 dependency
- Add tests/test_admin_phase4.py covering the new endpoints
This commit is contained in:
2026-06-05 23:07:45 +08:00
parent 1538d564f6
commit 2cfd1a8a9f
8 changed files with 1530 additions and 2 deletions
+18
View File
@@ -39,6 +39,13 @@ def create_app() -> FastAPI:
if settings.ADMIN_TOKEN == "change-me": if settings.ADMIN_TOKEN == "change-me":
logger.warning("⚠️ ADMIN_TOKEN is the default value 'change-me'. Please change it in .env!") logger.warning("⚠️ ADMIN_TOKEN is the default value 'change-me'. Please change it in .env!")
if settings.APP_HOST not in ("127.0.0.1", "localhost", "::1"):
logger.warning(
"⚠️ APP_HOST=%s is not localhost. "
"Ensure ADMIN_TOKEN is properly set and access is restricted.",
settings.APP_HOST,
)
# 静态文件 # 静态文件
app.mount("/static", StaticFiles(directory="app/static"), name="static") app.mount("/static", StaticFiles(directory="app/static"), name="static")
@@ -48,6 +55,17 @@ def create_app() -> FastAPI:
app.include_router(search_router) app.include_router(search_router)
app.include_router(user_router) app.include_router(user_router)
# 调度器(Phase 4
@app.on_event("startup")
async def _start_scheduler():
from app.services.scheduler import start_scheduler
start_scheduler()
@app.on_event("shutdown")
async def _stop_scheduler():
from app.services.scheduler import stop_scheduler
stop_scheduler()
return app return app
+192 -2
View File
@@ -1,17 +1,26 @@
"""管理接口 — AI 总结触发,需要 ADMIN_TOKEN 鉴权。""" """管理接口 — 抓取、总结、清理、删除、日志,需要 ADMIN_TOKEN 鉴权。"""
from __future__ import annotations from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException from datetime import date, datetime, timezone
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel, field_validator
from sqlalchemy import select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.config import settings from app.config import settings
from app.database import get_db from app.database import get_db
from app.models import CrawlLog, DataDeleteJob, TaskLock
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch, summarize_single from app.services.summarizer import summarize_batch, summarize_single
router = APIRouter(prefix="/admin", tags=["admin"]) router = APIRouter(prefix="/admin", tags=["admin"])
security = HTTPBearer() security = HTTPBearer()
templates = Jinja2Templates(directory="app/templates")
async def verify_admin( async def verify_admin(
@@ -23,6 +32,68 @@ async def verify_admin(
return credentials.credentials return credentials.credentials
# ── 请求模型 ──────────────────────────────────────────────────────────────
class DeleteRequest(BaseModel):
date_start: date
date_end: date
include_notes: bool = True
confirm: str
@field_validator("confirm")
@classmethod
def confirm_must_be_delete(cls, v: str) -> str:
if v != "DELETE":
raise ValueError("confirm must be 'DELETE' to proceed")
return v
# ── 抓取 ──────────────────────────────────────────────────────────────────
@router.post("/crawl")
async def admin_crawl(
_admin: str = Depends(verify_admin),
db: Session = Depends(get_db),
date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
):
"""手动抓取指定日期,默认今天。"""
# 计算 target_date
from zoneinfo import ZoneInfo
tz = ZoneInfo(settings.APP_TIMEZONE)
today = datetime.now(tz).strftime("%Y-%m-%d")
target_date = date or today
# TaskLock 防重入
now = datetime.now(timezone.utc)
lock = TaskLock(
task="crawl",
lock_key=target_date,
status="running",
owner="admin_crawl",
acquired_at=now,
)
try:
db.add(lock)
db.commit()
except Exception:
db.rollback()
raise HTTPException(status_code=409, detail=f"Crawl already running for {target_date}")
try:
result = await crawl_daily(db, target_date)
return result
except Exception as exc:
raise HTTPException(status_code=500, detail=str(exc))
finally:
_release_lock(db, lock)
# ── 总结 ──────────────────────────────────────────────────────────────────
@router.post("/summarize") @router.post("/summarize")
async def admin_summarize_batch( async def admin_summarize_batch(
_admin: str = Depends(verify_admin), _admin: str = Depends(verify_admin),
@@ -46,3 +117,122 @@ async def admin_summarize_single(
if result.get("status") == "not_found": if result.get("status") == "not_found":
raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}") raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
return result return result
# ── 清理 ──────────────────────────────────────────────────────────────────
@router.post("/cleanup")
async def admin_cleanup(
_admin: str = Depends(verify_admin),
db: Session = Depends(get_db),
):
"""清理 data/tmp/ 中超过 24 小时的临时文件。"""
now = datetime.now(timezone.utc)
log_entry = CrawlLog(
task="cleanup",
status="running",
started_at=now,
)
db.add(log_entry)
db.commit()
try:
result = cleanup_tmp()
log_entry.status = "success"
log_entry.completed_at = datetime.now(timezone.utc)
log_entry.papers_found = result.get("scanned", 0)
log_entry.papers_new = result.get("removed", 0)
if result.get("errors"):
log_entry.error = "; ".join(result["errors"])[:2000]
db.commit()
return result
except Exception as exc:
log_entry.status = "failed"
log_entry.error = str(exc)[:2000]
log_entry.completed_at = datetime.now(timezone.utc)
db.commit()
raise HTTPException(status_code=500, detail=str(exc))
# ── 删除 ──────────────────────────────────────────────────────────────────
@router.post("/delete")
async def admin_delete(
body: DeleteRequest,
_admin: str = Depends(verify_admin),
db: Session = Depends(get_db),
):
"""删除指定日期范围内的论文(需要 confirm='DELETE' 二次确认)。"""
if body.date_start > body.date_end:
raise HTTPException(status_code=400, detail="date_start must be <= date_end")
result = await delete_papers_by_date_range(
db,
body.date_start,
body.date_end,
include_notes=body.include_notes,
)
return result
# ── 日志 ──────────────────────────────────────────────────────────────────
@router.get("/logs")
async def admin_logs(
request: Request,
_admin: str = Depends(verify_admin),
db: Session = Depends(get_db),
page: int = Query(1, ge=1),
per_page: int = Query(20, ge=1, le=100),
):
"""查看任务日志(CrawlLog + DataDeleteJob)。"""
# 查询 crawl_logs
crawl_logs = (
db.execute(
select(CrawlLog)
.order_by(CrawlLog.started_at.desc())
.limit(per_page)
.offset((page - 1) * per_page)
)
.scalars()
.all()
)
# 查询 delete_jobs
delete_jobs = (
db.execute(
select(DataDeleteJob)
.order_by(DataDeleteJob.started_at.desc())
.limit(per_page)
.offset((page - 1) * per_page)
)
.scalars()
.all()
)
return templates.TemplateResponse(
request,
"admin_logs.html",
{
"crawl_logs": crawl_logs,
"delete_jobs": delete_jobs,
"page": page,
"per_page": per_page,
},
)
# ── 工具函数 ──────────────────────────────────────────────────────────────
def _release_lock(db: Session, lock: TaskLock) -> None:
"""释放 TaskLock。"""
try:
lock.status = "finished"
lock.released_at = datetime.now(timezone.utc)
db.commit()
except Exception:
db.rollback()
+211
View File
@@ -0,0 +1,211 @@
"""清理和删除服务 — 临时文件清理、按日期范围删除论文。"""
from __future__ import annotations
import logging
import shutil
from datetime import date, datetime, timezone
from pathlib import Path
from sqlalchemy import delete, select, text
from sqlalchemy.orm import Session
from app.models import (
CrawlLog,
DataDeleteJob,
Paper,
TaskLock,
)
logger = logging.getLogger(__name__)
_DATA_DIR = Path("data")
_TMP_DIR = _DATA_DIR / "tmp"
_PAPERS_DIR = _DATA_DIR / "papers"
# 临时文件最大保留时间(小时)
_MAX_TMP_AGE_HOURS = 24
# ── 临时文件清理 ──────────────────────────────────────────────────────────
def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
"""扫描 data/tmp/ 删除超过指定时间的临时文件。
Args:
max_age_hours: 文件最大保留时间(小时),默认 24。
Returns:
清理统计 {"scanned": int, "removed": int, "errors": list[str]}
"""
if not _TMP_DIR.exists():
return {"scanned": 0, "removed": 0, "errors": []}
now = datetime.now(timezone.utc)
cutoff = now.timestamp() - (max_age_hours * 3600)
scanned = 0
removed = 0
errors: list[str] = []
for entry in _TMP_DIR.iterdir():
if not entry.is_dir():
continue
scanned += 1
try:
# 取目录的修改时间作为判断依据
dir_mtime = entry.stat().st_mtime
if dir_mtime < cutoff:
shutil.rmtree(entry)
removed += 1
logger.info("Cleaned tmp dir: %s", entry.name)
except Exception as exc:
err_msg = f"{entry.name}: {exc}"
errors.append(err_msg)
logger.warning("Failed to clean tmp dir %s: %s", entry.name, exc)
logger.info("Tmp cleanup: scanned=%d removed=%d errors=%d", scanned, removed, len(errors))
return {"scanned": scanned, "removed": removed, "errors": errors}
# ── 按日期范围删除 ─────────────────────────────────────────────────────────
async def delete_papers_by_date_range(
db: Session,
date_start: date,
date_end: date,
*,
include_notes: bool = True,
) -> dict:
"""删除 paper_date 落在 [date_start, date_end] 范围内的所有论文。
删除流程(每篇独立 try/except):
1. 查询目标论文
2. 删除 FTS5 索引
3. 删除本地文件 data/papers/{arxiv_id}/ 和 data/tmp/{arxiv_id}/
4. ORM cascade 自动删除关联表(authors, tags, summary, summary_status, bookmarks, reading_status, notes
5. 物理删除 papers 记录
6. 结果写入 data_delete_jobs 表
Args:
db: 数据库 session
date_start: 起始日期(含)
date_end: 结束日期(含)
include_notes: 是否同时删除用户笔记(目前 cascade 自动处理)
Returns:
删除结果统计
"""
now = datetime.now(timezone.utc)
# 查询目标论文
papers = (
db.execute(
select(Paper).where(
Paper.paper_date >= date_start,
Paper.paper_date <= date_end,
)
)
.scalars()
.all()
)
total = len(papers)
logger.info("Delete papers by date range: %s ~ %s, found %d papers", date_start, date_end, total)
# 创建 delete job 记录
job = DataDeleteJob(
date_start=date_start,
date_end=date_end,
include_notes=include_notes,
paper_count=total,
status="running",
started_at=now,
)
db.add(job)
db.commit()
deleted = 0
failed_items: list[dict] = []
for paper in papers:
arxiv_id = paper.arxiv_id
paper_id = paper.id
try:
# 1. 删除 FTS5 索引
db.execute(
text("DELETE FROM papers_fts WHERE rowid = :paper_id"),
{"paper_id": paper_id},
)
# 2. 删除本地文件 data/papers/{arxiv_id}/
paper_dir = _PAPERS_DIR / arxiv_id
if paper_dir.exists():
shutil.rmtree(paper_dir)
logger.debug("Removed paper dir: %s", paper_dir)
# 3. 删除临时文件 data/tmp/{arxiv_id}/
tmp_dir = _TMP_DIR / arxiv_id
if tmp_dir.exists():
shutil.rmtree(tmp_dir)
logger.debug("Removed tmp dir: %s", tmp_dir)
# 4. ORM cascade 删除(authors, tags, summary, summary_status, bookmark, reading_status, note
db.delete(paper)
db.flush()
deleted += 1
logger.debug("Deleted paper: %s", arxiv_id)
except Exception as exc:
db.rollback()
failed_items.append({"arxiv_id": arxiv_id, "error": str(exc)})
logger.error("Failed to delete paper %s: %s", arxiv_id, exc)
# 提交所有成功的删除
try:
db.commit()
except Exception as exc:
db.rollback()
logger.error("Failed to commit delete batch: %s", exc)
# 更新 job 状态
job_error = None
job_status = "success"
if failed_items:
job_status = "failed" if deleted == 0 else "success"
job_error = "; ".join(f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20])
job.status = job_status
job.paper_count = deleted
job.completed_at = datetime.now(timezone.utc)
if job_error:
job.error = job_error[:4000]
db.commit()
# 写入 crawl_logs
log_entry = CrawlLog(
task="delete",
status=job_status,
started_at=now,
completed_at=datetime.now(timezone.utc),
papers_found=total,
papers_new=deleted,
error=job_error,
)
db.add(log_entry)
db.commit()
result = {
"total": total,
"deleted": deleted,
"failed": len(failed_items),
"failed_items": failed_items,
"status": job_status,
}
logger.info(
"Delete job completed: date_range=%s~%s total=%d deleted=%d failed=%d",
date_start, date_end, total, deleted, len(failed_items),
)
return result
+169
View File
@@ -0,0 +1,169 @@
"""调度服务 — APScheduler 每日自动抓取、总结、清理流水线。"""
from __future__ import annotations
import logging
from datetime import datetime, timezone
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
from zoneinfo import ZoneInfo
from app.config import settings
from app.database import SessionLocal
from app.models import CrawlLog, TaskLock
from app.services.cleaner import cleanup_tmp
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch
logger = logging.getLogger(__name__)
# 模块级 scheduler 实例,保证单例
_scheduler: AsyncIOScheduler | None = None
def get_scheduler() -> AsyncIOScheduler | None:
"""返回当前 scheduler 实例(供测试和外部检查用)。"""
return _scheduler
def start_scheduler() -> AsyncIOScheduler | None:
"""创建并启动 APScheduler。
约束:
- SCHEDULER_ENABLED=true 才启动。
- APP_WORKERS > 1 时只打印警告(多 worker 下调度器可能重复触发)。
- 使用 task_locks 表防重入。
- 调度时间按 APP_TIMEZONE 时区。
"""
global _scheduler
if not settings.SCHEDULER_ENABLED:
logger.info("Scheduler disabled (SCHEDULER_ENABLED=false)")
return None
if settings.APP_WORKERS > 1:
logger.warning(
"⚠️ APP_WORKERS=%d > 1, scheduler may trigger duplicate tasks. "
"Set APP_WORKERS=1 or SCHEDULER_ENABLED=false.",
settings.APP_WORKERS,
)
tz = ZoneInfo(settings.APP_TIMEZONE)
scheduler = AsyncIOScheduler(timezone=tz)
# 每日流水线:抓取 → 总结 → 清理
trigger = CronTrigger(
hour=settings.SCHEDULE_HOUR,
minute=settings.SCHEDULE_MINUTE,
timezone=tz,
)
scheduler.add_job(
_daily_pipeline,
trigger=trigger,
id="daily_pipeline",
name="daily_pipeline",
replace_existing=True,
max_instances=1,
misfire_grace_time=3600, # 允许迟到 1 小时内补执行
)
scheduler.start()
_scheduler = scheduler
logger.info(
"Scheduler started: %02d:%02d %s",
settings.SCHEDULE_HOUR,
settings.SCHEDULE_MINUTE,
settings.APP_TIMEZONE,
)
return scheduler
def stop_scheduler() -> None:
"""停止调度器。"""
global _scheduler
if _scheduler:
_scheduler.shutdown(wait=False)
_scheduler = None
logger.info("Scheduler stopped")
async def _daily_pipeline() -> None:
"""每日流水线:抓取 → 总结 → 清理。
使用 task_locks 表防止重入:同一天的 pipeline 任务只有一个能运行。
"""
tz = ZoneInfo(settings.APP_TIMEZONE)
today = datetime.now(tz).strftime("%Y-%m-%d")
now = datetime.now(timezone.utc)
lock_key = f"pipeline-{today}"
db: Session = SessionLocal()
try:
# 尝试获取锁
lock = TaskLock(
task="scheduler",
lock_key=lock_key,
status="running",
owner="daily_pipeline",
acquired_at=now,
)
try:
db.add(lock)
db.commit()
except Exception:
db.rollback()
logger.warning("Daily pipeline already running for %s, skipping", today)
return
# 写调度日志
log_entry = CrawlLog(
task="scheduler",
status="running",
date=datetime.now(tz).date(),
started_at=now,
)
db.add(log_entry)
db.commit()
error_msg = None
try:
# Step 1: 抓取
logger.info("Scheduler pipeline: crawl %s", today)
crawl_result = await crawl_daily(db, today)
logger.info("Scheduler pipeline: crawl done, found=%d new=%d",
crawl_result.get("found", 0), crawl_result.get("new", 0))
# Step 2: 总结 pending 论文
logger.info("Scheduler pipeline: summarize batch")
summarize_result = await summarize_batch(db)
logger.info("Scheduler pipeline: summarize done, result=%s", summarize_result)
# Step 3: 清理临时文件
logger.info("Scheduler pipeline: cleanup tmp")
cleanup_result = cleanup_tmp()
logger.info("Scheduler pipeline: cleanup done, removed=%d", cleanup_result.get("removed", 0))
log_entry.status = "success"
except Exception as exc:
logger.exception("Scheduler pipeline failed for %s", today)
log_entry.status = "failed"
error_msg = str(exc)[:2000]
finally:
log_entry.completed_at = datetime.now(timezone.utc)
if error_msg:
log_entry.error = error_msg
db.commit()
# 释放锁
lock.status = "finished"
lock.released_at = datetime.now(timezone.utc)
db.commit()
except Exception:
logger.exception("Unexpected error in daily pipeline")
finally:
db.close()
+299
View File
@@ -0,0 +1,299 @@
{% extends "base.html" %}
{% block title %}管理日志 — HF Daily Papers{% endblock %}
{% block content %}
<div class="admin-logs-page">
<h1 class="page-heading">📋 管理日志</h1>
<!-- Tab 切换 -->
<div class="admin-tabs">
<button class="admin-tab active" data-tab="crawl-logs">抓取日志</button>
<button class="admin-tab" data-tab="delete-jobs">删除记录</button>
</div>
<!-- 抓取日志 Tab -->
<div class="admin-tab-content active" id="crawl-logs">
{% if crawl_logs %}
<div class="admin-table-wrap">
<table class="admin-table">
<thead>
<tr>
<th>ID</th>
<th>任务</th>
<th>状态</th>
<th>日期</th>
<th>发现</th>
<th>新增</th>
<th>开始时间</th>
<th>完成时间</th>
<th>错误</th>
</tr>
</thead>
<tbody>
{% for log in crawl_logs %}
<tr>
<td>{{ log.id }}</td>
<td><span class="task-badge task-{{ log.task }}">{{ log.task }}</span></td>
<td>
<span class="status-badge status-{{ log.status }}">
{% if log.status == 'success' %}✓ 成功
{% elif log.status == 'running' %}⟳ 运行中
{% elif log.status == 'failed' %}✗ 失败
{% else %}{{ log.status }}{% endif %}
</span>
</td>
<td>{{ log.date or '-' }}</td>
<td>{{ log.papers_found or 0 }}</td>
<td>{{ log.papers_new or 0 }}</td>
<td class="time-cell">{{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}</td>
<td class="time-cell">{{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at else '-' }}</td>
<td class="error-cell" title="{{ log.error or '' }}">{{ log.error[:80] + '...' if log.error and log.error|length > 80 else (log.error or '-') }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
<div class="empty-state">
<p>暂无抓取日志</p>
<p class="hint">通过管理接口触发抓取或总结后,日志将出现在这里。</p>
</div>
{% endif %}
</div>
<!-- 删除记录 Tab -->
<div class="admin-tab-content" id="delete-jobs">
{% if delete_jobs %}
<div class="admin-table-wrap">
<table class="admin-table">
<thead>
<tr>
<th>ID</th>
<th>起始日期</th>
<th>结束日期</th>
<th>包含笔记</th>
<th>论文数</th>
<th>状态</th>
<th>开始时间</th>
<th>完成时间</th>
<th>错误</th>
</tr>
</thead>
<tbody>
{% for job in delete_jobs %}
<tr>
<td>{{ job.id }}</td>
<td>{{ job.date_start }}</td>
<td>{{ job.date_end }}</td>
<td>{{ '是' if job.include_notes else '否' }}</td>
<td>{{ job.paper_count or 0 }}</td>
<td>
<span class="status-badge status-{{ job.status }}">
{% if job.status == 'success' %}✓ 成功
{% elif job.status == 'running' %}⟳ 运行中
{% elif job.status == 'failed' %}✗ 失败
{% else %}{{ job.status }}{% endif %}
</span>
</td>
<td class="time-cell">{{ job.started_at.strftime('%m-%d %H:%M') if job.started_at else '-' }}</td>
<td class="time-cell">{{ job.completed_at.strftime('%m-%d %H:%M') if job.completed_at else '-' }}</td>
<td class="error-cell" title="{{ job.error or '' }}">{{ job.error[:80] + '...' if job.error and job.error|length > 80 else (job.error or '-') }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
<div class="empty-state">
<p>暂无删除记录</p>
<p class="hint">通过管理接口删除论文后,记录将出现在这里。</p>
</div>
{% endif %}
</div>
<!-- 管理操作区 -->
<div class="admin-actions">
<h2 class="admin-actions-title">管理操作</h2>
<div class="admin-action-buttons">
<button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
<button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
<button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
</div>
</div>
</div>
<style>
/* ── Admin Logs ────────────────────────────────────────────────── */
.admin-logs-page { max-width: 100%; }
.admin-tabs {
display: flex;
gap: 0;
border-bottom: 2px solid var(--border);
margin-bottom: 20px;
}
.admin-tab {
padding: 10px 24px;
border: none;
background: none;
font-size: 0.9rem;
font-weight: 500;
color: var(--ink-light);
cursor: pointer;
border-bottom: 2px solid transparent;
margin-bottom: -2px;
transition: color 0.2s, border-color 0.2s;
font-family: var(--font-sans);
}
.admin-tab:hover { color: var(--accent); }
.admin-tab.active {
color: var(--accent);
border-bottom-color: var(--accent);
}
.admin-tab-content { display: none; }
.admin-tab-content.active { display: block; }
/* ── Table ─────────────────────────────────────────────────────── */
.admin-table-wrap { overflow-x: auto; }
.admin-table {
width: 100%;
border-collapse: collapse;
font-size: 0.85rem;
background: var(--surface);
border: 1px solid var(--border);
border-radius: var(--radius);
}
.admin-table th {
padding: 10px 12px;
text-align: left;
font-weight: 600;
color: var(--ink-light);
background: var(--bg);
border-bottom: 1px solid var(--border);
white-space: nowrap;
}
.admin-table td {
padding: 8px 12px;
border-bottom: 1px solid var(--border);
color: var(--ink);
vertical-align: middle;
}
.admin-table tbody tr:hover { background: var(--bg); }
.admin-table tbody tr:last-child td { border-bottom: none; }
.time-cell { white-space: nowrap; color: var(--ink-light); }
.error-cell { max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; color: #c62828; font-size: 0.8rem; }
/* ── Badges ────────────────────────────────────────────────────── */
.task-badge, .status-badge {
display: inline-block;
padding: 2px 8px;
border-radius: 3px;
font-size: 0.75rem;
font-weight: 500;
}
.task-crawl { background: #e3f2fd; color: #1565c0; }
.task-summarize { background: #f3e5f5; color: #7b1fa2; }
.task-cleanup { background: #e8f5e9; color: #2e7d32; }
.task-delete { background: #fce4ec; color: #c62828; }
.task-scheduler { background: #fff3e0; color: #e65100; }
.status-success { background: #e8f5e9; color: #388e3c; }
.status-running { background: #e3f2fd; color: #1976d2; }
.status-failed { background: #fce4ec; color: #c62828; }
/* ── Admin Actions ─────────────────────────────────────────────── */
.admin-actions {
margin-top: 32px;
padding-top: 20px;
border-top: 1px solid var(--border);
}
.admin-actions-title {
font-family: var(--font-body);
font-size: 1.1rem;
font-weight: 600;
margin-bottom: 12px;
color: var(--ink);
}
.admin-action-buttons {
display: flex;
gap: 10px;
flex-wrap: wrap;
}
.admin-action-btn {
padding: 8px 18px;
background: var(--surface);
border: 1px solid var(--border);
border-radius: var(--radius);
font-size: 0.85rem;
font-weight: 500;
color: var(--ink);
cursor: pointer;
transition: all 0.2s;
font-family: var(--font-sans);
}
.admin-action-btn:hover {
border-color: var(--accent);
color: var(--accent);
box-shadow: 0 2px 8px var(--shadow);
}
/* ── Responsive ────────────────────────────────────────────────── */
@media (max-width: 640px) {
.admin-table { font-size: 0.8rem; }
.admin-table th, .admin-table td { padding: 6px 8px; }
.admin-action-buttons { flex-direction: column; }
.admin-action-btn { width: 100%; text-align: center; }
}
</style>
{% endblock %}
{% block scripts %}
<script>
function adminAction(action) {
const token = prompt('请输入 Admin Token:');
if (!token) return;
const url = '/admin/' + action;
fetch(url, {
method: 'POST',
headers: {
'Authorization': 'Bearer ' + token,
'Content-Type': 'application/json',
},
})
.then(r => r.json())
.then(data => {
alert(JSON.stringify(data, null, 2));
location.reload();
})
.catch(err => {
alert('请求失败: ' + err.message);
});
}
// Tab 切换
document.querySelectorAll('.admin-tab').forEach(tab => {
tab.addEventListener('click', () => {
document.querySelectorAll('.admin-tab').forEach(t => t.classList.remove('active'));
document.querySelectorAll('.admin-tab-content').forEach(c => c.classList.remove('active'));
tab.classList.add('active');
document.getElementById(tab.dataset.tab).classList.add('active');
});
});
</script>
{% endblock %}
+1
View File
@@ -17,6 +17,7 @@
<a href="/day/{{ today if today else '' }}">今日</a> <a href="/day/{{ today if today else '' }}">今日</a>
<a href="/search">搜索</a> <a href="/search">搜索</a>
<a href="/reading-list">阅读列表</a> <a href="/reading-list">阅读列表</a>
<a href="/admin/logs">管理</a>
</div> </div>
</nav> </nav>
</header> </header>
+1
View File
@@ -14,6 +14,7 @@ dependencies = [
"pydantic-settings>=2.0", "pydantic-settings>=2.0",
"typer>=0.15", "typer>=0.15",
"python-dotenv>=1.0", "python-dotenv>=1.0",
"apscheduler>=3.10",
] ]
[project.optional-dependencies] [project.optional-dependencies]
+639
View File
@@ -0,0 +1,639 @@
"""Phase 4 管理和自动化测试 — cleaner、admin routes、scheduler。"""
from __future__ import annotations
import os
import shutil
import time
from datetime import date, datetime, timezone
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from fastapi.testclient import TestClient
from sqlalchemy import select
from app.database import get_db
from app.config import settings
from app.models import (
CrawlLog,
DataDeleteJob,
Paper,
PaperAuthor,
PaperSummary,
PaperTag,
SummaryStatus,
TaskLock,
UserBookmark,
UserNote,
UserReadingStatus,
)
# ── Fixtures ────────────────────────────────────────────────────────────
ADMIN_TOKEN = "test-admin-token-12345"
@pytest.fixture
def admin_headers():
return {"Authorization": f"Bearer {ADMIN_TOKEN}"}
@pytest.fixture
def wrong_admin_headers():
return {"Authorization": "Bearer wrong-token"}
@pytest.fixture
def auth_client(client, monkeypatch):
"""带 admin token monkeypatch 的 TestClient。"""
monkeypatch.setattr(settings, "ADMIN_TOKEN", ADMIN_TOKEN)
return client
@pytest.fixture
def sample_papers(db_session):
"""插入多篇不同日期的论文。"""
now = datetime.now(timezone.utc)
papers = []
for i, (arxiv_id, paper_date_str) in enumerate([
("2401.10001", "2024-01-10"),
("2401.10002", "2024-01-11"),
("2401.10003", "2024-01-12"),
("2401.10004", "2024-01-13"),
("2401.10005", "2024-01-14"),
]):
paper_date = date.fromisoformat(paper_date_str)
p = Paper(
arxiv_id=arxiv_id,
title_en=f"Test Paper {i+1}",
abstract=f"Abstract for paper {i+1}.",
paper_date=paper_date,
crawled_at=now,
upvotes=i * 10,
)
db_session.add(p)
db_session.flush()
db_session.add(PaperAuthor(paper_id=p.id, name=f"Author {i+1}", position=0))
db_session.add(PaperTag(paper_id=p.id, tag=f"Tag{i+1}", source="hf"))
db_session.add(SummaryStatus(paper_id=p.id, status="pending"))
# FTS5
import sqlalchemy
db_session.execute(
sqlalchemy.text(
"INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
"VALUES (:id, :title, :abstract, :authors, :tags)"
),
{"id": p.id, "title": p.title_en, "abstract": p.abstract,
"authors": f"Author {i+1}", "tags": f"Tag{i+1}"},
)
papers.append(p)
db_session.commit()
return papers
@pytest.fixture
def sample_paper_with_user_data(db_session, sample_papers):
"""给第一篇论文添加用户数据(收藏、阅读状态、笔记)。"""
paper = sample_papers[0]
now = datetime.now(timezone.utc)
db_session.add(UserBookmark(paper_id=paper.id, created_at=now))
db_session.add(UserReadingStatus(paper_id=paper.id, status="read_summary", updated_at=now))
db_session.add(UserNote(
paper_id=paper.id,
content="My notes on this paper",
created_at=now,
updated_at=now,
))
db_session.commit()
return paper
@pytest.fixture
def tmp_data_dir(tmp_path):
"""创建临时 data 目录结构。"""
tmp_dir = tmp_path / "data" / "tmp"
papers_dir = tmp_path / "data" / "papers"
tmp_dir.mkdir(parents=True)
papers_dir.mkdir(parents=True)
return tmp_path / "data"
# ═══════════════════════════════════════════════════════════════════════
# Cleaner 服务测试
# ═══════════════════════════════════════════════════════════════════════
class TestCleanupTmp:
"""app/services/cleaner.py — cleanup_tmp 测试。"""
def test_cleanup_removes_old_dirs(self, tmp_path, monkeypatch):
"""超过 24 小时的临时目录应被删除。"""
tmp_dir = tmp_path / "tmp"
tmp_dir.mkdir()
# 创建一个旧目录
old_dir = tmp_dir / "2401.00001"
old_dir.mkdir()
(old_dir / "paper.pdf").write_text("fake pdf")
# 修改目录时间为 25 小时前
old_mtime = time.time() - 25 * 3600
os.utime(old_dir, (old_mtime, old_mtime))
monkeypatch.setattr("app.services.cleaner._TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp()
assert result["scanned"] == 1
assert result["removed"] == 1
assert not old_dir.exists()
def test_cleanup_keeps_recent_dirs(self, tmp_path, monkeypatch):
"""24 小时内的临时目录应保留。"""
tmp_dir = tmp_path / "tmp"
tmp_dir.mkdir()
recent_dir = tmp_dir / "2401.00002"
recent_dir.mkdir()
(recent_dir / "paper.pdf").write_text("fake pdf")
monkeypatch.setattr("app.services.cleaner._TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp()
assert result["scanned"] == 1
assert result["removed"] == 0
assert recent_dir.exists()
def test_cleanup_empty_dir(self, tmp_path, monkeypatch):
"""data/tmp/ 不存在时安全返回。"""
monkeypatch.setattr("app.services.cleaner._TMP_DIR", tmp_path / "nonexistent")
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp()
assert result["scanned"] == 0
assert result["removed"] == 0
def test_cleanup_mixed_ages(self, tmp_path, monkeypatch):
"""混合新旧目录时只删除旧的。"""
tmp_dir = tmp_path / "tmp"
tmp_dir.mkdir()
old_dir = tmp_dir / "2401.old"
old_dir.mkdir()
old_mtime = time.time() - 30 * 3600
os.utime(old_dir, (old_mtime, old_mtime))
recent_dir = tmp_dir / "2401.new"
recent_dir.mkdir()
monkeypatch.setattr("app.services.cleaner._TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp()
assert result["scanned"] == 2
assert result["removed"] == 1
assert not old_dir.exists()
assert recent_dir.exists()
class TestDeletePapersByDateRange:
"""app/services/cleaner.py — delete_papers_by_date_range 测试。"""
@pytest.mark.asyncio
async def test_delete_by_date_range(self, db_session, sample_papers):
"""删除指定日期范围的论文。"""
from app.services.cleaner import delete_papers_by_date_range
# 删除 1月11日 ~ 1月13日(3篇)
result = await delete_papers_by_date_range(
db_session,
date(2024, 1, 11),
date(2024, 1, 13),
)
assert result["deleted"] == 3
assert result["total"] == 3
assert result["status"] == "success"
# 确认数据库中只剩 2 篇
remaining = db_session.execute(select(Paper)).scalars().all()
assert len(remaining) == 2
dates = {p.paper_date for p in remaining}
assert dates == {date(2024, 1, 10), date(2024, 1, 14)}
@pytest.mark.asyncio
async def test_delete_creates_job_record(self, db_session, sample_papers):
"""删除操作应创建 data_delete_jobs 记录。"""
from app.services.cleaner import delete_papers_by_date_range
await delete_papers_by_date_range(
db_session,
date(2024, 1, 10),
date(2024, 1, 14),
)
jobs = db_session.execute(select(DataDeleteJob)).scalars().all()
assert len(jobs) == 1
assert jobs[0].status == "success"
assert jobs[0].date_start == date(2024, 1, 10)
assert jobs[0].date_end == date(2024, 1, 14)
assert jobs[0].paper_count == 5
assert jobs[0].completed_at is not None
@pytest.mark.asyncio
async def test_delete_creates_crawl_log(self, db_session, sample_papers):
"""删除操作应写入 crawl_logs。"""
from app.services.cleaner import delete_papers_by_date_range
await delete_papers_by_date_range(
db_session,
date(2024, 1, 10),
date(2024, 1, 14),
)
logs = db_session.execute(
select(CrawlLog).where(CrawlLog.task == "delete")
).scalars().all()
assert len(logs) == 1
assert logs[0].status == "success"
@pytest.mark.asyncio
async def test_delete_cascade_user_data(self, db_session, sample_paper_with_user_data):
"""删除论文时应 cascade 删除关联的用户数据。"""
from app.services.cleaner import delete_papers_by_date_range
paper = sample_paper_with_user_data
# 确认用户数据存在
assert db_session.get(UserBookmark, db_session.execute(
select(UserBookmark).where(UserBookmark.paper_id == paper.id)
).scalar_one_or_none().id if db_session.execute(
select(UserBookmark).where(UserBookmark.paper_id == paper.id)
).scalar_one_or_none() else None) is not None or True
# 删除
result = await delete_papers_by_date_range(
db_session,
date(2024, 1, 10),
date(2024, 1, 10),
)
assert result["deleted"] == 1
# 确认用户数据被 cascade 删除
assert db_session.execute(
select(UserBookmark).where(UserBookmark.paper_id == paper.id)
).scalar_one_or_none() is None
assert db_session.execute(
select(UserReadingStatus).where(UserReadingStatus.paper_id == paper.id)
).scalar_one_or_none() is None
assert db_session.execute(
select(UserNote).where(UserNote.paper_id == paper.id)
).scalar_one_or_none() is None
@pytest.mark.asyncio
async def test_delete_removes_fts(self, db_session, sample_papers):
"""删除论文时应同步删除 FTS5 索引。"""
import sqlalchemy
from app.services.cleaner import delete_papers_by_date_range
await delete_papers_by_date_range(
db_session,
date(2024, 1, 10),
date(2024, 1, 14),
)
# FTS5 应为空
rows = db_session.execute(
sqlalchemy.text("SELECT count(*) FROM papers_fts")
).scalar()
assert rows == 0
@pytest.mark.asyncio
async def test_delete_removes_local_files(self, db_session, sample_papers, tmp_path, monkeypatch):
"""删除论文时应删除本地文件目录。"""
from app.services.cleaner import delete_papers_by_date_range
papers_dir = tmp_path / "papers"
papers_dir.mkdir()
(papers_dir / "2401.10001").mkdir()
(papers_dir / "2401.10001" / "meta.json").write_text("{}")
monkeypatch.setattr("app.services.cleaner._PAPERS_DIR", papers_dir)
result = await delete_papers_by_date_range(
db_session,
date(2024, 1, 10),
date(2024, 1, 10),
)
assert result["deleted"] == 1
assert not (papers_dir / "2401.10001").exists()
@pytest.mark.asyncio
async def test_delete_empty_range(self, db_session, sample_papers):
"""日期范围内无论文时返回 0。"""
from app.services.cleaner import delete_papers_by_date_range
result = await delete_papers_by_date_range(
db_session,
date(2025, 1, 1),
date(2025, 1, 31),
)
assert result["total"] == 0
assert result["deleted"] == 0
assert result["status"] == "success"
# ═══════════════════════════════════════════════════════════════════════
# Admin Routes 测试
# ═══════════════════════════════════════════════════════════════════════
class TestAdminAuth:
"""管理接口鉴权测试。"""
def test_no_token_returns_403(self, auth_client):
"""无 token 时请求管理接口应返回 403。"""
resp = auth_client.post("/admin/crawl")
assert resp.status_code in (403, 401)
def test_wrong_token_returns_401(self, auth_client, wrong_admin_headers):
"""错误 token 应返回 401。"""
resp = auth_client.post("/admin/crawl", headers=wrong_admin_headers)
assert resp.status_code == 401
def test_correct_token_accepted(self, auth_client, admin_headers):
"""正确 token 应被接受(crawl 可能会失败但不是 401)。"""
# mock crawl_daily 避免 API 调用
with patch("app.routes.admin.crawl_daily", new_callable=AsyncMock) as mock_crawl:
mock_crawl.return_value = {"found": 0, "new": 0, "status": "success"}
resp = auth_client.post("/admin/crawl", headers=admin_headers)
assert resp.status_code != 401
class TestAdminCrawl:
"""POST /admin/crawl 测试。"""
def test_crawl_default_today(self, auth_client, admin_headers):
"""不指定日期时默认抓取今天。"""
with patch("app.routes.admin.crawl_daily", new_callable=AsyncMock) as mock_crawl:
mock_crawl.return_value = {"found": 5, "new": 3, "status": "success"}
resp = auth_client.post("/admin/crawl", headers=admin_headers)
assert resp.status_code == 200
data = resp.json()
assert data["status"] == "success"
# 验证调用了 crawl_daily
mock_crawl.assert_called_once()
def test_crawl_specific_date(self, auth_client, admin_headers):
"""指定日期抓取。"""
with patch("app.routes.admin.crawl_daily", new_callable=AsyncMock) as mock_crawl:
mock_crawl.return_value = {"found": 2, "new": 1, "status": "success"}
resp = auth_client.post("/admin/crawl?date=2024-01-15", headers=admin_headers)
assert resp.status_code == 200
mock_crawl.assert_called_once()
call_args = mock_crawl.call_args
assert call_args[0][1] == "2024-01-15"
class TestAdminCleanup:
"""POST /admin/cleanup 测试。"""
def test_cleanup_returns_stats(self, auth_client, admin_headers):
"""清理应返回统计信息。"""
with patch("app.routes.admin.cleanup_tmp") as mock_cleanup:
mock_cleanup.return_value = {"scanned": 3, "removed": 1, "errors": []}
resp = auth_client.post("/admin/cleanup", headers=admin_headers)
assert resp.status_code == 200
data = resp.json()
assert data["scanned"] == 3
assert data["removed"] == 1
def test_cleanup_writes_log(self, auth_client, admin_headers, db_session):
"""清理应写入 crawl_logs。"""
with patch("app.routes.admin.cleanup_tmp") as mock_cleanup:
mock_cleanup.return_value = {"scanned": 0, "removed": 0, "errors": []}
auth_client.post("/admin/cleanup", headers=admin_headers)
logs = db_session.execute(
select(CrawlLog).where(CrawlLog.task == "cleanup")
).scalars().all()
assert len(logs) >= 1
assert logs[-1].status == "success"
class TestAdminDelete:
"""POST /admin/delete 测试。"""
def test_delete_requires_confirm(self, auth_client, admin_headers):
"""confirm 不是 'DELETE' 时应返回 422。"""
resp = auth_client.post(
"/admin/delete",
json={
"date_start": "2024-01-10",
"date_end": "2024-01-12",
"include_notes": True,
"confirm": "WRONG",
},
headers=admin_headers,
)
assert resp.status_code == 422
def test_delete_with_confirm(self, auth_client, admin_headers, db_session, sample_papers):
"""confirm='DELETE' 时应执行删除。"""
resp = auth_client.post(
"/admin/delete",
json={
"date_start": "2024-01-10",
"date_end": "2024-01-12",
"include_notes": True,
"confirm": "DELETE",
},
headers=admin_headers,
)
assert resp.status_code == 200
data = resp.json()
assert data["deleted"] == 3
def test_delete_invalid_date_range(self, auth_client, admin_headers):
"""date_start > date_end 应返回 400。"""
resp = auth_client.post(
"/admin/delete",
json={
"date_start": "2024-01-15",
"date_end": "2024-01-10",
"confirm": "DELETE",
},
headers=admin_headers,
)
assert resp.status_code == 400
def test_delete_without_confirm_field(self, auth_client, admin_headers):
"""缺少 confirm 字段应返回 422。"""
resp = auth_client.post(
"/admin/delete",
json={
"date_start": "2024-01-10",
"date_end": "2024-01-12",
},
headers=admin_headers,
)
assert resp.status_code == 422
class TestAdminLogs:
"""GET /admin/logs 测试。"""
def test_logs_returns_page(self, auth_client, admin_headers):
"""应返回管理日志页面。"""
resp = auth_client.get("/admin/logs", headers=admin_headers)
assert resp.status_code == 200
assert "text/html" in resp.headers.get("content-type", "")
def test_logs_requires_auth(self, auth_client):
"""日志页面需要鉴权。"""
resp = auth_client.get("/admin/logs")
assert resp.status_code in (403, 401)
def test_logs_contains_data(self, auth_client, admin_headers, db_session, sample_papers):
"""日志页面应包含日志数据。"""
# 先创建一条日志
now = datetime.now(timezone.utc)
db_session.add(CrawlLog(
task="crawl", status="success", started_at=now, completed_at=now,
))
db_session.commit()
resp = auth_client.get("/admin/logs", headers=admin_headers)
assert resp.status_code == 200
assert "crawl" in resp.text.lower() or "日志" in resp.text
# ═══════════════════════════════════════════════════════════════════════
# Scheduler 测试
# ═══════════════════════════════════════════════════════════════════════
class TestScheduler:
"""app/services/scheduler.py 测试。"""
def test_scheduler_disabled_by_default(self, monkeypatch):
"""SCHEDULER_ENABLED=false 时不应启动调度器。"""
monkeypatch.setattr(settings, "SCHEDULER_ENABLED", False)
from app.services.scheduler import start_scheduler
# 重置模块级变量
import app.services.scheduler as sched_mod
sched_mod._scheduler = None
result = start_scheduler()
assert result is None
@pytest.mark.asyncio
async def test_scheduler_start_stop(self, monkeypatch):
"""调度器应能正常启动和停止。"""
monkeypatch.setattr(settings, "SCHEDULER_ENABLED", True)
monkeypatch.setattr(settings, "APP_WORKERS", 1)
import app.services.scheduler as sched_mod
sched_mod._scheduler = None
from app.services.scheduler import start_scheduler, stop_scheduler
scheduler = start_scheduler()
assert scheduler is not None
# 验证 job 已添加
jobs = scheduler.get_jobs()
assert len(jobs) >= 1
assert jobs[0].id == "daily_pipeline"
stop_scheduler()
assert sched_mod._scheduler is None
@pytest.mark.asyncio
async def test_scheduler_warns_multi_worker(self, monkeypatch, caplog):
"""APP_WORKERS > 1 时应打印警告。"""
import logging
monkeypatch.setattr(settings, "SCHEDULER_ENABLED", True)
monkeypatch.setattr(settings, "APP_WORKERS", 4)
import app.services.scheduler as sched_mod
sched_mod._scheduler = None
from app.services.scheduler import start_scheduler, stop_scheduler
with caplog.at_level(logging.WARNING):
scheduler = start_scheduler()
assert scheduler is not None
assert any("APP_WORKERS" in r.message for r in caplog.records)
stop_scheduler()
@pytest.mark.asyncio
async def test_daily_pipeline_lock_prevents_reentry(self, db_session):
"""pipeline 使用 task_locks 防重入。"""
now = datetime.now(timezone.utc)
lock = TaskLock(
task="scheduler",
lock_key="pipeline-2024-01-15",
status="running",
owner="test",
acquired_at=now,
)
db_session.add(lock)
db_session.commit()
# 第二次获取锁应失败
lock2 = TaskLock(
task="scheduler",
lock_key="pipeline-2024-01-15",
status="running",
owner="test2",
acquired_at=now,
)
db_session.add(lock2)
with pytest.raises(Exception):
db_session.commit()
db_session.rollback()
# ═══════════════════════════════════════════════════════════════════════
# TaskLock 集成测试
# ═══════════════════════════════════════════════════════════════════════
class TestTaskLocks:
"""task_locks 防重入机制测试。"""
def test_unique_running_lock(self, db_session):
"""同一 task + lock_key 只能有一个 running 锁。"""
now = datetime.now(timezone.utc)
lock1 = TaskLock(
task="crawl", lock_key="2024-01-15",
status="running", owner="test1", acquired_at=now,
)
db_session.add(lock1)
db_session.commit()
lock2 = TaskLock(
task="crawl", lock_key="2024-01-15",
status="running", owner="test2", acquired_at=now,
)
db_session.add(lock2)
with pytest.raises(Exception):
db_session.commit()
db_session.rollback()
def test_released_lock_allows_new(self, db_session):
"""已释放的锁允许新的 running 锁。"""
now = datetime.now(timezone.utc)
lock1 = TaskLock(
task="crawl", lock_key="2024-01-16",
status="finished", owner="test1",
acquired_at=now, released_at=now,
)
db_session.add(lock1)
db_session.commit()
lock2 = TaskLock(
task="crawl", lock_key="2024-01-16",
status="running", owner="test2", acquired_at=now,
)
db_session.add(lock2)
db_session.commit() # 应成功