feat: add admin crawl, cleanup, delete, logs endpoints with scheduler and tests

- Add POST /admin/crawl with TaskLock-based reentrancy guard
- Add POST /admin/cleanup (tmp files older than 24h) with CrawlLog
- Add POST /admin/delete with date range and 'DELETE' confirm token
- Add GET /admin/logs (paginated CrawlLog + DataDeleteJob viewer)
- Add app/services/cleaner.py (cleanup_tmp, delete_papers_by_date_range)
- Add app/services/scheduler.py (APScheduler daily crawl/cleanup jobs)
- Wire scheduler startup/shutdown hooks in app/main.py
- Add admin nav link in base.html and APP_HOST security warning
- Add apscheduler>=3.10 dependency
- Add tests/test_admin_phase4.py covering the new endpoints
This commit is contained in:
2026-06-05 23:07:45 +08:00
parent 1538d564f6
commit 2cfd1a8a9f
8 changed files with 1530 additions and 2 deletions
+211
View File
@@ -0,0 +1,211 @@
"""清理和删除服务 — 临时文件清理、按日期范围删除论文。"""
from __future__ import annotations
import logging
import shutil
from datetime import date, datetime, timezone
from pathlib import Path
from sqlalchemy import delete, select, text
from sqlalchemy.orm import Session
from app.models import (
CrawlLog,
DataDeleteJob,
Paper,
TaskLock,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Storage roots. These are relative paths, so they resolve against the
# process's current working directory.
_DATA_DIR = Path("data")
# Temporary working directories (swept by cleanup_tmp).
_TMP_DIR = _DATA_DIR / "tmp"
# Per-paper directories, keyed by arxiv_id.
_PAPERS_DIR = _DATA_DIR / "papers"
# Maximum time, in hours, that a temporary directory is kept before cleanup.
_MAX_TMP_AGE_HOURS = 24
# ── 临时文件清理 ──────────────────────────────────────────────────────────
def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
    """Remove stale sub-directories of ``data/tmp/``.

    Only directories sitting directly under the tmp root are considered;
    plain files are skipped and are not counted. A directory is removed when
    its own modification time is older than ``max_age_hours``.

    Args:
        max_age_hours: Maximum age, in hours, a tmp directory may reach before
            it is removed. Defaults to 24.

    Returns:
        Cleanup statistics: ``{"scanned": int, "removed": int,
        "errors": list[str]}``. All counters are zero when the tmp root does
        not exist.
    """
    if not _TMP_DIR.exists():
        return {"scanned": 0, "removed": 0, "errors": []}

    # Anything last modified before this POSIX timestamp counts as stale.
    threshold = datetime.now(timezone.utc).timestamp() - (max_age_hours * 3600)

    seen = 0
    purged = 0
    failures: list[str] = []

    for candidate in _TMP_DIR.iterdir():
        if not candidate.is_dir():
            continue
        seen += 1
        try:
            # The directory's own mtime decides whether it is stale.
            is_stale = candidate.stat().st_mtime < threshold
            if is_stale:
                shutil.rmtree(candidate)
                purged += 1
                logger.info("Cleaned tmp dir: %s", candidate.name)
        except Exception as exc:
            # Best effort: record the failure and carry on with the rest.
            failures.append(f"{candidate.name}: {exc}")
            logger.warning("Failed to clean tmp dir %s: %s", candidate.name, exc)

    logger.info("Tmp cleanup: scanned=%d removed=%d errors=%d", seen, purged, len(failures))
    return {"scanned": seen, "removed": purged, "errors": failures}
# ── 按日期范围删除 ─────────────────────────────────────────────────────────
async def delete_papers_by_date_range(
    db: Session,
    date_start: date,
    date_end: date,
    *,
    include_notes: bool = True,
) -> dict:
    """Delete every paper whose ``paper_date`` lies in ``[date_start, date_end]``.

    Each paper is processed in its own transaction, so a failure on one paper
    can neither undo nor be miscounted alongside deletions that already
    succeeded. Per paper:

    1. Remove its row from the ``papers_fts`` index.
    2. Delete the ``papers`` row through the ORM and flush. Removal of related
       rows (authors, tags, summary, summary_status, bookmarks,
       reading_status, notes) is left to the cascade configuration on
       ``Paper``, which is not defined in this module.
    3. Remove ``data/papers/{arxiv_id}/`` and ``data/tmp/{arxiv_id}/``.
    4. Commit.

    Database work is flushed before any file is touched so that a database
    error is detected while the on-disk data is still intact.

    The outcome is recorded in a ``DataDeleteJob`` row and a ``CrawlLog`` row
    (``task="delete"``).

    Note: the function is declared ``async`` but contains no ``await``; all
    database and filesystem work runs synchronously.

    Args:
        db: Database session.
        date_start: First date of the range (inclusive).
        date_end: Last date of the range (inclusive).
        include_notes: Recorded on the job row only; the deletion itself does
            not branch on it (notes are expected to go with the ORM cascade).

    Returns:
        ``{"total", "deleted", "failed", "failed_items", "status"}`` where
        ``status`` is ``"failed"`` only when at least one paper failed and
        none was deleted, and ``"success"`` otherwise (partial failures are
        reported through ``failed`` / ``failed_items``).
    """
    started_at = datetime.now(timezone.utc)

    # Look up the target papers.
    papers = (
        db.execute(
            select(Paper).where(
                Paper.paper_date >= date_start,
                Paper.paper_date <= date_end,
            )
        )
        .scalars()
        .all()
    )
    total = len(papers)
    logger.info("Delete papers by date range: %s ~ %s, found %d papers", date_start, date_end, total)

    # Snapshot the identifiers while the instances are freshly loaded: the
    # commits/rollbacks below may expire ORM attributes, and the identifiers
    # are still needed for error reporting outside the per-paper ``try``.
    targets = [(paper, paper.id, paper.arxiv_id) for paper in papers]

    # Create the delete-job record.
    job = DataDeleteJob(
        date_start=date_start,
        date_end=date_end,
        include_notes=include_notes,
        paper_count=total,
        status="running",
        started_at=started_at,
    )
    db.add(job)
    db.commit()

    deleted = 0
    failed_items: list[dict] = []
    for paper, paper_id, arxiv_id in targets:
        try:
            # 1. Drop the FTS5 index row.
            db.execute(
                text("DELETE FROM papers_fts WHERE rowid = :paper_id"),
                {"paper_id": paper_id},
            )

            # 2. ORM delete (with cascade); flush so database errors surface
            #    before any file is removed.
            db.delete(paper)
            db.flush()

            # 3. Remove the paper's files: data/papers/{arxiv_id}/
            paper_dir = _PAPERS_DIR / arxiv_id
            if paper_dir.exists():
                shutil.rmtree(paper_dir)
                logger.debug("Removed paper dir: %s", paper_dir)

            #    ...and its temporary files: data/tmp/{arxiv_id}/
            tmp_dir = _TMP_DIR / arxiv_id
            if tmp_dir.exists():
                shutil.rmtree(tmp_dir)
                logger.debug("Removed tmp dir: %s", tmp_dir)

            # 4. Commit this paper on its own, so that a later rollback only
            #    ever affects the paper that actually failed.
            db.commit()
        except Exception as exc:
            db.rollback()
            failed_items.append({"arxiv_id": arxiv_id, "error": str(exc)})
            logger.error("Failed to delete paper %s: %s", arxiv_id, exc)
        else:
            # Counted only once the deletion is durably committed.
            deleted += 1
            logger.debug("Deleted paper: %s", arxiv_id)

    # Derive the job status.
    job_error = None
    job_status = "success"
    if failed_items:
        job_status = "failed" if deleted == 0 else "success"
        # Keep the stored message bounded: at most the first 20 failures.
        job_error = "; ".join(f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20])

    job.status = job_status
    job.paper_count = deleted
    job.completed_at = datetime.now(timezone.utc)
    if job_error:
        job.error = job_error[:4000]
    db.commit()

    # Mirror the outcome into crawl_logs.
    log_entry = CrawlLog(
        task="delete",
        status=job_status,
        started_at=started_at,
        completed_at=datetime.now(timezone.utc),
        papers_found=total,
        papers_new=deleted,
        error=job_error,
    )
    db.add(log_entry)
    db.commit()

    result = {
        "total": total,
        "deleted": deleted,
        "failed": len(failed_items),
        "failed_items": failed_items,
        "status": job_status,
    }
    logger.info(
        "Delete job completed: date_range=%s~%s total=%d deleted=%d failed=%d",
        date_start, date_end, total, deleted, len(failed_items),
    )
    return result
+169
View File
@@ -0,0 +1,169 @@
"""调度服务 — APScheduler 每日自动抓取、总结、清理流水线。"""
from __future__ import annotations
import logging
from datetime import datetime, timezone
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
from zoneinfo import ZoneInfo
from app.config import settings
from app.database import SessionLocal
from app.models import CrawlLog, TaskLock
from app.services.cleaner import cleanup_tmp
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Module-level scheduler instance shared by start_scheduler / stop_scheduler /
# get_scheduler; None while no scheduler has been started.
_scheduler: AsyncIOScheduler | None = None
def get_scheduler() -> AsyncIOScheduler | None:
    """Return the module-level scheduler, or ``None`` when none is set.

    Intended for tests and external inspection.
    """
    return _scheduler
def start_scheduler() -> AsyncIOScheduler | None:
    """Create and start the APScheduler instance that runs the daily pipeline.

    Behaviour:
    - Does nothing and returns ``None`` unless ``settings.SCHEDULER_ENABLED``
      is truthy.
    - Idempotent: if a scheduler started by a previous call is still running,
      that instance is returned instead of starting a second one (which would
      otherwise keep running unreferenced and fire the pipeline twice).
    - With ``settings.APP_WORKERS > 1`` only a warning is logged; each worker
      process would start its own scheduler.
    - The cron trigger fires at ``SCHEDULE_HOUR:SCHEDULE_MINUTE`` in the
      ``APP_TIMEZONE`` time zone.

    Returns:
        The running scheduler, or ``None`` when scheduling is disabled.
    """
    global _scheduler

    if not settings.SCHEDULER_ENABLED:
        logger.info("Scheduler disabled (SCHEDULER_ENABLED=false)")
        return None

    # Guard against double start (e.g. a startup hook invoked twice).
    if _scheduler is not None and _scheduler.running:
        logger.info("Scheduler already running, reusing existing instance")
        return _scheduler

    if settings.APP_WORKERS > 1:
        logger.warning(
            "⚠️ APP_WORKERS=%d > 1, scheduler may trigger duplicate tasks. "
            "Set APP_WORKERS=1 or SCHEDULER_ENABLED=false.",
            settings.APP_WORKERS,
        )

    tz = ZoneInfo(settings.APP_TIMEZONE)
    scheduler = AsyncIOScheduler(timezone=tz)

    # Daily pipeline: crawl -> summarize -> cleanup.
    trigger = CronTrigger(
        hour=settings.SCHEDULE_HOUR,
        minute=settings.SCHEDULE_MINUTE,
        timezone=tz,
    )
    scheduler.add_job(
        _daily_pipeline,
        trigger=trigger,
        id="daily_pipeline",
        name="daily_pipeline",
        replace_existing=True,
        max_instances=1,
        misfire_grace_time=3600,  # still run if the trigger is up to 1 hour late
    )

    scheduler.start()
    _scheduler = scheduler
    logger.info(
        "Scheduler started: %02d:%02d %s",
        settings.SCHEDULE_HOUR,
        settings.SCHEDULE_MINUTE,
        settings.APP_TIMEZONE,
    )
    return scheduler
def stop_scheduler() -> None:
    """Shut down the module-level scheduler, if one is set, and forget it.

    The shutdown is requested with ``wait=False``. Calling this when no
    scheduler is set is a no-op.
    """
    global _scheduler

    if not _scheduler:
        return

    _scheduler.shutdown(wait=False)
    _scheduler = None
    logger.info("Scheduler stopped")
async def _daily_pipeline() -> None:
    """Run the daily pipeline: crawl -> summarize -> clean up tmp files.

    Re-entrancy is guarded through the ``task_locks`` table: a ``TaskLock``
    row keyed ``pipeline-<local date>`` is inserted first, and the run is
    skipped when that insert is rejected with an ``IntegrityError``
    (presumably a unique constraint on the lock key; confirm against the
    ``TaskLock`` model). Any other database error during lock acquisition is
    reported as an unexpected error rather than as "already running".

    Progress is recorded in a ``CrawlLog`` row (``task="scheduler"``). When a
    step fails, the session is rolled back before the failure is recorded, so
    that a broken transaction left behind by the step cannot prevent the log
    row and the lock from being updated. The lock is marked ``finished`` in a
    ``finally`` clause, so it is released even when writing the log fails.

    The function never raises ``Exception`` subclasses to the scheduler;
    they are logged instead.
    """
    # Function-scope import keeps this block self-contained; sqlalchemy is
    # already a dependency of this module.
    from sqlalchemy.exc import IntegrityError

    tz = ZoneInfo(settings.APP_TIMEZONE)
    # Take the local time once so the lock key and the log date always agree,
    # even when the job starts right at midnight.
    local_now = datetime.now(tz)
    today = local_now.strftime("%Y-%m-%d")
    now = datetime.now(timezone.utc)
    lock_key = f"pipeline-{today}"

    db: Session = SessionLocal()
    try:
        # Try to acquire the lock.
        lock = TaskLock(
            task="scheduler",
            lock_key=lock_key,
            status="running",
            owner="daily_pipeline",
            acquired_at=now,
        )
        try:
            db.add(lock)
            db.commit()
        except IntegrityError:
            db.rollback()
            logger.warning("Daily pipeline already running for %s, skipping", today)
            return

        try:
            # Write the scheduler log row.
            log_entry = CrawlLog(
                task="scheduler",
                status="running",
                date=local_now.date(),
                started_at=now,
            )
            db.add(log_entry)
            db.commit()

            error_msg = None
            try:
                # Step 1: crawl.
                logger.info("Scheduler pipeline: crawl %s", today)
                crawl_result = await crawl_daily(db, today)
                logger.info("Scheduler pipeline: crawl done, found=%d new=%d",
                            crawl_result.get("found", 0), crawl_result.get("new", 0))

                # Step 2: summarize pending papers.
                logger.info("Scheduler pipeline: summarize batch")
                summarize_result = await summarize_batch(db)
                logger.info("Scheduler pipeline: summarize done, result=%s", summarize_result)

                # Step 3: clean up temporary files.
                logger.info("Scheduler pipeline: cleanup tmp")
                cleanup_result = cleanup_tmp()
                logger.info("Scheduler pipeline: cleanup done, removed=%d", cleanup_result.get("removed", 0))

                log_entry.status = "success"
            except Exception as exc:
                logger.exception("Scheduler pipeline failed for %s", today)
                # Reset the session: the failed step may have left it in a
                # broken transaction, in which case the commit below would
                # raise and the log row would stay "running".
                db.rollback()
                log_entry.status = "failed"
                error_msg = str(exc)[:2000]
            finally:
                log_entry.completed_at = datetime.now(timezone.utc)
                if error_msg:
                    log_entry.error = error_msg
                db.commit()
        finally:
            # Release the lock even when writing the log row failed. The
            # rollback clears any failed transaction first and is a no-op
            # when nothing is pending.
            db.rollback()
            lock.status = "finished"
            lock.released_at = datetime.now(timezone.utc)
            db.commit()
    except Exception:
        logger.exception("Unexpected error in daily pipeline")
    finally:
        db.close()