feat: add admin crawl, cleanup, delete, logs endpoints with scheduler and tests

- Add POST /admin/crawl with TaskLock-based reentrancy guard
- Add POST /admin/cleanup (tmp files older than 24h) with CrawlLog
- Add POST /admin/delete with date range and 'DELETE' confirm token
- Add GET /admin/logs (paginated CrawlLog + DataDeleteJob viewer)
- Add app/services/cleaner.py (cleanup_tmp, delete_papers_by_date_range)
- Add app/services/scheduler.py (APScheduler daily crawl/cleanup jobs)
- Wire scheduler startup/shutdown hooks in app/main.py
- Add admin nav link in base.html and APP_HOST security warning
- Add apscheduler>=3.10 dependency
- Add tests/test_admin_phase4.py covering the new endpoints
This commit is contained in:
2026-06-05 23:07:45 +08:00
parent 1538d564f6
commit 2cfd1a8a9f
8 changed files with 1530 additions and 2 deletions
+211
View File
@@ -0,0 +1,211 @@
"""清理和删除服务 — 临时文件清理、按日期范围删除论文。"""
from __future__ import annotations
import logging
import shutil
from datetime import date, datetime, timezone
from pathlib import Path
from sqlalchemy import delete, select, text
from sqlalchemy.orm import Session
from app.models import (
CrawlLog,
DataDeleteJob,
Paper,
TaskLock,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# On-disk layout. These are relative paths, so they resolve against the
# process's current working directory.
_DATA_DIR = Path("data")
# Scratch area scanned by cleanup_tmp(); also holds per-paper tmp directories.
_TMP_DIR = _DATA_DIR / "tmp"
# One sub-directory per paper, named after the paper's arxiv_id.
_PAPERS_DIR = _DATA_DIR / "papers"
# Maximum retention time for temporary files, in hours.
_MAX_TMP_AGE_HOURS = 24
# ── 临时文件清理 ──────────────────────────────────────────────────────────
def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
    """Remove stale sub-directories of ``data/tmp/``.

    Only directories directly under the tmp root are considered; plain files
    are skipped and not counted. A directory is stale when its own
    modification time is older than ``max_age_hours`` hours. Failures are
    collected per entry and never abort the scan.

    Args:
        max_age_hours: Maximum age, in hours, a tmp directory may reach
            before it is removed. Defaults to 24.

    Returns:
        Cleanup statistics:
        ``{"scanned": int, "removed": int, "errors": list[str]}``.
    """
    stats: dict = {"scanned": 0, "removed": 0, "errors": []}
    if not _TMP_DIR.exists():
        return stats

    # POSIX timestamp; directories last modified before it are stale.
    threshold = datetime.now(timezone.utc).timestamp() - (max_age_hours * 3600)

    for candidate in _TMP_DIR.iterdir():
        if not candidate.is_dir():
            continue
        stats["scanned"] += 1
        try:
            # The directory's own mtime decides whether it is stale.
            is_stale = candidate.stat().st_mtime < threshold
            if is_stale:
                shutil.rmtree(candidate)
                stats["removed"] += 1
                logger.info("Cleaned tmp dir: %s", candidate.name)
        except Exception as exc:
            stats["errors"].append(f"{candidate.name}: {exc}")
            logger.warning("Failed to clean tmp dir %s: %s", candidate.name, exc)

    logger.info(
        "Tmp cleanup: scanned=%d removed=%d errors=%d",
        stats["scanned"],
        stats["removed"],
        len(stats["errors"]),
    )
    return stats
# ── 按日期范围删除 ─────────────────────────────────────────────────────────
async def delete_papers_by_date_range(
    db: Session,
    date_start: date,
    date_end: date,
    *,
    include_notes: bool = True,
) -> dict:
    """Delete every paper whose ``paper_date`` lies in [date_start, date_end].

    Each paper is processed in its own transaction, so a failure on one paper
    never undoes papers that were already deleted:

    1. Query the target papers and record a ``DataDeleteJob`` ("running").
    2. Per paper: delete its FTS5 row, delete the ORM object and flush,
       remove ``data/papers/{arxiv_id}/`` and ``data/tmp/{arxiv_id}/``,
       then commit. Any exception rolls back that paper only.
    3. Update the job row and append a ``CrawlLog`` entry (task "delete").

    Related rows (authors, tags, summaries, bookmarks, notes, ...) are
    expected to be removed by the ORM cascade configured on ``Paper``; that
    configuration is not visible in this module.

    Note: the function is declared ``async`` for its callers but performs
    only blocking database and filesystem work.

    Args:
        db: Database session. This function commits and rolls back on it.
        date_start: First date of the range (inclusive).
        date_end: Last date of the range (inclusive).
        include_notes: Stored on the job record. Note removal itself is left
            to the ORM cascade, so this flag does not alter what is deleted.

    Returns:
        ``{"total", "deleted", "failed", "failed_items", "status"}`` where
        ``failed_items`` is a list of ``{"arxiv_id", "error"}`` dicts and
        ``status`` is "failed" only when there were failures and nothing was
        deleted; otherwise "success".
    """
    started_at = datetime.now(timezone.utc)

    papers = (
        db.execute(
            select(Paper).where(
                Paper.paper_date >= date_start,
                Paper.paper_date <= date_end,
            )
        )
        .scalars()
        .all()
    )
    total = len(papers)
    logger.info("Delete papers by date range: %s ~ %s, found %d papers", date_start, date_end, total)

    # Snapshot identifiers before the first commit: commit/rollback may expire
    # ORM instances, and a failed reload must not prevent error reporting.
    targets = [(paper, paper.id, paper.arxiv_id) for paper in papers]

    job = DataDeleteJob(
        date_start=date_start,
        date_end=date_end,
        include_notes=include_notes,
        paper_count=total,
        status="running",
        started_at=started_at,
    )
    db.add(job)
    db.commit()

    deleted = 0
    failed_items: list[dict] = []
    for paper, paper_id, arxiv_id in targets:
        try:
            # Database work first, so a DB-side error is detected before any
            # file is removed from disk.
            db.execute(
                text("DELETE FROM papers_fts WHERE rowid = :paper_id"),
                {"paper_id": paper_id},
            )
            db.delete(paper)
            db.flush()

            if _remove_tree_within(_PAPERS_DIR, arxiv_id):
                logger.debug("Removed paper dir: %s", _PAPERS_DIR / arxiv_id)
            if _remove_tree_within(_TMP_DIR, arxiv_id):
                logger.debug("Removed tmp dir: %s", _TMP_DIR / arxiv_id)

            # Commit per paper: a later rollback then only affects the paper
            # that failed, keeping `deleted` consistent with the database.
            db.commit()
        except Exception as exc:
            db.rollback()
            failed_items.append({"arxiv_id": arxiv_id, "error": str(exc)})
            logger.exception("Failed to delete paper %s: %s", arxiv_id, exc)
        else:
            deleted += 1
            logger.debug("Deleted paper: %s", arxiv_id)

    # Partial failures keep status "success"; details go into the error text
    # (capped at the first 20 failures).
    job_error = None
    job_status = "success"
    if failed_items:
        job_status = "failed" if deleted == 0 else "success"
        job_error = "; ".join(f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20])

    job.status = job_status
    job.paper_count = deleted  # overwritten with the number actually deleted
    job.completed_at = datetime.now(timezone.utc)
    if job_error:
        job.error = job_error[:4000]
    db.commit()

    # Mirror the outcome into crawl_logs.
    log_entry = CrawlLog(
        task="delete",
        status=job_status,
        started_at=started_at,
        completed_at=datetime.now(timezone.utc),
        papers_found=total,
        papers_new=deleted,
        error=job_error,
    )
    db.add(log_entry)
    db.commit()

    result = {
        "total": total,
        "deleted": deleted,
        "failed": len(failed_items),
        "failed_items": failed_items,
        "status": job_status,
    }
    logger.info(
        "Delete job completed: date_range=%s~%s total=%d deleted=%d failed=%d",
        date_start, date_end, total, deleted, len(failed_items),
    )
    return result


def _remove_tree_within(base_dir: Path, name: str) -> bool:
    """Remove ``base_dir / name`` only if it is strictly inside ``base_dir``.

    Guards against an empty, absolute, or ``..``-style name that would make
    the joined path resolve to ``base_dir`` itself or to somewhere outside it.

    Returns:
        True when a directory tree was removed; False when the name was
        rejected or the target does not exist.

    Raises:
        OSError: propagated from ``shutil.rmtree`` when removal fails.
    """
    if not name:
        logger.warning("Skipping file removal under %s: empty identifier", base_dir)
        return False
    base = base_dir.resolve()
    target = (base_dir / name).resolve()
    if base not in target.parents:
        logger.warning("Refusing to remove %s: not inside %s", target, base)
        return False
    if not target.exists():
        return False
    shutil.rmtree(target)
    return True