feat: add admin crawl, cleanup, delete, logs endpoints with scheduler and tests
- Add POST /admin/crawl with TaskLock-based reentrancy guard
- Add POST /admin/cleanup (tmp files older than 24h) with CrawlLog
- Add POST /admin/delete with date range and 'DELETE' confirm token
- Add GET /admin/logs (paginated CrawlLog + DataDeleteJob viewer)
- Add app/services/cleaner.py (cleanup_tmp, delete_papers_by_date_range)
- Add app/services/scheduler.py (APScheduler daily crawl/cleanup jobs)
- Wire scheduler startup/shutdown hooks in app/main.py
- Add admin nav link in base.html and APP_HOST security warning
- Add apscheduler>=3.10 dependency
- Add tests/test_admin_phase4.py covering the new endpoints
This commit is contained in:
@@ -0,0 +1,211 @@
|
||||
"""Cleanup and deletion service: temp-file cleanup and deleting papers by date range."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
from datetime import date, datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import delete, select, text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models import (
|
||||
CrawlLog,
|
||||
DataDeleteJob,
|
||||
Paper,
|
||||
TaskLock,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DATA_DIR = Path("data")
|
||||
_TMP_DIR = _DATA_DIR / "tmp"
|
||||
_PAPERS_DIR = _DATA_DIR / "papers"
|
||||
|
||||
# Maximum age (in hours) a temp directory may reach before cleanup removes it
|
||||
_MAX_TMP_AGE_HOURS = 24
|
||||
|
||||
|
||||
# ── Temp-file cleanup ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
    """Remove stale sub-directories directly under the tmp directory.

    Only directories are considered; any other entry is skipped and not
    counted. A directory is stale when its own modification time is older
    than ``max_age_hours`` hours. Failures are collected per directory and
    never abort the scan.

    Args:
        max_age_hours: Maximum age in hours before a directory is removed.
            Defaults to ``_MAX_TMP_AGE_HOURS``.

    Returns:
        ``{"scanned": int, "removed": int, "errors": list[str]}`` where each
        error entry has the form ``"<directory name>: <exception>"``.
    """
    if not _TMP_DIR.exists():
        return {"scanned": 0, "removed": 0, "errors": []}

    max_age_seconds = max_age_hours * 3600
    threshold = datetime.now(timezone.utc).timestamp() - max_age_seconds

    n_scanned = 0
    n_removed = 0
    failures: list[str] = []

    for candidate in _TMP_DIR.iterdir():
        if candidate.is_dir():
            n_scanned += 1
            try:
                # Staleness is judged by the directory's own mtime.
                is_stale = candidate.stat().st_mtime < threshold
                if is_stale:
                    shutil.rmtree(candidate)
                    n_removed += 1
                    logger.info("Cleaned tmp dir: %s", candidate.name)
            except Exception as exc:
                failures.append(f"{candidate.name}: {exc}")
                logger.warning("Failed to clean tmp dir %s: %s", candidate.name, exc)

    logger.info(
        "Tmp cleanup: scanned=%d removed=%d errors=%d",
        n_scanned,
        n_removed,
        len(failures),
    )
    return {"scanned": n_scanned, "removed": n_removed, "errors": failures}
|
||||
|
||||
|
||||
# ── Delete by date range ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _remove_paper_dirs(arxiv_id: str) -> None:
    """Delete ``<papers dir>/<arxiv_id>/`` and ``<tmp dir>/<arxiv_id>/`` if present.

    Args:
        arxiv_id: Identifier used as the directory name under each base dir.
            An empty/None value means there is nothing to remove.

    Raises:
        ValueError: If ``arxiv_id`` would resolve to a path that is not
            strictly inside the base directory.
        OSError: Propagated from ``shutil.rmtree`` on filesystem failure.
    """
    if not arxiv_id:
        # ``base / ""`` is the base directory itself; never rmtree that.
        return

    for base in (_PAPERS_DIR, _TMP_DIR):
        target = base / arxiv_id
        # Guard the recursive delete: the target must live strictly below
        # the base directory (rejects absolute ids and ``..`` segments).
        if base.resolve() not in target.resolve().parents:
            raise ValueError(f"refusing to remove path outside {base}: {target}")
        if target.exists():
            shutil.rmtree(target)
            logger.debug("Removed dir: %s", target)


async def delete_papers_by_date_range(
    db: Session,
    date_start: date,
    date_end: date,
    *,
    include_notes: bool = True,
) -> dict:
    """Delete every paper whose ``paper_date`` lies in ``[date_start, date_end]``.

    Each paper is processed in its own transaction so one failure cannot
    undo deletions that were already reported as successful:

    1. Delete the paper's row from the ``papers_fts`` index.
    2. Delete the ``Paper`` via the ORM and flush. Related rows are expected
       to be removed by the cascade configured on the model (not visible
       here; confirm against ``app.models``).
    3. Remove the paper's directories under the papers and tmp dirs.
    4. Commit.

    A ``DataDeleteJob`` row tracks the run and a ``CrawlLog`` row with
    ``task="delete"`` is written at the end.

    Args:
        db: Database session.
        date_start: First date to delete (inclusive).
        date_end: Last date to delete (inclusive).
        include_notes: Recorded on the job row only; this function does not
            branch on it.

    Returns:
        ``{"total", "deleted", "failed", "failed_items", "status"}`` where
        ``failed_items`` is a list of ``{"arxiv_id", "error"}`` dicts and
        ``status`` is ``"failed"`` only when there were failures and nothing
        was deleted; otherwise ``"success"``.
    """
    started_at = datetime.now(timezone.utc)

    papers = (
        db.execute(
            select(Paper).where(
                Paper.paper_date >= date_start,
                Paper.paper_date <= date_end,
            )
        )
        .scalars()
        .all()
    )
    # Capture identifiers up front: the commits/rollbacks below expire ORM
    # state, and the identifiers are needed for error reporting even when
    # the row can no longer be loaded.
    targets = [(paper, paper.id, paper.arxiv_id) for paper in papers]

    total = len(targets)
    logger.info(
        "Delete papers by date range: %s ~ %s, found %d papers",
        date_start,
        date_end,
        total,
    )

    # Record the job before doing any destructive work.
    job = DataDeleteJob(
        date_start=date_start,
        date_end=date_end,
        include_notes=include_notes,
        paper_count=total,
        status="running",
        started_at=started_at,
    )
    db.add(job)
    db.commit()

    deleted = 0
    failed_items: list[dict] = []

    for paper, paper_id, arxiv_id in targets:
        try:
            db.execute(
                text("DELETE FROM papers_fts WHERE rowid = :paper_id"),
                {"paper_id": paper_id},
            )
            db.delete(paper)
            # Flush first so database errors surface before files are touched.
            db.flush()

            _remove_paper_dirs(arxiv_id)

            # Commit per paper: a later rollback must only affect the paper
            # that failed, not deletions already counted as done.
            db.commit()
        except Exception as exc:
            db.rollback()
            failed_items.append({"arxiv_id": arxiv_id, "error": str(exc)})
            logger.error("Failed to delete paper %s: %s", arxiv_id, exc)
        else:
            deleted += 1
            logger.debug("Deleted paper: %s", arxiv_id)

    # Partial failures are still reported as "success"; the error text
    # carries the details (capped at the first 20 failures).
    job_status = "failed" if failed_items and deleted == 0 else "success"
    job_error = None
    if failed_items:
        job_error = "; ".join(
            f"{item['arxiv_id']}: {item['error']}" for item in failed_items[:20]
        )

    job.status = job_status
    job.paper_count = deleted
    job.completed_at = datetime.now(timezone.utc)
    if job_error:
        job.error = job_error[:4000]
    db.commit()

    log_entry = CrawlLog(
        task="delete",
        status=job_status,
        started_at=started_at,
        completed_at=datetime.now(timezone.utc),
        papers_found=total,
        papers_new=deleted,
        error=job_error,
    )
    db.add(log_entry)
    db.commit()

    result = {
        "total": total,
        "deleted": deleted,
        "failed": len(failed_items),
        "failed_items": failed_items,
        "status": job_status,
    }
    logger.info(
        "Delete job completed: date_range=%s~%s total=%d deleted=%d failed=%d",
        date_start,
        date_end,
        total,
        deleted,
        len(failed_items),
    )
    return result
|
||||
@@ -0,0 +1,169 @@
|
||||
"""Scheduling service: APScheduler daily pipeline that crawls, summarizes, and cleans up."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
from sqlalchemy.orm import Session
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from app.config import settings
|
||||
from app.database import SessionLocal
|
||||
from app.models import CrawlLog, TaskLock
|
||||
from app.services.cleaner import cleanup_tmp
|
||||
from app.services.crawler import crawl_daily
|
||||
from app.services.summarizer import summarize_batch
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Module-level scheduler instance, kept as a singleton
|
||||
_scheduler: AsyncIOScheduler | None = None
|
||||
|
||||
|
||||
def get_scheduler() -> AsyncIOScheduler | None:
    """Return the module-level scheduler, or ``None`` when none is set.

    Intended for tests and external inspection.
    """
    current = _scheduler
    return current
|
||||
|
||||
|
||||
def start_scheduler() -> AsyncIOScheduler | None:
    """Create and start the APScheduler instance for the daily pipeline.

    Behaviour:
    - Returns ``None`` without starting anything unless
      ``settings.SCHEDULER_ENABLED`` is truthy.
    - If a scheduler created by this module is already running, it is
      returned as-is instead of starting a second one (a second instance
      would fire the daily job twice).
    - When ``settings.APP_WORKERS > 1`` only a warning is logged.
    - The cron trigger fires at ``SCHEDULE_HOUR:SCHEDULE_MINUTE`` in the
      ``APP_TIMEZONE`` time zone.

    Returns:
        The running scheduler, or ``None`` when scheduling is disabled.
    """
    global _scheduler

    if not settings.SCHEDULER_ENABLED:
        logger.info("Scheduler disabled (SCHEDULER_ENABLED=false)")
        return None

    # Idempotence guard: repeated startup calls must not leave an earlier
    # scheduler running alongside a new one.
    if _scheduler is not None and _scheduler.running:
        logger.warning("Scheduler already running, reusing existing instance")
        return _scheduler

    if settings.APP_WORKERS > 1:
        logger.warning(
            "⚠️  APP_WORKERS=%d > 1, scheduler may trigger duplicate tasks. "
            "Set APP_WORKERS=1 or SCHEDULER_ENABLED=false.",
            settings.APP_WORKERS,
        )

    tz = ZoneInfo(settings.APP_TIMEZONE)
    scheduler = AsyncIOScheduler(timezone=tz)

    # Daily pipeline: crawl -> summarize -> cleanup.
    trigger = CronTrigger(
        hour=settings.SCHEDULE_HOUR,
        minute=settings.SCHEDULE_MINUTE,
        timezone=tz,
    )
    scheduler.add_job(
        _daily_pipeline,
        trigger=trigger,
        id="daily_pipeline",
        name="daily_pipeline",
        replace_existing=True,
        max_instances=1,
        misfire_grace_time=3600,  # still run if up to one hour late
    )

    scheduler.start()
    _scheduler = scheduler
    logger.info(
        "Scheduler started: %02d:%02d %s",
        settings.SCHEDULE_HOUR,
        settings.SCHEDULE_MINUTE,
        settings.APP_TIMEZONE,
    )
    return scheduler
|
||||
|
||||
|
||||
def stop_scheduler() -> None:
    """Shut down the module-level scheduler, if one is set.

    The scheduler is shut down with ``wait=False`` and the module-level
    reference is cleared afterwards. Does nothing when no scheduler is set.
    """
    global _scheduler

    active = _scheduler
    if not active:
        return

    active.shutdown(wait=False)
    _scheduler = None
    logger.info("Scheduler stopped")
|
||||
|
||||
|
||||
async def _daily_pipeline() -> None:
    """Run the daily pipeline: crawl -> summarize -> clean up temp files.

    Reentrancy is guarded by inserting a ``TaskLock`` row keyed
    ``pipeline-<YYYY-MM-DD>`` (date in ``APP_TIMEZONE``). If the insert is
    rejected with an integrity error the run is skipped; this presumably
    relies on a uniqueness constraint on the lock table (not visible here).

    A ``CrawlLog`` row with ``task="scheduler"`` records the outcome. Errors
    raised by a pipeline step are logged and stored on that row (truncated
    to 2000 characters); nothing is re-raised to the scheduler.
    """
    # Local import keeps this block self-contained.
    from sqlalchemy.exc import IntegrityError

    tz = ZoneInfo(settings.APP_TIMEZONE)
    # Take the local time once so the lock key and the log date cannot
    # disagree when the job runs across midnight.
    local_now = datetime.now(tz)
    today = local_now.strftime("%Y-%m-%d")
    started_at = datetime.now(timezone.utc)
    lock_key = f"pipeline-{today}"

    db: Session = SessionLocal()
    try:
        # Try to acquire the lock.
        lock = TaskLock(
            task="scheduler",
            lock_key=lock_key,
            status="running",
            owner="daily_pipeline",
            acquired_at=started_at,
        )
        db.add(lock)
        try:
            db.commit()
        except IntegrityError:
            # Only a constraint violation means "someone else holds the
            # lock"; other database errors fall through to the outer handler.
            db.rollback()
            logger.warning("Daily pipeline already running for %s, skipping", today)
            return

        try:
            # Write the scheduler log row.
            log_entry = CrawlLog(
                task="scheduler",
                status="running",
                date=local_now.date(),
                started_at=started_at,
            )
            db.add(log_entry)
            db.commit()

            outcome = "success"
            error_msg = None
            try:
                # Step 1: crawl
                logger.info("Scheduler pipeline: crawl %s", today)
                crawl_result = await crawl_daily(db, today)
                logger.info(
                    "Scheduler pipeline: crawl done, found=%d new=%d",
                    crawl_result.get("found", 0),
                    crawl_result.get("new", 0),
                )

                # Step 2: summarize pending papers
                logger.info("Scheduler pipeline: summarize batch")
                summarize_result = await summarize_batch(db)
                logger.info("Scheduler pipeline: summarize done, result=%s", summarize_result)

                # Step 3: clean up temp files
                logger.info("Scheduler pipeline: cleanup tmp")
                cleanup_result = cleanup_tmp()
                logger.info(
                    "Scheduler pipeline: cleanup done, removed=%d",
                    cleanup_result.get("removed", 0),
                )
            except Exception as exc:
                logger.exception("Scheduler pipeline failed for %s", today)
                # A failed step may leave the session in a failed
                # transaction; reset it so the failure can be recorded.
                db.rollback()
                outcome = "failed"
                error_msg = str(exc)[:2000]

            log_entry.status = outcome
            log_entry.completed_at = datetime.now(timezone.utc)
            if error_msg:
                log_entry.error = error_msg
            db.commit()
        finally:
            # Release the lock even if recording the outcome failed.
            db.rollback()
            lock.status = "finished"
            lock.released_at = datetime.now(timezone.utc)
            db.commit()

    except Exception:
        logger.exception("Unexpected error in daily pipeline")
    finally:
        db.close()
|
||||
Reference in New Issue
Block a user