2cfd1a8a9f
- Add POST /admin/crawl with TaskLock-based reentrancy guard - Add POST /admin/cleanup (tmp files older than 24h) with CrawlLog - Add POST /admin/delete with date range and 'DELETE' confirm token - Add GET /admin/logs (paginated CrawlLog + DataDeleteJob viewer) - Add app/services/cleaner.py (cleanup_tmp, delete_papers_by_date_range) - Add app/services/scheduler.py (APScheduler daily crawl/cleanup jobs) - Wire scheduler startup/shutdown hooks in app/main.py - Add admin nav link in base.html and APP_HOST security warning - Add apscheduler>=3.10 dependency - Add tests/test_admin_phase4.py covering the new endpoints
170 lines
5.1 KiB
Python
170 lines
5.1 KiB
Python
"""调度服务 — APScheduler 每日自动抓取、总结、清理流水线。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
|
from apscheduler.triggers.cron import CronTrigger
|
|
from sqlalchemy.orm import Session
|
|
from zoneinfo import ZoneInfo
|
|
|
|
from app.config import settings
|
|
from app.database import SessionLocal
|
|
from app.models import CrawlLog, TaskLock
|
|
from app.services.cleaner import cleanup_tmp
|
|
from app.services.crawler import crawl_daily
|
|
from app.services.summarizer import summarize_batch
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# 模块级 scheduler 实例,保证单例
|
|
_scheduler: AsyncIOScheduler | None = None
|
|
|
|
|
|
def get_scheduler() -> AsyncIOScheduler | None:
|
|
"""返回当前 scheduler 实例(供测试和外部检查用)。"""
|
|
return _scheduler
|
|
|
|
|
|
def start_scheduler() -> AsyncIOScheduler | None:
|
|
"""创建并启动 APScheduler。
|
|
|
|
约束:
|
|
- SCHEDULER_ENABLED=true 才启动。
|
|
- APP_WORKERS > 1 时只打印警告(多 worker 下调度器可能重复触发)。
|
|
- 使用 task_locks 表防重入。
|
|
- 调度时间按 APP_TIMEZONE 时区。
|
|
"""
|
|
global _scheduler
|
|
|
|
if not settings.SCHEDULER_ENABLED:
|
|
logger.info("Scheduler disabled (SCHEDULER_ENABLED=false)")
|
|
return None
|
|
|
|
if settings.APP_WORKERS > 1:
|
|
logger.warning(
|
|
"⚠️ APP_WORKERS=%d > 1, scheduler may trigger duplicate tasks. "
|
|
"Set APP_WORKERS=1 or SCHEDULER_ENABLED=false.",
|
|
settings.APP_WORKERS,
|
|
)
|
|
|
|
tz = ZoneInfo(settings.APP_TIMEZONE)
|
|
scheduler = AsyncIOScheduler(timezone=tz)
|
|
|
|
# 每日流水线:抓取 → 总结 → 清理
|
|
trigger = CronTrigger(
|
|
hour=settings.SCHEDULE_HOUR,
|
|
minute=settings.SCHEDULE_MINUTE,
|
|
timezone=tz,
|
|
)
|
|
scheduler.add_job(
|
|
_daily_pipeline,
|
|
trigger=trigger,
|
|
id="daily_pipeline",
|
|
name="daily_pipeline",
|
|
replace_existing=True,
|
|
max_instances=1,
|
|
misfire_grace_time=3600, # 允许迟到 1 小时内补执行
|
|
)
|
|
|
|
scheduler.start()
|
|
_scheduler = scheduler
|
|
logger.info(
|
|
"Scheduler started: %02d:%02d %s",
|
|
settings.SCHEDULE_HOUR,
|
|
settings.SCHEDULE_MINUTE,
|
|
settings.APP_TIMEZONE,
|
|
)
|
|
return scheduler
|
|
|
|
|
|
def stop_scheduler() -> None:
|
|
"""停止调度器。"""
|
|
global _scheduler
|
|
if _scheduler:
|
|
_scheduler.shutdown(wait=False)
|
|
_scheduler = None
|
|
logger.info("Scheduler stopped")
|
|
|
|
|
|
async def _daily_pipeline() -> None:
|
|
"""每日流水线:抓取 → 总结 → 清理。
|
|
|
|
使用 task_locks 表防止重入:同一天的 pipeline 任务只有一个能运行。
|
|
"""
|
|
tz = ZoneInfo(settings.APP_TIMEZONE)
|
|
today = datetime.now(tz).strftime("%Y-%m-%d")
|
|
now = datetime.now(timezone.utc)
|
|
lock_key = f"pipeline-{today}"
|
|
|
|
db: Session = SessionLocal()
|
|
try:
|
|
# 尝试获取锁
|
|
lock = TaskLock(
|
|
task="scheduler",
|
|
lock_key=lock_key,
|
|
status="running",
|
|
owner="daily_pipeline",
|
|
acquired_at=now,
|
|
)
|
|
try:
|
|
db.add(lock)
|
|
db.commit()
|
|
except Exception:
|
|
db.rollback()
|
|
logger.warning("Daily pipeline already running for %s, skipping", today)
|
|
return
|
|
|
|
# 写调度日志
|
|
log_entry = CrawlLog(
|
|
task="scheduler",
|
|
status="running",
|
|
date=datetime.now(tz).date(),
|
|
started_at=now,
|
|
)
|
|
db.add(log_entry)
|
|
db.commit()
|
|
|
|
error_msg = None
|
|
try:
|
|
# Step 1: 抓取
|
|
logger.info("Scheduler pipeline: crawl %s", today)
|
|
crawl_result = await crawl_daily(db, today)
|
|
logger.info("Scheduler pipeline: crawl done, found=%d new=%d",
|
|
crawl_result.get("found", 0), crawl_result.get("new", 0))
|
|
|
|
# Step 2: 总结 pending 论文
|
|
logger.info("Scheduler pipeline: summarize batch")
|
|
summarize_result = await summarize_batch(db)
|
|
logger.info("Scheduler pipeline: summarize done, result=%s", summarize_result)
|
|
|
|
# Step 3: 清理临时文件
|
|
logger.info("Scheduler pipeline: cleanup tmp")
|
|
cleanup_result = cleanup_tmp()
|
|
logger.info("Scheduler pipeline: cleanup done, removed=%d", cleanup_result.get("removed", 0))
|
|
|
|
log_entry.status = "success"
|
|
|
|
except Exception as exc:
|
|
logger.exception("Scheduler pipeline failed for %s", today)
|
|
log_entry.status = "failed"
|
|
error_msg = str(exc)[:2000]
|
|
|
|
finally:
|
|
log_entry.completed_at = datetime.now(timezone.utc)
|
|
if error_msg:
|
|
log_entry.error = error_msg
|
|
db.commit()
|
|
|
|
# 释放锁
|
|
lock.status = "finished"
|
|
lock.released_at = datetime.now(timezone.utc)
|
|
db.commit()
|
|
|
|
except Exception:
|
|
logger.exception("Unexpected error in daily pipeline")
|
|
finally:
|
|
db.close()
|