Files
daily-paper/app/services/pipeline.py
T

109 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""流水线服务 — crawl → summarize → cleanup 的共享编排逻辑。
供 admin 手动触发和 scheduler 定时调度共用。
"""
from __future__ import annotations
import logging
from datetime import date as date_type
from sqlalchemy.orm import Session
from app.config import settings
from app.models import CrawlLog, TaskLock
from app.services.cleaner import cleanup_tmp
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch
from app.utils import utc_now, yesterday_str
logger = logging.getLogger(__name__)
async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
"""执行完整流水线:crawl → summarize → cleanup。
使用 task_locks 防重入,写入 CrawlLog 记录。
Args:
db: 数据库 session
target_date: 目标日期 YYYY-MM-DD
owner: 调用者标识(如 "admin_trigger" / "daily_pipeline"
Returns:
{"status": "success"|"failed", "error": str|None, ...}
"""
now = utc_now()
lock_key = f"pipeline-{target_date}"
# ── 获取锁 ──────────────────────────────────────────────────────────
lock = TaskLock(
task="scheduler",
lock_key=lock_key,
status="running",
owner=owner,
acquired_at=now,
)
try:
db.add(lock)
db.commit()
except Exception:
db.rollback()
raise RuntimeError(f"Pipeline already running for {target_date}")
# ── 写调度日志 ──────────────────────────────────────────────────────
log_entry = CrawlLog(
task="scheduler",
status="running",
date=date_type.fromisoformat(target_date),
started_at=now,
)
db.add(log_entry)
db.commit()
error_msg = None
crawl_result: dict = {}
try:
# Step 1: 抓取(先试今天,无数据则回退昨天)
crawl_result = await crawl_daily(db, target_date)
logger.info("Pipeline [%s]: crawl %s, found=%d new=%d",
owner, target_date,
crawl_result.get("found", 0), crawl_result.get("new", 0))
if crawl_result.get("status") == "success" and crawl_result.get("found") == 0:
yesterday = yesterday_str()
logger.info("Pipeline [%s]: falling back to %s", owner, yesterday)
crawl_result = await crawl_daily(db, yesterday)
# Step 2: 总结
summarize_result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
logger.info("Pipeline [%s]: summarize done, result=%s", owner, summarize_result)
# Step 3: 清理
cleanup_result = cleanup_tmp()
logger.info("Pipeline [%s]: cleanup done, removed=%d",
owner, cleanup_result.get("removed", 0))
log_entry.status = "success"
log_entry.papers_found = crawl_result.get("found", 0)
log_entry.papers_new = crawl_result.get("new", 0)
except Exception as exc:
logger.exception("Pipeline [%s] failed", owner)
log_entry.status = "failed"
error_msg = str(exc)[:2000]
finally:
log_entry.completed_at = utc_now()
if error_msg:
log_entry.error = error_msg
db.commit()
lock.status = "finished"
lock.released_at = utc_now()
db.commit()
if error_msg:
return {"status": "failed", "error": error_msg}
return {"status": "success", "message": "Pipeline completed"}