refactor: extract admin business logic to services, introduce job queue, add derived index helpers

- Move DB operations from routes/admin.py to services/admin.py (get_logs_context, query_summary_statuses, retry_failed_summaries, delete/reset operations)
- Add services/jobs.py with Job/JobEvent-based async job queue (create_job, run_job, enqueue_job)
- Add services/derived.py with FTS5 reindex and paper index deletion helpers
- Refactor scheduler to use job queue instead of direct pipeline calls
- Add heartbeat_at/expires_at to TaskLock for lock health tracking
- Remove DESIGN_REVIEW.md
- Update tests: remove redundant integration tests, add unit tests for new services
2026-06-13 18:31:43 +08:00
parent 21f16e6756
commit 743d69efd0
20 changed files with 1391 additions and 1063 deletions
@@ -4,36 +4,20 @@ from __future__ import annotations

 import hashlib
 import hmac
-import json
-import logging
 from datetime import date

-from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
+from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, Query, Request
 from fastapi.responses import RedirectResponse
 from pydantic import BaseModel, field_validator
-from sqlalchemy import bindparam, func, select, text
 from sqlalchemy.orm import Session

 from app.config import settings
 from app.database import get_db
-from app.models import (
-    CrawlLog,
-    DataDeleteJob,
-    Paper,
-    PaperTag,
-    SummaryState,
-    SummaryStatus,
-)
 from app.services import admin as admin_svc
 from app.services.admin import get_admin_stats
-from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
-from app.services.crawler import refresh_upvotes
-from app.services.pipeline import run_crawl, run_pipeline
-from app.services.scheduler import get_scheduler
-from app.services.summarizer import summarize_batch, summarize_single
-from app.utils import templates, today_str, utc_now
-
-logger = logging.getLogger(__name__)
+from app.services.cleaner import cleanup_tmp
+from app.services.jobs import create_job, enqueue_job
+from app.utils import templates, today_str

 router = APIRouter(prefix="/admin", tags=["admin"])

@@ -103,18 +87,7 @@ async def admin_dashboard(
 ):
    """管理仪表盘 — 系统状态总览。"""
    stats = get_admin_stats(db)
-
-    # 调度器历史（最近 10 条 task=scheduler 日志）
-    scheduler_history = (
-        db.execute(
-            select(CrawlLog)
-            .where(CrawlLog.task == "scheduler")
-            .order_by(CrawlLog.started_at.desc())
-            .limit(10)
-        )
-        .scalars()
-        .all()
-    )
+    scheduler_history = admin_svc.get_scheduler_history(db)

    return templates.TemplateResponse(
        request,
@@ -129,53 +102,43 @@ async def admin_dashboard(
@router.get("/scheduler-status")
 async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
    """调度器运行状态（JSON）。"""
-    scheduler = get_scheduler()
-    next_run = None
-    upvote_next_run = None
-    if scheduler:
-        for job in scheduler.get_jobs():
-            if job.id == "daily_pipeline":
-                next_run = job.next_run_time
-            elif job.id == "upvote_refresh":
-                upvote_next_run = job.next_run_time
-    return {
-        "enabled": scheduler is not None,
-        "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
-        "timezone": settings.APP_TIMEZONE,
-        "next_run": next_run.isoformat() if next_run else None,
-        "upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
-        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
-    }
+    return admin_svc.get_scheduler_status()


@router.post("/trigger-pipeline")
 async def admin_trigger_pipeline(
+    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
    """手动触发一次完整流水线（crawl → summarize → cleanup）。"""
    today = today_str()
-    try:
-        result = await run_pipeline(db, today, owner="admin_trigger")
-    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc))
-
-    if result["status"] == "failed":
-        raise HTTPException(status_code=500, detail=result.get("error"))
-    return {"status": "success", "message": "流水线执行完成"}
+    job = create_job(
+        db,
+        "pipeline_daily",
+        owner="admin_trigger",
+        payload={"target_date": today},
+    )
+    enqueue_job(background_tasks, job.id)
+    return {"status": "queued", "job_id": job.id, "message": "流水线任务已创建"}


@router.post("/refresh-upvotes")
 async def admin_refresh_upvotes(
+    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    days: int | None = Query(None, description="刷新最近 N 天，默认使用配置值"),
 ):
    """手动刷新最近 N 天论文的 upvotes。"""
-    result = await refresh_upvotes(db, days=days)
-    if result["status"] == "failed":
-        raise HTTPException(status_code=500, detail=result.get("error"))
-    return result
+    job = create_job(
+        db,
+        "refresh_upvotes",
+        owner="admin_refresh",
+        payload={"days": days},
+    )
+    enqueue_job(background_tasks, job.id)
+    return {"status": "queued", "job_id": job.id}


 # ── 请求模型 ──────────────────────────────────────────────────────────
@@ -200,18 +163,21 @@ class DeleteRequest(BaseModel):

@router.post("/crawl")
 async def admin_crawl(
+    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    date: str | None = Query(None, description="YYYY-MM-DD，默认今天"),
 ):
    """手动抓取指定日期，默认今天。"""
    target_date = date or today_str()
-    try:
-        return await run_crawl(db, target_date, owner="admin_crawl")
-    except RuntimeError as exc:
-        raise HTTPException(status_code=409, detail=str(exc))
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=str(exc))
+    job = create_job(
+        db,
+        "crawl_daily",
+        owner="admin_crawl",
+        payload={"target_date": target_date},
+    )
+    enqueue_job(background_tasks, job.id)
+    return {"status": "queued", "job_id": job.id, "target_date": target_date}


 # ── 总结 ──────────────────────────────────────────────────────────────
@@ -219,23 +185,41 @@ async def admin_crawl(

@router.post("/summarize")
 async def admin_summarize_batch(
+    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
    """批量总结所有 pending 论文。"""
-    return await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
+    job = create_job(
+        db,
+        "summarize_batch",
+        owner="admin_summarize",
+        payload={"pdf_mode": settings.SUMMARY_PDF_MODE},
+    )
+    enqueue_job(background_tasks, job.id)
+    return {"status": "queued", "job_id": job.id}


@router.post("/summarize/{arxiv_id}")
 async def admin_summarize_single(
    arxiv_id: str,
+    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
    """总结或重跑单篇论文。"""
-    return await summarize_single(
-        db, arxiv_id, force=True, pdf_mode=settings.SUMMARY_PDF_MODE
+    job = create_job(
+        db,
+        "summarize_one",
+        owner="admin_summarize",
+        payload={
+            "arxiv_id": arxiv_id,
+            "force": True,
+            "pdf_mode": settings.SUMMARY_PDF_MODE,
+        },
    )
+    enqueue_job(background_tasks, job.id)
+    return {"status": "queued", "job_id": job.id, "arxiv_id": arxiv_id}


 # ── 清理 ──────────────────────────────────────────────────────────────
@@ -243,39 +227,25 @@ async def admin_summarize_single(

@router.post("/cleanup")
 async def admin_cleanup(
+    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
    """清理 data/tmp/ 中超过 24 小时的临时文件。"""
-    now = utc_now()
-    log_entry = CrawlLog(
-        task="cleanup",
-        status="running",
-        started_at=now,
-    )
-    db.add(log_entry)
-    db.commit()
+    job = create_job(db, "cleanup_tmp", owner="admin_cleanup", payload={})
+    enqueue_job(background_tasks, job.id)
+    return {"status": "queued", "job_id": job.id}

+
+@router.post("/cleanup-now")
+async def admin_cleanup_now(
+    _admin: None = Depends(verify_admin),
+    db: Session = Depends(get_db),
+):
+    """同步清理临时文件，保留给测试和本地排障使用。"""
    try:
-        result = cleanup_tmp()
-        log_entry.status = "success"
-        log_entry.completed_at = utc_now()
-        log_entry.details_json = json.dumps(
-            {
-                "scanned": result.get("scanned", 0),
-                "removed": result.get("removed", 0),
-            },
-            ensure_ascii=False,
-        )
-        if result.get("errors"):
-            log_entry.error = "; ".join(result["errors"])[:2000]
-        db.commit()
-        return result
+        return admin_svc.run_cleanup_now(db, cleanup_tmp)
    except Exception as exc:
-        log_entry.status = "failed"
-        log_entry.error = str(exc)[:2000]
-        log_entry.completed_at = utc_now()
-        db.commit()
        raise HTTPException(status_code=500, detail=str(exc))


@@ -285,6 +255,7 @@ async def admin_cleanup(
@router.post("/delete")
 async def admin_delete(
    body: DeleteRequest,
+    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
 ):
@@ -292,13 +263,31 @@ async def admin_delete(
    if body.date_start > body.date_end:
        raise HTTPException(status_code=400, detail="date_start must be <= date_end")

-    result = await delete_papers_by_date_range(
+    job = create_job(
        db,
-        body.date_start,
-        body.date_end,
-        include_notes=body.include_notes,
+        "delete_range",
+        owner="admin_delete",
+        payload={
+            "date_start": body.date_start.isoformat(),
+            "date_end": body.date_end.isoformat(),
+            "include_notes": body.include_notes,
+        },
    )
-    return result
+    enqueue_job(background_tasks, job.id)
+    return {"status": "queued", "job_id": job.id}
+
+
+@router.get("/jobs/{job_id}")
+async def admin_job_detail(
+    job_id: int,
+    _admin: None = Depends(verify_admin),
+    db: Session = Depends(get_db),
+):
+    """查询后台任务状态和阶段事件。"""
+    detail = admin_svc.get_job_detail(db, job_id)
+    if not detail:
+        raise HTTPException(status_code=404, detail=f"Job not found: {job_id}")
+    return detail


 # ── 日志 ──────────────────────────────────────────────────────────────
@@ -313,72 +302,10 @@ async def admin_logs(
    per_page: int = Query(20, ge=1, le=100),
 ):
    """查看任务日志（CrawlLog + DataDeleteJob）+ 总结状态统计。"""
-    crawl_logs = (
-        db.execute(
-            select(CrawlLog)
-            .order_by(CrawlLog.started_at.desc())
-            .limit(per_page)
-            .offset((page - 1) * per_page)
-        )
-        .scalars()
-        .all()
-    )
-
-    delete_jobs = (
-        db.execute(
-            select(DataDeleteJob)
-            .order_by(DataDeleteJob.started_at.desc())
-            .limit(per_page)
-            .offset((page - 1) * per_page)
-        )
-        .scalars()
-        .all()
-    )
-
-    # 总结状态统计概要
-    summary_total = db.scalar(select(func.count(Paper.id))) or 0
-    summary_done = (
-        db.scalar(
-            select(func.count(SummaryStatus.id)).where(
-                SummaryStatus.status == SummaryState.DONE
-            )
-        )
-        or 0
-    )
-    summary_pending = (
-        db.scalar(
-            select(func.count(SummaryStatus.id)).where(
-                SummaryStatus.status.in_(
-                    [SummaryState.PENDING, SummaryState.PROCESSING]
-                )
-            )
-        )
-        or 0
-    )
-    summary_failed = (
-        db.scalar(
-            select(func.count(SummaryStatus.id)).where(
-                SummaryStatus.status.in_(
-                    [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
-                )
-            )
-        )
-        or 0
-    )
-
    return templates.TemplateResponse(
        request,
        "admin_logs.html",
-        {
-            "crawl_logs": crawl_logs,
-            "delete_jobs": delete_jobs,
-            "page": page,
-            "per_page": per_page,
-            "summary_total": summary_total,
-            "summary_done": summary_done,
-            "summary_pending": summary_pending,
-            "summary_failed": summary_failed,
-        },
+        admin_svc.get_logs_context(db, page=page, per_page=per_page),
    )


@@ -395,22 +322,10 @@ async def admin_summary_status(
    per_page: int = Query(20, ge=1, le=100),
 ):
    """总结状态列表（HTMX 片段或 JSON）。"""
-
-    query = (
-        select(Paper, SummaryStatus)
-        .outerjoin(SummaryStatus, SummaryStatus.paper_id == Paper.id)
-        .order_by(Paper.paper_date.desc())
+    results, total = admin_svc.query_summary_statuses(
+        db, status=status, page=page, per_page=per_page
    )

-    if status != "all":
-        if status == "none":
-            query = query.where(SummaryStatus.paper_id == None)  # noqa: E711
-        else:
-            query = query.where(SummaryStatus.status == status)
-
-    total = db.scalar(select(func.count()).select_from(query.subquery()))
-    results = db.execute(query.offset((page - 1) * per_page).limit(per_page)).all()
-
    # 判断是否 HTMX 请求
    is_htmx = request.headers.get("HX-Request") == "true"

@@ -421,27 +336,16 @@ async def admin_summary_status(
            "partials/summary_list.html",
            {
                "results": results,
-                "total": total or 0,
+                "total": total,
                "page": page,
                "per_page": per_page,
                "current_status": status,
            },
        )

-    # 非 HTMX 返回 JSON
-    items = []
-    for paper, ss in results:
-        item = {
-            "arxiv_id": paper.arxiv_id,
-            "title": paper.title_zh or paper.title_en,
-            "paper_date": str(paper.paper_date),
-            "summary_status": ss.status if ss else "none",
-            "retry_count": ss.retry_count if ss else 0,
-            "error_type": ss.error_type if ss else None,
-            "error": ss.error if ss else None,
-        }
-        items.append(item)
-    return {"items": items, "total": total or 0, "page": page, "per_page": per_page}
+    return admin_svc.serialize_summary_statuses(
+        results, total=total, page=page, per_page=per_page
+    )


@router.post("/summary-retry-failed")
@@ -450,39 +354,14 @@ async def admin_summary_retry_failed(
    db: Session = Depends(get_db),
 ):
    """重试所有失败状态的总结任务。"""
-    failed_ids = (
-        db.execute(
-            select(Paper.arxiv_id)
-            .join(SummaryStatus, SummaryStatus.paper_id == Paper.id)
-            .where(
-                SummaryStatus.status.in_(
-                    [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
-                )
-            )
-        )
-        .scalars()
-        .all()
-    )
-
-    if not failed_ids:
+    count = admin_svc.retry_failed_summaries(db)
+    if not count:
        return {"status": "success", "message": "没有失败的任务需要重试", "count": 0}

-    # 重置失败任务的状态为 pending
-    db.execute(
-        SummaryStatus.__table__.update()
-        .where(
-            SummaryStatus.status.in_(
-                [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
-            )
-        )
-        .values(status=SummaryState.PENDING, error=None, error_type=None)
-    )
-    db.commit()
-
    return {
        "status": "success",
-        "message": f"已重置 {len(failed_ids)} 个失败任务为待总结状态",
-        "count": len(failed_ids),
+        "message": f"已重置 {count} 个失败任务为待总结状态",
+        "count": count,
    }


@@ -545,23 +424,8 @@ async def admin_paper_delete(
    db: Session = Depends(get_db),
 ):
    """删除单篇论文。"""
-    paper = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id))
-    if not paper:
+    if not admin_svc.delete_paper_by_arxiv(db, arxiv_id):
        raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
-
-    # 删除相关数据（ORM cascade 自动处理关联表）
-    db.delete(paper)
-    db.commit()
-
-    # 清理 FTS 索引
-    try:
-        db.execute(
-            text("DELETE FROM papers_fts WHERE arxiv_id = :aid"), {"aid": arxiv_id}
-        )
-        db.commit()
-    except Exception:
-        logger.warning("Failed to clean FTS index for %s", arxiv_id, exc_info=True)
-
    return {"status": "success", "message": f"已删除 {arxiv_id}"}


@@ -588,28 +452,7 @@ async def admin_papers_batch_action(
        raise HTTPException(status_code=400, detail="arxiv_ids 不能为空")

    if body.action == "delete":
-        papers = (
-            db.execute(select(Paper).where(Paper.arxiv_id.in_(body.arxiv_ids)))
-            .scalars()
-            .all()
-        )
-
-        count = 0
-        for paper in papers:
-            db.delete(paper)
-            count += 1
-        db.commit()
-
-        # 清理 FTS 索引
-        try:
-            stmt = text("DELETE FROM papers_fts WHERE arxiv_id IN :ids").bindparams(
-                bindparam("ids", expanding=True)
-            )
-            db.execute(stmt, {"ids": body.arxiv_ids})
-            db.commit()
-        except Exception:
-            logger.warning("Failed to clean FTS index for batch delete", exc_info=True)
-
+        count = admin_svc.delete_papers_by_arxiv_ids(db, body.arxiv_ids)
        return {
            "status": "success",
            "message": f"已删除 {count} 篇论文",
@@ -617,24 +460,10 @@ async def admin_papers_batch_action(
        }

    elif body.action == "summarize":
-        # 将选中论文的总结状态重置为 pending
-        paper_ids = (
-            db.execute(select(Paper.id).where(Paper.arxiv_id.in_(body.arxiv_ids)))
-            .scalars()
-            .all()
-        )
-
-        if paper_ids:
-            # 删除旧的 status 记录让其重新进入 pipeline
-            db.execute(
-                SummaryStatus.__table__.delete().where(
-                    SummaryStatus.paper_id.in_(paper_ids)
-                )
-            )
-            db.commit()
+        count = admin_svc.reset_summaries_pending(db, body.arxiv_ids)

        return {
            "status": "success",
-            "message": f"已将 {len(paper_ids)} 篇论文重置为待总结",
-            "count": len(paper_ids),
+            "message": f"已将 {count} 篇论文重置为待总结",
+            "count": count,
        }