refactor: extract admin business logic to services, introduce job queue, add derived index helpers

- Move DB operations from routes/admin.py to services/admin.py (get_logs_context, query_summary_statuses, retry_failed, delete/reset operations)
- Add services/jobs.py with Job/JobEvent-based async job queue (create_job, run_job, enqueue_job)
- Add services/derived.py with FTS5 reindex and paper index deletion helpers
- Refactor scheduler to use job queue instead of direct pipeline calls
- Add heartbeat_at/expires_at to TaskLock for lock health tracking
- Remove DESIGN_REVIEW.md
- Update tests: remove redundant integration tests, add unit tests for new services
This commit is contained in:
2026-06-13 18:31:43 +08:00
parent 21f16e6756
commit 743d69efd0
20 changed files with 1391 additions and 1063 deletions
+66 -6
View File
@@ -26,7 +26,7 @@ def crawl(
from app.database import SessionLocal, engine
from app.database import init_db as _init
from app.models import Paper
from app.services.crawler import crawl_daily
from app.services.jobs import create_job, run_job
from app.utils import today_str, yesterday_str
from sqlalchemy import func, select
@@ -55,7 +55,13 @@ def crawl(
return
typer.echo(f"📡 开始抓取 {target} ...")
result = asyncio.run(crawl_daily(db, target, top_n))
job = create_job(
db,
"crawl_daily",
owner="cli_crawl",
payload={"target_date": target, "top_n": top_n},
)
result = asyncio.run(run_job(db, job.id))
# 未指定日期且今天失败或无数据时,自动回退到昨天
need_fallback = not date_str and (
@@ -76,7 +82,13 @@ def crawl(
else:
typer.echo(f"🔄 {target} 无数据,尝试 {fallback} ...")
target = fallback
result = asyncio.run(crawl_daily(db, target, top_n))
job = create_job(
db,
"crawl_daily",
owner="cli_crawl",
payload={"target_date": target, "top_n": top_n},
)
result = asyncio.run(run_job(db, job.id))
if result["status"] == "success":
typer.echo(
@@ -110,7 +122,7 @@ def summarize(
from app.config import settings
from app.database import SessionLocal, engine
from app.database import init_db as _init
from app.services.summarizer import summarize_batch, summarize_single
from app.services.jobs import create_job, run_job
import os
@@ -142,11 +154,25 @@ def summarize(
try:
if arxiv_id:
typer.echo(f"🤖 开始总结 {arxiv_id} (mode={pdf_mode}) ...")
result = asyncio.run(summarize_single(db, arxiv_id, pdf_mode=pdf_mode))
job = create_job(
db,
"summarize_one",
owner="cli_summarize",
payload={"arxiv_id": arxiv_id, "pdf_mode": pdf_mode, "force": False},
)
else:
typer.echo(f"🤖 开始批量总结 pending 论文 (mode={pdf_mode}) ...")
result = asyncio.run(summarize_batch(db, pdf_mode=pdf_mode))
job = create_job(
db,
"summarize_batch",
owner="cli_summarize",
payload={"pdf_mode": pdf_mode},
)
result = asyncio.run(run_job(db, job.id))
if result.get("status") == "failed":
typer.echo(f"❌ 总结失败:{result.get('error')}", err=True)
raise typer.Exit(code=1)
typer.echo(f"✅ 总结完成:{result}")
except NotFoundError as exc:
typer.echo(f"{exc.message}", err=True)
@@ -172,5 +198,39 @@ def init_db():
typer.echo(f"✅ 数据库已初始化:{settings.db_path}")
@cli_app.command("rebuild-derived")
def rebuild_derived(
    fts: bool = typer.Option(False, "--fts", help="重建 FTS5 全文索引"),
    chroma: bool = typer.Option(False, "--chroma", help="重建 ChromaDB 语义索引"),
):
    """重建可派生数据索引。"""
    # NOTE: the docstring above is left untouched because Typer shows it as
    # the command's help text.
    #
    # Creates one job per requested index ("reindex_fts" first, then
    # "reindex_chroma"), runs each to completion, echoes its result, and exits
    # with code 1 as soon as a job reports status "failed".
    import os

    from app.config import settings
    from app.database import SessionLocal, engine
    from app.database import init_db as _init
    from app.services.jobs import create_job, run_job

    # With no flag given, fall back to rebuilding the FTS index only.
    if not (fts or chroma):
        fts = True

    job_types: list[str] = []
    if fts:
        job_types.append("reindex_fts")
    if chroma:
        job_types.append("reindex_chroma")

    # Make sure the database directory and schema exist before opening a session.
    os.makedirs(settings.db_path.parent, exist_ok=True)
    _init(engine)

    db = SessionLocal()
    try:
        for job_type in job_types:
            job = create_job(db, job_type, owner="cli_rebuild_derived", payload={})
            result = asyncio.run(run_job(db, job.id))
            typer.echo(f"{job_type}: {result}")
            if result.get("status") == "failed":
                raise typer.Exit(code=1)
    finally:
        db.close()
if __name__ == "__main__":
cli_app()
+16
View File
@@ -76,10 +76,26 @@ def _migrate(engine) -> None:
"crawl_logs": [
("details_json", "TEXT"),
],
"task_locks": [
("heartbeat_at", "DATETIME"),
("expires_at", "DATETIME"),
],
"jobs": [
("heartbeat_at", "DATETIME"),
],
}
with engine.connect() as conn:
for table, columns in _MIGRATIONS.items():
table_exists = conn.execute(
text(
"SELECT name FROM sqlite_master "
"WHERE type IN ('table', 'virtual table') AND name = :name"
),
{"name": table},
).fetchone()
if not table_exists:
continue
# 获取已有列名
existing = {
row[1] for row in conn.execute(text(f"PRAGMA table_info({table})"))
+7
View File
@@ -32,7 +32,14 @@ async def lifespan(app: FastAPI):
# ── startup ──
from app.services.scheduler import start_scheduler
from app.services.embedder import init_chroma
from app.services.jobs import recover_stale_jobs
from app.database import SessionLocal
db = SessionLocal()
try:
recover_stale_jobs(db)
finally:
db.close()
start_scheduler()
init_chroma()
+59
View File
@@ -32,6 +32,26 @@ class SummaryState(StrEnum):
PERMANENT_FAILURE = "permanent_failure"
class JobStatus(StrEnum):
    """Background job status values, stored in the ``jobs.status`` column."""

    # QUEUED is the column default for a newly inserted Job row.
    QUEUED = "queued"
    RUNNING = "running"
    SUCCESS = "success"
    FAILED = "failed"
    STALE = "stale"
    CANCELLED = "cancelled"
class JobEventStatus(StrEnum):
    """Per-stage job event status values, stored in ``job_events.status``."""

    STARTED = "started"
    SUCCESS = "success"
    FAILED = "failed"
    INFO = "info"
# ── papers ──────────────────────────────────────────────────────────────
class Paper(Base):
__tablename__ = "papers"
@@ -194,9 +214,48 @@ class TaskLock(Base):
status = Column(String, nullable=False)
owner = Column(String)
acquired_at = Column(DateTime, nullable=False)
heartbeat_at = Column(DateTime)
expires_at = Column(DateTime)
released_at = Column(DateTime)
# ── jobs / job_events ──────────────────────────────────────────────────
class Job(Base):
    """ORM model for one background job (table ``jobs``).

    Owns its ``JobEvent`` rows: the ``events`` relationship cascades all
    operations and removes orphaned events.
    """

    __tablename__ = "jobs"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Job kind, e.g. "crawl_daily" or "reindex_fts"; indexed for lookups.
    type = Column(String, nullable=False, index=True)
    # One of the JobStatus values; a new row defaults to QUEUED.
    status = Column(String, nullable=False, default=JobStatus.QUEUED, index=True)
    # Free-form label of whoever created the job, e.g. "cli_crawl".
    owner = Column(String)
    # JSON text; readers decode these with json.loads.
    payload_json = Column(Text)
    result_json = Column(Text)
    error = Column(Text)
    # No column default: the creator must supply created_at.
    created_at = Column(DateTime, nullable=False)
    started_at = Column(DateTime)
    heartbeat_at = Column(DateTime)
    completed_at = Column(DateTime)

    events = relationship(
        "JobEvent", back_populates="job", cascade="all, delete-orphan"
    )
class JobEvent(Base):
    """ORM model for one stage event of a background job (table ``job_events``)."""

    __tablename__ = "job_events"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Parent job; the foreign key is declared with ON DELETE CASCADE.
    job_id = Column(
        Integer, ForeignKey("jobs.id", ondelete="CASCADE"), nullable=False, index=True
    )
    stage = Column(String, nullable=False)
    # Expected to hold a JobEventStatus value (per that enum's docstring).
    status = Column(String, nullable=False)
    message = Column(Text)
    # Optional JSON text; readers decode it with json.loads.
    payload_json = Column(Text)
    # No column default: the creator must supply created_at.
    created_at = Column(DateTime, nullable=False)

    job = relationship("Job", back_populates="events")
# ── user data ──────────────────────────────────────────────────────────
class UserBookmark(Base):
__tablename__ = "user_bookmarks"
+106 -277
View File
@@ -4,36 +4,20 @@ from __future__ import annotations
import hashlib
import hmac
import json
import logging
from datetime import date
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, Query, Request
from fastapi.responses import RedirectResponse
from pydantic import BaseModel, field_validator
from sqlalchemy import bindparam, func, select, text
from sqlalchemy.orm import Session
from app.config import settings
from app.database import get_db
from app.models import (
CrawlLog,
DataDeleteJob,
Paper,
PaperTag,
SummaryState,
SummaryStatus,
)
from app.services import admin as admin_svc
from app.services.admin import get_admin_stats
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import refresh_upvotes
from app.services.pipeline import run_crawl, run_pipeline
from app.services.scheduler import get_scheduler
from app.services.summarizer import summarize_batch, summarize_single
from app.utils import templates, today_str, utc_now
logger = logging.getLogger(__name__)
from app.services.cleaner import cleanup_tmp
from app.services.jobs import create_job, enqueue_job
from app.utils import templates, today_str
router = APIRouter(prefix="/admin", tags=["admin"])
@@ -103,18 +87,7 @@ async def admin_dashboard(
):
"""管理仪表盘 — 系统状态总览。"""
stats = get_admin_stats(db)
# 调度器历史(最近 10 条 task=scheduler 日志)
scheduler_history = (
db.execute(
select(CrawlLog)
.where(CrawlLog.task == "scheduler")
.order_by(CrawlLog.started_at.desc())
.limit(10)
)
.scalars()
.all()
)
scheduler_history = admin_svc.get_scheduler_history(db)
return templates.TemplateResponse(
request,
@@ -129,53 +102,43 @@ async def admin_dashboard(
@router.get("/scheduler-status")
async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
"""调度器运行状态(JSON)。"""
scheduler = get_scheduler()
next_run = None
upvote_next_run = None
if scheduler:
for job in scheduler.get_jobs():
if job.id == "daily_pipeline":
next_run = job.next_run_time
elif job.id == "upvote_refresh":
upvote_next_run = job.next_run_time
return {
"enabled": scheduler is not None,
"schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
"timezone": settings.APP_TIMEZONE,
"next_run": next_run.isoformat() if next_run else None,
"upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
}
return admin_svc.get_scheduler_status()
@router.post("/trigger-pipeline")
async def admin_trigger_pipeline(
background_tasks: BackgroundTasks,
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
):
"""手动触发一次完整流水线(crawl → summarize → cleanup)。"""
today = today_str()
try:
result = await run_pipeline(db, today, owner="admin_trigger")
except RuntimeError as exc:
raise HTTPException(status_code=409, detail=str(exc))
if result["status"] == "failed":
raise HTTPException(status_code=500, detail=result.get("error"))
return {"status": "success", "message": "流水线执行完成"}
job = create_job(
db,
"pipeline_daily",
owner="admin_trigger",
payload={"target_date": today},
)
enqueue_job(background_tasks, job.id)
return {"status": "queued", "job_id": job.id, "message": "流水线任务已创建"}
@router.post("/refresh-upvotes")
async def admin_refresh_upvotes(
background_tasks: BackgroundTasks,
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
days: int | None = Query(None, description="刷新最近 N 天,默认使用配置值"),
):
"""手动刷新最近 N 天论文的 upvotes。"""
result = await refresh_upvotes(db, days=days)
if result["status"] == "failed":
raise HTTPException(status_code=500, detail=result.get("error"))
return result
job = create_job(
db,
"refresh_upvotes",
owner="admin_refresh",
payload={"days": days},
)
enqueue_job(background_tasks, job.id)
return {"status": "queued", "job_id": job.id}
# ── 请求模型 ──────────────────────────────────────────────────────────
@@ -200,18 +163,21 @@ class DeleteRequest(BaseModel):
@router.post("/crawl")
async def admin_crawl(
background_tasks: BackgroundTasks,
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
):
"""手动抓取指定日期,默认今天。"""
target_date = date or today_str()
try:
return await run_crawl(db, target_date, owner="admin_crawl")
except RuntimeError as exc:
raise HTTPException(status_code=409, detail=str(exc))
except Exception as exc:
raise HTTPException(status_code=500, detail=str(exc))
job = create_job(
db,
"crawl_daily",
owner="admin_crawl",
payload={"target_date": target_date},
)
enqueue_job(background_tasks, job.id)
return {"status": "queued", "job_id": job.id, "target_date": target_date}
# ── 总结 ──────────────────────────────────────────────────────────────
@@ -219,23 +185,41 @@ async def admin_crawl(
@router.post("/summarize")
async def admin_summarize_batch(
background_tasks: BackgroundTasks,
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
):
"""批量总结所有 pending 论文。"""
return await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
job = create_job(
db,
"summarize_batch",
owner="admin_summarize",
payload={"pdf_mode": settings.SUMMARY_PDF_MODE},
)
enqueue_job(background_tasks, job.id)
return {"status": "queued", "job_id": job.id}
@router.post("/summarize/{arxiv_id}")
async def admin_summarize_single(
arxiv_id: str,
background_tasks: BackgroundTasks,
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
):
"""总结或重跑单篇论文。"""
return await summarize_single(
db, arxiv_id, force=True, pdf_mode=settings.SUMMARY_PDF_MODE
job = create_job(
db,
"summarize_one",
owner="admin_summarize",
payload={
"arxiv_id": arxiv_id,
"force": True,
"pdf_mode": settings.SUMMARY_PDF_MODE,
},
)
enqueue_job(background_tasks, job.id)
return {"status": "queued", "job_id": job.id, "arxiv_id": arxiv_id}
# ── 清理 ──────────────────────────────────────────────────────────────
@@ -243,39 +227,25 @@ async def admin_summarize_single(
@router.post("/cleanup")
async def admin_cleanup(
background_tasks: BackgroundTasks,
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
):
"""清理 data/tmp/ 中超过 24 小时的临时文件。"""
now = utc_now()
log_entry = CrawlLog(
task="cleanup",
status="running",
started_at=now,
)
db.add(log_entry)
db.commit()
job = create_job(db, "cleanup_tmp", owner="admin_cleanup", payload={})
enqueue_job(background_tasks, job.id)
return {"status": "queued", "job_id": job.id}
@router.post("/cleanup-now")
async def admin_cleanup_now(
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
):
"""同步清理临时文件,保留给测试和本地排障使用。"""
try:
result = cleanup_tmp()
log_entry.status = "success"
log_entry.completed_at = utc_now()
log_entry.details_json = json.dumps(
{
"scanned": result.get("scanned", 0),
"removed": result.get("removed", 0),
},
ensure_ascii=False,
)
if result.get("errors"):
log_entry.error = "; ".join(result["errors"])[:2000]
db.commit()
return result
return admin_svc.run_cleanup_now(db, cleanup_tmp)
except Exception as exc:
log_entry.status = "failed"
log_entry.error = str(exc)[:2000]
log_entry.completed_at = utc_now()
db.commit()
raise HTTPException(status_code=500, detail=str(exc))
@@ -285,6 +255,7 @@ async def admin_cleanup(
@router.post("/delete")
async def admin_delete(
body: DeleteRequest,
background_tasks: BackgroundTasks,
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
):
@@ -292,13 +263,31 @@ async def admin_delete(
if body.date_start > body.date_end:
raise HTTPException(status_code=400, detail="date_start must be <= date_end")
result = await delete_papers_by_date_range(
job = create_job(
db,
body.date_start,
body.date_end,
include_notes=body.include_notes,
"delete_range",
owner="admin_delete",
payload={
"date_start": body.date_start.isoformat(),
"date_end": body.date_end.isoformat(),
"include_notes": body.include_notes,
},
)
return result
enqueue_job(background_tasks, job.id)
return {"status": "queued", "job_id": job.id}
@router.get("/jobs/{job_id}")
async def admin_job_detail(
    job_id: int,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """查询后台任务状态和阶段事件。"""
    # NOTE: the docstring above is left untouched because FastAPI publishes it
    # as the endpoint description.
    #
    # Returns the job's JSON-serialisable detail dict, or responds 404 when the
    # service layer finds no job with this id.
    job_detail = admin_svc.get_job_detail(db, job_id)
    if job_detail:
        return job_detail
    raise HTTPException(status_code=404, detail=f"Job not found: {job_id}")
# ── 日志 ──────────────────────────────────────────────────────────────
@@ -313,72 +302,10 @@ async def admin_logs(
per_page: int = Query(20, ge=1, le=100),
):
"""查看任务日志(CrawlLog + DataDeleteJob+ 总结状态统计。"""
crawl_logs = (
db.execute(
select(CrawlLog)
.order_by(CrawlLog.started_at.desc())
.limit(per_page)
.offset((page - 1) * per_page)
)
.scalars()
.all()
)
delete_jobs = (
db.execute(
select(DataDeleteJob)
.order_by(DataDeleteJob.started_at.desc())
.limit(per_page)
.offset((page - 1) * per_page)
)
.scalars()
.all()
)
# 总结状态统计概要
summary_total = db.scalar(select(func.count(Paper.id))) or 0
summary_done = (
db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status == SummaryState.DONE
)
)
or 0
)
summary_pending = (
db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_(
[SummaryState.PENDING, SummaryState.PROCESSING]
)
)
)
or 0
)
summary_failed = (
db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_(
[SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
)
)
)
or 0
)
return templates.TemplateResponse(
request,
"admin_logs.html",
{
"crawl_logs": crawl_logs,
"delete_jobs": delete_jobs,
"page": page,
"per_page": per_page,
"summary_total": summary_total,
"summary_done": summary_done,
"summary_pending": summary_pending,
"summary_failed": summary_failed,
},
admin_svc.get_logs_context(db, page=page, per_page=per_page),
)
@@ -395,22 +322,10 @@ async def admin_summary_status(
per_page: int = Query(20, ge=1, le=100),
):
"""总结状态列表(HTMX 片段或 JSON)。"""
query = (
select(Paper, SummaryStatus)
.outerjoin(SummaryStatus, SummaryStatus.paper_id == Paper.id)
.order_by(Paper.paper_date.desc())
results, total = admin_svc.query_summary_statuses(
db, status=status, page=page, per_page=per_page
)
if status != "all":
if status == "none":
query = query.where(SummaryStatus.paper_id == None) # noqa: E711
else:
query = query.where(SummaryStatus.status == status)
total = db.scalar(select(func.count()).select_from(query.subquery()))
results = db.execute(query.offset((page - 1) * per_page).limit(per_page)).all()
# 判断是否 HTMX 请求
is_htmx = request.headers.get("HX-Request") == "true"
@@ -421,27 +336,16 @@ async def admin_summary_status(
"partials/summary_list.html",
{
"results": results,
"total": total or 0,
"total": total,
"page": page,
"per_page": per_page,
"current_status": status,
},
)
# 非 HTMX 返回 JSON
items = []
for paper, ss in results:
item = {
"arxiv_id": paper.arxiv_id,
"title": paper.title_zh or paper.title_en,
"paper_date": str(paper.paper_date),
"summary_status": ss.status if ss else "none",
"retry_count": ss.retry_count if ss else 0,
"error_type": ss.error_type if ss else None,
"error": ss.error if ss else None,
}
items.append(item)
return {"items": items, "total": total or 0, "page": page, "per_page": per_page}
return admin_svc.serialize_summary_statuses(
results, total=total, page=page, per_page=per_page
)
@router.post("/summary-retry-failed")
@@ -450,39 +354,14 @@ async def admin_summary_retry_failed(
db: Session = Depends(get_db),
):
"""重试所有失败状态的总结任务。"""
failed_ids = (
db.execute(
select(Paper.arxiv_id)
.join(SummaryStatus, SummaryStatus.paper_id == Paper.id)
.where(
SummaryStatus.status.in_(
[SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
)
)
)
.scalars()
.all()
)
if not failed_ids:
count = admin_svc.retry_failed_summaries(db)
if not count:
return {"status": "success", "message": "没有失败的任务需要重试", "count": 0}
# 重置失败任务的状态为 pending
db.execute(
SummaryStatus.__table__.update()
.where(
SummaryStatus.status.in_(
[SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
)
)
.values(status=SummaryState.PENDING, error=None, error_type=None)
)
db.commit()
return {
"status": "success",
"message": f"已重置 {len(failed_ids)} 个失败任务为待总结状态",
"count": len(failed_ids),
"message": f"已重置 {count} 个失败任务为待总结状态",
"count": count,
}
@@ -545,23 +424,8 @@ async def admin_paper_delete(
db: Session = Depends(get_db),
):
"""删除单篇论文。"""
paper = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id))
if not paper:
if not admin_svc.delete_paper_by_arxiv(db, arxiv_id):
raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
# 删除相关数据(ORM cascade 自动处理关联表)
db.delete(paper)
db.commit()
# 清理 FTS 索引
try:
db.execute(
text("DELETE FROM papers_fts WHERE arxiv_id = :aid"), {"aid": arxiv_id}
)
db.commit()
except Exception:
logger.warning("Failed to clean FTS index for %s", arxiv_id, exc_info=True)
return {"status": "success", "message": f"已删除 {arxiv_id}"}
@@ -588,28 +452,7 @@ async def admin_papers_batch_action(
raise HTTPException(status_code=400, detail="arxiv_ids 不能为空")
if body.action == "delete":
papers = (
db.execute(select(Paper).where(Paper.arxiv_id.in_(body.arxiv_ids)))
.scalars()
.all()
)
count = 0
for paper in papers:
db.delete(paper)
count += 1
db.commit()
# 清理 FTS 索引
try:
stmt = text("DELETE FROM papers_fts WHERE arxiv_id IN :ids").bindparams(
bindparam("ids", expanding=True)
)
db.execute(stmt, {"ids": body.arxiv_ids})
db.commit()
except Exception:
logger.warning("Failed to clean FTS index for batch delete", exc_info=True)
count = admin_svc.delete_papers_by_arxiv_ids(db, body.arxiv_ids)
return {
"status": "success",
"message": f"已删除 {count} 篇论文",
@@ -617,24 +460,10 @@ async def admin_papers_batch_action(
}
elif body.action == "summarize":
# 将选中论文的总结状态重置为 pending
paper_ids = (
db.execute(select(Paper.id).where(Paper.arxiv_id.in_(body.arxiv_ids)))
.scalars()
.all()
)
if paper_ids:
# 删除旧的 status 记录让其重新进入 pipeline
db.execute(
SummaryStatus.__table__.delete().where(
SummaryStatus.paper_id.in_(paper_ids)
)
)
db.commit()
count = admin_svc.reset_summaries_pending(db, body.arxiv_ids)
return {
"status": "success",
"message": f"已将 {len(paper_ids)} 篇论文重置为待总结",
"count": len(paper_ids),
"message": f"已将 {count} 篇论文重置为待总结",
"count": count,
}
+324 -3
View File
@@ -1,17 +1,30 @@
"""管理后台服务 — 统计聚合、系统状态。"""
"""管理后台服务 — 统计聚合、系统状态、管理操作"""
from __future__ import annotations
import json
from datetime import date
from pathlib import Path
from typing import Callable
from sqlalchemy import func, select, text
from sqlalchemy.orm import Session
from app.config import settings
from app.models import CrawlLog, Paper, PaperTag, SummaryState, SummaryStatus, TaskLock
from app.models import (
CrawlLog,
DataDeleteJob,
Job,
JobEvent,
Paper,
PaperTag,
SummaryState,
SummaryStatus,
TaskLock,
)
from app.services.derived import delete_paper_indexes
from app.services.scheduler import get_scheduler
from app.utils import PAPERS_DIR, TMP_DIR
from app.utils import PAPERS_DIR, TMP_DIR, utc_now
# admin_papers 排序映射
SORT_MAP = {
@@ -190,3 +203,311 @@ def query_papers(
statuses[paper_id_to_arxiv.get(pid, "")] = st
return papers, total or 0, statuses
def get_scheduler_history(db: Session, limit: int = 10) -> list[CrawlLog]:
    """Return the most recent scheduler run logs, newest first.

    Args:
        db: Open database session.
        limit: Maximum number of log rows to return.

    Returns:
        ``CrawlLog`` rows whose ``task`` is ``"scheduler"``, ordered by
        ``started_at`` descending.
    """
    newest_scheduler_runs = (
        select(CrawlLog)
        .where(CrawlLog.task == "scheduler")
        .order_by(CrawlLog.started_at.desc())
        .limit(limit)
    )
    return db.execute(newest_scheduler_runs).scalars().all()
def get_scheduler_status() -> dict:
    """Describe the scheduler's configuration and upcoming run times.

    Returns:
        A dict with ``enabled`` (whether a scheduler instance exists), the
        configured schedule time and timezone, the ISO-formatted next run time
        of the ``daily_pipeline`` and ``upvote_refresh`` jobs (``None`` when
        unknown), and the configured upvote refresh window in days.
    """
    scheduler = get_scheduler()

    # Next run time per scheduler job id we report on; stays None if absent.
    next_runs = {"daily_pipeline": None, "upvote_refresh": None}
    if scheduler:
        for job in scheduler.get_jobs():
            if job.id in next_runs:
                next_runs[job.id] = job.next_run_time

    def _iso_or_none(moment):
        return moment.isoformat() if moment else None

    return {
        "enabled": scheduler is not None,
        "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
        "timezone": settings.APP_TIMEZONE,
        "next_run": _iso_or_none(next_runs["daily_pipeline"]),
        "upvote_next_run": _iso_or_none(next_runs["upvote_refresh"]),
        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }
def run_cleanup_now(db: Session, cleanup_func: Callable[[], dict]) -> dict:
    """Run the temp-directory cleanup synchronously and record it in CrawlLog.

    Args:
        db: Open database session used to persist the log row.
        cleanup_func: Zero-argument callable performing the cleanup; its
            result dict is read for ``scanned``, ``removed`` and ``errors``.

    Returns:
        The dict returned by ``cleanup_func``, unchanged.

    Raises:
        Exception: Any error raised inside the ``try`` block is re-raised
            after the log row has been marked ``failed``.
    """
    # Persist a "running" row first so the attempt is visible even if it crashes.
    log_entry = CrawlLog(task="cleanup", status="running", started_at=utc_now())
    db.add(log_entry)
    db.commit()
    try:
        result = cleanup_func()
        log_entry.status = "success"
        log_entry.completed_at = utc_now()
        log_entry.details_json = json.dumps(
            {
                "scanned": result.get("scanned", 0),
                "removed": result.get("removed", 0),
            },
            ensure_ascii=False,
        )
        # Partial failures keep status "success" but are recorded, capped at
        # 2000 characters.
        if result.get("errors"):
            log_entry.error = "; ".join(result["errors"])[:2000]
        db.commit()
        return result
    except Exception as exc:
        log_entry.status = "failed"
        log_entry.error = str(exc)[:2000]
        log_entry.completed_at = utc_now()
        db.commit()
        raise
def get_job_detail(db: Session, job_id: int) -> dict | None:
    """Return a background job and its stage events as a JSON-serialisable dict.

    Args:
        db: Open database session.
        job_id: Primary key of the job to look up.

    Returns:
        ``None`` when no such job exists; otherwise a dict with the job's
        fields (timestamps as ISO strings, JSON columns decoded) and an
        ``events`` list in chronological order.

    Raises:
        json.JSONDecodeError: If a stored JSON column holds malformed JSON.
    """
    job = db.get(Job, job_id)
    if not job:
        return None

    # created_at alone is not unique: events written within the same timestamp
    # would come back in an undefined order. The autoincrement id breaks ties
    # in insertion order, so the timeline is stable between requests.
    events = (
        db.execute(
            select(JobEvent)
            .where(JobEvent.job_id == job_id)
            .order_by(JobEvent.created_at.asc(), JobEvent.id.asc())
        )
        .scalars()
        .all()
    )

    def _iso_or_none(moment):
        return moment.isoformat() if moment else None

    def _decode_or_none(raw):
        # An empty/NULL JSON column is reported as None, not as {}.
        return json.loads(raw) if raw else None

    return {
        "id": job.id,
        "type": job.type,
        "status": job.status,
        "owner": job.owner,
        # The payload always decodes to an object; a missing payload becomes {}.
        "payload": json.loads(job.payload_json or "{}"),
        "result": _decode_or_none(job.result_json),
        "error": job.error,
        "created_at": job.created_at.isoformat(),
        "started_at": _iso_or_none(job.started_at),
        "completed_at": _iso_or_none(job.completed_at),
        "events": [
            {
                "stage": event.stage,
                "status": event.status,
                "message": event.message,
                "payload": _decode_or_none(event.payload_json),
                "created_at": event.created_at.isoformat(),
            }
            for event in events
        ],
    }
def get_logs_context(db: Session, *, page: int, per_page: int) -> dict:
    """Build the template context for the admin logs page.

    Args:
        db: Open database session.
        page: 1-based page number applied to both log listings.
        per_page: Page size applied to both log listings.

    Returns:
        A dict holding one page of ``CrawlLog`` rows and one page of
        ``DataDeleteJob`` rows (both newest first), the paging values, and
        summary counters: total papers, done, pending (pending + processing)
        and failed (failed + permanent failure).
    """
    offset = (page - 1) * per_page

    def _page_of(model):
        # One page of rows for a log-like model, newest started_at first.
        stmt = (
            select(model)
            .order_by(model.started_at.desc())
            .limit(per_page)
            .offset(offset)
        )
        return db.execute(stmt).scalars().all()

    def _count_statuses(condition) -> int:
        # Number of SummaryStatus rows matching the given condition.
        stmt = select(func.count(SummaryStatus.id)).where(condition)
        return db.scalar(stmt) or 0

    return {
        "crawl_logs": _page_of(CrawlLog),
        "delete_jobs": _page_of(DataDeleteJob),
        "page": page,
        "per_page": per_page,
        "summary_total": db.scalar(select(func.count(Paper.id))) or 0,
        "summary_done": _count_statuses(SummaryStatus.status == SummaryState.DONE),
        "summary_pending": _count_statuses(
            SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.PROCESSING])
        ),
        "summary_failed": _count_statuses(
            SummaryStatus.status.in_(
                [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
            )
        ),
    }
def query_summary_statuses(
    db: Session,
    *,
    status: str,
    page: int,
    per_page: int,
) -> tuple[list[tuple[Paper, SummaryStatus | None]], int]:
    """Query one page of papers together with their summary status.

    Args:
        db: Open database session.
        status: ``"all"`` for no filter, ``"none"`` for papers without any
            status row, otherwise the exact status value to match.
        page: 1-based page number.
        per_page: Page size.

    Returns:
        ``(results, total)`` where ``results`` is the requested page of
        ``(Paper, SummaryStatus | None)`` rows, newest ``paper_date`` first,
        and ``total`` is the number of rows matching the filter.
    """
    filtered = select(Paper, SummaryStatus).outerjoin(
        SummaryStatus, SummaryStatus.paper_id == Paper.id
    )
    if status == "none":
        # Outer-join miss: the paper has no SummaryStatus row at all.
        filtered = filtered.where(SummaryStatus.paper_id.is_(None))
    elif status != "all":
        filtered = filtered.where(SummaryStatus.status == status)

    # Counting needs no ordering, so count the filtered query before sorting.
    total = db.scalar(select(func.count()).select_from(filtered.subquery())) or 0

    # paper_date is shared by many papers; without a unique tiebreaker the
    # relative order of same-day rows is undefined, and OFFSET/LIMIT paging
    # could repeat or skip papers between pages.
    ordered = filtered.order_by(Paper.paper_date.desc(), Paper.id.desc())
    results = db.execute(ordered.offset((page - 1) * per_page).limit(per_page)).all()
    return results, total
def serialize_summary_statuses(
    results: list[tuple[Paper, SummaryStatus | None]],
    *,
    total: int,
    page: int,
    per_page: int,
) -> dict:
    """Serialise a page of summary-status rows into the JSON response body.

    Args:
        results: ``(paper, summary_status)`` pairs; the status may be ``None``
            for papers that have no status row.
        total: Total number of matching rows across all pages.
        page: Current page number, echoed back.
        per_page: Page size, echoed back.

    Returns:
        A dict with ``items`` (one dict per paper), ``total``, ``page`` and
        ``per_page``. Papers without a status row report ``"none"`` with a
        retry count of 0 and no error details.
    """

    def _as_item(paper, summary_status) -> dict:
        has_status = bool(summary_status)
        return {
            "arxiv_id": paper.arxiv_id,
            # Prefer the Chinese title, falling back to the English one.
            "title": paper.title_zh or paper.title_en,
            "paper_date": str(paper.paper_date),
            "summary_status": summary_status.status if has_status else "none",
            "retry_count": summary_status.retry_count if has_status else 0,
            "error_type": summary_status.error_type if has_status else None,
            "error": summary_status.error if has_status else None,
        }

    return {
        "items": [_as_item(paper, ss) for paper, ss in results],
        "total": total,
        "page": page,
        "per_page": per_page,
    }
def retry_failed_summaries(db: Session) -> int:
    """Reset failed and permanently failed summary tasks back to pending.

    The matching status rows get ``status`` set to pending and their ``error``
    and ``error_type`` cleared; the change is committed.

    Args:
        db: Open database session.

    Returns:
        The number of papers that had a failed status before the reset
        (0 when there was nothing to retry, in which case nothing is written).
    """
    is_failed = SummaryStatus.status.in_(
        [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
    )

    # Count via the paper join so the figure reflects papers, not bare rows.
    failed_papers = (
        select(Paper.arxiv_id)
        .join(SummaryStatus, SummaryStatus.paper_id == Paper.id)
        .where(is_failed)
    )
    affected = len(db.execute(failed_papers).scalars().all())
    if affected == 0:
        return 0

    reset_stmt = (
        SummaryStatus.__table__.update()
        .where(is_failed)
        .values(status=SummaryState.PENDING, error=None, error_type=None)
    )
    db.execute(reset_stmt)
    db.commit()
    return affected
def delete_paper_by_arxiv(db: Session, arxiv_id: str) -> bool:
    """Delete one paper and then its derived index entries.

    Args:
        db: Open database session.
        arxiv_id: arXiv identifier of the paper to remove.

    Returns:
        ``False`` when no paper has this arXiv id, ``True`` after deletion.
    """
    target = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id))
    if not target:
        return False

    # Capture the primary key before the row is gone; index cleanup needs it.
    target_id = target.id

    # The authoritative row is committed first; index cleanup is a second,
    # separately committed step.
    db.delete(target)
    db.commit()

    delete_paper_indexes(db, paper_id=target_id, arxiv_id=arxiv_id)
    db.commit()
    return True
def delete_papers_by_arxiv_ids(db: Session, arxiv_ids: list[str]) -> int:
    """Delete several papers and then their derived index entries.

    Args:
        db: Open database session.
        arxiv_ids: arXiv identifiers to remove; unknown ids are ignored.

    Returns:
        The number of papers actually deleted.
    """
    matches = db.execute(select(Paper).where(Paper.arxiv_id.in_(arxiv_ids)))

    # Remember (id, arxiv_id) per paper while deleting: the pairs are needed
    # for index cleanup after the rows are gone.
    removed: list[tuple[int, str]] = []
    for paper in matches.scalars().all():
        removed.append((paper.id, paper.arxiv_id))
        db.delete(paper)
    db.commit()

    for removed_id, removed_arxiv_id in removed:
        delete_paper_indexes(db, paper_id=removed_id, arxiv_id=removed_arxiv_id)
    db.commit()
    return len(removed)
def reset_summaries_pending(db: Session, arxiv_ids: list[str]) -> int:
    """Reset the summary status of the given papers to pending.

    Existing status rows are reset in place (quality, error details and
    timestamps cleared); papers without a status row get a new pending one.

    Args:
        db: Open database session.
        arxiv_ids: arXiv identifiers of the papers to reset.

    Returns:
        The number of matching papers (0 when none match, in which case
        nothing is written).
    """
    paper_ids = (
        db.execute(select(Paper.id).where(Paper.arxiv_id.in_(arxiv_ids)))
        .scalars()
        .all()
    )
    if not paper_ids:
        return 0

    status_rows = (
        db.execute(select(SummaryStatus).where(SummaryStatus.paper_id.in_(paper_ids)))
        .scalars()
        .all()
    )

    already_tracked = set()
    for row in status_rows:
        already_tracked.add(row.paper_id)
        row.status = SummaryState.PENDING
        row.quality = None
        row.error = None
        row.error_type = None
        row.raw_output_saved = False
        row.started_at = None
        row.completed_at = None

    # Papers that never had a status row start out as pending.
    for paper_id in paper_ids:
        if paper_id in already_tracked:
            continue
        db.add(SummaryStatus(paper_id=paper_id, status=SummaryState.PENDING))

    db.commit()
    return len(paper_ids)
+3 -16
View File
@@ -4,7 +4,7 @@ import logging
from datetime import date as date_type, datetime, timezone
import httpx
from sqlalchemy import select, text
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import settings
@@ -16,6 +16,7 @@ from app.models import (
SummaryState,
SummaryStatus,
)
from app.services.derived import reindex_paper_fts
from app.utils import make_http_client, recent_date_strs, utc_now
logger = logging.getLogger(__name__)
@@ -143,21 +144,7 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
authors_text = ", ".join(meta["authors"])
tags_text = ", ".join(meta["tags"])
db.execute(
text(
"INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
"VALUES (:id, :title, :abstract, :authors, :tags)"
),
{
"id": paper.id,
"title": meta["title_en"],
"abstract": meta["abstract"] or "",
"authors": authors_text,
"tags": tags_text,
},
)
reindex_paper_fts(db, paper)
new_papers.append(paper)
logger.debug("Inserted new paper: %s", arxiv_id)
+140
View File
@@ -0,0 +1,140 @@
"""派生数据维护 — FTS5 / ChromaDB 等可重建索引。"""
from __future__ import annotations
import logging
from sqlalchemy import select, text
from sqlalchemy.orm import Session
from app.models import Paper
logger = logging.getLogger(__name__)
def _summary_text(paper: Paper) -> str:
    """Join the paper's non-empty summary fields into one space-separated string.

    Returns an empty string when the paper has no summary.
    """
    summary = paper.summary
    if not summary:
        return ""

    # Order defines the order of the pieces in the resulting text.
    field_names = (
        "one_line",
        "motivation_problem",
        "motivation_goal",
        "method_overview",
        "method_key_idea",
        "results_main_json",
    )
    values = [getattr(summary, name) for name in field_names]
    return " ".join(value for value in values if value)
def delete_fts_paper(db: Session, paper_id: int) -> None:
    """Delete one paper's row from the FTS5 index.

    The ``papers_fts`` rowid is the paper's ``papers.id``. The statement is
    executed on the session but not committed here.
    """
    remove_row = text("DELETE FROM papers_fts WHERE rowid = :paper_id")
    db.execute(remove_row, {"paper_id": paper_id})
def delete_paper_indexes(db: Session, *, paper_id: int, arxiv_id: str) -> None:
    """Delete every derived index entry of one paper, best-effort.

    Each index is cleaned independently; a failure is logged as a warning and
    never propagates, so it cannot block the primary deletion.

    Args:
        db: Open database session used for the FTS cleanup.
        paper_id: ``papers.id`` of the removed paper (the FTS rowid).
        arxiv_id: arXiv identifier, used for the ChromaDB cleanup and in logs.
    """
    try:
        delete_fts_paper(db, paper_id)
    except Exception:
        logger.warning("Failed to clean FTS index for %s", arxiv_id, exc_info=True)
    try:
        # Imported inside the try so an import failure is also just logged.
        from app.services.embedder import delete_paper

        delete_paper(arxiv_id)
    except Exception:
        logger.warning("Failed to clean ChromaDB index for %s", arxiv_id, exc_info=True)
def reindex_paper_fts(db: Session, paper: Paper) -> None:
    """Rebuild one paper's FTS5 row from the authoritative database data.

    Any existing row for the paper is deleted first, then a fresh row is
    inserted. Nothing is committed here.
    """
    # Authors are listed by position; a missing position sorts as 0.
    ordered_authors = sorted(paper.authors, key=lambda author: author.position or 0)
    authors_text = ", ".join(author.name for author in ordered_authors)
    tags_text = ", ".join(tag_row.tag for tag_row in paper.tags)

    delete_fts_paper(db, paper.id)

    insert_row = text(
        """
        INSERT INTO papers_fts(
        rowid, title_en, title_zh, abstract, authors, tags, summary_text
        )
        VALUES (
        :id, :title_en, :title_zh, :abstract, :authors, :tags, :summary_text
        )
        """
    )
    row_values = {
        "id": paper.id,
        "title_en": paper.title_en or "",
        "title_zh": paper.title_zh or "",
        "abstract": paper.abstract or "",
        "authors": authors_text,
        "tags": tags_text,
        "summary_text": _summary_text(paper),
    }
    db.execute(insert_row, row_values)
def reindex_fts(db: Session, paper_ids: list[int] | None = None) -> dict:
    """Rebuild the FTS5 index, fully or for selected papers.

    Args:
        db: Open database session; the rebuild is committed before returning.
        paper_ids: ``None`` rebuilds the whole index (the FTS table is cleared
            first). A list restricts the rebuild to those paper ids; an empty
            list therefore reindexes nothing.

    Returns:
        ``{"status": "success", "indexed": <number of papers reindexed>}``.
    """
    full_rebuild = paper_ids is None

    query = select(Paper)
    if not full_rebuild:
        # Test against None rather than truthiness: an empty list must select
        # no papers instead of silently falling back to "all papers".
        query = query.where(Paper.id.in_(paper_ids))
    papers = db.execute(query).scalars().all()

    if full_rebuild:
        # Also clears rows of papers that no longer exist.
        db.execute(text("DELETE FROM papers_fts"))

    for paper in papers:
        reindex_paper_fts(db, paper)

    db.commit()
    count = len(papers)
    logger.info("FTS reindexed: %d papers", count)
    return {"status": "success", "indexed": count}
def reindex_chroma(db: Session) -> dict:
    """Rebuild the ChromaDB semantic index from the authoritative DB data.

    Every paper that has a summary is passed to ``index_paper``. A failure on
    one paper is logged and recorded, and the loop continues with the next.

    Args:
        db: Session used to load the papers.

    Returns:
        A dict with ``status`` (``"success"`` when no paper failed, otherwise
        ``"partial"``), ``indexed`` (number of papers indexed) and ``errors``
        (list of ``"<arxiv_id>: <exception>"`` strings, or ``None`` if empty).
    """
    from app.services.embedder import index_paper

    papers = db.execute(select(Paper).where(Paper.summary.has())).scalars().all()
    indexed = 0
    errors: list[str] = []
    for paper in papers:
        try:
            summary = paper.summary
            # Summary fields may be unset; normalise them to "" the same way
            # the title fields are, so index_paper never receives None.
            texts_dict = {
                "arxiv_id": paper.arxiv_id,
                "title_zh": paper.title_zh or "",
                "title_en": paper.title_en or "",
                "tags": " ".join(t.tag for t in paper.tags),
                "one_line": (summary.one_line or "") if summary else "",
                "motivation_problem": (
                    (summary.motivation_problem or "") if summary else ""
                ),
                "method_key_idea": (
                    (summary.method_key_idea or "") if summary else ""
                ),
                "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
            }
            index_paper(paper.arxiv_id, texts_dict)
            indexed += 1
        except Exception as exc:
            # Best effort: keep going and report the failures in the result.
            errors.append(f"{paper.arxiv_id}: {exc}")
            logger.warning(
                "Failed to reindex ChromaDB for %s",
                paper.arxiv_id,
                exc_info=True,
            )
    return {
        "status": "success" if not errors else "partial",
        "indexed": indexed,
        "errors": errors or None,
    }
+244
View File
@@ -0,0 +1,244 @@
"""统一后台任务系统 — 创建、运行、事件记录、失败恢复。"""
from __future__ import annotations
import json
import logging
from datetime import date, timedelta
from typing import Any
from fastapi import BackgroundTasks
from sqlalchemy import or_, select
from sqlalchemy.orm import Session
from app.config import settings
from app.database import SessionLocal
from app.models import Job, JobEvent, JobEventStatus, JobStatus, TaskLock
from app.utils import truncate_error, utc_now
# Module-level logger for the background job system.
logger = logging.getLogger(__name__)
# Age threshold used by recover_stale_jobs(): running jobs whose heartbeat is
# missing or older than this, and running locks acquired earlier than this,
# are marked stale.
STALE_JOB_AFTER = timedelta(hours=6)
def _dumps(value: Any) -> str:
return json.dumps(value, ensure_ascii=False, default=str)
def _loads(value: str | None) -> dict:
if not value:
return {}
try:
data = json.loads(value)
return data if isinstance(data, dict) else {}
except json.JSONDecodeError:
return {}
def create_job(
    db: Session,
    job_type: str,
    *,
    owner: str,
    payload: dict | None = None,
) -> Job:
    """Create and persist the main record of a background job.

    The job is stored with status ``QUEUED`` and committed, then a
    ``created`` event carrying the payload is appended.

    Args:
        db: Session used to persist the job and its first event.
        job_type: Job type string stored on the record.
        owner: Label of whoever requested the job.
        payload: Job parameters; ``None`` is stored as an empty object.

    Returns:
        The refreshed ``Job`` instance.
    """
    job_payload = payload or {}

    record = Job(
        type=job_type,
        status=JobStatus.QUEUED,
        owner=owner,
        payload_json=_dumps(job_payload),
        created_at=utc_now(),
    )
    db.add(record)
    db.commit()
    db.refresh(record)

    add_job_event(
        db,
        record,
        stage="created",
        status=JobEventStatus.INFO,
        message=f"Job queued: {job_type}",
        payload=job_payload,
    )
    return record
def add_job_event(
    db: Session,
    job: Job,
    *,
    stage: str,
    status: str,
    message: str | None = None,
    payload: dict | None = None,
) -> None:
    """Append one stage event to a job and refresh the job's heartbeat.

    Args:
        db: Session used to persist the event; committed before returning.
        job: Job the event belongs to.
        stage: Name of the stage the event describes.
        status: Event status; stored as ``str(status)``.
        message: Optional human-readable message.
        payload: Optional extra data, stored as JSON; ``None`` stores NULL.
    """
    encoded_payload = None if payload is None else _dumps(payload)
    event = JobEvent(
        job_id=job.id,
        stage=stage,
        status=str(status),
        message=message,
        payload_json=encoded_payload,
        created_at=utc_now(),
    )
    db.add(event)
    # Every recorded event also counts as a sign of life for the job.
    job.heartbeat_at = utc_now()
    db.commit()
def enqueue_job(background_tasks: BackgroundTasks, job_id: int) -> None:
    """Hand an already-created job to FastAPI ``BackgroundTasks``.

    The job is executed later through ``run_job_by_id``.

    Args:
        background_tasks: Task collection the run is registered on.
        job_id: Id of the job to run.
    """
    background_tasks.add_task(run_job_by_id, job_id=job_id)
async def run_job_by_id(job_id: int) -> None:
    """Run an already-created job on its own, dedicated DB session.

    The session is closed when the run finishes, whether it returned
    normally or raised.

    Args:
        job_id: Id of the job to run.
    """
    # Leaving the ``with`` block closes the session, including on error.
    with SessionLocal() as session:
        await run_job(session, job_id)
async def run_job(db: Session, job_id: int) -> dict:
    """Run a job and write its status, result and error back to the database.

    The job is marked ``RUNNING``, dispatched by type, and finally marked
    ``SUCCESS`` or ``FAILED``; a ``run`` event is recorded for the start and
    for the outcome.

    Args:
        db: Session used for the job bookkeeping and passed to the job itself.
        job_id: Id of the job to run.

    Returns:
        The job's result if it is a dict, otherwise
        ``{"status": "success", "result": <result>}``. When the job raises,
        the exception is not propagated; ``{"status": "failed", "error": ...}``
        is returned instead.

    Raises:
        ValueError: No job with ``job_id`` exists.
        RuntimeError: The job is already in ``RUNNING`` state.
    """
    job = db.get(Job, job_id)
    if not job:
        raise ValueError(f"Job not found: {job_id}")
    if job.status == JobStatus.RUNNING:
        raise RuntimeError(f"Job already running: {job_id}")

    payload = _loads(job.payload_json)
    job.status = JobStatus.RUNNING
    job.started_at = utc_now()
    job.heartbeat_at = job.started_at
    db.commit()
    add_job_event(db, job, stage="run", status=JobEventStatus.STARTED)

    try:
        result = await _dispatch_job(db, job, payload)
    except Exception as exc:
        # The job may have failed in the middle of a flush/DB operation, which
        # leaves the session's transaction unusable until it is rolled back.
        # Roll back first so the failure can actually be recorded; otherwise
        # the commit below would raise and the job would stay RUNNING.
        db.rollback()
        logger.exception("Job failed: id=%s type=%s", job.id, job.type)
        error = truncate_error(exc, limit=4000)
        job.status = JobStatus.FAILED
        job.error = error
        job.completed_at = utc_now()
        db.commit()
        add_job_event(db, job, stage="run", status=JobEventStatus.FAILED, message=error)
        return {"status": "failed", "error": error}

    job.status = JobStatus.SUCCESS
    job.result_json = _dumps(result)
    job.completed_at = utc_now()
    job.error = None
    db.commit()
    add_job_event(
        db,
        job,
        stage="run",
        status=JobEventStatus.SUCCESS,
        payload=result if isinstance(result, dict) else {"result": result},
    )
    return result if isinstance(result, dict) else {"status": "success", "result": result}
async def _dispatch_job(db: Session, job: Job, payload: dict) -> dict:
    """Route a job to the service function that implements its type.

    Supported types: ``crawl_daily``, ``pipeline_daily``, ``summarize_batch``,
    ``summarize_one``, ``refresh_upvotes``, ``delete_range``, ``cleanup_tmp``,
    ``reindex_fts`` and ``reindex_chroma``.

    Args:
        db: Session handed to the service function.
        job: Job being run; its ``type`` selects the handler.
        payload: Decoded job parameters.

    Returns:
        Whatever the selected service function returns.

    Raises:
        ValueError: ``job.type`` is not one of the supported types.
        KeyError: A parameter required by the selected type is missing.
    """
    # Service modules are imported here rather than at module level.
    from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
    from app.services.crawler import refresh_upvotes
    from app.services.derived import reindex_chroma, reindex_fts
    from app.services.pipeline import run_crawl, run_pipeline
    from app.services.summarizer import summarize_batch, summarize_single

    kind = job.type

    if kind in ("crawl_daily", "pipeline_daily"):
        target_date = payload["target_date"]
        # Fall back to a synthetic owner label when the job has none.
        owner = job.owner or f"job:{job.id}"
        if kind == "crawl_daily":
            return await run_crawl(
                db,
                target_date,
                owner=owner,
                top_n=payload.get("top_n"),
            )
        return await run_pipeline(db, target_date, owner=owner)

    if kind == "summarize_batch":
        batch_pdf_mode = payload.get("pdf_mode", settings.SUMMARY_PDF_MODE)
        return await summarize_batch(db, pdf_mode=batch_pdf_mode)

    if kind == "summarize_one":
        return await summarize_single(
            db,
            payload["arxiv_id"],
            force=payload.get("force", True),
            pdf_mode=payload.get("pdf_mode", settings.SUMMARY_PDF_MODE),
        )

    if kind == "refresh_upvotes":
        return await refresh_upvotes(db, days=payload.get("days"))

    if kind == "delete_range":
        range_start = date.fromisoformat(payload["date_start"])
        range_end = date.fromisoformat(payload["date_end"])
        return await delete_papers_by_date_range(
            db,
            range_start,
            range_end,
            include_notes=payload.get("include_notes", True),
        )

    if kind == "cleanup_tmp":
        return cleanup_tmp()

    if kind == "reindex_fts":
        return reindex_fts(db)

    if kind == "reindex_chroma":
        return reindex_chroma(db)

    raise ValueError(f"Unsupported job type: {job.type}")
def recover_stale_jobs(db: Session) -> int:
    """Mark expired running jobs and locks as stale so they cannot hang forever.

    A job counts as expired when it is ``RUNNING`` and its heartbeat is
    missing or older than ``STALE_JOB_AFTER``; a ``recovery`` event is added
    for each one. A lock counts as expired when its status is ``"running"``
    and it was acquired more than ``STALE_JOB_AFTER`` ago.

    Args:
        db: Session used for the queries and updates; committed once.

    Returns:
        Number of jobs plus number of locks that were marked stale.
    """
    now = utc_now()
    cutoff = now - STALE_JOB_AFTER

    job_query = select(Job).where(
        Job.status == JobStatus.RUNNING,
        or_(Job.heartbeat_at.is_(None), Job.heartbeat_at < cutoff),
    )
    dead_jobs = db.execute(job_query).scalars().all()
    reason = "Marked stale after process restart or missed heartbeat"
    for dead_job in dead_jobs:
        dead_job.status = JobStatus.STALE
        dead_job.error = reason
        dead_job.completed_at = now
        recovery_event = JobEvent(
            job_id=dead_job.id,
            stage="recovery",
            status=JobEventStatus.FAILED,
            message=reason,
            created_at=now,
        )
        db.add(recovery_event)

    lock_query = select(TaskLock).where(
        TaskLock.status == "running",
        TaskLock.acquired_at < cutoff,
    )
    dead_locks = db.execute(lock_query).scalars().all()
    for dead_lock in dead_locks:
        dead_lock.status = "stale"
        dead_lock.released_at = now

    db.commit()

    recovered = len(dead_jobs) + len(dead_locks)
    if recovered:
        logger.warning("Recovered stale runtime records: %d", recovered)
    return recovered
+12 -2
View File
@@ -7,6 +7,7 @@ from __future__ import annotations
import logging
from datetime import date as date_type
from datetime import timedelta
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
@@ -32,6 +33,8 @@ def acquire_lock(db: Session, task: str, lock_key: str, owner: str) -> TaskLock:
status="running",
owner=owner,
acquired_at=utc_now(),
heartbeat_at=utc_now(),
expires_at=utc_now() + timedelta(hours=6),
)
try:
db.add(lock)
@@ -42,7 +45,12 @@ def acquire_lock(db: Session, task: str, lock_key: str, owner: str) -> TaskLock:
return lock
async def run_crawl(db: Session, target_date: str, owner: str = "admin_crawl") -> dict:
async def run_crawl(
db: Session,
target_date: str,
owner: str = "admin_crawl",
top_n: int | None = None,
) -> dict:
"""执行单次抓取(带防重入锁)。
Args:
@@ -55,7 +63,7 @@ async def run_crawl(db: Session, target_date: str, owner: str = "admin_crawl") -
"""
lock = acquire_lock(db, "crawl", target_date, owner)
try:
return await crawl_daily(db, target_date)
return await crawl_daily(db, target_date, top_n=top_n)
finally:
release_lock(db, lock)
@@ -83,6 +91,8 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
status="running",
owner=owner,
acquired_at=now,
heartbeat_at=now,
expires_at=now + timedelta(hours=6),
)
try:
db.add(lock)
+10 -4
View File
@@ -11,8 +11,7 @@ from zoneinfo import ZoneInfo
from app.config import settings
from app.database import SessionLocal
from app.services.pipeline import run_pipeline
from app.services.crawler import refresh_upvotes
from app.services.jobs import create_job, run_job
from app.utils import today_str
logger = logging.getLogger(__name__)
@@ -112,7 +111,13 @@ async def _daily_pipeline() -> None:
db: Session = SessionLocal()
try:
await run_pipeline(db, today, owner="daily_pipeline")
job = create_job(
db,
"pipeline_daily",
owner="daily_pipeline",
payload={"target_date": today},
)
await run_job(db, job.id)
except RuntimeError:
logger.warning("Daily pipeline already running for %s, skipping", today)
except Exception:
@@ -125,7 +130,8 @@ async def _upvote_refresh() -> None:
"""刷新最近 N 天论文的 upvotes。"""
db: Session = SessionLocal()
try:
result = await refresh_upvotes(db)
job = create_job(db, "refresh_upvotes", owner="upvote_refresh", payload={})
result = await run_job(db, job.id)
logger.info(
"Upvote refresh completed: status=%s updated=%d",
result.get("status"),
+4 -15
View File
@@ -3,8 +3,6 @@
from __future__ import annotations
import logging
from sqlalchemy import text
from sqlalchemy.orm import Session
from app.models import (
@@ -13,6 +11,7 @@ from app.models import (
PaperTag,
SummaryState,
)
from app.services.derived import reindex_paper_fts
from app.services.pdf_downloader import paper_dir
from app.services.schemas import (
SummarySchema,
@@ -75,19 +74,9 @@ def _update_summary_in_db(
db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="ai"))
existing_tag_names.add(tag_name)
# 4. FTS5 更新
summary_text = _build_fts_summary_text(schema)
db.execute(
text(
"UPDATE papers_fts SET title_zh=:title_zh, summary_text=:summary_text "
"WHERE rowid=:paper_id"
),
{
"title_zh": schema.title_zh,
"summary_text": summary_text,
"paper_id": paper.id,
},
)
# 4. FTS5 派生索引
db.flush()
reindex_paper_fts(db, paper)
db.commit()
logger.info("DB updated: paper=%s quality=%s", paper.arxiv_id, quality)