refactor: extract admin business logic to services, introduce job queue, add derived index helpers
- Move DB operations from routes/admin.py to services/admin.py (get_logs_context, query_summary_statuses, retry_failed, delete/reset operations) - Add services/jobs.py with Job/JobEvent-based async job queue (create_job, run_job, enqueue_job) - Add services/derived.py with FTS5 reindex and paper index deletion helpers - Refactor scheduler to use job queue instead of direct pipeline calls - Add heartbeat_at/expires_at to TaskLock for lock health tracking - Remove DESIGN_REVIEW.md - Update tests: remove redundant integration tests, add unit tests for new services
This commit is contained in:
+324
-3
@@ -1,17 +1,30 @@
|
||||
"""管理后台服务 — 统计聚合、系统状态。"""
|
||||
"""管理后台服务 — 统计聚合、系统状态、管理操作。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
from sqlalchemy import func, select, text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.config import settings
|
||||
from app.models import CrawlLog, Paper, PaperTag, SummaryState, SummaryStatus, TaskLock
|
||||
from app.models import (
|
||||
CrawlLog,
|
||||
DataDeleteJob,
|
||||
Job,
|
||||
JobEvent,
|
||||
Paper,
|
||||
PaperTag,
|
||||
SummaryState,
|
||||
SummaryStatus,
|
||||
TaskLock,
|
||||
)
|
||||
from app.services.derived import delete_paper_indexes
|
||||
from app.services.scheduler import get_scheduler
|
||||
from app.utils import PAPERS_DIR, TMP_DIR
|
||||
from app.utils import PAPERS_DIR, TMP_DIR, utc_now
|
||||
|
||||
# admin_papers 排序映射
|
||||
SORT_MAP = {
|
||||
@@ -190,3 +203,311 @@ def query_papers(
|
||||
statuses[paper_id_to_arxiv.get(pid, "")] = st
|
||||
|
||||
return papers, total or 0, statuses
|
||||
|
||||
|
||||
def get_scheduler_history(db: Session, limit: int = 10) -> list[CrawlLog]:
    """Return the most recent scheduler run logs, newest first.

    Args:
        db: Active database session.
        limit: Maximum number of log rows to return.

    Returns:
        ``CrawlLog`` rows whose ``task`` equals ``"scheduler"``, ordered by
        ``started_at`` descending.
    """
    stmt = select(CrawlLog).where(CrawlLog.task == "scheduler")
    stmt = stmt.order_by(CrawlLog.started_at.desc()).limit(limit)
    return db.execute(stmt).scalars().all()
|
||||
|
||||
|
||||
def get_scheduler_status() -> dict:
    """Describe the scheduler's current state.

    Returns:
        A dict with whether a scheduler exists, the configured daily schedule
        time and timezone, the next run times (ISO strings or ``None``) of the
        ``daily_pipeline`` and ``upvote_refresh`` jobs, and the configured
        upvote refresh window.
    """
    scheduler = get_scheduler()

    # Next run time per tracked job id; stays None when the job is absent.
    next_runs = {"daily_pipeline": None, "upvote_refresh": None}
    if scheduler:
        for job in scheduler.get_jobs():
            if job.id in next_runs:
                next_runs[job.id] = job.next_run_time

    def _iso(moment):
        return moment.isoformat() if moment else None

    schedule_time = f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}"
    return {
        "enabled": scheduler is not None,
        "schedule_time": schedule_time,
        "timezone": settings.APP_TIMEZONE,
        "next_run": _iso(next_runs["daily_pipeline"]),
        "upvote_next_run": _iso(next_runs["upvote_refresh"]),
        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }
|
||||
|
||||
|
||||
def run_cleanup_now(db: Session, cleanup_func: Callable[[], dict]) -> dict:
    """Run a cleanup callable synchronously and record it as a ``CrawlLog``.

    A ``"running"`` log row is committed before the cleanup starts, then
    updated to ``"success"`` or ``"failed"`` afterwards.

    Args:
        db: Active database session.
        cleanup_func: Zero-argument callable returning a dict; its
            ``scanned``, ``removed`` and ``errors`` keys are read here.

    Returns:
        The dict returned by ``cleanup_func``.

    Raises:
        Exception: Whatever the cleanup (or the bookkeeping) raised, re-raised
            after the log row has been marked failed.
    """
    entry = CrawlLog(task="cleanup", status="running", started_at=utc_now())
    db.add(entry)
    db.commit()

    try:
        outcome = cleanup_func()
        counters = {
            "scanned": outcome.get("scanned", 0),
            "removed": outcome.get("removed", 0),
        }
        entry.status = "success"
        entry.completed_at = utc_now()
        entry.details_json = json.dumps(counters, ensure_ascii=False)
        problems = outcome.get("errors")
        if problems:
            # Partial failures are kept on a successful run, capped to 2000 chars.
            entry.error = "; ".join(problems)[:2000]
        db.commit()
        return outcome
    except Exception as exc:
        entry.status = "failed"
        entry.error = str(exc)[:2000]
        entry.completed_at = utc_now()
        db.commit()
        raise
|
||||
|
||||
|
||||
def get_job_detail(db: Session, job_id: int) -> dict | None:
    """Return a background job and its stage events as a JSON-serializable dict.

    Args:
        db: Active database session.
        job_id: Primary key of the job.

    Returns:
        ``None`` when the job does not exist; otherwise a dict with the job's
        fields (timestamps as ISO strings, JSON columns decoded) and an
        ``events`` list ordered by ``created_at`` ascending.
    """
    job = db.get(Job, job_id)
    if not job:
        return None

    stmt = (
        select(JobEvent)
        .where(JobEvent.job_id == job_id)
        .order_by(JobEvent.created_at.asc())
    )
    events = db.execute(stmt).scalars().all()

    def _iso_or_none(moment):
        return moment.isoformat() if moment else None

    def _decode_or_none(raw):
        # Empty / missing JSON columns are reported as None rather than {}.
        return json.loads(raw) if raw else None

    detail = {
        "id": job.id,
        "type": job.type,
        "status": job.status,
        "owner": job.owner,
        "payload": json.loads(job.payload_json or "{}"),
        "result": _decode_or_none(job.result_json),
        "error": job.error,
        "created_at": job.created_at.isoformat(),
        "started_at": _iso_or_none(job.started_at),
        "completed_at": _iso_or_none(job.completed_at),
    }
    detail["events"] = [
        {
            "stage": event.stage,
            "status": event.status,
            "message": event.message,
            "payload": _decode_or_none(event.payload_json),
            "created_at": event.created_at.isoformat(),
        }
        for event in events
    ]
    return detail
|
||||
|
||||
|
||||
def get_logs_context(db: Session, *, page: int, per_page: int) -> dict:
    """Build the template context for the admin logs page.

    Args:
        db: Active database session.
        page: 1-based page number applied to both log listings.
        per_page: Page size applied to both log listings.

    Returns:
        A dict with one page of crawl logs and delete jobs (newest first),
        the paging parameters, the total paper count, and summary-status
        counts grouped as done / pending (pending + processing) /
        failed (failed + permanent failure).
    """
    offset = (page - 1) * per_page

    def _latest(model):
        stmt = (
            select(model)
            .order_by(model.started_at.desc())
            .limit(per_page)
            .offset(offset)
        )
        return db.execute(stmt).scalars().all()

    def _count_where(condition) -> int:
        stmt = select(func.count(SummaryStatus.id)).where(condition)
        return db.scalar(stmt) or 0

    crawl_logs = _latest(CrawlLog)
    delete_jobs = _latest(DataDeleteJob)

    summary_total = db.scalar(select(func.count(Paper.id))) or 0
    summary_done = _count_where(SummaryStatus.status == SummaryState.DONE)
    summary_pending = _count_where(
        SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.PROCESSING])
    )
    summary_failed = _count_where(
        SummaryStatus.status.in_(
            [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
        )
    )

    return {
        "crawl_logs": crawl_logs,
        "delete_jobs": delete_jobs,
        "page": page,
        "per_page": per_page,
        "summary_total": summary_total,
        "summary_done": summary_done,
        "summary_pending": summary_pending,
        "summary_failed": summary_failed,
    }
|
||||
|
||||
|
||||
def query_summary_statuses(
    db: Session,
    *,
    status: str,
    page: int,
    per_page: int,
) -> tuple[list[tuple[Paper, SummaryStatus | None]], int]:
    """Query one page of papers together with their summary status.

    Args:
        db: Active database session.
        status: ``"all"`` for no filter, ``"none"`` for papers without a
            summary-status row, otherwise the exact status value to match.
        page: 1-based page number.
        per_page: Page size.

    Returns:
        ``(rows, total)`` where ``rows`` are ``(Paper, SummaryStatus | None)``
        pairs for the requested page and ``total`` is the number of rows
        matching the filter across all pages.
    """
    query = (
        select(Paper, SummaryStatus)
        .outerjoin(SummaryStatus, SummaryStatus.paper_id == Paper.id)
        # Many papers share a paper_date; without a unique tie-breaker the
        # row order inside a date is undefined, so OFFSET/LIMIT paging could
        # repeat or skip papers between pages.
        .order_by(Paper.paper_date.desc(), Paper.id.desc())
    )
    if status == "none":
        # Outer join produced no status row for this paper.
        query = query.where(SummaryStatus.paper_id.is_(None))
    elif status != "all":
        query = query.where(SummaryStatus.status == status)

    total = db.scalar(select(func.count()).select_from(query.subquery())) or 0
    results = db.execute(query.offset((page - 1) * per_page).limit(per_page)).all()
    return results, total
|
||||
|
||||
|
||||
def serialize_summary_statuses(
    results: list[tuple[Paper, SummaryStatus | None]],
    *,
    total: int,
    page: int,
    per_page: int,
) -> dict:
    """Convert summary-status query rows into a JSON response payload.

    Args:
        results: ``(paper, summary_status)`` pairs; the status may be ``None``.
        total: Total number of matching rows across all pages.
        page: Page number echoed back in the payload.
        per_page: Page size echoed back in the payload.

    Returns:
        A dict with ``items`` (one dict per paper) plus the paging fields.
        Papers without a status row are reported with status ``"none"``,
        a retry count of 0 and no error information.
    """

    def _to_item(paper, summary_status) -> dict:
        item = {
            "arxiv_id": paper.arxiv_id,
            # Prefer the Chinese title, fall back to the English one.
            "title": paper.title_zh or paper.title_en,
            "paper_date": str(paper.paper_date),
        }
        if summary_status:
            item["summary_status"] = summary_status.status
            item["retry_count"] = summary_status.retry_count
            item["error_type"] = summary_status.error_type
            item["error"] = summary_status.error
        else:
            item["summary_status"] = "none"
            item["retry_count"] = 0
            item["error_type"] = None
            item["error"] = None
        return item

    return {
        "items": [_to_item(paper, ss) for paper, ss in results],
        "total": total,
        "page": page,
        "per_page": per_page,
    }
|
||||
|
||||
|
||||
def retry_failed_summaries(db: Session) -> int:
    """Reset failed / permanently failed summary tasks back to pending.

    The stored error and error type are cleared on every reset row.

    Args:
        db: Active database session; the change is committed here.

    Returns:
        The number of summary-status rows that were reset.
    """
    failed_states = [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
    # A single UPDATE both performs the reset and reports how many rows it
    # touched, so the returned count always matches what was changed (no
    # separate SELECT that can disagree with the UPDATE or load every id).
    outcome = db.execute(
        SummaryStatus.__table__.update()
        .where(SummaryStatus.status.in_(failed_states))
        .values(status=SummaryState.PENDING, error=None, error_type=None)
    )
    db.commit()
    # Some drivers report -1 / None when the row count is unavailable.
    return max(outcome.rowcount or 0, 0)
|
||||
|
||||
|
||||
def delete_paper_by_arxiv(db: Session, arxiv_id: str) -> bool:
    """Delete one paper and then its derived indexes.

    The paper row is deleted and committed first; index cleanup runs
    afterwards and is committed separately.

    Args:
        db: Active database session.
        arxiv_id: arXiv identifier of the paper to delete.

    Returns:
        ``True`` if the paper existed and was deleted, ``False`` otherwise.
    """
    target = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id))
    if not target:
        return False

    # Capture the primary key before the row is gone.
    removed_id = target.id
    db.delete(target)
    db.commit()

    delete_paper_indexes(db, paper_id=removed_id, arxiv_id=arxiv_id)
    db.commit()
    return True
|
||||
|
||||
|
||||
def delete_papers_by_arxiv_ids(db: Session, arxiv_ids: list[str]) -> int:
    """Delete several papers and then their derived indexes.

    All matching paper rows are deleted in one commit; index cleanup for
    each of them runs afterwards and is committed separately.

    Args:
        db: Active database session.
        arxiv_ids: arXiv identifiers of the papers to delete.

    Returns:
        The number of papers that were found and deleted.
    """
    stmt = select(Paper).where(Paper.arxiv_id.in_(arxiv_ids))
    matches = db.execute(stmt).scalars().all()

    removed: list = []
    for paper in matches:
        # Record identifiers before the row is deleted.
        removed.append((paper.id, paper.arxiv_id))
        db.delete(paper)
    db.commit()

    for paper_id, arxiv_id in removed:
        delete_paper_indexes(db, paper_id=paper_id, arxiv_id=arxiv_id)
    db.commit()
    return len(removed)
|
||||
|
||||
|
||||
def reset_summaries_pending(db: Session, arxiv_ids: list[str]) -> int:
    """Reset the summary status of the given papers to pending.

    Existing status rows are reset in place (quality, error fields, raw
    output flag and timestamps cleared); papers without a status row get a
    new pending one.

    Args:
        db: Active database session; changes are committed here.
        arxiv_ids: arXiv identifiers of the papers to reset.

    Returns:
        The number of papers found for the given identifiers (0 when none
        matched, in which case nothing is written).
    """
    paper_ids = (
        db.execute(select(Paper.id).where(Paper.arxiv_id.in_(arxiv_ids)))
        .scalars()
        .all()
    )
    if not paper_ids:
        return 0

    status_rows = (
        db.execute(select(SummaryStatus).where(SummaryStatus.paper_id.in_(paper_ids)))
        .scalars()
        .all()
    )

    # Field values restored on every existing row, applied after the status.
    cleared_fields = {
        "quality": None,
        "error": None,
        "error_type": None,
        "raw_output_saved": False,
        "started_at": None,
        "completed_at": None,
    }
    covered = set()
    for row in status_rows:
        covered.add(row.paper_id)
        row.status = SummaryState.PENDING
        for field_name, field_value in cleared_fields.items():
            setattr(row, field_name, field_value)

    db.add_all(
        [
            SummaryStatus(paper_id=paper_id, status=SummaryState.PENDING)
            for paper_id in paper_ids
            if paper_id not in covered
        ]
    )
    db.commit()
    return len(paper_ids)
|
||||
|
||||
+3
-16
@@ -4,7 +4,7 @@ import logging
|
||||
from datetime import date as date_type, datetime, timezone
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.config import settings
|
||||
@@ -16,6 +16,7 @@ from app.models import (
|
||||
SummaryState,
|
||||
SummaryStatus,
|
||||
)
|
||||
from app.services.derived import reindex_paper_fts
|
||||
from app.utils import make_http_client, recent_date_strs, utc_now
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -143,21 +144,7 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
|
||||
|
||||
db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
|
||||
|
||||
authors_text = ", ".join(meta["authors"])
|
||||
tags_text = ", ".join(meta["tags"])
|
||||
db.execute(
|
||||
text(
|
||||
"INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
|
||||
"VALUES (:id, :title, :abstract, :authors, :tags)"
|
||||
),
|
||||
{
|
||||
"id": paper.id,
|
||||
"title": meta["title_en"],
|
||||
"abstract": meta["abstract"] or "",
|
||||
"authors": authors_text,
|
||||
"tags": tags_text,
|
||||
},
|
||||
)
|
||||
reindex_paper_fts(db, paper)
|
||||
|
||||
new_papers.append(paper)
|
||||
logger.debug("Inserted new paper: %s", arxiv_id)
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
"""派生数据维护 — FTS5 / ChromaDB 等可重建索引。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models import Paper
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _summary_text(paper: Paper) -> str:
|
||||
summary = paper.summary
|
||||
if not summary:
|
||||
return ""
|
||||
parts = [
|
||||
summary.one_line,
|
||||
summary.motivation_problem,
|
||||
summary.motivation_goal,
|
||||
summary.method_overview,
|
||||
summary.method_key_idea,
|
||||
summary.results_main_json,
|
||||
]
|
||||
return " ".join(p for p in parts if p)
|
||||
|
||||
|
||||
def delete_fts_paper(db: Session, paper_id: int) -> None:
    """Delete one paper's row from the ``papers_fts`` table.

    The FTS table uses ``papers.id`` as its ``rowid``. Nothing is committed
    here.

    Args:
        db: Active database session.
        paper_id: Primary key of the paper whose FTS row is removed.
    """
    stmt = text("DELETE FROM papers_fts WHERE rowid = :paper_id")
    db.execute(stmt, {"paper_id": paper_id})
|
||||
|
||||
|
||||
def delete_paper_indexes(db: Session, *, paper_id: int, arxiv_id: str) -> None:
    """Remove every derived index entry of one paper, best effort.

    Each cleanup step is attempted independently; a failure is logged as a
    warning and does not stop the remaining step or propagate to the caller.

    Args:
        db: Active database session (used for the FTS cleanup).
        paper_id: Primary key of the paper, used as the FTS rowid.
        arxiv_id: arXiv identifier, used for the ChromaDB cleanup and logs.
    """

    def _drop_fts() -> None:
        delete_fts_paper(db, paper_id)

    def _drop_chroma() -> None:
        # Imported lazily, inside the guarded step, so an import failure is
        # logged like any other cleanup failure.
        from app.services.embedder import delete_paper

        delete_paper(arxiv_id)

    steps = (
        ("Failed to clean FTS index for %s", _drop_fts),
        ("Failed to clean ChromaDB index for %s", _drop_chroma),
    )
    for failure_message, step in steps:
        try:
            step()
        except Exception:
            logger.warning(failure_message, arxiv_id, exc_info=True)
|
||||
|
||||
|
||||
def reindex_paper_fts(db: Session, paper: Paper) -> None:
    """Rebuild one paper's FTS row from the paper's current data.

    The existing FTS row (if any) is deleted and a fresh one is inserted.
    Nothing is committed here.

    Args:
        db: Active database session.
        paper: Paper whose authors, tags, titles, abstract and summary are
            written to ``papers_fts``.
    """
    # Authors ordered by position; a missing position sorts as 0.
    ordered_authors = list(paper.authors)
    ordered_authors.sort(key=lambda author: author.position or 0)

    row = {
        "id": paper.id,
        "title_en": paper.title_en or "",
        "title_zh": paper.title_zh or "",
        "abstract": paper.abstract or "",
        "authors": ", ".join(author.name for author in ordered_authors),
        "tags": ", ".join(tag.tag for tag in paper.tags),
    }

    delete_fts_paper(db, paper.id)
    row["summary_text"] = _summary_text(paper)
    insert_stmt = text(
        """
            INSERT INTO papers_fts(
                rowid, title_en, title_zh, abstract, authors, tags, summary_text
            )
            VALUES (
                :id, :title_en, :title_zh, :abstract, :authors, :tags, :summary_text
            )
            """
    )
    db.execute(insert_stmt, row)
|
||||
|
||||
|
||||
def reindex_fts(db: Session, paper_ids: list[int] | None = None) -> dict:
    """Rebuild the FTS index for all papers or for a subset.

    Args:
        db: Active database session; the rebuild is committed here.
        paper_ids: ``None`` rebuilds the whole index (the FTS table is
            emptied first). A list restricts the rebuild to those papers;
            an empty list rebuilds nothing.

    Returns:
        ``{"status": "success", "indexed": <number of papers reindexed>}``.
    """
    full_rebuild = paper_ids is None

    query = select(Paper)
    if not full_rebuild:
        # Test against None rather than truthiness: an empty list must mean
        # "no papers", not "every paper".
        query = query.where(Paper.id.in_(paper_ids))
    papers = db.execute(query).scalars().all()

    if full_rebuild:
        # Also drops FTS rows whose paper no longer exists.
        db.execute(text("DELETE FROM papers_fts"))

    for paper in papers:
        reindex_paper_fts(db, paper)
    db.commit()

    count = len(papers)
    logger.info("FTS reindexed: %d papers", count)
    return {"status": "success", "indexed": count}
|
||||
|
||||
|
||||
def reindex_chroma(db: Session) -> dict:
    """Rebuild the ChromaDB semantic index from the papers that have a summary.

    Each paper is indexed independently; a failure is logged, recorded in
    the result and does not stop the remaining papers.

    Args:
        db: Active database session.

    Returns:
        A dict with ``status`` (``"success"``, or ``"partial"`` when any
        paper failed), ``indexed`` (number of papers indexed) and ``errors``
        (list of ``"<arxiv_id>: <exception>"`` strings, or ``None``).
    """
    from app.services.embedder import index_paper

    summarized = db.execute(select(Paper).where(Paper.summary.has())).scalars().all()

    failures: list[str] = []
    indexed = 0
    for paper in summarized:
        try:
            summary = paper.summary
            published = paper.paper_date
            document = {
                "arxiv_id": paper.arxiv_id,
                "title_zh": paper.title_zh or "",
                "title_en": paper.title_en or "",
                "tags": " ".join(tag.tag for tag in paper.tags),
                "one_line": summary.one_line if summary else "",
                "motivation_problem": summary.motivation_problem if summary else "",
                "method_key_idea": summary.method_key_idea if summary else "",
                "paper_date": published.isoformat() if published else "",
            }
            index_paper(paper.arxiv_id, document)
            indexed += 1
        except Exception as exc:
            failures.append(f"{paper.arxiv_id}: {exc}")
            logger.warning(
                "Failed to reindex ChromaDB for %s",
                paper.arxiv_id,
                exc_info=True,
            )

    status = "partial" if failures else "success"
    return {"status": status, "indexed": indexed, "errors": failures or None}
|
||||
@@ -0,0 +1,244 @@
|
||||
"""统一后台任务系统 — 创建、运行、事件记录、失败恢复。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import date, timedelta
|
||||
from typing import Any
|
||||
|
||||
from fastapi import BackgroundTasks
|
||||
from sqlalchemy import or_, select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.config import settings
|
||||
from app.database import SessionLocal
|
||||
from app.models import Job, JobEvent, JobEventStatus, JobStatus, TaskLock
|
||||
from app.utils import truncate_error, utc_now
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
STALE_JOB_AFTER = timedelta(hours=6)
|
||||
|
||||
|
||||
def _dumps(value: Any) -> str:
|
||||
return json.dumps(value, ensure_ascii=False, default=str)
|
||||
|
||||
|
||||
def _loads(value: str | None) -> dict:
|
||||
if not value:
|
||||
return {}
|
||||
try:
|
||||
data = json.loads(value)
|
||||
return data if isinstance(data, dict) else {}
|
||||
except json.JSONDecodeError:
|
||||
return {}
|
||||
|
||||
|
||||
def create_job(
    db: Session,
    job_type: str,
    *,
    owner: str,
    payload: dict | None = None,
) -> Job:
    """Create and persist a queued background job.

    The job row is committed first, then a ``"created"`` event carrying the
    payload is recorded for it.

    Args:
        db: Active database session.
        job_type: Job type identifier stored on the job.
        owner: Owner label stored on the job.
        payload: Job parameters; ``None`` (or any falsy value) is stored as
            an empty object.

    Returns:
        The persisted ``Job``, refreshed from the database.
    """
    job_payload = payload or {}

    job = Job(
        type=job_type,
        status=JobStatus.QUEUED,
        owner=owner,
        payload_json=_dumps(job_payload),
        created_at=utc_now(),
    )
    db.add(job)
    db.commit()
    # Reload so database-generated values (e.g. the id) are available.
    db.refresh(job)

    add_job_event(
        db,
        job,
        stage="created",
        status=JobEventStatus.INFO,
        message=f"Job queued: {job_type}",
        payload=job_payload,
    )
    return job
|
||||
|
||||
|
||||
def add_job_event(
    db: Session,
    job: Job,
    *,
    stage: str,
    status: str,
    message: str | None = None,
    payload: dict | None = None,
) -> None:
    """Append one stage event to a job and refresh the job's heartbeat.

    Args:
        db: Active database session; the event is committed here.
        job: Job the event belongs to.
        stage: Stage name recorded on the event.
        status: Event status; stored as ``str(status)``.
        message: Optional human-readable message.
        payload: Optional event data; stored as JSON, or as NULL when
            ``None`` (an empty dict is still stored as ``{}``).
    """
    encoded_payload = None if payload is None else _dumps(payload)
    event = JobEvent(
        job_id=job.id,
        stage=stage,
        status=str(status),
        message=message,
        payload_json=encoded_payload,
        created_at=utc_now(),
    )
    db.add(event)
    # Every recorded event also counts as a sign of life for the job.
    job.heartbeat_at = utc_now()
    db.commit()
|
||||
|
||||
|
||||
def enqueue_job(background_tasks: BackgroundTasks, job_id: int) -> None:
    """Schedule an already-created job on FastAPI's ``BackgroundTasks``.

    Args:
        background_tasks: Task collection of the current request.
        job_id: Primary key of the job handed to ``run_job_by_id``.
    """
    runner = run_job_by_id
    background_tasks.add_task(runner, job_id)
|
||||
|
||||
|
||||
async def run_job_by_id(job_id: int) -> None:
    """Run an existing job inside its own database session.

    A dedicated session is opened for the run and always closed afterwards,
    whether the run succeeds or raises.

    Args:
        job_id: Primary key of the job to run.
    """
    session = SessionLocal()
    try:
        await run_job(session, job_id)
    finally:
        session.close()
|
||||
|
||||
|
||||
async def run_job(db: Session, job_id: int) -> dict:
    """Run a job and write its status, result or error back to the database.

    The job is marked running (with a ``run``/started event), dispatched by
    type, and finally marked success or failed with a matching event.
    Exceptions raised by the job itself are caught and reported through the
    return value instead of propagating.

    Args:
        db: Active database session.
        job_id: Primary key of the job to run.

    Returns:
        On success, the job's result if it is a dict, otherwise
        ``{"status": "success", "result": <result>}``. On failure,
        ``{"status": "failed", "error": <truncated error text>}``.

    Raises:
        ValueError: The job does not exist.
        RuntimeError: The job is already marked as running.
    """
    job = db.get(Job, job_id)
    if not job:
        raise ValueError(f"Job not found: {job_id}")
    if job.status == JobStatus.RUNNING:
        raise RuntimeError(f"Job already running: {job_id}")

    payload = _loads(job.payload_json)
    job.status = JobStatus.RUNNING
    job.started_at = utc_now()
    job.heartbeat_at = job.started_at
    db.commit()
    add_job_event(db, job, stage="run", status=JobEventStatus.STARTED)

    try:
        result = await _dispatch_job(db, job, payload)
    except Exception as exc:
        # The job may have failed in the middle of a flush, which leaves the
        # session unusable until it is rolled back. Roll back first so the
        # failure can actually be recorded instead of leaving the job stuck
        # in RUNNING; this also discards any uncommitted partial work of the
        # failed job.
        db.rollback()
        logger.exception("Job failed: id=%s type=%s", job.id, job.type)
        error = truncate_error(exc, limit=4000)
        job.status = JobStatus.FAILED
        job.error = error
        job.completed_at = utc_now()
        db.commit()
        add_job_event(db, job, stage="run", status=JobEventStatus.FAILED, message=error)
        return {"status": "failed", "error": error}

    job.status = JobStatus.SUCCESS
    job.result_json = _dumps(result)
    job.completed_at = utc_now()
    job.error = None
    db.commit()

    result_is_dict = isinstance(result, dict)
    add_job_event(
        db,
        job,
        stage="run",
        status=JobEventStatus.SUCCESS,
        # Events always carry a dict payload; wrap anything else.
        payload=result if result_is_dict else {"result": result},
    )
    return result if result_is_dict else {"status": "success", "result": result}
|
||||
|
||||
|
||||
async def _dispatch_job(db: Session, job: Job, payload: dict) -> dict:
    """Run the service function that matches the job's type.

    Service modules are imported inside the function body rather than at
    module level.

    Args:
        db: Active database session passed to the service function.
        job: Job being run; its ``type`` selects the function and its
            ``owner`` (or ``"job:<id>"``) is used as lock owner where needed.
        payload: Decoded job parameters.

    Returns:
        Whatever the selected service function returns.

    Raises:
        ValueError: The job type is not supported.
        KeyError: A required payload key is missing.
    """
    from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
    from app.services.crawler import refresh_upvotes
    from app.services.derived import reindex_chroma, reindex_fts
    from app.services.pipeline import run_crawl, run_pipeline
    from app.services.summarizer import summarize_batch, summarize_single

    kind = job.type

    if kind == "crawl_daily":
        target_date = payload["target_date"]
        owner = job.owner or f"job:{job.id}"
        return await run_crawl(
            db, target_date, owner=owner, top_n=payload.get("top_n")
        )

    elif kind == "pipeline_daily":
        target_date = payload["target_date"]
        owner = job.owner or f"job:{job.id}"
        return await run_pipeline(db, target_date, owner=owner)

    elif kind == "summarize_batch":
        pdf_mode = payload.get("pdf_mode", settings.SUMMARY_PDF_MODE)
        return await summarize_batch(db, pdf_mode=pdf_mode)

    elif kind == "summarize_one":
        arxiv_id = payload["arxiv_id"]
        force = payload.get("force", True)
        pdf_mode = payload.get("pdf_mode", settings.SUMMARY_PDF_MODE)
        return await summarize_single(db, arxiv_id, force=force, pdf_mode=pdf_mode)

    elif kind == "refresh_upvotes":
        return await refresh_upvotes(db, days=payload.get("days"))

    elif kind == "delete_range":
        range_start = date.fromisoformat(payload["date_start"])
        range_end = date.fromisoformat(payload["date_end"])
        return await delete_papers_by_date_range(
            db,
            range_start,
            range_end,
            include_notes=payload.get("include_notes", True),
        )

    # The remaining job types are plain synchronous calls.
    elif kind == "cleanup_tmp":
        return cleanup_tmp()

    elif kind == "reindex_fts":
        return reindex_fts(db)

    elif kind == "reindex_chroma":
        return reindex_chroma(db)

    raise ValueError(f"Unsupported job type: {kind}")
|
||||
|
||||
|
||||
def recover_stale_jobs(db: Session) -> int:
    """Mark long-stuck running jobs and locks as stale.

    A running job is stale when it has no heartbeat or its heartbeat is
    older than ``STALE_JOB_AFTER``; a running lock is stale when it was
    acquired before that same cutoff. Stale jobs also get a ``recovery``
    event. Intended to be called at startup so such records do not stay
    "running" forever.

    Args:
        db: Active database session; changes are committed here.

    Returns:
        The number of jobs plus locks that were marked stale.
    """
    now = utc_now()
    cutoff = now - STALE_JOB_AFTER
    reason = "Marked stale after process restart or missed heartbeat"

    job_stmt = select(Job).where(
        Job.status == JobStatus.RUNNING,
        or_(Job.heartbeat_at.is_(None), Job.heartbeat_at < cutoff),
    )
    stale_jobs = db.execute(job_stmt).scalars().all()
    for job in stale_jobs:
        job.status = JobStatus.STALE
        job.error = reason
        job.completed_at = now
        recovery_event = JobEvent(
            job_id=job.id,
            stage="recovery",
            status=JobEventStatus.FAILED,
            message=reason,
            created_at=now,
        )
        db.add(recovery_event)

    lock_stmt = select(TaskLock).where(
        TaskLock.status == "running",
        TaskLock.acquired_at < cutoff,
    )
    stale_locks = db.execute(lock_stmt).scalars().all()
    for lock in stale_locks:
        lock.status = "stale"
        lock.released_at = now

    db.commit()

    recovered = len(stale_jobs) + len(stale_locks)
    if recovered:
        logger.warning("Recovered stale runtime records: %d", recovered)
    return recovered
|
||||
@@ -7,6 +7,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import date as date_type
|
||||
from datetime import timedelta
|
||||
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.orm import Session
|
||||
@@ -32,6 +33,8 @@ def acquire_lock(db: Session, task: str, lock_key: str, owner: str) -> TaskLock:
|
||||
status="running",
|
||||
owner=owner,
|
||||
acquired_at=utc_now(),
|
||||
heartbeat_at=utc_now(),
|
||||
expires_at=utc_now() + timedelta(hours=6),
|
||||
)
|
||||
try:
|
||||
db.add(lock)
|
||||
@@ -42,7 +45,12 @@ def acquire_lock(db: Session, task: str, lock_key: str, owner: str) -> TaskLock:
|
||||
return lock
|
||||
|
||||
|
||||
async def run_crawl(db: Session, target_date: str, owner: str = "admin_crawl") -> dict:
|
||||
async def run_crawl(
|
||||
db: Session,
|
||||
target_date: str,
|
||||
owner: str = "admin_crawl",
|
||||
top_n: int | None = None,
|
||||
) -> dict:
|
||||
"""执行单次抓取(带防重入锁)。
|
||||
|
||||
Args:
|
||||
@@ -55,7 +63,7 @@ async def run_crawl(db: Session, target_date: str, owner: str = "admin_crawl") -
|
||||
"""
|
||||
lock = acquire_lock(db, "crawl", target_date, owner)
|
||||
try:
|
||||
return await crawl_daily(db, target_date)
|
||||
return await crawl_daily(db, target_date, top_n=top_n)
|
||||
finally:
|
||||
release_lock(db, lock)
|
||||
|
||||
@@ -83,6 +91,8 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
|
||||
status="running",
|
||||
owner=owner,
|
||||
acquired_at=now,
|
||||
heartbeat_at=now,
|
||||
expires_at=now + timedelta(hours=6),
|
||||
)
|
||||
try:
|
||||
db.add(lock)
|
||||
|
||||
@@ -11,8 +11,7 @@ from zoneinfo import ZoneInfo
|
||||
|
||||
from app.config import settings
|
||||
from app.database import SessionLocal
|
||||
from app.services.pipeline import run_pipeline
|
||||
from app.services.crawler import refresh_upvotes
|
||||
from app.services.jobs import create_job, run_job
|
||||
from app.utils import today_str
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -112,7 +111,13 @@ async def _daily_pipeline() -> None:
|
||||
|
||||
db: Session = SessionLocal()
|
||||
try:
|
||||
await run_pipeline(db, today, owner="daily_pipeline")
|
||||
job = create_job(
|
||||
db,
|
||||
"pipeline_daily",
|
||||
owner="daily_pipeline",
|
||||
payload={"target_date": today},
|
||||
)
|
||||
await run_job(db, job.id)
|
||||
except RuntimeError:
|
||||
logger.warning("Daily pipeline already running for %s, skipping", today)
|
||||
except Exception:
|
||||
@@ -125,7 +130,8 @@ async def _upvote_refresh() -> None:
|
||||
"""刷新最近 N 天论文的 upvotes。"""
|
||||
db: Session = SessionLocal()
|
||||
try:
|
||||
result = await refresh_upvotes(db)
|
||||
job = create_job(db, "refresh_upvotes", owner="upvote_refresh", payload={})
|
||||
result = await run_job(db, job.id)
|
||||
logger.info(
|
||||
"Upvote refresh completed: status=%s updated=%d",
|
||||
result.get("status"),
|
||||
|
||||
@@ -3,8 +3,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models import (
|
||||
@@ -13,6 +11,7 @@ from app.models import (
|
||||
PaperTag,
|
||||
SummaryState,
|
||||
)
|
||||
from app.services.derived import reindex_paper_fts
|
||||
from app.services.pdf_downloader import paper_dir
|
||||
from app.services.schemas import (
|
||||
SummarySchema,
|
||||
@@ -75,19 +74,9 @@ def _update_summary_in_db(
|
||||
db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="ai"))
|
||||
existing_tag_names.add(tag_name)
|
||||
|
||||
# 4. FTS5 更新
|
||||
summary_text = _build_fts_summary_text(schema)
|
||||
db.execute(
|
||||
text(
|
||||
"UPDATE papers_fts SET title_zh=:title_zh, summary_text=:summary_text "
|
||||
"WHERE rowid=:paper_id"
|
||||
),
|
||||
{
|
||||
"title_zh": schema.title_zh,
|
||||
"summary_text": summary_text,
|
||||
"paper_id": paper.id,
|
||||
},
|
||||
)
|
||||
# 4. FTS5 派生索引
|
||||
db.flush()
|
||||
reindex_paper_fts(db, paper)
|
||||
|
||||
db.commit()
|
||||
logger.info("DB updated: paper=%s quality=%s", paper.arxiv_id, quality)
|
||||
|
||||
Reference in New Issue
Block a user