feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

This commit is contained in:
2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
+200 -5
View File
@@ -2,8 +2,10 @@
from __future__ import annotations
import csv
import hashlib
import hmac
import io
from datetime import date
from fastapi import (
@@ -15,7 +17,7 @@ from fastapi import (
Query,
Request,
)
from fastapi.responses import RedirectResponse
from fastapi.responses import RedirectResponse, Response
from pydantic import BaseModel, field_validator
from sqlalchemy.orm import Session
@@ -298,6 +300,183 @@ async def admin_job_detail(
return detail
# ── 任务监控 ──────────────────────────────────────────────────────────
@router.get("/jobs")
async def admin_jobs(
    request: Request,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    status: str = Query("all"),
    job_type: str = Query("all"),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Render the admin background-job monitoring page.

    Args:
        request: Incoming request; its query string is reused to build
            pagination links.
        _admin: Admin authentication dependency (value unused).
        db: Database session.
        status: Job status filter passed to ``admin_svc.query_jobs``.
        job_type: Job type filter passed to ``admin_svc.query_jobs``.
        page: 1-based page number.
        per_page: Page size (1-100).

    Returns:
        The rendered ``admin_jobs.html`` template response.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlencode

    jobs, total = admin_svc.query_jobs(
        db, status=status, job_type=job_type, page=page, per_page=per_page
    )
    counts = admin_svc.get_job_status_counts(db)

    def pagination_url(p: int) -> str:
        """Return the URL of page *p*, keeping the current filters."""
        params = dict(request.query_params)
        params["page"] = str(p)
        # urlencode percent-escapes keys and values; naive "k=v" joining
        # would corrupt the link when a value contains '&', '=', '#',
        # spaces or non-ASCII characters.
        return "/admin/jobs?" + urlencode(params)

    return templates.TemplateResponse(
        request,
        "admin_jobs.html",
        {
            "jobs": jobs,
            "total": total,
            "page": page,
            "per_page": per_page,
            "current_status": status,
            "current_type": job_type,
            "status_counts": counts,
            "pagination_url": pagination_url,
        },
    )
# ── 锁管理 ────────────────────────────────────────────────────────────
@router.post("/locks/{lock_id}/release")
async def admin_release_lock(
    lock_id: int,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Force-release a stuck job lock.

    Args:
        lock_id: Identifier of the lock to release.
        _admin: Admin authentication dependency (value unused).
        db: Database session.

    Returns:
        ``{"status": "success", "lock_id": lock_id}`` when the lock was
        released.

    Raises:
        HTTPException: 404 when ``admin_svc.force_release_lock`` reports
            that nothing was released.
    """
    released = admin_svc.force_release_lock(db, lock_id)
    if released:
        return {"status": "success", "lock_id": lock_id}

    raise HTTPException(
        status_code=404,
        detail=f"Lock not found or already released: {lock_id}",
    )
# ── 重抓 ──────────────────────────────────────────────────────────────
@router.post("/paper-recrawl/{arxiv_id}")
async def admin_paper_recrawl(
    arxiv_id: str,
    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Queue a ``recrawl_one`` job that re-fetches one paper's metadata.

    The handler only creates and enqueues the job; it does not itself
    check that the paper exists.

    Args:
        arxiv_id: arXiv identifier taken from the URL path.
        background_tasks: FastAPI background task queue used to run the job.
        _admin: Admin authentication dependency (value unused).
        db: Database session.

    Returns:
        A dict with ``status`` ("queued"), the new ``job_id`` and the
        requested ``arxiv_id``.
    """
    recrawl_payload = {"arxiv_id": arxiv_id}
    job = create_job(
        db, "recrawl_one", owner="admin_recrawl", payload=recrawl_payload
    )
    job_id = job.id
    enqueue_job(background_tasks, job_id)

    return {"status": "queued", "job_id": job_id, "arxiv_id": arxiv_id}
# ── 索引重建 ──────────────────────────────────────────────────────────
class RebuildIndexRequest(BaseModel):
    """Request body selecting which search index to rebuild."""

    # One of "fts", "chroma" or "both".
    target: str

    @field_validator("target")
    @classmethod
    def target_must_be_valid(cls, v: str) -> str:
        """Return *v* unchanged if it is a supported target, else raise."""
        allowed = ("fts", "chroma", "both")
        if v in allowed:
            return v
        raise ValueError("target must be 'fts', 'chroma' or 'both'")
@router.post("/rebuild-indexes")
async def admin_rebuild_indexes(
    body: RebuildIndexRequest,
    background_tasks: BackgroundTasks,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Queue rebuild jobs for the search indexes (FTS5 / ChromaDB).

    One job is created per selected index; ``target == "both"`` queues the
    FTS job first, then the Chroma job.

    Args:
        body: Validated request naming the index (or both) to rebuild.
        background_tasks: FastAPI background task queue used to run the jobs.
        _admin: Admin authentication dependency (value unused).
        db: Database session.

    Returns:
        A dict with ``status`` ("queued"), the list of created ``job_ids``
        and the requested ``target``.
    """
    # (target value that selects the job, job type to create), in queue order.
    plan = (
        ("fts", "reindex_fts"),
        ("chroma", "reindex_chroma"),
    )

    job_ids: list[int] = []
    for index_name, job_kind in plan:
        if body.target != index_name and body.target != "both":
            continue
        job = create_job(db, job_kind, owner="admin_reindex", payload={})
        enqueue_job(background_tasks, job.id)
        job_ids.append(job.id)

    return {"status": "queued", "job_ids": job_ids, "target": body.target}
# ── 导出 CSV ──────────────────────────────────────────────────────────
@router.get("/papers/export.csv")
async def admin_papers_export(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    q: str = Query(""),
    date_from: str | None = Query(None),
    date_to: str | None = Query(None),
    tag: str = Query(""),
    summary_status: str = Query("all"),
    sort: str = Query("date_desc"),
):
    """Export the papers matching the current filters as a CSV download.

    The payload starts with a UTF-8 byte-order mark so that Excel detects
    the encoding correctly.

    Args:
        _admin: Admin authentication dependency (value unused).
        db: Database session.
        q: Search text forwarded to ``admin_svc.query_papers``.
        date_from: Optional lower date bound forwarded to the query.
        date_to: Optional upper date bound forwarded to the query.
        tag: Tag filter forwarded to the query.
        summary_status: Summary-status filter forwarded to the query.
        sort: Sort key forwarded to the query.

    Returns:
        A ``text/csv`` attachment response named ``papers_<date>.csv``.
    """
    # A single page of up to 10**6 rows stands in for "no pagination".
    papers, _total, statuses = admin_svc.query_papers(
        db,
        q=q,
        date_from=date_from,
        date_to=date_to,
        tag=tag,
        summary_status=summary_status,
        sort=sort,
        page=1,
        per_page=10**6,
    )

    buf = io.StringIO()
    # Write the BOM as an explicit escape: a literal U+FEFF character is
    # invisible in source and is easily dropped by editors and tooling,
    # which would silently remove the BOM from the export.
    buf.write("\ufeff")
    writer = csv.writer(buf)
    writer.writerow(
        [
            "arxiv_id",
            "title_en",
            "title_zh",
            "paper_date",
            "upvotes",
            "summary_status",
            "authors",
            "tags",
            "pdf_url",
        ]
    )
    for paper in papers:
        # Multi-valued fields are flattened into one ';'-separated cell.
        authors = ";".join(a.name for a in paper.authors)
        tags = ";".join(t.tag for t in paper.tags)
        writer.writerow(
            [
                paper.arxiv_id,
                paper.title_en or "",
                paper.title_zh or "",
                str(paper.paper_date) if paper.paper_date else "",
                paper.upvotes or 0,
                statuses.get(paper.arxiv_id, "none"),
                authors,
                tags,
                paper.pdf_url or "",
            ]
        )

    filename = f"papers_{today_str().replace('-', '')}.csv"
    return Response(
        content=buf.getvalue(),
        media_type="text/csv; charset=utf-8",
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )
# ── 日志 ──────────────────────────────────────────────────────────────
@@ -438,24 +617,25 @@ async def admin_paper_delete(
class BatchActionRequest(BaseModel):
    """Request body for a batch operation on papers."""

    # One of "delete", "summarize" or "recrawl".
    action: str
    # arXiv identifiers of the papers to act on.
    arxiv_ids: list[str]

    @field_validator("action")
    @classmethod
    def action_must_be_valid(cls, v: str) -> str:
        """Return *v* unchanged if it is a supported action, else raise."""
        # Only the three-value check is kept: the superseded two-value
        # check would reject "recrawl" before this one could accept it.
        if v not in ("delete", "summarize", "recrawl"):
            raise ValueError("action must be 'delete', 'summarize' or 'recrawl'")
        return v
@router.post("/papers-batch-action")
async def admin_papers_batch_action(
body: BatchActionRequest,
background_tasks: BackgroundTasks,
_admin: None = Depends(verify_admin),
db: Session = Depends(get_db),
):
"""批量操作论文(删除或总结)。"""
"""批量操作论文(删除 / 总结 / 重抓)。"""
if not body.arxiv_ids:
raise HTTPException(status_code=400, detail="arxiv_ids 不能为空")
@@ -475,3 +655,18 @@ async def admin_papers_batch_action(
"message": f"已将 {count} 篇论文重置为待总结",
"count": count,
}
elif body.action == "recrawl":
job = create_job(
db,
"recrawl_batch",
owner="admin_recrawl",
payload={"arxiv_ids": body.arxiv_ids},
)
enqueue_job(background_tasks, job.id)
return {
"status": "queued",
"job_id": job.id,
"count": len(body.arxiv_ids),
"message": f"已将 {len(body.arxiv_ids)} 篇论文加入重抓队列",
}