feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+94 -144
View File
@@ -3,6 +3,7 @@
from __future__ import annotations
import hashlib
import hmac
import json
import logging
from datetime import date
@@ -10,7 +11,7 @@ from datetime import date
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
from fastapi.responses import RedirectResponse
from pydantic import BaseModel, field_validator
from sqlalchemy import func, select, text
from sqlalchemy import bindparam, func, select, text
from sqlalchemy.orm import Session
from app.config import settings
@@ -22,15 +23,15 @@ from app.models import (
PaperTag,
SummaryState,
SummaryStatus,
TaskLock,
)
from app.services import admin as admin_svc
from app.services.admin import get_admin_stats
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily, refresh_upvotes
from app.services.pipeline import run_pipeline
from app.services.crawler import refresh_upvotes
from app.services.pipeline import run_crawl, run_pipeline
from app.services.scheduler import get_scheduler
from app.services.summarizer import summarize_batch, summarize_single
from app.utils import release_lock, templates, today_str, utc_now
from app.utils import templates, today_str, utc_now
logger = logging.getLogger(__name__)
@@ -41,14 +42,15 @@ router = APIRouter(prefix="/admin", tags=["admin"])
def _check_password(password: str) -> bool:
"""校验密码,支持明文或 sha256 哈希。"""
"""校验密码,支持明文或 sha256 哈希(常量时间比较)"""
stored = settings.ADMIN_PASSWORD
if not stored:
return False
if password == stored:
if hmac.compare_digest(password, stored):
return True
# 也支持存 sha256 哈希
return hashlib.sha256(password.encode()).hexdigest() == stored
hashed = hashlib.sha256(password.encode()).hexdigest()
return hmac.compare_digest(hashed, stored)
async def verify_admin(request: Request) -> None:
@@ -204,32 +206,12 @@ async def admin_crawl(
):
"""手动抓取指定日期,默认今天。"""
target_date = date or today_str()
# TaskLock 防重入
now = utc_now()
lock = TaskLock(
task="crawl",
lock_key=target_date,
status="running",
owner="admin_crawl",
acquired_at=now,
)
try:
db.add(lock)
db.commit()
except Exception:
db.rollback()
raise HTTPException(
status_code=409, detail=f"Crawl already running for {target_date}"
)
try:
result = await crawl_daily(db, target_date)
return result
return await run_crawl(db, target_date, owner="admin_crawl")
except RuntimeError as exc:
raise HTTPException(status_code=409, detail=str(exc))
except Exception as exc:
raise HTTPException(status_code=500, detail=str(exc))
finally:
release_lock(db, lock)
# ── 总结 ──────────────────────────────────────────────────────────────
@@ -241,12 +223,7 @@ async def admin_summarize_batch(
db: Session = Depends(get_db),
):
"""批量总结所有 pending 论文。"""
result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
if result.get("status") == "conflict":
raise HTTPException(
status_code=409, detail=result.get("error", "batch already running")
)
return result
return await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
@router.post("/summarize/{arxiv_id}")
@@ -256,10 +233,9 @@ async def admin_summarize_single(
db: Session = Depends(get_db),
):
"""总结或重跑单篇论文。"""
result = await summarize_single(db, arxiv_id, force=True, pdf_mode=settings.SUMMARY_PDF_MODE)
if result.get("status") == "not_found":
raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
return result
return await summarize_single(
db, arxiv_id, force=True, pdf_mode=settings.SUMMARY_PDF_MODE
)
# ── 清理 ──────────────────────────────────────────────────────────────
@@ -284,10 +260,13 @@ async def admin_cleanup(
result = cleanup_tmp()
log_entry.status = "success"
log_entry.completed_at = utc_now()
log_entry.details_json = json.dumps({
"scanned": result.get("scanned", 0),
"removed": result.get("removed", 0),
}, ensure_ascii=False)
log_entry.details_json = json.dumps(
{
"scanned": result.get("scanned", 0),
"removed": result.get("removed", 0),
},
ensure_ascii=False,
)
if result.get("errors"):
log_entry.error = "; ".join(result["errors"])[:2000]
db.commit()
@@ -358,19 +337,34 @@ async def admin_logs(
# 总结状态统计概要
summary_total = db.scalar(select(func.count(Paper.id))) or 0
summary_done = db.scalar(
select(func.count(SummaryStatus.id)).where(SummaryStatus.status == SummaryState.DONE)
) or 0
summary_pending = db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.PROCESSING])
summary_done = (
db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status == SummaryState.DONE
)
)
) or 0
summary_failed = db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE])
or 0
)
summary_pending = (
db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_(
[SummaryState.PENDING, SummaryState.PROCESSING]
)
)
)
) or 0
or 0
)
summary_failed = (
db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_(
[SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
)
)
)
or 0
)
return templates.TemplateResponse(
request,
@@ -414,13 +408,8 @@ async def admin_summary_status(
else:
query = query.where(SummaryStatus.status == status)
total = db.scalar(
select(func.count()).select_from(query.subquery())
)
results = (
db.execute(query.offset((page - 1) * per_page).limit(per_page))
.all()
)
total = db.scalar(select(func.count()).select_from(query.subquery()))
results = db.execute(query.offset((page - 1) * per_page).limit(per_page)).all()
# 判断是否 HTMX 请求
is_htmx = request.headers.get("HX-Request") == "true"
@@ -465,7 +454,11 @@ async def admin_summary_retry_failed(
db.execute(
select(Paper.arxiv_id)
.join(SummaryStatus, SummaryStatus.paper_id == Paper.id)
.where(SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]))
.where(
SummaryStatus.status.in_(
[SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
)
)
)
.scalars()
.all()
@@ -477,7 +470,11 @@ async def admin_summary_retry_failed(
# 重置失败任务的状态为 pending
db.execute(
SummaryStatus.__table__.update()
.where(SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]))
.where(
SummaryStatus.status.in_(
[SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
)
)
.values(status=SummaryState.PENDING, error=None, error_type=None)
)
db.commit()
@@ -492,15 +489,6 @@ async def admin_summary_retry_failed(
# ── 论文管理 ────────────────────────────────────────────────────────
# 排序映射
_SORT_MAP = {
"date_desc": Paper.paper_date.desc(),
"date_asc": Paper.paper_date.asc(),
"upvotes_desc": Paper.upvotes.desc(),
"title_asc": Paper.title_en.asc(),
}
@router.get("/papers")
async def admin_papers(
request: Request,
@@ -516,66 +504,18 @@ async def admin_papers(
per_page: int = Query(20, ge=1, le=100),
):
"""论文管理列表页面。"""
query = select(Paper)
# 搜索
if q.strip():
query = query.where(
Paper.title_en.ilike(f"%{q}%")
| Paper.title_zh.ilike(f"%{q}%")
| Paper.abstract.ilike(f"%{q}%")
)
# 日期范围
if date_from:
query = query.where(Paper.paper_date >= date_from)
if date_to:
query = query.where(Paper.paper_date <= date_to)
# 标签筛选
if tag:
query = query.join(PaperTag, PaperTag.paper_id == Paper.id).where(
PaperTag.tag == tag
)
# 总结状态筛选
if summary_status != "all":
if summary_status == "none":
query = query.outerjoin(
SummaryStatus, SummaryStatus.paper_id == Paper.id
).where(SummaryStatus.paper_id == None) # noqa: E711
else:
query = query.join(
SummaryStatus, SummaryStatus.paper_id == Paper.id
).where(SummaryStatus.status == summary_status)
# 排序
order = _SORT_MAP.get(sort, Paper.paper_date.desc())
query = query.order_by(order)
# 计数
total = db.scalar(select(func.count()).select_from(query.subquery()))
# 分页
papers = (
db.execute(query.offset((page - 1) * per_page).limit(per_page))
.scalars()
.all()
papers, total, statuses = admin_svc.query_papers(
db,
q=q,
date_from=date_from,
date_to=date_to,
tag=tag,
summary_status=summary_status,
sort=sort,
page=page,
per_page=per_page,
)
# 获取每篇论文的总结状态
paper_ids = [p.id for p in papers]
statuses = {}
if paper_ids:
rows = db.execute(
select(SummaryStatus.paper_id, SummaryStatus.status).where(
SummaryStatus.paper_id.in_(paper_ids)
)
).all()
paper_id_to_arxiv = {p.id: p.arxiv_id for p in papers}
for pid, st in rows:
statuses[paper_id_to_arxiv.get(pid, "")] = st
# 构建分页 URL 辅助函数
def pagination_url(p: int) -> str:
params = dict(request.query_params)
@@ -588,7 +528,7 @@ async def admin_papers(
{
"papers": papers,
"paper_summary_statuses": statuses,
"total": total or 0,
"total": total,
"page": page,
"per_page": per_page,
"current_status": summary_status,
@@ -615,7 +555,9 @@ async def admin_paper_delete(
# 清理 FTS 索引
try:
db.execute(text("DELETE FROM papers_fts WHERE arxiv_id = :aid"), {"aid": arxiv_id})
db.execute(
text("DELETE FROM papers_fts WHERE arxiv_id = :aid"), {"aid": arxiv_id}
)
db.commit()
except Exception:
logger.warning("Failed to clean FTS index for %s", arxiv_id, exc_info=True)
@@ -646,9 +588,11 @@ async def admin_papers_batch_action(
raise HTTPException(status_code=400, detail="arxiv_ids 不能为空")
if body.action == "delete":
papers = db.execute(
select(Paper).where(Paper.arxiv_id.in_(body.arxiv_ids))
).scalars().all()
papers = (
db.execute(select(Paper).where(Paper.arxiv_id.in_(body.arxiv_ids)))
.scalars()
.all()
)
count = 0
for paper in papers:
@@ -658,21 +602,27 @@ async def admin_papers_batch_action(
# 清理 FTS 索引
try:
db.execute(
text("DELETE FROM papers_fts WHERE arxiv_id IN :ids"),
{"ids": tuple(body.arxiv_ids)},
stmt = text("DELETE FROM papers_fts WHERE arxiv_id IN :ids").bindparams(
bindparam("ids", expanding=True)
)
db.execute(stmt, {"ids": body.arxiv_ids})
db.commit()
except Exception:
logger.warning("Failed to clean FTS index for batch delete", exc_info=True)
return {"status": "success", "message": f"已删除 {count} 篇论文", "count": count}
return {
"status": "success",
"message": f"已删除 {count} 篇论文",
"count": count,
}
elif body.action == "summarize":
# 将选中论文的总结状态重置为 pending
paper_ids = db.execute(
select(Paper.id).where(Paper.arxiv_id.in_(body.arxiv_ids))
).scalars().all()
paper_ids = (
db.execute(select(Paper.id).where(Paper.arxiv_id.in_(body.arxiv_ids)))
.scalars()
.all()
)
if paper_ids:
# 删除旧的 status 记录让其重新进入 pipeline