feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+94 -12
View File
@@ -9,10 +9,18 @@ from sqlalchemy import func, select, text
from sqlalchemy.orm import Session
from app.config import settings
from app.models import CrawlLog, Paper, SummaryState, TaskLock
from app.models import CrawlLog, Paper, PaperTag, SummaryState, SummaryStatus, TaskLock
from app.services.scheduler import get_scheduler
from app.utils import PAPERS_DIR, TMP_DIR
# Sort mapping for admin_papers: translates a sort key string into the
# SQLAlchemy ORDER BY expression applied to the Paper query.
SORT_MAP = {
"date_desc": Paper.paper_date.desc(),
"date_asc": Paper.paper_date.asc(),
"upvotes_desc": Paper.upvotes.desc(),
"title_asc": Paper.title_en.asc(),
}
def _dir_size(path: Path) -> int:
"""递归计算目录总字节数。"""
@@ -52,7 +60,11 @@ def get_admin_stats(db: Session) -> dict:
status_counts = {row[0]: row[1] for row in summary_rows}
# ── 存储概况 ──────────────────────────────────────────────────────
db_size = _fmt_size(settings.db_path.stat().st_size) if settings.db_path.exists() else "0 B"
db_size = (
_fmt_size(settings.db_path.stat().st_size)
if settings.db_path.exists()
else "0 B"
)
papers_size = _fmt_size(_dir_size(PAPERS_DIR))
tmp_size = _fmt_size(_dir_size(TMP_DIR))
@@ -68,22 +80,14 @@ def get_admin_stats(db: Session) -> dict:
# ── 最近日志(5 条) ──────────────────────────────────────────────
recent_logs = (
db.execute(
select(CrawlLog)
.order_by(CrawlLog.started_at.desc())
.limit(5)
)
db.execute(select(CrawlLog).order_by(CrawlLog.started_at.desc()).limit(5))
.scalars()
.all()
)
# ── 活跃锁 ────────────────────────────────────────────────────────
active_locks = (
db.execute(
select(TaskLock).where(TaskLock.status == "running")
)
.scalars()
.all()
db.execute(select(TaskLock).where(TaskLock.status == "running")).scalars().all()
)
return {
@@ -108,3 +112,81 @@ def get_admin_stats(db: Session) -> dict:
"active_locks": active_locks,
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
}
def query_papers(
    db: Session,
    *,
    q: str = "",
    date_from: str | None = None,
    date_to: str | None = None,
    tag: str = "",
    summary_status: str = "all",
    sort: str = "date_desc",
    page: int = 1,
    per_page: int = 20,
) -> tuple[list[Paper], int, dict[str, str]]:
    """Query papers for the admin listing with filtering, sorting and paging.

    Args:
        db: Session used to execute the queries.
        q: Free-text search term. Surrounding whitespace is ignored and the
            term is matched literally (case-insensitive substring) against
            ``Paper.title_en``, ``Paper.title_zh`` and ``Paper.abstract``.
            Blank means no text filter.
        date_from: Inclusive lower bound compared against ``Paper.paper_date``.
        date_to: Inclusive upper bound compared against ``Paper.paper_date``.
        tag: When non-empty, keep only papers having a ``PaperTag`` row whose
            ``tag`` equals this value.
        summary_status: ``"all"`` applies no filter; ``"none"`` keeps papers
            with no ``SummaryStatus`` row; any other value keeps papers whose
            ``SummaryStatus.status`` equals it.
        sort: Key into ``SORT_MAP``; unknown keys fall back to newest
            ``paper_date`` first.
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size; values below 1 are treated as 1.

    Returns:
        ``(papers, total, statuses)``: the papers on the requested page, the
        total number of rows matching the filters, and a mapping
        ``{arxiv_id: summary_status}`` for those papers on the page that
        have a ``SummaryStatus`` row.
    """
    query = select(Paper)

    # Text search: escape LIKE metacharacters so user input such as "100%"
    # or "_" is matched literally instead of acting as a wildcard.
    term = q.strip()
    if term:
        escaped = (
            term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        )
        pattern = f"%{escaped}%"
        query = query.where(
            Paper.title_en.ilike(pattern, escape="\\")
            | Paper.title_zh.ilike(pattern, escape="\\")
            | Paper.abstract.ilike(pattern, escape="\\")
        )

    # Date range (both bounds inclusive).
    if date_from:
        query = query.where(Paper.paper_date >= date_from)
    if date_to:
        query = query.where(Paper.paper_date <= date_to)

    # Tag filter.
    if tag:
        query = query.join(PaperTag, PaperTag.paper_id == Paper.id).where(
            PaperTag.tag == tag
        )

    # Summary status filter.
    if summary_status == "none":
        # Anti-join: keep papers that have no SummaryStatus row at all.
        query = query.outerjoin(
            SummaryStatus, SummaryStatus.paper_id == Paper.id
        ).where(SummaryStatus.paper_id.is_(None))
    elif summary_status != "all":
        query = query.join(
            SummaryStatus, SummaryStatus.paper_id == Paper.id
        ).where(SummaryStatus.status == summary_status)

    # Count before ordering: ORDER BY is irrelevant to the total and only
    # adds work (or is rejected by some backends) inside a subquery.
    total = db.scalar(select(func.count()).select_from(query.subquery()))

    # Ordering. Paper.id is appended as a tie-breaker so rows sharing the same
    # sort value (e.g. the same paper_date) keep a stable order across pages.
    order = SORT_MAP.get(sort, Paper.paper_date.desc())
    query = query.order_by(order, Paper.id)

    # Pagination: clamp so a bad page/per_page never yields a negative
    # OFFSET or a non-positive LIMIT.
    page = max(page, 1)
    per_page = max(per_page, 1)
    papers = (
        db.execute(query.offset((page - 1) * per_page).limit(per_page))
        .scalars()
        .all()
    )

    # Summary status for each paper on this page, keyed by arxiv_id.
    statuses: dict[str, str] = {}
    if papers:
        arxiv_by_paper_id = {p.id: p.arxiv_id for p in papers}
        rows = db.execute(
            select(SummaryStatus.paper_id, SummaryStatus.status).where(
                SummaryStatus.paper_id.in_(list(arxiv_by_paper_id))
            )
        ).all()
        for paper_id, status in rows:
            # Skip any id that is not on this page rather than filing it
            # under a meaningless "" key.
            if paper_id in arxiv_by_paper_id:
                statuses[arxiv_by_paper_id[paper_id]] = status

    return papers, total or 0, statuses