feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+28 -11
View File
@@ -5,7 +5,15 @@ from __future__ import annotations
from sqlalchemy import or_, select
from sqlalchemy.orm import Session, joinedload
from app.models import PAPER_FULL_LOAD, Paper, PaperTag, UserBookmark, UserNote, UserReadingStatus
from app.exceptions import NotFoundError, ValidationError
from app.models import (
PAPER_FULL_LOAD,
Paper,
PaperTag,
UserBookmark,
UserNote,
UserReadingStatus,
)
from app.utils import utc_now
# ── 收藏 ──────────────────────────────────────────────────────────────
@@ -13,9 +21,11 @@ from app.utils import utc_now
def toggle_bookmark(db: Session, arxiv_id: str) -> dict:
"""切换收藏状态。返回 {"bookmarked": bool, "arxiv_id": str}。"""
paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
paper = db.execute(
select(Paper).where(Paper.arxiv_id == arxiv_id)
).scalar_one_or_none()
if not paper:
return {"error": "not_found"}
raise NotFoundError(f"Paper not found: {arxiv_id}")
existing = db.execute(
select(UserBookmark).where(UserBookmark.paper_id == paper.id)
@@ -42,11 +52,15 @@ VALID_STATUSES = {"unread", "skimmed", "read_summary", "read_full"}
def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict:
"""设置阅读状态。status 必须是 unread/skimmed/read_summary/read_full。"""
if status not in VALID_STATUSES:
return {"error": "invalid_status", "valid": sorted(VALID_STATUSES)}
raise ValidationError(
f"Invalid reading status: {status}. Valid: {', '.join(sorted(VALID_STATUSES))}"
)
paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
paper = db.execute(
select(Paper).where(Paper.arxiv_id == arxiv_id)
).scalar_one_or_none()
if not paper:
return {"error": "not_found"}
raise NotFoundError(f"Paper not found: {arxiv_id}")
now = utc_now()
existing = db.execute(
@@ -72,7 +86,9 @@ def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict:
def get_note(db: Session, arxiv_id: str) -> dict | None:
"""获取笔记。返回 {"arxiv_id", "content", "updated_at"} 或 None(论文不存在时)。"""
paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
paper = db.execute(
select(Paper).where(Paper.arxiv_id == arxiv_id)
).scalar_one_or_none()
if not paper:
return None
@@ -91,9 +107,11 @@ def get_note(db: Session, arxiv_id: str) -> dict | None:
def save_note(db: Session, arxiv_id: str, content: str) -> dict:
"""创建或更新笔记。返回 {"arxiv_id", "content", "updated_at"}。"""
paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
paper = db.execute(
select(Paper).where(Paper.arxiv_id == arxiv_id)
).scalar_one_or_none()
if not paper:
return {"error": "not_found"}
raise NotFoundError(f"Paper not found: {arxiv_id}")
now = utc_now()
existing = db.execute(
@@ -154,8 +172,7 @@ def query_reading_list(
stmt.options(
joinedload(Paper.note),
*PAPER_FULL_LOAD,
)
.order_by(Paper.paper_date.desc(), Paper.upvotes.desc())
).order_by(Paper.paper_date.desc(), Paper.upvotes.desc())
)
.unique()
.scalars()