feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -5,7 +5,15 @@ from __future__ import annotations
 from sqlalchemy import or_, select
 from sqlalchemy.orm import Session, joinedload

-from app.models import PAPER_FULL_LOAD, Paper, PaperTag, UserBookmark, UserNote, UserReadingStatus
+from app.exceptions import NotFoundError, ValidationError
+from app.models import (
+    PAPER_FULL_LOAD,
+    Paper,
+    PaperTag,
+    UserBookmark,
+    UserNote,
+    UserReadingStatus,
+)
 from app.utils import utc_now

 # ── 收藏 ──────────────────────────────────────────────────────────────
@@ -13,9 +21,11 @@ from app.utils import utc_now

 def toggle_bookmark(db: Session, arxiv_id: str) -> dict:
    """切换收藏状态。返回 {"bookmarked": bool, "arxiv_id": str}。"""
-    paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
+    paper = db.execute(
+        select(Paper).where(Paper.arxiv_id == arxiv_id)
+    ).scalar_one_or_none()
    if not paper:
-        return {"error": "not_found"}
+        raise NotFoundError(f"Paper not found: {arxiv_id}")

    existing = db.execute(
        select(UserBookmark).where(UserBookmark.paper_id == paper.id)
@@ -42,11 +52,15 @@ VALID_STATUSES = {"unread", "skimmed", "read_summary", "read_full"}
 def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict:
    """设置阅读状态。status 必须是 unread/skimmed/read_summary/read_full。"""
    if status not in VALID_STATUSES:
-        return {"error": "invalid_status", "valid": sorted(VALID_STATUSES)}
+        raise ValidationError(
+            f"Invalid reading status: {status}. Valid: {', '.join(sorted(VALID_STATUSES))}"
+        )

-    paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
+    paper = db.execute(
+        select(Paper).where(Paper.arxiv_id == arxiv_id)
+    ).scalar_one_or_none()
    if not paper:
-        return {"error": "not_found"}
+        raise NotFoundError(f"Paper not found: {arxiv_id}")

    now = utc_now()
    existing = db.execute(
@@ -72,7 +86,9 @@ def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict:

 def get_note(db: Session, arxiv_id: str) -> dict | None:
    """获取笔记。返回 {"arxiv_id", "content", "updated_at"} 或 None（论文不存在时）。"""
-    paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
+    paper = db.execute(
+        select(Paper).where(Paper.arxiv_id == arxiv_id)
+    ).scalar_one_or_none()
    if not paper:
        return None

@@ -91,9 +107,11 @@ def get_note(db: Session, arxiv_id: str) -> dict | None:

 def save_note(db: Session, arxiv_id: str, content: str) -> dict:
    """创建或更新笔记。返回 {"arxiv_id", "content", "updated_at"}。"""
-    paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
+    paper = db.execute(
+        select(Paper).where(Paper.arxiv_id == arxiv_id)
+    ).scalar_one_or_none()
    if not paper:
-        return {"error": "not_found"}
+        raise NotFoundError(f"Paper not found: {arxiv_id}")

    now = utc_now()
    existing = db.execute(
@@ -154,8 +172,7 @@ def query_reading_list(
            stmt.options(
                joinedload(Paper.note),
                *PAPER_FULL_LOAD,
-            )
-            .order_by(Paper.paper_date.desc(), Paper.upvotes.desc())
+            ).order_by(Paper.paper_date.desc(), Paper.upvotes.desc())
        )
        .unique()
        .scalars()