feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor into a two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError, NotFoundError, and ValidationError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+13 -14
View File
@@ -2,6 +2,9 @@
from __future__ import annotations
import pytest
from app.exceptions import NotFoundError, ValidationError
from app.services.user_data import (
get_note,
save_note,
@@ -27,9 +30,8 @@ class TestBookmarkService:
assert result["bookmarked"] is False
def test_toggle_bookmark_not_found(self, db_session):
result = toggle_bookmark(db_session, "nonexistent")
assert "error" in result
assert result["error"] == "not_found"
with pytest.raises(NotFoundError):
toggle_bookmark(db_session, "nonexistent")
# ═══════════════════════════════════════════════════════════════════════
@@ -44,9 +46,8 @@ class TestReadingStatusService:
assert result["arxiv_id"] == "2401.12345"
def test_set_reading_status_invalid(self, db_session, sample_paper):
result = set_reading_status(db_session, "2401.12345", "invalid_status")
assert "error" in result
assert result["error"] == "invalid_status"
with pytest.raises(ValidationError):
set_reading_status(db_session, "2401.12345", "invalid_status")
def test_update_existing_status(self, db_session, sample_paper):
set_reading_status(db_session, "2401.12345", "skimmed")
@@ -54,9 +55,8 @@ class TestReadingStatusService:
assert result["status"] == "read_full"
def test_set_reading_status_not_found(self, db_session):
result = set_reading_status(db_session, "nonexistent", "unread")
assert "error" in result
assert result["error"] == "not_found"
with pytest.raises(NotFoundError):
set_reading_status(db_session, "nonexistent", "unread")
def test_all_valid_statuses(self, db_session, sample_paper):
for status in ("unread", "skimmed", "read_summary", "read_full"):
@@ -93,9 +93,8 @@ class TestNoteService:
assert result is None
def test_save_note_paper_not_found(self, db_session):
result = save_note(db_session, "nonexistent", "内容")
assert "error" in result
assert result["error"] == "not_found"
with pytest.raises(NotFoundError):
save_note(db_session, "nonexistent", "内容")
# ═══════════════════════════════════════════════════════════════════════
@@ -143,12 +142,12 @@ class TestUserDataRoutes:
assert data["status"] == "read_summary"
def test_reading_status_invalid(self, client, sample_paper):
"""无效状态返回 422"""
"""无效状态返回 400 (ValidationError)"""
resp = client.post(
"/api/reading-status/2401.12345",
json={"status": "invalid"},
)
assert resp.status_code == 422
assert resp.status_code == 400
def test_reading_status_not_found(self, client):
"""不存在的论文返回 404。"""