feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+35 -22
View File
@@ -23,12 +23,8 @@ from app.services.pdf_downloader import (
)
from app.services.pi_client import PiTimeoutError
from app.services.schemas import SummarySchema
from app.services.summarizer import (
_save_files,
_update_summary_in_db,
summarize_batch,
summarize_one,
)
from app.services.summarizer import summarize_batch, summarize_one
from app.services.summary_persister import _save_files, _update_summary_in_db
from app.utils import utc_now
@@ -39,7 +35,14 @@ from app.utils import utc_now
def _summarize_tmp_paths(tmp_path):
"""将 data 目录重定向到 tmp_path(供 summarizer 测试使用)。"""
with (
patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / "papers" / aid),
patch(
"app.services.summary_persister.paper_dir",
lambda aid: tmp_path / "papers" / aid,
),
patch(
"app.services.summary_generator.paper_dir",
lambda aid: tmp_path / "papers" / aid,
),
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
@@ -134,7 +137,9 @@ class TestFileOperations:
def test_save_files(self, tmp_path, sample_summary_dict):
schema = SummarySchema.model_validate(sample_summary_dict)
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
with patch(
"app.services.summary_persister.paper_dir", lambda aid: tmp_path / aid
):
_save_files("2401.12345", schema, "raw output text")
paper_dir = tmp_path / "2401.12345"
@@ -144,7 +149,9 @@ class TestFileOperations:
assert saved["title_zh"] == "测试论文中文标题"
def test_save_raw_output_only(self, tmp_path):
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
with patch(
"app.services.summary_persister.paper_dir", lambda aid: tmp_path / aid
):
_save_files("2401.12345", None, "raw output")
paper_dir = tmp_path / "2401.12345"
assert (paper_dir / "raw_output.txt").exists()
@@ -180,7 +187,7 @@ class TestSummarizeOneFlow:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer.call_pi",
"app.services.summary_generator.call_pi",
new_callable=AsyncMock,
return_value=(mock_pi_output, "test-session-id"),
),
@@ -209,7 +216,9 @@ class TestSummarizeOneFlow:
assert fts_row[0] == "测试论文中文标题"
@pytest.mark.asyncio
async def test_pdf_download_failure(self, db_session, sample_paper, _summarize_tmp_paths):
async def test_pdf_download_failure(
self, db_session, sample_paper, _summarize_tmp_paths
):
"""PDF 下载失败 → error_type=pdf_download_failed,tmp 被清理。"""
with (
patch(
@@ -233,7 +242,7 @@ class TestSummarizeOneFlow:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer.call_pi",
"app.services.summary_generator.call_pi",
new_callable=AsyncMock,
side_effect=PiTimeoutError("timeout after 300s"),
),
@@ -250,7 +259,7 @@ class TestSummarizeOneFlow:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer.call_pi",
"app.services.summary_generator.call_pi",
new_callable=AsyncMock,
return_value=("No JSON in this output at all.", "test-session-id"),
),
@@ -281,7 +290,7 @@ class TestSummarizeOneFlow:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer.call_pi",
"app.services.summary_generator.call_pi",
new_callable=AsyncMock,
return_value=(bad_output, "test-session-id"),
),
@@ -300,7 +309,7 @@ class TestSummarizeOneFlow:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer.call_pi",
"app.services.summary_generator.call_pi",
new_callable=AsyncMock,
return_value=("Some output without JSON", "test-session-id"),
),
@@ -319,7 +328,7 @@ class TestSummarizeOneFlow:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer.call_pi",
"app.services.summary_generator.call_pi",
new_callable=AsyncMock,
return_value=(mock_pi_output, "test-session-id"),
),
@@ -347,7 +356,9 @@ class TestSummarizeOneFlow:
assert not tmp_paper.exists()
@pytest.mark.asyncio
async def test_skips_done_paper(self, db_session, sample_paper, _summarize_tmp_paths):
async def test_skips_done_paper(
self, db_session, sample_paper, _summarize_tmp_paths
):
"""已完成的论文跳过。"""
sample_paper.summary_status.status = "done"
db_session.commit()
@@ -393,7 +404,7 @@ class TestBatchSummarize:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer.call_pi",
"app.services.summary_generator.call_pi",
new_callable=AsyncMock,
return_value=(mock_pi_output, "test-session-id"),
),
@@ -446,7 +457,7 @@ class TestBatchSummarize:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", side_effect=_mock_call_pi),
patch("app.services.summary_generator.call_pi", side_effect=_mock_call_pi),
):
result = await summarize_batch(db_session, _session_factory=_TestSession)
@@ -456,6 +467,8 @@ class TestBatchSummarize:
@pytest.mark.asyncio
async def test_task_lock_conflict(self, db_session, _summarize_tmp_paths):
"""TaskLock 防止并发 batch。"""
from app.exceptions import ConflictError
# 先插入一个 running 锁
db_session.add(
TaskLock(
@@ -467,8 +480,8 @@ class TestBatchSummarize:
)
db_session.commit()
result = await summarize_batch(db_session)
assert result["status"] == "conflict"
with pytest.raises(ConflictError):
await summarize_batch(db_session)
@pytest.mark.asyncio
async def test_task_lock_released(
@@ -482,7 +495,7 @@ class TestBatchSummarize:
with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer.call_pi",
"app.services.summary_generator.call_pi",
new_callable=AsyncMock,
return_value=(mock_pi_output, "test-session-id"),
),