feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
+35
-22
@@ -23,12 +23,8 @@ from app.services.pdf_downloader import (
|
||||
)
|
||||
from app.services.pi_client import PiTimeoutError
|
||||
from app.services.schemas import SummarySchema
|
||||
from app.services.summarizer import (
|
||||
_save_files,
|
||||
_update_summary_in_db,
|
||||
summarize_batch,
|
||||
summarize_one,
|
||||
)
|
||||
from app.services.summarizer import summarize_batch, summarize_one
|
||||
from app.services.summary_persister import _save_files, _update_summary_in_db
|
||||
from app.utils import utc_now
|
||||
|
||||
|
||||
@@ -39,7 +35,14 @@ from app.utils import utc_now
|
||||
def _summarize_tmp_paths(tmp_path):
|
||||
"""将 data 目录重定向到 tmp_path(供 summarizer 测试使用)。"""
|
||||
with (
|
||||
patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / "papers" / aid),
|
||||
patch(
|
||||
"app.services.summary_persister.paper_dir",
|
||||
lambda aid: tmp_path / "papers" / aid,
|
||||
),
|
||||
patch(
|
||||
"app.services.summary_generator.paper_dir",
|
||||
lambda aid: tmp_path / "papers" / aid,
|
||||
),
|
||||
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
|
||||
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
|
||||
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
|
||||
@@ -134,7 +137,9 @@ class TestFileOperations:
|
||||
|
||||
def test_save_files(self, tmp_path, sample_summary_dict):
|
||||
schema = SummarySchema.model_validate(sample_summary_dict)
|
||||
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
|
||||
with patch(
|
||||
"app.services.summary_persister.paper_dir", lambda aid: tmp_path / aid
|
||||
):
|
||||
_save_files("2401.12345", schema, "raw output text")
|
||||
|
||||
paper_dir = tmp_path / "2401.12345"
|
||||
@@ -144,7 +149,9 @@ class TestFileOperations:
|
||||
assert saved["title_zh"] == "测试论文中文标题"
|
||||
|
||||
def test_save_raw_output_only(self, tmp_path):
|
||||
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
|
||||
with patch(
|
||||
"app.services.summary_persister.paper_dir", lambda aid: tmp_path / aid
|
||||
):
|
||||
_save_files("2401.12345", None, "raw output")
|
||||
paper_dir = tmp_path / "2401.12345"
|
||||
assert (paper_dir / "raw_output.txt").exists()
|
||||
@@ -180,7 +187,7 @@ class TestSummarizeOneFlow:
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch(
|
||||
"app.services.summarizer.call_pi",
|
||||
"app.services.summary_generator.call_pi",
|
||||
new_callable=AsyncMock,
|
||||
return_value=(mock_pi_output, "test-session-id"),
|
||||
),
|
||||
@@ -209,7 +216,9 @@ class TestSummarizeOneFlow:
|
||||
assert fts_row[0] == "测试论文中文标题"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_pdf_download_failure(self, db_session, sample_paper, _summarize_tmp_paths):
|
||||
async def test_pdf_download_failure(
|
||||
self, db_session, sample_paper, _summarize_tmp_paths
|
||||
):
|
||||
"""PDF 下载失败 → error_type=pdf_download_failed,tmp 被清理。"""
|
||||
with (
|
||||
patch(
|
||||
@@ -233,7 +242,7 @@ class TestSummarizeOneFlow:
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch(
|
||||
"app.services.summarizer.call_pi",
|
||||
"app.services.summary_generator.call_pi",
|
||||
new_callable=AsyncMock,
|
||||
side_effect=PiTimeoutError("timeout after 300s"),
|
||||
),
|
||||
@@ -250,7 +259,7 @@ class TestSummarizeOneFlow:
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch(
|
||||
"app.services.summarizer.call_pi",
|
||||
"app.services.summary_generator.call_pi",
|
||||
new_callable=AsyncMock,
|
||||
return_value=("No JSON in this output at all.", "test-session-id"),
|
||||
),
|
||||
@@ -281,7 +290,7 @@ class TestSummarizeOneFlow:
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch(
|
||||
"app.services.summarizer.call_pi",
|
||||
"app.services.summary_generator.call_pi",
|
||||
new_callable=AsyncMock,
|
||||
return_value=(bad_output, "test-session-id"),
|
||||
),
|
||||
@@ -300,7 +309,7 @@ class TestSummarizeOneFlow:
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch(
|
||||
"app.services.summarizer.call_pi",
|
||||
"app.services.summary_generator.call_pi",
|
||||
new_callable=AsyncMock,
|
||||
return_value=("Some output without JSON", "test-session-id"),
|
||||
),
|
||||
@@ -319,7 +328,7 @@ class TestSummarizeOneFlow:
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch(
|
||||
"app.services.summarizer.call_pi",
|
||||
"app.services.summary_generator.call_pi",
|
||||
new_callable=AsyncMock,
|
||||
return_value=(mock_pi_output, "test-session-id"),
|
||||
),
|
||||
@@ -347,7 +356,9 @@ class TestSummarizeOneFlow:
|
||||
assert not tmp_paper.exists()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_skips_done_paper(self, db_session, sample_paper, _summarize_tmp_paths):
|
||||
async def test_skips_done_paper(
|
||||
self, db_session, sample_paper, _summarize_tmp_paths
|
||||
):
|
||||
"""已完成的论文跳过。"""
|
||||
sample_paper.summary_status.status = "done"
|
||||
db_session.commit()
|
||||
@@ -393,7 +404,7 @@ class TestBatchSummarize:
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch(
|
||||
"app.services.summarizer.call_pi",
|
||||
"app.services.summary_generator.call_pi",
|
||||
new_callable=AsyncMock,
|
||||
return_value=(mock_pi_output, "test-session-id"),
|
||||
),
|
||||
@@ -446,7 +457,7 @@ class TestBatchSummarize:
|
||||
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch("app.services.summarizer.call_pi", side_effect=_mock_call_pi),
|
||||
patch("app.services.summary_generator.call_pi", side_effect=_mock_call_pi),
|
||||
):
|
||||
result = await summarize_batch(db_session, _session_factory=_TestSession)
|
||||
|
||||
@@ -456,6 +467,8 @@ class TestBatchSummarize:
|
||||
@pytest.mark.asyncio
|
||||
async def test_task_lock_conflict(self, db_session, _summarize_tmp_paths):
|
||||
"""TaskLock 防止并发 batch。"""
|
||||
from app.exceptions import ConflictError
|
||||
|
||||
# 先插入一个 running 锁
|
||||
db_session.add(
|
||||
TaskLock(
|
||||
@@ -467,8 +480,8 @@ class TestBatchSummarize:
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
result = await summarize_batch(db_session)
|
||||
assert result["status"] == "conflict"
|
||||
with pytest.raises(ConflictError):
|
||||
await summarize_batch(db_session)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_task_lock_released(
|
||||
@@ -482,7 +495,7 @@ class TestBatchSummarize:
|
||||
with (
|
||||
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
|
||||
patch(
|
||||
"app.services.summarizer.call_pi",
|
||||
"app.services.summary_generator.call_pi",
|
||||
new_callable=AsyncMock,
|
||||
return_value=(mock_pi_output, "test-session-id"),
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user