refactor: restructure services and add image/pdf extraction utilities

- Add image_extractor, pdf_downloader, pi_client, trends services
- Add shared utils module
- Refactor summarizer, embedder, routes for cleaner separation
- Update tests to match new service structure
This commit is contained in:
2026-06-06 00:00:55 +08:00
parent ba9afa212c
commit 85c4cfb9e8
22 changed files with 843 additions and 780 deletions
+45 -37
View File
@@ -27,14 +27,7 @@ from app.services.schemas import (
flatten_for_db,
)
from app.services.summarizer import (
JsonNotFoundError,
PdfDownloadError,
PiProcessError,
PiTimeoutError,
_call_pi,
_classify_error,
_cleanup_tmp,
_extract_json,
_save_files,
_save_raw_output_only,
_update_summary_in_db,
@@ -42,6 +35,17 @@ from app.services.summarizer import (
summarize_one,
summarize_single,
)
from app.services.pi_client import (
JsonNotFoundError,
PiProcessError,
PiTimeoutError,
call_pi as _call_pi,
extract_json as _extract_json,
)
from app.services.pdf_downloader import (
PdfDownloadError,
cleanup_tmp as _cleanup_tmp,
)
# ═══════════════════════════════════════════════════════════════════════
@@ -287,7 +291,7 @@ class TestFileOperations:
def test_save_files(self, tmp_path, sample_summary_dict):
schema = SummarySchema.model_validate(sample_summary_dict)
with patch("app.services.summarizer._PAPERS_DIR", tmp_path):
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
_save_files("2401.12345", schema, "raw output text")
paper_dir = tmp_path / "2401.12345"
@@ -297,7 +301,7 @@ class TestFileOperations:
assert saved["title_zh"] == "测试论文中文标题"
def test_save_raw_output_only(self, tmp_path):
with patch("app.services.summarizer._PAPERS_DIR", tmp_path):
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
_save_raw_output_only("2401.12345", "raw output")
paper_dir = tmp_path / "2401.12345"
assert (paper_dir / "raw_output.txt").exists()
@@ -307,13 +311,13 @@ class TestFileOperations:
tmp_paper = tmp_path / "2401.12345"
tmp_paper.mkdir()
(tmp_paper / "paper.pdf").write_bytes(b"%PDF-fake")
with patch("app.services.summarizer._TMP_DIR", tmp_path):
with patch("app.services.pdf_downloader.TMP_DIR", tmp_path):
_cleanup_tmp("2401.12345")
assert not tmp_paper.exists()
def test_cleanup_tmp_nonexistent(self, tmp_path):
"""清理不存在的目录不报错。"""
with patch("app.services.summarizer._TMP_DIR", tmp_path):
with patch("app.services.pdf_downloader.TMP_DIR", tmp_path):
_cleanup_tmp("nonexistent") # 不抛异常
@@ -329,9 +333,11 @@ class TestSummarizeOneFlow:
def _patch_paths(self, tmp_path):
"""将 data 目录重定向到 tmp_path。"""
with (
patch("app.services.summarizer._PAPERS_DIR", tmp_path / "papers"),
patch("app.services.summarizer._TMP_DIR", tmp_path / "tmp"),
patch("app.services.summarizer._DATA_DIR", tmp_path),
patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / "papers" / aid),
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
patch("app.utils.TMP_DIR", tmp_path / "tmp"),
):
yield
@@ -341,8 +347,8 @@ class TestSummarizeOneFlow:
):
"""pending → processing → done 全流程。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
):
result = await summarize_one(db_session, sample_paper)
@@ -374,7 +380,7 @@ class TestSummarizeOneFlow:
"""PDF 下载失败 → error_type=pdf_download_failed,tmp 被清理。"""
with (
patch(
"app.services.summarizer._download_pdf",
"app.services.summarizer.download_pdf",
new_callable=AsyncMock,
side_effect=PdfDownloadError("network error"),
),
@@ -392,9 +398,9 @@ class TestSummarizeOneFlow:
async def test_pi_timeout(self, db_session, sample_paper, _patch_paths):
"""pi 超时 → timeout 错误,retry_count 递增。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer._call_pi",
"app.services.summarizer.call_pi",
new_callable=AsyncMock,
side_effect=PiTimeoutError("timeout after 300s"),
),
@@ -409,9 +415,9 @@ class TestSummarizeOneFlow:
async def test_json_not_found(self, db_session, sample_paper, _patch_paths):
"""pi 输出无 JSON → json_not_found。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer._call_pi",
"app.services.summarizer.call_pi",
new_callable=AsyncMock,
return_value="No JSON in this output at all.",
),
@@ -436,9 +442,9 @@ class TestSummarizeOneFlow:
bad_output = f"```json\n{bad_json}\n```"
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer._call_pi",
"app.services.summarizer.call_pi",
new_callable=AsyncMock,
return_value=bad_output,
),
@@ -464,9 +470,9 @@ class TestSummarizeOneFlow:
):
"""失败时仍保存 raw_output.txt。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer._call_pi",
"app.services.summarizer.call_pi",
new_callable=AsyncMock,
return_value="Some output without JSON",
),
@@ -483,8 +489,8 @@ class TestSummarizeOneFlow:
):
"""成功后清理 tmp 目录。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
):
await summarize_one(db_session, sample_paper)
@@ -498,7 +504,7 @@ class TestSummarizeOneFlow:
"""失败后也清理 tmp 目录。"""
with (
patch(
"app.services.summarizer._download_pdf",
"app.services.summarizer.download_pdf",
new_callable=AsyncMock,
side_effect=PdfDownloadError("fail"),
),
@@ -529,9 +535,11 @@ class TestBatchSummarize:
@pytest.fixture
def _patch_paths(self, tmp_path):
with (
patch("app.services.summarizer._PAPERS_DIR", tmp_path / "papers"),
patch("app.services.summarizer._TMP_DIR", tmp_path / "tmp"),
patch("app.services.summarizer._DATA_DIR", tmp_path),
patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / "papers" / aid),
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
patch("app.utils.TMP_DIR", tmp_path / "tmp"),
):
yield
@@ -561,8 +569,8 @@ class TestBatchSummarize:
_TestSession = _sm(bind=db_engine, autoflush=False, autocommit=False)
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
):
result = await summarize_batch(
db_session, _session_factory=_TestSession
@@ -612,8 +620,8 @@ class TestBatchSummarize:
return mock_pi_output
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", side_effect=_mock_call_pi),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", side_effect=_mock_call_pi),
):
result = await summarize_batch(
db_session, _session_factory=_TestSession
@@ -646,8 +654,8 @@ class TestBatchSummarize:
_TestSession = _sm(bind=db_engine, autoflush=False, autocommit=False)
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
):
await summarize_batch(
db_session, _session_factory=_TestSession