refactor: restructure services and add image/pdf extraction utilities

- Add image_extractor, pdf_downloader, pi_client, trends services
- Add shared utils module
- Refactor summarizer, embedder, routes for cleaner separation
- Update tests to match new service structure
This commit is contained in:
2026-06-06 00:00:55 +08:00
parent ba9afa212c
commit 85c4cfb9e8
22 changed files with 843 additions and 780 deletions
+1 -1
View File
@@ -15,13 +15,13 @@ from sqlalchemy.pool import StaticPool
from app.database import get_db
from app.main import create_app
from app.database import init_db
from app.models import (
Paper,
PaperAuthor,
PaperSummary,
PaperTag,
SummaryStatus,
init_db,
)
+5 -5
View File
@@ -141,7 +141,7 @@ class TestCleanupTmp:
old_mtime = time.time() - 25 * 3600
os.utime(old_dir, (old_mtime, old_mtime))
monkeypatch.setattr("app.services.cleaner._TMP_DIR", tmp_dir)
monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp()
@@ -158,7 +158,7 @@ class TestCleanupTmp:
recent_dir.mkdir()
(recent_dir / "paper.pdf").write_text("fake pdf")
monkeypatch.setattr("app.services.cleaner._TMP_DIR", tmp_dir)
monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp()
@@ -168,7 +168,7 @@ class TestCleanupTmp:
def test_cleanup_empty_dir(self, tmp_path, monkeypatch):
"""data/tmp/ 不存在时安全返回。"""
monkeypatch.setattr("app.services.cleaner._TMP_DIR", tmp_path / "nonexistent")
monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_path / "nonexistent")
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp()
assert result["scanned"] == 0
@@ -187,7 +187,7 @@ class TestCleanupTmp:
recent_dir = tmp_dir / "2401.new"
recent_dir.mkdir()
monkeypatch.setattr("app.services.cleaner._TMP_DIR", tmp_dir)
monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp()
@@ -318,7 +318,7 @@ class TestDeletePapersByDateRange:
(papers_dir / "2401.10001").mkdir()
(papers_dir / "2401.10001" / "meta.json").write_text("{}")
monkeypatch.setattr("app.services.cleaner._PAPERS_DIR", papers_dir)
monkeypatch.setattr("app.services.cleaner.PAPERS_DIR", papers_dir)
result = await delete_papers_by_date_range(
db_session,
+42 -39
View File
@@ -125,10 +125,9 @@ class TestEmbedderInit:
"""CHROMA_ENABLED=false 时不初始化。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
emb.init_chroma()
assert emb._client is None
assert emb._chroma._client is None
def test_chroma_init_success(self, monkeypatch, tmp_path):
"""CHROMA_ENABLED=true 时初始化成功。"""
@@ -136,23 +135,20 @@ class TestEmbedderInit:
monkeypatch.setattr(settings, "CHROMA_DIR", str(tmp_path / "chroma"))
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
emb.init_chroma()
assert emb._client is not None
assert emb._collection is not None
assert emb._chroma._client is not None
assert emb._chroma._collection is not None
# 清理
emb._client = None
emb._collection = None
emb._chroma.reset()
def test_get_collection_returns_none_when_disabled(self, monkeypatch):
"""CHROMA_ENABLED=false 时 get_collection 返回 None。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
assert emb.get_collection() is None
@@ -163,8 +159,7 @@ class TestEmbedderIndexing:
"""CHROMA_ENABLED=false 时 index_paper 返回 False。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
assert emb.index_paper("test-id") is False
def test_index_paper_no_api_config(self, monkeypatch, tmp_path):
@@ -175,22 +170,19 @@ class TestEmbedderIndexing:
monkeypatch.setattr(settings, "EMBED_MODEL", "")
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
emb.init_chroma()
result = emb.index_paper("test-id", {"title_zh": "测试", "title_en": "Test"})
assert result is False
emb._client = None
emb._collection = None
emb._chroma.reset()
def test_index_batch_disabled(self, monkeypatch):
"""CHROMA_ENABLED=false 时 index_batch 返回全失败。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
result = emb.index_batch(["a", "b"])
assert result["success"] == 0
assert result["failed"] == 2
@@ -206,16 +198,14 @@ class TestEmbedderIndexing:
"""CHROMA_ENABLED=false 时 delete_paper 返回 False。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
assert emb.delete_paper("test-id") is False
def test_search_similar_disabled(self, monkeypatch):
"""CHROMA_ENABLED=false 时 search_similar 返回空列表。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
assert emb.search_similar("test query") == []
@@ -427,8 +417,8 @@ class TestTrendsDashboard:
from unittest.mock import patch as upatch
import app.routes.trends as trends_mod
# monkeypatch _get_trends_data 中的 date.today
with upatch("app.routes.trends.date") as mock_date:
# monkeypatch get_trends_data 中的 date.today
with upatch("app.services.trends.date") as mock_date:
mock_date.today.return_value = date(2024, 1, 20)
mock_date.side_effect = lambda *a, **kw: date(*a, **kw)
@@ -528,15 +518,17 @@ class TestImageExtraction:
@pytest.mark.asyncio
async def test_extract_images_from_source_no_dir(self, monkeypatch, tmp_path):
"""源码目录不存在时返回 0。"""
monkeypatch.setattr("app.services.summarizer._tmp_dir", lambda x: tmp_path / "tmp" / x)
monkeypatch.setattr("app.services.summarizer._paper_dir", lambda x: tmp_path / "papers" / x)
from app.services.summarizer import _extract_images_from_source
result = await _extract_images_from_source("2401.99999")
monkeypatch.setattr("app.services.pdf_downloader.tmp_dir", lambda x: tmp_path / "tmp" / x)
monkeypatch.setattr("app.services.pdf_downloader.paper_dir", lambda x: tmp_path / "papers" / x)
from app.services.image_extractor import extract_images_from_source
result = await extract_images_from_source("2401.99999")
assert result == 0
@pytest.mark.asyncio
async def test_extract_images_from_tex(self, monkeypatch, tmp_path):
"""从 .tex 文件中提取图片。"""
from app.services.image_extractor import extract_images_from_source
tmp_source = tmp_path / "tmp" / "2401.00001" / "source"
tmp_source.mkdir(parents=True)
@@ -559,11 +551,16 @@ class TestImageExtraction:
(tmp_source / "main.tex").write_text(tex_content)
papers_dir = tmp_path / "papers" / "2401.00001"
monkeypatch.setattr("app.services.summarizer._tmp_dir", lambda x: tmp_path / "tmp" / x)
monkeypatch.setattr("app.services.summarizer._paper_dir", lambda x: tmp_path / "papers" / x)
monkeypatch.setattr("app.services.image_extractor.tmp_dir", lambda x: tmp_path / "tmp" / x)
monkeypatch.setattr("app.services.image_extractor.paper_dir", lambda x: tmp_path / "papers" / x)
from app.services.summarizer import _extract_images_from_source
result = await _extract_images_from_source("2401.00001")
# Mock download_source_zip to avoid real network call (source dir already exists)
async def _noop_download(*args, **kwargs):
pass
monkeypatch.setattr("app.services.image_extractor.download_source_zip", _noop_download)
result = await extract_images_from_source("2401.00001")
assert result == 2
dest_images = papers_dir / "images"
@@ -574,15 +571,22 @@ class TestImageExtraction:
@pytest.mark.asyncio
async def test_extract_images_empty_tex(self, monkeypatch, tmp_path):
""".tex 文件无图片时返回 0。"""
from app.services.image_extractor import extract_images_from_source
tmp_source = tmp_path / "tmp" / "2401.00002" / "source"
tmp_source.mkdir(parents=True)
(tmp_source / "main.tex").write_text(r"\documentclass{article}\begin{document}Hello\end{document}")
monkeypatch.setattr("app.services.summarizer._tmp_dir", lambda x: tmp_path / "tmp" / x)
monkeypatch.setattr("app.services.summarizer._paper_dir", lambda x: tmp_path / "papers" / x)
monkeypatch.setattr("app.services.image_extractor.tmp_dir", lambda x: tmp_path / "tmp" / x)
monkeypatch.setattr("app.services.image_extractor.paper_dir", lambda x: tmp_path / "papers" / x)
from app.services.summarizer import _extract_images_from_source
result = await _extract_images_from_source("2401.00002")
# Mock download_source_zip to avoid real network call
async def _noop_download(*args, **kwargs):
pass
monkeypatch.setattr("app.services.image_extractor.download_source_zip", _noop_download)
result = await extract_images_from_source("2401.00002")
assert result == 0
@@ -644,8 +648,7 @@ class TestGracefulDegradation:
"""CHROMA 关闭时删除论文正常工作。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
emb._client = None
emb._collection = None
emb._chroma.reset()
from app.services.cleaner import delete_papers_by_date_range
result = await delete_papers_by_date_range(
+45 -37
View File
@@ -27,14 +27,7 @@ from app.services.schemas import (
flatten_for_db,
)
from app.services.summarizer import (
JsonNotFoundError,
PdfDownloadError,
PiProcessError,
PiTimeoutError,
_call_pi,
_classify_error,
_cleanup_tmp,
_extract_json,
_save_files,
_save_raw_output_only,
_update_summary_in_db,
@@ -42,6 +35,17 @@ from app.services.summarizer import (
summarize_one,
summarize_single,
)
from app.services.pi_client import (
JsonNotFoundError,
PiProcessError,
PiTimeoutError,
call_pi as _call_pi,
extract_json as _extract_json,
)
from app.services.pdf_downloader import (
PdfDownloadError,
cleanup_tmp as _cleanup_tmp,
)
# ═══════════════════════════════════════════════════════════════════════
@@ -287,7 +291,7 @@ class TestFileOperations:
def test_save_files(self, tmp_path, sample_summary_dict):
schema = SummarySchema.model_validate(sample_summary_dict)
with patch("app.services.summarizer._PAPERS_DIR", tmp_path):
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
_save_files("2401.12345", schema, "raw output text")
paper_dir = tmp_path / "2401.12345"
@@ -297,7 +301,7 @@ class TestFileOperations:
assert saved["title_zh"] == "测试论文中文标题"
def test_save_raw_output_only(self, tmp_path):
with patch("app.services.summarizer._PAPERS_DIR", tmp_path):
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
_save_raw_output_only("2401.12345", "raw output")
paper_dir = tmp_path / "2401.12345"
assert (paper_dir / "raw_output.txt").exists()
@@ -307,13 +311,13 @@ class TestFileOperations:
tmp_paper = tmp_path / "2401.12345"
tmp_paper.mkdir()
(tmp_paper / "paper.pdf").write_bytes(b"%PDF-fake")
with patch("app.services.summarizer._TMP_DIR", tmp_path):
with patch("app.services.pdf_downloader.TMP_DIR", tmp_path):
_cleanup_tmp("2401.12345")
assert not tmp_paper.exists()
def test_cleanup_tmp_nonexistent(self, tmp_path):
"""清理不存在的目录不报错。"""
with patch("app.services.summarizer._TMP_DIR", tmp_path):
with patch("app.services.pdf_downloader.TMP_DIR", tmp_path):
_cleanup_tmp("nonexistent") # 不抛异常
@@ -329,9 +333,11 @@ class TestSummarizeOneFlow:
def _patch_paths(self, tmp_path):
"""将 data 目录重定向到 tmp_path。"""
with (
patch("app.services.summarizer._PAPERS_DIR", tmp_path / "papers"),
patch("app.services.summarizer._TMP_DIR", tmp_path / "tmp"),
patch("app.services.summarizer._DATA_DIR", tmp_path),
patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / "papers" / aid),
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
patch("app.utils.TMP_DIR", tmp_path / "tmp"),
):
yield
@@ -341,8 +347,8 @@ class TestSummarizeOneFlow:
):
"""pending → processing → done 全流程。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
):
result = await summarize_one(db_session, sample_paper)
@@ -374,7 +380,7 @@ class TestSummarizeOneFlow:
"""PDF 下载失败 → error_type=pdf_download_failed,tmp 被清理。"""
with (
patch(
"app.services.summarizer._download_pdf",
"app.services.summarizer.download_pdf",
new_callable=AsyncMock,
side_effect=PdfDownloadError("network error"),
),
@@ -392,9 +398,9 @@ class TestSummarizeOneFlow:
async def test_pi_timeout(self, db_session, sample_paper, _patch_paths):
"""pi 超时 → timeout 错误,retry_count 递增。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer._call_pi",
"app.services.summarizer.call_pi",
new_callable=AsyncMock,
side_effect=PiTimeoutError("timeout after 300s"),
),
@@ -409,9 +415,9 @@ class TestSummarizeOneFlow:
async def test_json_not_found(self, db_session, sample_paper, _patch_paths):
"""pi 输出无 JSON → json_not_found。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer._call_pi",
"app.services.summarizer.call_pi",
new_callable=AsyncMock,
return_value="No JSON in this output at all.",
),
@@ -436,9 +442,9 @@ class TestSummarizeOneFlow:
bad_output = f"```json\n{bad_json}\n```"
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer._call_pi",
"app.services.summarizer.call_pi",
new_callable=AsyncMock,
return_value=bad_output,
),
@@ -464,9 +470,9 @@ class TestSummarizeOneFlow:
):
"""失败时仍保存 raw_output.txt。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch(
"app.services.summarizer._call_pi",
"app.services.summarizer.call_pi",
new_callable=AsyncMock,
return_value="Some output without JSON",
),
@@ -483,8 +489,8 @@ class TestSummarizeOneFlow:
):
"""成功后清理 tmp 目录。"""
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
):
await summarize_one(db_session, sample_paper)
@@ -498,7 +504,7 @@ class TestSummarizeOneFlow:
"""失败后也清理 tmp 目录。"""
with (
patch(
"app.services.summarizer._download_pdf",
"app.services.summarizer.download_pdf",
new_callable=AsyncMock,
side_effect=PdfDownloadError("fail"),
),
@@ -529,9 +535,11 @@ class TestBatchSummarize:
@pytest.fixture
def _patch_paths(self, tmp_path):
with (
patch("app.services.summarizer._PAPERS_DIR", tmp_path / "papers"),
patch("app.services.summarizer._TMP_DIR", tmp_path / "tmp"),
patch("app.services.summarizer._DATA_DIR", tmp_path),
patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / "papers" / aid),
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
patch("app.utils.TMP_DIR", tmp_path / "tmp"),
):
yield
@@ -561,8 +569,8 @@ class TestBatchSummarize:
_TestSession = _sm(bind=db_engine, autoflush=False, autocommit=False)
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
):
result = await summarize_batch(
db_session, _session_factory=_TestSession
@@ -612,8 +620,8 @@ class TestBatchSummarize:
return mock_pi_output
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", side_effect=_mock_call_pi),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", side_effect=_mock_call_pi),
):
result = await summarize_batch(
db_session, _session_factory=_TestSession
@@ -646,8 +654,8 @@ class TestBatchSummarize:
_TestSession = _sm(bind=db_engine, autoflush=False, autocommit=False)
with (
patch("app.services.summarizer._download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer._call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
patch("app.services.summarizer.call_pi", new_callable=AsyncMock, return_value=mock_pi_output),
):
await summarize_batch(
db_session, _session_factory=_TestSession