feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
@@ -0,0 +1,77 @@
|
||||
"""PDF 下载测试 — download_pdf、路径工具、错误处理。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.services.pdf_downloader import (
|
||||
PdfDownloadError,
|
||||
download_pdf,
|
||||
paper_dir,
|
||||
tmp_dir,
|
||||
)
|
||||
from app.utils import PAPERS_DIR, TMP_DIR
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# 路径工具
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
class TestPathHelpers:
    """Checks for the per-paper directory helpers of pdf_downloader."""

    def test_paper_dir(self):
        """paper_dir(id) must equal PAPERS_DIR joined with that id."""
        paper_id = "2401.12345"
        expected = PAPERS_DIR / paper_id
        assert paper_dir(paper_id) == expected

    def test_tmp_dir(self):
        """tmp_dir(id) must equal TMP_DIR joined with that id."""
        paper_id = "2401.12345"
        expected = TMP_DIR / paper_id
        assert tmp_dir(paper_id) == expected
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# download_pdf
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
class TestDownloadPdf:
    """Tests for download_pdf driven through a mocked session object.

    Each test that reaches the download step patches
    ``app.services.pdf_downloader.TMP_DIR`` to pytest's ``tmp_path`` and
    ``app.services.pdf_downloader._get_session`` to return a ``MagicMock``.
    """

    @pytest.mark.asyncio
    async def test_success_download(self, tmp_path):
        """A successful fetch returns an existing paper.pdf with the body bytes."""
        fake_response = MagicMock()
        fake_response.content = b"%PDF-1.4 fake"
        fake_response.raise_for_status = MagicMock()

        fake_session = MagicMock()
        fake_session.get.return_value = fake_response

        # Build the patchers first so the ``with`` line stays short.
        tmp_dir_patch = patch("app.services.pdf_downloader.TMP_DIR", tmp_path)
        session_patch = patch(
            "app.services.pdf_downloader._get_session",
            return_value=fake_session,
        )
        with tmp_dir_patch, session_patch:
            result = await download_pdf(
                "2401.12345",
                "https://arxiv.org/pdf/2401.12345.pdf",
            )

        assert result.exists()
        assert result.name == "paper.pdf"
        assert result.read_bytes() == b"%PDF-1.4 fake"

    @pytest.mark.asyncio
    async def test_empty_pdf_url_raises(self):
        """An empty pdf_url raises PdfDownloadError matching "no pdf_url"."""
        expect_error = pytest.raises(PdfDownloadError, match="no pdf_url")
        with expect_error:
            await download_pdf("2401.12345", "")

    @pytest.mark.asyncio
    async def test_http_failure_raises(self, tmp_path):
        """A ConnectionError from session.get surfaces as PdfDownloadError."""
        failing_session = MagicMock()
        failing_session.get.side_effect = ConnectionError("refused")

        tmp_dir_patch = patch("app.services.pdf_downloader.TMP_DIR", tmp_path)
        session_patch = patch(
            "app.services.pdf_downloader._get_session",
            return_value=failing_session,
        )
        expect_error = pytest.raises(PdfDownloadError, match="failed to download")
        # Patches are entered first and pytest.raises last, so the expected
        # error is caught innermost, exactly as with a nested ``with``.
        with tmp_dir_patch, session_patch, expect_error:
            await download_pdf("2401.12345", "https://bad.url/pdf.pdf")
|
||||
Reference in New Issue
Block a user