feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
@@ -0,0 +1,189 @@
|
||||
"""爬虫服务测试 — _parse_paper、fetch_daily、upsert_papers、crawl_daily。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.services.crawler import (
|
||||
_parse_paper,
|
||||
crawl_daily,
|
||||
fetch_daily,
|
||||
upsert_papers,
|
||||
)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# _parse_paper
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
class TestParsePaper:
    """Unit tests for ``_parse_paper`` on wrapped, sparse and flat items."""

    def test_normal_item(self):
        """A fully populated ``paper``-wrapped item yields the expected fields."""
        paper = {
            "id": "2401.12345",
            "title": "Test Paper",
            "abstract": "Abstract text",
            "publishedAt": "2024-01-15T00:00:00",
            "authors": [{"name": "Alice"}, {"name": "Bob"}],
            "tags": [{"name": "NLP"}, {"name": "LLM"}],
            "upvotes": 42,
        }

        parsed = _parse_paper({"paper": paper})

        assert parsed["arxiv_id"] == "2401.12345"
        assert parsed["title_en"] == "Test Paper"
        # Author and tag objects are expected to be flattened to their names.
        assert len(parsed["authors"]) == 2
        assert parsed["authors"] == ["Alice", "Bob"]
        assert parsed["tags"] == ["NLP", "LLM"]
        assert parsed["upvotes"] == 42
        assert "huggingface.co" in parsed["hf_url"]

    def test_empty_id(self):
        """An empty id must produce an empty ``arxiv_id`` and an empty ``hf_url``."""
        parsed = _parse_paper({"paper": {"id": "", "authors": [], "tags": []}})

        assert parsed["arxiv_id"] == ""
        assert parsed["hf_url"] == ""

    def test_missing_published_at(self):
        """When ``publishedAt`` is absent, ``published_at`` must be ``None``."""
        paper = {"id": "2401.00001", "title": "T", "authors": [], "tags": []}

        parsed = _parse_paper({"paper": paper})

        assert parsed["published_at"] is None

    def test_flat_structure_fallback(self):
        """Without a ``paper`` wrapper, fields are read from the top level."""
        flat_item = {"id": "2401.99999", "title": "Flat", "authors": [], "tags": []}

        parsed = _parse_paper(flat_item)

        assert parsed["arxiv_id"] == "2401.99999"
        assert parsed["title_en"] == "Flat"
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# fetch_daily
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
class TestFetchDaily:
    """Tests for ``fetch_daily`` with ``make_http_client`` patched out."""

    @staticmethod
    def _make_client(payload: list[dict]) -> AsyncMock:
        """Build a mocked async HTTP client whose ``get`` response decodes to *payload*.

        The client also acts as its own async context manager, so it can be
        used either directly or via ``async with``.
        """
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()

        client = AsyncMock()
        client.get.return_value = response
        client.__aenter__ = AsyncMock(return_value=client)
        client.__aexit__ = AsyncMock(return_value=False)
        return client

    @pytest.mark.asyncio
    async def test_returns_papers(self):
        """The single item served by the mocked client is returned as-is."""
        client = self._make_client([{"paper": {"id": "2401.00001"}}])

        with patch("app.services.crawler.make_http_client", return_value=client):
            result = await fetch_daily("2024-01-15")

        assert len(result) == 1
        assert result[0]["paper"]["id"] == "2401.00001"

    @pytest.mark.asyncio
    async def test_respects_top_n(self):
        """With ten items available and ``top_n=3``, exactly three are returned."""
        client = self._make_client(
            [{"paper": {"id": f"2401.{i:05d}"}} for i in range(10)]
        )

        with patch("app.services.crawler.make_http_client", return_value=client):
            result = await fetch_daily("2024-01-15", top_n=3)

        assert len(result) == 3
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# upsert_papers
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
class TestUpsertPapers:
    """Tests for ``upsert_papers`` using the ``db_session`` fixture."""

    def test_inserts_new_paper(self, db_session):
        """A previously unseen paper appears in the returned collection."""
        paper = {
            "id": "2401.00001",
            "title": "New Paper",
            "abstract": "Abstract",
            "authors": [{"name": "Alice"}],
            "tags": [{"name": "CV"}],
            "upvotes": 5,
        }

        created = upsert_papers(db_session, [{"paper": paper}], "2024-01-15")

        assert len(created) == 1
        assert created[0].arxiv_id == "2401.00001"
        assert created[0].title_en == "New Paper"

    def test_updates_existing_upvotes(self, db_session, sample_paper):
        """Re-submitting an existing paper updates its upvotes and adds nothing."""
        paper = {
            "id": sample_paper.arxiv_id,
            "title": sample_paper.title_en,
            "upvotes": 999,
            "authors": [],
            "tags": [],
        }

        created = upsert_papers(db_session, [{"paper": paper}], "2024-01-15")

        assert len(created) == 0  # no new rows expected
        # Reload the fixture row so the assertion sees the persisted value.
        db_session.refresh(sample_paper)
        assert sample_paper.upvotes == 999

    def test_skips_empty_id(self, db_session):
        """An item with an empty id does not produce a new entry."""
        blank = {"id": "", "title": "Nope", "authors": [], "tags": []}

        created = upsert_papers(db_session, [{"paper": blank}], "2024-01-15")

        assert len(created) == 0
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
# crawl_daily
|
||||
# ═══════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
class TestCrawlDaily:
    """Tests for ``crawl_daily`` with ``fetch_daily`` replaced by an ``AsyncMock``."""

    @pytest.mark.asyncio
    async def test_success_flow(self, db_session):
        """One fetched paper is reported as success with one found and one new."""
        fetched = [
            {
                "paper": {
                    "id": "2401.00001",
                    "title": "T",
                    "authors": [],
                    "tags": [],
                    "upvotes": 0,
                }
            }
        ]

        # Extra keyword arguments to ``patch`` are forwarded to ``new_callable``.
        with patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            return_value=fetched,
        ):
            outcome = await crawl_daily(db_session, "2024-01-15")

        assert outcome["status"] == "success"
        assert outcome["new"] == 1
        assert outcome["found"] == 1

    @pytest.mark.asyncio
    async def test_failure_returns_failed(self, db_session):
        """A ``ConnectionError`` from the fetch yields a failed status with the message."""
        network_failure = ConnectionError("network error")

        with patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            side_effect=network_failure,
        ):
            outcome = await crawl_daily(db_session, "2024-01-15")

        assert outcome["status"] == "failed"
        assert "network error" in outcome["error"]
|
||||
Reference in New Issue
Block a user