21f16e6756
- Split summarizer into summary_generator and summary_persister modules - Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection - Add layout_detector service for PicoDet-S_layout_3cls integration - Add exceptions module with ConflictError and NotFoundError - Improve admin dashboard with better statistics and task management - Add design review document with system optimization suggestions - Add new tests for crawler, pdf_downloader, pipeline, and summary_utils - Update dependencies and configuration - Clean up dead code and improve error handling
190 lines
7.5 KiB
Python
190 lines
7.5 KiB
Python
"""爬虫服务测试 — _parse_paper、fetch_daily、upsert_papers、crawl_daily。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from app.services.crawler import (
|
|
_parse_paper,
|
|
crawl_daily,
|
|
fetch_daily,
|
|
upsert_papers,
|
|
)
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# _parse_paper
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestParsePaper:
    """Checks how ``_parse_paper`` maps a raw item onto the parsed dict."""

    def test_normal_item(self):
        """A fully populated, ``paper``-wrapped item has each field mapped."""
        paper = {
            "id": "2401.12345",
            "title": "Test Paper",
            "abstract": "Abstract text",
            "publishedAt": "2024-01-15T00:00:00",
            "authors": [{"name": "Alice"}, {"name": "Bob"}],
            "tags": [{"name": "NLP"}, {"name": "LLM"}],
            "upvotes": 42,
        }

        parsed = _parse_paper({"paper": paper})

        assert parsed["arxiv_id"] == "2401.12345"
        assert parsed["title_en"] == "Test Paper"
        # Author and tag objects are expected to collapse to their names.
        assert len(parsed["authors"]) == 2
        assert parsed["authors"] == ["Alice", "Bob"]
        assert parsed["tags"] == ["NLP", "LLM"]
        assert parsed["upvotes"] == 42
        assert "huggingface.co" in parsed["hf_url"]

    def test_empty_id(self):
        """An empty id yields an empty ``arxiv_id`` and an empty ``hf_url``."""
        paper = {"id": "", "authors": [], "tags": []}

        parsed = _parse_paper({"paper": paper})

        assert parsed["arxiv_id"] == ""
        assert parsed["hf_url"] == ""

    def test_missing_published_at(self):
        """Without ``publishedAt`` in the input, ``published_at`` is ``None``."""
        paper = {"id": "2401.00001", "title": "T", "authors": [], "tags": []}

        parsed = _parse_paper({"paper": paper})

        assert parsed["published_at"] is None

    def test_flat_structure_fallback(self):
        """Without the ``paper`` wrapper, fields are read from the top level."""
        flat_item = {"id": "2401.99999", "title": "Flat", "authors": [], "tags": []}

        parsed = _parse_paper(flat_item)

        assert parsed["arxiv_id"] == "2401.99999"
        assert parsed["title_en"] == "Flat"
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# fetch_daily
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestFetchDaily:
    """Tests for ``fetch_daily`` with ``make_http_client`` replaced by a mock."""

    @staticmethod
    def _client_returning(payload):
        """Build a mock HTTP client whose ``get`` response decodes to *payload*.

        The client is usable as an async context manager: ``__aenter__``
        returns the client itself and ``__aexit__`` returns ``False`` so
        exceptions raised inside the ``async with`` body are not suppressed.
        """
        # The response is a plain MagicMock so ``json()`` and
        # ``raise_for_status()`` are synchronous calls.
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()

        client = AsyncMock()
        client.get.return_value = response
        client.__aenter__ = AsyncMock(return_value=client)
        client.__aexit__ = AsyncMock(return_value=False)
        return client

    @pytest.mark.asyncio
    async def test_returns_papers(self):
        """The decoded JSON list is returned to the caller."""
        client = self._client_returning([{"paper": {"id": "2401.00001"}}])

        with patch("app.services.crawler.make_http_client", return_value=client):
            result = await fetch_daily("2024-01-15")

        assert len(result) == 1
        assert result[0]["paper"]["id"] == "2401.00001"

    @pytest.mark.asyncio
    async def test_respects_top_n(self):
        """With ``top_n=3`` only three of the ten served items come back."""
        fake_data = [{"paper": {"id": f"2401.{i:05d}"}} for i in range(10)]
        client = self._client_returning(fake_data)

        with patch("app.services.crawler.make_http_client", return_value=client):
            result = await fetch_daily("2024-01-15", top_n=3)

        assert len(result) == 3
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# upsert_papers
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestUpsertPapers:
    """Exercises ``upsert_papers`` through the ``db_session`` fixture."""

    def test_inserts_new_paper(self, db_session):
        """A single raw paper comes back as one new row with its id and title."""
        raw_paper = {
            "id": "2401.00001",
            "title": "New Paper",
            "abstract": "Abstract",
            "authors": [{"name": "Alice"}],
            "tags": [{"name": "CV"}],
            "upvotes": 5,
        }

        created = upsert_papers(db_session, [{"paper": raw_paper}], "2024-01-15")

        assert len(created) == 1
        assert created[0].arxiv_id == "2401.00001"
        assert created[0].title_en == "New Paper"

    def test_updates_existing_upvotes(self, db_session, sample_paper):
        """Re-submitting ``sample_paper``'s id adds nothing but updates upvotes."""
        raw_paper = {
            "id": sample_paper.arxiv_id,
            "title": sample_paper.title_en,
            "upvotes": 999,
            "authors": [],
            "tags": [],
        }

        created = upsert_papers(db_session, [{"paper": raw_paper}], "2024-01-15")

        # Nothing new is reported for an id that already exists.
        assert len(created) == 0
        # Reload from the session so the assertion sees the stored value.
        db_session.refresh(sample_paper)
        assert sample_paper.upvotes == 999

    def test_skips_empty_id(self, db_session):
        """An item with an empty id produces no new rows."""
        raw_paper = {"id": "", "title": "Nope", "authors": [], "tags": []}

        created = upsert_papers(db_session, [{"paper": raw_paper}], "2024-01-15")

        assert len(created) == 0
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# crawl_daily
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestCrawlDaily:
    """Covers ``crawl_daily`` with ``fetch_daily`` patched by an ``AsyncMock``."""

    @pytest.mark.asyncio
    async def test_success_flow(self, db_session):
        """One fetched paper is reported as success with found=1 and new=1."""
        fetched = [
            {
                "paper": {
                    "id": "2401.00001",
                    "title": "T",
                    "authors": [],
                    "tags": [],
                    "upvotes": 0,
                }
            }
        ]

        # Extra keyword arguments to ``patch`` are forwarded to the AsyncMock.
        with patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            return_value=fetched,
        ):
            outcome = await crawl_daily(db_session, "2024-01-15")

        assert outcome["status"] == "success"
        assert outcome["new"] == 1
        assert outcome["found"] == 1

    @pytest.mark.asyncio
    async def test_failure_returns_failed(self, db_session):
        """A ``ConnectionError`` from the fetch yields a failed result with its message."""
        fetch_error = ConnectionError("network error")

        with patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            side_effect=fetch_error,
        ):
            outcome = await crawl_daily(db_session, "2024-01-15")

        assert outcome["status"] == "failed"
        assert "network error" in outcome["error"]
|