Files
Rain-Bus 743d69efd0 refactor: extract admin business logic to services, introduce job queue, add derived index helpers
- Move DB operations from routes/admin.py to services/admin.py (get_logs_context, query_summary_statuses, retry_failed, delete/reset operations)
- Add services/jobs.py with Job/JobEvent-based async job queue (create_job, run_job, enqueue_job)
- Add services/derived.py with FTS5 reindex and paper index deletion helpers
- Refactor scheduler to use job queue instead of direct pipeline calls
- Add heartbeat_at/expires_at to TaskLock for lock health tracking
- Remove DESIGN_REVIEW.md
- Update tests: remove redundant integration tests, add unit tests for new services
2026-06-13 18:31:43 +08:00

250 lines
9.4 KiB
Python

"""爬虫服务测试 — _parse_paper、fetch_daily、upsert_papers、crawl_daily。"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from app.services.crawler import (
_parse_paper,
crawl_daily,
fetch_daily,
refresh_upvotes,
upsert_papers,
)
# ═══════════════════════════════════════════════════════════════════════
# _parse_paper
# ═══════════════════════════════════════════════════════════════════════
class TestParsePaper:
    """Tests for ``_parse_paper`` applied to a single raw item dict."""

    def test_normal_item(self):
        """A fully populated ``paper``-wrapped item yields the expected fields."""
        paper = {
            "id": "2401.12345",
            "title": "Test Paper",
            "abstract": "Abstract text",
            "publishedAt": "2024-01-15T00:00:00",
            "authors": [{"name": "Alice"}, {"name": "Bob"}],
            "tags": [{"name": "NLP"}, {"name": "LLM"}],
            "upvotes": 42,
        }

        parsed = _parse_paper({"paper": paper})

        assert parsed["arxiv_id"] == "2401.12345"
        assert parsed["title_en"] == "Test Paper"
        # Authors and tags come back as plain name lists, not dicts.
        assert len(parsed["authors"]) == 2
        assert parsed["authors"] == ["Alice", "Bob"]
        assert parsed["tags"] == ["NLP", "LLM"]
        assert parsed["upvotes"] == 42
        assert "huggingface.co" in parsed["hf_url"]

    def test_empty_id(self):
        """An empty id produces an empty ``arxiv_id`` and an empty ``hf_url``."""
        raw = {"paper": {"id": "", "authors": [], "tags": []}}

        parsed = _parse_paper(raw)

        assert parsed["arxiv_id"] == ""
        assert parsed["hf_url"] == ""

    def test_missing_published_at(self):
        """Without ``publishedAt`` in the input, ``published_at`` is ``None``."""
        raw = {
            "paper": {"id": "2401.00001", "title": "T", "authors": [], "tags": []}
        }

        parsed = _parse_paper(raw)

        assert parsed["published_at"] is None

    def test_flat_structure_fallback(self):
        """Without a ``paper`` wrapper, fields are read from the top level."""
        raw = {"id": "2401.99999", "title": "Flat", "authors": [], "tags": []}

        parsed = _parse_paper(raw)

        assert parsed["arxiv_id"] == "2401.99999"
        assert parsed["title_en"] == "Flat"
# ═══════════════════════════════════════════════════════════════════════
# fetch_daily
# ═══════════════════════════════════════════════════════════════════════
class TestFetchDaily:
    """Tests for ``fetch_daily`` with ``make_http_client`` patched to a mock."""

    @staticmethod
    def _make_client(payload):
        """Return a mocked async HTTP client whose ``get`` yields ``payload``.

        The response is a ``MagicMock`` whose ``json()`` returns ``payload``
        and whose ``raise_for_status()`` is a no-op mock. The client is an
        ``AsyncMock`` that returns itself from ``__aenter__`` so it can be
        used with ``async with``.
        NOTE(review): presumably ``fetch_daily`` uses the client as an async
        context manager and awaits ``client.get`` — confirm against the
        implementation.
        """
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()

        client = AsyncMock()
        client.get.return_value = response
        # AsyncMock's default __aenter__ would return a fresh mock, not the
        # client itself, so wire it explicitly.
        client.__aenter__ = AsyncMock(return_value=client)
        client.__aexit__ = AsyncMock(return_value=False)
        return client

    @pytest.mark.asyncio
    async def test_returns_papers(self):
        """The JSON payload served by the client is returned to the caller."""
        client = self._make_client([{"paper": {"id": "2401.00001"}}])

        with patch("app.services.crawler.make_http_client", return_value=client):
            result = await fetch_daily("2024-01-15")

        assert len(result) == 1
        assert result[0]["paper"]["id"] == "2401.00001"

    @pytest.mark.asyncio
    async def test_respects_top_n(self):
        """With ten items available and ``top_n=3``, three items are returned."""
        client = self._make_client(
            [{"paper": {"id": f"2401.{i:05d}"}} for i in range(10)]
        )

        with patch("app.services.crawler.make_http_client", return_value=client):
            result = await fetch_daily("2024-01-15", top_n=3)

        assert len(result) == 3
# ═══════════════════════════════════════════════════════════════════════
# upsert_papers
# ═══════════════════════════════════════════════════════════════════════
class TestUpsertPapers:
    """Tests for ``upsert_papers`` against the ``db_session`` fixture."""

    def test_inserts_new_paper(self, db_session):
        """A previously unseen paper is returned as newly inserted."""
        raw_item = {
            "paper": {
                "id": "2401.00001",
                "title": "New Paper",
                "abstract": "Abstract",
                "authors": [{"name": "Alice"}],
                "tags": [{"name": "CV"}],
                "upvotes": 5,
            }
        }

        inserted = upsert_papers(db_session, [raw_item], "2024-01-15")

        assert len(inserted) == 1
        assert inserted[0].arxiv_id == "2401.00001"
        assert inserted[0].title_en == "New Paper"

    def test_updates_existing_upvotes(self, db_session, sample_paper):
        """An already stored paper is not re-inserted, but its upvotes change."""
        raw_item = {
            "paper": {
                "id": sample_paper.arxiv_id,
                "title": sample_paper.title_en,
                "upvotes": 999,
                "authors": [],
                "tags": [],
            }
        }

        inserted = upsert_papers(db_session, [raw_item], "2024-01-15")

        assert len(inserted) == 0  # nothing new was added
        # Reload the row so the assertion sees the persisted value.
        db_session.refresh(sample_paper)
        assert sample_paper.upvotes == 999

    def test_skips_empty_id(self, db_session):
        """An item whose id is empty is not inserted."""
        raw_item = {"paper": {"id": "", "title": "Nope", "authors": [], "tags": []}}

        inserted = upsert_papers(db_session, [raw_item], "2024-01-15")

        assert len(inserted) == 0
# ═══════════════════════════════════════════════════════════════════════
# crawl_daily
# ═══════════════════════════════════════════════════════════════════════
class TestCrawlDaily:
    """Tests for ``crawl_daily`` with ``fetch_daily`` replaced by an AsyncMock."""

    @pytest.mark.asyncio
    async def test_success_flow(self, db_session):
        """One fetched paper gives status ``success`` with found == new == 1."""
        fetched = [
            {
                "paper": {
                    "id": "2401.00001",
                    "title": "T",
                    "authors": [],
                    "tags": [],
                    "upvotes": 0,
                }
            }
        ]
        # Extra keyword arguments to patch() are forwarded to the AsyncMock.
        with patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            return_value=fetched,
        ):
            outcome = await crawl_daily(db_session, "2024-01-15")

        assert outcome["status"] == "success"
        assert outcome["new"] == 1
        assert outcome["found"] == 1

    @pytest.mark.asyncio
    async def test_failure_returns_failed(self, db_session):
        """A fetch error is reported in the result instead of propagating."""
        fetch_error = ConnectionError("network error")
        with patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            side_effect=fetch_error,
        ):
            outcome = await crawl_daily(db_session, "2024-01-15")

        assert outcome["status"] == "failed"
        assert "network error" in outcome["error"]
# ═══════════════════════════════════════════════════════════════════════
# refresh_upvotes
# ═══════════════════════════════════════════════════════════════════════
class TestRefreshUpvotes:
    """Tests for ``refresh_upvotes`` with ``fetch_daily`` patched out."""

    @pytest.mark.asyncio
    async def test_refresh_updates_existing_without_inserting_new(
        self, db_session, sample_paper
    ):
        """A stored paper's upvotes are refreshed; an unknown id adds no row."""
        sample_paper.arxiv_id = "1706.03762"
        sample_paper.upvotes = 10
        db_session.commit()

        known_item = {
            "paper": {
                "id": "1706.03762",
                "upvotes": 999,
                "authors": [],
                "tags": [],
            }
        }
        # This id is not stored, so the row count must stay unchanged.
        unknown_item = {
            "paper": {
                "id": "2010.11929",
                "upvotes": 123,
                "authors": [],
                "tags": [],
            }
        }

        with patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            return_value=[known_item, unknown_item],
        ):
            outcome = await refresh_upvotes(db_session, days=1)

        db_session.refresh(sample_paper)
        assert outcome["status"] == "success"
        assert outcome["updated"] == 1
        assert sample_paper.upvotes == 999
        paper_model = type(sample_paper)
        assert db_session.query(paper_model).count() == 1

    @pytest.mark.asyncio
    async def test_refresh_returns_partial_when_one_day_fails(self, db_session):
        """One failing date gives status ``partial`` and a per-date error entry."""

        async def _failing_on_first(target_date):
            # Only the date ending in "01" fails; the other yields no papers.
            if target_date.endswith("01"):
                raise ConnectionError("hf down")
            return []

        dates_patch = patch(
            "app.services.crawler.recent_date_strs",
            return_value=["2024-01-01", "2024-01-02"],
        )
        fetch_patch = patch(
            "app.services.crawler.fetch_daily", side_effect=_failing_on_first
        )
        with dates_patch, fetch_patch:
            outcome = await refresh_upvotes(db_session, days=2)

        assert outcome["status"] == "partial"
        assert outcome["errors"] == ["2024-01-01: hf down"]