743d69efd0
- Move DB operations from routes/admin.py to services/admin.py (get_logs_context, query_summary_statuses, retry_failed, delete/reset operations) - Add services/jobs.py with Job/JobEvent-based async job queue (create_job, run_job, enqueue_job) - Add services/derived.py with FTS5 reindex and paper index deletion helpers - Refactor scheduler to use job queue instead of direct pipeline calls - Add heartbeat_at/expires_at to TaskLock for lock health tracking - Remove DESIGN_REVIEW.md - Update tests: remove redundant integration tests, add unit tests for new services
250 lines
9.4 KiB
Python
250 lines
9.4 KiB
Python
"""爬虫服务测试 — _parse_paper、fetch_daily、upsert_papers、crawl_daily。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from app.services.crawler import (
|
|
_parse_paper,
|
|
crawl_daily,
|
|
fetch_daily,
|
|
refresh_upvotes,
|
|
upsert_papers,
|
|
)
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# _parse_paper
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestParsePaper:
    """Unit tests for ``_parse_paper`` on wrapped, flat and sparse API items."""

    def test_normal_item(self):
        """A fully populated ``paper``-wrapped item yields the expected parsed fields."""
        paper_fields = {
            "id": "2401.12345",
            "title": "Test Paper",
            "abstract": "Abstract text",
            "publishedAt": "2024-01-15T00:00:00",
            "authors": [{"name": "Alice"}, {"name": "Bob"}],
            "tags": [{"name": "NLP"}, {"name": "LLM"}],
            "upvotes": 42,
        }

        parsed = _parse_paper({"paper": paper_fields})

        assert parsed["arxiv_id"] == "2401.12345"
        assert parsed["title_en"] == "Test Paper"
        assert len(parsed["authors"]) == 2
        assert parsed["authors"] == ["Alice", "Bob"]
        assert parsed["tags"] == ["NLP", "LLM"]
        assert parsed["upvotes"] == 42
        assert "huggingface.co" in parsed["hf_url"]

    def test_empty_id(self):
        """An empty id produces an empty ``arxiv_id`` and an empty ``hf_url``."""
        blank_paper = {"id": "", "authors": [], "tags": []}

        parsed = _parse_paper({"paper": blank_paper})

        assert parsed["arxiv_id"] == ""
        assert parsed["hf_url"] == ""

    def test_missing_published_at(self):
        """Without a ``publishedAt`` key, ``published_at`` is ``None``."""
        undated_paper = {"id": "2401.00001", "title": "T", "authors": [], "tags": []}

        parsed = _parse_paper({"paper": undated_paper})

        assert parsed["published_at"] is None

    def test_flat_structure_fallback(self):
        """Without a ``paper`` wrapper, fields are read directly from the top level."""
        flat_item = {"id": "2401.99999", "title": "Flat", "authors": [], "tags": []}

        parsed = _parse_paper(flat_item)

        assert parsed["arxiv_id"] == "2401.99999"
        assert parsed["title_en"] == "Flat"
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# fetch_daily
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestFetchDaily:
    """Tests for ``fetch_daily`` with ``make_http_client`` replaced by a mock client."""

    @staticmethod
    def _mock_http_client(payload):
        """Return a mock HTTP client whose ``get`` response decodes to *payload*.

        Shared by every test in this class so the response/client wiring is
        defined once instead of being copy-pasted per test.
        """
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()

        client = AsyncMock()
        client.get.return_value = response
        # Entering the client as an async context manager must yield this same
        # mock, otherwise the stubbed ``get`` above would not be the one used.
        # NOTE(review): presumably fetch_daily uses ``async with`` — confirm.
        client.__aenter__ = AsyncMock(return_value=client)
        client.__aexit__ = AsyncMock(return_value=False)
        return client

    @pytest.mark.asyncio
    async def test_returns_papers(self):
        """The decoded response items are returned to the caller."""
        fake_data = [{"paper": {"id": "2401.00001"}}]
        client = self._mock_http_client(fake_data)

        with patch("app.services.crawler.make_http_client", return_value=client):
            result = await fetch_daily("2024-01-15")

        assert len(result) == 1
        assert result[0]["paper"]["id"] == "2401.00001"

    @pytest.mark.asyncio
    async def test_respects_top_n(self):
        """With ten items served and ``top_n=3``, exactly three are returned."""
        fake_data = [{"paper": {"id": f"2401.{i:05d}"}} for i in range(10)]
        client = self._mock_http_client(fake_data)

        with patch("app.services.crawler.make_http_client", return_value=client):
            result = await fetch_daily("2024-01-15", top_n=3)

        assert len(result) == 3
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# upsert_papers
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestUpsertPapers:
    """Tests for ``upsert_papers`` run against the ``db_session`` fixture."""

    def test_inserts_new_paper(self, db_session):
        """A previously unseen arXiv id is returned as a newly created paper."""
        fresh_item = {
            "paper": {
                "id": "2401.00001",
                "title": "New Paper",
                "abstract": "Abstract",
                "authors": [{"name": "Alice"}],
                "tags": [{"name": "CV"}],
                "upvotes": 5,
            }
        }

        created = upsert_papers(db_session, [fresh_item], "2024-01-15")

        assert len(created) == 1
        assert created[0].arxiv_id == "2401.00001"
        assert created[0].title_en == "New Paper"

    def test_updates_existing_upvotes(self, db_session, sample_paper):
        """An already stored paper is not reported as new, but its upvotes change."""
        known_item = {
            "paper": {
                "id": sample_paper.arxiv_id,
                "title": sample_paper.title_en,
                "upvotes": 999,
                "authors": [],
                "tags": [],
            }
        }

        created = upsert_papers(db_session, [known_item], "2024-01-15")

        assert len(created) == 0  # nothing new was inserted
        # Reload the fixture row so the assertion sees the persisted value.
        db_session.refresh(sample_paper)
        assert sample_paper.upvotes == 999

    def test_skips_empty_id(self, db_session):
        """An item whose id is empty does not produce a new paper."""
        idless_item = {"paper": {"id": "", "title": "Nope", "authors": [], "tags": []}}

        created = upsert_papers(db_session, [idless_item], "2024-01-15")

        assert len(created) == 0
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# crawl_daily
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestCrawlDaily:
    """Tests for ``crawl_daily`` with ``fetch_daily`` patched out."""

    @pytest.mark.asyncio
    async def test_success_flow(self, db_session):
        """One fetched paper is reported as one found and one new, status ``success``."""
        fetched_items = [
            {
                "paper": {
                    "id": "2401.00001",
                    "title": "T",
                    "authors": [],
                    "tags": [],
                    "upvotes": 0,
                }
            }
        ]
        fetch_patch = patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            return_value=fetched_items,
        )

        with fetch_patch:
            outcome = await crawl_daily(db_session, "2024-01-15")

        assert outcome["status"] == "success"
        assert outcome["new"] == 1
        assert outcome["found"] == 1

    @pytest.mark.asyncio
    async def test_failure_returns_failed(self, db_session):
        """A fetch error is reported in the result rather than raised to the caller."""
        with patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
        ) as failing_fetch:
            failing_fetch.side_effect = ConnectionError("network error")
            outcome = await crawl_daily(db_session, "2024-01-15")

        assert outcome["status"] == "failed"
        assert "network error" in outcome["error"]
|
|
|
|
|
|
class TestRefreshUpvotes:
    """Tests for ``refresh_upvotes`` with ``fetch_daily`` patched out."""

    @pytest.mark.asyncio
    async def test_refresh_updates_existing_without_inserting_new(
        self, db_session, sample_paper
    ):
        """Only the stored paper's upvotes change; the unknown id is not inserted."""
        sample_paper.arxiv_id = "1706.03762"
        sample_paper.upvotes = 10
        db_session.commit()

        stored_item = {
            "paper": {
                "id": "1706.03762",
                "upvotes": 999,
                "authors": [],
                "tags": [],
            }
        }
        unknown_item = {
            "paper": {
                "id": "2010.11929",
                "upvotes": 123,
                "authors": [],
                "tags": [],
            }
        }
        fetch_patch = patch(
            "app.services.crawler.fetch_daily",
            new_callable=AsyncMock,
            return_value=[stored_item, unknown_item],
        )

        with fetch_patch:
            summary = await refresh_upvotes(db_session, days=1)

        # Reload the fixture row so the assertions see the persisted value.
        db_session.refresh(sample_paper)
        assert summary["status"] == "success"
        assert summary["updated"] == 1
        assert sample_paper.upvotes == 999
        paper_model = type(sample_paper)
        assert db_session.query(paper_model).count() == 1

    @pytest.mark.asyncio
    async def test_refresh_returns_partial_when_one_day_fails(self, db_session):
        """A failure on one of two dates gives status ``partial`` and one error entry."""

        async def _flaky_fetch(target_date):
            # Succeed with no papers for every date except the one ending in "01".
            if not target_date.endswith("01"):
                return []
            raise ConnectionError("hf down")

        dates_patch = patch(
            "app.services.crawler.recent_date_strs",
            return_value=["2024-01-01", "2024-01-02"],
        )
        fetch_patch = patch("app.services.crawler.fetch_daily", side_effect=_flaky_fetch)

        with dates_patch, fetch_patch:
            summary = await refresh_upvotes(db_session, days=2)

        assert summary["status"] == "partial"
        assert summary["errors"] == ["2024-01-01: hf down"]
|