"""爬虫服务测试 — _parse_paper、fetch_daily、upsert_papers、crawl_daily。""" from __future__ import annotations from unittest.mock import AsyncMock, MagicMock, patch import pytest from app.services.crawler import ( _parse_paper, crawl_daily, fetch_daily, upsert_papers, ) # ═══════════════════════════════════════════════════════════════════════ # _parse_paper # ═══════════════════════════════════════════════════════════════════════ class TestParsePaper: def test_normal_item(self): item = { "paper": { "id": "2401.12345", "title": "Test Paper", "abstract": "Abstract text", "publishedAt": "2024-01-15T00:00:00", "authors": [{"name": "Alice"}, {"name": "Bob"}], "tags": [{"name": "NLP"}, {"name": "LLM"}], "upvotes": 42, } } result = _parse_paper(item) assert result["arxiv_id"] == "2401.12345" assert result["title_en"] == "Test Paper" assert len(result["authors"]) == 2 assert result["authors"] == ["Alice", "Bob"] assert result["tags"] == ["NLP", "LLM"] assert result["upvotes"] == 42 assert "huggingface.co" in result["hf_url"] def test_empty_id(self): item = {"paper": {"id": "", "authors": [], "tags": []}} result = _parse_paper(item) assert result["arxiv_id"] == "" assert result["hf_url"] == "" def test_missing_published_at(self): item = {"paper": {"id": "2401.00001", "title": "T", "authors": [], "tags": []}} result = _parse_paper(item) assert result["published_at"] is None def test_flat_structure_fallback(self): """无 paper 包装时直接从顶层取字段。""" item = {"id": "2401.99999", "title": "Flat", "authors": [], "tags": []} result = _parse_paper(item) assert result["arxiv_id"] == "2401.99999" assert result["title_en"] == "Flat" # ═══════════════════════════════════════════════════════════════════════ # fetch_daily # ═══════════════════════════════════════════════════════════════════════ class TestFetchDaily: @pytest.mark.asyncio async def test_returns_papers(self, monkeypatch): fake_data = [{"paper": {"id": "2401.00001"}}] mock_resp = MagicMock() mock_resp.json.return_value = fake_data mock_resp.raise_for_status = MagicMock() mock_client = AsyncMock() mock_client.get.return_value = mock_resp mock_client.__aenter__ = AsyncMock(return_value=mock_client) mock_client.__aexit__ = AsyncMock(return_value=False) with patch("app.services.crawler.make_http_client", return_value=mock_client): result = await fetch_daily("2024-01-15") assert len(result) == 1 assert result[0]["paper"]["id"] == "2401.00001" @pytest.mark.asyncio async def test_respects_top_n(self, monkeypatch): fake_data = [{"paper": {"id": f"2401.{i:05d}"}} for i in range(10)] mock_resp = MagicMock() mock_resp.json.return_value = fake_data mock_resp.raise_for_status = MagicMock() mock_client = AsyncMock() mock_client.get.return_value = mock_resp mock_client.__aenter__ = AsyncMock(return_value=mock_client) mock_client.__aexit__ = AsyncMock(return_value=False) with patch("app.services.crawler.make_http_client", return_value=mock_client): result = await fetch_daily("2024-01-15", top_n=3) assert len(result) == 3 # ═══════════════════════════════════════════════════════════════════════ # upsert_papers # ═══════════════════════════════════════════════════════════════════════ class TestUpsertPapers: def test_inserts_new_paper(self, db_session): papers_raw = [ { "paper": { "id": "2401.00001", "title": "New Paper", "abstract": "Abstract", "authors": [{"name": "Alice"}], "tags": [{"name": "CV"}], "upvotes": 5, } } ] new = upsert_papers(db_session, papers_raw, "2024-01-15") assert len(new) == 1 assert new[0].arxiv_id == "2401.00001" assert new[0].title_en == "New Paper" def test_updates_existing_upvotes(self, db_session, sample_paper): papers_raw = [ { "paper": { "id": sample_paper.arxiv_id, "title": sample_paper.title_en, "upvotes": 999, "authors": [], "tags": [], } } ] new = upsert_papers(db_session, papers_raw, "2024-01-15") assert len(new) == 0 # 不新增 db_session.refresh(sample_paper) assert sample_paper.upvotes == 999 def test_skips_empty_id(self, db_session): papers_raw = [{"paper": {"id": "", "title": "Nope", "authors": [], "tags": []}}] new = upsert_papers(db_session, papers_raw, "2024-01-15") assert len(new) == 0 # ═══════════════════════════════════════════════════════════════════════ # crawl_daily # ═══════════════════════════════════════════════════════════════════════ class TestCrawlDaily: @pytest.mark.asyncio async def test_success_flow(self, db_session): with patch( "app.services.crawler.fetch_daily", new_callable=AsyncMock, ) as mock_fetch: mock_fetch.return_value = [ { "paper": { "id": "2401.00001", "title": "T", "authors": [], "tags": [], "upvotes": 0, } } ] result = await crawl_daily(db_session, "2024-01-15") assert result["status"] == "success" assert result["new"] == 1 assert result["found"] == 1 @pytest.mark.asyncio async def test_failure_returns_failed(self, db_session): with patch( "app.services.crawler.fetch_daily", new_callable=AsyncMock, side_effect=ConnectionError("network error"), ): result = await crawl_daily(db_session, "2024-01-15") assert result["status"] == "failed" assert "network error" in result["error"]