feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -0,0 +1,189 @@
+"""爬虫服务测试 — _parse_paper、fetch_daily、upsert_papers、crawl_daily。"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from app.services.crawler import (
+    _parse_paper,
+    crawl_daily,
+    fetch_daily,
+    upsert_papers,
+)
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# _parse_paper
+# ═══════════════════════════════════════════════════════════════════════
+
+
+class TestParsePaper:
+    """Tests for ``_parse_paper``: checks the keys of the dict it returns."""
+
+    def test_normal_item(self):
+        """A fully populated, ``paper``-wrapped entry has every field mapped."""
+        paper_fields = {
+            "id": "2401.12345",
+            "title": "Test Paper",
+            "abstract": "Abstract text",
+            "publishedAt": "2024-01-15T00:00:00",
+            "authors": [{"name": "Alice"}, {"name": "Bob"}],
+            "tags": [{"name": "NLP"}, {"name": "LLM"}],
+            "upvotes": 42,
+        }
+        parsed = _parse_paper({"paper": paper_fields})
+
+        assert parsed["arxiv_id"] == "2401.12345"
+        assert parsed["title_en"] == "Test Paper"
+        # Authors and tags are expected as plain name strings, in input order.
+        assert len(parsed["authors"]) == 2
+        assert parsed["authors"] == ["Alice", "Bob"]
+        assert parsed["tags"] == ["NLP", "LLM"]
+        assert parsed["upvotes"] == 42
+        assert "huggingface.co" in parsed["hf_url"]
+
+    def test_empty_id(self):
+        """An empty ``id`` yields an empty ``arxiv_id`` and an empty ``hf_url``."""
+        raw = {"paper": {"id": "", "authors": [], "tags": []}}
+        parsed = _parse_paper(raw)
+        assert parsed["arxiv_id"] == ""
+        assert parsed["hf_url"] == ""
+
+    def test_missing_published_at(self):
+        """``published_at`` is ``None`` when the entry has no ``publishedAt``."""
+        paper_fields = {"id": "2401.00001", "title": "T", "authors": [], "tags": []}
+        parsed = _parse_paper({"paper": paper_fields})
+        assert parsed["published_at"] is None
+
+    def test_flat_structure_fallback(self):
+        """Without the ``paper`` wrapper, fields are read from the top level."""
+        raw = {"id": "2401.99999", "title": "Flat", "authors": [], "tags": []}
+        parsed = _parse_paper(raw)
+        assert parsed["arxiv_id"] == "2401.99999"
+        assert parsed["title_en"] == "Flat"
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# fetch_daily
+# ═══════════════════════════════════════════════════════════════════════
+
+
+class TestFetchDaily:
+    """Tests for ``fetch_daily`` with ``make_http_client`` patched out.
+
+    No real HTTP request is made: the client factory in
+    ``app.services.crawler`` is replaced by a stub built in ``_stub_client``.
+    """
+
+    @staticmethod
+    def _stub_client(payload):
+        """Return a stand-in async HTTP client whose ``get`` yields *payload*.
+
+        Args:
+            payload: The object the stubbed response's ``json()`` returns.
+
+        Returns:
+            An ``AsyncMock`` client; awaiting ``client.get(...)`` gives a
+            response whose ``json()`` is *payload* and whose
+            ``raise_for_status()`` is a no-op.
+        """
+        # The response is a plain MagicMock so that ``json()`` returns the
+        # payload directly instead of a coroutine.
+        response = MagicMock()
+        response.json.return_value = payload
+        response.raise_for_status = MagicMock()
+
+        client = AsyncMock()
+        client.get.return_value = response
+        # Entering the client as an async context manager hands back this same
+        # stub (presumably how fetch_daily uses it — confirm against crawler).
+        client.__aenter__ = AsyncMock(return_value=client)
+        client.__aexit__ = AsyncMock(return_value=False)
+        return client
+
+    @pytest.mark.asyncio
+    async def test_returns_papers(self):
+        """The entries from the response JSON are returned to the caller."""
+        client = self._stub_client([{"paper": {"id": "2401.00001"}}])
+
+        with patch("app.services.crawler.make_http_client", return_value=client):
+            result = await fetch_daily("2024-01-15")
+
+        assert len(result) == 1
+        assert result[0]["paper"]["id"] == "2401.00001"
+
+    @pytest.mark.asyncio
+    async def test_respects_top_n(self):
+        """With ten entries available and ``top_n=3``, three are returned."""
+        ten_entries = [{"paper": {"id": f"2401.{i:05d}"}} for i in range(10)]
+        client = self._stub_client(ten_entries)
+
+        with patch("app.services.crawler.make_http_client", return_value=client):
+            result = await fetch_daily("2024-01-15", top_n=3)
+
+        assert len(result) == 3
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# upsert_papers
+# ═══════════════════════════════════════════════════════════════════════
+
+
+class TestUpsertPapers:
+    """Tests for ``upsert_papers`` run against the ``db_session`` fixture."""
+
+    def test_inserts_new_paper(self, db_session):
+        """A new arXiv id produces one returned record with its fields set."""
+        paper_fields = {
+            "id": "2401.00001",
+            "title": "New Paper",
+            "abstract": "Abstract",
+            "authors": [{"name": "Alice"}],
+            "tags": [{"name": "CV"}],
+            "upvotes": 5,
+        }
+        created = upsert_papers(db_session, [{"paper": paper_fields}], "2024-01-15")
+
+        assert len(created) == 1
+        first = created[0]
+        assert first.arxiv_id == "2401.00001"
+        assert first.title_en == "New Paper"
+
+    def test_updates_existing_upvotes(self, db_session, sample_paper):
+        """An already-stored paper is not returned as new; its upvotes update."""
+        paper_fields = {
+            "id": sample_paper.arxiv_id,
+            "title": sample_paper.title_en,
+            "upvotes": 999,
+            "authors": [],
+            "tags": [],
+        }
+        created = upsert_papers(db_session, [{"paper": paper_fields}], "2024-01-15")
+
+        assert len(created) == 0  # nothing newly added
+        # Reload the fixture row so the assertion sees the session's state.
+        db_session.refresh(sample_paper)
+        assert sample_paper.upvotes == 999
+
+    def test_skips_empty_id(self, db_session):
+        """An entry whose ``id`` is empty results in no new records."""
+        blank_entry = {"paper": {"id": "", "title": "Nope", "authors": [], "tags": []}}
+        created = upsert_papers(db_session, [blank_entry], "2024-01-15")
+        assert len(created) == 0
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# crawl_daily
+# ═══════════════════════════════════════════════════════════════════════
+
+
+class TestCrawlDaily:
+    """Tests for ``crawl_daily`` with ``fetch_daily`` replaced by an AsyncMock."""
+
+    @pytest.mark.asyncio
+    async def test_success_flow(self, db_session):
+        """One fetched paper gives status ``success`` with found=1 and new=1."""
+        fetched = [
+            {
+                "paper": {
+                    "id": "2401.00001",
+                    "title": "T",
+                    "authors": [],
+                    "tags": [],
+                    "upvotes": 0,
+                }
+            }
+        ]
+        # Extra keyword arguments to patch() configure the AsyncMock it creates.
+        with patch(
+            "app.services.crawler.fetch_daily",
+            new_callable=AsyncMock,
+            return_value=fetched,
+        ):
+            outcome = await crawl_daily(db_session, "2024-01-15")
+
+        assert outcome["status"] == "success"
+        assert outcome["new"] == 1
+        assert outcome["found"] == 1
+
+    @pytest.mark.asyncio
+    async def test_failure_returns_failed(self, db_session):
+        """A fetch error gives status ``failed`` and carries the error text."""
+        fetch_error = ConnectionError("network error")
+        with patch(
+            "app.services.crawler.fetch_daily",
+            new_callable=AsyncMock,
+            side_effect=fetch_error,
+        ):
+            outcome = await crawl_daily(db_session, "2024-01-15")
+
+        assert outcome["status"] == "failed"
+        assert "network error" in outcome["error"]