refactor: split monolithic phase tests into per-module test files

- rename test_admin_phase4.py -> test_admin.py, test_search.py -> test_searcher.py
- split test_phase5.py into test_cleaner, test_embedder, test_image_extractor, test_pages
- move schema tests from test_summarizer.py into dedicated test_schemas.py
- add sample_papers_range and sample_papers_with_summary fixtures in conftest
- update .gitignore to exclude all of data/
This commit is contained in:
2026-06-06 00:34:30 +08:00
parent 85c4cfb9e8
commit f7f1a4c0cb
11 changed files with 1245 additions and 1249 deletions
+7 -239
View File
@@ -1,15 +1,13 @@
"""AI 总结服务测试 — Mock 全链路,不调用真实 pi"""
"""AI 总结服务测试 — summarize_one 状态流转、批量处理、DB 更新、文件操作"""
from __future__ import annotations
import asyncio
import json
from datetime import date, datetime, timezone
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import AsyncMock, patch
import pytest
from pydantic import ValidationError
from sqlalchemy import text
from app.models import (
@@ -20,193 +18,19 @@ from app.models import (
SummaryStatus,
TaskLock,
)
from app.services.schemas import (
SummarySchema,
assess_quality,
classify_validation_error,
flatten_for_db,
from app.services.pdf_downloader import (
PdfDownloadError,
cleanup_tmp as _cleanup_tmp,
)
from app.services.pi_client import PiTimeoutError
from app.services.schemas import SummarySchema
from app.services.summarizer import (
_classify_error,
_save_files,
_save_raw_output_only,
_update_summary_in_db,
summarize_batch,
summarize_one,
summarize_single,
)
from app.services.pi_client import (
JsonNotFoundError,
PiProcessError,
PiTimeoutError,
call_pi as _call_pi,
extract_json as _extract_json,
)
from app.services.pdf_downloader import (
PdfDownloadError,
cleanup_tmp as _cleanup_tmp,
)
# ═══════════════════════════════════════════════════════════════════════
# Schema 校验测试
# ═══════════════════════════════════════════════════════════════════════
class TestSummarySchema:
    """Validation behaviour of the ``SummarySchema`` Pydantic model."""

    def test_valid_summary(self, sample_summary_dict):
        """A complete sample payload validates and exposes its parsed fields."""
        parsed = SummarySchema.model_validate(sample_summary_dict)
        assert parsed.title_zh == "测试论文中文标题"
        assert len(parsed.tags) == 3
        assert parsed.motivation.problem

    def test_missing_title_zh(self, sample_summary_dict):
        """Without ``title_zh`` validation fails and is classified ``field_missing``."""
        sample_summary_dict.pop("title_zh")
        with pytest.raises(ValidationError) as exc_info:
            SummarySchema.model_validate(sample_summary_dict)
        assert classify_validation_error(exc_info.value) == "field_missing"

    def test_empty_one_line(self, sample_summary_dict):
        """An empty ``one_line`` is rejected."""
        sample_summary_dict["one_line"] = ""
        with pytest.raises(ValidationError):
            SummarySchema.model_validate(sample_summary_dict)

    def test_empty_tags(self, sample_summary_dict):
        """An empty ``tags`` list is rejected."""
        sample_summary_dict["tags"] = []
        with pytest.raises(ValidationError):
            SummarySchema.model_validate(sample_summary_dict)

    def test_empty_motivation_problem(self, sample_summary_dict):
        """An empty ``motivation.problem`` is rejected."""
        motivation = sample_summary_dict["motivation"]
        motivation["problem"] = ""
        with pytest.raises(ValidationError):
            SummarySchema.model_validate(sample_summary_dict)

    def test_empty_method_key_idea(self, sample_summary_dict):
        """An empty ``method.key_idea`` is rejected."""
        method = sample_summary_dict["method"]
        method["key_idea"] = ""
        with pytest.raises(ValidationError):
            SummarySchema.model_validate(sample_summary_dict)

    def test_extra_fields_ignored(self, sample_summary_dict):
        """Unknown keys do not break validation; ``figures`` is not exposed."""
        sample_summary_dict["figures"] = ["fig1.png"]
        sample_summary_dict["takeaway"] = "important paper"
        parsed = SummarySchema.model_validate(sample_summary_dict)
        assert not hasattr(parsed, "figures")
        assert parsed.title_zh  # known fields still parse normally

    def test_flatten_for_db(self, sample_summary_dict):
        """``flatten_for_db`` yields flat columns mirroring the nested schema."""
        parsed = SummarySchema.model_validate(sample_summary_dict)
        row = flatten_for_db(parsed)
        assert row["one_line"] == parsed.one_line
        assert row["motivation_problem"] == parsed.motivation.problem
        assert row["method_key_idea"] == parsed.method.key_idea
        for column in ("full_json", "updated_at"):
            assert column in row
        # The *_json columns must contain parseable JSON text.
        prerequisites = json.loads(row["prerequisites_json"])
        method_steps = json.loads(row["method_steps_json"])
        assert isinstance(prerequisites, dict)
        assert isinstance(method_steps, list)
class TestQualityAssessment:
    """Quality grades returned by ``assess_quality`` for validated summaries."""

    def test_quality_normal(self, sample_summary_dict):
        """The unmodified sample payload is graded ``normal``."""
        parsed = SummarySchema.model_validate(sample_summary_dict)
        grade = assess_quality(parsed)
        assert grade == "normal"

    def test_quality_degraded_missing_goal(self, sample_summary_dict):
        """Blank goal/gap/overview and no main findings are graded ``degraded``."""
        motivation = sample_summary_dict["motivation"]
        motivation["goal"] = ""
        motivation["gap"] = ""
        sample_summary_dict["method"]["overview"] = ""
        sample_summary_dict["results"]["main_findings"] = []
        parsed = SummarySchema.model_validate(sample_summary_dict)
        grade = assess_quality(parsed)
        assert grade == "degraded"

    def test_quality_low_short_one_line(self, sample_summary_dict):
        """A too-short ``one_line`` is graded ``low``."""
        # NOTE(review): TestSummarySchema.test_empty_one_line expects an empty
        # one_line to be rejected by validation — confirm this input can
        # actually reach assess_quality, or use a short non-empty value.
        sample_summary_dict["one_line"] = ""
        parsed = SummarySchema.model_validate(sample_summary_dict)
        grade = assess_quality(parsed)
        assert grade == "low"

    def test_quality_low_short_key_idea(self, sample_summary_dict):
        """A too-short ``method.key_idea`` is graded ``low``."""
        # NOTE(review): TestSummarySchema.test_empty_method_key_idea expects an
        # empty key_idea to be rejected by validation — confirm this input can
        # actually reach assess_quality, or use a short non-empty value.
        method = sample_summary_dict["method"]
        method["key_idea"] = ""
        parsed = SummarySchema.model_validate(sample_summary_dict)
        grade = assess_quality(parsed)
        assert grade == "low"
# ═══════════════════════════════════════════════════════════════════════
# JSON 提取测试
# ═══════════════════════════════════════════════════════════════════════
class TestJsonExtraction:
    """Extraction of the JSON payload from raw pi output via ``_extract_json``."""

    def test_direct_json(self, sample_summary_json):
        """Output that is itself a JSON document is parsed directly."""
        parsed = _extract_json(sample_summary_json)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_fenced_code_block(self, sample_summary_json):
        """JSON inside a ```json fenced block surrounded by text is found."""
        wrapped = f"一些文字\n```json\n{sample_summary_json}\n```\n更多文字"
        parsed = _extract_json(wrapped)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_fenced_without_lang(self, sample_summary_json):
        """JSON inside a fenced block without a language tag is found."""
        wrapped = f"文字\n```\n{sample_summary_json}\n```"
        parsed = _extract_json(wrapped)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_embedded_braces(self, sample_summary_dict):
        """A JSON object embedded in plain prose, with no fence, is found."""
        json_str = json.dumps(sample_summary_dict, ensure_ascii=False)
        wrapped = f"Here is the summary:\n{json_str}\nEnd."
        parsed = _extract_json(wrapped)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_no_json_raises(self):
        """Text without any JSON raises ``JsonNotFoundError``."""
        with pytest.raises(JsonNotFoundError):
            _extract_json("No JSON here at all.")

    def test_json_without_title_zh_falls_through(self):
        """JSON lacking ``title_zh`` is not the payload we are looking for."""
        payload = json.dumps({"other": "data"})
        # Another valid JSON block could be returned instead, but when there is
        # none the extractor goes straight to the largest block.
        # Here the input is itself a JSON dict, just without title_zh:
        # strategy 1 skips it (no title_zh), strategy 2 finds no code fence,
        # strategy 3 picks the largest block.
        parsed = _extract_json(payload)
        assert parsed == {"other": "data"}  # largest-block fallback
# ═══════════════════════════════════════════════════════════════════════
# 错误分类测试
# ═══════════════════════════════════════════════════════════════════════
class TestErrorClassification:
    """Mapping from exceptions to ``error_type`` strings via ``_classify_error``."""

    def test_pdf_download_error(self):
        """``PdfDownloadError`` maps to ``pdf_download_failed``."""
        assert _classify_error(PdfDownloadError("fail")) == "pdf_download_failed"

    def test_timeout_error(self):
        """``PiTimeoutError`` maps to ``timeout``."""
        assert _classify_error(PiTimeoutError("timeout")) == "timeout"

    def test_process_error(self):
        """``PiProcessError`` maps to ``process_error``."""
        assert _classify_error(PiProcessError(1, "stderr")) == "process_error"

    def test_json_not_found(self):
        """``JsonNotFoundError`` maps to ``json_not_found``."""
        assert _classify_error(JsonNotFoundError("not found")) == "json_not_found"

    def test_json_invalid(self):
        """``json.JSONDecodeError`` maps to ``json_invalid``."""
        assert _classify_error(json.JSONDecodeError("bad", "", 0)) == "json_invalid"

    def test_field_missing(self):
        """A pydantic ``ValidationError`` maps to ``field_missing``."""
        # pytest.raises makes the test fail if validation unexpectedly
        # succeeds, so the classification assert below can never be skipped.
        with pytest.raises(ValidationError) as exc_info:
            SummarySchema.model_validate({"title_zh": ""})  # type: ignore
        assert _classify_error(exc_info.value) == "field_missing"

    def test_unknown_error(self):
        """Any other exception type maps to ``unknown``."""
        assert _classify_error(RuntimeError("boom")) == "unknown"
# ═══════════════════════════════════════════════════════════════════════
@@ -675,59 +499,3 @@ class TestBatchSummarize:
result = await summarize_batch(db_session)
assert result["status"] == "success"
assert result["total"] == 0
# ═══════════════════════════════════════════════════════════════════════
# Admin 路由鉴权测试
# ═══════════════════════════════════════════════════════════════════════
class TestAdminAuth:
    """Admin route authentication — HTTP layer only; service calls are mocked."""

    def test_no_token_returns_401(self, client):
        """A request without a Bearer token is rejected (401 or 403 accepted)."""
        resp = client.post("/admin/summarize")
        assert resp.status_code in (401, 403)

    def test_wrong_token_returns_401(self, client):
        """A request with a wrong Bearer token is rejected with 401."""
        resp = client.post(
            "/admin/summarize",
            headers={"Authorization": "Bearer wrong-token"},
        )
        assert resp.status_code == 401

    def test_correct_token_batch(self, client, admin_headers):
        """A correct token reaches batch summarize; the service layer is mocked."""
        import app.config as config_mod

        # patch.object restores ADMIN_TOKEN on exit, even if an assert fails.
        # Presumably the value matches the admin_headers fixture — verify in conftest.
        with patch.object(config_mod.settings, "ADMIN_TOKEN", "test-admin-token-12345"):
            with patch("app.routes.admin.summarize_batch", new_callable=AsyncMock) as mock:
                mock.return_value = {"status": "success", "done": 0, "failed": 0, "total": 0}
                resp = client.post("/admin/summarize", headers=admin_headers)
                assert resp.status_code == 200
                assert resp.json()["status"] == "success"

    def test_single_paper_not_found(self, client, admin_headers):
        """Summarizing a single paper that does not exist returns 404."""
        import app.config as config_mod

        # patch.object restores ADMIN_TOKEN on exit, even if an assert fails.
        with patch.object(config_mod.settings, "ADMIN_TOKEN", "test-admin-token-12345"):
            with patch(
                "app.routes.admin.summarize_single",
                new_callable=AsyncMock,
                return_value={"status": "not_found", "arxiv_id": "nonexistent.99999"},
            ):
                resp = client.post(
                    "/admin/summarize/nonexistent.99999",
                    headers=admin_headers,
                )
                assert resp.status_code == 404