Files
daily-paper/tests/test_summary_utils.py
T
Rain-Bus 21f16e6756 feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00

175 lines
7.1 KiB
Python

"""summary_utils 测试 — PDF 文本提取、正文裁剪、JSON 提取、meta.json 写入、prompt 构建。"""
from __future__ import annotations
import json
from unittest.mock import MagicMock, patch
import pytest
from app.services.summary_utils import (
JsonNotFoundError,
_trim_body,
build_prompt,
extract_json,
extract_pdf_text,
write_meta_json,
)
# ═══════════════════════════════════════════════════════════════════════
# _trim_body 正文裁剪
# ═══════════════════════════════════════════════════════════════════════
class TestTrimBody:
    """Checks for ``_trim_body``: section removal and optional length capping."""

    def test_removes_references_section(self):
        """The References heading is gone from the result; the intro remains."""
        trimmed = _trim_body(
            "Intro\n\nSome content here.\n\nReferences\n[1] Smith et al."
        )
        assert "References" not in trimmed
        assert "Intro" in trimmed

    def test_removes_bibliography(self):
        """A Bibliography heading is removed from the result."""
        trimmed = _trim_body("Body\n\nBibliography\n[1] Smith")
        assert "Bibliography" not in trimmed

    def test_keeps_appendix_after_references(self):
        """An appendix following the references survives; the references do not."""
        source = "Body\n\nReferences\n[1] X\n\nAppendix\nExtra content"
        trimmed = _trim_body(source)
        for kept in ("Appendix", "Extra content"):
            assert kept in trimmed
        assert "References" not in trimmed

    def test_removes_acknowledgments(self):
        """An Acknowledgments heading is removed from the result."""
        trimmed = _trim_body("Body\n\nAcknowledgments\nThanks to everyone.")
        assert "Acknowledgments" not in trimmed

    def test_max_chars_truncation(self):
        """With ``max_chars`` set, the result is no longer than that limit."""
        limit = 100
        trimmed = _trim_body("A" * 1000, max_chars=limit)
        assert len(trimmed) <= limit

    def test_no_truncation_when_none(self):
        """``max_chars=None`` leaves a 500-character input at full length."""
        source = "A" * 500
        trimmed = _trim_body(source, max_chars=None)
        assert len(trimmed) == 500
# ═══════════════════════════════════════════════════════════════════════
# extract_pdf_text
# ═══════════════════════════════════════════════════════════════════════
class TestExtractPdfText:
    """Tests for ``extract_pdf_text`` with ``pymupdf.open`` patched out."""

    def test_extracts_text_and_saves(self, tmp_path):
        """Text from the mocked page is written to a ``.txt`` file that exists."""
        pdf_path = tmp_path / "test.pdf"
        pdf_path.write_bytes(b"%PDF-fake")

        mock_page = MagicMock()
        mock_page.get_text.return_value = "Page 1 text"

        mock_doc = MagicMock()
        # Configure the preconfigured magic methods instead of replacing them:
        # a list as ``__iter__.return_value`` can be iterated any number of
        # times, whereas a ready-made iterator is exhausted after one pass.
        mock_doc.__iter__.return_value = [mock_page]
        # Usable as a context manager that yields itself and does not
        # suppress exceptions.
        mock_doc.__enter__.return_value = mock_doc
        mock_doc.__exit__.return_value = False

        with (
            patch("pymupdf.open", return_value=mock_doc),
            # Replace the trimming step with an identity function so the
            # assertion below sees the raw page text.
            patch(
                "app.services.summary_utils._trim_body", side_effect=lambda t, **kw: t
            ),
        ):
            result_path = extract_pdf_text(pdf_path)

        assert result_path.suffix == ".txt"
        assert result_path.exists()
        # Explicit encoding, matching the other file reads/writes in this
        # module, so the check does not depend on the platform locale.
        assert "Page 1 text" in result_path.read_text(encoding="utf-8")

    def test_uses_cached_txt(self, tmp_path):
        """An existing sibling ``.txt`` is returned without opening the PDF."""
        pdf_path = tmp_path / "test.pdf"
        pdf_path.write_bytes(b"%PDF-fake")
        txt_path = tmp_path / "test.txt"
        txt_path.write_text("cached", encoding="utf-8")

        with patch("pymupdf.open") as open_mock:
            result = extract_pdf_text(pdf_path)

        open_mock.assert_not_called()
        assert result == txt_path
# ═══════════════════════════════════════════════════════════════════════
# write_meta_json
# ═══════════════════════════════════════════════════════════════════════
class TestWriteMetaJson:
    """Tests for ``write_meta_json``."""

    def test_writes_meta_json(self, tmp_path, sample_paper):
        """A ``meta.json`` file is produced and holds the paper's id and title.

        ``sample_paper`` is a fixture defined outside this module; the
        expected values below presumably mirror that fixture's contents.
        """

        def fake_paper_dir(aid):
            # Stand-in for ``paper_dir`` that maps an id under ``tmp_path``.
            return tmp_path / aid

        with patch("app.services.pdf_downloader.paper_dir", fake_paper_dir):
            meta_file = write_meta_json(sample_paper)

        assert meta_file.exists()
        assert meta_file.name == "meta.json"

        payload = json.loads(meta_file.read_text(encoding="utf-8"))
        assert payload["arxiv_id"] == "2401.12345"
        assert payload["title_en"] == "Test Paper Title"
# ═══════════════════════════════════════════════════════════════════════
# build_prompt
# ═══════════════════════════════════════════════════════════════════════
class TestBuildPrompt:
    """Tests for ``build_prompt`` in its "inject" and "search" variants."""

    def test_inject_mode_contains_schema(self, tmp_path):
        """The "inject" prompt mentions ``title_zh`` and the required-fields text."""
        meta_path = tmp_path / "meta"
        txt_path = tmp_path / "txt"
        prompt = build_prompt("2401.12345", meta_path, txt_path, "inject")
        assert "title_zh" in prompt
        assert "必须包含以下字段" in prompt

    def test_search_mode_contains_read_instruction(self, tmp_path):
        """The "search" prompt contains "read" (any case) and ``title_zh``."""
        meta_path = tmp_path / "meta"
        txt_path = tmp_path / "txt"
        prompt = build_prompt("2401.12345", meta_path, txt_path, "search")
        lowered = prompt.lower()
        assert "read" in lowered
        assert "title_zh" in prompt

    def test_fix_errors_mode(self, tmp_path):
        """Errors passed via ``fix_errors`` appear in the prompt with a fix request."""
        errors = ["字段缺失"]
        prompt = build_prompt(
            "2401.12345",
            tmp_path / "meta",
            tmp_path / "txt",
            "inject",
            fix_errors=errors,
        )
        assert "字段缺失" in prompt
        assert "修正" in prompt
# ═══════════════════════════════════════════════════════════════════════
# extract_json
# ═══════════════════════════════════════════════════════════════════════
class TestExtractJson:
    """Tests for ``extract_json`` across the input shapes exercised below.

    ``sample_summary_json`` and ``sample_summary_dict`` are fixtures defined
    outside this module.
    """

    def test_direct_json(self, sample_summary_json):
        """Input that is itself the JSON document is parsed."""
        parsed = extract_json(sample_summary_json)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_fenced_code_block(self, sample_summary_json):
        """JSON inside a fenced ``json`` code block with surrounding text is found."""
        wrapped = f"some text\n```json\n{sample_summary_json}\n```\nmore text"
        parsed = extract_json(wrapped)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_brace_matching_fallback(self, sample_summary_dict):
        """A JSON object embedded mid-sentence, without a fence, is still found."""
        embedded = json.dumps(sample_summary_dict, ensure_ascii=False)
        wrapped = f"Here is the result: {embedded} end."
        parsed = extract_json(wrapped)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_no_json_raises(self):
        """Text with no JSON at all raises ``JsonNotFoundError``."""
        no_json = "plain text no json here at all"
        with pytest.raises(JsonNotFoundError):
            extract_json(no_json)