21f16e6756
- Split summarizer into summary_generator and summary_persister modules - Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection - Add layout_detector service for PicoDet-S_layout_3cls integration - Add exceptions module with ConflictError and NotFoundError - Improve admin dashboard with better statistics and task management - Add design review document with system optimization suggestions - Add new tests for crawler, pdf_downloader, pipeline, and summary_utils - Update dependencies and configuration - Clean up dead code and improve error handling
175 lines
7.1 KiB
Python
175 lines
7.1 KiB
Python
"""summary_utils 测试 — PDF 文本提取、正文裁剪、JSON 提取、meta.json 写入、prompt 构建。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from app.services.summary_utils import (
|
|
JsonNotFoundError,
|
|
_trim_body,
|
|
build_prompt,
|
|
extract_json,
|
|
extract_pdf_text,
|
|
write_meta_json,
|
|
)
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# _trim_body 正文裁剪
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestTrimBody:
    """Expectations for ``_trim_body``: which headings disappear and how length is capped."""

    def test_removes_references_section(self):
        """The word ``References`` must be gone while the intro text is kept."""
        trimmed = _trim_body(
            "Intro\n\nSome content here.\n\nReferences\n[1] Smith et al."
        )
        assert "References" not in trimmed
        assert "Intro" in trimmed

    def test_removes_bibliography(self):
        """The word ``Bibliography`` must not appear in the trimmed output."""
        trimmed = _trim_body("Body\n\nBibliography\n[1] Smith")
        assert "Bibliography" not in trimmed

    def test_keeps_appendix_after_references(self):
        """An appendix that follows the references must survive the trim."""
        source = "Body\n\nReferences\n[1] X\n\nAppendix\nExtra content"
        trimmed = _trim_body(source)
        # Both the appendix heading and its content are expected to remain.
        for kept in ("Appendix", "Extra content"):
            assert kept in trimmed
        assert "References" not in trimmed

    def test_removes_acknowledgments(self):
        """The word ``Acknowledgments`` must not appear in the trimmed output."""
        trimmed = _trim_body("Body\n\nAcknowledgments\nThanks to everyone.")
        assert "Acknowledgments" not in trimmed

    def test_max_chars_truncation(self):
        """With ``max_chars=100`` the result may not be longer than 100 characters."""
        long_text = "A" * 1000
        trimmed = _trim_body(long_text, max_chars=100)
        assert len(trimmed) <= 100

    def test_no_truncation_when_none(self):
        """With ``max_chars=None`` a 500-character input keeps its full length."""
        plain_text = "A" * 500
        trimmed = _trim_body(plain_text, max_chars=None)
        assert len(trimmed) == 500
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# extract_pdf_text
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestExtractPdfText:
    """Tests for ``extract_pdf_text`` with ``pymupdf.open`` patched out."""

    def test_extracts_text_and_saves(self, tmp_path):
        """A ``.txt`` file must be produced and contain the mocked page text."""
        pdf_path = tmp_path / "test.pdf"
        pdf_path.write_bytes(b"%PDF-fake")

        mock_page = MagicMock()
        mock_page.get_text.return_value = "Page 1 text"

        mock_doc = MagicMock()
        # Configure the pre-wired magic methods instead of replacing them.
        # A list return value (rather than a one-shot iterator) means the
        # fake document yields its page on every iteration, not just the first.
        mock_doc.__iter__.return_value = [mock_page]
        # Make ``with pymupdf.open(...) as doc`` bind the same fake document.
        mock_doc.__enter__.return_value = mock_doc
        mock_doc.__exit__.return_value = False

        with (
            patch("pymupdf.open", return_value=mock_doc),
            # Replace trimming with a pass-through so only extraction is tested.
            patch(
                "app.services.summary_utils._trim_body", side_effect=lambda t, **kw: t
            ),
        ):
            result_path = extract_pdf_text(pdf_path)

        assert result_path.suffix == ".txt"
        assert result_path.exists()
        # Read with an explicit encoding so the check does not depend on the
        # platform locale; UTF-8 matches the cached-text fixture below.
        assert "Page 1 text" in result_path.read_text(encoding="utf-8")

    def test_uses_cached_txt(self, tmp_path):
        """If the sibling ``.txt`` already exists, the PDF must not be opened."""
        pdf_path = tmp_path / "test.pdf"
        pdf_path.write_bytes(b"%PDF-fake")
        txt_path = tmp_path / "test.txt"
        txt_path.write_text("cached", encoding="utf-8")

        with patch("pymupdf.open") as mock_open:
            result = extract_pdf_text(pdf_path)

        mock_open.assert_not_called()
        assert result == txt_path
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# write_meta_json
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestWriteMetaJson:
    """Tests for ``write_meta_json``."""

    def test_writes_meta_json(self, tmp_path, sample_paper):
        """The written ``meta.json`` must exist and carry the paper's id and title."""

        def fake_paper_dir(aid):
            # Stand-in for ``paper_dir`` that points into the pytest tmp dir.
            return tmp_path / aid

        with patch("app.services.pdf_downloader.paper_dir", fake_paper_dir):
            meta_path = write_meta_json(sample_paper)

        assert meta_path.exists()
        assert meta_path.name == "meta.json"

        payload = json.loads(meta_path.read_text(encoding="utf-8"))
        assert payload["arxiv_id"] == "2401.12345"
        assert payload["title_en"] == "Test Paper Title"
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# build_prompt
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestBuildPrompt:
    """Tests for the text returned by ``build_prompt`` in each mode."""

    def test_inject_mode_contains_schema(self, tmp_path):
        """In ``inject`` mode the prompt must mention the schema field and its header."""
        meta_file = tmp_path / "meta"
        text_file = tmp_path / "txt"
        rendered = build_prompt("2401.12345", meta_file, text_file, "inject")
        for fragment in ("title_zh", "必须包含以下字段"):
            assert fragment in rendered

    def test_search_mode_contains_read_instruction(self, tmp_path):
        """In ``search`` mode the prompt must contain "read" (any case) and ``title_zh``."""
        meta_file = tmp_path / "meta"
        text_file = tmp_path / "txt"
        rendered = build_prompt("2401.12345", meta_file, text_file, "search")
        assert "read" in rendered.lower()
        assert "title_zh" in rendered

    def test_fix_errors_mode(self, tmp_path):
        """Passing ``fix_errors`` must surface the error text and a correction cue."""
        reported_errors = ["字段缺失"]
        rendered = build_prompt(
            "2401.12345",
            tmp_path / "meta",
            tmp_path / "txt",
            "inject",
            fix_errors=reported_errors,
        )
        for fragment in ("字段缺失", "修正"):
            assert fragment in rendered
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
# extract_json
|
|
# ═══════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
class TestExtractJson:
    """Tests for ``extract_json`` across bare, fenced, and embedded JSON input."""

    # Value every successful extraction in this class is compared against.
    _EXPECTED_TITLE_ZH = "测试论文中文标题"

    def test_direct_json(self, sample_summary_json):
        """A bare JSON string is parsed as-is."""
        parsed = extract_json(sample_summary_json)
        assert parsed["title_zh"] == self._EXPECTED_TITLE_ZH

    def test_fenced_code_block(self, sample_summary_json):
        """JSON inside a fenced ``json`` block surrounded by other text is found."""
        wrapped = f"some text\n```json\n{sample_summary_json}\n```\nmore text"
        parsed = extract_json(wrapped)
        assert parsed["title_zh"] == self._EXPECTED_TITLE_ZH

    def test_brace_matching_fallback(self, sample_summary_dict):
        """A JSON object embedded inline in a sentence is still extracted."""
        json_str = json.dumps(sample_summary_dict, ensure_ascii=False)
        wrapped = f"Here is the result: {json_str} end."
        parsed = extract_json(wrapped)
        assert parsed["title_zh"] == self._EXPECTED_TITLE_ZH

    def test_no_json_raises(self):
        """Input with no JSON at all must raise ``JsonNotFoundError``."""
        with pytest.raises(JsonNotFoundError):
            extract_json("plain text no json here at all")
|