"""summary_utils 测试 — PDF 文本提取、正文裁剪、JSON 提取、meta.json 写入、prompt 构建。""" from __future__ import annotations import json from unittest.mock import MagicMock, patch import pytest from app.services.summary_utils import ( JsonNotFoundError, _trim_body, build_prompt, extract_json, extract_pdf_text, write_meta_json, ) # ═══════════════════════════════════════════════════════════════════════ # _trim_body 正文裁剪 # ═══════════════════════════════════════════════════════════════════════ class TestTrimBody: def test_removes_references_section(self): text = "Intro\n\nSome content here.\n\nReferences\n[1] Smith et al." result = _trim_body(text) assert "References" not in result assert "Intro" in result def test_removes_bibliography(self): text = "Body\n\nBibliography\n[1] Smith" result = _trim_body(text) assert "Bibliography" not in result def test_keeps_appendix_after_references(self): text = "Body\n\nReferences\n[1] X\n\nAppendix\nExtra content" result = _trim_body(text) assert "Appendix" in result assert "Extra content" in result assert "References" not in result def test_removes_acknowledgments(self): text = "Body\n\nAcknowledgments\nThanks to everyone." result = _trim_body(text) assert "Acknowledgments" not in result def test_max_chars_truncation(self): text = "A" * 1000 result = _trim_body(text, max_chars=100) assert len(result) <= 100 def test_no_truncation_when_none(self): text = "A" * 500 result = _trim_body(text, max_chars=None) assert len(result) == 500 # ═══════════════════════════════════════════════════════════════════════ # extract_pdf_text # ═══════════════════════════════════════════════════════════════════════ class TestExtractPdfText: def test_extracts_text_and_saves(self, tmp_path): pdf_path = tmp_path / "test.pdf" pdf_path.write_bytes(b"%PDF-fake") mock_page = MagicMock() mock_page.get_text.return_value = "Page 1 text" mock_doc = MagicMock() mock_doc.__iter__ = MagicMock(return_value=iter([mock_page])) mock_doc.__enter__ = MagicMock(return_value=mock_doc) mock_doc.__exit__ = MagicMock(return_value=False) with ( patch("pymupdf.open", return_value=mock_doc), patch( "app.services.summary_utils._trim_body", side_effect=lambda t, **kw: t ), ): result_path = extract_pdf_text(pdf_path) assert result_path.suffix == ".txt" assert result_path.exists() assert "Page 1 text" in result_path.read_text() def test_uses_cached_txt(self, tmp_path): pdf_path = tmp_path / "test.pdf" pdf_path.write_bytes(b"%PDF-fake") txt_path = tmp_path / "test.txt" txt_path.write_text("cached", encoding="utf-8") with patch("pymupdf.open") as mock_open: result = extract_pdf_text(pdf_path) mock_open.assert_not_called() assert result == txt_path # ═══════════════════════════════════════════════════════════════════════ # write_meta_json # ═══════════════════════════════════════════════════════════════════════ class TestWriteMetaJson: def test_writes_meta_json(self, tmp_path, sample_paper): with patch("app.services.pdf_downloader.paper_dir", lambda aid: tmp_path / aid): result = write_meta_json(sample_paper) assert result.exists() assert result.name == "meta.json" data = json.loads(result.read_text(encoding="utf-8")) assert data["arxiv_id"] == "2401.12345" assert data["title_en"] == "Test Paper Title" # ═══════════════════════════════════════════════════════════════════════ # build_prompt # ═══════════════════════════════════════════════════════════════════════ class TestBuildPrompt: def test_inject_mode_contains_schema(self, tmp_path): prompt = build_prompt( "2401.12345", tmp_path / "meta", tmp_path / "txt", "inject" ) assert "title_zh" in prompt assert "必须包含以下字段" in prompt def test_search_mode_contains_read_instruction(self, tmp_path): prompt = build_prompt( "2401.12345", tmp_path / "meta", tmp_path / "txt", "search" ) assert "read" in prompt.lower() assert "title_zh" in prompt def test_fix_errors_mode(self, tmp_path): prompt = build_prompt( "2401.12345", tmp_path / "meta", tmp_path / "txt", "inject", fix_errors=["字段缺失"], ) assert "字段缺失" in prompt assert "修正" in prompt # ═══════════════════════════════════════════════════════════════════════ # extract_json # ═══════════════════════════════════════════════════════════════════════ class TestExtractJson: def test_direct_json(self, sample_summary_json): result = extract_json(sample_summary_json) assert result["title_zh"] == "测试论文中文标题" def test_fenced_code_block(self, sample_summary_json): raw = f"some text\n```json\n{sample_summary_json}\n```\nmore text" result = extract_json(raw) assert result["title_zh"] == "测试论文中文标题" def test_brace_matching_fallback(self, sample_summary_dict): json_str = json.dumps(sample_summary_dict, ensure_ascii=False) raw = f"Here is the result: {json_str} end." result = extract_json(raw) assert result["title_zh"] == "测试论文中文标题" def test_no_json_raises(self): with pytest.raises(JsonNotFoundError): extract_json("plain text no json here at all")