# daily-paper/tests/test_summary_utils.py

"""summary_utils 测试 — PDF 文本提取、正文裁剪、JSON 提取、meta.json 写入、prompt 构建。"""

from __future__ import annotations

import json
from unittest.mock import MagicMock, patch

import pytest

from app.services.summary_utils import (
    JsonNotFoundError,
    _trim_body,
    build_prompt,
    extract_json,
    extract_pdf_text,
    write_meta_json,
)


# ═══════════════════════════════════════════════════════════════════════
# _trim_body 正文裁剪
# ═══════════════════════════════════════════════════════════════════════


class TestTrimBody:
    """Checks for ``_trim_body``: which sections it drops and how it caps length."""

    def test_removes_references_section(self):
        """The "References" heading disappears while the intro text remains."""
        paper = "Intro\n\nSome content here.\n\nReferences\n[1] Smith et al."
        trimmed = _trim_body(paper)
        assert "Intro" in trimmed
        assert "References" not in trimmed

    def test_removes_bibliography(self):
        """The "Bibliography" heading is absent from the trimmed output."""
        trimmed = _trim_body("Body\n\nBibliography\n[1] Smith")
        assert "Bibliography" not in trimmed

    def test_keeps_appendix_after_references(self):
        """An appendix that follows the references is kept in the output."""
        paper = "Body\n\nReferences\n[1] X\n\nAppendix\nExtra content"
        trimmed = _trim_body(paper)
        assert "References" not in trimmed
        for kept in ("Appendix", "Extra content"):
            assert kept in trimmed

    def test_removes_acknowledgments(self):
        """The "Acknowledgments" heading is absent from the trimmed output."""
        trimmed = _trim_body("Body\n\nAcknowledgments\nThanks to everyone.")
        assert "Acknowledgments" not in trimmed

    def test_max_chars_truncation(self):
        """With ``max_chars`` set, the output is no longer than that limit."""
        limit = 100
        trimmed = _trim_body("A" * 1000, max_chars=limit)
        assert len(trimmed) <= limit

    def test_no_truncation_when_none(self):
        """``max_chars=None`` leaves a plain 500-character body at full length."""
        filler = "A" * 500
        assert len(_trim_body(filler, max_chars=None)) == 500


# ═══════════════════════════════════════════════════════════════════════
# extract_pdf_text
# ═══════════════════════════════════════════════════════════════════════


class TestExtractPdfText:
    """Tests for ``extract_pdf_text`` with ``pymupdf.open`` replaced by mocks."""

    def test_extracts_text_and_saves(self, tmp_path):
        """Text from the mocked page ends up in an existing ``.txt`` file."""
        pdf_path = tmp_path / "test.pdf"
        pdf_path.write_bytes(b"%PDF-fake")

        mock_page = MagicMock()
        mock_page.get_text.return_value = "Page 1 text"
        mock_doc = MagicMock()
        # Give the pre-configured ``__iter__`` a list instead of a one-shot
        # iterator: MagicMock then builds a fresh iterator on every call, so
        # the fake document still yields its page if it is iterated twice.
        mock_doc.__iter__.return_value = [mock_page]
        # Support ``with pymupdf.open(...) as doc`` as well as direct use.
        mock_doc.__enter__.return_value = mock_doc
        mock_doc.__exit__.return_value = False

        with (
            patch("pymupdf.open", return_value=mock_doc),
            # Bypass trimming so the assertion sees the raw page text.
            patch(
                "app.services.summary_utils._trim_body", side_effect=lambda t, **kw: t
            ),
        ):
            result_path = extract_pdf_text(pdf_path)

        assert result_path.suffix == ".txt"
        assert result_path.exists()
        # Decode explicitly instead of relying on the locale default; the
        # asserted text is ASCII, so it matches under any ASCII-compatible
        # encoding the writer may have used (presumably UTF-8 — confirm).
        assert "Page 1 text" in result_path.read_text(encoding="utf-8")

    def test_uses_cached_txt(self, tmp_path):
        """An existing sibling ``.txt`` is returned without opening the PDF."""
        pdf_path = tmp_path / "test.pdf"
        pdf_path.write_bytes(b"%PDF-fake")
        txt_path = tmp_path / "test.txt"
        txt_path.write_text("cached", encoding="utf-8")

        with patch("pymupdf.open") as mock_open:
            result = extract_pdf_text(pdf_path)

        mock_open.assert_not_called()
        assert result == txt_path


# ═══════════════════════════════════════════════════════════════════════
# write_meta_json
# ═══════════════════════════════════════════════════════════════════════


class TestWriteMetaJson:
    """Tests for ``write_meta_json``."""

    def test_writes_meta_json(self, tmp_path, sample_paper):
        """A ``meta.json`` file is written and holds the paper's id and title."""

        def fake_paper_dir(aid):
            # Redirect per-paper directories into the pytest temp directory.
            return tmp_path / aid

        with patch("app.services.pdf_downloader.paper_dir", fake_paper_dir):
            meta_path = write_meta_json(sample_paper)

        assert meta_path.exists()
        assert meta_path.name == "meta.json"

        payload = json.loads(meta_path.read_text(encoding="utf-8"))
        expected_fields = {
            "arxiv_id": "2401.12345",
            "title_en": "Test Paper Title",
        }
        for key, value in expected_fields.items():
            assert payload[key] == value


# ═══════════════════════════════════════════════════════════════════════
# build_prompt
# ═══════════════════════════════════════════════════════════════════════


class TestBuildPrompt:
    """Tests for ``build_prompt`` in its "inject" and "search" modes."""

    def test_inject_mode_contains_schema(self, tmp_path):
        """The inject-mode prompt names the schema field and its required-fields text."""
        meta_file, text_file = tmp_path / "meta", tmp_path / "txt"
        prompt = build_prompt("2401.12345", meta_file, text_file, "inject")
        for needle in ("title_zh", "必须包含以下字段"):
            assert needle in prompt

    def test_search_mode_contains_read_instruction(self, tmp_path):
        """The search-mode prompt mentions "read" (any case) and the schema field."""
        meta_file, text_file = tmp_path / "meta", tmp_path / "txt"
        prompt = build_prompt("2401.12345", meta_file, text_file, "search")
        lowered = prompt.lower()
        assert "read" in lowered
        assert "title_zh" in prompt

    def test_fix_errors_mode(self, tmp_path):
        """Supplied ``fix_errors`` entries are echoed alongside a correction request."""
        errors = ["字段缺失"]
        prompt = build_prompt(
            "2401.12345",
            tmp_path / "meta",
            tmp_path / "txt",
            "inject",
            fix_errors=errors,
        )
        for needle in ("字段缺失", "修正"):
            assert needle in prompt


# ═══════════════════════════════════════════════════════════════════════
# extract_json
# ═══════════════════════════════════════════════════════════════════════


class TestExtractJson:
    """Tests for ``extract_json`` across differently wrapped inputs."""

    def test_direct_json(self, sample_summary_json):
        """The fixture string passed as-is yields the expected ``title_zh``."""
        parsed = extract_json(sample_summary_json)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_fenced_code_block(self, sample_summary_json):
        """JSON inside a fenced ``json`` block with text around it is extracted."""
        wrapped = f"some text\n```json\n{sample_summary_json}\n```\nmore text"
        parsed = extract_json(wrapped)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_brace_matching_fallback(self, sample_summary_dict):
        """An unfenced JSON object embedded in a sentence is still extracted."""
        serialized = json.dumps(sample_summary_dict, ensure_ascii=False)
        wrapped = f"Here is the result: {serialized} end."
        parsed = extract_json(wrapped)
        assert parsed["title_zh"] == "测试论文中文标题"

    def test_no_json_raises(self):
        """Input containing no JSON raises ``JsonNotFoundError``."""
        junk = "plain text no json here at all"
        with pytest.raises(JsonNotFoundError):
            extract_json(junk)