# daily-paper/tests/test_image_extractor.py

"""LaTeX 图片提取测试 — 从 .tex 源码中提取图片文件。"""

from __future__ import annotations

import pytest


# ═══════════════════════════════════════════════════════════════════════
# Image Extraction
# ═══════════════════════════════════════════════════════════════════════


class TestImageExtraction:
    """Tests for extracting image files referenced by LaTeX sources.

    Every test redirects the extractor's working directories into pytest's
    ``tmp_path`` and replaces the source download with a no-op coroutine, so
    no test touches the network or the real data directories.
    """

    @staticmethod
    def _isolate_extractor(monkeypatch, tmp_path):
        """Point the extractor at ``tmp_path`` and stub out the source download.

        Patches ``tmp_dir``, ``paper_dir`` and ``download_source_zip`` on
        ``app.services.image_extractor`` so that paths resolve to
        ``tmp_path / "tmp" / <id>`` and ``tmp_path / "papers" / <id>``.
        ``monkeypatch`` undoes the patches when the test finishes.
        """
        monkeypatch.setattr("app.services.image_extractor.tmp_dir", lambda x: tmp_path / "tmp" / x)
        monkeypatch.setattr("app.services.image_extractor.paper_dir", lambda x: tmp_path / "papers" / x)

        # Stand-in for download_source_zip: accepts any arguments, does nothing.
        async def _noop_download(*args, **kwargs):
            pass

        monkeypatch.setattr("app.services.image_extractor.download_source_zip", _noop_download)

    @pytest.mark.asyncio
    async def test_extract_images_from_source_no_dir(self, monkeypatch, tmp_path):
        """Return 0 when the source directory does not exist."""
        monkeypatch.setattr("app.services.pdf_downloader.tmp_dir", lambda x: tmp_path / "tmp" / x)
        monkeypatch.setattr("app.services.pdf_downloader.paper_dir", lambda x: tmp_path / "papers" / x)
        # The sibling tests patch these names on image_extractor itself, so
        # patching only pdf_downloader presumably left the extractor using the
        # real directories and downloader (NOTE(review): confirm against the
        # extractor's imports). Apply the same isolation here.
        self._isolate_extractor(monkeypatch, tmp_path)

        from app.services.image_extractor import extract_images_from_source

        # Nothing was created under tmp_path / "tmp" / "2401.99999".
        result = await extract_images_from_source("2401.99999")
        assert result == 0

    @pytest.mark.asyncio
    async def test_extract_images_from_tex(self, monkeypatch, tmp_path):
        """Extract the images referenced by a .tex file."""
        from app.services.image_extractor import extract_images_from_source

        tmp_source = tmp_path / "tmp" / "2401.00001" / "source"
        tmp_source.mkdir(parents=True)

        # Two images exist on disk; the .tex below also references a third
        # one (figs/nonexistent.pdf) that is deliberately missing.
        images_dir = tmp_source / "figs"
        images_dir.mkdir()
        (images_dir / "figure1.png").write_bytes(b"\x89PNG\r\n")
        (images_dir / "figure2.jpg").write_bytes(b"\xff\xd8\xff\xe0")

        # Create the .tex file.
        tex_content = r"""
\documentclass{article}
\begin{document}
\begin{figure}
  \includegraphics[width=0.8\textwidth]{figs/figure1.png}
  \includegraphics{figs/figure2.jpg}
  \includegraphics[angle=90]{figs/nonexistent.pdf}
\end{figure}
\end{document}
"""
        (tmp_source / "main.tex").write_text(tex_content, encoding="utf-8")

        # Source dir already exists, so the download stub avoids a real network call.
        self._isolate_extractor(monkeypatch, tmp_path)

        result = await extract_images_from_source("2401.00001")

        # Only the two images that exist are counted and copied.
        assert result == 2
        dest_images = tmp_path / "papers" / "2401.00001" / "images"
        assert dest_images.exists()
        assert (dest_images / "figure1.png").exists()
        assert (dest_images / "figure2.jpg").exists()
        assert not (dest_images / "nonexistent.pdf").exists()

    @pytest.mark.asyncio
    async def test_extract_images_empty_tex(self, monkeypatch, tmp_path):
        """Return 0 when the .tex file references no images."""
        from app.services.image_extractor import extract_images_from_source

        tmp_source = tmp_path / "tmp" / "2401.00002" / "source"
        tmp_source.mkdir(parents=True)
        (tmp_source / "main.tex").write_text(
            r"\documentclass{article}\begin{document}Hello\end{document}",
            encoding="utf-8",
        )

        # Stub the download to avoid a real network call.
        self._isolate_extractor(monkeypatch, tmp_path)

        result = await extract_images_from_source("2401.00002")
        assert result == 0