# daily-paper/app/services/pdf_downloader.py

"""PDF 下载 — 从 arXiv 下载论文 PDF。"""

from __future__ import annotations

import logging
import shutil
from pathlib import Path

from app.utils import PAPERS_DIR, TMP_DIR, make_http_client

logger = logging.getLogger(__name__)


# ── 自定义异常 ──────────────────────────────────────────────────────────


class PdfDownloadError(Exception):
    """Signal that a paper's PDF could not be downloaded."""


# ── 路径工具 ────────────────────────────────────────────────────────────


def paper_dir(arxiv_id: str) -> Path:
    """Return the directory under ``PAPERS_DIR`` that belongs to *arxiv_id*.

    Only the path is computed; nothing is created on disk.
    """
    return PAPERS_DIR.joinpath(arxiv_id)


def tmp_dir(arxiv_id: str) -> Path:
    """Return the directory under ``TMP_DIR`` that belongs to *arxiv_id*.

    Only the path is computed; nothing is created on disk.
    """
    return TMP_DIR.joinpath(arxiv_id)


# ── PDF 下载 ────────────────────────────────────────────────────────────


async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
    """Download a paper's PDF to ``tmp_dir(arxiv_id) / "paper.pdf"``.

    Args:
        arxiv_id: Identifier used to select the temporary directory.
        pdf_url: URL to fetch; the HTTP client is asked to follow redirects.

    Returns:
        Path of the written ``paper.pdf``.

    Raises:
        PdfDownloadError: If ``pdf_url`` is empty, the target directory cannot
            be created, the request fails or returns an error status, the
            response body does not look like a PDF, or the file cannot be
            written.
    """
    if not pdf_url:
        raise PdfDownloadError(f"no pdf_url for {arxiv_id}")

    dest_dir = tmp_dir(arxiv_id)
    dest = dest_dir / "paper.pdf"

    try:
        # Directory creation lives inside the try so filesystem errors surface
        # as PdfDownloadError, like every other failure of this function.
        dest_dir.mkdir(parents=True, exist_ok=True)

        async with make_http_client(follow_redirects=True) as client:
            resp = await client.get(pdf_url)
            resp.raise_for_status()
            payload = resp.content

        # A successful status can still carry an empty body or an HTML error
        # page. Readers look for the "%PDF-" marker near the start of the file,
        # so reject anything that lacks it instead of saving a bogus paper.pdf.
        if b"%PDF-" not in bytes(payload[:1024]):
            raise PdfDownloadError(
                f"response for {arxiv_id} is not a PDF ({len(payload)} bytes)"
            )

        dest.write_bytes(payload)
    except PdfDownloadError:
        # Already carries a precise message; do not wrap it a second time.
        raise
    except Exception as exc:
        raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc

    # Report the size from memory rather than re-stat'ing the file, which could
    # raise outside the error wrapper above.
    logger.info("Downloaded PDF: %s (%d bytes)", arxiv_id, len(payload))
    return dest


# ── 临时文件清理 ────────────────────────────────────────────────────────


def cleanup_tmp(arxiv_id: str) -> None:
    """Remove the temporary directory for *arxiv_id*, if it exists.

    Best-effort: a failure to delete is logged as a warning (with traceback)
    and is not propagated to the caller.
    """
    scratch = tmp_dir(arxiv_id)
    if not scratch.exists():
        return

    try:
        shutil.rmtree(scratch)
        logger.debug("Cleaned tmp: %s", arxiv_id)
    except Exception:
        # Cleanup must not break the caller; record the failure and move on.
        logger.warning("Failed to clean tmp for %s", arxiv_id, exc_info=True)