"""PDF 下载 — 从 arXiv 下载论文 PDF。""" from __future__ import annotations import logging import os import shutil from pathlib import Path import requests from app.config import settings from app.utils import PAPERS_DIR, TMP_DIR logger = logging.getLogger(__name__) # ── 自定义异常 ────────────────────────────────────────────────────────── class PdfDownloadError(Exception): pass # ── 路径工具 ──────────────────────────────────────────────────────────── def paper_dir(arxiv_id: str) -> Path: return PAPERS_DIR / arxiv_id def tmp_dir(arxiv_id: str) -> Path: return TMP_DIR / arxiv_id # ── PDF 下载 ──────────────────────────────────────────────────────────── # 复用 TCP 连接的 session _http_session: requests.Session | None = None def _get_session() -> requests.Session: global _http_session if _http_session is None: _http_session = requests.Session() _http_session.headers.update({"User-Agent": "hf-daily-papers/1.0"}) # 代理:优先 $PROXY_SERVER,其次 settings.http_proxy proxy = os.environ.get("PROXY_SERVER") if proxy: _http_session.proxies = {"http": proxy, "https": proxy} logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy) return _http_session def close_http_session() -> None: """关闭全局 HTTP Session,供应用 shutdown 时调用。""" global _http_session if _http_session is not None: _http_session.close() _http_session = None async def download_pdf(arxiv_id: str, pdf_url: str) -> Path: """下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。""" if not pdf_url: raise PdfDownloadError(f"no pdf_url for {arxiv_id}") dest_dir = tmp_dir(arxiv_id) dest_dir.mkdir(parents=True, exist_ok=True) dest = dest_dir / "paper.pdf" try: session = _get_session() resp = session.get( pdf_url, timeout=settings.PDF_DOWNLOAD_TIMEOUT, allow_redirects=True ) resp.raise_for_status() dest.write_bytes(resp.content) except Exception as exc: # 清理残留的部分文件 if dest.exists(): try: dest.unlink() except OSError: pass raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc logger.info("Downloaded PDF: %s (%d bytes)", arxiv_id, dest.stat().st_size) return dest # ── 临时文件清理 ──────────────────────────────────────────────────────── def cleanup_tmp(arxiv_id: str) -> None: """清理 data/tmp/{arxiv_id}/ 目录。""" td = tmp_dir(arxiv_id) if td.exists(): try: shutil.rmtree(td) logger.debug("Cleaned tmp: %s", arxiv_id) except Exception: logger.warning("Failed to clean tmp for %s", arxiv_id, exc_info=True)