Files
daily-paper/app/services/pdf_downloader.py
T
Rain-Bus 21f16e6756 feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00

102 lines
3.3 KiB
Python

"""PDF 下载 — 从 arXiv 下载论文 PDF。"""
from __future__ import annotations

import asyncio
import logging
import os
import shutil
from pathlib import Path

import requests

from app.config import settings
from app.utils import PAPERS_DIR, TMP_DIR
# Module-level logger named after this module; used by the download and cleanup helpers below.
logger = logging.getLogger(__name__)
# ── 自定义异常 ──────────────────────────────────────────────────────────
class PdfDownloadError(Exception):
    """Raised when a paper's PDF cannot be downloaded (missing URL or failed fetch)."""
# ── 路径工具 ────────────────────────────────────────────────────────────
def paper_dir(arxiv_id: str) -> Path:
    """Return the directory for the given paper: ``PAPERS_DIR / arxiv_id``.

    The path is only computed; nothing is created on disk.
    """
    target = PAPERS_DIR / arxiv_id
    return target
def tmp_dir(arxiv_id: str) -> Path:
    """Return the temporary working directory for the given paper: ``TMP_DIR / arxiv_id``.

    The path is only computed; nothing is created on disk.
    """
    target = TMP_DIR / arxiv_id
    return target
# ── PDF 下载 ────────────────────────────────────────────────────────────
# Shared session so TCP connections are reused across downloads.
# Starts as None; _get_session() creates it on first use and
# close_http_session() resets it back to None.
_http_session: requests.Session | None = None
def _get_session() -> requests.Session:
    """Return the shared HTTP session, creating and configuring it on first use.

    On first call a ``requests.Session`` is created with the
    ``hf-daily-papers/1.0`` User-Agent. If a proxy is configured it is applied
    to both ``http`` and ``https`` traffic. Proxy precedence:

    1. the ``PROXY_SERVER`` environment variable;
    2. ``settings.http_proxy`` (only if that attribute exists and is non-empty).

    Returns:
        The module-wide ``requests.Session`` instance.
    """
    global _http_session
    if _http_session is None:
        session = requests.Session()
        session.headers.update({"User-Agent": "hf-daily-papers/1.0"})

        env_proxy = os.environ.get("PROXY_SERVER")
        # NOTE(review): the attribute name ``http_proxy`` is taken from the
        # original comment on this function; getattr keeps this a no-op if
        # the settings object does not define it. Confirm against app.config.
        cfg_proxy = getattr(settings, "http_proxy", None)

        proxy = env_proxy or cfg_proxy
        if proxy:
            session.proxies = {"http": proxy, "https": proxy}
            if env_proxy:
                logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy)
            else:
                logger.info("PDF download using proxy from settings.http_proxy: %s", proxy)

        # Publish only after the session is fully configured.
        _http_session = session
    return _http_session
def close_http_session() -> None:
    """Close the shared HTTP session and reset it to ``None``.

    Intended to be called at application shutdown. Does nothing when no
    session has been created.
    """
    global _http_session
    session = _http_session
    if session is None:
        return
    session.close()
    _http_session = None
async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
    """Download a paper's PDF to ``tmp_dir(arxiv_id) / "paper.pdf"``.

    The HTTP request and the file write are blocking (``requests`` is a
    synchronous library), so they run in a worker thread via
    ``asyncio.to_thread`` instead of stalling the event loop.

    Args:
        arxiv_id: Paper identifier; used for the destination directory and
            in log/error messages.
        pdf_url: URL to fetch. Redirects are followed.

    Returns:
        Path of the written PDF file.

    Raises:
        PdfDownloadError: If ``pdf_url`` is empty, or if the request, the HTTP
            status check, or the file write fails. The original exception is
            chained as ``__cause__``.
    """
    if not pdf_url:
        raise PdfDownloadError(f"no pdf_url for {arxiv_id}")

    dest_dir = tmp_dir(arxiv_id)
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest = dest_dir / "paper.pdf"

    try:
        # Create/fetch the shared session on the event-loop thread so the lazy
        # initialisation in _get_session() is never raced by worker threads.
        session = _get_session()
        timeout = settings.PDF_DOWNLOAD_TIMEOUT

        def _fetch() -> None:
            # Blocking part: network round-trip plus disk write.
            resp = session.get(pdf_url, timeout=timeout, allow_redirects=True)
            resp.raise_for_status()
            dest.write_bytes(resp.content)

        # NOTE(review): concurrent downloads now share one requests.Session
        # across threads; confirm this is acceptable for this deployment.
        # NOTE: a worker thread cannot be interrupted, so if the awaiting task
        # is cancelled the fetch still runs to completion in the background.
        await asyncio.to_thread(_fetch)
    except Exception as exc:
        # Remove any partially written file before reporting the failure.
        if dest.exists():
            try:
                dest.unlink()
            except OSError:
                pass
        raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc

    logger.info("Downloaded PDF: %s (%d bytes)", arxiv_id, dest.stat().st_size)
    return dest
# ── 临时文件清理 ────────────────────────────────────────────────────────
def cleanup_tmp(arxiv_id: str) -> None:
    """Delete the temporary directory ``tmp_dir(arxiv_id)`` and its contents.

    Best-effort: if the directory does not exist nothing happens, and a
    failure to delete is logged as a warning (with traceback) rather than
    raised.
    """
    target = tmp_dir(arxiv_id)
    if not target.exists():
        return
    try:
        shutil.rmtree(target)
        logger.debug("Cleaned tmp: %s", arxiv_id)
    except Exception:
        logger.warning("Failed to clean tmp for %s", arxiv_id, exc_info=True)