90fe705e8f
- 核心变更: - app/services/layout_detector.py: 重写布局检测器,从 PicoDet-S_layout_3cls 迁移到 DocLayout-YOLO (DocStructBench, imgsz=1024) - 支持多设备推理 (CPU/CUDA/DirectML/OpenVINO 等),自动探测最优设备 - 预处理改为 letterbox (保比例缩放+灰边 padding),坐标还原使用 (model_coord - padding) / ratio 公式 - 后处理解析 YOLOv10 end-to-end 输出 [N,6]=[x1,y1,x2,y2,conf,cls] - 类别映射改为按 class name 动态匹配 (figure/figure_group→picture, table/table_group→table) - 新增文件: - scripts/export_doclayout_yolo_onnx.py: DocLayout-YOLO ONNX 导出脚本 (独立 venv 运行) - tests/test_layout_detector.py: 布局检测器完整测试 (35 个用例) - 配置更新: - .env.example: 更新布局检测配置 (新增 LAYOUT_IMGSZ, LAYOUT_DEVICE, LAYOUT_DEVICE_ID) - app/config.py: Settings 类对应字段 - pyproject.toml: 新增 export 依赖组 (torch, doclayout-yolo, onnx 等) - 删除旧文件: - scripts/export_picodet_onnx.py: 旧 PicoDet 导出脚本 - 文档更新: - README.md: 更新环境变量说明 - 相关服务注释更新 (pdf_image_extractor.py, summary_persister.py, reextract_images.py) 此重构遵循项目初期开发阶段规范,大胆调整数据模型,无需向后兼容。
104 lines
3.3 KiB
Python
104 lines
3.3 KiB
Python
"""PDF 下载 — 从 arXiv 下载论文 PDF。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
from app.config import settings
|
|
from app.utils import PAPERS_DIR, TMP_DIR
|
|
|
|
# Module-level logger, named after this module's import path (__name__).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ── 自定义异常 ──────────────────────────────────────────────────────────
|
|
|
|
|
|
class PdfDownloadError(Exception):
    """Raised when a paper's PDF cannot be downloaded or saved to disk."""
|
|
|
|
|
|
# ── 路径工具 ────────────────────────────────────────────────────────────
|
|
|
|
|
|
def paper_dir(arxiv_id: str) -> Path:
    """Return the directory for *arxiv_id* under ``PAPERS_DIR``.

    Only the path is computed; nothing is created on disk.
    """
    return PAPERS_DIR.joinpath(arxiv_id)
|
|
|
|
|
|
def tmp_dir(arxiv_id: str) -> Path:
    """Return the scratch directory for *arxiv_id* under ``TMP_DIR``.

    Only the path is computed; nothing is created on disk.
    """
    return TMP_DIR.joinpath(arxiv_id)
|
|
|
|
|
|
# ── PDF 下载 ────────────────────────────────────────────────────────────
|
|
|
|
# Shared session so that TCP connections are reused across downloads
|
|
# None until _get_session() first builds the session; close_http_session() resets it to None.
_http_session: requests.Session | None = None
|
|
|
|
|
|
def _get_session() -> requests.Session:
    """Return the module-wide ``requests`` session, creating it on first use.

    On creation the session gets a fixed ``User-Agent`` header and, when the
    ``PROXY_SERVER`` environment variable is set to a non-empty value, that
    value is used as the proxy for both http and https.
    """
    global _http_session

    # Fast path: the session was already built by an earlier call.
    if _http_session is not None:
        return _http_session

    session = requests.Session()
    session.headers["User-Agent"] = "hf-daily-papers/1.0"

    # Proxy: only $PROXY_SERVER is consulted here.
    # NOTE(review): no settings-based proxy fallback is implemented in this
    # function; if one is intended (e.g. settings.http_proxy), it still needs
    # to be added.
    proxy_url = os.environ.get("PROXY_SERVER")
    if proxy_url:
        session.proxies = {"http": proxy_url, "https": proxy_url}
        logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy_url)

    _http_session = session
    return session
|
|
|
|
|
|
def close_http_session() -> None:
    """Close the global HTTP session; intended to be called at application shutdown.

    Does nothing when no session has been created. After a successful close the
    global is reset to ``None``.
    """
    global _http_session

    if _http_session is None:
        return

    _http_session.close()
    _http_session = None
|
|
|
|
|
|
def _fetch_pdf_to_file(session, pdf_url: str, dest: Path) -> int:
    """Blocking worker for ``download_pdf``: GET *pdf_url* and save the body to *dest*.

    Meant to run in an executor thread so the event loop is not blocked.

    Args:
        session: The shared ``requests`` session (as returned by ``_get_session()``).
        pdf_url: URL to fetch; redirects are followed.
        dest: File path the response body is written to.

    Returns:
        Number of bytes written to *dest*.

    Raises:
        Exception: Whatever ``session.get`` / ``raise_for_status`` /
            ``write_bytes`` raise; the caller wraps these.
    """
    resp = session.get(
        pdf_url, timeout=settings.PDF_DOWNLOAD_TIMEOUT, allow_redirects=True
    )
    resp.raise_for_status()
    # Path.write_bytes returns the number of bytes written.
    return dest.write_bytes(resp.content)


async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
    """Download a PDF to ``tmp_dir(arxiv_id) / "paper.pdf"``.

    The HTTP request and the file write are blocking calls, so they are run in
    the event loop's default executor instead of directly inside the coroutine.

    Args:
        arxiv_id: Paper identifier; selects the destination directory and is
            used in log and error messages.
        pdf_url: URL of the PDF. Must be non-empty.

    Returns:
        Path of the downloaded file.

    Raises:
        PdfDownloadError: If *pdf_url* is empty, or if the request, the HTTP
            status check, or the file write fails (original error is chained).
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    if not pdf_url:
        raise PdfDownloadError(f"no pdf_url for {arxiv_id}")

    dest_dir = tmp_dir(arxiv_id)
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest = dest_dir / "paper.pdf"

    try:
        # Obtain the session on the loop thread so its lazy initialisation
        # never races between worker threads.
        session = _get_session()
        loop = asyncio.get_running_loop()
        # NOTE: concurrent downloads now share the session across executor
        # threads instead of serialising on the event loop.
        size = await loop.run_in_executor(
            None, _fetch_pdf_to_file, session, pdf_url, dest
        )
    except Exception as exc:
        # Remove any leftover partial file; cleanup is best-effort.
        if dest.exists():
            try:
                dest.unlink()
            except OSError:
                pass
        raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc

    logger.info("Downloaded PDF: %s (%d bytes)", arxiv_id, size)
    return dest
|
|
|
|
|
|
# ── 临时文件清理 ────────────────────────────────────────────────────────
|
|
|
|
|
|
def cleanup_tmp(arxiv_id: str) -> None:
    """Remove the ``tmp_dir(arxiv_id)`` directory tree, if it exists.

    Best-effort: a failure to delete is logged as a warning (with traceback)
    and not raised.
    """
    scratch = tmp_dir(arxiv_id)
    if not scratch.exists():
        return

    try:
        shutil.rmtree(scratch)
        logger.debug("Cleaned tmp: %s", arxiv_id)
    except Exception:
        logger.warning("Failed to clean tmp for %s", arxiv_id, exc_info=True)
|