feat: add claude backend, refactor summary utilities, improve batch worker pattern, add pymupdf4llm

This commit is contained in:
2026-06-12 22:25:57 +08:00
parent b42e9149e5
commit e2f0e1a8be
13 changed files with 1350 additions and 1010 deletions
+24 -5
View File
@@ -3,10 +3,13 @@
from __future__ import annotations
import logging
import os
import shutil
from pathlib import Path
from app.utils import PAPERS_DIR, TMP_DIR, make_http_client
import requests
from app.utils import PAPERS_DIR, TMP_DIR
logger = logging.getLogger(__name__)
@@ -31,6 +34,22 @@ def tmp_dir(arxiv_id: str) -> Path:
# ── PDF 下载 ────────────────────────────────────────────────────────────
# Process-wide session, created lazily, so successive downloads share one
# connection pool instead of opening a new TCP connection each time.
_http_session: requests.Session | None = None


def _get_session() -> requests.Session:
    """Return the shared ``requests`` session, building it on first use.

    The first call creates the session, sets its ``User-Agent`` header and,
    when the ``PROXY_SERVER`` environment variable is non-empty, routes both
    ``http`` and ``https`` traffic through that proxy. Later calls return the
    cached instance unchanged, so the environment is only read once.

    NOTE(review): the original comment mentions a fallback to
    ``settings.http_proxy``; only ``$PROXY_SERVER`` is consulted here.
    """
    global _http_session

    # Fast path: the session has already been configured.
    if _http_session is not None:
        return _http_session

    _http_session = session = requests.Session()
    session.headers.update({"User-Agent": "hf-daily-papers/1.0"})

    proxy_url = os.environ.get("PROXY_SERVER")
    if proxy_url:
        session.proxies = {"http": proxy_url, "https": proxy_url}
        logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy_url)

    return session
async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
"""下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
@@ -42,10 +61,10 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
dest = dest_dir / "paper.pdf"
try:
async with make_http_client(follow_redirects=True) as client:
resp = await client.get(pdf_url)
resp.raise_for_status()
dest.write_bytes(resp.content)
session = _get_session()
resp = session.get(pdf_url, timeout=120, allow_redirects=True)
resp.raise_for_status()
dest.write_bytes(resp.content)
except Exception as exc:
raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc