feat: add claude backend, refactor summary utilities, improve batch worker pattern, add pymupdf4llm

2026-06-12 22:25:57 +08:00
parent b42e9149e5
commit e2f0e1a8be
13 changed files with 1350 additions and 1010 deletions
@@ -3,10 +3,13 @@
 from __future__ import annotations

 import logging
+import os
 import shutil
 from pathlib import Path

-from app.utils import PAPERS_DIR, TMP_DIR, make_http_client
+import requests
+
+from app.utils import PAPERS_DIR, TMP_DIR

 logger = logging.getLogger(__name__)

@@ -31,6 +34,22 @@ def tmp_dir(arxiv_id: str) -> Path:

 # ── PDF 下载 ────────────────────────────────────────────────────────────

+# 复用 TCP 连接的 session
+_http_session: requests.Session | None = None
+
+
+def _get_session() -> requests.Session:
+    global _http_session
+    if _http_session is None:
+        _http_session = requests.Session()
+        _http_session.headers.update({"User-Agent": "hf-daily-papers/1.0"})
+        # 代理：优先 $PROXY_SERVER，其次 settings.http_proxy
+        proxy = os.environ.get("PROXY_SERVER")
+        if proxy:
+            _http_session.proxies = {"http": proxy, "https": proxy}
+            logger.info("PDF download using proxy from $PROXY_SERVER: %s", proxy)
+    return _http_session
+

 async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
    """下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
@@ -42,10 +61,10 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
    dest = dest_dir / "paper.pdf"

    try:
-        async with make_http_client(follow_redirects=True) as client:
-            resp = await client.get(pdf_url)
-            resp.raise_for_status()
-            dest.write_bytes(resp.content)
+        session = _get_session()
+        resp = session.get(pdf_url, timeout=120, allow_redirects=True)
+        resp.raise_for_status()
+        dest.write_bytes(resp.content)
    except Exception as exc:
        raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc