feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -9,6 +9,7 @@ from pathlib import Path

 import requests

+from app.config import settings
 from app.utils import PAPERS_DIR, TMP_DIR

 logger = logging.getLogger(__name__)
@@ -51,6 +52,14 @@ def _get_session() -> requests.Session:
    return _http_session


+def close_http_session() -> None:
+    """关闭全局 HTTP Session，供应用 shutdown 时调用。"""
+    global _http_session
+    if _http_session is not None:
+        _http_session.close()
+        _http_session = None
+
+
 async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
    """下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
    if not pdf_url:
@@ -62,10 +71,16 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:

    try:
        session = _get_session()
-        resp = session.get(pdf_url, timeout=120, allow_redirects=True)
+        resp = session.get(pdf_url, timeout=settings.PDF_DOWNLOAD_TIMEOUT, allow_redirects=True)
        resp.raise_for_status()
        dest.write_bytes(resp.content)
    except Exception as exc:
+        # 清理残留的部分文件
+        if dest.exists():
+            try:
+                dest.unlink()
+            except OSError:
+                pass
        raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc

    logger.info("Downloaded PDF: %s (%d bytes)", arxiv_id, dest.stat().st_size)