feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
@@ -9,6 +9,7 @@ from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from app.config import settings
|
||||
from app.utils import PAPERS_DIR, TMP_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -51,6 +52,14 @@ def _get_session() -> requests.Session:
|
||||
return _http_session
|
||||
|
||||
|
||||
def close_http_session() -> None:
    """Close the module-level HTTP session, if one exists.

    Intended to be called when the application shuts down. Calling it
    when no session is currently held is a no-op.
    """
    global _http_session
    active = _http_session
    if active is None:
        return
    # Close first, then clear the global: if close() raises, the reference
    # is left in place rather than being silently dropped.
    active.close()
    _http_session = None
|
||||
|
||||
|
||||
async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
|
||||
"""下载 PDF 到 data/tmp/{arxiv_id}/paper.pdf。"""
|
||||
if not pdf_url:
|
||||
@@ -62,10 +71,16 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
|
||||
|
||||
try:
|
||||
session = _get_session()
|
||||
resp = session.get(pdf_url, timeout=120, allow_redirects=True)
|
||||
resp = session.get(pdf_url, timeout=settings.PDF_DOWNLOAD_TIMEOUT, allow_redirects=True)
|
||||
resp.raise_for_status()
|
||||
dest.write_bytes(resp.content)
|
||||
except Exception as exc:
|
||||
# 清理残留的部分文件
|
||||
if dest.exists():
|
||||
try:
|
||||
dest.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
raise PdfDownloadError(f"failed to download PDF for {arxiv_id}: {exc}") from exc
|
||||
|
||||
logger.info("Downloaded PDF: %s (%d bytes)", arxiv_id, dest.stat().st_size)
|
||||
|
||||
Reference in New Issue
Block a user