refactor: restructure services and add image/pdf extraction utilities

- Add image_extractor, pdf_downloader, pi_client, trends services
- Add shared utils module
- Refactor summarizer, embedder, routes for cleaner separation
- Update tests to match new service structure
This commit is contained in:
2026-06-06 00:00:55 +08:00
parent ba9afa212c
commit 85c4cfb9e8
22 changed files with 843 additions and 780 deletions
+73
View File
@@ -0,0 +1,73 @@
"""公共工具 — 消除各模块间的重复代码。"""
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
from zoneinfo import ZoneInfo
import httpx
from fastapi.templating import Jinja2Templates
from app.config import settings
# ── Path constants ────────────────────────────────────────────────────
# All three are relative paths, so they resolve against the working
# directory of the running process.
DATA_DIR = Path("data")
PAPERS_DIR = DATA_DIR.joinpath("papers")  # data/papers
TMP_DIR = DATA_DIR.joinpath("tmp")  # data/tmp
# ── Template singleton ────────────────────────────────────────────────
# One module-level Jinja2Templates instance, created at import time and meant
# to be shared by importers. The template directory is given as a relative
# path ("app/templates").
templates = Jinja2Templates(directory="app/templates")
# ── 时区工具 ──────────────────────────────────────────────────────────
def today_str() -> str:
    """Return the current date formatted as ``YYYY-MM-DD``.

    The date is taken in the time zone named by ``settings.APP_TIMEZONE``.
    """
    local_now = datetime.now(tz=ZoneInfo(settings.APP_TIMEZONE))
    return local_now.strftime("%Y-%m-%d")
# ── 锁释放 ────────────────────────────────────────────────────────────
def release_lock(db, lock) -> None:
    """Mark a TaskLock as finished and persist the change (best effort).

    Sets ``lock.status`` to ``"finished"`` and ``lock.released_at`` to the
    current UTC time, then calls ``db.commit()``. If any of that raises, the
    exception is logged with its traceback and ``db.rollback()`` is called;
    the exception is not propagated to the caller.

    Args:
        db: Session-like object exposing ``commit()`` and ``rollback()``.
        lock: Object whose ``status`` and ``released_at`` attributes are set.
    """
    import logging  # function-scope import keeps this block self-contained

    try:
        lock.status = "finished"
        lock.released_at = datetime.now(timezone.utc)
        db.commit()
    except Exception:
        # Releasing stays best effort (no re-raise), but the failure is logged
        # first so an unreleased lock does not go unnoticed, and so the
        # original error is recorded even if the rollback itself fails.
        logging.getLogger(__name__).exception(
            "Failed to release task lock; rolling back"
        )
        db.rollback()
# ── HTTP 客户端工厂 ───────────────────────────────────────────────────
def make_http_client(*, sync: bool = False, follow_redirects: bool = False, **kwargs) -> httpx.AsyncClient | httpx.Client:
    """Build an httpx client preconfigured from application settings.

    The client gets the timeout from ``settings.HTTP_TIMEOUT_SECONDS``, a
    ``User-Agent`` header from ``settings.HTTP_USER_AGENT``, and, when
    ``settings.http_proxy`` is truthy, a transport that routes through that
    proxy.

    Args:
        sync: If True, return a synchronous ``httpx.Client``; if False,
            return an ``httpx.AsyncClient``.
        follow_redirects: Whether the client follows redirects.
        **kwargs: Extra client arguments. They are applied last, so a key
            given here replaces the default of the same name wholesale
            (including ``headers`` and ``transport``).

    Returns:
        The configured ``httpx.Client`` or ``httpx.AsyncClient``.
    """
    # Choose the client class and its matching transport class together so
    # the sync/async decision is made in exactly one place.
    if sync:
        client_cls, transport_cls = httpx.Client, httpx.HTTPTransport
    else:
        client_cls, transport_cls = httpx.AsyncClient, httpx.AsyncHTTPTransport

    options: dict = {
        "timeout": settings.HTTP_TIMEOUT_SECONDS,
        "headers": {"User-Agent": settings.HTTP_USER_AGENT},
        "follow_redirects": follow_redirects,
    }

    proxy_url = settings.http_proxy
    if proxy_url:
        options["transport"] = transport_cls(proxy=proxy_url)

    # Caller-supplied arguments take precedence over the defaults above.
    options.update(kwargs)
    return client_cls(**options)