90fe705e8f
- 核心变更:
  - app/services/layout_detector.py: 重写布局检测器,从 PicoDet-S_layout_3cls 迁移到 DocLayout-YOLO (DocStructBench, imgsz=1024)
  - 支持多设备推理 (CPU/CUDA/DirectML/OpenVINO 等),自动探测最优设备
  - 预处理改为 letterbox (保比例缩放+灰边 padding),坐标还原使用 (model_coord - padding) / ratio 公式
  - 后处理解析 YOLOv10 end-to-end 输出 [N,6]=[x1,y1,x2,y2,conf,cls]
  - 类别映射改为按 class name 动态匹配 (figure/figure_group→picture, table/table_group→table)
- 新增文件:
  - scripts/export_doclayout_yolo_onnx.py: DocLayout-YOLO ONNX 导出脚本 (独立 venv 运行)
  - tests/test_layout_detector.py: 布局检测器完整测试 (35 个用例)
- 配置更新:
  - .env.example: 更新布局检测配置 (新增 LAYOUT_IMGSZ, LAYOUT_DEVICE, LAYOUT_DEVICE_ID)
  - app/config.py: Settings 类对应字段
  - pyproject.toml: 新增 export 依赖组 (torch, doclayout-yolo, onnx 等)
- 删除旧文件:
  - scripts/export_picodet_onnx.py: 旧 PicoDet 导出脚本
- 文档更新:
  - README.md: 更新环境变量说明
  - 相关服务注释更新 (pdf_image_extractor.py, summary_persister.py, reextract_images.py)

此重构遵循项目初期开发阶段规范,大胆调整数据模型,无需向后兼容。
92 lines
2.6 KiB
Python
92 lines
2.6 KiB
Python
"""应用配置 — 从 .env / 环境变量加载。"""
|
||
|
||
from pathlib import Path
|
||
|
||
from pydantic_settings import BaseSettings
|
||
|
||
# Directory two levels above this file's resolved location.
BASE_DIR = Path(__file__).resolve().parents[1]
|
||
|
||
|
||
class Settings(BaseSettings):
    """Application settings, loaded from ``.env`` / environment variables.

    Every field below is declared together with its default value.
    ``model_config`` points at ``BASE_DIR / ".env"`` (read as UTF-8) and
    sets ``extra`` to ``"ignore"``.
    """

    # Application
    APP_HOST: str = "127.0.0.1"
    APP_PORT: int = 8000
    APP_DEBUG: bool = False
    BASE_URL: str = "http://127.0.0.1:8000"
    APP_TIMEZONE: str = "Asia/Shanghai"

    # Security
    ADMIN_USERNAME: str = "admin"
    ADMIN_PASSWORD: str = ""
    SECRET_KEY: str = "change-me"

    # HuggingFace / arXiv
    HF_API_BASE: str = "https://huggingface.co/api"
    HF_PROXY: str = ""
    TOP_N: int = 20
    HTTP_TIMEOUT_SECONDS: int = 30
    HTTP_MAX_RETRIES: int = 3
    HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"
    PDF_DOWNLOAD_TIMEOUT: int = 120

    # AI summarization
    SUMMARY_BACKEND: str = "pi"  # "pi" | "claude"
    PI_BIN: str = ""
    SUMMARY_SKILL: str = "daily-paper-summary"
    CLAUDE_BIN: str = "claude"
    SUMMARY_CONCURRENCY: int = 3
    SUMMARY_TIMEOUT_SECONDS: int = 1200
    SUMMARY_MAX_RETRIES: int = 2
    # "auto" = use inject for <=80k, search for >80k;
    # can also be forced to "inject" / "search".
    SUMMARY_PDF_MODE: str = "auto"

    # Scheduling
    SCHEDULER_ENABLED: bool = False
    SCHEDULE_HOUR: int = 4
    SCHEDULE_MINUTE: int = 0
    APP_WORKERS: int = 1
    UPVOTE_REFRESH_DAYS: int = 7  # refresh upvotes of papers from the last N days

    # Database
    DATABASE_URL: str = "sqlite:///data/db/papers.db"

    # Semantic search
    CHROMA_ENABLED: bool = False
    CHROMA_DIR: str = "data/chroma"
    EMBED_API_BASE: str = ""
    EMBED_API_KEY: str = ""
    EMBED_MODEL: str = ""
    EMBED_DIMENSIONS: int = 0

    # Layout detection
    LAYOUT_MODEL_PATH: str = "data/models/doclayout_yolo_docstructbench_imgsz1024.onnx"
    LAYOUT_IMGSZ: int = 1024
    LAYOUT_THRESHOLD: float = 0.2
    # Inference device: auto|cpu|cuda|directml|openvino|cann|tensorrt|qnn
    # auto = probe automatically in priority order [CUDA, DirectML, OpenVINO, CPU]
    LAYOUT_DEVICE: str = "auto"
    LAYOUT_DEVICE_ID: int = 0

    model_config = {
        "env_file": str(BASE_DIR / ".env"),
        "env_file_encoding": "utf-8",
        "extra": "ignore",
    }

    @property
    def db_path(self) -> Path:
        """Return the SQLite file path parsed out of ``DATABASE_URL``.

        The ``sqlite:///`` prefix is stripped and the remainder is joined
        onto ``BASE_DIR``, e.g. ``sqlite:///data/db/papers.db`` becomes
        ``BASE_DIR / "data/db/papers.db"``.

        Raises:
            ValueError: if ``DATABASE_URL`` does not start with ``sqlite:///``.
        """
        sqlite_prefix = "sqlite:///"
        database_url = self.DATABASE_URL
        if not database_url.startswith(sqlite_prefix):
            raise ValueError(f"Unsupported DATABASE_URL: {database_url}")
        return BASE_DIR / database_url.removeprefix(sqlite_prefix)

    @property
    def http_proxy(self) -> str | None:
        """Return ``HF_PROXY``, or ``None`` when it is empty."""
        return self.HF_PROXY if self.HF_PROXY else None
|
||
|
||
|
||
# Module-level Settings instance, constructed when this module is imported.
settings = Settings()
|