Files
daily-paper/app/config.py
T
Rain-Bus 90fe705e8f refactor: 迁移布局检测模型从 PicoDet 到 DocLayout-YOLO
- 核心变更:
  - app/services/layout_detector.py: 重写布局检测器,从 PicoDet-S_layout_3cls 迁移到 DocLayout-YOLO (DocStructBench, imgsz=1024)
  - 支持多设备推理 (CPU/CUDA/DirectML/OpenVINO 等),自动探测最优设备
  - 预处理改为 letterbox (保比例缩放+灰边 padding),坐标还原使用 (model_coord - padding) / ratio 公式
  - 后处理解析 YOLOv10 end-to-end 输出 [N,6]=[x1,y1,x2,y2,conf,cls]
  - 类别映射改为按 class name 动态匹配 (figure/figure_group→picture, table/table_group→table)

- 新增文件:
  - scripts/export_doclayout_yolo_onnx.py: DocLayout-YOLO ONNX 导出脚本 (独立 venv 运行)
  - tests/test_layout_detector.py: 布局检测器完整测试 (35 个用例)

- 配置更新:
  - .env.example: 更新布局检测配置 (新增 LAYOUT_IMGSZ, LAYOUT_DEVICE, LAYOUT_DEVICE_ID)
  - app/config.py: Settings 类对应字段
  - pyproject.toml: 新增 export 依赖组 (torch, doclayout-yolo, onnx 等)

- 删除旧文件:
  - scripts/export_picodet_onnx.py: 旧 PicoDet 导出脚本

- 文档更新:
  - README.md: 更新环境变量说明
  - 相关服务注释更新 (pdf_image_extractor.py, summary_persister.py, reextract_images.py)

此重构遵循项目初期开发阶段规范,大胆调整数据模型,无需向后兼容。
2026-06-14 10:41:44 +08:00

92 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""应用配置 — 从 .env / 环境变量加载。"""
from pathlib import Path
from pydantic_settings import BaseSettings
# Directory one level above the folder containing this module; the .env file
# and the relative SQLite path used by Settings are anchored here.
BASE_DIR = Path(__file__).resolve().parent.parent
class Settings(BaseSettings):
    """Application settings loaded from the ``.env`` file and environment variables.

    Each upper-case attribute is a typed field with a default value. Inputs
    that do not correspond to a declared field are ignored (``extra="ignore"``).
    """

    # Settings-source options: read BASE_DIR/.env as UTF-8, ignore unknown keys.
    model_config = {
        "env_file": str(BASE_DIR / ".env"),
        "env_file_encoding": "utf-8",
        "extra": "ignore",
    }

    # --- Application ---
    APP_HOST: str = "127.0.0.1"
    APP_PORT: int = 8000
    APP_DEBUG: bool = False
    BASE_URL: str = "http://127.0.0.1:8000"
    APP_TIMEZONE: str = "Asia/Shanghai"

    # --- Security ---
    ADMIN_USERNAME: str = "admin"
    ADMIN_PASSWORD: str = ""
    SECRET_KEY: str = "change-me"

    # --- HuggingFace / arXiv ---
    HF_API_BASE: str = "https://huggingface.co/api"
    HF_PROXY: str = ""
    TOP_N: int = 20
    HTTP_TIMEOUT_SECONDS: int = 30
    HTTP_MAX_RETRIES: int = 3
    HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"
    PDF_DOWNLOAD_TIMEOUT: int = 120

    # --- AI summarization ---
    # Backend selector: "pi" | "claude"
    SUMMARY_BACKEND: str = "pi"
    PI_BIN: str = ""
    SUMMARY_SKILL: str = "daily-paper-summary"
    CLAUDE_BIN: str = "claude"
    SUMMARY_CONCURRENCY: int = 3
    SUMMARY_TIMEOUT_SECONDS: int = 1200
    SUMMARY_MAX_RETRIES: int = 2
    # "auto" = use inject for <=80k and search for >80k;
    # may also be forced to "inject" / "search".
    SUMMARY_PDF_MODE: str = "auto"

    # --- Scheduling ---
    SCHEDULER_ENABLED: bool = False
    SCHEDULE_HOUR: int = 4
    SCHEDULE_MINUTE: int = 0
    APP_WORKERS: int = 1
    # Refresh upvotes for papers from the most recent N days.
    UPVOTE_REFRESH_DAYS: int = 7

    # --- Database ---
    DATABASE_URL: str = "sqlite:///data/db/papers.db"

    # --- Semantic search ---
    CHROMA_ENABLED: bool = False
    CHROMA_DIR: str = "data/chroma"
    EMBED_API_BASE: str = ""
    EMBED_API_KEY: str = ""
    EMBED_MODEL: str = ""
    EMBED_DIMENSIONS: int = 0

    # --- Layout detection ---
    LAYOUT_MODEL_PATH: str = "data/models/doclayout_yolo_docstructbench_imgsz1024.onnx"
    LAYOUT_IMGSZ: int = 1024
    LAYOUT_THRESHOLD: float = 0.2
    # Inference device: auto|cpu|cuda|directml|openvino|cann|tensorrt|qnn
    # auto = probe automatically in priority order [CUDA, DirectML, OpenVINO, CPU]
    LAYOUT_DEVICE: str = "auto"
    LAYOUT_DEVICE_ID: int = 0

    @property
    def db_path(self) -> Path:
        """Return the SQLite file path encoded in ``DATABASE_URL``.

        The text after the ``sqlite:///`` prefix is joined onto ``BASE_DIR``,
        e.g. ``sqlite:///data/db/papers.db`` -> ``BASE_DIR / "data/db/papers.db"``.

        Raises:
            ValueError: If ``DATABASE_URL`` does not start with ``sqlite:///``.
        """
        sqlite_prefix = "sqlite:///"
        database_url = self.DATABASE_URL
        if not database_url.startswith(sqlite_prefix):
            raise ValueError(f"Unsupported DATABASE_URL: {database_url}")
        return BASE_DIR / database_url.removeprefix(sqlite_prefix)

    @property
    def http_proxy(self) -> str | None:
        """Return ``HF_PROXY``, or ``None`` when it is an empty string."""
        proxy = self.HF_PROXY
        return proxy if proxy else None
# Module-level Settings instance, constructed once when this module is imported.
settings = Settings()