Files
Rain-Bus 90fe705e8f refactor: 迁移布局检测模型从 PicoDet 到 DocLayout-YOLO
- 核心变更:
  - app/services/layout_detector.py: 重写布局检测器,从 PicoDet-S_layout_3cls 迁移到 DocLayout-YOLO (DocStructBench, imgsz=1024)
  - 支持多设备推理 (CPU/CUDA/DirectML/OpenVINO 等),自动探测最优设备
  - 预处理改为 letterbox (保比例缩放+灰边 padding),坐标还原使用 (model_coord - padding) / ratio 公式
  - 后处理解析 YOLOv10 end-to-end 输出 [N,6]=[x1,y1,x2,y2,conf,cls]
  - 类别映射改为按 class name 动态匹配 (figure/figure_group→picture, table/table_group→table)

- 新增文件:
  - scripts/export_doclayout_yolo_onnx.py: DocLayout-YOLO ONNX 导出脚本 (独立 venv 运行)
  - tests/test_layout_detector.py: 布局检测器完整测试 (35 个用例)

- 配置更新:
  - .env.example: 更新布局检测配置 (新增 LAYOUT_IMGSZ, LAYOUT_DEVICE, LAYOUT_DEVICE_ID)
  - app/config.py: Settings 类对应字段
  - pyproject.toml: 新增 export 依赖组 (torch, doclayout-yolo, onnx 等)

- 删除旧文件:
  - scripts/export_picodet_onnx.py: 旧 PicoDet 导出脚本

- 文档更新:
  - README.md: 更新环境变量说明
  - 相关服务注释更新 (pdf_image_extractor.py, summary_persister.py, reextract_images.py)

此重构遵循项目初期开发阶段规范,大胆调整数据模型,无需向后兼容。
2026-06-14 10:41:44 +08:00

145 lines
4.3 KiB
Python

"""FastAPI 应用入口。"""
import logging
import os
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.middleware.sessions import SessionMiddleware
from app.config import settings
from app.exceptions import (
AppError,
ConflictError,
ExternalAPIError,
NotFoundError,
PdfProcessError,
ValidationError,
)
from app.database import engine, init_db
from app.routes.admin import router as admin_router
from app.routes.compare import router as compare_router
from app.routes.pages import router as pages_router
from app.routes.search import router as search_router
from app.routes.trends import router as trends_router
from app.routes.user import router as user_router
# Root logging setup: DEBUG verbosity when APP_DEBUG is set, INFO otherwise.
_root_log_level = logging.INFO
if settings.APP_DEBUG:
    _root_log_level = logging.DEBUG

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=_root_log_level,
)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the application lifecycle: startup and shutdown.

    Startup, in order:
      1. ``recover_stale_jobs`` is run with a short-lived database session,
         which is always closed afterwards.
      2. The scheduler is started.
      3. Chroma is initialised via ``init_chroma``.

    Shutdown: the scheduler is stopped and the shared HTTP session is closed.

    The shutdown steps live in ``finally`` blocks so that they also run when
    an exception is raised into this context manager at the ``yield`` or when
    ``init_chroma`` fails after the scheduler has already been started.

    Args:
        app: The FastAPI application this lifespan is attached to (unused
            here, but required by the lifespan protocol).
    """
    # ── startup ──
    # Service imports are kept function-local, exactly as before, so they are
    # resolved when the application starts rather than at module import.
    from app.services.scheduler import start_scheduler
    from app.services.embedder import init_chroma
    from app.services.jobs import recover_stale_jobs
    from app.database import SessionLocal

    db = SessionLocal()
    try:
        recover_stale_jobs(db)
    finally:
        db.close()

    start_scheduler()
    try:
        init_chroma()
        yield
    finally:
        # ── shutdown ──
        from app.services.scheduler import stop_scheduler
        from app.services.pdf_downloader import close_http_session

        try:
            stop_scheduler()
        finally:
            # Close the HTTP session even if stopping the scheduler raised.
            close_http_session()
def create_app() -> FastAPI:
    """Build and return the fully configured FastAPI application.

    Performs, in order: creation of the FastAPI instance (with ``lifespan``),
    creation of the database directory, database initialisation, session
    middleware installation, registration of JSON error handlers for the
    application's exception types, security warnings for weak configuration,
    static-file mounts, and router registration.

    Returns:
        The configured ``FastAPI`` instance.
    """
    application = FastAPI(
        lifespan=lifespan,
        version="0.1.0",
        description="HuggingFace Daily Papers — 中文论文导览站",
        title="HF Daily Papers",
    )

    # Make sure the directory that holds the database file exists.
    db_dir = settings.db_path.parent
    os.makedirs(db_dir, exist_ok=True)

    # Initialise the database.
    init_db(engine)
    logger.info("Database initialized at %s", settings.db_path)

    # Session middleware.
    application.add_middleware(SessionMiddleware, secret_key=settings.SECRET_KEY)

    # ── Unified business-exception handling ──
    # Each exception type is answered with {"error": <message>} and the
    # HTTP status code paired with it below.
    def _install_error_handler(exc_type, http_status):
        # A factory is used so every handler captures its own status code.
        @application.exception_handler(exc_type)
        async def _handle(request, exc):
            return JSONResponse(
                status_code=http_status,
                content={"error": exc.message},
            )

    status_by_exception = (
        (NotFoundError, 404),
        (ValidationError, 400),
        (ExternalAPIError, 502),
        (PdfProcessError, 500),
        (ConflictError, 409),
        (AppError, 500),
    )
    for exc_type, http_status in status_by_exception:
        _install_error_handler(exc_type, http_status)

    # Security warnings for default / missing credentials.
    secret_is_default = settings.SECRET_KEY == "change-me"
    if secret_is_default:
        logger.warning(
            "⚠️ SECRET_KEY is the default value 'change-me'. Please change it in .env!"
        )
    if not settings.ADMIN_PASSWORD:
        logger.warning("⚠️ ADMIN_PASSWORD is empty. Please set it in .env!")

    # Static assets.
    static_files = StaticFiles(directory="app/static")
    application.mount("/static", static_files, name="static")

    # Static serving of paper images; the directory is created if missing.
    papers_images_dir = os.path.join("data", "papers")
    os.makedirs(papers_images_dir, exist_ok=True)
    application.mount(
        "/papers",
        StaticFiles(directory=papers_images_dir),
        name="papers",
    )

    # Routers, registered in the same order as before.
    for router in (
        pages_router,
        admin_router,
        search_router,
        user_router,
        trends_router,
        compare_router,
    ):
        application.include_router(router)

    return application
# Module-level application instance, built at import time; this is the
# object the "app.main:app" import string below refers to.
app = create_app()

if __name__ == "__main__":
    # Running this file directly starts a uvicorn server.
    import uvicorn

    # Host, port and auto-reload all come from settings; reload follows
    # the APP_DEBUG flag.
    server_options = {
        "host": settings.APP_HOST,
        "port": settings.APP_PORT,
        "reload": settings.APP_DEBUG,
    }
    uvicorn.run("app.main:app", **server_options)