"""PicoDet-S_layout_3cls 布局检测 — 纯 ONNX Runtime 推理. 用 onnxruntime 加载导出好的 ONNX 模型,检测 PDF 页面中的 figure / table 区域。 模型自带 NMS + GFL decode,输出即为后处理完毕的检测框。 输入: image: (1, 3, 480, 480) float32 — ImageNet 标准化后的图片 scale_factor: (1, 2) float32 — [y_scale, x_scale],用于坐标还原 输出: fetch_name_0: (N, 6) float32 — [xmin, ymin, xmax, ymax, score, class_id] fetch_name_1: (1,) int32 — 有效框数量 N """ from __future__ import annotations import logging from dataclasses import dataclass from pathlib import Path import numpy as np import onnxruntime as ort import pymupdf from app.config import settings logger = logging.getLogger(__name__) # 模型输入尺寸 _MODEL_SIZE = 480 # ImageNet normalize _MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32) _STD = np.array([0.229, 0.224, 0.225], dtype=np.float32) # PicoDet label → 内部 boxclass _LABEL_MAP: dict[int, str] = { 0: "picture", # PicoDet "image" → "picture" 1: "table", # 2: seal — 忽略 } # 最小 bbox 尺寸(PDF 点) _MIN_BOX_SIZE = 20 @dataclass class LayoutBox: """检测到的布局区域,兼容现有 _process_page 代码。""" x0: float y0: float x1: float y1: float boxclass: str # "picture" | "table" class _LayoutDetector: """单例:管理 ONNX InferenceSession 生命周期。""" def __init__(self) -> None: self._session: ort.InferenceSession | None = None def _init_session(self) -> ort.InferenceSession: if self._session is not None: return self._session model_path = Path(settings.LAYOUT_MODEL_PATH) if not model_path.exists(): raise FileNotFoundError( f"Layout model not found: {model_path}. " "Run scripts/export_picodet_onnx.py first." ) logger.info("Loading ONNX layout model: %s", model_path) self._session = ort.InferenceSession( str(model_path), providers=["CPUExecutionProvider"] ) logger.info("ONNX layout model loaded") return self._session def detect_page(self, page: pymupdf.Page) -> list[LayoutBox]: """检测单页 PDF 的 figure / table 区域。 流程: 1. pymupdf 以 480×480 渲染页面 2. ImageNet normalize → NCHW 3. ONNX 推理 → 得到已解码+NMS 的检测框 4. 像素坐标 → PDF 点坐标 5. 过滤 seal 类和低置信度框 Args: page: pymupdf Page 对象 Returns: LayoutBox 列表,坐标为 PDF 点 """ session = self._init_session() page_w = page.rect.width page_h = page.rect.height # 1. 渲染页面到 _MODEL_SIZE × _MODEL_SIZE zoom_x = _MODEL_SIZE / page_w zoom_y = _MODEL_SIZE / page_h mat = pymupdf.Matrix(zoom_x, zoom_y) pix = page.get_pixmap(matrix=mat) # 2. 预处理 img = ( np.frombuffer(pix.samples, dtype=np.uint8) .reshape(pix.height, pix.width, pix.n) .astype(np.float32) / 255.0 ) # 去掉 alpha 通道(如有) if img.shape[2] == 4: img = img[:, :, :3] img = (img - _MEAN) / _STD img = img.transpose(2, 0, 1)[np.newaxis] # (1, 3, H, W) # scale_factor 用于坐标还原(模型内部可能用) scale_factor = np.array([[1.0, 1.0]], dtype=np.float32) # 3. 推理 input_names = [i.name for i in session.get_inputs()] feed = {input_names[0]: img} if len(input_names) > 1: feed[input_names[1]] = scale_factor outputs = session.run(None, feed) boxes_raw = outputs[0] # (N, 6): [class_id, score, xmin, ymin, xmax, ymax] num_boxes = int(outputs[1][0]) # 有效框数 if num_boxes == 0: return [] # 4. 像素 → PDF 点坐标 sx = page_w / _MODEL_SIZE sy = page_h / _MODEL_SIZE result: list[LayoutBox] = [] for i in range(min(num_boxes, len(boxes_raw))): cls_id, score, xmin, ymin, xmax, ymax = boxes_raw[i] cls_id = int(cls_id) # 跳过 seal 类和低置信度 if cls_id not in _LABEL_MAP: continue if score < settings.LAYOUT_THRESHOLD: continue x0, y0 = xmin * sx, ymin * sy x1, y1 = xmax * sx, ymax * sy # 跳过极小区域 if (x1 - x0) < _MIN_BOX_SIZE or (y1 - y0) < _MIN_BOX_SIZE: continue result.append( LayoutBox(x0=x0, y0=y0, x1=x1, y1=y1, boxclass=_LABEL_MAP[cls_id]) ) return result # 模块级单例 _detector = _LayoutDetector() def detect_page_layout(page: pymupdf.Page) -> list[LayoutBox]: """检测 PDF 页面中的 figure / table 区域。 Returns: LayoutBox 列表,坐标为 PDF 点,仅含 picture/table。 """ return _detector.detect_page(page)