# daily-paper/app/services/layout_detector.py

"""PicoDet-S_layout_3cls 布局检测 — 纯 ONNX Runtime 推理.

用 onnxruntime 加载导出好的 ONNX 模型，检测 PDF 页面中的 figure / table 区域。
模型自带 NMS + GFL decode，输出即为后处理完毕的检测框。

输入:
  image:        (1, 3, 480, 480) float32 — ImageNet 标准化后的图片
  scale_factor: (1, 2) float32 — [y_scale, x_scale]，用于坐标还原

输出:
  fetch_name_0: (N, 6) float32 — [class_id, score, xmin, ymin, xmax, ymax]
  fetch_name_1: (1,) int32 — 有效框数量 N
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import onnxruntime as ort
import pymupdf

from app.config import settings

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Side length (in pixels) of the square image fed to the model; pages are
# rendered to this size before inference.
_MODEL_SIZE = 480
# ImageNet per-channel mean / std, applied to the [0, 1]-scaled RGB image
_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)
# PicoDet class id → internal boxclass name; ids missing here are dropped
_LABEL_MAP: dict[int, str] = {
    0: "picture",  # PicoDet "image" → "picture"
    1: "table",
    # 2: seal — ignored
}
# Minimum bbox width / height (in PDF points); smaller detections are skipped
_MIN_BOX_SIZE = 20


@dataclass
class LayoutBox:
    """A detected layout region, compatible with the existing _process_page code.

    As populated by the detector in this module, the coordinates are in PDF
    points: (x0, y0) is the min corner and (x1, y1) the max corner of the box.
    """

    x0: float  # xmin of the box
    y0: float  # ymin of the box
    x1: float  # xmax of the box
    y1: float  # ymax of the box
    boxclass: str  # "picture" | "table"


class _LayoutDetector:
    """Singleton that owns the lifecycle of the ONNX ``InferenceSession``.

    The session is created lazily on the first detection and then reused for
    every subsequent page.
    """

    def __init__(self) -> None:
        # Filled in by _init_session(); stays None until the first detection.
        self._session: ort.InferenceSession | None = None

    def _init_session(self) -> ort.InferenceSession:
        """Return the cached inference session, loading the model on first use.

        Returns:
            The (possibly freshly created) ONNX Runtime session.

        Raises:
            FileNotFoundError: ``settings.LAYOUT_MODEL_PATH`` is not an
                existing file.
        """
        if self._session is not None:
            return self._session

        model_path = Path(settings.LAYOUT_MODEL_PATH)
        # is_file() rather than exists(): a directory at that path is not a
        # loadable model either, so report it the same way.
        if not model_path.is_file():
            raise FileNotFoundError(
                f"Layout model not found: {model_path}. "
                "Run scripts/export_picodet_onnx.py first."
            )

        logger.info("Loading ONNX layout model: %s", model_path)
        self._session = ort.InferenceSession(
            str(model_path), providers=["CPUExecutionProvider"]
        )
        logger.info("ONNX layout model loaded")
        return self._session

    @staticmethod
    def _render_input(
        page: pymupdf.Page, page_w: float, page_h: float
    ) -> np.ndarray:
        """Render *page* and return the normalized (1, 3, H, W) float32 tensor."""
        # Independent x / y zoom stretches the page onto the square model input.
        matrix = pymupdf.Matrix(_MODEL_SIZE / page_w, _MODEL_SIZE / page_h)
        pix = page.get_pixmap(matrix=matrix)

        img = (
            np.frombuffer(pix.samples, dtype=np.uint8)
            .reshape(pix.height, pix.width, pix.n)
            .astype(np.float32)
            / 255.0
        )
        # Drop the alpha channel, if the pixmap has one.
        if img.shape[2] == 4:
            img = img[:, :, :3]
        img = (img - _MEAN) / _STD
        return img.transpose(2, 0, 1)[np.newaxis]  # HWC → (1, 3, H, W)

    @staticmethod
    def _decode_boxes(
        boxes_raw: np.ndarray, num_boxes: int, page_w: float, page_h: float
    ) -> list[LayoutBox]:
        """Convert raw model detections into filtered LayoutBox objects.

        Args:
            boxes_raw: (N, 6) rows of
                [class_id, score, xmin, ymin, xmax, ymax] in model pixels.
            num_boxes: number of valid leading rows in *boxes_raw*.
            page_w: page width in PDF points.
            page_h: page height in PDF points.

        Returns:
            Boxes in PDF points, with unmapped classes, low scores and tiny
            regions removed.
        """
        if num_boxes <= 0:
            return []

        # Pixel → PDF point scale (the inverse of the render zoom).
        sx = page_w / _MODEL_SIZE
        sy = page_h / _MODEL_SIZE
        threshold = settings.LAYOUT_THRESHOLD  # read once, not per box

        result: list[LayoutBox] = []
        # tolist() yields plain Python floats, so LayoutBox never carries
        # NumPy scalars (which are not JSON-serializable and, as np.float32,
        # are not ``float`` instances).
        rows = np.asarray(boxes_raw)[:num_boxes].tolist()
        for cls_id, score, xmin, ymin, xmax, ymax in rows:
            # Classes absent from _LABEL_MAP (e.g. seal) are skipped.
            label = _LABEL_MAP.get(int(cls_id))
            if label is None or score < threshold:
                continue

            x0, y0 = xmin * sx, ymin * sy
            x1, y1 = xmax * sx, ymax * sy

            # Skip tiny regions.
            if (x1 - x0) < _MIN_BOX_SIZE or (y1 - y0) < _MIN_BOX_SIZE:
                continue

            result.append(LayoutBox(x0=x0, y0=y0, x1=x1, y1=y1, boxclass=label))

        return result

    def detect_page(self, page: pymupdf.Page) -> list[LayoutBox]:
        """Detect figure / table regions on a single PDF page.

        Pipeline:
        1. Render the page with pymupdf at _MODEL_SIZE × _MODEL_SIZE.
        2. ImageNet-normalize → NCHW.
        3. ONNX inference → boxes that are already decoded and NMS-filtered.
        4. Pixel coordinates → PDF point coordinates.
        5. Filter out the seal class, low-confidence and tiny boxes.

        Args:
            page: pymupdf Page object.

        Returns:
            List of LayoutBox with coordinates in PDF points (plain Python
            floats). Empty when nothing is detected or the page has no area.

        Raises:
            FileNotFoundError: the layout model file is missing.
        """
        session = self._init_session()

        page_w = page.rect.width
        page_h = page.rect.height
        # A page without area cannot be rendered (the zoom would divide by 0).
        if page_w <= 0 or page_h <= 0:
            logger.warning(
                "Skipping layout detection on degenerate page (%s x %s)",
                page_w,
                page_h,
            )
            return []

        img = self._render_input(page, page_w, page_h)

        # Boxes are wanted in model-pixel space, hence a neutral scale factor
        # (only fed when the model declares a second input).
        scale_factor = np.array([[1.0, 1.0]], dtype=np.float32)

        input_names = [i.name for i in session.get_inputs()]
        feed = {input_names[0]: img}
        if len(input_names) > 1:
            feed[input_names[1]] = scale_factor

        outputs = session.run(None, feed)
        boxes_raw = outputs[0]  # (N, 6): [class_id, score, xmin, ymin, xmax, ymax]
        num_boxes = int(outputs[1][0])  # number of valid boxes

        return self._decode_boxes(boxes_raw, num_boxes, page_w, page_h)


# Module-level singleton shared by every call to detect_page_layout()
_detector = _LayoutDetector()


def detect_page_layout(page: pymupdf.Page) -> list[LayoutBox]:
    """Detect the figure / table regions on a PDF page.

    Delegates to the module-level detector instance.

    Args:
        page: pymupdf Page object to analyse.

    Returns:
        List of LayoutBox in PDF point coordinates, containing only
        picture/table regions.
    """
    layout_boxes = _detector.detect_page(page)
    return layout_boxes