feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+174
View File
@@ -0,0 +1,174 @@
"""PicoDet-S_layout_3cls 布局检测 — 纯 ONNX Runtime 推理.
用 onnxruntime 加载导出好的 ONNX 模型,检测 PDF 页面中的 figure / table 区域。
模型自带 NMS + GFL decode,输出即为后处理完毕的检测框。
输入:
image: (1, 3, 480, 480) float32 — ImageNet 标准化后的图片
scale_factor: (1, 2) float32 — [y_scale, x_scale],用于坐标还原
输出:
fetch_name_0: (N, 6) float32 — [class_id, score, xmin, ymin, xmax, ymax]
fetch_name_1: (1,) int32 — 有效框数量 N
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import onnxruntime as ort
import pymupdf
from app.config import settings
logger = logging.getLogger(__name__)
# Side length, in pixels, of the square image fed to the model.
_MODEL_SIZE = 480

# Per-channel mean / std used for ImageNet normalization.
_MEAN = np.asarray((0.485, 0.456, 0.406), dtype=np.float32)
_STD = np.asarray((0.229, 0.224, 0.225), dtype=np.float32)

# PicoDet class id -> internal ``boxclass`` name. Class 2 (seal) is left out
# on purpose so that it is ignored.
_LABEL_MAP: dict[int, str] = {
    0: "picture",  # PicoDet calls this class "image"
    1: "table",
}

# Minimum bbox width/height, in PDF points.
_MIN_BOX_SIZE = 20
@dataclass
class LayoutBox:
    """One detected layout region on a PDF page.

    The field names are kept compatible with the existing ``_process_page``
    code that consumes these boxes.
    """

    # Corner coordinates of the region: (x0, y0) and (x1, y1).
    x0: float
    y0: float
    x1: float
    y1: float
    # Region kind: "picture" or "table".
    boxclass: str
class _LayoutDetector:
    """Lazily load the PicoDet layout ONNX model and run it on PDF pages.

    The ``InferenceSession`` is created by the first detection that needs it
    and is then cached on the instance; constructing the detector does not
    load anything.
    """

    def __init__(self) -> None:
        # Populated by _init_session() on first use.
        self._session: ort.InferenceSession | None = None

    def _init_session(self) -> ort.InferenceSession:
        """Return the cached session, loading the model on first use.

        Returns:
            The ONNX Runtime session (CPU execution provider).

        Raises:
            FileNotFoundError: If ``settings.LAYOUT_MODEL_PATH`` does not exist.
        """
        if self._session is not None:
            return self._session
        model_path = Path(settings.LAYOUT_MODEL_PATH)
        if not model_path.exists():
            raise FileNotFoundError(
                f"Layout model not found: {model_path}. "
                "Run scripts/export_picodet_onnx.py first."
            )
        logger.info("Loading ONNX layout model: %s", model_path)
        self._session = ort.InferenceSession(
            str(model_path), providers=["CPUExecutionProvider"]
        )
        logger.info("ONNX layout model loaded")
        return self._session

    @staticmethod
    def _render_input(
        page: pymupdf.Page, page_w: float, page_h: float
    ) -> np.ndarray:
        """Render *page* stretched to the model size as a normalized NCHW tensor."""
        # Independent x/y zoom stretches the page onto the square model input.
        zoom = pymupdf.Matrix(_MODEL_SIZE / page_w, _MODEL_SIZE / page_h)
        pix = page.get_pixmap(matrix=zoom, alpha=False)
        pixels = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
            pix.height, pix.width, pix.n
        )
        # Keep at most three channels (drops an alpha channel if one is present).
        img = pixels[:, :, :3].astype(np.float32) / 255.0
        img = (img - _MEAN) / _STD
        # HWC -> (1, C, H, W); made contiguous before it is handed to the runtime.
        return np.ascontiguousarray(img.transpose(2, 0, 1)[np.newaxis])

    @staticmethod
    def _build_feed(
        session: ort.InferenceSession, image: np.ndarray
    ) -> dict[str, np.ndarray]:
        """Build the input feed, binding tensors by name when possible."""
        # The page is rendered directly at model resolution, so no rescaling
        # of the returned coordinates is requested from the model.
        scale_factor = np.ones((1, 2), dtype=np.float32)
        names = [meta.name for meta in session.get_inputs()]
        if "image" in names and "scale_factor" in names:
            # Name-based binding is immune to the export's input ordering.
            return {"image": image, "scale_factor": scale_factor}
        # Unknown input names: fall back to positional binding.
        feed = {names[0]: image}
        if len(names) > 1:
            feed[names[1]] = scale_factor
        return feed

    @staticmethod
    def _to_layout_boxes(
        boxes_raw: np.ndarray,
        num_boxes: int,
        page_w: float,
        page_h: float,
        threshold: float,
    ) -> list[LayoutBox]:
        """Convert raw detections (model pixels) into filtered ``LayoutBox`` items.

        Rows are ``[class_id, score, xmin, ymin, xmax, ymax]``. Classes missing
        from ``_LABEL_MAP``, scores below *threshold* and boxes smaller than
        ``_MIN_BOX_SIZE`` in either dimension are dropped.
        """
        sx = page_w / _MODEL_SIZE
        sy = page_h / _MODEL_SIZE
        result: list[LayoutBox] = []
        for row in np.asarray(boxes_raw)[:num_boxes]:
            # tolist() yields plain Python floats instead of numpy scalars.
            cls_id, score, xmin, ymin, xmax, ymax = row.tolist()
            label = _LABEL_MAP.get(int(cls_id))
            if label is None or score < threshold:
                continue
            # Clamp to the page so boxes that spill past the rendered image
            # cannot produce out-of-page coordinates, and so the size filter
            # below measures the usable area only.
            x0 = min(max(xmin * sx, 0.0), page_w)
            y0 = min(max(ymin * sy, 0.0), page_h)
            x1 = min(max(xmax * sx, 0.0), page_w)
            y1 = min(max(ymax * sy, 0.0), page_h)
            if (x1 - x0) < _MIN_BOX_SIZE or (y1 - y0) < _MIN_BOX_SIZE:
                continue
            result.append(LayoutBox(x0=x0, y0=y0, x1=x1, y1=y1, boxclass=label))
        return result

    def detect_page(self, page: pymupdf.Page) -> list[LayoutBox]:
        """Detect figure / table regions on a single PDF page.

        Steps:
            1. Render the page with pymupdf at ``_MODEL_SIZE`` x ``_MODEL_SIZE``.
            2. Apply ImageNet normalization and convert to NCHW.
            3. Run the ONNX model, which returns decoded, NMS-filtered boxes.
            4. Convert pixel coordinates to PDF points (clamped to the page).
            5. Drop the seal class, low-confidence boxes and tiny boxes.

        Args:
            page: The pymupdf page to analyse.

        Returns:
            Detected boxes in PDF points. Empty when nothing is detected or
            when the page has no area.

        Raises:
            FileNotFoundError: If the layout model file is missing.
        """
        page_w = float(page.rect.width)
        page_h = float(page.rect.height)
        # A page without area cannot be rendered; bail out before the zoom
        # factors would divide by zero (and before loading the model).
        if page_w <= 0 or page_h <= 0:
            return []

        session = self._init_session()
        image = self._render_input(page, page_w, page_h)
        outputs = session.run(None, self._build_feed(session, image))

        num_boxes = int(outputs[1][0])  # number of valid rows in outputs[0]
        if num_boxes <= 0:
            return []
        return self._to_layout_boxes(
            outputs[0], num_boxes, page_w, page_h, settings.LAYOUT_THRESHOLD
        )
# Module-level singleton shared by every caller of detect_page_layout().
_detector = _LayoutDetector()


def detect_page_layout(page: pymupdf.Page) -> list[LayoutBox]:
    """Detect figure / table regions on a PDF page.

    Args:
        page: The pymupdf page to analyse.

    Returns:
        The boxes reported by the shared detector for this page, with
        coordinates in PDF points (picture / table regions only).
    """
    layout_boxes = _detector.detect_page(page)
    return layout_boxes