feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
@@ -34,6 +34,8 @@ _CLUSTER_GAP = 15
 _MIN_BOX_AREA = 2000
 # Phase 2: 搜索文本到 box 的最大匹配距离（单位: pt）
 _LABEL_MATCH_DISTANCE = 100
+# Maximum distance for matching a DocLayout caption to a figure/table (unit: pt)
+_CAPTION_MATCH_DISTANCE = 120


 # ── Box 聚类 ─────────────────────────────────────────────────────────
@@ -53,6 +55,15 @@ class _BoxCluster:
        self.boxclass = "table" if raw == "table-fallback" else raw


+def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
+    """Return the cluster bounds as ``[x0, y0, x1, y1]``, rounded to one decimal."""
+    bounds = (cluster.x0, cluster.y0, cluster.x1, cluster.y1)
+    return [round(float(value), 1) for value in bounds]
+
+
 def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
    """将相邻的同类型 box 合并为聚类。"""
    if not boxes:
@@ -92,6 +103,67 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
    return [_BoxCluster(members) for members in groups.values()]


+def _caption_class_for_content(boxclass: str) -> str:
+    """Return the caption box class that pairs with a content box class.
+
+    ``"picture"`` maps to ``"figure_caption"``; every other class maps to
+    ``"table_caption"``.
+    """
+    if boxclass == "picture":
+        return "figure_caption"
+    return "table_caption"
+
+
+def _caption_distance(content: _BoxCluster, caption: _BoxCluster) -> float | None:
+    """Score how well a caption box pairs spatially with a content box.
+
+    Returns the vertical gap between the two boxes (``0.0`` when neither lies
+    strictly above or below the other). Returns ``None`` when the narrower box
+    has no positive width, when the horizontal overlap is under a quarter of
+    the narrower width, or when the gap exceeds ``_CAPTION_MATCH_DISTANCE``.
+    """
+    content_width = content.x1 - content.x0
+    caption_width = caption.x1 - caption.x0
+    narrower = min(content_width, caption_width)
+    if narrower <= 0:
+        return None
+
+    # Horizontal extent shared by both boxes (negative when they are disjoint).
+    shared = min(content.x1, caption.x1) - max(content.x0, caption.x0)
+    if shared < narrower * 0.25:
+        return None
+
+    if caption.y1 < content.y0:
+        # Caption ends before the content starts.
+        gap = content.y0 - caption.y1
+    elif caption.y0 > content.y1:
+        # Caption starts after the content ends.
+        gap = caption.y0 - content.y1
+    else:
+        gap = 0.0
+
+    if gap <= _CAPTION_MATCH_DISTANCE:
+        return gap
+    return None
+
+
+def _extract_caption_text(page, caption: _BoxCluster) -> str:
+    """Return the page text inside the caption box with whitespace collapsed.
+
+    Any exception raised while reading text from the page yields ``""``.
+    """
+    clip_rect = pymupdf.Rect(caption.x0, caption.y0, caption.x1, caption.y1)
+    try:
+        raw_text = page.get_text("text", clip=clip_rect)
+    except Exception:
+        # Best effort: an unreadable caption is treated as having no text.
+        return ""
+    words = raw_text.split()
+    return " ".join(words)
+
+
+def _match_captions(
+    page,
+    content_clusters: list[_BoxCluster],
+    caption_clusters: list[_BoxCluster],
+) -> dict[int, tuple[_BoxCluster, str]]:
+    """Match each content cluster to its nearest same-type DocLayout caption.
+
+    Every (content, caption) pair whose caption class fits the content class
+    and for which ``_caption_distance`` returns a score becomes a candidate.
+    Candidates are consumed greedily in ascending ``(distance, content index,
+    caption index)`` order, so each content cluster and each caption is paired
+    at most once.
+
+    Args:
+        page: Page object handed to ``_extract_caption_text``.
+        content_clusters: Clustered figure/table boxes.
+        caption_clusters: Clustered caption boxes.
+
+    Returns:
+        Mapping from an index into ``content_clusters`` to a tuple of the
+        matched caption cluster and its extracted text. Content clusters
+        without a usable caption are absent from the mapping.
+    """
+    candidates: list[tuple[float, int, int]] = []
+    for content_idx, content in enumerate(content_clusters):
+        wanted_caption_class = _caption_class_for_content(content.boxclass)
+        for caption_idx, caption in enumerate(caption_clusters):
+            if caption.boxclass != wanted_caption_class:
+                continue
+            dist = _caption_distance(content, caption)
+            if dist is not None:
+                candidates.append((dist, content_idx, caption_idx))
+
+    matches: dict[int, tuple[_BoxCluster, str]] = {}
+    # Captions that are either already paired or known to have no text.
+    unavailable_captions: set[int] = set()
+
+    for _dist, content_idx, caption_idx in sorted(candidates):
+        if content_idx in matches or caption_idx in unavailable_captions:
+            continue
+        caption = caption_clusters[caption_idx]
+        text = _extract_caption_text(page, caption)
+        # Whether or not text was found, this caption is finished: it is
+        # either paired below or empty. Retiring it here means its text is
+        # extracted at most once, while the content cluster stays free to
+        # fall through to its next-nearest caption.
+        unavailable_captions.add(caption_idx)
+        if text:
+            matches[content_idx] = (caption, text)
+
+    return matches
+
+
 # ── Phase 1: 检测 + 渲染 ──────────────────────────────────────────────


@@ -102,14 +174,25 @@ def _render_box(
    filename: str,
    cap_type: str,
    page_num: int,
+    caption: _BoxCluster | None = None,
 ) -> bool:
-    """渲染单个 box 区域并保存 JPEG，成功返回 True。"""
+    """渲染单个 box 区域并保存 JPEG，成功返回 True。
+
+    若提供 caption，则将内容与 caption 区域合并后一起截取，
+    使同一张截图同时包含图/表及其标题文字。
+    """
    page_width = page.rect.width
+    x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
+    if caption is not None:
+        x0 = min(x0, caption.x0)
+        y0 = min(y0, caption.y0)
+        x1 = max(x1, caption.x1)
+        y1 = max(y1, caption.y1)
    clip = pymupdf.Rect(
-        max(0, box.x0 - _REGION_PADDING),
-        max(0, box.y0 - _REGION_PADDING),
-        min(page_width, box.x1 + _REGION_PADDING),
-        box.y1 + _REGION_PADDING,
+        max(0, x0 - _REGION_PADDING),
+        max(0, y0 - _REGION_PADDING),
+        min(page_width, x1 + _REGION_PADDING),
+        y1 + _REGION_PADDING,
    )
    mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
    try:
@@ -136,25 +219,31 @@ def _process_page(
    fig_counter = 0
    tbl_counter = 0

-    # 收集本页的 table/picture box（跳过极小区域）
+    # 收集本页的 table/picture box 与 caption box（跳过极小区域）
    raw_boxes = []
+    raw_caption_boxes = []
    for box in page_boxes:
-        if box.boxclass not in ("table", "table-fallback", "picture"):
-            continue
        w = box.x1 - box.x0
        h = box.y1 - box.y0
-        if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
-            continue
-        raw_boxes.append(box)
+        if box.boxclass in ("table", "table-fallback", "picture"):
+            if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
+                continue
+            raw_boxes.append(box)
+        elif box.boxclass in ("figure_caption", "table_caption"):
+            if w < 30 or h < 6:
+                continue
+            raw_caption_boxes.append(box)

    if not raw_boxes:
        return 0

    # 聚类：将同一 figure/table 的碎片 box 合并
    clusters = _cluster_boxes(raw_boxes)
+    caption_clusters = _cluster_boxes(raw_caption_boxes)
+    caption_matches = _match_captions(page, clusters, caption_clusters)

    extracted = 0
-    for cluster in clusters:
+    for cluster_idx, cluster in enumerate(clusters):
        cap_type = "figure" if cluster.boxclass == "picture" else "table"

        if cap_type == "figure":
@@ -168,21 +257,33 @@ def _process_page(
            continue
        seen_labels.add(label)

+        caption_match = caption_matches.get(cluster_idx)
+        caption_cluster = caption_match[0] if caption_match else None
+
        filename = f"{label.replace(' ', '_').lower()}.jpg"
-        if not _render_box(page, cluster, images_dest, filename, cap_type, page_num):
+        if not _render_box(
+            page,
+            cluster,
+            images_dest,
+            filename,
+            cap_type,
+            page_num,
+            caption=caption_cluster,
+        ):
            continue

-        manifest[filename] = {
+        info = {
            "page": page_num,
            "type": cap_type,
            "label": label,
-            "box": [
-                round(float(cluster.x0), 1),
-                round(float(cluster.y0), 1),
-                round(float(cluster.x1), 1),
-                round(float(cluster.y1), 1),
-            ],
+            "box": _cluster_to_box(cluster),
        }
+        if caption_match:
+            info["caption_text"] = caption_match[1][:500]
+            info["caption_box"] = _cluster_to_box(caption_cluster)
+            info["caption_source"] = "doclayout"
+
+        manifest[filename] = info
        extracted += 1

    return extracted
@@ -446,14 +547,20 @@ def label_images_by_summary(
        cap_type = info.get("type", "figure")

        # 读取 caption 文本（从 figures 列表）
-        caption_text = ""
+        summary_caption_text = ""
        for fig in figures:
            if fig.get("id") == fig_id:
-                caption_text = fig.get("caption", "")
+                summary_caption_text = fig.get("caption", "")
                break

        info["label"] = fig_id
-        info["caption_text"] = caption_text[:200] if caption_text else ""
+        existing_caption_text = info.get("caption_text", "")
+        if existing_caption_text and summary_caption_text:
+            info["summary_caption_text"] = summary_caption_text[:500]
+        else:
+            info["caption_text"] = (
+                summary_caption_text[:500] if summary_caption_text else ""
+            )
        info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
            fig_id
        )