feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

This commit is contained in:
2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
+130 -23
View File
@@ -34,6 +34,8 @@ _CLUSTER_GAP = 15
_MIN_BOX_AREA = 2000
# Phase 2: maximum distance (in pt) allowed when matching searched text to a box
_LABEL_MATCH_DISTANCE = 100
# Maximum distance (in pt) allowed when matching a DocLayout caption to a figure/table
_CAPTION_MATCH_DISTANCE = 120
# ── Box 聚类 ─────────────────────────────────────────────────────────
@@ -53,6 +55,15 @@ class _BoxCluster:
self.boxclass = "table" if raw == "table-fallback" else raw
def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
    """Return the cluster's bounds as ``[x0, y0, x1, y1]`` rounded to 1 decimal."""
    edges = (cluster.x0, cluster.y0, cluster.x1, cluster.y1)
    return [round(float(edge), 1) for edge in edges]
def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
"""将相邻的同类型 box 合并为聚类。"""
if not boxes:
@@ -92,6 +103,67 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
return [_BoxCluster(members) for members in groups.values()]
def _caption_class_for_content(boxclass: str) -> str:
    """Return the caption box class that pairs with a content box class.

    ``"picture"`` maps to ``"figure_caption"``; every other value maps to
    ``"table_caption"``.
    """
    if boxclass == "picture":
        return "figure_caption"
    return "table_caption"
def _caption_distance(content: _BoxCluster, caption: _BoxCluster) -> float | None:
    """Score how well *caption* pairs spatially with *content*.

    Returns the vertical gap between the two boxes (``0.0`` when they touch
    or overlap vertically), or ``None`` when the pair is not a candidate:
    the narrower box has no positive width, the horizontal overlap is below
    25% of the narrower box's width, or the gap exceeds
    ``_CAPTION_MATCH_DISTANCE``.
    """
    overlap_x = min(content.x1, caption.x1) - max(content.x0, caption.x0)
    narrower = min(content.x1 - content.x0, caption.x1 - caption.x0)

    # Degenerate boxes cannot be matched.
    if narrower <= 0:
        return None
    # Require the boxes to share at least a quarter of the narrower width.
    if overlap_x < narrower * 0.25:
        return None

    gap = 0.0
    if caption.y1 < content.y0:
        # Caption sits entirely above the content.
        gap = content.y0 - caption.y1
    elif caption.y0 > content.y1:
        # Caption sits entirely below the content.
        gap = caption.y0 - content.y1

    if gap <= _CAPTION_MATCH_DISTANCE:
        return gap
    return None
def _extract_caption_text(page, caption: _BoxCluster) -> str:
    """Return the page text inside the caption box, whitespace-normalized.

    The text is read with ``page.get_text("text", clip=...)`` restricted to
    the caption's rectangle; runs of whitespace (including newlines) are
    collapsed to single spaces. If extraction raises, an empty string is
    returned so that the caller can treat the caption as unusable.
    """
    clip_rect = pymupdf.Rect(caption.x0, caption.y0, caption.x1, caption.y1)
    try:
        raw_text = page.get_text("text", clip=clip_rect)
    except Exception:
        # Best-effort: a failed extraction just means "no caption text".
        return ""
    else:
        return " ".join(raw_text.split())
def _match_captions(
    page,
    content_clusters: list[_BoxCluster],
    caption_clusters: list[_BoxCluster],
) -> dict[int, tuple[_BoxCluster, str]]:
    """Pair content clusters with their nearest same-type DocLayout caption.

    Every (content, caption) pair whose caption class matches the content
    type and that ``_caption_distance`` accepts becomes a candidate.
    Candidates are then consumed greedily in ascending order of
    (distance, content index, caption index); each content cluster and each
    caption is used at most once. A caption whose extracted text is empty is
    skipped without being consumed, so the content cluster may still be
    paired with a later candidate.

    Returns a mapping from the content cluster's index in
    *content_clusters* to ``(caption_cluster, caption_text)``.
    """
    # Step 1: score every admissible pairing.
    scored_pairs: list[tuple[float, int, int]] = []
    for c_idx, content in enumerate(content_clusters):
        expected_class = _caption_class_for_content(content.boxclass)
        for k_idx, caption in enumerate(caption_clusters):
            if caption.boxclass == expected_class:
                score = _caption_distance(content, caption)
                if score is not None:
                    scored_pairs.append((score, c_idx, k_idx))
    scored_pairs.sort()

    # Step 2: greedy one-to-one assignment, closest pairs first.
    paired: dict[int, tuple[_BoxCluster, str]] = {}
    taken_captions: set[int] = set()
    for _score, c_idx, k_idx in scored_pairs:
        if c_idx in paired or k_idx in taken_captions:
            continue
        chosen = caption_clusters[k_idx]
        caption_text = _extract_caption_text(page, chosen)
        if caption_text:
            paired[c_idx] = (chosen, caption_text)
            taken_captions.add(k_idx)
    return paired
# ── Phase 1: 检测 + 渲染 ──────────────────────────────────────────────
@@ -102,14 +174,25 @@ def _render_box(
filename: str,
cap_type: str,
page_num: int,
caption: _BoxCluster | None = None,
) -> bool:
"""渲染单个 box 区域并保存 JPEG,成功返回 True。"""
"""渲染单个 box 区域并保存 JPEG,成功返回 True。
若提供 caption,则将内容与 caption 区域合并后一起截取,
使同一张截图同时包含图/表及其标题文字。
"""
page_width = page.rect.width
x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
if caption is not None:
x0 = min(x0, caption.x0)
y0 = min(y0, caption.y0)
x1 = max(x1, caption.x1)
y1 = max(y1, caption.y1)
clip = pymupdf.Rect(
max(0, box.x0 - _REGION_PADDING),
max(0, box.y0 - _REGION_PADDING),
min(page_width, box.x1 + _REGION_PADDING),
box.y1 + _REGION_PADDING,
max(0, x0 - _REGION_PADDING),
max(0, y0 - _REGION_PADDING),
min(page_width, x1 + _REGION_PADDING),
y1 + _REGION_PADDING,
)
mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
try:
@@ -136,25 +219,31 @@ def _process_page(
fig_counter = 0
tbl_counter = 0
# 收集本页的 table/picture box(跳过极小区域)
# 收集本页的 table/picture box 与 caption box(跳过极小区域)
raw_boxes = []
raw_caption_boxes = []
for box in page_boxes:
if box.boxclass not in ("table", "table-fallback", "picture"):
continue
w = box.x1 - box.x0
h = box.y1 - box.y0
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
continue
raw_boxes.append(box)
if box.boxclass in ("table", "table-fallback", "picture"):
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
continue
raw_boxes.append(box)
elif box.boxclass in ("figure_caption", "table_caption"):
if w < 30 or h < 6:
continue
raw_caption_boxes.append(box)
if not raw_boxes:
return 0
# 聚类:将同一 figure/table 的碎片 box 合并
clusters = _cluster_boxes(raw_boxes)
caption_clusters = _cluster_boxes(raw_caption_boxes)
caption_matches = _match_captions(page, clusters, caption_clusters)
extracted = 0
for cluster in clusters:
for cluster_idx, cluster in enumerate(clusters):
cap_type = "figure" if cluster.boxclass == "picture" else "table"
if cap_type == "figure":
@@ -168,21 +257,33 @@ def _process_page(
continue
seen_labels.add(label)
caption_match = caption_matches.get(cluster_idx)
caption_cluster = caption_match[0] if caption_match else None
filename = f"{label.replace(' ', '_').lower()}.jpg"
if not _render_box(page, cluster, images_dest, filename, cap_type, page_num):
if not _render_box(
page,
cluster,
images_dest,
filename,
cap_type,
page_num,
caption=caption_cluster,
):
continue
manifest[filename] = {
info = {
"page": page_num,
"type": cap_type,
"label": label,
"box": [
round(float(cluster.x0), 1),
round(float(cluster.y0), 1),
round(float(cluster.x1), 1),
round(float(cluster.y1), 1),
],
"box": _cluster_to_box(cluster),
}
if caption_match:
info["caption_text"] = caption_match[1][:500]
info["caption_box"] = _cluster_to_box(caption_cluster)
info["caption_source"] = "doclayout"
manifest[filename] = info
extracted += 1
return extracted
@@ -446,14 +547,20 @@ def label_images_by_summary(
cap_type = info.get("type", "figure")
# 读取 caption 文本(从 figures 列表)
caption_text = ""
summary_caption_text = ""
for fig in figures:
if fig.get("id") == fig_id:
caption_text = fig.get("caption", "")
summary_caption_text = fig.get("caption", "")
break
info["label"] = fig_id
info["caption_text"] = caption_text[:200] if caption_text else ""
existing_caption_text = info.get("caption_text", "")
if existing_caption_text and summary_caption_text:
info["summary_caption_text"] = summary_caption_text[:500]
else:
info["caption_text"] = (
summary_caption_text[:500] if summary_caption_text else ""
)
info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
fig_id
)