feat: add concurrency safety, caption detection, admin enhancements, and performance improvements
This commit is contained in:
@@ -34,6 +34,8 @@ _CLUSTER_GAP = 15
|
||||
_MIN_BOX_AREA = 2000
|
||||
# Phase 2: 搜索文本到 box 的最大匹配距离(单位: pt)
|
||||
_LABEL_MATCH_DISTANCE = 100
|
||||
# DocLayout caption 与 figure/table 匹配的最大距离(单位: pt)
|
||||
_CAPTION_MATCH_DISTANCE = 120
|
||||
|
||||
|
||||
# ── Box 聚类 ─────────────────────────────────────────────────────────
|
||||
@@ -53,6 +55,15 @@ class _BoxCluster:
|
||||
self.boxclass = "table" if raw == "table-fallback" else raw
|
||||
|
||||
|
||||
def _cluster_to_box(cluster: _BoxCluster) -> list[float]:
    """Serialize a cluster's bounding box as ``[x0, y0, x1, y1]``.

    Each coordinate is converted to ``float`` and rounded to one decimal
    place, which is the form stored under the manifest's ``"box"`` and
    ``"caption_box"`` keys.
    """
    corners = (cluster.x0, cluster.y0, cluster.x1, cluster.y1)
    return [round(float(coord), 1) for coord in corners]
|
||||
|
||||
|
||||
def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
|
||||
"""将相邻的同类型 box 合并为聚类。"""
|
||||
if not boxes:
|
||||
@@ -92,6 +103,67 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]:
|
||||
return [_BoxCluster(members) for members in groups.values()]
|
||||
|
||||
|
||||
def _caption_class_for_content(boxclass: str) -> str:
    """Return the caption box class that pairs with a content box class.

    ``"picture"`` content pairs with ``"figure_caption"``; every other
    content class is treated as a table and pairs with ``"table_caption"``.
    """
    if boxclass == "picture":
        return "figure_caption"
    return "table_caption"
|
||||
|
||||
|
||||
def _caption_distance(content: _BoxCluster, caption: _BoxCluster) -> float | None:
    """Score how well a caption box pairs spatially with a content box.

    The score is the vertical gap between the two boxes (``0.0`` when they
    overlap vertically); smaller is better.

    Returns ``None`` when the pair is not a plausible match:

    * either box has a non-positive width,
    * the horizontal overlap is less than 25% of the narrower box's width, or
    * the vertical gap exceeds ``_CAPTION_MATCH_DISTANCE``.
    """
    narrowest = min(content.x1 - content.x0, caption.x1 - caption.x0)
    if narrowest <= 0:
        return None

    # Width of the horizontal intersection; negative when the boxes are
    # disjoint along x.
    shared_width = min(content.x1, caption.x1) - max(content.x0, caption.x0)
    if shared_width < narrowest * 0.25:
        return None

    if caption.y1 < content.y0:
        # Caption lies entirely above the content box.
        gap = content.y0 - caption.y1
    elif caption.y0 > content.y1:
        # Caption lies entirely below the content box.
        gap = caption.y0 - content.y1
    else:
        # The boxes overlap vertically.
        gap = 0.0

    if gap <= _CAPTION_MATCH_DISTANCE:
        return gap
    return None
|
||||
|
||||
|
||||
def _extract_caption_text(page, caption: _BoxCluster) -> str:
    """Return the page text inside the caption's bounding box.

    The text is read with ``page.get_text("text", clip=...)`` and every run
    of whitespace (including newlines) is collapsed to a single space.

    Extraction is best-effort: if ``get_text`` raises any ``Exception`` an
    empty string is returned instead of propagating the error.
    """
    clip_rect = pymupdf.Rect(caption.x0, caption.y0, caption.x1, caption.y1)
    try:
        raw_text = page.get_text("text", clip=clip_rect)
    except Exception:
        # NOTE(review): the failure is swallowed silently; consider logging
        # it if the module has a logger available.
        return ""
    words = raw_text.split()
    return " ".join(words)
|
||||
|
||||
|
||||
def _match_captions(
    page,
    content_clusters: list[_BoxCluster],
    caption_clusters: list[_BoxCluster],
) -> dict[int, tuple[_BoxCluster, str]]:
    """Match each content cluster to its nearest same-type DocLayout caption.

    Candidate (content, caption) pairs are those where the caption's class
    matches the content's class (see ``_caption_class_for_content``) and
    ``_caption_distance`` returns a score. Pairs are then assigned greedily
    in ascending score order, so each content cluster gets at most one
    caption and each caption is used at most once.

    A caption whose extracted text is empty never produces a match; the
    content cluster stays available for its next-best candidate.

    Args:
        page: Page object passed through to ``_extract_caption_text``.
        content_clusters: Figure/table clusters to find captions for.
        caption_clusters: Caption clusters detected on the same page.

    Returns:
        Mapping from an index into ``content_clusters`` to a
        ``(caption_cluster, caption_text)`` tuple.
    """
    matches: dict[int, tuple[_BoxCluster, str]] = {}
    used_captions: set[int] = set()
    # Captions already found to contain no text. Remembering them avoids
    # repeating the text extraction for every other content cluster that
    # lists the same caption as a candidate.
    textless_captions: set[int] = set()
    candidates: list[tuple[float, int, int]] = []

    for content_idx, content in enumerate(content_clusters):
        wanted_caption_class = _caption_class_for_content(content.boxclass)
        for caption_idx, caption in enumerate(caption_clusters):
            if caption.boxclass != wanted_caption_class:
                continue
            dist = _caption_distance(content, caption)
            if dist is not None:
                candidates.append((dist, content_idx, caption_idx))

    # Tuples sort by distance first; the indices break ties deterministically.
    for _dist, content_idx, caption_idx in sorted(candidates):
        if content_idx in matches or caption_idx in used_captions:
            continue
        if caption_idx in textless_captions:
            continue
        caption = caption_clusters[caption_idx]
        text = _extract_caption_text(page, caption)
        if not text:
            textless_captions.add(caption_idx)
            continue
        matches[content_idx] = (caption, text)
        used_captions.add(caption_idx)

    return matches
|
||||
|
||||
|
||||
# ── Phase 1: 检测 + 渲染 ──────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -102,14 +174,25 @@ def _render_box(
|
||||
filename: str,
|
||||
cap_type: str,
|
||||
page_num: int,
|
||||
caption: _BoxCluster | None = None,
|
||||
) -> bool:
|
||||
"""渲染单个 box 区域并保存 JPEG,成功返回 True。"""
|
||||
"""渲染单个 box 区域并保存 JPEG,成功返回 True。
|
||||
|
||||
若提供 caption,则将内容与 caption 区域合并后一起截取,
|
||||
使同一张截图同时包含图/表及其标题文字。
|
||||
"""
|
||||
page_width = page.rect.width
|
||||
x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1
|
||||
if caption is not None:
|
||||
x0 = min(x0, caption.x0)
|
||||
y0 = min(y0, caption.y0)
|
||||
x1 = max(x1, caption.x1)
|
||||
y1 = max(y1, caption.y1)
|
||||
clip = pymupdf.Rect(
|
||||
max(0, box.x0 - _REGION_PADDING),
|
||||
max(0, box.y0 - _REGION_PADDING),
|
||||
min(page_width, box.x1 + _REGION_PADDING),
|
||||
box.y1 + _REGION_PADDING,
|
||||
max(0, x0 - _REGION_PADDING),
|
||||
max(0, y0 - _REGION_PADDING),
|
||||
min(page_width, x1 + _REGION_PADDING),
|
||||
y1 + _REGION_PADDING,
|
||||
)
|
||||
mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
|
||||
try:
|
||||
@@ -136,25 +219,31 @@ def _process_page(
|
||||
fig_counter = 0
|
||||
tbl_counter = 0
|
||||
|
||||
# 收集本页的 table/picture box(跳过极小区域)
|
||||
# 收集本页的 table/picture box 与 caption box(跳过极小区域)
|
||||
raw_boxes = []
|
||||
raw_caption_boxes = []
|
||||
for box in page_boxes:
|
||||
if box.boxclass not in ("table", "table-fallback", "picture"):
|
||||
continue
|
||||
w = box.x1 - box.x0
|
||||
h = box.y1 - box.y0
|
||||
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
|
||||
continue
|
||||
raw_boxes.append(box)
|
||||
if box.boxclass in ("table", "table-fallback", "picture"):
|
||||
if w < 20 or h < 20 or w * h < _MIN_BOX_AREA:
|
||||
continue
|
||||
raw_boxes.append(box)
|
||||
elif box.boxclass in ("figure_caption", "table_caption"):
|
||||
if w < 30 or h < 6:
|
||||
continue
|
||||
raw_caption_boxes.append(box)
|
||||
|
||||
if not raw_boxes:
|
||||
return 0
|
||||
|
||||
# 聚类:将同一 figure/table 的碎片 box 合并
|
||||
clusters = _cluster_boxes(raw_boxes)
|
||||
caption_clusters = _cluster_boxes(raw_caption_boxes)
|
||||
caption_matches = _match_captions(page, clusters, caption_clusters)
|
||||
|
||||
extracted = 0
|
||||
for cluster in clusters:
|
||||
for cluster_idx, cluster in enumerate(clusters):
|
||||
cap_type = "figure" if cluster.boxclass == "picture" else "table"
|
||||
|
||||
if cap_type == "figure":
|
||||
@@ -168,21 +257,33 @@ def _process_page(
|
||||
continue
|
||||
seen_labels.add(label)
|
||||
|
||||
caption_match = caption_matches.get(cluster_idx)
|
||||
caption_cluster = caption_match[0] if caption_match else None
|
||||
|
||||
filename = f"{label.replace(' ', '_').lower()}.jpg"
|
||||
if not _render_box(page, cluster, images_dest, filename, cap_type, page_num):
|
||||
if not _render_box(
|
||||
page,
|
||||
cluster,
|
||||
images_dest,
|
||||
filename,
|
||||
cap_type,
|
||||
page_num,
|
||||
caption=caption_cluster,
|
||||
):
|
||||
continue
|
||||
|
||||
manifest[filename] = {
|
||||
info = {
|
||||
"page": page_num,
|
||||
"type": cap_type,
|
||||
"label": label,
|
||||
"box": [
|
||||
round(float(cluster.x0), 1),
|
||||
round(float(cluster.y0), 1),
|
||||
round(float(cluster.x1), 1),
|
||||
round(float(cluster.y1), 1),
|
||||
],
|
||||
"box": _cluster_to_box(cluster),
|
||||
}
|
||||
if caption_match:
|
||||
info["caption_text"] = caption_match[1][:500]
|
||||
info["caption_box"] = _cluster_to_box(caption_cluster)
|
||||
info["caption_source"] = "doclayout"
|
||||
|
||||
manifest[filename] = info
|
||||
extracted += 1
|
||||
|
||||
return extracted
|
||||
@@ -446,14 +547,20 @@ def label_images_by_summary(
|
||||
cap_type = info.get("type", "figure")
|
||||
|
||||
# 读取 caption 文本(从 figures 列表)
|
||||
caption_text = ""
|
||||
summary_caption_text = ""
|
||||
for fig in figures:
|
||||
if fig.get("id") == fig_id:
|
||||
caption_text = fig.get("caption", "")
|
||||
summary_caption_text = fig.get("caption", "")
|
||||
break
|
||||
|
||||
info["label"] = fig_id
|
||||
info["caption_text"] = caption_text[:200] if caption_text else ""
|
||||
existing_caption_text = info.get("caption_text", "")
|
||||
if existing_caption_text and summary_caption_text:
|
||||
info["summary_caption_text"] = summary_caption_text[:500]
|
||||
else:
|
||||
info["caption_text"] = (
|
||||
summary_caption_text[:500] if summary_caption_text else ""
|
||||
)
|
||||
info.setdefault("figures" if cap_type == "figure" else "tables", []).append(
|
||||
fig_id
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user