diff --git a/app/services/pdf_image_extractor.py b/app/services/pdf_image_extractor.py index 7d771e5..891647c 100644 --- a/app/services/pdf_image_extractor.py +++ b/app/services/pdf_image_extractor.py @@ -1,12 +1,13 @@ -"""PDF 图片与表格提取 — 两阶段流水线。 +"""PDF 图片与表格提取。 -Phase 1: DocLayout-YOLO 检测 figure/table 区域 → 渲染为 JPEG(通用标签) -Phase 2: 用 LLM summary 的 figures[].id 在 PDF 中搜索定位 → 匹配到 box → 重命名 +DocLayout-YOLO 检测 figure/table 内容区域 → PDF 文本流定位 caption → 只渲染配到 +Figure/Table 标题的,用 caption 自带权威 ID 命名。没配到标题的(Algorithm 伪代码、 +无编号附录表、DocLayout 误检碎片)一律过滤,不输出。 -相比旧方案(正则匹配 caption): -- 不再依赖正则,用 LLM 输出的 ID 直接搜索 PDF 文本 -- page.search_for() 精确搜索 + 空间距离过滤,避免正文引用误匹配 -- 通用标签兜底,LLM 没提到的图表不会被丢弃 +caption 定位用 PDF 文本而非 DocLayout 的 caption box —— 后者检测不稳(多行标题只 +框一行→截断、漏检→无标题、配对错误→串台)。page.get_text("dict") 找以 +"Figure N"/"Table N" 开头的文本块:文本块天然含完整多行标题,且其 ID 即论文实际 +编号,直接命名规避串台。figure 标题优先在下方、table 标题优先在上方配对。 """ from __future__ import annotations @@ -14,6 +15,7 @@ from __future__ import annotations import json import logging import re +from dataclasses import dataclass from pathlib import Path import pymupdf @@ -32,10 +34,16 @@ _RENDER_ZOOM = 3 _CLUSTER_GAP = 15 # 最小 bbox 面积(单位: pt²)— 过滤 icon/logo 等微小误检 _MIN_BOX_AREA = 2000 -# Phase 2: 搜索文本到 box 的最大匹配距离(单位: pt) -_LABEL_MATCH_DISTANCE = 100 -# DocLayout caption 与 figure/table 匹配的最大距离(单位: pt) +# caption 文本块与 figure/table 内容块的最大垂直距离(单位: pt) _CAPTION_MATCH_DISTANCE = 120 +# 方向不符(figure 标题在上 / table 标题在下)的配对惩罚分(仍允许,兜底异常排版) +_CAPTION_WRONG_SIDE_PENALTY = 300 +# caption 开头标记:Figure 3 / Fig. 
3 / Table C1 / Figure 3.5 等(大小写均可) +# 编号 = 数字开头 或 字母+数字(附录 C1);行首匹配,规避正文 "see Table 3" 引用 +_CAPTION_HEAD_RE = re.compile( + r"^\s*(Figure|Fig\.?|Table)\b\.?\s+([0-9][0-9A-Za-z.]*|[A-Z]\d[0-9A-Za-z.]*)", + re.IGNORECASE, +) # ── Box 聚类 ───────────────────────────────────────────────────────── @@ -51,8 +59,17 @@ class _BoxCluster: self.y0 = min(b.y0 for b in boxes) self.x1 = max(b.x1 for b in boxes) self.y1 = max(b.y1 for b in boxes) - raw = boxes[0].boxclass - self.boxclass = "table" if raw == "table-fallback" else raw + self.boxclass = boxes[0].boxclass + + +@dataclass(frozen=True) +class _CaptionBlock: + """从 PDF 文本流提取的标题块:自带权威 ID、完整多行文本、精确 bbox。""" + + id: str # "Figure 3" / "Table C1" + kind: str # "figure" | "table" + text: str # 完整多行标题文本 + bbox: list[float] # [x0, y0, x1, y1] def _cluster_to_box(cluster: _BoxCluster) -> list[float]: @@ -103,64 +120,88 @@ def _cluster_boxes(boxes: list, gap: float = _CLUSTER_GAP) -> list[_BoxCluster]: return [_BoxCluster(members) for members in groups.values()] -def _caption_class_for_content(boxclass: str) -> str: - return "figure_caption" if boxclass == "picture" else "table_caption" +def _find_caption_blocks(page) -> list[_CaptionBlock]: + """从页面文本流提取以 "Figure N"/"Table N"/"Fig. 
N" 开头的标题块。 - -def _caption_distance(content: _BoxCluster, caption: _BoxCluster) -> float | None: - """Return a spatial score for pairing a caption with a content box.""" - h_overlap = min(content.x1, caption.x1) - max(content.x0, caption.x0) - min_width = min(content.x1 - content.x0, caption.x1 - caption.x0) - if min_width <= 0 or h_overlap < min_width * 0.25: - return None - - if caption.y1 < content.y0: - v_gap = content.y0 - caption.y1 - elif caption.y0 > content.y1: - v_gap = caption.y0 - content.y1 - else: - v_gap = 0.0 - - return v_gap if v_gap <= _CAPTION_MATCH_DISTANCE else None - - -def _extract_caption_text(page, caption: _BoxCluster) -> str: - rect = pymupdf.Rect(caption.x0, caption.y0, caption.x1, caption.y1) + 用 PDF 文本而非 DocLayout caption box:文本块天然含完整多行标题, + 且其 ID 即论文实际编号(如 "Table C1"),权威且不依赖模型检测。 + """ try: - text = page.get_text("text", clip=rect) + d = page.get_text("dict") except Exception: - return "" - return " ".join(text.split()) + return [] + + results: list[_CaptionBlock] = [] + for block in d.get("blocks", []): + if block.get("type") != 0: # 仅文本块 + continue + lines = block.get("lines", []) + if not lines: + continue + line_texts = [ + "".join(span.get("text", "") for span in line.get("spans", [])) + for line in lines + ] + first_line = next((t for t in line_texts if t.strip()), "") + m = _CAPTION_HEAD_RE.match(first_line) + if not m: + continue + kind_word, num = m.group(1), m.group(2) + is_table = kind_word.lower().startswith("table") + bbox = block.get("bbox") + if not bbox or len(bbox) != 4: + continue + full_text = " ".join(t.strip() for t in line_texts if t.strip()) + results.append( + _CaptionBlock( + id=f"{'Table' if is_table else 'Figure'} {num}", + kind="table" if is_table else "figure", + text=full_text, + bbox=[float(v) for v in bbox], + ) + ) + return results -def _match_captions( - page, +def _pair_caption_blocks( content_clusters: list[_BoxCluster], - caption_clusters: list[_BoxCluster], -) -> dict[int, tuple[_BoxCluster, 
str]]: - """Match each content cluster to its nearest same-type DocLayout caption.""" - matches: dict[int, tuple[_BoxCluster, str]] = {} - used_captions: set[int] = set() + caption_blocks: list[_CaptionBlock], +) -> dict[int, _CaptionBlock]: + """每个内容块配方向上最近的同类型标题块。 + + figure 标题惯例在下方、table 标题在上方;方向相符优先,不符加惩罚兜底 + (跨页 / 异常排版)。按 (距离+惩罚) 升序贪心匹配,每个内容块与标题块唯一配对。 + """ candidates: list[tuple[float, int, int]] = [] - - for content_idx, content in enumerate(content_clusters): - wanted_caption_class = _caption_class_for_content(content.boxclass) - for caption_idx, caption in enumerate(caption_clusters): - if caption.boxclass != wanted_caption_class: + for c_idx, content in enumerate(content_clusters): + want_below = content.boxclass == "picture" # figure 标题在下 + want_kind = "figure" if want_below else "table" + for b_idx, cap in enumerate(caption_blocks): + if cap.kind != want_kind: continue - dist = _caption_distance(content, caption) - if dist is not None: - candidates.append((dist, content_idx, caption_idx)) + cx0, cy0, cx1, cy1 = cap.bbox + h_overlap = min(content.x1, cx1) - max(content.x0, cx0) + min_width = min(content.x1 - content.x0, cx1 - cx0) + if min_width <= 0 or h_overlap < min_width * 0.25: + continue + if cy1 <= content.y0: # 标题在内容上方 + side_below, v_gap = False, content.y0 - cy1 + elif cy0 >= content.y1: # 标题在内容下方 + side_below, v_gap = True, cy0 - content.y1 + else: + continue # 重叠,跳过 + if v_gap > _CAPTION_MATCH_DISTANCE: + continue + penalty = 0.0 if side_below == want_below else _CAPTION_WRONG_SIDE_PENALTY + candidates.append((v_gap + penalty, c_idx, b_idx)) - for _dist, content_idx, caption_idx in sorted(candidates): - if content_idx in matches or caption_idx in used_captions: + matches: dict[int, _CaptionBlock] = {} + used: set[int] = set() + for _score, c_idx, b_idx in sorted(candidates): + if c_idx in matches or b_idx in used: continue - text = _extract_caption_text(page, caption_clusters[caption_idx]) - if not text: - continue - matches[content_idx] = 
(caption_clusters[caption_idx], text) - used_captions.add(caption_idx) - + matches[c_idx] = caption_blocks[b_idx] + used.add(b_idx) return matches @@ -174,25 +215,27 @@ def _render_box( filename: str, cap_type: str, page_num: int, - caption: _BoxCluster | None = None, + caption_bbox: list[float] | None = None, ) -> bool: """渲染单个 box 区域并保存 JPEG,成功返回 True。 - 若提供 caption,则将内容与 caption 区域合并后一起截取, - 使同一张截图同时包含图/表及其标题文字。 + 若提供 caption_bbox,则将内容与标题区域合并后一起截取, + 使同一张截图同时包含图/表及其完整标题。 """ page_width = page.rect.width + page_height = page.rect.height x0, y0, x1, y1 = box.x0, box.y0, box.x1, box.y1 - if caption is not None: - x0 = min(x0, caption.x0) - y0 = min(y0, caption.y0) - x1 = max(x1, caption.x1) - y1 = max(y1, caption.y1) + if caption_bbox is not None: + cx0, cy0, cx1, cy1 = caption_bbox + x0 = min(x0, cx0) + y0 = min(y0, cy0) + x1 = max(x1, cx1) + y1 = max(y1, cy1) clip = pymupdf.Rect( max(0, x0 - _REGION_PADDING), max(0, y0 - _REGION_PADDING), min(page_width, x1 + _REGION_PADDING), - y1 + _REGION_PADDING, + min(page_height, y1 + _REGION_PADDING), ) mat = pymupdf.Matrix(_RENDER_ZOOM, _RENDER_ZOOM) try: @@ -200,7 +243,7 @@ def _render_box( except Exception: return False - (images_dest / filename).write_bytes(pix.tobytes("jpeg")) + (images_dest / filename).write_bytes(pix.tobytes("jpeg", jpg_quality=92)) return True @@ -213,77 +256,62 @@ def _process_page( seen_labels: set, arxiv_id: str, ) -> int: - """处理单页:检测 → 聚类 → 渲染,全部用通用标签。""" + """处理单页:检测内容 box → 文本定位 caption → 只渲染配到标题的。 + + 配到 Figure/Table caption 的 box 用 caption 自带 ID 命名(figure_3.jpg); + 没配到标题的(Algorithm 伪代码、无编号附录表、误检碎片)一律过滤,不输出。 + """ page = doc[page_idx] page_num = page_idx + 1 - fig_counter = 0 - tbl_counter = 0 - # 收集本页的 table/picture box 与 caption box(跳过极小区域) + # 收集本页 figure/table 内容 box(跳过极小区域;caption 改由文本定位,不收 box) raw_boxes = [] - raw_caption_boxes = [] for box in page_boxes: - w = box.x1 - box.x0 - h = box.y1 - box.y0 - if box.boxclass in ("table", "table-fallback", "picture"): + if box.boxclass in 
("table", "picture"): + w = box.x1 - box.x0 + h = box.y1 - box.y0 if w < 20 or h < 20 or w * h < _MIN_BOX_AREA: continue raw_boxes.append(box) - elif box.boxclass in ("figure_caption", "table_caption"): - if w < 30 or h < 6: - continue - raw_caption_boxes.append(box) if not raw_boxes: return 0 - # 聚类:将同一 figure/table 的碎片 box 合并 + # 聚类:将同一 figure/table 的碎片 box 合并;用 PDF 文本定位 caption clusters = _cluster_boxes(raw_boxes) - caption_clusters = _cluster_boxes(raw_caption_boxes) - caption_matches = _match_captions(page, clusters, caption_clusters) + caption_blocks = _find_caption_blocks(page) + caption_matches = _pair_caption_blocks(clusters, caption_blocks) extracted = 0 for cluster_idx, cluster in enumerate(clusters): - cap_type = "figure" if cluster.boxclass == "picture" else "table" + cap_match = caption_matches.get(cluster_idx) + if cap_match is None: + continue # 无 Figure/Table 标题 → 过滤(Algorithm、无编号表、误检碎片) + if cap_match.id in seen_labels: + continue # 同一图表被 DocLayout 切成多块重复检测,跳过后续 + seen_labels.add(cap_match.id) - if cap_type == "figure": - fig_counter += 1 - label = f"Figure (p{page_num}-{fig_counter})" - else: - tbl_counter += 1 - label = f"Table (p{page_num}-{tbl_counter})" - - if label in seen_labels: - continue - seen_labels.add(label) - - caption_match = caption_matches.get(cluster_idx) - caption_cluster = caption_match[0] if caption_match else None - - filename = f"{label.replace(' ', '_').lower()}.jpg" + filename = f"{cap_match.id.replace(' ', '_').lower()}.jpg" if not _render_box( page, cluster, images_dest, filename, - cap_type, + cap_match.kind, page_num, - caption=caption_cluster, + caption_bbox=cap_match.bbox, ): continue - info = { + manifest[filename] = { "page": page_num, - "type": cap_type, - "label": label, + "type": cap_match.kind, + "label": cap_match.id, "box": _cluster_to_box(cluster), + "caption_text": cap_match.text[:500], + "caption_box": cap_match.bbox, + "caption_source": "text", } - if caption_match: - info["caption_text"] = 
caption_match[1][:500] - info["caption_box"] = _cluster_to_box(caption_cluster) - info["caption_source"] = "doclayout" - - manifest[filename] = info extracted += 1 return extracted @@ -359,230 +387,6 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: return extracted -# ── Phase 2: 用 summary 的 figures ID 定位并重命名 ───────────────────── - - -def _distance_text_to_box(rect: pymupdf.Rect, box: list[float]) -> float | None: - """计算搜索到的文本 rect 到 box 的距离。超出阈值返回 None。 - - 判断逻辑:rect 中心与 box 的垂直距离 + 水平重叠检查。 - """ - rect_cx = (rect.x0 + rect.x1) / 2 - rect_cy = (rect.y0 + rect.y1) / 2 - bx0, by0, bx1, by1 = box - - # 水平重叠:rect 中心在 box 水平范围内(或接近) - if not (bx0 - 20 <= rect_cx <= bx1 + 20): - return None - - # 垂直距离 - if rect_cy < by0: - dist = by0 - rect_cy - elif rect_cy > by1: - dist = rect_cy - by1 - else: - dist = 0 - - return dist if dist <= _LABEL_MATCH_DISTANCE else None - - -def _search_variants(fig_id: str) -> list[str]: - """为 figure/table ID 生成搜索变体。 - - "Figure 1" → ["Figure 1", "Fig. 1", "Fig 1"] - "Fig. 1" → ["Fig. 1", "Figure 1", "Fig 1"] - "Table A1" → ["Table A1"] - """ - variants = [fig_id] - - m = re.match(r"(Fig\.?|Figure)\s+(\d+.*)", fig_id, re.IGNORECASE) - if m: - num_part = m.group(2) - variants.extend( - [ - f"Figure {num_part}", - f"Fig. {num_part}", - f"Fig {num_part}", - ] - ) - - # 去重保序 - seen = set() - result = [] - for v in variants: - if v not in seen: - seen.add(v) - result.append(v) - return result - - -def label_images_by_summary( - arxiv_id: str, - figures: list[dict], - pdf_path: Path | None = None, -) -> int: - """Phase 2: 用 summary 的 figures ID 在 PDF 中搜索定位,重命名图片。 - - 对 summary 中的每个 figure/table ID: - 1. page.search_for(id) 在所有页面搜索文本位置 - 2. 计算搜索位置与 manifest 中 box 坐标的距离 - 3. 
最近匹配 → 重命名文件、更新 manifest - - Args: - arxiv_id: 论文 ID - figures: summary 的 figures 列表,每项含 id/caption/description 等 - pdf_path: PDF 路径 - - Returns: - 成功重命名的图片数量 - """ - if not figures: - return 0 - - if pdf_path is None: - pdf_path = TMP_DIR / arxiv_id / "paper.pdf" - if not pdf_path.exists(): - return 0 - - images_dest = paper_dir(arxiv_id) / "images" - manifest_path = images_dest / "manifest.json" - if not manifest_path.exists(): - return 0 - - manifest: dict[str, dict] = json.loads(manifest_path.read_text(encoding="utf-8")) - if not manifest: - return 0 - - # 构建候选列表:只对通用标签的条目做匹配 - candidates: dict[str, dict] = {} # filename → {page, box, ...} - for fname, info in manifest.items(): - if "(p" in info.get("label", ""): - candidates[fname] = info - - if not candidates: - return 0 - - with pymupdf.open(str(pdf_path)) as doc: - # 收集所有匹配候选:(fig_id, fig_index, filename, distance) - matches: list[tuple[str, int, str, float]] = [] - - for fig_idx, fig in enumerate(figures): - fig_id = fig.get("id", "") - if not fig_id: - continue - - # 生成搜索变体:Figure 1 / Fig. 
1 / Fig 1 等 - search_terms = _search_variants(fig_id) - - # 在所有页面搜索该文本(含变体) - search_hits: list[tuple[int, pymupdf.Rect]] = [] # (page_num_1based, Rect) - for page_idx in range(doc.page_count): - page = doc[page_idx] - seen_rects: set[tuple[float, float]] = set() - for term in search_terms: - for r in page.search_for(term): - key = (round(r.x0, 1), round(r.y0, 1)) - if key not in seen_rects: - seen_rects.add(key) - search_hits.append((page_idx + 1, r)) - - if not search_hits: - continue - - # 对每个候选 manifest 条目,找最近的搜索命中 - for fname, info in candidates.items(): - box = info.get("box") - if not box: - continue - manifest_page = info.get("page", 0) - - best_dist: float | None = None - for hit_page, rect in search_hits: - # 只匹配同页面 - if hit_page != manifest_page: - continue - dist = _distance_text_to_box(rect, box) - if dist is not None and (best_dist is None or dist < best_dist): - best_dist = dist - - if best_dist is not None: - matches.append((fig_id, fig_idx, fname, best_dist)) - - if not matches: - logger.info("No label matches for %s", arxiv_id) - return 0 - - # 去冲突:按距离排序,每个 fig_id 和每个 filename 只匹配一次 - matches.sort(key=lambda x: x[3]) - used_fig_ids: set[int] = set() - used_filenames: set[str] = set() - renames: list[tuple[str, str, str]] = [] # (old_fname, new_fname, fig_id) - - for fig_id, fig_idx, fname, dist in matches: - if fig_idx in used_fig_ids or fname in used_filenames: - continue - used_fig_ids.add(fig_idx) - used_filenames.add(fname) - new_fname = f"{fig_id.replace(' ', '_').lower()}.jpg" - renames.append((fname, new_fname, fig_id)) - - # 执行重命名 - labeled = 0 - new_manifest: dict[str, dict] = {} - - for fname, info in manifest.items(): - if fname in used_filenames: - continue - # 未匹配的保持原样 - new_manifest[fname] = info - - for old_fname, new_fname, fig_id in renames: - old_path = images_dest / old_fname - new_path = images_dest / new_fname - if not old_path.exists(): - continue - - # 搬运 manifest 信息 - info = manifest[old_fname].copy() - cap_type = 
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
    """Extract figures and tables from the paper PDF; never fail the summary.

    Delegates to ``extract_images_from_pdf`` for ``TMP_DIR/<arxiv_id>/paper.pdf``.
    Any exception is logged as a warning (with traceback) and swallowed, so a
    broken extraction does not affect summary persistence.

    ``schema`` is not read here: it is kept in the signature so existing call
    sites stay valid.
    """
    try:
        # Imported lazily so an import failure is handled like any other
        # extraction failure.
        from app.services.pdf_image_extractor import extract_images_from_pdf

        extract_images_from_pdf(arxiv_id, TMP_DIR / arxiv_id / "paper.pdf")
    except Exception:
        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
LayoutBox(95, 310, 320, 325, "figure_caption"), - ] + boxes = [LayoutBox(100, 100, 300, 300, "picture")] extracted = mod._process_page( doc, @@ -41,14 +52,15 @@ def test_process_page_extracts_doclayout_caption(tmp_path): ) assert extracted == 1 - info = manifest["figure_(p1-1).jpg"] + # caption 自带 ID → 直接命名 figure_1.jpg + info = manifest["figure_1.jpg"] + assert info["label"] == "Figure 1" assert info["caption_text"] == "Figure 1: Overall architecture." - assert info["caption_source"] == "doclayout" - assert info["caption_box"] == [95.0, 310.0, 320.0, 325.0] + assert info["caption_source"] == "text" def test_process_page_includes_caption_in_render(tmp_path): - """渲染时把 caption 区域合并进同一张截图。""" + """渲染时把 caption 文本块区域合并进同一张截图。""" images_dest = tmp_path / "images" images_dest.mkdir() manifest: dict[str, dict] = {} @@ -58,16 +70,16 @@ def test_process_page_includes_caption_in_render(tmp_path): page = MagicMock() page.rect.width = 600 + page.rect.height = 800 page.get_pixmap.return_value = pix - page.get_text.return_value = "Figure 1: Caption text.\n" + page.get_text.return_value = { + "blocks": [_caption_block((95, 310, 320, 325), "Figure 1: Caption text.")] + } doc = MagicMock() doc.__getitem__.return_value = page - boxes = [ - LayoutBox(100, 100, 300, 300, "picture"), - LayoutBox(95, 310, 320, 325, "figure_caption"), - ] + boxes = [LayoutBox(100, 100, 300, 300, "picture")] mod._process_page( doc, @@ -85,50 +97,74 @@ def test_process_page_includes_caption_in_render(tmp_path): assert clip == pymupdf.Rect(90, 95, 325, 330) -def test_label_images_preserves_doclayout_caption(tmp_path, monkeypatch): - arxiv_id = "2401.00001" - paper_root = tmp_path / arxiv_id - images_dest = paper_root / "images" - images_dest.mkdir(parents=True) - (images_dest / "figure_(p1-1).jpg").write_bytes(b"jpeg") - (images_dest / "manifest.json").write_text( - json.dumps( - { - "figure_(p1-1).jpg": { - "page": 1, - "type": "figure", - "label": "Figure (p1-1)", - "box": [100, 100, 300, 300], - 
def test_process_page_table_caption_above(tmp_path):
    """A table caption above the table is paired and names the output table_N.jpg.

    Besides the label, this pins the manifest type, caption text and box, and
    checks that the rendered bytes were actually written under the new name.
    """
    images_dest = tmp_path / "images"
    images_dest.mkdir()
    manifest: dict[str, dict] = {}

    pix = MagicMock()
    pix.tobytes.return_value = b"jpeg"

    page = MagicMock()
    page.rect.width = 600
    page.rect.height = 800
    page.get_pixmap.return_value = pix
    # Caption [80, 90, 320, 105] sits above the table content [80, 120, 320, 280].
    page.get_text.return_value = {
        "blocks": [_caption_block((80, 90, 320, 105), "Table 2 | Results summary.")]
    }

    doc = MagicMock()
    doc.__getitem__.return_value = page

    boxes = [LayoutBox(80, 120, 320, 280, "table")]

    extracted = mod._process_page(
        doc,
        0,
        boxes,
        images_dest=images_dest,
        manifest=manifest,
        seen_labels=set(),
        arxiv_id="2401.00001",
    )

    assert extracted == 1
    info = manifest["table_2.jpg"]
    assert info["label"] == "Table 2"
    assert info["type"] == "table"
    assert info["page"] == 1
    assert info["caption_text"] == "Table 2 | Results summary."
    assert info["caption_box"] == [80.0, 90.0, 320.0, 105.0]
    assert info["caption_source"] == "text"
    assert (images_dest / "table_2.jpg").read_bytes() == b"jpeg"
def test_process_page_filters_uncaptioned(tmp_path):
    """Content boxes with no Figure/Table caption are dropped without any output.

    Covers algorithm listings, unnumbered tables and detector fragments: nothing
    may be rendered, written to disk, recorded in the manifest or marked as seen.
    """
    images_dest = tmp_path / "images"
    images_dest.mkdir()
    manifest: dict[str, dict] = {}
    seen_labels: set = set()

    pix = MagicMock()
    pix.tobytes.return_value = b"jpeg"

    page = MagicMock()
    page.rect.width = 600
    page.rect.height = 800
    page.get_pixmap.return_value = pix
    page.get_text.return_value = {"blocks": []}  # no caption text block at all

    doc = MagicMock()
    doc.__getitem__.return_value = page

    boxes = [LayoutBox(100, 100, 300, 300, "picture")]

    extracted = mod._process_page(
        doc,
        0,
        boxes,
        images_dest=images_dest,
        manifest=manifest,
        seen_labels=seen_labels,
        arxiv_id="2401.00001",
    )

    assert extracted == 0
    assert manifest == {}
    assert seen_labels == set()
    # Filtering must happen before rendering, not merely before bookkeeping.
    page.get_pixmap.assert_not_called()
    assert list(images_dest.iterdir()) == []