feat: enhance PDF extraction with section-based figure routing and improved caption detection

This commit is contained in:
2026-06-10 02:05:30 +08:00
parent c94ff48254
commit a1e0962820
7 changed files with 253 additions and 116 deletions
+1 -1
View File
@@ -22,7 +22,7 @@ HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
PI_BIN=
SUMMARY_SKILL=daily-paper-summary
SUMMARY_CONCURRENCY=3
SUMMARY_TIMEOUT_SECONDS=900
SUMMARY_TIMEOUT_SECONDS=1200
SUMMARY_MAX_RETRIES=2
SUMMARY_PDF_MODE=auto
+1 -1
View File
@@ -32,7 +32,7 @@ class Settings(BaseSettings):
PI_BIN: str = ""
SUMMARY_SKILL: str = "daily-paper-summary"
SUMMARY_CONCURRENCY: int = 3
SUMMARY_TIMEOUT_SECONDS: int = 900
SUMMARY_TIMEOUT_SECONDS: int = 1200
SUMMARY_MAX_RETRIES: int = 2
SUMMARY_PDF_MODE: str = "auto" # "auto" = ≤80k 用 inject>80k 用 search;也可强制 "inject" / "search"
+23 -6
View File
@@ -122,17 +122,32 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
# 拆分table_figures(有截图的 Table 类型)→ 实验结果区域展示截图
# figures(其余)→ 论文图表画廊
table_figures = []
figures = []
# 拆分图片到对应展示区域:
# table_figures → 实验结果区域(Table 截图,不变)
# method_figures → 核心方法区域(section=="method")
# results_figures → 实验结果区域(section=="results" 的 Figure)
# gallery_figures → 底部画廊(其余:motivation/limitations/无 section/无图)
table_figures: list[dict] = []
method_figures: list[dict] = []
results_figures: list[dict] = []
gallery_figures: list[dict] = []
for fig in linked_figures:
fig_id = fig.get("id", "")
section = fig.get("section", "")
is_table = fig_id.lower().startswith("table")
if is_table and fig.get("image_url"):
table_figures.append(fig)
elif not is_table and section == "method" and fig.get("image_url"):
method_figures.append(fig)
elif (
not is_table
and section == "results"
and fig.get("image_url")
):
results_figures.append(fig)
else:
figures.append(fig)
gallery_figures.append(fig)
return templates.TemplateResponse(
request,
@@ -144,8 +159,10 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
"paper_images": images,
"prereqs": prereqs,
"benchmarks": benchmarks,
"figures": figures,
"figures": gallery_figures,
"table_figures": table_figures,
"method_figures": method_figures,
"results_figures": results_figures,
"chroma_enabled": settings.CHROMA_ENABLED,
"page_title": paper.title_zh or paper.title_en,
},
+159 -75
View File
@@ -37,22 +37,66 @@ _REGION_SIDE_PADDING = 10
# 表格通常比 caption 文字宽,使用更大的水平扩展
_TABLE_SIDE_PADDING = 60
# 正文行距的 2 倍 ≈ 空白间隙阈值
_CONTENT_GAP_THRESHOLD = 30
# 正文行距的 ~1.5 倍 ≈ 空白间隙阈值(学术论文紧密排版,30pt 太宽松)
_CONTENT_GAP_THRESHOLD = 20
# ── Caption 正则 ───────────────────────────────────────────────────────
# 要求以 Figure/Table 开头(避免匹配正文中的 "see Figure 3" 等)
# 支持三种 caption 格式:
# "Figure 1: Title" / "Figure 1. Title" / "Figure 1 Title"(无标点,空格分隔)
# 第三种需要后续紧跟大写字母(排除 "Figure 1 shows..." 等正文引用)
_CAPTION_RE = re.compile(
r'^(?:Fig\.?|Figure)\s+(\d+)\s*[:\.]',
r"^(?:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=[A-Z]))",
re.IGNORECASE,
)
_TABLE_CAPTION_RE = re.compile(
r'^Table\s+(\d+)\s*[:\.]',
r"^Table\s+(\d+)\s*(?:[:\.]\s*|\s+(?=[A-Z]))",
re.IGNORECASE,
)
# ── 停止信号:表格边界检测遇到以下内容时立即停止 ──
# 下一个 Figure/Table caption(如 "Table 2:" "Figure 3:" "Figure 4 Title"
_CAPTION_STOP_RE = re.compile(
r"^(?:Table|Fig\.?|Figure)\s+\d+\s*(?:[:\.]\s*|\s+[A-Z])",
re.IGNORECASE,
)
# Section header(如 "6.2 Evolution" "D.1 Dependency" "7 Conclusion"
_SECTION_STOP_RE = re.compile(
r"^(\d{1,2}(?:\.\d+)?\s+[A-Z][a-z]|[A-Z]\.\d+\s+[A-Z][a-z])"
)
def _estimate_column_x(caption: dict) -> tuple[float, float]:
"""估计 caption 所在列的水平边界(col_x0, col_x1)。
双栏论文中 caption 宽度远小于页面宽度,据此判断左右列。
单栏或跨栏 caption(宽度 >65% 页宽)返回整页宽度。
caption 居中对齐(中心接近页面中线)时按跨栏处理,使用宽范围。
"""
pw = caption["page_width"]
caption_w = caption["caption_x1"] - caption["caption_x0"]
# caption 宽度 >65% 页宽 → 单栏或跨栏
if caption_w > pw * 0.65:
return 0, pw
cx = (caption["caption_x0"] + caption["caption_x1"]) / 2
# caption 居中(中心距页面中线 <8%)→ 可能是跨栏表格,使用宽范围
if abs(cx - pw / 2) / pw < 0.08:
return (
max(0, caption["caption_x0"] - _TABLE_SIDE_PADDING * 2),
min(pw, caption["caption_x1"] + _TABLE_SIDE_PADDING * 2),
)
if cx < pw / 2:
return 0, pw / 2
else:
return pw / 2, pw
def _find_captions(doc) -> list[dict]:
"""扫描整个文档,找到所有 Figure/Table caption 的位置和信息。"""
@@ -77,7 +121,8 @@ def _find_captions(doc) -> list[dict]:
m = _CAPTION_RE.match(first_line)
if m:
captions.append({
captions.append(
{
"type": "figure",
"num": int(m.group(1)),
"label": f"Figure {m.group(1)}",
@@ -89,12 +134,14 @@ def _find_captions(doc) -> list[dict]:
"caption_text": text,
"page_width": page_width,
"page_height": page_height,
})
}
)
continue
m = _TABLE_CAPTION_RE.match(first_line)
if m:
captions.append({
captions.append(
{
"type": "table",
"num": int(m.group(1)),
"label": f"Table {m.group(1)}",
@@ -106,7 +153,8 @@ def _find_captions(doc) -> list[dict]:
"caption_text": text,
"page_width": page_width,
"page_height": page_height,
})
}
)
return captions
@@ -115,80 +163,81 @@ def _find_figure_top(page, caption: dict) -> float:
"""向上扫描页面,找到 Figure 的上边界。
策略:
1. 收集 caption 上方的所有内容块(文本 + 嵌入图片
2. 找到最顶部的内容块作为图的上界
3. 检查内容块之间的大间隙(表示图从间隙下方开始)
4. 如果没找到任何内容块,使用默认图高度
注意:只扫描 text blocks 是不够的,因为 figure 本身是图片/矢量图,
不会被 get_text("blocks") 返回。必须同时用 get_image_info() 检测嵌入图片。
1. 优先用嵌入图片定位(绝大多数 figure 包含嵌入图片,图片边界即 figure 边界
2. 无图片时回退到文本块间隙检测(处理纯矢量图如 TikZ/matplotlib PDF
"""
caption_y = caption["caption_y0"]
cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING
cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING
col_x0, col_x1 = _estimate_column_x(caption)
cx0 = max(col_x0, caption["caption_x0"] - _REGION_SIDE_PADDING)
cx1 = min(col_x1, caption["caption_x1"] + _REGION_SIDE_PADDING)
# 收集 caption 上方、同列范围内的所有内容块
# 每个元素: (x0, y0, x1, y1)
# 同页上方最近的 Figure/Table caption(多 figure 同页时截断)
_caption_cutoff: float | None = None
for b in page.get_text("blocks"):
if len(b) < 5:
continue
by0, by1 = b[1], b[3]
if by1 >= caption_y or by1 <= caption_y - _FIGURE_MAX_HEIGHT:
continue
first_line = str(b[4]).strip().split("\n")[0].strip()
if _CAPTION_STOP_RE.match(first_line):
_caption_cutoff = by0
break
# ── 策略 1:嵌入图片定位(覆盖绝大多数 figure) ──
topmost_image_y: float | None = None
for img_info in page.get_image_info():
bbox = img_info.get("bbox")
if bbox is None:
continue
if hasattr(bbox, "x0"):
ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
else:
ix0, iy0, ix1, iy1 = bbox[0], bbox[1], bbox[2], bbox[3]
if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT:
if ix1 > cx0 and ix0 < cx1:
if _caption_cutoff is not None and iy0 < _caption_cutoff:
continue # 属于上方另一个 figure
if topmost_image_y is None or iy0 < topmost_image_y:
topmost_image_y = iy0
if topmost_image_y is not None:
figure_top = topmost_image_y
else:
# ── 策略 2:文本块间隙检测(纯矢量图) ──
above_blocks: list[tuple[float, float, float, float]] = []
# ── 1. 文本块 ──
for b in page.get_text("blocks"):
if len(b) < 5:
continue
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT:
if bx1 > cx0 and bx0 < cx1:
if col_x0 > 0 and bx0 < col_x0 - _REGION_SIDE_PADDING * 2:
continue
above_blocks.append((bx0, by0, bx1, by1))
# ── 2. 嵌入图片块 — 关键!figure 本身是图片,不是文本 ──
for img_info in page.get_image_info():
bbox = img_info.get("bbox")
if bbox is None:
continue
# bbox 可能是 Rect 对象或 tuple,兼容两种格式
if hasattr(bbox, 'x0'):
ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
else:
ix0, iy0, ix1, iy1 = bbox[0], bbox[1], bbox[2], bbox[3]
if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT:
if ix1 > cx0 and ix0 < cx1:
above_blocks.append((ix0, iy0, ix1, iy1))
# ── 没有内容块 → 用默认高度(可能是纯矢量图,如 TikZ/matplotlib PDF ──
if not above_blocks:
return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)
# ── 找到内容区域的上边界 ──
# 按 y 从下到上排序(离 caption 最近的在前)
above_blocks.sort(key=lambda b: b[1], reverse=True)
# 从 caption 向上扫描,找到第一个大间隙以上作为图的上界
# 典型结构: [正文段落] ...空白... [图内容(图片/矢量)] [caption]
# 空白间隙 ≈ 图的上边界
figure_top = above_blocks[-1][1] # 最上面的块顶部(默认兜底)
prev_bottom = caption_y # 从 caption 顶部开始向上
prev_bottom = caption_y
for b in above_blocks:
# b = (x0, y0, x1, y1), 我们关心 y 范围
gap = prev_bottom - b[3] # b[3] = by1 = 当前块底部
if gap > _CONTENT_GAP_THRESHOLD:
# 大间隙 → 图上边界在间隙下方
if prev_bottom - b[3] > _CONTENT_GAP_THRESHOLD:
figure_top = prev_bottom - 5
break
# 小间隙 → 当前块属于图的一部分(或紧挨着图),继续向上
prev_bottom = b[1] # b[1] = by0 = 当前块顶部
prev_bottom = b[1]
else:
# 所有块都紧挨着 → 图从最上面块的顶部开始
figure_top = above_blocks[-1][1]
# 同页 caption 截断
if _caption_cutoff is not None:
figure_top = max(figure_top, _caption_cutoff + 5)
# 限制最大高度
if caption_y - figure_top > _FIGURE_MAX_HEIGHT:
figure_top = caption_y - _FIGURE_MAX_HEIGHT
# 不低于页面顶部
figure_top = max(0, figure_top)
return figure_top
return max(0, figure_top)
def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]:
@@ -209,9 +258,10 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]
page_height = caption["page_height"]
page_width = caption["page_width"]
# 先用较宽的范围收集可能的表格内容块
search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING)
search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING)
# 估计 caption 所在列的水平边界,避免双栏论文跨列抓取
col_x0, col_x1 = _estimate_column_x(caption)
search_x0 = max(col_x0, caption_x0 - _TABLE_SIDE_PADDING)
search_x1 = min(col_x1, caption_x1 + _TABLE_SIDE_PADDING)
below_blocks: list[tuple[float, float, float, float]] = []
for b in blocks:
@@ -220,6 +270,17 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
if by0 > caption_y and by0 < caption_y + _TABLE_MAX_HEIGHT:
if bx1 > search_x0 and bx0 < search_x1:
# 双栏论文:排除跨列正文段落(宽度 >> 列宽,起点在另一列)
# 表格行起点在列内或列边界附近;正文段落起点在另一列(bx0 远小于 col_x0)
if col_x0 > 0 and bx0 < col_x0 - _TABLE_SIDE_PADDING:
continue
# 停止信号:遇到下一个 caption 或 section header 立即停止
text = str(b[4]).strip()
first_line = text.split("\n")[0].strip()
if _CAPTION_STOP_RE.match(first_line) or _SECTION_STOP_RE.match(
first_line
):
break
below_blocks.append((bx0, by0, bx1, by1))
if not below_blocks:
@@ -248,11 +309,16 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]
bottom = caption_y + _TABLE_MAX_HEIGHT
# ── 检测表格内容的水平范围 ──
# 表格通常比 caption 宽,用内容块的实际宽度
content_x0 = min(caption_x0, min(b[0] for b in below_blocks))
content_x1 = max(caption_x1, max(b[2] for b in below_blocks))
# 只用 gap 之前的 block 计算水平范围(gap 之后的 block 属于正文,可能更宽)
table_blocks = [b for b in below_blocks if b[1] < bottom]
if not table_blocks:
table_blocks = below_blocks[:1] # 至少用第一个 block
content_x0 = min(caption_x0, min(b[0] for b in table_blocks))
content_x1 = max(caption_x1, max(b[2] for b in table_blocks))
# 添加边距,不超出页面
# 添加边距,不超出页面
# 使用较小 padding,避免将相邻列内容(如同页另一列的 Figure)带入截图;
# 同时不限制列边界 — 双栏论文中 caption 可能跨列起始
x0 = max(0, content_x0 - _REGION_SIDE_PADDING)
x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING)
@@ -283,6 +349,12 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
images_dest = paper_dir(arxiv_id) / "images"
images_dest.mkdir(parents=True, exist_ok=True)
# 清理上次提取的旧图片,避免残留
for old_file in images_dest.glob("*.png"):
old_file.unlink()
if (images_dest / "manifest.json").exists():
(images_dest / "manifest.json").unlink()
doc = pymupdf.open(str(pdf_path))
captions = _find_captions(doc)
@@ -303,16 +375,17 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
extracted = 0
manifest: dict[str, dict] = {}
zoom = 2 # 2x 渲染,保证清晰度
zoom = 3 # 3x 渲染,保证清晰度
for cap in unique_captions:
page = doc[cap["page_num"]]
pw = cap["page_width"]
ph = cap["page_height"]
if cap["type"] == "figure":
# Figure: caption 上方是图 → 向上找图的上边界
top = _find_figure_top(page, cap)
# 上方多留 5pt 边距,确保图框边框、装饰线等不被截断
top = max(0, top - 5)
bottom = cap["caption_y1"] + 5 # 包含 caption
# 水平范围:caption 宽度 + 边距(图和 caption 通常等宽)
# 但也要考虑图内容的实际宽度
@@ -361,23 +434,30 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
}
logger.debug(
"Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s",
cap["label"], cap["page_num"] + 1,
x0, top, x1, bottom, height, filename,
cap["label"],
cap["page_num"] + 1,
x0,
top,
x1,
bottom,
height,
filename,
)
doc.close()
# 保存 manifest
manifest_path = images_dest / "manifest.json"
manifest_path.write_text(
json.dumps(manifest, ensure_ascii=False, indent=2)
)
manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2))
if extracted > 0:
logger.info(
"Extracted %d figure/table screenshots from PDF for %s "
"(from %d captions found, %d unique)",
extracted, arxiv_id, len(captions), len(unique_captions),
extracted,
arxiv_id,
len(captions),
len(unique_captions),
)
return extracted
@@ -407,10 +487,10 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
referenced_ids: set[str] = set()
for fig in figures:
fig_id = fig.get("id", "")
m = re.match(r'(?:Fig\.?|Figure)\s*(\d+)', fig_id, re.IGNORECASE)
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", fig_id, re.IGNORECASE)
if m:
referenced_ids.add(f"Figure {m.group(1)}")
m2 = re.match(r'Table\s*(\d+)', fig_id, re.IGNORECASE)
m2 = re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
if m2:
referenced_ids.add(f"Table {m2.group(1)}")
@@ -433,7 +513,8 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
if not keep_filenames:
logger.warning(
"No manifest matches for %s (refs=%s), keeping all",
arxiv_id, referenced_ids,
arxiv_id,
referenced_ids,
)
return len(all_files)
@@ -446,6 +527,9 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
kept = len(all_files) - removed
logger.info(
"Filtered images for %s: kept %d, removed %d (refs=%s)",
arxiv_id, kept, removed, referenced_ids,
arxiv_id,
kept,
removed,
referenced_ids,
)
return kept
+3 -2
View File
@@ -172,9 +172,10 @@ def _build_prompt(
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Tableid 严格使用 \"Figure N\"\"Table N\" 格式。"
"section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
"}"
)
+17
View File
@@ -391,6 +391,20 @@ def _handle_summary_failure(
}
def _cleanup_old_images(db: Session, paper: Paper) -> None:
    """Remove stale image files and reset ``figures_json`` for *paper*.

    Deletes every ``.png``/``.jpg``/``.jpeg``/``.gif``/``.svg`` file and
    ``manifest.json`` directly inside the paper's ``images`` directory (if it
    exists), then clears ``paper.summary.figures_json`` and commits when a
    non-empty value was stored.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".svg")
    target_dir = paper_dir(paper.arxiv_id) / "images"

    if target_dir.exists():
        for entry in target_dir.iterdir():
            is_image = entry.suffix.lower() in image_suffixes
            if is_image or entry.name == "manifest.json":
                # missing_ok: tolerate the file having disappeared already.
                entry.unlink(missing_ok=True)

    # Drop the figure metadata stored in the database, if any.
    if paper.summary and paper.summary.figures_json:
        paper.summary.figures_json = None
        db.commit()
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
"""从 PDF 提取图片和表格(失败不影响总结)。"""
try:
@@ -437,6 +451,9 @@ async def _do_summarize_one(
paper.summary_status.started_at = utc_now()
db.commit()
# 清理旧的图片文件和 figures_json,避免重新总结时残留
_cleanup_old_images(db, paper)
raw_output = ""
try:
meta_path = write_meta_json(paper)
+20 -2
View File
@@ -122,6 +122,16 @@ endblock %} {% block content %}
<p>{{ paper.summary.method_novelty | safe }}</p>
</details>
{% endif %}
{% if method_figures and method_figures|length > 0 %}
{% for fig in method_figures %}
<figure class="inline-figure">
<img src="{{ fig.image_url }}" alt="{{ fig.caption or fig.id }}" loading="lazy" />
<figcaption>
<strong>{{ fig.id }}</strong>{% if fig.caption %}: {{ fig.caption }}{% endif %}
</figcaption>
</figure>
{% endfor %}
{% endif %}
</section>
{% endif %}
@@ -130,8 +140,8 @@ endblock %} {% block content %}
<section class="summary-section">
<h2>实验结果</h2>
<p>{{ paper.summary.results_main_json | safe }}</p>
{% if table_figures and table_figures|length > 0 %}
{# 优先展示原文表格截图 #}
{% if (table_figures and table_figures|length > 0) or (results_figures and results_figures|length > 0) %}
{# 展示表格截图 + 实验结果图 #}
{% for tf in table_figures %}
<figure class="inline-figure table-screenshot">
<img src="{{ tf.image_url }}" alt="{{ tf.caption or tf.id }}" loading="lazy" />
@@ -140,6 +150,14 @@ endblock %} {% block content %}
</figcaption>
</figure>
{% endfor %}
{% for fig in results_figures %}
<figure class="inline-figure">
<img src="{{ fig.image_url }}" alt="{{ fig.caption or fig.id }}" loading="lazy" />
<figcaption>
<strong>{{ fig.id }}</strong>{% if fig.caption %}: {{ fig.caption }}{% endif %}
</figcaption>
</figure>
{% endfor %}
{% if benchmarks and benchmarks|length > 0 %}
<details>
<summary>查看结构化数据</summary>