feat: enhance PDF extraction with section-based figure routing and improved caption detection
This commit is contained in:
+1
-1
@@ -22,7 +22,7 @@ HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
|
||||
PI_BIN=
|
||||
SUMMARY_SKILL=daily-paper-summary
|
||||
SUMMARY_CONCURRENCY=3
|
||||
SUMMARY_TIMEOUT_SECONDS=900
|
||||
SUMMARY_TIMEOUT_SECONDS=1200
|
||||
SUMMARY_MAX_RETRIES=2
|
||||
SUMMARY_PDF_MODE=auto
|
||||
|
||||
|
||||
+1
-1
@@ -32,7 +32,7 @@ class Settings(BaseSettings):
|
||||
PI_BIN: str = ""
|
||||
SUMMARY_SKILL: str = "daily-paper-summary"
|
||||
SUMMARY_CONCURRENCY: int = 3
|
||||
SUMMARY_TIMEOUT_SECONDS: int = 900
|
||||
SUMMARY_TIMEOUT_SECONDS: int = 1200
|
||||
SUMMARY_MAX_RETRIES: int = 2
|
||||
SUMMARY_PDF_MODE: str = "auto" # "auto" = ≤80k 用 inject,>80k 用 search;也可强制 "inject" / "search"
|
||||
|
||||
|
||||
+23
-6
@@ -122,17 +122,32 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
|
||||
|
||||
linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
|
||||
|
||||
# 拆分:table_figures(有截图的 Table 类型)→ 实验结果区域展示截图
|
||||
# figures(其余)→ 论文图表画廊
|
||||
table_figures = []
|
||||
figures = []
|
||||
# 拆分图片到对应展示区域:
|
||||
# table_figures → 实验结果区域(Table 截图,不变)
|
||||
# method_figures → 核心方法区域(section=="method")
|
||||
# results_figures → 实验结果区域(section=="results" 的 Figure)
|
||||
# gallery_figures → 底部画廊(其余:motivation/limitations/无 section/无图)
|
||||
table_figures: list[dict] = []
|
||||
method_figures: list[dict] = []
|
||||
results_figures: list[dict] = []
|
||||
gallery_figures: list[dict] = []
|
||||
for fig in linked_figures:
|
||||
fig_id = fig.get("id", "")
|
||||
section = fig.get("section", "")
|
||||
is_table = fig_id.lower().startswith("table")
|
||||
|
||||
if is_table and fig.get("image_url"):
|
||||
table_figures.append(fig)
|
||||
elif not is_table and section == "method" and fig.get("image_url"):
|
||||
method_figures.append(fig)
|
||||
elif (
|
||||
not is_table
|
||||
and section == "results"
|
||||
and fig.get("image_url")
|
||||
):
|
||||
results_figures.append(fig)
|
||||
else:
|
||||
figures.append(fig)
|
||||
gallery_figures.append(fig)
|
||||
|
||||
return templates.TemplateResponse(
|
||||
request,
|
||||
@@ -144,8 +159,10 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
|
||||
"paper_images": images,
|
||||
"prereqs": prereqs,
|
||||
"benchmarks": benchmarks,
|
||||
"figures": figures,
|
||||
"figures": gallery_figures,
|
||||
"table_figures": table_figures,
|
||||
"method_figures": method_figures,
|
||||
"results_figures": results_figures,
|
||||
"chroma_enabled": settings.CHROMA_ENABLED,
|
||||
"page_title": paper.title_zh or paper.title_en,
|
||||
},
|
||||
|
||||
+188
-104
@@ -24,12 +24,12 @@ logger = logging.getLogger(__name__)
|
||||
# ── 截取区域参数 ───────────────────────────────────────────────────────
|
||||
|
||||
# Figure: caption 上方搜索图的范围(点)
|
||||
_FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围
|
||||
_FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度
|
||||
_FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度
|
||||
_FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围
|
||||
_FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度
|
||||
_FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度
|
||||
|
||||
# Table: caption 下方搜索表格的范围
|
||||
_TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围
|
||||
_TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围
|
||||
_TABLE_MIN_HEIGHT = 30
|
||||
|
||||
# caption 左右扩展(双栏论文中 caption 可能比表格窄)
|
||||
@@ -37,22 +37,66 @@ _REGION_SIDE_PADDING = 10
|
||||
# 表格通常比 caption 文字宽,使用更大的水平扩展
|
||||
_TABLE_SIDE_PADDING = 60
|
||||
|
||||
# 正文行距的 2 倍 ≈ 空白间隙阈值
|
||||
_CONTENT_GAP_THRESHOLD = 30
|
||||
# 正文行距的 ~1.5 倍 ≈ 空白间隙阈值(学术论文紧密排版,30pt 太宽松)
|
||||
_CONTENT_GAP_THRESHOLD = 20
|
||||
|
||||
|
||||
# ── Caption 正则 ───────────────────────────────────────────────────────
|
||||
|
||||
# ── Caption patterns ──
# A caption must START with Figure/Table so that in-text references such as
# "see Figure 3" are not mistaken for captions.
# Three caption formats are supported:
#   "Figure 1: Title" / "Figure 1. Title" / "Figure 1 Title" (space only)
# The third form requires an uppercase letter right after the number, which
# excludes body sentences such as "Figure 1 shows ...".
# Case-insensitivity is scoped to the keyword with (?i:...): a pattern-wide
# re.IGNORECASE would also make [A-Z] match lowercase letters and silently
# disable the uppercase check.
_CAPTION_RE = re.compile(
    r"^(?i:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=[A-Z]))",
)
_TABLE_CAPTION_RE = re.compile(
    r"^(?i:Table)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=[A-Z]))",
)
|
||||
|
||||
# ── 停止信号:表格边界检测遇到以下内容时立即停止 ──
|
||||
|
||||
# Stop signal: the next Figure/Table caption
# (e.g. "Table 2:", "Figure 3:", "Figure 4 Title").
# The keyword is matched case-insensitively via the scoped (?i:...) group
# only; the trailing [A-Z] stays case-sensitive so that body text such as
# "Figure 4 shows ..." is not treated as a caption boundary.
_CAPTION_STOP_RE = re.compile(
    r"^(?i:Table|Fig\.?|Figure)\s+\d+\s*(?:[:\.]\s*|\s+[A-Z])",
)
|
||||
# Stop signal: a section header at the start of a line,
# e.g. "6.2 Evolution", "D.1 Dependency", "7 Conclusion".
_SECTION_STOP_RE = re.compile(
    r"^("
    r"\d{1,2}(?:\.\d+)?\s+[A-Z][a-z]"  # numbered section: "7 Conclusion", "6.2 Evolution"
    r"|"
    r"[A-Z]\.\d+\s+[A-Z][a-z]"  # lettered (appendix-style) section: "D.1 Dependency"
    r")"
)
|
||||
|
||||
|
||||
def _estimate_column_x(caption: dict) -> tuple[float, float]:
    """Estimate the horizontal bounds (col_x0, col_x1) of the caption's column.

    Reads ``page_width``, ``caption_x0`` and ``caption_x1`` from *caption*
    and classifies the caption as follows:

    * wider than 65% of the page  -> single-column or spanning: ``(0, pw)``
    * centre within 8% of the page midline -> treated as possibly spanning:
      the caption extent widened by ``2 * _TABLE_SIDE_PADDING`` on each
      side, clamped to ``[0, pw]``
    * centre left of the midline  -> left column: ``(0, pw / 2)``
    * otherwise                   -> right column: ``(pw / 2, pw)``

    A degenerate page width of 0 skips the centred-caption ratio test
    instead of raising ``ZeroDivisionError``.
    """
    pw = caption["page_width"]
    x0 = caption["caption_x0"]
    x1 = caption["caption_x1"]

    # Caption wider than 65% of the page: single-column layout or a
    # caption that spans both columns.
    if x1 - x0 > pw * 0.65:
        return 0, pw

    cx = (x0 + x1) / 2

    # Caption centred on the page (centre within 8% of the midline): may
    # belong to a spanning table/figure, so use a wide range around it.
    # The pw > 0 guard avoids dividing by a zero page width.
    if pw > 0 and abs(cx - pw / 2) / pw < 0.08:
        return (
            max(0, x0 - _TABLE_SIDE_PADDING * 2),
            min(pw, x1 + _TABLE_SIDE_PADDING * 2),
        )

    # Otherwise pick the half of the page that contains the caption centre.
    if cx < pw / 2:
        return 0, pw / 2
    return pw / 2, pw
|
||||
|
||||
|
||||
def _find_captions(doc) -> list[dict]:
|
||||
"""扫描整个文档,找到所有 Figure/Table caption 的位置和信息。"""
|
||||
@@ -77,36 +121,40 @@ def _find_captions(doc) -> list[dict]:
|
||||
|
||||
m = _CAPTION_RE.match(first_line)
|
||||
if m:
|
||||
captions.append({
|
||||
"type": "figure",
|
||||
"num": int(m.group(1)),
|
||||
"label": f"Figure {m.group(1)}",
|
||||
"page_num": page_num,
|
||||
"caption_y0": by0,
|
||||
"caption_y1": by1,
|
||||
"caption_x0": bx0,
|
||||
"caption_x1": bx1,
|
||||
"caption_text": text,
|
||||
"page_width": page_width,
|
||||
"page_height": page_height,
|
||||
})
|
||||
captions.append(
|
||||
{
|
||||
"type": "figure",
|
||||
"num": int(m.group(1)),
|
||||
"label": f"Figure {m.group(1)}",
|
||||
"page_num": page_num,
|
||||
"caption_y0": by0,
|
||||
"caption_y1": by1,
|
||||
"caption_x0": bx0,
|
||||
"caption_x1": bx1,
|
||||
"caption_text": text,
|
||||
"page_width": page_width,
|
||||
"page_height": page_height,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
m = _TABLE_CAPTION_RE.match(first_line)
|
||||
if m:
|
||||
captions.append({
|
||||
"type": "table",
|
||||
"num": int(m.group(1)),
|
||||
"label": f"Table {m.group(1)}",
|
||||
"page_num": page_num,
|
||||
"caption_y0": by0,
|
||||
"caption_y1": by1,
|
||||
"caption_x0": bx0,
|
||||
"caption_x1": bx1,
|
||||
"caption_text": text,
|
||||
"page_width": page_width,
|
||||
"page_height": page_height,
|
||||
})
|
||||
captions.append(
|
||||
{
|
||||
"type": "table",
|
||||
"num": int(m.group(1)),
|
||||
"label": f"Table {m.group(1)}",
|
||||
"page_num": page_num,
|
||||
"caption_y0": by0,
|
||||
"caption_y1": by1,
|
||||
"caption_x0": bx0,
|
||||
"caption_x1": bx1,
|
||||
"caption_text": text,
|
||||
"page_width": page_width,
|
||||
"page_height": page_height,
|
||||
}
|
||||
)
|
||||
|
||||
return captions
|
||||
|
||||
@@ -115,80 +163,81 @@ def _find_figure_top(page, caption: dict) -> float:
|
||||
"""向上扫描页面,找到 Figure 的上边界。
|
||||
|
||||
策略:
|
||||
1. 收集 caption 上方的所有内容块(文本 + 嵌入图片)
|
||||
2. 找到最顶部的内容块作为图的上界
|
||||
3. 检查内容块之间的大间隙(表示图从间隙下方开始)
|
||||
4. 如果没找到任何内容块,使用默认图高度
|
||||
|
||||
注意:只扫描 text blocks 是不够的,因为 figure 本身是图片/矢量图,
|
||||
不会被 get_text("blocks") 返回。必须同时用 get_image_info() 检测嵌入图片。
|
||||
1. 优先用嵌入图片定位(绝大多数 figure 包含嵌入图片,图片边界即 figure 边界)
|
||||
2. 无图片时回退到文本块间隙检测(处理纯矢量图如 TikZ/matplotlib PDF)
|
||||
"""
|
||||
caption_y = caption["caption_y0"]
|
||||
cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING
|
||||
cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING
|
||||
col_x0, col_x1 = _estimate_column_x(caption)
|
||||
cx0 = max(col_x0, caption["caption_x0"] - _REGION_SIDE_PADDING)
|
||||
cx1 = min(col_x1, caption["caption_x1"] + _REGION_SIDE_PADDING)
|
||||
|
||||
# 收集 caption 上方、同列范围内的所有内容块
|
||||
# 每个元素: (x0, y0, x1, y1)
|
||||
above_blocks: list[tuple[float, float, float, float]] = []
|
||||
|
||||
# ── 1. 文本块 ──
|
||||
# 同页上方最近的 Figure/Table caption(多 figure 同页时截断)
|
||||
_caption_cutoff: float | None = None
|
||||
for b in page.get_text("blocks"):
|
||||
if len(b) < 5:
|
||||
continue
|
||||
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
|
||||
if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT:
|
||||
if bx1 > cx0 and bx0 < cx1:
|
||||
above_blocks.append((bx0, by0, bx1, by1))
|
||||
by0, by1 = b[1], b[3]
|
||||
if by1 >= caption_y or by1 <= caption_y - _FIGURE_MAX_HEIGHT:
|
||||
continue
|
||||
first_line = str(b[4]).strip().split("\n")[0].strip()
|
||||
if _CAPTION_STOP_RE.match(first_line):
|
||||
_caption_cutoff = by0
|
||||
break
|
||||
|
||||
# ── 2. 嵌入图片块 — 关键!figure 本身是图片,不是文本 ──
|
||||
# ── 策略 1:嵌入图片定位(覆盖绝大多数 figure) ──
|
||||
topmost_image_y: float | None = None
|
||||
for img_info in page.get_image_info():
|
||||
bbox = img_info.get("bbox")
|
||||
if bbox is None:
|
||||
continue
|
||||
# bbox 可能是 Rect 对象或 tuple,兼容两种格式
|
||||
if hasattr(bbox, 'x0'):
|
||||
if hasattr(bbox, "x0"):
|
||||
ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
|
||||
else:
|
||||
ix0, iy0, ix1, iy1 = bbox[0], bbox[1], bbox[2], bbox[3]
|
||||
if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT:
|
||||
if ix1 > cx0 and ix0 < cx1:
|
||||
above_blocks.append((ix0, iy0, ix1, iy1))
|
||||
if _caption_cutoff is not None and iy0 < _caption_cutoff:
|
||||
continue # 属于上方另一个 figure
|
||||
if topmost_image_y is None or iy0 < topmost_image_y:
|
||||
topmost_image_y = iy0
|
||||
|
||||
# ── 没有内容块 → 用默认高度(可能是纯矢量图,如 TikZ/matplotlib PDF) ──
|
||||
if not above_blocks:
|
||||
return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)
|
||||
|
||||
# ── 找到内容区域的上边界 ──
|
||||
# 按 y 从下到上排序(离 caption 最近的在前)
|
||||
above_blocks.sort(key=lambda b: b[1], reverse=True)
|
||||
|
||||
# 从 caption 向上扫描,找到第一个大间隙以上作为图的上界
|
||||
# 典型结构: [正文段落] ...空白... [图内容(图片/矢量)] [caption]
|
||||
# 空白间隙 ≈ 图的上边界
|
||||
figure_top = above_blocks[-1][1] # 最上面的块顶部(默认兜底)
|
||||
|
||||
prev_bottom = caption_y # 从 caption 顶部开始向上
|
||||
for b in above_blocks:
|
||||
# b = (x0, y0, x1, y1), 我们关心 y 范围
|
||||
gap = prev_bottom - b[3] # b[3] = by1 = 当前块底部
|
||||
if gap > _CONTENT_GAP_THRESHOLD:
|
||||
# 大间隙 → 图上边界在间隙下方
|
||||
figure_top = prev_bottom - 5
|
||||
break
|
||||
# 小间隙 → 当前块属于图的一部分(或紧挨着图),继续向上
|
||||
prev_bottom = b[1] # b[1] = by0 = 当前块顶部
|
||||
if topmost_image_y is not None:
|
||||
figure_top = topmost_image_y
|
||||
else:
|
||||
# 所有块都紧挨着 → 图从最上面块的顶部开始
|
||||
figure_top = above_blocks[-1][1]
|
||||
# ── 策略 2:文本块间隙检测(纯矢量图) ──
|
||||
above_blocks: list[tuple[float, float, float, float]] = []
|
||||
for b in page.get_text("blocks"):
|
||||
if len(b) < 5:
|
||||
continue
|
||||
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
|
||||
if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT:
|
||||
if bx1 > cx0 and bx0 < cx1:
|
||||
if col_x0 > 0 and bx0 < col_x0 - _REGION_SIDE_PADDING * 2:
|
||||
continue
|
||||
above_blocks.append((bx0, by0, bx1, by1))
|
||||
|
||||
if not above_blocks:
|
||||
return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)
|
||||
|
||||
above_blocks.sort(key=lambda b: b[1], reverse=True)
|
||||
prev_bottom = caption_y
|
||||
for b in above_blocks:
|
||||
if prev_bottom - b[3] > _CONTENT_GAP_THRESHOLD:
|
||||
figure_top = prev_bottom - 5
|
||||
break
|
||||
prev_bottom = b[1]
|
||||
else:
|
||||
figure_top = above_blocks[-1][1]
|
||||
|
||||
# 同页 caption 截断
|
||||
if _caption_cutoff is not None:
|
||||
figure_top = max(figure_top, _caption_cutoff + 5)
|
||||
|
||||
# 限制最大高度
|
||||
if caption_y - figure_top > _FIGURE_MAX_HEIGHT:
|
||||
figure_top = caption_y - _FIGURE_MAX_HEIGHT
|
||||
|
||||
# 不低于页面顶部
|
||||
figure_top = max(0, figure_top)
|
||||
|
||||
return figure_top
|
||||
return max(0, figure_top)
|
||||
|
||||
|
||||
def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]:
|
||||
@@ -209,9 +258,10 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]
|
||||
page_height = caption["page_height"]
|
||||
page_width = caption["page_width"]
|
||||
|
||||
# 先用较宽的范围收集可能的表格内容块
|
||||
search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING)
|
||||
search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING)
|
||||
# 估计 caption 所在列的水平边界,避免双栏论文跨列抓取
|
||||
col_x0, col_x1 = _estimate_column_x(caption)
|
||||
search_x0 = max(col_x0, caption_x0 - _TABLE_SIDE_PADDING)
|
||||
search_x1 = min(col_x1, caption_x1 + _TABLE_SIDE_PADDING)
|
||||
|
||||
below_blocks: list[tuple[float, float, float, float]] = []
|
||||
for b in blocks:
|
||||
@@ -220,6 +270,17 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]
|
||||
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
|
||||
if by0 > caption_y and by0 < caption_y + _TABLE_MAX_HEIGHT:
|
||||
if bx1 > search_x0 and bx0 < search_x1:
|
||||
# 双栏论文:排除跨列正文段落(宽度 >> 列宽,起点在另一列)
|
||||
# 表格行起点在列内或列边界附近;正文段落起点在另一列(bx0 远小于 col_x0)
|
||||
if col_x0 > 0 and bx0 < col_x0 - _TABLE_SIDE_PADDING:
|
||||
continue
|
||||
# 停止信号:遇到下一个 caption 或 section header 立即停止
|
||||
text = str(b[4]).strip()
|
||||
first_line = text.split("\n")[0].strip()
|
||||
if _CAPTION_STOP_RE.match(first_line) or _SECTION_STOP_RE.match(
|
||||
first_line
|
||||
):
|
||||
break
|
||||
below_blocks.append((bx0, by0, bx1, by1))
|
||||
|
||||
if not below_blocks:
|
||||
@@ -248,11 +309,16 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]
|
||||
bottom = caption_y + _TABLE_MAX_HEIGHT
|
||||
|
||||
# ── 检测表格内容的水平范围 ──
|
||||
# 表格通常比 caption 宽,用内容块的实际宽度
|
||||
content_x0 = min(caption_x0, min(b[0] for b in below_blocks))
|
||||
content_x1 = max(caption_x1, max(b[2] for b in below_blocks))
|
||||
# 只用 gap 之前的 block 计算水平范围(gap 之后的 block 属于正文,可能更宽)
|
||||
table_blocks = [b for b in below_blocks if b[1] < bottom]
|
||||
if not table_blocks:
|
||||
table_blocks = below_blocks[:1] # 至少用第一个 block
|
||||
content_x0 = min(caption_x0, min(b[0] for b in table_blocks))
|
||||
content_x1 = max(caption_x1, max(b[2] for b in table_blocks))
|
||||
|
||||
# 添加边距,但不超出页面
|
||||
# 添加边距,不超出页面
|
||||
# 使用较小 padding,避免将相邻列内容(如同页另一列的 Figure)带入截图;
|
||||
# 同时不限制列边界 — 双栏论文中 caption 可能跨列起始
|
||||
x0 = max(0, content_x0 - _REGION_SIDE_PADDING)
|
||||
x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING)
|
||||
|
||||
@@ -283,6 +349,12 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
||||
images_dest = paper_dir(arxiv_id) / "images"
|
||||
images_dest.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 清理上次提取的旧图片,避免残留
|
||||
for old_file in images_dest.glob("*.png"):
|
||||
old_file.unlink()
|
||||
if (images_dest / "manifest.json").exists():
|
||||
(images_dest / "manifest.json").unlink()
|
||||
|
||||
doc = pymupdf.open(str(pdf_path))
|
||||
captions = _find_captions(doc)
|
||||
|
||||
@@ -303,16 +375,17 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
||||
extracted = 0
|
||||
manifest: dict[str, dict] = {}
|
||||
|
||||
zoom = 2 # 2x 渲染,保证清晰度
|
||||
zoom = 3 # 3x 渲染,保证清晰度
|
||||
|
||||
for cap in unique_captions:
|
||||
page = doc[cap["page_num"]]
|
||||
pw = cap["page_width"]
|
||||
ph = cap["page_height"]
|
||||
|
||||
if cap["type"] == "figure":
|
||||
# Figure: caption 上方是图 → 向上找图的上边界
|
||||
top = _find_figure_top(page, cap)
|
||||
# 上方多留 5pt 边距,确保图框边框、装饰线等不被截断
|
||||
top = max(0, top - 5)
|
||||
bottom = cap["caption_y1"] + 5 # 包含 caption
|
||||
# 水平范围:caption 宽度 + 边距(图和 caption 通常等宽)
|
||||
# 但也要考虑图内容的实际宽度
|
||||
@@ -361,23 +434,30 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
||||
}
|
||||
logger.debug(
|
||||
"Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s",
|
||||
cap["label"], cap["page_num"] + 1,
|
||||
x0, top, x1, bottom, height, filename,
|
||||
cap["label"],
|
||||
cap["page_num"] + 1,
|
||||
x0,
|
||||
top,
|
||||
x1,
|
||||
bottom,
|
||||
height,
|
||||
filename,
|
||||
)
|
||||
|
||||
doc.close()
|
||||
|
||||
# 保存 manifest
|
||||
manifest_path = images_dest / "manifest.json"
|
||||
manifest_path.write_text(
|
||||
json.dumps(manifest, ensure_ascii=False, indent=2)
|
||||
)
|
||||
manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2))
|
||||
|
||||
if extracted > 0:
|
||||
logger.info(
|
||||
"Extracted %d figure/table screenshots from PDF for %s "
|
||||
"(from %d captions found, %d unique)",
|
||||
extracted, arxiv_id, len(captions), len(unique_captions),
|
||||
extracted,
|
||||
arxiv_id,
|
||||
len(captions),
|
||||
len(unique_captions),
|
||||
)
|
||||
|
||||
return extracted
|
||||
@@ -407,10 +487,10 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
|
||||
referenced_ids: set[str] = set()
|
||||
for fig in figures:
|
||||
fig_id = fig.get("id", "")
|
||||
m = re.match(r'(?:Fig\.?|Figure)\s*(\d+)', fig_id, re.IGNORECASE)
|
||||
m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", fig_id, re.IGNORECASE)
|
||||
if m:
|
||||
referenced_ids.add(f"Figure {m.group(1)}")
|
||||
m2 = re.match(r'Table\s*(\d+)', fig_id, re.IGNORECASE)
|
||||
m2 = re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE)
|
||||
if m2:
|
||||
referenced_ids.add(f"Table {m2.group(1)}")
|
||||
|
||||
@@ -433,7 +513,8 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
|
||||
if not keep_filenames:
|
||||
logger.warning(
|
||||
"No manifest matches for %s (refs=%s), keeping all",
|
||||
arxiv_id, referenced_ids,
|
||||
arxiv_id,
|
||||
referenced_ids,
|
||||
)
|
||||
return len(all_files)
|
||||
|
||||
@@ -446,6 +527,9 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int:
|
||||
kept = len(all_files) - removed
|
||||
logger.info(
|
||||
"Filtered images for %s: kept %d, removed %d (refs=%s)",
|
||||
arxiv_id, kept, removed, referenced_ids,
|
||||
arxiv_id,
|
||||
kept,
|
||||
removed,
|
||||
referenced_ids,
|
||||
)
|
||||
return kept
|
||||
|
||||
@@ -172,9 +172,10 @@ def _build_prompt(
|
||||
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
|
||||
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
|
||||
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
|
||||
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
|
||||
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
|
||||
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
|
||||
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
|
||||
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
|
||||
"section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
|
||||
"}"
|
||||
)
|
||||
|
||||
|
||||
@@ -391,6 +391,20 @@ def _handle_summary_failure(
|
||||
}
|
||||
|
||||
|
||||
def _cleanup_old_images(db: Session, paper: Paper) -> None:
    """Remove stale image files and the stored figures_json for *paper*.

    Called before re-summarizing so that output of a previous run does not
    linger. Deletes image files (.png/.jpg/.jpeg/.gif/.svg, matched
    case-insensitively) and ``manifest.json`` from the paper's ``images``
    directory, then clears ``paper.summary.figures_json`` and commits when
    it was set.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".svg")
    target_dir = paper_dir(paper.arxiv_id) / "images"

    if target_dir.exists():
        for entry in target_dir.iterdir():
            removable = (
                entry.suffix.lower() in image_suffixes
                or entry.name == "manifest.json"
            )
            if removable:
                # missing_ok: tolerate the file disappearing in the meantime.
                entry.unlink(missing_ok=True)

    # Clear figures_json in the database.
    if paper.summary and paper.summary.figures_json:
        paper.summary.figures_json = None
        db.commit()
|
||||
|
||||
|
||||
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
|
||||
"""从 PDF 提取图片和表格(失败不影响总结)。"""
|
||||
try:
|
||||
@@ -437,6 +451,9 @@ async def _do_summarize_one(
|
||||
paper.summary_status.started_at = utc_now()
|
||||
db.commit()
|
||||
|
||||
# 清理旧的图片文件和 figures_json,避免重新总结时残留
|
||||
_cleanup_old_images(db, paper)
|
||||
|
||||
raw_output = ""
|
||||
try:
|
||||
meta_path = write_meta_json(paper)
|
||||
|
||||
@@ -122,6 +122,16 @@ endblock %} {% block content %}
|
||||
<p>{{ paper.summary.method_novelty | safe }}</p>
|
||||
</details>
|
||||
{% endif %}
|
||||
{% if method_figures and method_figures|length > 0 %}
|
||||
{% for fig in method_figures %}
|
||||
<figure class="inline-figure">
|
||||
<img src="{{ fig.image_url }}" alt="{{ fig.caption or fig.id }}" loading="lazy" />
|
||||
<figcaption>
|
||||
<strong>{{ fig.id }}</strong>{% if fig.caption %}: {{ fig.caption }}{% endif %}
|
||||
</figcaption>
|
||||
</figure>
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
@@ -130,8 +140,8 @@ endblock %} {% block content %}
|
||||
<section class="summary-section">
|
||||
<h2>实验结果</h2>
|
||||
<p>{{ paper.summary.results_main_json | safe }}</p>
|
||||
{% if table_figures and table_figures|length > 0 %}
|
||||
{# 优先展示原文表格截图 #}
|
||||
{% if (table_figures and table_figures|length > 0) or (results_figures and results_figures|length > 0) %}
|
||||
{# 展示表格截图 + 实验结果图 #}
|
||||
{% for tf in table_figures %}
|
||||
<figure class="inline-figure table-screenshot">
|
||||
<img src="{{ tf.image_url }}" alt="{{ tf.caption or tf.id }}" loading="lazy" />
|
||||
@@ -140,6 +150,14 @@ endblock %} {% block content %}
|
||||
</figcaption>
|
||||
</figure>
|
||||
{% endfor %}
|
||||
{% for fig in results_figures %}
|
||||
<figure class="inline-figure">
|
||||
<img src="{{ fig.image_url }}" alt="{{ fig.caption or fig.id }}" loading="lazy" />
|
||||
<figcaption>
|
||||
<strong>{{ fig.id }}</strong>{% if fig.caption %}: {{ fig.caption }}{% endif %}
|
||||
</figcaption>
|
||||
</figure>
|
||||
{% endfor %}
|
||||
{% if benchmarks and benchmarks|length > 0 %}
|
||||
<details>
|
||||
<summary>查看结构化数据</summary>
|
||||
|
||||
Reference in New Issue
Block a user