From a1e09628200fdac974d442d92d36d4ee2709820f Mon Sep 17 00:00:00 2001 From: rain-bus Date: Wed, 10 Jun 2026 02:05:30 +0800 Subject: [PATCH] feat: enhance PDF extraction with section-based figure routing and improved caption detection --- .env.example | 2 +- app/config.py | 2 +- app/routes/pages.py | 29 ++- app/services/pdf_image_extractor.py | 292 ++++++++++++++++++---------- app/services/pi_client.py | 5 +- app/services/summarizer.py | 17 ++ app/templates/detail.html | 22 ++- 7 files changed, 253 insertions(+), 116 deletions(-) diff --git a/.env.example b/.env.example index f10d863..bfea3bf 100644 --- a/.env.example +++ b/.env.example @@ -22,7 +22,7 @@ HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 PI_BIN= SUMMARY_SKILL=daily-paper-summary SUMMARY_CONCURRENCY=3 -SUMMARY_TIMEOUT_SECONDS=900 +SUMMARY_TIMEOUT_SECONDS=1200 SUMMARY_MAX_RETRIES=2 SUMMARY_PDF_MODE=auto diff --git a/app/config.py b/app/config.py index a838f0d..c6ddc18 100644 --- a/app/config.py +++ b/app/config.py @@ -32,7 +32,7 @@ class Settings(BaseSettings): PI_BIN: str = "" SUMMARY_SKILL: str = "daily-paper-summary" SUMMARY_CONCURRENCY: int = 3 - SUMMARY_TIMEOUT_SECONDS: int = 900 + SUMMARY_TIMEOUT_SECONDS: int = 1200 SUMMARY_MAX_RETRIES: int = 2 SUMMARY_PDF_MODE: str = "auto" # "auto" = ≤80k 用 inject,>80k 用 search;也可强制 "inject" / "search" diff --git a/app/routes/pages.py b/app/routes/pages.py index 6380934..950e5de 100644 --- a/app/routes/pages.py +++ b/app/routes/pages.py @@ -122,17 +122,32 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db)) linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id) - # 拆分:table_figures(有截图的 Table 类型)→ 实验结果区域展示截图 - # figures(其余)→ 论文图表画廊 - table_figures = [] - figures = [] + # 拆分图片到对应展示区域: + # table_figures → 实验结果区域(Table 截图,不变) + # method_figures → 核心方法区域(section=="method") + # results_figures → 实验结果区域(section=="results" 的 Figure) + # gallery_figures → 底部画廊(其余:motivation/limitations/无 
section/无图) + table_figures: list[dict] = [] + method_figures: list[dict] = [] + results_figures: list[dict] = [] + gallery_figures: list[dict] = [] for fig in linked_figures: fig_id = fig.get("id", "") + section = fig.get("section", "") is_table = fig_id.lower().startswith("table") + if is_table and fig.get("image_url"): table_figures.append(fig) + elif not is_table and section == "method" and fig.get("image_url"): + method_figures.append(fig) + elif ( + not is_table + and section == "results" + and fig.get("image_url") + ): + results_figures.append(fig) else: - figures.append(fig) + gallery_figures.append(fig) return templates.TemplateResponse( request, @@ -144,8 +159,10 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db)) "paper_images": images, "prereqs": prereqs, "benchmarks": benchmarks, - "figures": figures, + "figures": gallery_figures, "table_figures": table_figures, + "method_figures": method_figures, + "results_figures": results_figures, "chroma_enabled": settings.CHROMA_ENABLED, "page_title": paper.title_zh or paper.title_en, }, diff --git a/app/services/pdf_image_extractor.py b/app/services/pdf_image_extractor.py index c733687..f8758ef 100644 --- a/app/services/pdf_image_extractor.py +++ b/app/services/pdf_image_extractor.py @@ -24,12 +24,12 @@ logger = logging.getLogger(__name__) # ── 截取区域参数 ─────────────────────────────────────────────────────── # Figure: caption 上方搜索图的范围(点) -_FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围 -_FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度 -_FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度 +_FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围 +_FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度 +_FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度 # Table: caption 下方搜索表格的范围 -_TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围 +_TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围 _TABLE_MIN_HEIGHT = 30 # caption 左右扩展(双栏论文中 caption 可能比表格窄) @@ -37,22 +37,66 @@ _REGION_SIDE_PADDING = 10 # 表格通常比 caption 文字宽,使用更大的水平扩展 _TABLE_SIDE_PADDING = 60 -# 正文行距的 2 倍 ≈ 空白间隙阈值 -_CONTENT_GAP_THRESHOLD = 30 +# 
Gap threshold ≈ 1.5x body line spacing (academic papers are set tightly; 30pt was too loose) +_CONTENT_GAP_THRESHOLD = 20 # ── Caption 正则 ─────────────────────────────────────────────────────── # 要求以 Figure/Table 开头(避免匹配正文中的 "see Figure 3" 等) +# Three caption formats are supported: +# "Figure 1: Title" / "Figure 1. Title" / "Figure 1 Title" (no punctuation, space-separated) +# The third form needs an uppercase letter right after the number (rejects prose like "Figure 1 shows..."); (?-i:...) keeps that test case-sensitive despite re.IGNORECASE _CAPTION_RE = re.compile( - r'^(?:Fig\.?|Figure)\s+(\d+)\s*[:\.]', + r"^(?:Fig\.?|Figure)\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))", re.IGNORECASE, ) _TABLE_CAPTION_RE = re.compile( - r'^Table\s+(\d+)\s*[:\.]', + r"^Table\s+(\d+)\s*(?:[:\.]\s*|\s+(?=(?-i:[A-Z])))", re.IGNORECASE, ) +# ── Stop signals: table-boundary detection halts as soon as one of these is met ── + +# Next Figure/Table caption (e.g. "Table 2:" "Figure 3:" "Figure 4 Title"); matched case-insensitively throughout, so prose openers such as "Table 2 shows" also stop the scan +_CAPTION_STOP_RE = re.compile( + r"^(?:Table|Fig\.?|Figure)\s+\d+\s*(?:[:\.]\s*|\s+[A-Z])", + re.IGNORECASE, +) +# Section header (e.g. "6.2 Evolution" "D.1 Dependency" "7 Conclusion") +_SECTION_STOP_RE = re.compile( + r"^(\d{1,2}(?:\.\d+)?\s+[A-Z][a-z]|[A-Z]\.\d+\s+[A-Z][a-z])" +) + + +def _estimate_column_x(caption: dict) -> tuple[float, float]: + """Estimate the horizontal bounds (col_x0, col_x1) of the column holding the caption. + + In two-column papers the caption is much narrower than the page, which tells left from right. + Single-column or spanning captions (width >65% of the page) get the full page width. + Centered captions (center near the page midline) are treated as spanning and get a wide range. + """ + pw = caption["page_width"] + caption_w = caption["caption_x1"] - caption["caption_x0"] + + # caption wider than 65% of the page → single-column or spanning + if caption_w > pw * 0.65: + return 0, pw + + cx = (caption["caption_x0"] + caption["caption_x1"]) / 2 + + # centered caption (center within 8% of the page midline) → likely a spanning table, use a wide range + if abs(cx - pw / 2) / pw < 0.08: + return ( + max(0, caption["caption_x0"] - _TABLE_SIDE_PADDING * 2), + min(pw, caption["caption_x1"] + _TABLE_SIDE_PADDING * 2), + ) + + if cx < pw / 2: + return 0, pw / 2 + else: + return pw / 2, pw + def _find_captions(doc) -> list[dict]: """扫描整个文档,找到所有 Figure/Table caption 的位置和信息。""" @@ -77,36 +121,40 @@ def _find_captions(doc) -> list[dict]: m = _CAPTION_RE.match(first_line) if m: - captions.append({ - "type": "figure", - "num": int(m.group(1)), - "label": f"Figure 
{m.group(1)}", - "page_num": page_num, - "caption_y0": by0, - "caption_y1": by1, - "caption_x0": bx0, - "caption_x1": bx1, - "caption_text": text, - "page_width": page_width, - "page_height": page_height, - }) + captions.append( + { + "type": "figure", + "num": int(m.group(1)), + "label": f"Figure {m.group(1)}", + "page_num": page_num, + "caption_y0": by0, + "caption_y1": by1, + "caption_x0": bx0, + "caption_x1": bx1, + "caption_text": text, + "page_width": page_width, + "page_height": page_height, + } + ) continue m = _TABLE_CAPTION_RE.match(first_line) if m: - captions.append({ - "type": "table", - "num": int(m.group(1)), - "label": f"Table {m.group(1)}", - "page_num": page_num, - "caption_y0": by0, - "caption_y1": by1, - "caption_x0": bx0, - "caption_x1": bx1, - "caption_text": text, - "page_width": page_width, - "page_height": page_height, - }) + captions.append( + { + "type": "table", + "num": int(m.group(1)), + "label": f"Table {m.group(1)}", + "page_num": page_num, + "caption_y0": by0, + "caption_y1": by1, + "caption_x0": bx0, + "caption_x1": bx1, + "caption_text": text, + "page_width": page_width, + "page_height": page_height, + } + ) return captions @@ -115,80 +163,81 @@ def _find_figure_top(page, caption: dict) -> float: """向上扫描页面,找到 Figure 的上边界。 策略: - 1. 收集 caption 上方的所有内容块(文本 + 嵌入图片) - 2. 找到最顶部的内容块作为图的上界 - 3. 检查内容块之间的大间隙(表示图从间隙下方开始) - 4. 如果没找到任何内容块,使用默认图高度 - - 注意:只扫描 text blocks 是不够的,因为 figure 本身是图片/矢量图, - 不会被 get_text("blocks") 返回。必须同时用 get_image_info() 检测嵌入图片。 + 1. 优先用嵌入图片定位(绝大多数 figure 包含嵌入图片,图片边界即 figure 边界) + 2. 
无图片时回退到文本块间隙检测(处理纯矢量图如 TikZ/matplotlib PDF) """ caption_y = caption["caption_y0"] - cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING - cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING + col_x0, col_x1 = _estimate_column_x(caption) + cx0 = max(col_x0, caption["caption_x0"] - _REGION_SIDE_PADDING) + cx1 = min(col_x1, caption["caption_x1"] + _REGION_SIDE_PADDING) - # 收集 caption 上方、同列范围内的所有内容块 - # 每个元素: (x0, y0, x1, y1) - above_blocks: list[tuple[float, float, float, float]] = [] - - # ── 1. 文本块 ── + # 同页上方最近的 Figure/Table caption(多 figure 同页时截断) + _caption_cutoff: float | None = None for b in page.get_text("blocks"): if len(b) < 5: continue - bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3] - if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT: - if bx1 > cx0 and bx0 < cx1: - above_blocks.append((bx0, by0, bx1, by1)) + by0, by1 = b[1], b[3] + if by1 >= caption_y or by1 <= caption_y - _FIGURE_MAX_HEIGHT: + continue + first_line = str(b[4]).strip().split("\n")[0].strip() + if _CAPTION_STOP_RE.match(first_line): + _caption_cutoff = by0 + break - # ── 2. 
嵌入图片块 — 关键!figure 本身是图片,不是文本 ── + # ── 策略 1:嵌入图片定位(覆盖绝大多数 figure) ── + topmost_image_y: float | None = None for img_info in page.get_image_info(): bbox = img_info.get("bbox") if bbox is None: continue - # bbox 可能是 Rect 对象或 tuple,兼容两种格式 - if hasattr(bbox, 'x0'): + if hasattr(bbox, "x0"): ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 else: ix0, iy0, ix1, iy1 = bbox[0], bbox[1], bbox[2], bbox[3] if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT: if ix1 > cx0 and ix0 < cx1: - above_blocks.append((ix0, iy0, ix1, iy1)) + if _caption_cutoff is not None and iy0 < _caption_cutoff: + continue # 属于上方另一个 figure + if topmost_image_y is None or iy0 < topmost_image_y: + topmost_image_y = iy0 - # ── 没有内容块 → 用默认高度(可能是纯矢量图,如 TikZ/matplotlib PDF) ── - if not above_blocks: - return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT) - - # ── 找到内容区域的上边界 ── - # 按 y 从下到上排序(离 caption 最近的在前) - above_blocks.sort(key=lambda b: b[1], reverse=True) - - # 从 caption 向上扫描,找到第一个大间隙以上作为图的上界 - # 典型结构: [正文段落] ...空白... 
[图内容(图片/矢量)] [caption] - # 空白间隙 ≈ 图的上边界 - figure_top = above_blocks[-1][1] # 最上面的块顶部(默认兜底) - - prev_bottom = caption_y # 从 caption 顶部开始向上 - for b in above_blocks: - # b = (x0, y0, x1, y1), 我们关心 y 范围 - gap = prev_bottom - b[3] # b[3] = by1 = 当前块底部 - if gap > _CONTENT_GAP_THRESHOLD: - # 大间隙 → 图上边界在间隙下方 - figure_top = prev_bottom - 5 - break - # 小间隙 → 当前块属于图的一部分(或紧挨着图),继续向上 - prev_bottom = b[1] # b[1] = by0 = 当前块顶部 + if topmost_image_y is not None: + figure_top = topmost_image_y else: - # 所有块都紧挨着 → 图从最上面块的顶部开始 - figure_top = above_blocks[-1][1] + # ── 策略 2:文本块间隙检测(纯矢量图) ── + above_blocks: list[tuple[float, float, float, float]] = [] + for b in page.get_text("blocks"): + if len(b) < 5: + continue + bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3] + if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT: + if bx1 > cx0 and bx0 < cx1: + if col_x0 > 0 and bx0 < col_x0 - _REGION_SIDE_PADDING * 2: + continue + above_blocks.append((bx0, by0, bx1, by1)) + + if not above_blocks: + return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT) + + above_blocks.sort(key=lambda b: b[1], reverse=True) + prev_bottom = caption_y + for b in above_blocks: + if prev_bottom - b[3] > _CONTENT_GAP_THRESHOLD: + figure_top = prev_bottom - 5 + break + prev_bottom = b[1] + else: + figure_top = above_blocks[-1][1] + + # 同页 caption 截断 + if _caption_cutoff is not None: + figure_top = max(figure_top, _caption_cutoff + 5) # 限制最大高度 if caption_y - figure_top > _FIGURE_MAX_HEIGHT: figure_top = caption_y - _FIGURE_MAX_HEIGHT - # 不低于页面顶部 - figure_top = max(0, figure_top) - - return figure_top + return max(0, figure_top) def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]: @@ -209,9 +258,10 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float] page_height = caption["page_height"] page_width = caption["page_width"] - # 先用较宽的范围收集可能的表格内容块 - search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING) - search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING) + # 
估计 caption 所在列的水平边界,避免双栏论文跨列抓取 + col_x0, col_x1 = _estimate_column_x(caption) + search_x0 = max(col_x0, caption_x0 - _TABLE_SIDE_PADDING) + search_x1 = min(col_x1, caption_x1 + _TABLE_SIDE_PADDING) below_blocks: list[tuple[float, float, float, float]] = [] for b in blocks: @@ -220,6 +270,17 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float] bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3] if by0 > caption_y and by0 < caption_y + _TABLE_MAX_HEIGHT: if bx1 > search_x0 and bx0 < search_x1: + # 双栏论文:排除跨列正文段落(宽度 >> 列宽,起点在另一列) + # 表格行起点在列内或列边界附近;正文段落起点在另一列(bx0 远小于 col_x0) + if col_x0 > 0 and bx0 < col_x0 - _TABLE_SIDE_PADDING: + continue + # 停止信号:遇到下一个 caption 或 section header 立即停止 + text = str(b[4]).strip() + first_line = text.split("\n")[0].strip() + if _CAPTION_STOP_RE.match(first_line) or _SECTION_STOP_RE.match( + first_line + ): + break below_blocks.append((bx0, by0, bx1, by1)) if not below_blocks: @@ -248,11 +309,16 @@ def _find_table_region(page, caption: dict) -> tuple[float, float, float, float] bottom = caption_y + _TABLE_MAX_HEIGHT # ── 检测表格内容的水平范围 ── - # 表格通常比 caption 宽,用内容块的实际宽度 - content_x0 = min(caption_x0, min(b[0] for b in below_blocks)) - content_x1 = max(caption_x1, max(b[2] for b in below_blocks)) + # 只用 gap 之前的 block 计算水平范围(gap 之后的 block 属于正文,可能更宽) + table_blocks = [b for b in below_blocks if b[1] < bottom] + if not table_blocks: + table_blocks = below_blocks[:1] # 至少用第一个 block + content_x0 = min(caption_x0, min(b[0] for b in table_blocks)) + content_x1 = max(caption_x1, max(b[2] for b in table_blocks)) - # 添加边距,但不超出页面 + # 添加边距,不超出页面 + # 使用较小 padding,避免将相邻列内容(如同页另一列的 Figure)带入截图; + # 同时不限制列边界 — 双栏论文中 caption 可能跨列起始 x0 = max(0, content_x0 - _REGION_SIDE_PADDING) x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING) @@ -283,6 +349,12 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: images_dest = paper_dir(arxiv_id) / "images" images_dest.mkdir(parents=True, exist_ok=True) + # 
清理上次提取的旧图片,避免残留 + for old_file in images_dest.glob("*.png"): + old_file.unlink() + if (images_dest / "manifest.json").exists(): + (images_dest / "manifest.json").unlink() + doc = pymupdf.open(str(pdf_path)) captions = _find_captions(doc) @@ -303,16 +375,17 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: extracted = 0 manifest: dict[str, dict] = {} - zoom = 2 # 2x 渲染,保证清晰度 + zoom = 3 # 3x 渲染,保证清晰度 for cap in unique_captions: page = doc[cap["page_num"]] pw = cap["page_width"] - ph = cap["page_height"] if cap["type"] == "figure": # Figure: caption 上方是图 → 向上找图的上边界 top = _find_figure_top(page, cap) + # 上方多留 5pt 边距,确保图框边框、装饰线等不被截断 + top = max(0, top - 5) bottom = cap["caption_y1"] + 5 # 包含 caption # 水平范围:caption 宽度 + 边距(图和 caption 通常等宽) # 但也要考虑图内容的实际宽度 @@ -361,23 +434,30 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: } logger.debug( "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s", - cap["label"], cap["page_num"] + 1, - x0, top, x1, bottom, height, filename, + cap["label"], + cap["page_num"] + 1, + x0, + top, + x1, + bottom, + height, + filename, ) doc.close() # 保存 manifest manifest_path = images_dest / "manifest.json" - manifest_path.write_text( - json.dumps(manifest, ensure_ascii=False, indent=2) - ) + manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2)) if extracted > 0: logger.info( "Extracted %d figure/table screenshots from PDF for %s " "(from %d captions found, %d unique)", - extracted, arxiv_id, len(captions), len(unique_captions), + extracted, + arxiv_id, + len(captions), + len(unique_captions), ) return extracted @@ -407,10 +487,10 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int: referenced_ids: set[str] = set() for fig in figures: fig_id = fig.get("id", "") - m = re.match(r'(?:Fig\.?|Figure)\s*(\d+)', fig_id, re.IGNORECASE) + m = re.match(r"(?:Fig\.?|Figure)\s*(\d+)", fig_id, re.IGNORECASE) if m: 
referenced_ids.add(f"Figure {m.group(1)}") - m2 = re.match(r'Table\s*(\d+)', fig_id, re.IGNORECASE) + m2 = re.match(r"Table\s*(\d+)", fig_id, re.IGNORECASE) if m2: referenced_ids.add(f"Table {m2.group(1)}") @@ -433,7 +513,8 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int: if not keep_filenames: logger.warning( "No manifest matches for %s (refs=%s), keeping all", - arxiv_id, referenced_ids, + arxiv_id, + referenced_ids, ) return len(all_files) @@ -446,6 +527,9 @@ def filter_images_by_summary(arxiv_id: str, figures: list[dict]) -> int: kept = len(all_files) - removed logger.info( "Filtered images for %s: kept %d, removed %d (refs=%s)", - arxiv_id, kept, removed, referenced_ids, + arxiv_id, + kept, + removed, + referenced_ids, ) return kept diff --git a/app/services/pi_client.py b/app/services/pi_client.py index 75a6e28..0a7f68c 100644 --- a/app/services/pi_client.py +++ b/app/services/pi_client.py @@ -172,9 +172,10 @@ def _build_prompt( '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", ' '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", ' '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, ' - '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},' - '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]' + '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},' + '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]' "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。" + "section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。" "}" ) diff --git a/app/services/summarizer.py b/app/services/summarizer.py index ed01e81..9b1c6bd 100644 --- a/app/services/summarizer.py +++ b/app/services/summarizer.py @@ -391,6 +391,20 @@ def _handle_summary_failure( } +def _cleanup_old_images(db: 
Session, paper: Paper) -> None: + """清理旧的图片文件和 figures_json,避免重新总结时残留。""" + arxiv_id = paper.arxiv_id + images_dir = paper_dir(arxiv_id) / "images" + if images_dir.exists(): + for old_file in images_dir.iterdir(): + if old_file.suffix.lower() in (".png", ".jpg", ".jpeg", ".gif", ".svg") or old_file.name == "manifest.json": + old_file.unlink(missing_ok=True) + # 清除数据库中的 figures_json + if paper.summary and paper.summary.figures_json: + paper.summary.figures_json = None + db.commit() + + def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None: """从 PDF 提取图片和表格(失败不影响总结)。""" try: @@ -437,6 +451,9 @@ async def _do_summarize_one( paper.summary_status.started_at = utc_now() db.commit() + # 清理旧的图片文件和 figures_json,避免重新总结时残留 + _cleanup_old_images(db, paper) + raw_output = "" try: meta_path = write_meta_json(paper) diff --git a/app/templates/detail.html b/app/templates/detail.html index 56e6925..5b9869b 100644 --- a/app/templates/detail.html +++ b/app/templates/detail.html @@ -122,6 +122,16 @@ endblock %} {% block content %}

{{ paper.summary.method_novelty | safe }}

{% endif %} + {% if method_figures and method_figures|length > 0 %} + {% for fig in method_figures %} +
+ {{ fig.caption or fig.id }} +
+ {{ fig.id }}{% if fig.caption %}: {{ fig.caption }}{% endif %} +
+
+ {% endfor %} + {% endif %} {% endif %} @@ -130,8 +140,8 @@ endblock %} {% block content %}

实验结果

{{ paper.summary.results_main_json | safe }}

- {% if table_figures and table_figures|length > 0 %} - {# 优先展示原文表格截图 #} + {% if (table_figures and table_figures|length > 0) or (results_figures and results_figures|length > 0) %} + {# 展示表格截图 + 实验结果图 #} {% for tf in table_figures %}
{{ tf.caption or tf.id }} @@ -140,6 +150,14 @@ endblock %} {% block content %}
{% endfor %} + {% for fig in results_figures %} +
+ {{ fig.caption or fig.id }} +
+ {{ fig.id }}{% if fig.caption %}: {{ fig.caption }}{% endif %} +
+
+ {% endfor %} {% if benchmarks and benchmarks|length > 0 %}
查看结构化数据