feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI

- PDF extractor: rewrite from embedded bitmap extraction to caption-based page region screenshots. Finds Figure/Table captions via regex, captures the page region above the caption (figures) or below it (tables), and handles compound figures and vector graphics. - Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for the most recent N days without inserting new papers. Scheduler runs daily, 30 min after the pipeline. - Admin: add /admin/refresh-upvotes endpoint and dashboard button. - UI: remove date quick nav, show upvote update time on detail/card pages, clean up CSS date-chip styles. - Utils: add recent_date_strs() helper.
2026-06-09 18:01:01 +08:00
parent b72b5a31bb
commit 1fc6303e09
13 changed files with 460 additions and 311 deletions
@@ -41,6 +41,7 @@ class Settings(BaseSettings):
    SCHEDULE_HOUR: int = 4
    SCHEDULE_MINUTE: int = 0
    APP_WORKERS: int = 1
    UPVOTE_REFRESH_DAYS: int = 7  # 刷新最近 N 天论文的 upvotes
    # 数据库
    DATABASE_URL: str = "sqlite:///data/db/papers.db"
@@ -26,7 +26,7 @@ from app.models import (
 )
 from app.services.admin import get_admin_stats
 from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
-from app.services.crawler import crawl_daily
+from app.services.crawler import crawl_daily, refresh_upvotes
 from app.services.pipeline import run_pipeline
 from app.services.scheduler import get_scheduler
 from app.services.summarizer import summarize_batch, summarize_single
@@ -129,16 +129,20 @@ async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
    """调度器运行状态（JSON）。"""
    scheduler = get_scheduler()
    next_run = None
    upvote_next_run = None
    if scheduler:
        for job in scheduler.get_jobs():
            if job.id == "daily_pipeline":
                next_run = job.next_run_time
-                break
+            elif job.id == "upvote_refresh":
                upvote_next_run = job.next_run_time
    return {
        "enabled": scheduler is not None,
        "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
        "timezone": settings.APP_TIMEZONE,
        "next_run": next_run.isoformat() if next_run else None,
        "upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }
@@ -159,6 +163,19 @@ async def admin_trigger_pipeline(
    return {"status": "success", "message": "流水线执行完成"}
@router.post("/refresh-upvotes")
async def admin_refresh_upvotes(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    days: int | None = Query(
        None,
        # Reject zero/negative windows at the boundary: a negative value
        # would otherwise produce an empty date list downstream and crash.
        ge=1,
        description="刷新最近 N 天，默认使用配置值",
    ),
):
    """Manually refresh the upvotes of papers from the most recent N days.

    Args:
        _admin: Admin-auth dependency; its value is unused.
        db: Request-scoped database session.
        days: Size of the refresh window in days (>= 1). When omitted, the
            refresh routine applies its configured default.

    Returns:
        The result dict produced by ``refresh_upvotes``.

    Raises:
        HTTPException: 500 when the refresh reports ``status == "failed"``;
            the reported error text is passed through as ``detail``.
    """
    result = await refresh_upvotes(db, days=days)
    if result["status"] == "failed":
        raise HTTPException(status_code=500, detail=result.get("error"))
    return result
 # ── 请求模型 ──────────────────────────────────────────────────────────
@@ -315,11 +315,16 @@ def _link_figures_with_images(
    fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
    table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
-    # 提取的图片也按类型分流，按文件名排序
+    # 提取的图片按类型分流，按文件名中的编号排序
    def _sort_key(name: str) -> tuple[int, int]:
-        m = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
+        # 新格式：figure_1.png, table_1.png
        m = re.search(r'(?:figure|table)_(\d+)', name)
        if m:
-            return (int(m.group(1)), int(m.group(2)))
+            return (0, int(m.group(1)))
        # 旧格式：page2_img1.png, page5_table1.png
        m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
        if m2:
            return (int(m2.group(1)), int(m2.group(2)))
        return (0, 0)
    fig_images = sorted(
@@ -106,4 +106,5 @@ def get_admin_stats(db: Session) -> dict:
        "next_run": next_run.isoformat() if next_run else None,
        "recent_logs": recent_logs,
        "active_locks": active_locks,
        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }
@@ -16,7 +16,7 @@ from app.models import (
    SummaryState,
    SummaryStatus,
 )
-from app.utils import make_http_client, utc_now
+from app.utils import make_http_client, recent_date_strs, utc_now
 logger = logging.getLogger(__name__)
@@ -205,3 +205,81 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
        log_entry.completed_at = utc_now()
        db.commit()
        return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
 async def refresh_upvotes(db: Session, days: int | None = None) -> dict:
    """Re-fetch upvotes for papers of the most recent N days; never inserts papers.

    For each date in the window the daily listing is fetched and, for papers
    that already exist, only ``upvotes`` and ``crawled_at`` are updated. A
    single ``CrawlLog`` row with task ``"upvote_refresh"`` records the run.

    Args:
        db: Database session; this function commits (and, on errors, rolls
            back) on it.
        days: Number of most recent days to refresh. ``None`` or ``0`` falls
            back to ``settings.UPVOTE_REFRESH_DAYS``.

    Returns:
        A dict with ``status`` (``"success"``, ``"partial"`` or ``"failed"``)
        and ``updated`` (number of papers updated). Completed runs also carry
        ``days`` and ``errors`` (list of per-date messages, or ``None``);
        failed runs carry ``error`` with a human-readable message.
    """
    import json  # local import: only needed to serialise the log details

    days = days or settings.UPVOTE_REFRESH_DAYS
    if days < 1:
        # A non-positive window yields no dates, so there is nothing to log
        # against; report the problem instead of failing on date_strs[0].
        return {
            "status": "failed",
            "updated": 0,
            "error": f"days must be >= 1, got {days}",
        }

    date_strs = recent_date_strs(days)
    log_entry = CrawlLog(
        task="upvote_refresh",
        status="running",
        date=date_type.fromisoformat(date_strs[0]),
        started_at=utc_now(),
    )
    db.add(log_entry)
    db.commit()

    total_updated = 0
    errors: list[str] = []

    try:
        for ds in date_strs:
            try:
                raw_papers = await fetch_daily(ds)
                updated = _update_upvotes_only(db, raw_papers)
            except Exception as exc:
                # Discard half-applied changes of this date and put the
                # session back into a usable state for the remaining dates.
                db.rollback()
                errors.append(f"{ds}: {exc}")
                logger.warning("Failed to refresh upvotes for %s: %s", ds, exc)
            else:
                total_updated += updated
                logger.info("Refreshed upvotes for %s: %d papers", ds, updated)

        if not errors:
            status = "success"
        elif len(errors) < len(date_strs):
            status = "partial"
        else:
            # Every single date failed: that is a failure, not a partial success.
            status = "failed"

        result: dict = {
            "status": status,
            "updated": total_updated,
            "days": days,
            "errors": errors or None,
        }
        log_entry.status = status
        log_entry.papers_found = total_updated
        log_entry.details_json = json.dumps({"days": days, "errors": len(errors)})
        if status == "failed":
            summary = "; ".join(errors)
            log_entry.error = summary
            result["error"] = summary
        log_entry.completed_at = utc_now()
        db.commit()
        return result

    except Exception as exc:
        logger.exception("Upvote refresh failed")
        # The failing statement may have left the transaction unusable.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)
        log_entry.completed_at = utc_now()
        db.commit()
        return {"status": "failed", "updated": total_updated, "error": str(exc)}
 def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
    """Update upvotes of papers that already exist; unknown papers are skipped.

    Args:
        db: Database session; committed once at the end.
        papers_raw: Raw paper items, each passed through ``_parse_paper``.

    Returns:
        Number of input items that matched an existing paper and were updated.
    """
    now = datetime.now(timezone.utc)

    # Parse everything up front so that a malformed item fails before any row
    # has been touched, and so that a single query can cover the whole batch.
    metas = [meta for meta in map(_parse_paper, papers_raw) if meta["arxiv_id"]]

    by_arxiv_id: dict = {}
    if metas:
        wanted_ids = list({meta["arxiv_id"] for meta in metas})
        # One IN query instead of one SELECT per paper.
        rows = db.execute(
            select(Paper).where(Paper.arxiv_id.in_(wanted_ids))
        ).scalars().all()
        by_arxiv_id = {paper.arxiv_id: paper for paper in rows}

    updated = 0
    for meta in metas:
        existing = by_arxiv_id.get(meta["arxiv_id"])
        if existing is None:
            continue  # not in the database: this routine never inserts
        existing.upvotes = meta["upvotes"]
        existing.crawled_at = now
        updated += 1

    db.commit()
    return updated
@@ -1,12 +1,12 @@
-"""PDF 图片与表格提取 — 从 PDF 中提取嵌入图片和表格截图。
+"""PDF 图片与表格提取 — 基于 caption 定位的页面区域截图。
-策略：
+核心思路：学术论文排版极其规整，Figure caption 在图下方，Table caption 在表格上方。
-1. 提取 PDF 中嵌入的图片（图表、插图等），按页面位置排序
+因此反过来：先找 caption 文字 → 向上/向下截取页面区域 → 渲染为 PNG。
-2. 检测表格区域，渲染为截图
+
-3. 为每张图/表格提取附近的说明文字（caption），从中识别 Figure N / Table N
+优势（相比提取嵌入位图）：
-4. 根据 caption 内容矫正类型：标注为 "Figure" 的表格区域 → 归为图片
+- 复合图表不会被拆成碎片（整块截取）
-5. 序号匹配兜底：第 N 张图 → Figure N（学术论文图表严格按顺序出现）
+- 矢量图也能截取（页面渲染包含一切）
-6. 保存 manifest.json 供后续与 AI 总结的 figures 字段匹配
+- 不依赖 find_tables()（纯文本匹配 caption）
 """
 from __future__ import annotations
@@ -21,124 +21,252 @@ from app.utils import TMP_DIR
 logger = logging.getLogger(__name__)
-# 最小面积阈值（像素），小于此值的图片视为图标/装饰
+# ── 截取区域参数 ───────────────────────────────────────────────────────
 _MIN_AREA = 10_000  # ~100x100
 _MIN_DIM = 80
-# Caption 搜索区域 — Figure caption 在图下方，Table caption 在图上方
+# Figure: caption 上方搜索图的范围（点）
-_CAPTION_MARGIN = 10   # 贴边距离
+_FIGURE_MAX_HEIGHT = 450       # 最大向上搜索范围
-_CAPTION_MAX_DISTANCE = 250  # 最远搜索距离
+_FIGURE_MIN_HEIGHT = 50        # 最小有效截图高度
-_CAPTION_SIDE_PADDING = 40   # 左右扩展
+_FIGURE_DEFAULT_HEIGHT = 280   # 上方未找到内容块时的默认图高度
-# Figure/Table 标注正则
+# Table: caption 下方搜索表格的范围
-_FIGURE_CAPTION_RE = re.compile(
+_TABLE_MAX_HEIGHT = 500        # 最大向下搜索范围
-    r'\b(?:Fig\.?|Figure)\s*(\d+)\b', re.IGNORECASE
+_TABLE_MIN_HEIGHT = 30
 # caption 左右扩展（双栏论文中 caption 可能比表格窄）
 _REGION_SIDE_PADDING = 10
 # 表格通常比 caption 文字宽，使用更大的水平扩展
 _TABLE_SIDE_PADDING = 60
 # 正文行距的 2 倍 ≈ 空白间隙阈值
 _CONTENT_GAP_THRESHOLD = 30
 # ── Caption 正则 ───────────────────────────────────────────────────────
 # 要求以 Figure/Table 开头（避免匹配正文中的 "see Figure 3" 等）
 _CAPTION_RE = re.compile(
    r'^(?:Fig\.?|Figure)\s+(\d+)\s*[:\.]',
    re.IGNORECASE,
 )
 _TABLE_CAPTION_RE = re.compile(
-    r'\bTable\s*(\d+)\b', re.IGNORECASE
+    r'^Table\s+(\d+)\s*[:\.]',
    re.IGNORECASE,
 )
-def _extract_caption_text(page, bbox, page_height: float, *,
+def _find_captions(doc) -> list[dict]:
-                          search_above: bool = False,
+    """扫描整个文档，找到所有 Figure/Table caption 的位置和信息。"""
-                          search_both: bool = False) -> str | None:
+    captions = []
    """从图片/表格附近区域提取 caption 文字。
-    search_above=True：搜索上方（Table caption 通常在上）
+    for page_num in range(len(doc)):
-    默认搜索下方（Figure caption 通常在下）
+        page = doc[page_num]
-    search_both=True：上下都搜，返回包含 Figure/Table 标注的那边
+        page_width = page.rect.width
-    """
+        page_height = page.rect.height
    import pymupdf
    x0 = max(0, bbox.x0 - _CAPTION_SIDE_PADDING)
    x1 = bbox.x1 + _CAPTION_SIDE_PADDING
    def _search(y0: float, y1: float) -> str | None:
        rect = pymupdf.Rect(x0, y0, x1, y1)
        blocks = page.get_text("blocks")
-        parts: list[str] = []
+
        for block in blocks:
            if len(block) < 5:
                continue
-            block_rect = pymupdf.Rect(block[:4])
+            text = str(block[4]).strip()
-            if block_rect.intersects(rect):
+            if not text:
-                text = str(block[4]).strip()
+                continue
                if text:
                    parts.append(text)
        if parts:
            return " ".join(parts)
        text = page.get_textbox(rect)
        if text and len(text.strip()) >= 5:
            return text.strip()
        return None
-    if search_both:
+            bx0, by0, bx1, by1 = block[0], block[1], block[2], block[3]
-        # 上方
+            # 只取 block 第一行做匹配（避免 block 包含多段文字干扰）
-        above_y1 = max(0, bbox.y0 - _CAPTION_MARGIN)
+            first_line = text.split("\n")[0].strip()
        above_y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE)
        above = _search(above_y0, above_y1)
        # 下方
        below_y0 = bbox.y1 + _CAPTION_MARGIN
        below_y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE)
        below = _search(below_y0, below_y1)
-        # 优先返回包含 Figure/Table 标注的那边
+            m = _CAPTION_RE.match(first_line)
-        if above and (_FIGURE_CAPTION_RE.search(above) or _TABLE_CAPTION_RE.search(above)):
+            if m:
-            return above
+                captions.append({
-        if below and (_FIGURE_CAPTION_RE.search(below) or _TABLE_CAPTION_RE.search(below)):
+                    "type": "figure",
-            return below
+                    "num": int(m.group(1)),
-        # 否则返回更长的
+                    "label": f"Figure {m.group(1)}",
-        if above and below:
+                    "page_num": page_num,
-            return above if len(above) >= len(below) else below
+                    "caption_y0": by0,
-        return above or below
+                    "caption_y1": by1,
                    "caption_x0": bx0,
                    "caption_x1": bx1,
                    "caption_text": text,
                    "page_width": page_width,
                    "page_height": page_height,
                })
                continue
-    if search_above:
+            m = _TABLE_CAPTION_RE.match(first_line)
-        y1 = max(0, bbox.y0 - _CAPTION_MARGIN)
+            if m:
-        y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE)
+                captions.append({
                    "type": "table",
                    "num": int(m.group(1)),
                    "label": f"Table {m.group(1)}",
                    "page_num": page_num,
                    "caption_y0": by0,
                    "caption_y1": by1,
                    "caption_x0": bx0,
                    "caption_x1": bx1,
                    "caption_text": text,
                    "page_width": page_width,
                    "page_height": page_height,
                })
    return captions
 def _find_figure_top(page, caption: dict) -> float:
    """向上扫描页面，找到 Figure 的上边界。
    策略：
    1. 收集 caption 上方的所有内容块（文本 + 嵌入图片）
    2. 找到最顶部的内容块作为图的上界
    3. 检查内容块之间的大间隙（表示图从间隙下方开始）
    4. 如果没找到任何内容块，使用默认图高度
    注意：只扫描 text blocks 是不够的，因为 figure 本身是图片/矢量图，
    不会被 get_text("blocks") 返回。必须同时用 get_image_info() 检测嵌入图片。
    """
    caption_y = caption["caption_y0"]
    cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING
    cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING
    # 收集 caption 上方、同列范围内的所有内容块
    # 每个元素: (x0, y0, x1, y1)
    above_blocks: list[tuple[float, float, float, float]] = []
    # ── 1. 文本块 ──
    for b in page.get_text("blocks"):
        if len(b) < 5:
            continue
        bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
        if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT:
            if bx1 > cx0 and bx0 < cx1:
                above_blocks.append((bx0, by0, bx1, by1))
    # ── 2. 嵌入图片块 — 关键！figure 本身是图片，不是文本 ──
    for img_info in page.get_image_info():
        bbox = img_info.get("bbox")
        if bbox is None:
            continue
        # Rect 对象: x0, y0, x1, y1
        ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
        if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT:
            if ix1 > cx0 and ix0 < cx1:
                above_blocks.append((ix0, iy0, ix1, iy1))
    # ── 没有内容块 → 用默认高度（可能是纯矢量图，如 TikZ/matplotlib PDF） ──
    if not above_blocks:
        return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)
    # ── 找到内容区域的上边界 ──
    # 按 y 从下到上排序（离 caption 最近的在前）
    above_blocks.sort(key=lambda b: b[1], reverse=True)
    # 从 caption 向上扫描，找到第一个大间隙以上作为图的上界
    # 典型结构: [正文段落] ...空白... [图内容(图片/矢量)] [caption]
    # 空白间隙 ≈ 图的上边界
    figure_top = above_blocks[-1][1]  # 最上面的块顶部（默认兜底）
    prev_bottom = caption_y  # 从 caption 顶部开始向上
    for b in above_blocks:
        # b = (x0, y0, x1, y1), 我们关心 y 范围
        gap = prev_bottom - b[3]  # b[3] = by1 = 当前块底部
        if gap > _CONTENT_GAP_THRESHOLD:
            # 大间隙 → 图上边界在间隙下方
            figure_top = prev_bottom - 5
            break
        # 小间隙 → 当前块属于图的一部分（或紧挨着图），继续向上
        prev_bottom = b[1]  # b[1] = by0 = 当前块顶部
    else:
-        y0 = bbox.y1 + _CAPTION_MARGIN
+        # 所有块都紧挨着 → 图从最上面块的顶部开始
-        y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE)
+        figure_top = above_blocks[-1][1]
-    return _search(y0, y1)
+    # 限制最大高度
    if caption_y - figure_top > _FIGURE_MAX_HEIGHT:
        figure_top = caption_y - _FIGURE_MAX_HEIGHT
    # 不低于页面顶部
    figure_top = max(0, figure_top)
    return figure_top
-def _identify_label(caption_text: str | None) -> str | None:
+def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]:
-    """从 caption 文字中识别 Figure N / Table N 编号。"""
+    """向下扫描页面，找到 Table 的下边界和水平范围。
    if not caption_text:
        return None
-    m = _FIGURE_CAPTION_RE.search(caption_text)
+    返回: (x0, bottom, x1) — 裁剪区域的左、下、右边界。
-    if m:
+    上边界由调用方根据 caption 位置确定。
        return f"Figure {m.group(1)}"
-    m = _TABLE_CAPTION_RE.search(caption_text)
+    策略：
-    if m:
+    1. 收集 caption 下方的文本块（表格内容是文本）
-        return f"Table {m.group(1)}"
+    2. 找到连续内容区域的底部（遇到大间隙时停止）
    3. 同时检测表格内容的水平范围（表格通常比 caption 宽）
    """
    blocks = page.get_text("blocks")
    caption_y = caption["caption_y1"]  # caption 底部作为扫描起点
    caption_x0 = caption["caption_x0"]
    caption_x1 = caption["caption_x1"]
    page_height = caption["page_height"]
    page_width = caption["page_width"]
-    return None
+    # 先用较宽的范围收集可能的表格内容块
    search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING)
    search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING)
    below_blocks: list[tuple[float, float, float, float]] = []
    for b in blocks:
        if len(b) < 5:
            continue
        bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
        if by0 > caption_y and by0 < caption_y + _TABLE_MAX_HEIGHT:
            if bx1 > search_x0 and bx0 < search_x1:
                below_blocks.append((bx0, by0, bx1, by1))
-def _is_figure_caption(caption_text: str | None) -> bool:
+    if not below_blocks:
-    """判断 caption 是否标注为 Figure（用于矫正 find_tables 的误判）。"""
+        # 没有内容 → 使用默认高度和 caption 宽度
-    if not caption_text:
+        return (
-        return False
+            max(0, caption_x0 - _REGION_SIDE_PADDING),
-    return bool(_FIGURE_CAPTION_RE.search(caption_text))
+            min(page_height, caption_y + _TABLE_MIN_HEIGHT),
            min(page_width, caption_x1 + _REGION_SIDE_PADDING),
        )
    # ── 找到连续内容区域的底部 ──
    below_blocks.sort(key=lambda b: b[1])  # 按 y 升序
    prev_y = caption_y
    bottom = below_blocks[-1][3] + 5  # 最后一块的底部 + margin
    for b in below_blocks:
        gap = b[1] - prev_y  # b[1] = by0
        if gap > _CONTENT_GAP_THRESHOLD:
            bottom = prev_y + 5
            break
        prev_y = b[3]  # b[3] = by1
    # 限制最大高度
    if bottom - caption_y > _TABLE_MAX_HEIGHT:
        bottom = caption_y + _TABLE_MAX_HEIGHT
    # ── 检测表格内容的水平范围 ──
    # 表格通常比 caption 宽，用内容块的实际宽度
    content_x0 = min(caption_x0, min(b[0] for b in below_blocks))
    content_x1 = max(caption_x1, max(b[2] for b in below_blocks))
    # 添加边距，但不超出页面
    x0 = max(0, content_x0 - _REGION_SIDE_PADDING)
    x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING)
    return (x0, bottom, x1)
 def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
-    """从 PDF 提取嵌入图片和表格截图，生成 manifest。
+    """从 PDF 提取 Figure/Table 截图，生成 manifest。
-    匹配策略：
+    策略：找 caption → 定位区域 → 渲染页面截图。
    1. 提取图片→提取 caption 文字→从中识别 Figure/Table 编号
    2. 表格区域若 caption 标注为 "Figure"，则重分类为图片
    3. 未能从 caption 识别编号的，按（页码, 纵向位置）排序后用序号匹配兜底
    Args:
        arxiv_id: 论文 ID
        pdf_path: PDF 路径，默认 data/tmp/{arxiv_id}/paper.pdf
    Returns:
-        提取的图片+表格数量
+        提取的图片数量
    """
    import pymupdf
@@ -153,216 +281,100 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    images_dest.mkdir(parents=True, exist_ok=True)
    doc = pymupdf.open(str(pdf_path))
    captions = _find_captions(doc)
    if not captions:
        logger.info("No Figure/Table captions found in PDF for %s", arxiv_id)
        doc.close()
        return 0
    # 去重：同一页同一 label 可能匹配到多个 block（如正文引用 "Figure 7"）
    # 保留每个 (type, num) 的第一个匹配（即真正的 caption）
    seen_labels: dict[str, dict] = {}
    for cap in captions:
        key = cap["label"]
        if key not in seen_labels:
            seen_labels[key] = cap
    unique_captions = list(seen_labels.values())
    extracted = 0
-    seen_hashes: set[int] = set()
+    manifest: dict[str, dict] = {}
-    # ── 第一遍：收集所有图片和表格 ──
+    zoom = 2  # 2x 渲染，保证清晰度
    image_items: list[dict] = []
    table_items: list[dict] = []
-    for page_num in range(len(doc)):
+    for cap in unique_captions:
-        page = doc[page_num]
+        page = doc[cap["page_num"]]
-        page_height = page.rect.height
+        pw = cap["page_width"]
        ph = cap["page_height"]
-        # 1. 提取嵌入图片
+        if cap["type"] == "figure":
-        image_list = page.get_images(full=True)
+            # Figure: caption 上方是图 → 向上找图的上边界
-        for img_index, img_info in enumerate(image_list):
+            top = _find_figure_top(page, cap)
-            xref = img_info[0]
+            bottom = cap["caption_y1"] + 5  # 包含 caption
-            try:
+            # 水平范围：caption 宽度 + 边距（图和 caption 通常等宽）
-                pix = pymupdf.Pixmap(doc, xref)
+            # 但也要考虑图内容的实际宽度
-            except Exception:
+            x0 = max(0, cap["caption_x0"] - _REGION_SIDE_PADDING)
            x1 = min(pw, cap["caption_x1"] + _REGION_SIDE_PADDING)
            height = bottom - top
            if height < _FIGURE_MIN_HEIGHT:
                logger.debug(
                    "Figure %s too small (%.0fpt), skipping", cap["label"], height
                )
                continue
-            if pix.width < _MIN_DIM or pix.height < _MIN_DIM:
+        else:
-                continue
+            # Table: caption 下方是表格 → 向下找表格的下边界和水平范围
-            if pix.width * pix.height < _MIN_AREA:
+            x0, bottom, x1 = _find_table_region(page, cap)
            top = max(0, cap["caption_y0"] - 3)  # 包含 caption，上边留少许 margin
            height = bottom - top
            if height < _TABLE_MIN_HEIGHT:
                logger.debug(
                    "Table %s too small (%.0fpt), skipping", cap["label"], height
                )
                continue
-            img_hash = hash(pix.tobytes()[:1024])
+        # 渲染截取
-            if img_hash in seen_hashes:
+        clip = pymupdf.Rect(x0, top, x1, bottom)
-                continue
+        mat = pymupdf.Matrix(zoom, zoom)
            seen_hashes.add(img_hash)
            img_rects = page.get_image_rects(xref)
            if not img_rects:
                continue
            bbox = img_rects[0]
            if pix.n >= 5:
                try:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                except Exception:
                    continue
            filename = f"page{page_num + 1}_img{img_index + 1}.png"
            pix.save(str(images_dest / filename))
            extracted += 1
            caption_text = _extract_caption_text(page, bbox, page_height)
            label = _identify_label(caption_text)
            image_items.append({
                "filename": filename,
                "page": page_num + 1,
                "y0": bbox.y0,
                "caption_text": caption_text,
                "label": label,
            })
        # 2. 提取表格截图（同时搜索上方 caption，Table 标题通常在表格上方）
        try:
-            tables = page.find_tables()
+            pix = page.get_pixmap(matrix=mat, clip=clip)
        except Exception:
-            tables = None
+            logger.debug("Failed to render %s region for %s", cap["label"], arxiv_id)
            continue
-        if tables and tables.tables:
+        filename = f"{cap['label'].replace(' ', '_').lower()}.png"
-            for table_index, table in enumerate(tables.tables):
+        pix.save(str(images_dest / filename))
-                bbox = table.bbox
+        extracted += 1
                if not bbox:
                    continue
-                margin = 5
+        cap_preview = cap["caption_text"][:200] if cap["caption_text"] else ""
-                if hasattr(bbox, 'x0'):
+        manifest[filename] = {
-                    x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
+            "page": cap["page_num"] + 1,
-                    table_rect = bbox
+            "type": cap["type"],
-                else:
+            "label": cap["label"],
-                    x0, y0, x1, y1 = bbox
+            "caption_text": cap_preview,
-                    table_rect = pymupdf.Rect(x0, y0, x1, y1)
+            "figures" if cap["type"] == "figure" else "tables": [cap["label"]],
-                clip_rect = pymupdf.Rect(
+        }
-                    x0 - margin, y0 - margin, x1 + margin, y1 + margin
+        logger.debug(
-                )
+            "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s",
-
+            cap["label"], cap["page_num"] + 1,
-                zoom = 2
+            x0, top, x1, bottom, height, filename,
-                mat = pymupdf.Matrix(zoom, zoom)
+        )
                try:
                    pix = page.get_pixmap(matrix=mat, clip=clip_rect)
                except Exception:
                    continue
                if pix.width < _MIN_DIM * 2 or pix.height < 30 * 2:
                    continue
                filename = f"page{page_num + 1}_table{table_index + 1}.png"
                pix.save(str(images_dest / filename))
                extracted += 1
                # Table caption 上下都搜（学术论文惯例：Table 标题在上方，但实际排版各异）
                caption_text = _extract_caption_text(
                    page, table_rect, page_height, search_both=True,
                )
                label = _identify_label(caption_text)
                item = {
                    "filename": filename,
                    "page": page_num + 1,
                    "y0": y0,
                    "caption_text": caption_text,
                    "label": label,
                }
                # 关键：caption 标注为 Figure → 重分类为图片
                if _is_figure_caption(caption_text):
                    image_items.append(item)
                else:
                    table_items.append(item)
    doc.close()
    # ── 第二遍：矫正 find_tables 的误判 ──
    # 如果表格与同页的图片高度重叠（复合图表的子区域），且 caption 不含 "Table"，
    # 则重分类为图片，归入邻近图片的 label
    for t_item in table_items[:]:
        t_page = t_item["page"]
        t_y0 = t_item["y0"]
        same_page_images = [i for i in image_items if i["page"] == t_page]
        if not same_page_images:
            continue
        # 检查是否有重叠的图片
        nearby = [
            i for i in same_page_images
            if abs(i["y0"] - t_y0) < 50
        ]
        if nearby and not (t_item["caption_text"] and _TABLE_CAPTION_RE.search(t_item["caption_text"])):
            # 重分类为图片，继承邻近图片的 label
            neighbor_label = nearby[0].get("label")
            t_item["label"] = neighbor_label
            image_items.append(t_item)
            table_items.remove(t_item)
    # ── 第三遍：按 (page, y0) 排序 → 序号匹配兜底 ──
    image_items.sort(key=lambda it: (it["page"], it["y0"]))
    table_items.sort(key=lambda it: (it["page"], it["y0"]))
    # 统计已通过 caption 确认的 Figure/Table 编号，避免序号重复分配
    used_figure_nums: set[int] = set()
    used_table_nums: set[int] = set()
    for item in image_items:
        if item["label"]:
            m = _FIGURE_CAPTION_RE.search(item["label"])
            if m:
                used_figure_nums.add(int(m.group(1)))
    for item in table_items:
        if item["label"]:
            m = _TABLE_CAPTION_RE.search(item["label"])
            if m:
                used_table_nums.add(int(m.group(1)))
    # 为未识别编号的图片分配序号（跳过已占用的编号）
    next_fig = 1
    for item in image_items:
        if item["label"] is None:
            while next_fig in used_figure_nums:
                next_fig += 1
            item["label"] = f"Figure {next_fig}"
            used_figure_nums.add(next_fig)
    next_tbl = 1
    for item in table_items:
        if item["label"] is None:
            while next_tbl in used_table_nums:
                next_tbl += 1
            item["label"] = f"Table {next_tbl}"
            used_table_nums.add(next_tbl)
    # ── 第三遍：构建 manifest ──
    manifest: dict[str, dict] = {}
    for item in image_items:
        manifest[item["filename"]] = {
            "page": item["page"],
            "type": "image",
            "label": item["label"],
            "caption_text": item.get("caption_text"),
            "figures": [item["label"]],
        }
    for item in table_items:
        manifest[item["filename"]] = {
            "page": item["page"],
            "type": "table",
            "label": item["label"],
            "caption_text": item.get("caption_text"),
            "tables": [item["label"]],
        }
    # 保存 manifest
    manifest_path = images_dest / "manifest.json"
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2)
    )
    captioned = sum(
        1 for it in image_items + table_items if it["caption_text"]
    )
    label_matched = sum(
        1 for it in image_items + table_items
        if it["caption_text"] and _identify_label(it["caption_text"])
    )
    if extracted > 0:
        logger.info(
-            "Extracted %d items from PDF for %s "
+            "Extracted %d figure/table screenshots from PDF for %s "
-            "(%d images, %d tables, %d with captions, %d label-matched)",
+            "(from %d captions found, %d unique)",
-            extracted, arxiv_id,
+            extracted, arxiv_id, len(captions), len(unique_captions),
            len(image_items), len(table_items), captioned, label_matched,
        )
    return extracted
@@ -12,6 +12,7 @@ from zoneinfo import ZoneInfo
 from app.config import settings
 from app.database import SessionLocal
 from app.services.pipeline import run_pipeline
 from app.services.crawler import refresh_upvotes
 from app.utils import today_str
 logger = logging.getLogger(__name__)
@@ -66,6 +67,22 @@ def start_scheduler() -> AsyncIOScheduler | None:
        misfire_grace_time=3600,  # 允许迟到 1 小时内补执行
    )
    # upvote 刷新：每天流水线之后 30 分钟执行，刷新最近 7 天论文的投票数
    upvote_trigger = CronTrigger(
        hour=settings.SCHEDULE_HOUR,
        minute=settings.SCHEDULE_MINUTE + 30,
        timezone=tz,
    )
    scheduler.add_job(
        _upvote_refresh,
        trigger=upvote_trigger,
        id="upvote_refresh",
        name="upvote_refresh",
        replace_existing=True,
        max_instances=1,
        misfire_grace_time=3600,
    )
    scheduler.start()
    _scheduler = scheduler
    logger.info(
@@ -102,3 +119,19 @@ async def _daily_pipeline() -> None:
        logger.exception("Unexpected error in daily pipeline")
    finally:
        db.close()
 async def _upvote_refresh() -> None:
    """Scheduler job: run the upvote refresh on a session of its own.

    Any exception is logged and swallowed; the session is always closed.
    """
    session: Session = SessionLocal()
    try:
        outcome = await refresh_upvotes(session)
        status = outcome.get("status")
        updated_count = outcome.get("updated", 0)
        logger.info(
            "Upvote refresh completed: status=%s updated=%d",
            status,
            updated_count,
        )
    except Exception:
        logger.exception("Unexpected error in upvote refresh")
    finally:
        session.close()
@@ -138,20 +138,7 @@ a:hover {
 }
 /* ── Date Quick Nav ─────────────────────────────────────────────── */
 .date-quick-nav {
  margin-top: 32px;
  padding-top: 16px;
  border-top: 1px solid var(--border);
  font-size: 0.85rem;
  color: var(--ink-light);
  display: flex;
  align-items: center;
  gap: 8px;
  flex-wrap: wrap;
 }
 /* ── Chips (shared) ─────────────────────────────────────────────── */
 .date-chip,
 .tag-chip,
 .filter-chip {
  display: inline-block;
@@ -162,14 +149,12 @@ a:hover {
  font-size: 0.8rem;
  color: var(--ink-light);
 }
 .date-chip:hover,
 .tag-chip:hover,
 .filter-chip:hover {
  border-color: var(--accent);
  color: var(--accent);
  text-decoration: none;
 }
 .date-chip.active,
 .tag-chip.active,
 .filter-chip.active {
  background: var(--accent);
@@ -352,6 +337,11 @@ a:hover {
  margin-bottom: 12px;
 }
 /* Timestamp shown beside the upvote count on the paper detail page. */
 .detail-upvote-time {
  font-size: 0.78rem;
  /* Reuses the border colour variable as the text colour. */
  color: var(--border);
 }
 .detail-tags {
  margin-bottom: 12px;
  display: flex;
@@ -33,6 +33,7 @@
    <button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
    <button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
    <button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
    <button class="admin-action-btn" onclick="refreshUpvotes()">👍 刷新投票</button>
  </div>
  <div class="admin-info-grid">
@@ -59,6 +60,10 @@
          <span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span>
        </div>
        {% endif %}
        <div class="info-row">
          <span class="info-label">投票刷新</span>
          <span class="info-value">每日自动刷新最近 {{ stats.upvote_refresh_days | default(7) }} 天</span>
        </div>
        {% if stats.active_locks %}
        <div class="info-row">
          <span class="info-label">活跃任务</span>
@@ -181,5 +186,12 @@
      .then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); })
      .catch(err => showToast("❌ 请求失败"));
  }
  // Trigger a manual upvote refresh and report the outcome in a toast.
  function refreshUpvotes() {
    fetch("/admin/refresh-upvotes", { method: "POST", headers: { "Content-Type": "application/json" } })
      .then(r => {
        // Not authenticated: send the admin to the login page.
        if (r.status === 303 || r.status === 401) { window.location.href = "/admin/login"; return null; }
        // Keep the HTTP outcome next to the payload so that error responses
        // are not mistaken for success.
        return r.json().then(data => ({ ok: r.ok, data: data }));
      })
      .then(res => {
        if (!res) return;
        const data = res.data || {};
        // The endpoint reports failures through HTTPException, whose JSON
        // body carries the message under "detail" (a string, or a list for
        // validation errors) rather than "error".
        const rawError = data.error || data.detail || (res.ok ? null : "请求失败");
        if (rawError) {
          const text = typeof rawError === "string" ? rawError : JSON.stringify(rawError);
          showToast("❌ " + text.substring(0, 200));
          return;
        }
        const count = data.updated || 0;
        if (data.status === "partial") {
          // Some dates failed: do not present this as a full success.
          const failedDays = Array.isArray(data.errors) ? data.errors.length : 0;
          showToast(`⚠️ 已刷新 ${count} 篇论文投票，${failedDays} 天失败`);
          return;
        }
        showToast(`✅ 已刷新 ${count} 篇论文投票`);
      })
      .catch(err => showToast("❌ 请求失败"));
  }
 </script>
 {% endblock %}
@@ -22,6 +22,9 @@ endblock %} {% block content %}
      >📅 {{ paper.published_at or paper.paper_date }}</span
    >
    <span class="detail-upvotes">👍 {{ paper.upvotes }}</span>
    {% if paper.crawled_at %}
    <span class="detail-upvote-time" title="投票数据更新时间">{{ paper.crawled_at.strftime('%m-%d %H:%M') }}</span>
    {% endif %}
  </div>
  {# 标签 #} {% if paper.tags %}
@@ -23,16 +23,6 @@ endblock %} {% block content %}
 </div>
 {% endif %}
 <div class="date-quick-nav">
  <span>有数据的日期：</span>
  {% for d in available_dates[:10] %}
  <a
    href="/day/{{ d }}"
    class="date-chip {% if d == current_date %}active{% endif %}"
    >{{ d }}</a
  >
  {% endfor %}
 </div>
 {% endblock %}
 {% block scripts %}
@@ -20,7 +20,7 @@
        {% endif %}
      </a>
    </h2>
-    <span class="paper-upvotes">👍 {{ paper.upvotes }}</span>
+    <span class="paper-upvotes" title="数据更新于 {{ paper.crawled_at.strftime('%m-%d %H:%M') if paper.crawled_at else '' }}">👍 {{ paper.upvotes }}</span>
    {% if variant == 'search' and distances and paper.arxiv_id in distances %}
    <span class="similarity-score" title="语义相似度距离">
      🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}
@@ -57,6 +57,13 @@ def yesterday_str() -> str:
    return yesterday.isoformat()
 def recent_date_strs(n: int) -> list[str]:
    """Return ISO date strings for the last ``n`` days, newest first.

    The first entry is today's date in ``settings.APP_TIMEZONE``; each
    following entry is one day earlier. ``n <= 0`` yields an empty list.
    """
    anchor = datetime.now(ZoneInfo(settings.APP_TIMEZONE)).date()
    stamps: list[str] = []
    for offset in range(n):
        day = anchor - timedelta(days=offset)
        stamps.append(day.isoformat())
    return stamps
 def latest_paper_date(db) -> str:
    """查询数据库中最新的 paper_date，无数据时回退到 today_str()。"""
    from sqlalchemy import func, select