feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI

- PDF extractor: rewrite from embedded bitmap extraction to caption-based page region screenshots. Finds Figure/Table captions via regex, captures the page region above (figures) or below (tables) each caption, and handles compound figures and vector graphics.
- Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for the most recent N days without inserting new papers. The scheduler runs it daily, 30 minutes after the pipeline.
- Admin: add /admin/refresh-upvotes endpoint and dashboard button.
- UI: remove date quick nav, show upvote update time on detail/card pages, clean up CSS date-chip styles.
- Utils: add recent_date_strs() helper.
2026-06-09 18:01:01 +08:00
parent b72b5a31bb
commit 1fc6303e09
13 changed files with 460 additions and 311 deletions
@@ -41,6 +41,7 @@ class Settings(BaseSettings):
    SCHEDULE_HOUR: int = 4
    SCHEDULE_MINUTE: int = 0
    APP_WORKERS: int = 1
+    UPVOTE_REFRESH_DAYS: int = 7  # 刷新最近 N 天论文的 upvotes

    # 数据库
    DATABASE_URL: str = "sqlite:///data/db/papers.db"
@@ -26,7 +26,7 @@ from app.models import (
 )
 from app.services.admin import get_admin_stats
 from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
-from app.services.crawler import crawl_daily
+from app.services.crawler import crawl_daily, refresh_upvotes
 from app.services.pipeline import run_pipeline
 from app.services.scheduler import get_scheduler
 from app.services.summarizer import summarize_batch, summarize_single
@@ -129,16 +129,20 @@ async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
    """调度器运行状态（JSON）。"""
    scheduler = get_scheduler()
    next_run = None
+    upvote_next_run = None
    if scheduler:
        for job in scheduler.get_jobs():
            if job.id == "daily_pipeline":
                next_run = job.next_run_time
-                break
+            elif job.id == "upvote_refresh":
+                upvote_next_run = job.next_run_time
    return {
        "enabled": scheduler is not None,
        "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
        "timezone": settings.APP_TIMEZONE,
        "next_run": next_run.isoformat() if next_run else None,
+        "upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
+        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }


@@ -159,6 +163,19 @@ async def admin_trigger_pipeline(
    return {"status": "success", "message": "流水线执行完成"}


+@router.post("/refresh-upvotes")
+async def admin_refresh_upvotes(
+    _admin: None = Depends(verify_admin),
+    db: Session = Depends(get_db),
+    # ge=1 rejects zero/negative windows at the request boundary instead of
+    # letting them reach the crawler, where an empty date list would crash.
+    days: int | None = Query(None, ge=1, description="刷新最近 N 天，默认使用配置值"),
+):
+    """Manually refresh upvotes for papers from the most recent N days.
+
+    When ``days`` is omitted, ``refresh_upvotes`` falls back to its configured
+    default. Returns the result dict from ``refresh_upvotes`` unchanged.
+
+    Raises:
+        HTTPException: 500 when the refresh reports status ``"failed"``; the
+            reported error message is passed through as ``detail``.
+    """
+    result = await refresh_upvotes(db, days=days)
+    if result["status"] == "failed":
+        raise HTTPException(status_code=500, detail=result.get("error"))
+    return result
+
+
 # ── 请求模型 ──────────────────────────────────────────────────────────


@@ -315,11 +315,16 @@ def _link_figures_with_images(
    fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
    table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]

-    # 提取的图片也按类型分流，按文件名排序
+    # 提取的图片按类型分流，按文件名中的编号排序
    def _sort_key(name: str) -> tuple[int, int]:
-        m = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
+        # 新格式：figure_1.png, table_1.png
+        m = re.search(r'(?:figure|table)_(\d+)', name)
        if m:
-            return (int(m.group(1)), int(m.group(2)))
+            return (0, int(m.group(1)))
+        # 旧格式：page2_img1.png, page5_table1.png
+        m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
+        if m2:
+            return (int(m2.group(1)), int(m2.group(2)))
        return (0, 0)

    fig_images = sorted(
@@ -106,4 +106,5 @@ def get_admin_stats(db: Session) -> dict:
        "next_run": next_run.isoformat() if next_run else None,
        "recent_logs": recent_logs,
        "active_locks": active_locks,
+        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }
@@ -16,7 +16,7 @@ from app.models import (
    SummaryState,
    SummaryStatus,
 )
-from app.utils import make_http_client, utc_now
+from app.utils import make_http_client, recent_date_strs, utc_now

 logger = logging.getLogger(__name__)

@@ -205,3 +205,81 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
        log_entry.completed_at = utc_now()
        db.commit()
        return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
+
+
+async def refresh_upvotes(db: Session, days: int | None = None) -> dict:
+    """Re-fetch upvotes for papers from the most recent N days without inserting new papers.
+
+    Calls ``fetch_daily`` once per date and, for papers that already exist,
+    updates only ``upvotes`` and ``crawled_at``. The run is recorded in a
+    ``CrawlLog`` row with task ``"upvote_refresh"``.
+
+    Args:
+        db: Database session used for the log row and the paper updates.
+        days: Number of most recent days to refresh. ``None`` or a value
+            below 1 falls back to ``settings.UPVOTE_REFRESH_DAYS``.
+
+    Returns:
+        A dict with ``status`` (``"success"``, ``"partial"`` or ``"failed"``),
+        ``updated`` and ``days``. ``errors`` lists the per-date failures (or is
+        ``None``); ``error`` carries a message whenever the status is
+        ``"failed"``.
+    """
+    import json
+
+    # A non-positive value would yield an empty date list and crash on
+    # date_strs[0] below, so treat it the same as "not provided".
+    if days is None or days < 1:
+        days = settings.UPVOTE_REFRESH_DAYS
+    date_strs = recent_date_strs(days)
+    if not date_strs:
+        # Only reachable when the configured default itself is below 1.
+        message = f"no dates to refresh (days={days})"
+        logger.warning("Upvote refresh skipped: %s", message)
+        return {"status": "failed", "updated": 0, "days": days, "error": message}
+
+    log_entry = CrawlLog(
+        task="upvote_refresh",
+        status="running",
+        date=date_type.fromisoformat(date_strs[0]),
+        started_at=utc_now(),
+    )
+    db.add(log_entry)
+    db.commit()
+
+    total_updated = 0
+    errors: list[str] = []
+
+    try:
+        for ds in date_strs:
+            try:
+                raw_papers = await fetch_daily(ds)
+                updated = _update_upvotes_only(db, raw_papers)
+            except Exception as exc:
+                # Best effort per date. Roll back so a failed flush/commit does
+                # not leave the session unusable for the remaining dates and
+                # for the final log update.
+                db.rollback()
+                errors.append(f"{ds}: {exc}")
+                logger.warning("Failed to refresh upvotes for %s: %s", ds, exc)
+            else:
+                total_updated += updated
+                logger.info("Refreshed upvotes for %s: %d papers", ds, updated)
+
+        # "partial" only when at least one date succeeded; a run in which
+        # every date failed is a failure, not a partial success.
+        if not errors:
+            status = "success"
+        elif len(errors) < len(date_strs):
+            status = "partial"
+        else:
+            status = "failed"
+
+        log_entry.status = status
+        log_entry.papers_found = total_updated
+        log_entry.details_json = json.dumps({"days": days, "errors": len(errors)})
+        if status == "failed":
+            log_entry.error = "; ".join(errors)
+        log_entry.completed_at = utc_now()
+        db.commit()
+
+        result = {
+            "status": status,
+            "updated": total_updated,
+            "days": days,
+            "errors": errors or None,
+        }
+        if status == "failed":
+            result["error"] = "; ".join(errors)
+        return result
+    except Exception as exc:
+        logger.exception("Upvote refresh failed")
+        # Clear any broken transaction before recording the failure.
+        db.rollback()
+        log_entry.status = "failed"
+        log_entry.error = str(exc)
+        log_entry.completed_at = utc_now()
+        db.commit()
+        return {
+            "status": "failed",
+            "updated": total_updated,
+            "days": days,
+            "error": str(exc),
+        }
+
+
+def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
+    """Update upvotes on papers that already exist; items for unknown papers are skipped.
+
+    Each raw item is parsed with ``_parse_paper`` and looked up by
+    ``arxiv_id``. Matching rows get the new upvote count and a fresh
+    ``crawled_at``. All changes are committed once at the end.
+
+    Returns:
+        The number of existing papers that were updated.
+    """
+    stamp = datetime.now(timezone.utc)
+    touched = 0
+
+    for raw in papers_raw:
+        parsed = _parse_paper(raw)
+        paper_id = parsed["arxiv_id"]
+        if paper_id:
+            lookup = select(Paper).where(Paper.arxiv_id == paper_id)
+            row = db.execute(lookup).scalar_one_or_none()
+            if row:
+                row.upvotes = parsed["upvotes"]
+                row.crawled_at = stamp
+                touched += 1
+
+    db.commit()
+    return touched
@@ -1,12 +1,12 @@
-"""PDF 图片与表格提取 — 从 PDF 中提取嵌入图片和表格截图。
+"""PDF 图片与表格提取 — 基于 caption 定位的页面区域截图。

-策略：
-1. 提取 PDF 中嵌入的图片（图表、插图等），按页面位置排序
-2. 检测表格区域，渲染为截图
-3. 为每张图/表格提取附近的说明文字（caption），从中识别 Figure N / Table N
-4. 根据 caption 内容矫正类型：标注为 "Figure" 的表格区域 → 归为图片
-5. 序号匹配兜底：第 N 张图 → Figure N（学术论文图表严格按顺序出现）
-6. 保存 manifest.json 供后续与 AI 总结的 figures 字段匹配
+核心思路：学术论文排版极其规整，Figure caption 在图下方，Table caption 在表格上方。
+因此反过来：先找 caption 文字 → 向上/向下截取页面区域 → 渲染为 PNG。
+
+优势（相比提取嵌入位图）：
+- 复合图表不会被拆成碎片（整块截取）
+- 矢量图也能截取（页面渲染包含一切）
+- 不依赖 find_tables()（纯文本匹配 caption）
 """

 from __future__ import annotations
@@ -21,124 +21,252 @@ from app.utils import TMP_DIR

 logger = logging.getLogger(__name__)

-# 最小面积阈值（像素），小于此值的图片视为图标/装饰
-_MIN_AREA = 10_000  # ~100x100
-_MIN_DIM = 80
+# ── 截取区域参数 ───────────────────────────────────────────────────────

-# Caption 搜索区域 — Figure caption 在图下方，Table caption 在图上方
-_CAPTION_MARGIN = 10   # 贴边距离
-_CAPTION_MAX_DISTANCE = 250  # 最远搜索距离
-_CAPTION_SIDE_PADDING = 40   # 左右扩展
+# Figure: caption 上方搜索图的范围（点）
+_FIGURE_MAX_HEIGHT = 450       # 最大向上搜索范围
+_FIGURE_MIN_HEIGHT = 50        # 最小有效截图高度
+_FIGURE_DEFAULT_HEIGHT = 280   # 上方未找到内容块时的默认图高度

-# Figure/Table 标注正则
-_FIGURE_CAPTION_RE = re.compile(
-    r'\b(?:Fig\.?|Figure)\s*(\d+)\b', re.IGNORECASE
+# Table: caption 下方搜索表格的范围
+_TABLE_MAX_HEIGHT = 500        # 最大向下搜索范围
+_TABLE_MIN_HEIGHT = 30
+
+# caption 左右扩展（双栏论文中 caption 可能比表格窄）
+_REGION_SIDE_PADDING = 10
+# 表格通常比 caption 文字宽，使用更大的水平扩展
+_TABLE_SIDE_PADDING = 60
+
+# 正文行距的 2 倍 ≈ 空白间隙阈值
+_CONTENT_GAP_THRESHOLD = 30
+
+
+# ── Caption 正则 ───────────────────────────────────────────────────────
+
+# 要求以 Figure/Table 开头（避免匹配正文中的 "see Figure 3" 等）
+_CAPTION_RE = re.compile(
+    r'^(?:Fig\.?|Figure)\s+(\d+)\s*[:\.]',
+    re.IGNORECASE,
 )
 _TABLE_CAPTION_RE = re.compile(
-    r'\bTable\s*(\d+)\b', re.IGNORECASE
+    r'^Table\s+(\d+)\s*[:\.]',
+    re.IGNORECASE,
 )


-def _extract_caption_text(page, bbox, page_height: float, *,
-                          search_above: bool = False,
-                          search_both: bool = False) -> str | None:
-    """从图片/表格附近区域提取 caption 文字。
+def _find_captions(doc) -> list[dict]:
+    """Scan the whole document and collect every Figure/Table caption with its position.
+
+    A text block counts as a caption when its first line matches
+    ``_CAPTION_RE`` (figure) or ``_TABLE_CAPTION_RE`` (table).
+
+    Returns:
+        One dict per matching block, in the order pages and blocks are
+        visited, holding the caption type, number, label, 0-based page
+        number, block bounding box, full block text and the page dimensions.
+    """
+    captions = []

-    search_above=True：搜索上方（Table caption 通常在上）
-    默认搜索下方（Figure caption 通常在下）
-    search_both=True：上下都搜，返回包含 Figure/Table 标注的那边
-    """
-    import pymupdf
-
-    x0 = max(0, bbox.x0 - _CAPTION_SIDE_PADDING)
-    x1 = bbox.x1 + _CAPTION_SIDE_PADDING
-
-    def _search(y0: float, y1: float) -> str | None:
-        rect = pymupdf.Rect(x0, y0, x1, y1)
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        page_width = page.rect.width
+        page_height = page.rect.height
         blocks = page.get_text("blocks")
-        parts: list[str] = []
+
         for block in blocks:
             if len(block) < 5:
                 continue
-            block_rect = pymupdf.Rect(block[:4])
-            if block_rect.intersects(rect):
-                text = str(block[4]).strip()
-                if text:
-                    parts.append(text)
-        if parts:
-            return " ".join(parts)
-        text = page.get_textbox(rect)
-        if text and len(text.strip()) >= 5:
-            return text.strip()
-        return None
+            text = str(block[4]).strip()
+            if not text:
+                continue

-    if search_both:
-        # 上方
-        above_y1 = max(0, bbox.y0 - _CAPTION_MARGIN)
-        above_y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE)
-        above = _search(above_y0, above_y1)
-        # 下方
-        below_y0 = bbox.y1 + _CAPTION_MARGIN
-        below_y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE)
-        below = _search(below_y0, below_y1)
+            bx0, by0, bx1, by1 = block[0], block[1], block[2], block[3]
+            # Match only the block's first line, so that a block holding
+            # several paragraphs of text does not interfere with the match.
+            first_line = text.split("\n")[0].strip()

-        # 优先返回包含 Figure/Table 标注的那边
-        if above and (_FIGURE_CAPTION_RE.search(above) or _TABLE_CAPTION_RE.search(above)):
-            return above
-        if below and (_FIGURE_CAPTION_RE.search(below) or _TABLE_CAPTION_RE.search(below)):
-            return below
-        # 否则返回更长的
-        if above and below:
-            return above if len(above) >= len(below) else below
-        return above or below
+            m = _CAPTION_RE.match(first_line)
+            if m:
+                captions.append({
+                    "type": "figure",
+                    "num": int(m.group(1)),
+                    "label": f"Figure {m.group(1)}",
+                    "page_num": page_num,
+                    "caption_y0": by0,
+                    "caption_y1": by1,
+                    "caption_x0": bx0,
+                    "caption_x1": bx1,
+                    "caption_text": text,
+                    "page_width": page_width,
+                    "page_height": page_height,
+                })
+                continue

-    if search_above:
-        y1 = max(0, bbox.y0 - _CAPTION_MARGIN)
-        y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE)
+            m = _TABLE_CAPTION_RE.match(first_line)
+            if m:
+                captions.append({
+                    "type": "table",
+                    "num": int(m.group(1)),
+                    "label": f"Table {m.group(1)}",
+                    "page_num": page_num,
+                    "caption_y0": by0,
+                    "caption_y1": by1,
+                    "caption_x0": bx0,
+                    "caption_x1": bx1,
+                    "caption_text": text,
+                    "page_width": page_width,
+                    "page_height": page_height,
+                })
+
+    return captions
+
+
+def _find_figure_top(page, caption: dict) -> float:
+    """Scan upward from a Figure caption and return the figure's top edge (y coordinate).
+
+    Strategy:
+    1. Collect every content block above the caption (text blocks and embedded images).
+    2. Use the top of the topmost block as the fallback upper bound.
+    3. Look for a large vertical gap between blocks; the figure starts just below it.
+    4. If no block is found at all, assume a default figure height.
+
+    Text blocks alone are not enough because the figure itself is usually an
+    image or vector drawing rather than text, so embedded images are collected
+    as well via ``get_image_info()``.
+
+    Args:
+        page: The page holding the caption.
+        caption: Caption dict; ``caption_y0``, ``caption_x0`` and ``caption_x1`` are read.
+
+    Returns:
+        The y coordinate of the figure's top edge, clamped to at most
+        ``_FIGURE_MAX_HEIGHT`` above the caption and never below 0.
+    """
+    caption_y = caption["caption_y0"]
+    cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING
+    cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING
+
+    # Content blocks above the caption that overlap its column horizontally.
+    # Each element is (x0, y0, x1, y1).
+    above_blocks: list[tuple[float, float, float, float]] = []
+
+    # ── 1. Text blocks ──
+    for b in page.get_text("blocks"):
+        if len(b) < 5:
+            continue
+        bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
+        if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT:
+            if bx1 > cx0 and bx0 < cx1:
+                above_blocks.append((bx0, by0, bx1, by1))
+
+    # ── 2. Embedded image blocks — essential: the figure itself is an image, not text ──
+    for img_info in page.get_image_info():
+        bbox = img_info.get("bbox")
+        if bbox is None:
+            continue
+        # get_image_info() reports bbox as a plain (x0, y0, x1, y1) sequence,
+        # not a Rect, so attribute access (bbox.x0) would fail. Positional
+        # unpacking works for a tuple and for a Rect alike.
+        ix0, iy0, ix1, iy1 = tuple(bbox)[:4]
+        if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT:
+            if ix1 > cx0 and ix0 < cx1:
+                above_blocks.append((ix0, iy0, ix1, iy1))
+
+    # ── No blocks at all → default height (possibly a pure vector figure, e.g. TikZ/matplotlib PDF) ──
+    if not above_blocks:
+        return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)
+
+    # ── Find the upper bound of the content region ──
+    # Sort by y0 descending, so the blocks nearest the caption come first.
+    above_blocks.sort(key=lambda b: b[1], reverse=True)
+
+    # Walk upward from the caption; the first large gap marks the figure's top.
+    # Typical layout: [body paragraph] ...blank... [figure content (image/vector)] [caption]
+    # The blank gap ≈ the figure's upper boundary.
+    figure_top = above_blocks[-1][1]  # top of the topmost block (fallback)
+
+    prev_bottom = caption_y  # start at the caption's top edge and move upward
+    for b in above_blocks:
+        # b = (x0, y0, x1, y1); only the y range matters here
+        gap = prev_bottom - b[3]  # b[3] = by1 = bottom of the current block
+        if gap > _CONTENT_GAP_THRESHOLD:
+            # Large gap → the figure's top edge sits just below the gap
+            figure_top = prev_bottom - 5
+            break
+        # Small gap → this block is part of (or right next to) the figure; keep going up
+        prev_bottom = b[1]  # b[1] = by0 = top of the current block
     else:
-        y0 = bbox.y1 + _CAPTION_MARGIN
-        y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE)
+        # Every block is adjacent to the next → the figure starts at the topmost block's top
+        figure_top = above_blocks[-1][1]

-    return _search(y0, y1)
+    # Cap the maximum height
+    if caption_y - figure_top > _FIGURE_MAX_HEIGHT:
+        figure_top = caption_y - _FIGURE_MAX_HEIGHT
+
+    # Never go above the top of the page
+    figure_top = max(0, figure_top)
+
+    return figure_top


-def _identify_label(caption_text: str | None) -> str | None:
-    """从 caption 文字中识别 Figure N / Table N 编号。"""
-    if not caption_text:
-        return None
+def _find_table_region(page, caption: dict) -> tuple[float, float, float]:
+    """Scan downward from a Table caption to find the table's bottom edge and horizontal extent.

-    m = _FIGURE_CAPTION_RE.search(caption_text)
-    if m:
-        return f"Figure {m.group(1)}"
+    Returns: (x0, bottom, x1) — the left, bottom and right edges of the crop region.
+    The top edge is determined by the caller from the caption position.

-    m = _TABLE_CAPTION_RE.search(caption_text)
-    if m:
-        return f"Table {m.group(1)}"
+    Strategy:
+    1. Collect the text blocks below the caption (table content is text).
+    2. Find the bottom of the contiguous content region (stop at a large gap).
+    3. Also detect the horizontal extent of the content (tables are usually wider than the caption).
+    """
+    blocks = page.get_text("blocks")
+    caption_y = caption["caption_y1"]  # caption bottom is where the scan starts
+    caption_x0 = caption["caption_x0"]
+    caption_x1 = caption["caption_x1"]
+    page_height = caption["page_height"]
+    page_width = caption["page_width"]

-    return None
+    # Collect candidate table blocks using a wider horizontal search range first
+    search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING)
+    search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING)

+    below_blocks: list[tuple[float, float, float, float]] = []
+    for b in blocks:
+        if len(b) < 5:
+            continue
+        bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
+        if by0 > caption_y and by0 < caption_y + _TABLE_MAX_HEIGHT:
+            if bx1 > search_x0 and bx0 < search_x1:
+                below_blocks.append((bx0, by0, bx1, by1))

-def _is_figure_caption(caption_text: str | None) -> bool:
-    """判断 caption 是否标注为 Figure（用于矫正 find_tables 的误判）。"""
-    if not caption_text:
-        return False
-    return bool(_FIGURE_CAPTION_RE.search(caption_text))
+    if not below_blocks:
+        # Nothing below the caption → minimum table height and the caption's width (plus padding)
+        return (
+            max(0, caption_x0 - _REGION_SIDE_PADDING),
+            min(page_height, caption_y + _TABLE_MIN_HEIGHT),
+            min(page_width, caption_x1 + _REGION_SIDE_PADDING),
+        )
+
+    # ── Find the bottom of the contiguous content region ──
+    below_blocks.sort(key=lambda b: b[1])  # ascending by y0
+
+    prev_y = caption_y
+    bottom = below_blocks[-1][3] + 5  # bottom of the last block + margin
+
+    for b in below_blocks:
+        gap = b[1] - prev_y  # b[1] = by0
+        if gap > _CONTENT_GAP_THRESHOLD:
+            bottom = prev_y + 5
+            break
+        prev_y = b[3]  # b[3] = by1
+
+    # Cap the maximum height
+    if bottom - caption_y > _TABLE_MAX_HEIGHT:
+        bottom = caption_y + _TABLE_MAX_HEIGHT
+
+    # ── Detect the horizontal extent of the table content ──
+    # Tables are usually wider than the caption, so use the blocks' actual extent
+    content_x0 = min(caption_x0, min(b[0] for b in below_blocks))
+    content_x1 = max(caption_x1, max(b[2] for b in below_blocks))
+
+    # Add padding without leaving the page
+    x0 = max(0, content_x0 - _REGION_SIDE_PADDING)
+    x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING)
+
+    return (x0, bottom, x1)


 def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
-    """从 PDF 提取嵌入图片和表格截图，生成 manifest。
+    """从 PDF 提取 Figure/Table 截图，生成 manifest。

-    匹配策略：
-    1. 提取图片→提取 caption 文字→从中识别 Figure/Table 编号
-    2. 表格区域若 caption 标注为 "Figure"，则重分类为图片
-    3. 未能从 caption 识别编号的，按（页码, 纵向位置）排序后用序号匹配兜底
+    策略：找 caption → 定位区域 → 渲染页面截图。

    Args:
        arxiv_id: 论文 ID
        pdf_path: PDF 路径，默认 data/tmp/{arxiv_id}/paper.pdf

    Returns:
-        提取的图片+表格数量
+        提取的图片数量
    """
    import pymupdf

@@ -153,216 +281,100 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
    images_dest.mkdir(parents=True, exist_ok=True)

    doc = pymupdf.open(str(pdf_path))
+    captions = _find_captions(doc)
+
+    if not captions:
+        logger.info("No Figure/Table captions found in PDF for %s", arxiv_id)
+        doc.close()
+        return 0
+
+    # 去重：同一页同一 label 可能匹配到多个 block（如正文引用 "Figure 7"）
+    # 保留每个 (type, num) 的第一个匹配（即真正的 caption）
+    seen_labels: dict[str, dict] = {}
+    for cap in captions:
+        key = cap["label"]
+        if key not in seen_labels:
+            seen_labels[key] = cap
+
+    unique_captions = list(seen_labels.values())
    extracted = 0
-    seen_hashes: set[int] = set()
+    manifest: dict[str, dict] = {}

-    # ── 第一遍：收集所有图片和表格 ──
-    image_items: list[dict] = []
-    table_items: list[dict] = []
+    zoom = 2  # 2x 渲染，保证清晰度

-    for page_num in range(len(doc)):
-        page = doc[page_num]
-        page_height = page.rect.height
+    for cap in unique_captions:
+        page = doc[cap["page_num"]]
+        pw = cap["page_width"]
+        ph = cap["page_height"]

-        # 1. 提取嵌入图片
-        image_list = page.get_images(full=True)
-        for img_index, img_info in enumerate(image_list):
-            xref = img_info[0]
-            try:
-                pix = pymupdf.Pixmap(doc, xref)
-            except Exception:
+        if cap["type"] == "figure":
+            # Figure: caption 上方是图 → 向上找图的上边界
+            top = _find_figure_top(page, cap)
+            bottom = cap["caption_y1"] + 5  # 包含 caption
+            # 水平范围：caption 宽度 + 边距（图和 caption 通常等宽）
+            # 但也要考虑图内容的实际宽度
+            x0 = max(0, cap["caption_x0"] - _REGION_SIDE_PADDING)
+            x1 = min(pw, cap["caption_x1"] + _REGION_SIDE_PADDING)
+
+            height = bottom - top
+            if height < _FIGURE_MIN_HEIGHT:
+                logger.debug(
+                    "Figure %s too small (%.0fpt), skipping", cap["label"], height
+                )
                continue

-            if pix.width < _MIN_DIM or pix.height < _MIN_DIM:
-                continue
-            if pix.width * pix.height < _MIN_AREA:
+        else:
+            # Table: caption 下方是表格 → 向下找表格的下边界和水平范围
+            x0, bottom, x1 = _find_table_region(page, cap)
+            top = max(0, cap["caption_y0"] - 3)  # 包含 caption，上边留少许 margin
+
+            height = bottom - top
+            if height < _TABLE_MIN_HEIGHT:
+                logger.debug(
+                    "Table %s too small (%.0fpt), skipping", cap["label"], height
+                )
                continue

-            img_hash = hash(pix.tobytes()[:1024])
-            if img_hash in seen_hashes:
-                continue
-            seen_hashes.add(img_hash)
-
-            img_rects = page.get_image_rects(xref)
-            if not img_rects:
-                continue
-            bbox = img_rects[0]
-
-            if pix.n >= 5:
-                try:
-                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
-                except Exception:
-                    continue
-
-            filename = f"page{page_num + 1}_img{img_index + 1}.png"
-            pix.save(str(images_dest / filename))
-            extracted += 1
-
-            caption_text = _extract_caption_text(page, bbox, page_height)
-            label = _identify_label(caption_text)
-
-            image_items.append({
-                "filename": filename,
-                "page": page_num + 1,
-                "y0": bbox.y0,
-                "caption_text": caption_text,
-                "label": label,
-            })
-
-        # 2. 提取表格截图（同时搜索上方 caption，Table 标题通常在表格上方）
+        # 渲染截取
+        clip = pymupdf.Rect(x0, top, x1, bottom)
+        mat = pymupdf.Matrix(zoom, zoom)
        try:
-            tables = page.find_tables()
+            pix = page.get_pixmap(matrix=mat, clip=clip)
        except Exception:
-            tables = None
+            logger.debug("Failed to render %s region for %s", cap["label"], arxiv_id)
+            continue

-        if tables and tables.tables:
-            for table_index, table in enumerate(tables.tables):
-                bbox = table.bbox
-                if not bbox:
-                    continue
+        filename = f"{cap['label'].replace(' ', '_').lower()}.png"
+        pix.save(str(images_dest / filename))
+        extracted += 1

-                margin = 5
-                if hasattr(bbox, 'x0'):
-                    x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
-                    table_rect = bbox
-                else:
-                    x0, y0, x1, y1 = bbox
-                    table_rect = pymupdf.Rect(x0, y0, x1, y1)
-                clip_rect = pymupdf.Rect(
-                    x0 - margin, y0 - margin, x1 + margin, y1 + margin
-                )
-
-                zoom = 2
-                mat = pymupdf.Matrix(zoom, zoom)
-                try:
-                    pix = page.get_pixmap(matrix=mat, clip=clip_rect)
-                except Exception:
-                    continue
-
-                if pix.width < _MIN_DIM * 2 or pix.height < 30 * 2:
-                    continue
-
-                filename = f"page{page_num + 1}_table{table_index + 1}.png"
-                pix.save(str(images_dest / filename))
-                extracted += 1
-
-                # Table caption 上下都搜（学术论文惯例：Table 标题在上方，但实际排版各异）
-                caption_text = _extract_caption_text(
-                    page, table_rect, page_height, search_both=True,
-                )
-                label = _identify_label(caption_text)
-
-                item = {
-                    "filename": filename,
-                    "page": page_num + 1,
-                    "y0": y0,
-                    "caption_text": caption_text,
-                    "label": label,
-                }
-
-                # 关键：caption 标注为 Figure → 重分类为图片
-                if _is_figure_caption(caption_text):
-                    image_items.append(item)
-                else:
-                    table_items.append(item)
+        cap_preview = cap["caption_text"][:200] if cap["caption_text"] else ""
+        manifest[filename] = {
+            "page": cap["page_num"] + 1,
+            "type": cap["type"],
+            "label": cap["label"],
+            "caption_text": cap_preview,
+            "figures" if cap["type"] == "figure" else "tables": [cap["label"]],
+        }
+        logger.debug(
+            "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s",
+            cap["label"], cap["page_num"] + 1,
+            x0, top, x1, bottom, height, filename,
+        )

    doc.close()

-    # ── 第二遍：矫正 find_tables 的误判 ──
-    # 如果表格与同页的图片高度重叠（复合图表的子区域），且 caption 不含 "Table"，
-    # 则重分类为图片，归入邻近图片的 label
-    for t_item in table_items[:]:
-        t_page = t_item["page"]
-        t_y0 = t_item["y0"]
-        same_page_images = [i for i in image_items if i["page"] == t_page]
-        if not same_page_images:
-            continue
-        # 检查是否有重叠的图片
-        nearby = [
-            i for i in same_page_images
-            if abs(i["y0"] - t_y0) < 50
-        ]
-        if nearby and not (t_item["caption_text"] and _TABLE_CAPTION_RE.search(t_item["caption_text"])):
-            # 重分类为图片，继承邻近图片的 label
-            neighbor_label = nearby[0].get("label")
-            t_item["label"] = neighbor_label
-            image_items.append(t_item)
-            table_items.remove(t_item)
-
-    # ── 第三遍：按 (page, y0) 排序 → 序号匹配兜底 ──
-    image_items.sort(key=lambda it: (it["page"], it["y0"]))
-    table_items.sort(key=lambda it: (it["page"], it["y0"]))
-
-    # 统计已通过 caption 确认的 Figure/Table 编号，避免序号重复分配
-    used_figure_nums: set[int] = set()
-    used_table_nums: set[int] = set()
-    for item in image_items:
-        if item["label"]:
-            m = _FIGURE_CAPTION_RE.search(item["label"])
-            if m:
-                used_figure_nums.add(int(m.group(1)))
-    for item in table_items:
-        if item["label"]:
-            m = _TABLE_CAPTION_RE.search(item["label"])
-            if m:
-                used_table_nums.add(int(m.group(1)))
-
-    # 为未识别编号的图片分配序号（跳过已占用的编号）
-    next_fig = 1
-    for item in image_items:
-        if item["label"] is None:
-            while next_fig in used_figure_nums:
-                next_fig += 1
-            item["label"] = f"Figure {next_fig}"
-            used_figure_nums.add(next_fig)
-
-    next_tbl = 1
-    for item in table_items:
-        if item["label"] is None:
-            while next_tbl in used_table_nums:
-                next_tbl += 1
-            item["label"] = f"Table {next_tbl}"
-            used_table_nums.add(next_tbl)
-
-    # ── 第三遍：构建 manifest ──
-    manifest: dict[str, dict] = {}
-    for item in image_items:
-        manifest[item["filename"]] = {
-            "page": item["page"],
-            "type": "image",
-            "label": item["label"],
-            "caption_text": item.get("caption_text"),
-            "figures": [item["label"]],
-        }
-    for item in table_items:
-        manifest[item["filename"]] = {
-            "page": item["page"],
-            "type": "table",
-            "label": item["label"],
-            "caption_text": item.get("caption_text"),
-            "tables": [item["label"]],
-        }
-
    # 保存 manifest
    manifest_path = images_dest / "manifest.json"
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2)
    )

-    captioned = sum(
-        1 for it in image_items + table_items if it["caption_text"]
-    )
-    label_matched = sum(
-        1 for it in image_items + table_items
-        if it["caption_text"] and _identify_label(it["caption_text"])
-    )
-
    if extracted > 0:
        logger.info(
-            "Extracted %d items from PDF for %s "
-            "(%d images, %d tables, %d with captions, %d label-matched)",
-            extracted, arxiv_id,
-            len(image_items), len(table_items), captioned, label_matched,
+            "Extracted %d figure/table screenshots from PDF for %s "
+            "(from %d captions found, %d unique)",
+            extracted, arxiv_id, len(captions), len(unique_captions),
        )

    return extracted
@@ -12,6 +12,7 @@ from zoneinfo import ZoneInfo
 from app.config import settings
 from app.database import SessionLocal
 from app.services.pipeline import run_pipeline
+from app.services.crawler import refresh_upvotes
 from app.utils import today_str

 logger = logging.getLogger(__name__)
@@ -66,6 +67,22 @@ def start_scheduler() -> AsyncIOScheduler | None:
        misfire_grace_time=3600,  # 允许迟到 1 小时内补执行
    )

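+    # Upvote refresh: runs daily, 30 minutes after the pipeline, and refreshes upvote
+    # counts for papers from the last UPVOTE_REFRESH_DAYS days (the refresh default).
+    # NOTE(review): SCHEDULE_MINUTE + 30 exceeds 59 when SCHEDULE_MINUTE >= 30 —
+    # confirm CronTrigger accepts that, or carry the overflow into the hour.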
+    upvote_trigger = CronTrigger(
+        hour=settings.SCHEDULE_HOUR,
+        minute=settings.SCHEDULE_MINUTE + 30,
+        timezone=tz,
+    )
+    scheduler.add_job(
+        _upvote_refresh,
+        trigger=upvote_trigger,
+        id="upvote_refresh",
+        name="upvote_refresh",
+        replace_existing=True,
+        max_instances=1,
+        misfire_grace_time=3600,
+    )
+
    scheduler.start()
    _scheduler = scheduler
    logger.info(
@@ -102,3 +119,19 @@ async def _daily_pipeline() -> None:
        logger.exception("Unexpected error in daily pipeline")
    finally:
        db.close()
+
+
+async def _upvote_refresh() -> None:
+    """Scheduled job: refresh upvotes for recent papers using the default day window.
+
+    Opens its own database session, logs the outcome, and always closes the
+    session. Any exception is logged rather than propagated.
+    """
+    session: Session = SessionLocal()
+    try:
+        outcome = await refresh_upvotes(session)
+        status = outcome.get("status")
+        updated_count = outcome.get("updated", 0)
+        logger.info(
+            "Upvote refresh completed: status=%s updated=%d",
+            status,
+            updated_count,
+        )
+    except Exception:
+        logger.exception("Unexpected error in upvote refresh")
+    finally:
+        session.close()
@@ -138,20 +138,7 @@ a:hover {
 }

 /* ── Date Quick Nav ─────────────────────────────────────────────── */
-.date-quick-nav {
-  margin-top: 32px;
-  padding-top: 16px;
-  border-top: 1px solid var(--border);
-  font-size: 0.85rem;
-  color: var(--ink-light);
-  display: flex;
-  align-items: center;
-  gap: 8px;
-  flex-wrap: wrap;
-}
-
 /* ── Chips (shared) ─────────────────────────────────────────────── */
-.date-chip,
 .tag-chip,
 .filter-chip {
  display: inline-block;
@@ -162,14 +149,12 @@ a:hover {
  font-size: 0.8rem;
  color: var(--ink-light);
 }
-.date-chip:hover,
 .tag-chip:hover,
 .filter-chip:hover {
  border-color: var(--accent);
  color: var(--accent);
  text-decoration: none;
 }
-.date-chip.active,
 .tag-chip.active,
 .filter-chip.active {
  background: var(--accent);
@@ -352,6 +337,11 @@ a:hover {
  margin-bottom: 12px;
 }

+.detail-upvote-time {
+  font-size: 0.78rem;
+  color: var(--border);
+}
+
 .detail-tags {
  margin-bottom: 12px;
  display: flex;
@@ -33,6 +33,7 @@
    <button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
    <button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
    <button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
+    <button class="admin-action-btn" onclick="refreshUpvotes()">👍 刷新投票</button>
  </div>

  <div class="admin-info-grid">
@@ -59,6 +60,10 @@
          <span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span>
        </div>
        {% endif %}
+        <div class="info-row">
+          <span class="info-label">投票刷新</span>
+          <span class="info-value">每日自动刷新最近 {{ stats.upvote_refresh_days | default(7) }} 天</span>
+        </div>
        {% if stats.active_locks %}
        <div class="info-row">
          <span class="info-label">活跃任务</span>
@@ -181,5 +186,12 @@
      .then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); })
      .catch(err => showToast("❌ 请求失败"));
  }
+
+  // Trigger a manual upvote refresh and report the outcome in a toast.
+  function refreshUpvotes() {
+    fetch("/admin/refresh-upvotes", { method: "POST", headers: { "Content-Type": "application/json" } })
+      .then(r => { if (r.status===303||r.status===401) { window.location.href="/admin/login"; return; } return r.json(); })
+      .then(data => {
+        if (!data) return;
+        // The endpoint reports failures through HTTPException, whose payload
+        // lives under "detail"; "error" is kept for parity with the other actions.
+        const failure = data.error || data.detail;
+        if (failure) {
+          // "detail" may be a structured value (e.g. a validation error list).
+          const text = typeof failure === "string" ? failure : JSON.stringify(failure);
+          showToast("❌ " + text.substring(0,200));
+          return;
+        }
+        showToast(`✅ 已刷新 ${data.updated || 0} 篇论文投票`);
+      })
+      .catch(err => showToast("❌ 请求失败"));
+  }
 </script>
 {% endblock %}
@@ -22,6 +22,9 @@ endblock %} {% block content %}
      >📅 {{ paper.published_at or paper.paper_date }}</span
    >
    <span class="detail-upvotes">👍 {{ paper.upvotes }}</span>
+    {% if paper.crawled_at %}
+    <span class="detail-upvote-time" title="投票数据更新时间">{{ paper.crawled_at.strftime('%m-%d %H:%M') }}</span>
+    {% endif %}
  </div>

  {# 标签 #} {% if paper.tags %}
@@ -23,16 +23,6 @@ endblock %} {% block content %}
 </div>
 {% endif %}

-<div class="date-quick-nav">
-  <span>有数据的日期：</span>
-  {% for d in available_dates[:10] %}
-  <a
-    href="/day/{{ d }}"
-    class="date-chip {% if d == current_date %}active{% endif %}"
-    >{{ d }}</a
-  >
-  {% endfor %}
-</div>
 {% endblock %}

 {% block scripts %}
@@ -20,7 +20,7 @@
        {% endif %}
      </a>
    </h2>
-    <span class="paper-upvotes">👍 {{ paper.upvotes }}</span>
+    <span class="paper-upvotes" title="数据更新于 {{ paper.crawled_at.strftime('%m-%d %H:%M') if paper.crawled_at else '' }}">👍 {{ paper.upvotes }}</span>
    {% if variant == 'search' and distances and paper.arxiv_id in distances %}
    <span class="similarity-score" title="语义相似度距离">
      🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}
@@ -57,6 +57,13 @@ def yesterday_str() -> str:
    return yesterday.isoformat()


+def recent_date_strs(n: int) -> list[str]:
+    """Return ISO date strings for the most recent ``n`` days, newest first.
+
+    Today, evaluated in ``settings.APP_TIMEZONE``, is the first element.
+    """
+    anchor = datetime.now(ZoneInfo(settings.APP_TIMEZONE)).date()
+    dates: list[str] = []
+    for offset in range(n):
+        dates.append((anchor - timedelta(days=offset)).isoformat())
+    return dates
+
+
 def latest_paper_date(db) -> str:
    """查询数据库中最新的 paper_date，无数据时回退到 today_str()。"""
    from sqlalchemy import func, select