From 1fc6303e090e6d80ff57b7ed40db91177db4e4cc Mon Sep 17 00:00:00 2001 From: rain-bus Date: Tue, 9 Jun 2026 18:01:01 +0800 Subject: [PATCH] feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - PDF extractor: rewrite from embedded bitmap extraction to caption-based page region screenshots. Finds Figure/Table captions via regex,截取上方/下方 page region, handles compound figures and vector graphics. - Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for recent N days without inserting new papers. Scheduler runs daily 30min after pipeline. - Admin: add /admin/refresh-upvotes endpoint and dashboard button. - UI: remove date quick nav, show upvote update time on detail/card pages, clean up CSS date-chip styles. - Utils: add recent_date_strs() helper. --- app/config.py | 1 + app/routes/admin.py | 21 +- app/routes/pages.py | 11 +- app/services/admin.py | 1 + app/services/crawler.py | 80 +++- app/services/pdf_image_extractor.py | 570 +++++++++++++------------ app/services/scheduler.py | 33 ++ app/static/css/style.css | 20 +- app/templates/admin_dashboard.html | 12 + app/templates/detail.html | 3 + app/templates/index.html | 10 - app/templates/partials/paper_card.html | 2 +- app/utils.py | 7 + 13 files changed, 460 insertions(+), 311 deletions(-) diff --git a/app/config.py b/app/config.py index 66987e6..d2f3ebc 100644 --- a/app/config.py +++ b/app/config.py @@ -41,6 +41,7 @@ class Settings(BaseSettings): SCHEDULE_HOUR: int = 4 SCHEDULE_MINUTE: int = 0 APP_WORKERS: int = 1 + UPVOTE_REFRESH_DAYS: int = 7 # 刷新最近 N 天论文的 upvotes # 数据库 DATABASE_URL: str = "sqlite:///data/db/papers.db" diff --git a/app/routes/admin.py b/app/routes/admin.py index 1587bde..df877ae 100644 --- a/app/routes/admin.py +++ b/app/routes/admin.py @@ -26,7 +26,7 @@ from app.models import ( ) from app.services.admin import get_admin_stats from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range -from app.services.crawler import crawl_daily +from app.services.crawler import crawl_daily, refresh_upvotes from app.services.pipeline import run_pipeline from app.services.scheduler import get_scheduler from app.services.summarizer import summarize_batch, summarize_single @@ -129,16 +129,20 @@ async def admin_scheduler_status(_admin: None = Depends(verify_admin)): """调度器运行状态(JSON)。""" scheduler = get_scheduler() next_run = None + upvote_next_run = None if scheduler: for job in scheduler.get_jobs(): if job.id == "daily_pipeline": next_run = job.next_run_time - break + elif job.id == "upvote_refresh": + upvote_next_run = job.next_run_time return { "enabled": scheduler is not None, "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}", "timezone": settings.APP_TIMEZONE, "next_run": next_run.isoformat() if next_run else None, + "upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None, + "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS, } @@ -159,6 +163,19 @@ async def admin_trigger_pipeline( return {"status": "success", "message": "流水线执行完成"} +@router.post("/refresh-upvotes") +async def admin_refresh_upvotes( + _admin: None = Depends(verify_admin), + db: Session = Depends(get_db), + days: int | None = Query(None, description="刷新最近 N 天,默认使用配置值"), +): + """手动刷新最近 N 天论文的 upvotes。""" + result = await refresh_upvotes(db, days=days) + if result["status"] == "failed": + raise HTTPException(status_code=500, detail=result.get("error")) + return result + + # ── 请求模型 ────────────────────────────────────────────────────────── diff --git a/app/routes/pages.py b/app/routes/pages.py index a06d892..6380934 100644 --- a/app/routes/pages.py +++ b/app/routes/pages.py @@ -315,11 +315,16 @@ def _link_figures_with_images( fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))] table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))] - # 提取的图片也按类型分流,按文件名排序 + # 提取的图片按类型分流,按文件名中的编号排序 def _sort_key(name: str) -> tuple[int, int]: - m = re.search(r'page(\d+)_(?:img|table)(\d+)', name) + # 新格式:figure_1.png, table_1.png + m = re.search(r'(?:figure|table)_(\d+)', name) if m: - return (int(m.group(1)), int(m.group(2))) + return (0, int(m.group(1))) + # 旧格式:page2_img1.png, page5_table1.png + m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name) + if m2: + return (int(m2.group(1)), int(m2.group(2))) return (0, 0) fig_images = sorted( diff --git a/app/services/admin.py b/app/services/admin.py index 8346322..23bde9c 100644 --- a/app/services/admin.py +++ b/app/services/admin.py @@ -106,4 +106,5 @@ def get_admin_stats(db: Session) -> dict: "next_run": next_run.isoformat() if next_run else None, "recent_logs": recent_logs, "active_locks": active_locks, + "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS, } diff --git a/app/services/crawler.py b/app/services/crawler.py index 8943a4b..feb67c0 100644 --- a/app/services/crawler.py +++ b/app/services/crawler.py @@ -16,7 +16,7 @@ from app.models import ( SummaryState, SummaryStatus, ) -from app.utils import make_http_client, utc_now +from app.utils import make_http_client, recent_date_strs, utc_now logger = logging.getLogger(__name__) @@ -205,3 +205,81 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) - log_entry.completed_at = utc_now() db.commit() return {"found": 0, "new": 0, "status": "failed", "error": str(exc)} + + +async def refresh_upvotes(db: Session, days: int | None = None) -> dict: + """重新抓取最近 N 天论文的 upvotes,不插入新论文。 + + 遍历每天调用 HF API,对已有论文仅更新 upvotes 和 crawled_at。 + """ + days = days or settings.UPVOTE_REFRESH_DAYS + date_strs = recent_date_strs(days) + now = utc_now() + + log_entry = CrawlLog( + task="upvote_refresh", + status="running", + date=date_type.fromisoformat(date_strs[0]), + started_at=now, + ) + db.add(log_entry) + db.commit() + + total_updated = 0 + errors: list[str] = [] + + try: + for ds in date_strs: + try: + raw_papers = await fetch_daily(ds) + updated = _update_upvotes_only(db, raw_papers) + total_updated += updated + logger.info("Refreshed upvotes for %s: %d papers", ds, updated) + except Exception as exc: + msg = f"{ds}: {exc}" + errors.append(msg) + logger.warning("Failed to refresh upvotes for %s: %s", ds, exc) + + log_entry.status = "success" if not errors else "partial" + log_entry.papers_found = total_updated + log_entry.details_json = f'{{"days": {days}, "errors": {len(errors)}}}' + log_entry.completed_at = utc_now() + db.commit() + + return { + "status": "success" if not errors else "partial", + "updated": total_updated, + "days": days, + "errors": errors or None, + } + except Exception as exc: + logger.exception("Upvote refresh failed") + log_entry.status = "failed" + log_entry.error = str(exc) + log_entry.completed_at = utc_now() + db.commit() + return {"status": "failed", "updated": total_updated, "error": str(exc)} + + +def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int: + """对已有论文更新 upvotes,跳过不存在的新论文。""" + now = datetime.now(timezone.utc) + updated = 0 + + for item in papers_raw: + meta = _parse_paper(item) + arxiv_id = meta["arxiv_id"] + if not arxiv_id: + continue + + existing = db.execute( + select(Paper).where(Paper.arxiv_id == arxiv_id) + ).scalar_one_or_none() + + if existing: + existing.upvotes = meta["upvotes"] + existing.crawled_at = now + updated += 1 + + db.commit() + return updated diff --git a/app/services/pdf_image_extractor.py b/app/services/pdf_image_extractor.py index e62c645..ed7ea34 100644 --- a/app/services/pdf_image_extractor.py +++ b/app/services/pdf_image_extractor.py @@ -1,12 +1,12 @@ -"""PDF 图片与表格提取 — 从 PDF 中提取嵌入图片和表格截图。 +"""PDF 图片与表格提取 — 基于 caption 定位的页面区域截图。 -策略: -1. 提取 PDF 中嵌入的图片(图表、插图等),按页面位置排序 -2. 检测表格区域,渲染为截图 -3. 为每张图/表格提取附近的说明文字(caption),从中识别 Figure N / Table N -4. 根据 caption 内容矫正类型:标注为 "Figure" 的表格区域 → 归为图片 -5. 序号匹配兜底:第 N 张图 → Figure N(学术论文图表严格按顺序出现) -6. 保存 manifest.json 供后续与 AI 总结的 figures 字段匹配 +核心思路:学术论文排版极其规整,Figure caption 在图下方,Table caption 在表格上方。 +因此反过来:先找 caption 文字 → 向上/向下截取页面区域 → 渲染为 PNG。 + +优势(相比提取嵌入位图): +- 复合图表不会被拆成碎片(整块截取) +- 矢量图也能截取(页面渲染包含一切) +- 不依赖 find_tables()(纯文本匹配 caption) """ from __future__ import annotations @@ -21,124 +21,252 @@ from app.utils import TMP_DIR logger = logging.getLogger(__name__) -# 最小面积阈值(像素),小于此值的图片视为图标/装饰 -_MIN_AREA = 10_000 # ~100x100 -_MIN_DIM = 80 +# ── 截取区域参数 ─────────────────────────────────────────────────────── -# Caption 搜索区域 — Figure caption 在图下方,Table caption 在图上方 -_CAPTION_MARGIN = 10 # 贴边距离 -_CAPTION_MAX_DISTANCE = 250 # 最远搜索距离 -_CAPTION_SIDE_PADDING = 40 # 左右扩展 +# Figure: caption 上方搜索图的范围(点) +_FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围 +_FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度 +_FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度 -# Figure/Table 标注正则 -_FIGURE_CAPTION_RE = re.compile( - r'\b(?:Fig\.?|Figure)\s*(\d+)\b', re.IGNORECASE +# Table: caption 下方搜索表格的范围 +_TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围 +_TABLE_MIN_HEIGHT = 30 + +# caption 左右扩展(双栏论文中 caption 可能比表格窄) +_REGION_SIDE_PADDING = 10 +# 表格通常比 caption 文字宽,使用更大的水平扩展 +_TABLE_SIDE_PADDING = 60 + +# 正文行距的 2 倍 ≈ 空白间隙阈值 +_CONTENT_GAP_THRESHOLD = 30 + + +# ── Caption 正则 ─────────────────────────────────────────────────────── + +# 要求以 Figure/Table 开头(避免匹配正文中的 "see Figure 3" 等) +_CAPTION_RE = re.compile( + r'^(?:Fig\.?|Figure)\s+(\d+)\s*[:\.]', + re.IGNORECASE, ) _TABLE_CAPTION_RE = re.compile( - r'\bTable\s*(\d+)\b', re.IGNORECASE + r'^Table\s+(\d+)\s*[:\.]', + re.IGNORECASE, ) -def _extract_caption_text(page, bbox, page_height: float, *, - search_above: bool = False, - search_both: bool = False) -> str | None: - """从图片/表格附近区域提取 caption 文字。 +def _find_captions(doc) -> list[dict]: + """扫描整个文档,找到所有 Figure/Table caption 的位置和信息。""" + captions = [] - search_above=True:搜索上方(Table caption 通常在上) - 默认搜索下方(Figure caption 通常在下) - search_both=True:上下都搜,返回包含 Figure/Table 标注的那边 - """ - import pymupdf - - x0 = max(0, bbox.x0 - _CAPTION_SIDE_PADDING) - x1 = bbox.x1 + _CAPTION_SIDE_PADDING - - def _search(y0: float, y1: float) -> str | None: - rect = pymupdf.Rect(x0, y0, x1, y1) + for page_num in range(len(doc)): + page = doc[page_num] + page_width = page.rect.width + page_height = page.rect.height blocks = page.get_text("blocks") - parts: list[str] = [] + for block in blocks: if len(block) < 5: continue - block_rect = pymupdf.Rect(block[:4]) - if block_rect.intersects(rect): - text = str(block[4]).strip() - if text: - parts.append(text) - if parts: - return " ".join(parts) - text = page.get_textbox(rect) - if text and len(text.strip()) >= 5: - return text.strip() - return None + text = str(block[4]).strip() + if not text: + continue - if search_both: - # 上方 - above_y1 = max(0, bbox.y0 - _CAPTION_MARGIN) - above_y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE) - above = _search(above_y0, above_y1) - # 下方 - below_y0 = bbox.y1 + _CAPTION_MARGIN - below_y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE) - below = _search(below_y0, below_y1) + bx0, by0, bx1, by1 = block[0], block[1], block[2], block[3] + # 只取 block 第一行做匹配(避免 block 包含多段文字干扰) + first_line = text.split("\n")[0].strip() - # 优先返回包含 Figure/Table 标注的那边 - if above and (_FIGURE_CAPTION_RE.search(above) or _TABLE_CAPTION_RE.search(above)): - return above - if below and (_FIGURE_CAPTION_RE.search(below) or _TABLE_CAPTION_RE.search(below)): - return below - # 否则返回更长的 - if above and below: - return above if len(above) >= len(below) else below - return above or below + m = _CAPTION_RE.match(first_line) + if m: + captions.append({ + "type": "figure", + "num": int(m.group(1)), + "label": f"Figure {m.group(1)}", + "page_num": page_num, + "caption_y0": by0, + "caption_y1": by1, + "caption_x0": bx0, + "caption_x1": bx1, + "caption_text": text, + "page_width": page_width, + "page_height": page_height, + }) + continue - if search_above: - y1 = max(0, bbox.y0 - _CAPTION_MARGIN) - y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE) + m = _TABLE_CAPTION_RE.match(first_line) + if m: + captions.append({ + "type": "table", + "num": int(m.group(1)), + "label": f"Table {m.group(1)}", + "page_num": page_num, + "caption_y0": by0, + "caption_y1": by1, + "caption_x0": bx0, + "caption_x1": bx1, + "caption_text": text, + "page_width": page_width, + "page_height": page_height, + }) + + return captions + + +def _find_figure_top(page, caption: dict) -> float: + """向上扫描页面,找到 Figure 的上边界。 + + 策略: + 1. 收集 caption 上方的所有内容块(文本 + 嵌入图片) + 2. 找到最顶部的内容块作为图的上界 + 3. 检查内容块之间的大间隙(表示图从间隙下方开始) + 4. 如果没找到任何内容块,使用默认图高度 + + 注意:只扫描 text blocks 是不够的,因为 figure 本身是图片/矢量图, + 不会被 get_text("blocks") 返回。必须同时用 get_image_info() 检测嵌入图片。 + """ + caption_y = caption["caption_y0"] + cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING + cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING + + # 收集 caption 上方、同列范围内的所有内容块 + # 每个元素: (x0, y0, x1, y1) + above_blocks: list[tuple[float, float, float, float]] = [] + + # ── 1. 文本块 ── + for b in page.get_text("blocks"): + if len(b) < 5: + continue + bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3] + if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT: + if bx1 > cx0 and bx0 < cx1: + above_blocks.append((bx0, by0, bx1, by1)) + + # ── 2. 嵌入图片块 — 关键!figure 本身是图片,不是文本 ── + for img_info in page.get_image_info(): + bbox = img_info.get("bbox") + if bbox is None: + continue + # Rect 对象: x0, y0, x1, y1 + ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 + if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT: + if ix1 > cx0 and ix0 < cx1: + above_blocks.append((ix0, iy0, ix1, iy1)) + + # ── 没有内容块 → 用默认高度(可能是纯矢量图,如 TikZ/matplotlib PDF) ── + if not above_blocks: + return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT) + + # ── 找到内容区域的上边界 ── + # 按 y 从下到上排序(离 caption 最近的在前) + above_blocks.sort(key=lambda b: b[1], reverse=True) + + # 从 caption 向上扫描,找到第一个大间隙以上作为图的上界 + # 典型结构: [正文段落] ...空白... [图内容(图片/矢量)] [caption] + # 空白间隙 ≈ 图的上边界 + figure_top = above_blocks[-1][1] # 最上面的块顶部(默认兜底) + + prev_bottom = caption_y # 从 caption 顶部开始向上 + for b in above_blocks: + # b = (x0, y0, x1, y1), 我们关心 y 范围 + gap = prev_bottom - b[3] # b[3] = by1 = 当前块底部 + if gap > _CONTENT_GAP_THRESHOLD: + # 大间隙 → 图上边界在间隙下方 + figure_top = prev_bottom - 5 + break + # 小间隙 → 当前块属于图的一部分(或紧挨着图),继续向上 + prev_bottom = b[1] # b[1] = by0 = 当前块顶部 else: - y0 = bbox.y1 + _CAPTION_MARGIN - y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE) + # 所有块都紧挨着 → 图从最上面块的顶部开始 + figure_top = above_blocks[-1][1] - return _search(y0, y1) + # 限制最大高度 + if caption_y - figure_top > _FIGURE_MAX_HEIGHT: + figure_top = caption_y - _FIGURE_MAX_HEIGHT + + # 不低于页面顶部 + figure_top = max(0, figure_top) + + return figure_top -def _identify_label(caption_text: str | None) -> str | None: - """从 caption 文字中识别 Figure N / Table N 编号。""" - if not caption_text: - return None +def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]: + """向下扫描页面,找到 Table 的下边界和水平范围。 - m = _FIGURE_CAPTION_RE.search(caption_text) - if m: - return f"Figure {m.group(1)}" + 返回: (x0, bottom, x1) — 裁剪区域的左、下、右边界。 + 上边界由调用方根据 caption 位置确定。 - m = _TABLE_CAPTION_RE.search(caption_text) - if m: - return f"Table {m.group(1)}" + 策略: + 1. 收集 caption 下方的文本块(表格内容是文本) + 2. 找到连续内容区域的底部(遇到大间隙时停止) + 3. 同时检测表格内容的水平范围(表格通常比 caption 宽) + """ + blocks = page.get_text("blocks") + caption_y = caption["caption_y1"] # caption 底部作为扫描起点 + caption_x0 = caption["caption_x0"] + caption_x1 = caption["caption_x1"] + page_height = caption["page_height"] + page_width = caption["page_width"] - return None + # 先用较宽的范围收集可能的表格内容块 + search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING) + search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING) + below_blocks: list[tuple[float, float, float, float]] = [] + for b in blocks: + if len(b) < 5: + continue + bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3] + if by0 > caption_y and by0 < caption_y + _TABLE_MAX_HEIGHT: + if bx1 > search_x0 and bx0 < search_x1: + below_blocks.append((bx0, by0, bx1, by1)) -def _is_figure_caption(caption_text: str | None) -> bool: - """判断 caption 是否标注为 Figure(用于矫正 find_tables 的误判)。""" - if not caption_text: - return False - return bool(_FIGURE_CAPTION_RE.search(caption_text)) + if not below_blocks: + # 没有内容 → 使用默认高度和 caption 宽度 + return ( + max(0, caption_x0 - _REGION_SIDE_PADDING), + min(page_height, caption_y + _TABLE_MIN_HEIGHT), + min(page_width, caption_x1 + _REGION_SIDE_PADDING), + ) + + # ── 找到连续内容区域的底部 ── + below_blocks.sort(key=lambda b: b[1]) # 按 y 升序 + + prev_y = caption_y + bottom = below_blocks[-1][3] + 5 # 最后一块的底部 + margin + + for b in below_blocks: + gap = b[1] - prev_y # b[1] = by0 + if gap > _CONTENT_GAP_THRESHOLD: + bottom = prev_y + 5 + break + prev_y = b[3] # b[3] = by1 + + # 限制最大高度 + if bottom - caption_y > _TABLE_MAX_HEIGHT: + bottom = caption_y + _TABLE_MAX_HEIGHT + + # ── 检测表格内容的水平范围 ── + # 表格通常比 caption 宽,用内容块的实际宽度 + content_x0 = min(caption_x0, min(b[0] for b in below_blocks)) + content_x1 = max(caption_x1, max(b[2] for b in below_blocks)) + + # 添加边距,但不超出页面 + x0 = max(0, content_x0 - _REGION_SIDE_PADDING) + x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING) + + return (x0, bottom, x1) def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: - """从 PDF 提取嵌入图片和表格截图,生成 manifest。 + """从 PDF 提取 Figure/Table 截图,生成 manifest。 - 匹配策略: - 1. 提取图片→提取 caption 文字→从中识别 Figure/Table 编号 - 2. 表格区域若 caption 标注为 "Figure",则重分类为图片 - 3. 未能从 caption 识别编号的,按(页码, 纵向位置)排序后用序号匹配兜底 + 策略:找 caption → 定位区域 → 渲染页面截图。 Args: arxiv_id: 论文 ID pdf_path: PDF 路径,默认 data/tmp/{arxiv_id}/paper.pdf Returns: - 提取的图片+表格数量 + 提取的图片数量 """ import pymupdf @@ -153,216 +281,100 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: images_dest.mkdir(parents=True, exist_ok=True) doc = pymupdf.open(str(pdf_path)) + captions = _find_captions(doc) + + if not captions: + logger.info("No Figure/Table captions found in PDF for %s", arxiv_id) + doc.close() + return 0 + + # 去重:同一页同一 label 可能匹配到多个 block(如正文引用 "Figure 7") + # 保留每个 (type, num) 的第一个匹配(即真正的 caption) + seen_labels: dict[str, dict] = {} + for cap in captions: + key = cap["label"] + if key not in seen_labels: + seen_labels[key] = cap + + unique_captions = list(seen_labels.values()) extracted = 0 - seen_hashes: set[int] = set() + manifest: dict[str, dict] = {} - # ── 第一遍:收集所有图片和表格 ── - image_items: list[dict] = [] - table_items: list[dict] = [] + zoom = 2 # 2x 渲染,保证清晰度 - for page_num in range(len(doc)): - page = doc[page_num] - page_height = page.rect.height + for cap in unique_captions: + page = doc[cap["page_num"]] + pw = cap["page_width"] + ph = cap["page_height"] - # 1. 提取嵌入图片 - image_list = page.get_images(full=True) - for img_index, img_info in enumerate(image_list): - xref = img_info[0] - try: - pix = pymupdf.Pixmap(doc, xref) - except Exception: + if cap["type"] == "figure": + # Figure: caption 上方是图 → 向上找图的上边界 + top = _find_figure_top(page, cap) + bottom = cap["caption_y1"] + 5 # 包含 caption + # 水平范围:caption 宽度 + 边距(图和 caption 通常等宽) + # 但也要考虑图内容的实际宽度 + x0 = max(0, cap["caption_x0"] - _REGION_SIDE_PADDING) + x1 = min(pw, cap["caption_x1"] + _REGION_SIDE_PADDING) + + height = bottom - top + if height < _FIGURE_MIN_HEIGHT: + logger.debug( + "Figure %s too small (%.0fpt), skipping", cap["label"], height + ) continue - if pix.width < _MIN_DIM or pix.height < _MIN_DIM: - continue - if pix.width * pix.height < _MIN_AREA: + else: + # Table: caption 下方是表格 → 向下找表格的下边界和水平范围 + x0, bottom, x1 = _find_table_region(page, cap) + top = max(0, cap["caption_y0"] - 3) # 包含 caption,上边留少许 margin + + height = bottom - top + if height < _TABLE_MIN_HEIGHT: + logger.debug( + "Table %s too small (%.0fpt), skipping", cap["label"], height + ) continue - img_hash = hash(pix.tobytes()[:1024]) - if img_hash in seen_hashes: - continue - seen_hashes.add(img_hash) - - img_rects = page.get_image_rects(xref) - if not img_rects: - continue - bbox = img_rects[0] - - if pix.n >= 5: - try: - pix = pymupdf.Pixmap(pymupdf.csRGB, pix) - except Exception: - continue - - filename = f"page{page_num + 1}_img{img_index + 1}.png" - pix.save(str(images_dest / filename)) - extracted += 1 - - caption_text = _extract_caption_text(page, bbox, page_height) - label = _identify_label(caption_text) - - image_items.append({ - "filename": filename, - "page": page_num + 1, - "y0": bbox.y0, - "caption_text": caption_text, - "label": label, - }) - - # 2. 提取表格截图(同时搜索上方 caption,Table 标题通常在表格上方) + # 渲染截取 + clip = pymupdf.Rect(x0, top, x1, bottom) + mat = pymupdf.Matrix(zoom, zoom) try: - tables = page.find_tables() + pix = page.get_pixmap(matrix=mat, clip=clip) except Exception: - tables = None + logger.debug("Failed to render %s region for %s", cap["label"], arxiv_id) + continue - if tables and tables.tables: - for table_index, table in enumerate(tables.tables): - bbox = table.bbox - if not bbox: - continue + filename = f"{cap['label'].replace(' ', '_').lower()}.png" + pix.save(str(images_dest / filename)) + extracted += 1 - margin = 5 - if hasattr(bbox, 'x0'): - x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 - table_rect = bbox - else: - x0, y0, x1, y1 = bbox - table_rect = pymupdf.Rect(x0, y0, x1, y1) - clip_rect = pymupdf.Rect( - x0 - margin, y0 - margin, x1 + margin, y1 + margin - ) - - zoom = 2 - mat = pymupdf.Matrix(zoom, zoom) - try: - pix = page.get_pixmap(matrix=mat, clip=clip_rect) - except Exception: - continue - - if pix.width < _MIN_DIM * 2 or pix.height < 30 * 2: - continue - - filename = f"page{page_num + 1}_table{table_index + 1}.png" - pix.save(str(images_dest / filename)) - extracted += 1 - - # Table caption 上下都搜(学术论文惯例:Table 标题在上方,但实际排版各异) - caption_text = _extract_caption_text( - page, table_rect, page_height, search_both=True, - ) - label = _identify_label(caption_text) - - item = { - "filename": filename, - "page": page_num + 1, - "y0": y0, - "caption_text": caption_text, - "label": label, - } - - # 关键:caption 标注为 Figure → 重分类为图片 - if _is_figure_caption(caption_text): - image_items.append(item) - else: - table_items.append(item) + cap_preview = cap["caption_text"][:200] if cap["caption_text"] else "" + manifest[filename] = { + "page": cap["page_num"] + 1, + "type": cap["type"], + "label": cap["label"], + "caption_text": cap_preview, + "figures" if cap["type"] == "figure" else "tables": [cap["label"]], + } + logger.debug( + "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s", + cap["label"], cap["page_num"] + 1, + x0, top, x1, bottom, height, filename, + ) doc.close() - # ── 第二遍:矫正 find_tables 的误判 ── - # 如果表格与同页的图片高度重叠(复合图表的子区域),且 caption 不含 "Table", - # 则重分类为图片,归入邻近图片的 label - for t_item in table_items[:]: - t_page = t_item["page"] - t_y0 = t_item["y0"] - same_page_images = [i for i in image_items if i["page"] == t_page] - if not same_page_images: - continue - # 检查是否有重叠的图片 - nearby = [ - i for i in same_page_images - if abs(i["y0"] - t_y0) < 50 - ] - if nearby and not (t_item["caption_text"] and _TABLE_CAPTION_RE.search(t_item["caption_text"])): - # 重分类为图片,继承邻近图片的 label - neighbor_label = nearby[0].get("label") - t_item["label"] = neighbor_label - image_items.append(t_item) - table_items.remove(t_item) - - # ── 第三遍:按 (page, y0) 排序 → 序号匹配兜底 ── - image_items.sort(key=lambda it: (it["page"], it["y0"])) - table_items.sort(key=lambda it: (it["page"], it["y0"])) - - # 统计已通过 caption 确认的 Figure/Table 编号,避免序号重复分配 - used_figure_nums: set[int] = set() - used_table_nums: set[int] = set() - for item in image_items: - if item["label"]: - m = _FIGURE_CAPTION_RE.search(item["label"]) - if m: - used_figure_nums.add(int(m.group(1))) - for item in table_items: - if item["label"]: - m = _TABLE_CAPTION_RE.search(item["label"]) - if m: - used_table_nums.add(int(m.group(1))) - - # 为未识别编号的图片分配序号(跳过已占用的编号) - next_fig = 1 - for item in image_items: - if item["label"] is None: - while next_fig in used_figure_nums: - next_fig += 1 - item["label"] = f"Figure {next_fig}" - used_figure_nums.add(next_fig) - - next_tbl = 1 - for item in table_items: - if item["label"] is None: - while next_tbl in used_table_nums: - next_tbl += 1 - item["label"] = f"Table {next_tbl}" - used_table_nums.add(next_tbl) - - # ── 第三遍:构建 manifest ── - manifest: dict[str, dict] = {} - for item in image_items: - manifest[item["filename"]] = { - "page": item["page"], - "type": "image", - "label": item["label"], - "caption_text": item.get("caption_text"), - "figures": [item["label"]], - } - for item in table_items: - manifest[item["filename"]] = { - "page": item["page"], - "type": "table", - "label": item["label"], - "caption_text": item.get("caption_text"), - "tables": [item["label"]], - } - # 保存 manifest manifest_path = images_dest / "manifest.json" manifest_path.write_text( json.dumps(manifest, ensure_ascii=False, indent=2) ) - captioned = sum( - 1 for it in image_items + table_items if it["caption_text"] - ) - label_matched = sum( - 1 for it in image_items + table_items - if it["caption_text"] and _identify_label(it["caption_text"]) - ) - if extracted > 0: logger.info( - "Extracted %d items from PDF for %s " - "(%d images, %d tables, %d with captions, %d label-matched)", - extracted, arxiv_id, - len(image_items), len(table_items), captioned, label_matched, + "Extracted %d figure/table screenshots from PDF for %s " + "(from %d captions found, %d unique)", + extracted, arxiv_id, len(captions), len(unique_captions), ) return extracted diff --git a/app/services/scheduler.py b/app/services/scheduler.py index ea2c6f5..698056d 100644 --- a/app/services/scheduler.py +++ b/app/services/scheduler.py @@ -12,6 +12,7 @@ from zoneinfo import ZoneInfo from app.config import settings from app.database import SessionLocal from app.services.pipeline import run_pipeline +from app.services.crawler import refresh_upvotes from app.utils import today_str logger = logging.getLogger(__name__) @@ -66,6 +67,22 @@ def start_scheduler() -> AsyncIOScheduler | None: misfire_grace_time=3600, # 允许迟到 1 小时内补执行 ) + # upvote 刷新:每天流水线之后 30 分钟执行,刷新最近 7 天论文的投票数 + upvote_trigger = CronTrigger( + hour=settings.SCHEDULE_HOUR, + minute=settings.SCHEDULE_MINUTE + 30, + timezone=tz, + ) + scheduler.add_job( + _upvote_refresh, + trigger=upvote_trigger, + id="upvote_refresh", + name="upvote_refresh", + replace_existing=True, + max_instances=1, + misfire_grace_time=3600, + ) + scheduler.start() _scheduler = scheduler logger.info( @@ -102,3 +119,19 @@ async def _daily_pipeline() -> None: logger.exception("Unexpected error in daily pipeline") finally: db.close() + + +async def _upvote_refresh() -> None: + """刷新最近 N 天论文的 upvotes。""" + db: Session = SessionLocal() + try: + result = await refresh_upvotes(db) + logger.info( + "Upvote refresh completed: status=%s updated=%d", + result.get("status"), + result.get("updated", 0), + ) + except Exception: + logger.exception("Unexpected error in upvote refresh") + finally: + db.close() diff --git a/app/static/css/style.css b/app/static/css/style.css index 49adc1d..03d2313 100644 --- a/app/static/css/style.css +++ b/app/static/css/style.css @@ -138,20 +138,7 @@ a:hover { } /* ── Date Quick Nav ─────────────────────────────────────────────── */ -.date-quick-nav { - margin-top: 32px; - padding-top: 16px; - border-top: 1px solid var(--border); - font-size: 0.85rem; - color: var(--ink-light); - display: flex; - align-items: center; - gap: 8px; - flex-wrap: wrap; -} - /* ── Chips (shared) ─────────────────────────────────────────────── */ -.date-chip, .tag-chip, .filter-chip { display: inline-block; @@ -162,14 +149,12 @@ a:hover { font-size: 0.8rem; color: var(--ink-light); } -.date-chip:hover, .tag-chip:hover, .filter-chip:hover { border-color: var(--accent); color: var(--accent); text-decoration: none; } -.date-chip.active, .tag-chip.active, .filter-chip.active { background: var(--accent); @@ -352,6 +337,11 @@ a:hover { margin-bottom: 12px; } +.detail-upvote-time { + font-size: 0.78rem; + color: var(--border); +} + .detail-tags { margin-bottom: 12px; display: flex; diff --git a/app/templates/admin_dashboard.html b/app/templates/admin_dashboard.html index 862b13e..e58d84d 100644 --- a/app/templates/admin_dashboard.html +++ b/app/templates/admin_dashboard.html @@ -33,6 +33,7 @@ +
@@ -59,6 +60,10 @@ {{ stats.next_run[:19] | replace('T', ' ') }}
{% endif %} +
+ 投票刷新 + 每日自动刷新最近 {{ stats.upvote_refresh_days | default(7) }} 天 +
{% if stats.active_locks %}
活跃任务 @@ -181,5 +186,12 @@ .then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); }) .catch(err => showToast("❌ 请求失败")); } + + function refreshUpvotes() { + fetch("/admin/refresh-upvotes", { method: "POST", headers: { "Content-Type": "application/json" } }) + .then(r => { if (r.status===303||r.status===401) { window.location.href="/admin/login"; return; } return r.json(); }) + .then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : `✅ 已刷新 ${data.updated || 0} 篇论文投票`); }) + .catch(err => showToast("❌ 请求失败")); + } {% endblock %} diff --git a/app/templates/detail.html b/app/templates/detail.html index 3536dcf..06f6224 100644 --- a/app/templates/detail.html +++ b/app/templates/detail.html @@ -22,6 +22,9 @@ endblock %} {% block content %} >📅 {{ paper.published_at or paper.paper_date }} 👍 {{ paper.upvotes }} + {% if paper.crawled_at %} + {{ paper.crawled_at.strftime('%m-%d %H:%M') }} + {% endif %}
{# 标签 #} {% if paper.tags %} diff --git a/app/templates/index.html b/app/templates/index.html index d9cb9a6..3dc7ec9 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -23,16 +23,6 @@ endblock %} {% block content %} {% endif %} -
- 有数据的日期: - {% for d in available_dates[:10] %} - {{ d }} - {% endfor %} -
{% endblock %} {% block scripts %} diff --git a/app/templates/partials/paper_card.html b/app/templates/partials/paper_card.html index 300fdf3..3a18e87 100644 --- a/app/templates/partials/paper_card.html +++ b/app/templates/partials/paper_card.html @@ -20,7 +20,7 @@ {% endif %} - 👍 {{ paper.upvotes }} + 👍 {{ paper.upvotes }} {% if variant == 'search' and distances and paper.arxiv_id in distances %} 🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }} diff --git a/app/utils.py b/app/utils.py index a16b79c..8401e05 100644 --- a/app/utils.py +++ b/app/utils.py @@ -57,6 +57,13 @@ def yesterday_str() -> str: return yesterday.isoformat() +def recent_date_strs(n: int) -> list[str]: + """最近 N 天的日期字符串列表(含今天,按 APP_TIMEZONE)。""" + tz = ZoneInfo(settings.APP_TIMEZONE) + today = datetime.now(tz).date() + return [(today - timedelta(days=i)).isoformat() for i in range(n)] + + def latest_paper_date(db) -> str: """查询数据库中最新的 paper_date,无数据时回退到 today_str()。""" from sqlalchemy import func, select