feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI

- PDF extractor: rewrite from embedded bitmap extraction to caption-based
  page region screenshots. Finds Figure/Table captions via regex, captures the
  page region above (figures) or below (tables) the caption, and handles
  compound figures and vector graphics.
- Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for recent
  N days without inserting new papers. Scheduler runs daily 30min after pipeline.
- Admin: add /admin/refresh-upvotes endpoint and dashboard button.
- UI: remove date quick nav, show upvote update time on detail/card pages,
  clean up CSS date-chip styles.
- Utils: add recent_date_strs() helper.
This commit is contained in:
2026-06-09 18:01:01 +08:00
parent b72b5a31bb
commit 1fc6303e09
13 changed files with 460 additions and 311 deletions
+1
View File
@@ -41,6 +41,7 @@ class Settings(BaseSettings):
SCHEDULE_HOUR: int = 4 SCHEDULE_HOUR: int = 4
SCHEDULE_MINUTE: int = 0 SCHEDULE_MINUTE: int = 0
APP_WORKERS: int = 1 APP_WORKERS: int = 1
UPVOTE_REFRESH_DAYS: int = 7 # 刷新最近 N 天论文的 upvotes
# 数据库 # 数据库
DATABASE_URL: str = "sqlite:///data/db/papers.db" DATABASE_URL: str = "sqlite:///data/db/papers.db"
+19 -2
View File
@@ -26,7 +26,7 @@ from app.models import (
) )
from app.services.admin import get_admin_stats from app.services.admin import get_admin_stats
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily from app.services.crawler import crawl_daily, refresh_upvotes
from app.services.pipeline import run_pipeline from app.services.pipeline import run_pipeline
from app.services.scheduler import get_scheduler from app.services.scheduler import get_scheduler
from app.services.summarizer import summarize_batch, summarize_single from app.services.summarizer import summarize_batch, summarize_single
@@ -129,16 +129,20 @@ async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
"""调度器运行状态(JSON)。""" """调度器运行状态(JSON)。"""
scheduler = get_scheduler() scheduler = get_scheduler()
next_run = None next_run = None
upvote_next_run = None
if scheduler: if scheduler:
for job in scheduler.get_jobs(): for job in scheduler.get_jobs():
if job.id == "daily_pipeline": if job.id == "daily_pipeline":
next_run = job.next_run_time next_run = job.next_run_time
break elif job.id == "upvote_refresh":
upvote_next_run = job.next_run_time
return { return {
"enabled": scheduler is not None, "enabled": scheduler is not None,
"schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}", "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
"timezone": settings.APP_TIMEZONE, "timezone": settings.APP_TIMEZONE,
"next_run": next_run.isoformat() if next_run else None, "next_run": next_run.isoformat() if next_run else None,
"upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
} }
@@ -159,6 +163,19 @@ async def admin_trigger_pipeline(
return {"status": "success", "message": "流水线执行完成"} return {"status": "success", "message": "流水线执行完成"}
@router.post("/refresh-upvotes")
async def admin_refresh_upvotes(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    days: int | None = Query(
        None,
        ge=1,  # a window of zero or negative days is meaningless; reject it at the API boundary
        description="刷新最近 N 天,默认使用配置值",
    ),
):
    """Manually refresh the upvotes of papers from the most recent N days.

    Args:
        _admin: Admin-auth dependency; only its side effect (verification)
            matters, the value is unused.
        db: Database session handed through to ``refresh_upvotes``.
        days: Size of the refresh window in days. ``None`` (the default)
            leaves the choice to ``refresh_upvotes``. Values below 1 are
            rejected by request validation before this handler runs.

    Returns:
        The result dict returned by ``refresh_upvotes``.

    Raises:
        HTTPException: Status 500 when the refresh reports
            ``status == "failed"``; ``detail`` carries the reported error.
    """
    result = await refresh_upvotes(db, days=days)
    if result["status"] == "failed":
        raise HTTPException(status_code=500, detail=result.get("error"))
    return result
# ── 请求模型 ────────────────────────────────────────────────────────── # ── 请求模型 ──────────────────────────────────────────────────────────
+8 -3
View File
@@ -315,11 +315,16 @@ def _link_figures_with_images(
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))] fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))] table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
# 提取的图片按类型分流,按文件名排序 # 提取的图片按类型分流,按文件名中的编号排序
def _sort_key(name: str) -> tuple[int, int]: def _sort_key(name: str) -> tuple[int, int]:
m = re.search(r'page(\d+)_(?:img|table)(\d+)', name) # 新格式:figure_1.png, table_1.png
m = re.search(r'(?:figure|table)_(\d+)', name)
if m: if m:
return (int(m.group(1)), int(m.group(2))) return (0, int(m.group(1)))
# 旧格式:page2_img1.png, page5_table1.png
m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
if m2:
return (int(m2.group(1)), int(m2.group(2)))
return (0, 0) return (0, 0)
fig_images = sorted( fig_images = sorted(
+1
View File
@@ -106,4 +106,5 @@ def get_admin_stats(db: Session) -> dict:
"next_run": next_run.isoformat() if next_run else None, "next_run": next_run.isoformat() if next_run else None,
"recent_logs": recent_logs, "recent_logs": recent_logs,
"active_locks": active_locks, "active_locks": active_locks,
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
} }
+79 -1
View File
@@ -16,7 +16,7 @@ from app.models import (
SummaryState, SummaryState,
SummaryStatus, SummaryStatus,
) )
from app.utils import make_http_client, utc_now from app.utils import make_http_client, recent_date_strs, utc_now
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -205,3 +205,81 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
log_entry.completed_at = utc_now() log_entry.completed_at = utc_now()
db.commit() db.commit()
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)} return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
async def refresh_upvotes(db: Session, days: int | None = None) -> dict:
    """Re-fetch upvotes for papers from the most recent N days.

    For each date in the window the daily listing is fetched with
    ``fetch_daily`` and handed to ``_update_upvotes_only``, which only
    touches papers that already exist; no new papers are inserted here.
    A ``CrawlLog`` row with task ``"upvote_refresh"`` records the run.

    Args:
        db: Database session used for the log row and the paper updates.
        days: Number of days to refresh. ``None`` or ``0`` falls back to
            ``settings.UPVOTE_REFRESH_DAYS``.

    Returns:
        A dict with ``status`` (``"success"``, ``"partial"`` or
        ``"failed"``) and ``updated``. Successful/partial runs also carry
        ``days`` and ``errors`` (a list of per-day messages, or ``None``);
        failed runs carry ``error``.
    """
    days = days or settings.UPVOTE_REFRESH_DAYS
    if days < 1:
        # A non-positive window yields an empty date list, and indexing
        # date_strs[0] below would raise before any error handling runs.
        logger.warning("Upvote refresh rejected: invalid days=%s", days)
        return {
            "status": "failed",
            "updated": 0,
            "error": f"days must be >= 1, got {days}",
        }

    date_strs = recent_date_strs(days)
    now = utc_now()

    log_entry = CrawlLog(
        task="upvote_refresh",
        status="running",
        date=date_type.fromisoformat(date_strs[0]),
        started_at=now,
    )
    db.add(log_entry)
    db.commit()

    total_updated = 0
    errors: list[str] = []

    try:
        for ds in date_strs:
            # Each day is best-effort: one failing day must not abort the rest.
            try:
                raw_papers = await fetch_daily(ds)
                updated = _update_upvotes_only(db, raw_papers)
                total_updated += updated
                logger.info("Refreshed upvotes for %s: %d papers", ds, updated)
            except Exception as exc:
                msg = f"{ds}: {exc}"
                errors.append(msg)
                logger.warning("Failed to refresh upvotes for %s: %s", ds, exc)
                # Discard that day's half-applied changes and reset the
                # session so the following days (and the final log update)
                # are not blocked by a failed transaction.
                db.rollback()

        status = "success" if not errors else "partial"
        log_entry.status = status
        log_entry.papers_found = total_updated
        log_entry.details_json = f'{{"days": {days}, "errors": {len(errors)}}}'
        log_entry.completed_at = utc_now()
        db.commit()

        return {
            "status": status,
            "updated": total_updated,
            "days": days,
            "errors": errors or None,
        }
    except Exception as exc:
        logger.exception("Upvote refresh failed")
        # The session may be in a failed state (e.g. the commit above
        # raised); roll back first so the failure can still be recorded.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)
        log_entry.completed_at = utc_now()
        db.commit()
        return {"status": "failed", "updated": total_updated, "error": str(exc)}
def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
    """Update upvotes on papers that already exist; skip unknown papers.

    Args:
        db: Database session used for the lookups and the final commit.
        papers_raw: Raw paper items; each one is normalised with
            ``_parse_paper`` before use.

    Returns:
        The number of existing papers whose ``upvotes`` and ``crawled_at``
        were written.
    """
    # Use the module's shared utc_now() helper so this timestamp is produced
    # the same way as every other timestamp this module writes.
    now = utc_now()
    updated = 0

    for item in papers_raw:
        meta = _parse_paper(item)
        arxiv_id = meta["arxiv_id"]
        if not arxiv_id:
            continue

        existing = db.execute(
            select(Paper).where(Paper.arxiv_id == arxiv_id)
        ).scalar_one_or_none()
        if existing is None:
            # Not in the database yet: inserting new papers is not this
            # function's job.
            continue

        existing.upvotes = meta["upvotes"]
        existing.crawled_at = now
        updated += 1

    # One commit for the whole batch.
    db.commit()
    return updated
+291 -279
View File
@@ -1,12 +1,12 @@
"""PDF 图片与表格提取 — 从 PDF 中提取嵌入图片和表格截图。 """PDF 图片与表格提取 — 基于 caption 定位的页面区域截图。
策略: 核心思路:学术论文排版极其规整,Figure caption 在图下方,Table caption 在表格上方。
1. 提取 PDF 中嵌入的图片(图表、插图等),按页面位置排序 因此反过来:先找 caption 文字 → 向上/向下截取页面区域 → 渲染为 PNG。
2. 检测表格区域,渲染为截图
3. 为每张图/表格提取附近的说明文字(caption),从中识别 Figure N / Table N 优势(相比提取嵌入位图):
4. 根据 caption 内容矫正类型:标注为 "Figure" 的表格区域 → 归为图片 - 复合图表不会被拆成碎片(整块截取)
5. 序号匹配兜底:第 N 张图 → Figure N(学术论文图表严格按顺序出现 - 矢量图也能截取(页面渲染包含一切
6. 保存 manifest.json 供后续与 AI 总结的 figures 字段匹配 - 不依赖 find_tables()(纯文本匹配 caption
""" """
from __future__ import annotations from __future__ import annotations
@@ -21,124 +21,252 @@ from app.utils import TMP_DIR
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# 最小面积阈值(像素),小于此值的图片视为图标/装饰 # ── 截取区域参数 ───────────────────────────────────────────────────────
_MIN_AREA = 10_000 # ~100x100
_MIN_DIM = 80
# Caption 搜索区域 — Figure caption 在图下方,Table caption 在图上方 # Figure: caption 上方搜索图的范围(点)
_CAPTION_MARGIN = 10 # 贴边距离 _FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围
_CAPTION_MAX_DISTANCE = 250 # 最远搜索距离 _FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度
_CAPTION_SIDE_PADDING = 40 # 左右扩展 _FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度
# Figure/Table 标注正则 # Table: caption 下方搜索表格的范围
_FIGURE_CAPTION_RE = re.compile( _TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围
r'\b(?:Fig\.?|Figure)\s*(\d+)\b', re.IGNORECASE _TABLE_MIN_HEIGHT = 30
# caption 左右扩展(双栏论文中 caption 可能比表格窄)
_REGION_SIDE_PADDING = 10
# 表格通常比 caption 文字宽,使用更大的水平扩展
_TABLE_SIDE_PADDING = 60
# 正文行距的 2 倍 ≈ 空白间隙阈值
_CONTENT_GAP_THRESHOLD = 30
# ── Caption 正则 ───────────────────────────────────────────────────────
# 要求以 Figure/Table 开头(避免匹配正文中的 "see Figure 3" 等)
_CAPTION_RE = re.compile(
r'^(?:Fig\.?|Figure)\s+(\d+)\s*[:\.]',
re.IGNORECASE,
) )
_TABLE_CAPTION_RE = re.compile( _TABLE_CAPTION_RE = re.compile(
r'\bTable\s*(\d+)\b', re.IGNORECASE r'^Table\s+(\d+)\s*[:\.]',
re.IGNORECASE,
) )
def _extract_caption_text(page, bbox, page_height: float, *, def _find_captions(doc) -> list[dict]:
search_above: bool = False, """扫描整个文档,找到所有 Figure/Table caption 的位置和信息。"""
search_both: bool = False) -> str | None: captions = []
"""从图片/表格附近区域提取 caption 文字。
search_above=True:搜索上方(Table caption 通常在上) for page_num in range(len(doc)):
默认搜索下方(Figure caption 通常在下) page = doc[page_num]
search_both=True:上下都搜,返回包含 Figure/Table 标注的那边 page_width = page.rect.width
""" page_height = page.rect.height
import pymupdf
x0 = max(0, bbox.x0 - _CAPTION_SIDE_PADDING)
x1 = bbox.x1 + _CAPTION_SIDE_PADDING
def _search(y0: float, y1: float) -> str | None:
rect = pymupdf.Rect(x0, y0, x1, y1)
blocks = page.get_text("blocks") blocks = page.get_text("blocks")
parts: list[str] = []
for block in blocks: for block in blocks:
if len(block) < 5: if len(block) < 5:
continue continue
block_rect = pymupdf.Rect(block[:4]) text = str(block[4]).strip()
if block_rect.intersects(rect): if not text:
text = str(block[4]).strip() continue
if text:
parts.append(text)
if parts:
return " ".join(parts)
text = page.get_textbox(rect)
if text and len(text.strip()) >= 5:
return text.strip()
return None
if search_both: bx0, by0, bx1, by1 = block[0], block[1], block[2], block[3]
# 上方 # 只取 block 第一行做匹配(避免 block 包含多段文字干扰)
above_y1 = max(0, bbox.y0 - _CAPTION_MARGIN) first_line = text.split("\n")[0].strip()
above_y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE)
above = _search(above_y0, above_y1)
# 下方
below_y0 = bbox.y1 + _CAPTION_MARGIN
below_y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE)
below = _search(below_y0, below_y1)
# 优先返回包含 Figure/Table 标注的那边 m = _CAPTION_RE.match(first_line)
if above and (_FIGURE_CAPTION_RE.search(above) or _TABLE_CAPTION_RE.search(above)): if m:
return above captions.append({
if below and (_FIGURE_CAPTION_RE.search(below) or _TABLE_CAPTION_RE.search(below)): "type": "figure",
return below "num": int(m.group(1)),
# 否则返回更长的 "label": f"Figure {m.group(1)}",
if above and below: "page_num": page_num,
return above if len(above) >= len(below) else below "caption_y0": by0,
return above or below "caption_y1": by1,
"caption_x0": bx0,
"caption_x1": bx1,
"caption_text": text,
"page_width": page_width,
"page_height": page_height,
})
continue
if search_above: m = _TABLE_CAPTION_RE.match(first_line)
y1 = max(0, bbox.y0 - _CAPTION_MARGIN) if m:
y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE) captions.append({
"type": "table",
"num": int(m.group(1)),
"label": f"Table {m.group(1)}",
"page_num": page_num,
"caption_y0": by0,
"caption_y1": by1,
"caption_x0": bx0,
"caption_x1": bx1,
"caption_text": text,
"page_width": page_width,
"page_height": page_height,
})
return captions
def _find_figure_top(page, caption: dict) -> float:
"""向上扫描页面,找到 Figure 的上边界。
策略:
1. 收集 caption 上方的所有内容块(文本 + 嵌入图片)
2. 找到最顶部的内容块作为图的上界
3. 检查内容块之间的大间隙(表示图从间隙下方开始)
4. 如果没找到任何内容块,使用默认图高度
注意:只扫描 text blocks 是不够的,因为 figure 本身是图片/矢量图,
不会被 get_text("blocks") 返回。必须同时用 get_image_info() 检测嵌入图片。
"""
caption_y = caption["caption_y0"]
cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING
cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING
# 收集 caption 上方、同列范围内的所有内容块
# 每个元素: (x0, y0, x1, y1)
above_blocks: list[tuple[float, float, float, float]] = []
# ── 1. 文本块 ──
for b in page.get_text("blocks"):
if len(b) < 5:
continue
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT:
if bx1 > cx0 and bx0 < cx1:
above_blocks.append((bx0, by0, bx1, by1))
# ── 2. 嵌入图片块 — 关键!figure 本身是图片,不是文本 ──
for img_info in page.get_image_info():
bbox = img_info.get("bbox")
if bbox is None:
continue
# Rect 对象: x0, y0, x1, y1
ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT:
if ix1 > cx0 and ix0 < cx1:
above_blocks.append((ix0, iy0, ix1, iy1))
# ── 没有内容块 → 用默认高度(可能是纯矢量图,如 TikZ/matplotlib PDF ──
if not above_blocks:
return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)
# ── 找到内容区域的上边界 ──
# 按 y 从下到上排序(离 caption 最近的在前)
above_blocks.sort(key=lambda b: b[1], reverse=True)
# 从 caption 向上扫描,找到第一个大间隙以上作为图的上界
# 典型结构: [正文段落] ...空白... [图内容(图片/矢量)] [caption]
# 空白间隙 ≈ 图的上边界
figure_top = above_blocks[-1][1] # 最上面的块顶部(默认兜底)
prev_bottom = caption_y # 从 caption 顶部开始向上
for b in above_blocks:
# b = (x0, y0, x1, y1), 我们关心 y 范围
gap = prev_bottom - b[3] # b[3] = by1 = 当前块底部
if gap > _CONTENT_GAP_THRESHOLD:
# 大间隙 → 图上边界在间隙下方
figure_top = prev_bottom - 5
break
# 小间隙 → 当前块属于图的一部分(或紧挨着图),继续向上
prev_bottom = b[1] # b[1] = by0 = 当前块顶部
else: else:
y0 = bbox.y1 + _CAPTION_MARGIN # 所有块都紧挨着 → 图从最上面块的顶部开始
y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE) figure_top = above_blocks[-1][1]
return _search(y0, y1) # 限制最大高度
if caption_y - figure_top > _FIGURE_MAX_HEIGHT:
figure_top = caption_y - _FIGURE_MAX_HEIGHT
# 不低于页面顶部
figure_top = max(0, figure_top)
return figure_top
def _identify_label(caption_text: str | None) -> str | None: def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]:
"""从 caption 文字中识别 Figure N / Table N 编号。""" """向下扫描页面,找到 Table 的下边界和水平范围。
if not caption_text:
return None
m = _FIGURE_CAPTION_RE.search(caption_text) 返回: (x0, bottom, x1) — 裁剪区域的左、下、右边界。
if m: 上边界由调用方根据 caption 位置确定。
return f"Figure {m.group(1)}"
m = _TABLE_CAPTION_RE.search(caption_text) 策略:
if m: 1. 收集 caption 下方的文本块(表格内容是文本)
return f"Table {m.group(1)}" 2. 找到连续内容区域的底部(遇到大间隙时停止)
3. 同时检测表格内容的水平范围(表格通常比 caption 宽)
"""
blocks = page.get_text("blocks")
caption_y = caption["caption_y1"] # caption 底部作为扫描起点
caption_x0 = caption["caption_x0"]
caption_x1 = caption["caption_x1"]
page_height = caption["page_height"]
page_width = caption["page_width"]
return None # 先用较宽的范围收集可能的表格内容块
search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING)
search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING)
below_blocks: list[tuple[float, float, float, float]] = []
for b in blocks:
if len(b) < 5:
continue
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
if by0 > caption_y and by0 < caption_y + _TABLE_MAX_HEIGHT:
if bx1 > search_x0 and bx0 < search_x1:
below_blocks.append((bx0, by0, bx1, by1))
def _is_figure_caption(caption_text: str | None) -> bool: if not below_blocks:
"""判断 caption 是否标注为 Figure(用于矫正 find_tables 的误判)。""" # 没有内容 → 使用默认高度和 caption 宽度
if not caption_text: return (
return False max(0, caption_x0 - _REGION_SIDE_PADDING),
return bool(_FIGURE_CAPTION_RE.search(caption_text)) min(page_height, caption_y + _TABLE_MIN_HEIGHT),
min(page_width, caption_x1 + _REGION_SIDE_PADDING),
)
# ── 找到连续内容区域的底部 ──
below_blocks.sort(key=lambda b: b[1]) # 按 y 升序
prev_y = caption_y
bottom = below_blocks[-1][3] + 5 # 最后一块的底部 + margin
for b in below_blocks:
gap = b[1] - prev_y # b[1] = by0
if gap > _CONTENT_GAP_THRESHOLD:
bottom = prev_y + 5
break
prev_y = b[3] # b[3] = by1
# 限制最大高度
if bottom - caption_y > _TABLE_MAX_HEIGHT:
bottom = caption_y + _TABLE_MAX_HEIGHT
# ── 检测表格内容的水平范围 ──
# 表格通常比 caption 宽,用内容块的实际宽度
content_x0 = min(caption_x0, min(b[0] for b in below_blocks))
content_x1 = max(caption_x1, max(b[2] for b in below_blocks))
# 添加边距,但不超出页面
x0 = max(0, content_x0 - _REGION_SIDE_PADDING)
x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING)
return (x0, bottom, x1)
def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
"""从 PDF 提取嵌入图片和表格截图,生成 manifest。 """从 PDF 提取 Figure/Table 截图,生成 manifest。
匹配策略: 策略:找 caption → 定位区域 → 渲染页面截图。
1. 提取图片→提取 caption 文字→从中识别 Figure/Table 编号
2. 表格区域若 caption 标注为 "Figure",则重分类为图片
3. 未能从 caption 识别编号的,按(页码, 纵向位置)排序后用序号匹配兜底
Args: Args:
arxiv_id: 论文 ID arxiv_id: 论文 ID
pdf_path: PDF 路径,默认 data/tmp/{arxiv_id}/paper.pdf pdf_path: PDF 路径,默认 data/tmp/{arxiv_id}/paper.pdf
Returns: Returns:
提取的图片+表格数量 提取的图片数量
""" """
import pymupdf import pymupdf
@@ -153,216 +281,100 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
images_dest.mkdir(parents=True, exist_ok=True) images_dest.mkdir(parents=True, exist_ok=True)
doc = pymupdf.open(str(pdf_path)) doc = pymupdf.open(str(pdf_path))
captions = _find_captions(doc)
if not captions:
logger.info("No Figure/Table captions found in PDF for %s", arxiv_id)
doc.close()
return 0
# 去重:同一页同一 label 可能匹配到多个 block(如正文引用 "Figure 7"
# 保留每个 (type, num) 的第一个匹配(即真正的 caption)
seen_labels: dict[str, dict] = {}
for cap in captions:
key = cap["label"]
if key not in seen_labels:
seen_labels[key] = cap
unique_captions = list(seen_labels.values())
extracted = 0 extracted = 0
seen_hashes: set[int] = set() manifest: dict[str, dict] = {}
# ── 第一遍:收集所有图片和表格 ── zoom = 2 # 2x 渲染,保证清晰度
image_items: list[dict] = []
table_items: list[dict] = []
for page_num in range(len(doc)): for cap in unique_captions:
page = doc[page_num] page = doc[cap["page_num"]]
page_height = page.rect.height pw = cap["page_width"]
ph = cap["page_height"]
# 1. 提取嵌入图片 if cap["type"] == "figure":
image_list = page.get_images(full=True) # Figure: caption 上方是图 → 向上找图的上边界
for img_index, img_info in enumerate(image_list): top = _find_figure_top(page, cap)
xref = img_info[0] bottom = cap["caption_y1"] + 5 # 包含 caption
try: # 水平范围:caption 宽度 + 边距(图和 caption 通常等宽)
pix = pymupdf.Pixmap(doc, xref) # 但也要考虑图内容的实际宽度
except Exception: x0 = max(0, cap["caption_x0"] - _REGION_SIDE_PADDING)
x1 = min(pw, cap["caption_x1"] + _REGION_SIDE_PADDING)
height = bottom - top
if height < _FIGURE_MIN_HEIGHT:
logger.debug(
"Figure %s too small (%.0fpt), skipping", cap["label"], height
)
continue continue
if pix.width < _MIN_DIM or pix.height < _MIN_DIM: else:
continue # Table: caption 下方是表格 → 向下找表格的下边界和水平范围
if pix.width * pix.height < _MIN_AREA: x0, bottom, x1 = _find_table_region(page, cap)
top = max(0, cap["caption_y0"] - 3) # 包含 caption,上边留少许 margin
height = bottom - top
if height < _TABLE_MIN_HEIGHT:
logger.debug(
"Table %s too small (%.0fpt), skipping", cap["label"], height
)
continue continue
img_hash = hash(pix.tobytes()[:1024]) # 渲染截取
if img_hash in seen_hashes: clip = pymupdf.Rect(x0, top, x1, bottom)
continue mat = pymupdf.Matrix(zoom, zoom)
seen_hashes.add(img_hash)
img_rects = page.get_image_rects(xref)
if not img_rects:
continue
bbox = img_rects[0]
if pix.n >= 5:
try:
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
except Exception:
continue
filename = f"page{page_num + 1}_img{img_index + 1}.png"
pix.save(str(images_dest / filename))
extracted += 1
caption_text = _extract_caption_text(page, bbox, page_height)
label = _identify_label(caption_text)
image_items.append({
"filename": filename,
"page": page_num + 1,
"y0": bbox.y0,
"caption_text": caption_text,
"label": label,
})
# 2. 提取表格截图(同时搜索上方 caption,Table 标题通常在表格上方)
try: try:
tables = page.find_tables() pix = page.get_pixmap(matrix=mat, clip=clip)
except Exception: except Exception:
tables = None logger.debug("Failed to render %s region for %s", cap["label"], arxiv_id)
continue
if tables and tables.tables: filename = f"{cap['label'].replace(' ', '_').lower()}.png"
for table_index, table in enumerate(tables.tables): pix.save(str(images_dest / filename))
bbox = table.bbox extracted += 1
if not bbox:
continue
margin = 5 cap_preview = cap["caption_text"][:200] if cap["caption_text"] else ""
if hasattr(bbox, 'x0'): manifest[filename] = {
x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 "page": cap["page_num"] + 1,
table_rect = bbox "type": cap["type"],
else: "label": cap["label"],
x0, y0, x1, y1 = bbox "caption_text": cap_preview,
table_rect = pymupdf.Rect(x0, y0, x1, y1) "figures" if cap["type"] == "figure" else "tables": [cap["label"]],
clip_rect = pymupdf.Rect( }
x0 - margin, y0 - margin, x1 + margin, y1 + margin logger.debug(
) "Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s",
cap["label"], cap["page_num"] + 1,
zoom = 2 x0, top, x1, bottom, height, filename,
mat = pymupdf.Matrix(zoom, zoom) )
try:
pix = page.get_pixmap(matrix=mat, clip=clip_rect)
except Exception:
continue
if pix.width < _MIN_DIM * 2 or pix.height < 30 * 2:
continue
filename = f"page{page_num + 1}_table{table_index + 1}.png"
pix.save(str(images_dest / filename))
extracted += 1
# Table caption 上下都搜(学术论文惯例:Table 标题在上方,但实际排版各异)
caption_text = _extract_caption_text(
page, table_rect, page_height, search_both=True,
)
label = _identify_label(caption_text)
item = {
"filename": filename,
"page": page_num + 1,
"y0": y0,
"caption_text": caption_text,
"label": label,
}
# 关键:caption 标注为 Figure → 重分类为图片
if _is_figure_caption(caption_text):
image_items.append(item)
else:
table_items.append(item)
doc.close() doc.close()
# ── 第二遍:矫正 find_tables 的误判 ──
# 如果表格与同页的图片高度重叠(复合图表的子区域),且 caption 不含 "Table"
# 则重分类为图片,归入邻近图片的 label
for t_item in table_items[:]:
t_page = t_item["page"]
t_y0 = t_item["y0"]
same_page_images = [i for i in image_items if i["page"] == t_page]
if not same_page_images:
continue
# 检查是否有重叠的图片
nearby = [
i for i in same_page_images
if abs(i["y0"] - t_y0) < 50
]
if nearby and not (t_item["caption_text"] and _TABLE_CAPTION_RE.search(t_item["caption_text"])):
# 重分类为图片,继承邻近图片的 label
neighbor_label = nearby[0].get("label")
t_item["label"] = neighbor_label
image_items.append(t_item)
table_items.remove(t_item)
# ── 第三遍:按 (page, y0) 排序 → 序号匹配兜底 ──
image_items.sort(key=lambda it: (it["page"], it["y0"]))
table_items.sort(key=lambda it: (it["page"], it["y0"]))
# 统计已通过 caption 确认的 Figure/Table 编号,避免序号重复分配
used_figure_nums: set[int] = set()
used_table_nums: set[int] = set()
for item in image_items:
if item["label"]:
m = _FIGURE_CAPTION_RE.search(item["label"])
if m:
used_figure_nums.add(int(m.group(1)))
for item in table_items:
if item["label"]:
m = _TABLE_CAPTION_RE.search(item["label"])
if m:
used_table_nums.add(int(m.group(1)))
# 为未识别编号的图片分配序号(跳过已占用的编号)
next_fig = 1
for item in image_items:
if item["label"] is None:
while next_fig in used_figure_nums:
next_fig += 1
item["label"] = f"Figure {next_fig}"
used_figure_nums.add(next_fig)
next_tbl = 1
for item in table_items:
if item["label"] is None:
while next_tbl in used_table_nums:
next_tbl += 1
item["label"] = f"Table {next_tbl}"
used_table_nums.add(next_tbl)
# ── 第三遍:构建 manifest ──
manifest: dict[str, dict] = {}
for item in image_items:
manifest[item["filename"]] = {
"page": item["page"],
"type": "image",
"label": item["label"],
"caption_text": item.get("caption_text"),
"figures": [item["label"]],
}
for item in table_items:
manifest[item["filename"]] = {
"page": item["page"],
"type": "table",
"label": item["label"],
"caption_text": item.get("caption_text"),
"tables": [item["label"]],
}
# 保存 manifest # 保存 manifest
manifest_path = images_dest / "manifest.json" manifest_path = images_dest / "manifest.json"
manifest_path.write_text( manifest_path.write_text(
json.dumps(manifest, ensure_ascii=False, indent=2) json.dumps(manifest, ensure_ascii=False, indent=2)
) )
captioned = sum(
1 for it in image_items + table_items if it["caption_text"]
)
label_matched = sum(
1 for it in image_items + table_items
if it["caption_text"] and _identify_label(it["caption_text"])
)
if extracted > 0: if extracted > 0:
logger.info( logger.info(
"Extracted %d items from PDF for %s " "Extracted %d figure/table screenshots from PDF for %s "
"(%d images, %d tables, %d with captions, %d label-matched)", "(from %d captions found, %d unique)",
extracted, arxiv_id, extracted, arxiv_id, len(captions), len(unique_captions),
len(image_items), len(table_items), captioned, label_matched,
) )
return extracted return extracted
+33
View File
@@ -12,6 +12,7 @@ from zoneinfo import ZoneInfo
from app.config import settings from app.config import settings
from app.database import SessionLocal from app.database import SessionLocal
from app.services.pipeline import run_pipeline from app.services.pipeline import run_pipeline
from app.services.crawler import refresh_upvotes
from app.utils import today_str from app.utils import today_str
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -66,6 +67,22 @@ def start_scheduler() -> AsyncIOScheduler | None:
misfire_grace_time=3600, # 允许迟到 1 小时内补执行 misfire_grace_time=3600, # 允许迟到 1 小时内补执行
) )
# Upvote refresh: runs once a day, 30 minutes after the daily pipeline.
# The offset is computed in total minutes and wrapped, because CronTrigger
# only accepts minute 0-59 and hour 0-23; a plain SCHEDULE_MINUTE + 30 is
# out of range whenever SCHEDULE_MINUTE >= 30.
upvote_total_minutes = settings.SCHEDULE_HOUR * 60 + settings.SCHEDULE_MINUTE + 30
upvote_trigger = CronTrigger(
    hour=(upvote_total_minutes // 60) % 24,
    minute=upvote_total_minutes % 60,
    timezone=tz,
)
scheduler.add_job(
    _upvote_refresh,
    trigger=upvote_trigger,
    id="upvote_refresh",
    name="upvote_refresh",
    replace_existing=True,
    max_instances=1,
    misfire_grace_time=3600,  # same one-hour grace as the daily pipeline job
)
scheduler.start() scheduler.start()
_scheduler = scheduler _scheduler = scheduler
logger.info( logger.info(
@@ -102,3 +119,19 @@ async def _daily_pipeline() -> None:
logger.exception("Unexpected error in daily pipeline") logger.exception("Unexpected error in daily pipeline")
finally: finally:
db.close() db.close()
async def _upvote_refresh() -> None:
    """Scheduler job: refresh upvotes using the default refresh window.

    Opens its own session, logs the outcome of ``refresh_upvotes`` and
    never lets an exception escape; the session is always closed.
    """
    session: Session = SessionLocal()
    try:
        outcome = await refresh_upvotes(session)
        status = outcome.get("status")
        updated_count = outcome.get("updated", 0)
        logger.info(
            "Upvote refresh completed: status=%s updated=%d",
            status,
            updated_count,
        )
    except Exception:
        # Top-level job boundary: log with traceback and swallow.
        logger.exception("Unexpected error in upvote refresh")
    finally:
        session.close()
+5 -15
View File
@@ -138,20 +138,7 @@ a:hover {
} }
/* ── Date Quick Nav ─────────────────────────────────────────────── */ /* ── Date Quick Nav ─────────────────────────────────────────────── */
.date-quick-nav {
margin-top: 32px;
padding-top: 16px;
border-top: 1px solid var(--border);
font-size: 0.85rem;
color: var(--ink-light);
display: flex;
align-items: center;
gap: 8px;
flex-wrap: wrap;
}
/* ── Chips (shared) ─────────────────────────────────────────────── */ /* ── Chips (shared) ─────────────────────────────────────────────── */
.date-chip,
.tag-chip, .tag-chip,
.filter-chip { .filter-chip {
display: inline-block; display: inline-block;
@@ -162,14 +149,12 @@ a:hover {
font-size: 0.8rem; font-size: 0.8rem;
color: var(--ink-light); color: var(--ink-light);
} }
.date-chip:hover,
.tag-chip:hover, .tag-chip:hover,
.filter-chip:hover { .filter-chip:hover {
border-color: var(--accent); border-color: var(--accent);
color: var(--accent); color: var(--accent);
text-decoration: none; text-decoration: none;
} }
.date-chip.active,
.tag-chip.active, .tag-chip.active,
.filter-chip.active { .filter-chip.active {
background: var(--accent); background: var(--accent);
@@ -352,6 +337,11 @@ a:hover {
margin-bottom: 12px; margin-bottom: 12px;
} }
.detail-upvote-time {
font-size: 0.78rem;
color: var(--border);
}
.detail-tags { .detail-tags {
margin-bottom: 12px; margin-bottom: 12px;
display: flex; display: flex;
+12
View File
@@ -33,6 +33,7 @@
<button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button> <button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
<button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button> <button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
<button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button> <button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
<button class="admin-action-btn" onclick="refreshUpvotes()">👍 刷新投票</button>
</div> </div>
<div class="admin-info-grid"> <div class="admin-info-grid">
@@ -59,6 +60,10 @@
<span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span> <span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span>
</div> </div>
{% endif %} {% endif %}
<div class="info-row">
<span class="info-label">投票刷新</span>
<span class="info-value">每日自动刷新最近 {{ stats.upvote_refresh_days | default(7) }} 天</span>
</div>
{% if stats.active_locks %} {% if stats.active_locks %}
<div class="info-row"> <div class="info-row">
<span class="info-label">活跃任务</span> <span class="info-label">活跃任务</span>
@@ -181,5 +186,12 @@
.then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); }) .then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); })
.catch(err => showToast("❌ 请求失败")); .catch(err => showToast("❌ 请求失败"));
} }
// Trigger a manual upvote refresh from the admin dashboard and show the
// outcome as a toast. A failure is recognised from the HTTP status or from
// an `error` / `detail` field in the JSON body, so an error response is
// never reported as a success.
function refreshUpvotes() {
    fetch("/admin/refresh-upvotes", { method: "POST", headers: { "Content-Type": "application/json" } })
        .then(r => {
            // 303 / 401 are treated as "not authenticated": go to the login page.
            if (r.status === 303 || r.status === 401) { window.location.href = "/admin/login"; return null; }
            return r.json().then(data => ({ ok: r.ok, status: r.status, data: data }));
        })
        .then(res => {
            if (!res) return;
            const data = res.data || {};
            const failure = data.error || data.detail || (res.ok ? null : "HTTP " + res.status);
            if (failure) {
                // `detail` may be a structured value (e.g. validation errors), not a string.
                const text = typeof failure === "string" ? failure : JSON.stringify(failure);
                showToast("❌ " + text.substring(0, 200));
                return;
            }
            showToast(`✅ 已刷新 ${data.updated || 0} 篇论文投票`);
        })
        .catch(err => showToast("❌ 请求失败"));
}
</script> </script>
{% endblock %} {% endblock %}
+3
View File
@@ -22,6 +22,9 @@ endblock %} {% block content %}
>📅 {{ paper.published_at or paper.paper_date }}</span >📅 {{ paper.published_at or paper.paper_date }}</span
> >
<span class="detail-upvotes">👍 {{ paper.upvotes }}</span> <span class="detail-upvotes">👍 {{ paper.upvotes }}</span>
{% if paper.crawled_at %}
<span class="detail-upvote-time" title="投票数据更新时间">{{ paper.crawled_at.strftime('%m-%d %H:%M') }}</span>
{% endif %}
</div> </div>
{# 标签 #} {% if paper.tags %} {# 标签 #} {% if paper.tags %}
-10
View File
@@ -23,16 +23,6 @@ endblock %} {% block content %}
</div> </div>
{% endif %} {% endif %}
<div class="date-quick-nav">
<span>有数据的日期:</span>
{% for d in available_dates[:10] %}
<a
href="/day/{{ d }}"
class="date-chip {% if d == current_date %}active{% endif %}"
>{{ d }}</a
>
{% endfor %}
</div>
{% endblock %} {% endblock %}
{% block scripts %} {% block scripts %}
+1 -1
View File
@@ -20,7 +20,7 @@
{% endif %} {% endif %}
</a> </a>
</h2> </h2>
<span class="paper-upvotes">👍 {{ paper.upvotes }}</span> <span class="paper-upvotes" title="数据更新于 {{ paper.crawled_at.strftime('%m-%d %H:%M') if paper.crawled_at else '' }}">👍 {{ paper.upvotes }}</span>
{% if variant == 'search' and distances and paper.arxiv_id in distances %} {% if variant == 'search' and distances and paper.arxiv_id in distances %}
<span class="similarity-score" title="语义相似度距离"> <span class="similarity-score" title="语义相似度距离">
🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }} 🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}
+7
View File
@@ -57,6 +57,13 @@ def yesterday_str() -> str:
return yesterday.isoformat() return yesterday.isoformat()
def recent_date_strs(n: int) -> list[str]:
    """Return ISO date strings for the most recent ``n`` days, newest first.

    The first entry is today's date in ``settings.APP_TIMEZONE``; each
    following entry is one day earlier. ``n <= 0`` yields an empty list.
    """
    anchor = datetime.now(ZoneInfo(settings.APP_TIMEZONE)).date()
    result: list[str] = []
    for offset in range(n):
        day = anchor - timedelta(days=offset)
        result.append(day.isoformat())
    return result
def latest_paper_date(db) -> str: def latest_paper_date(db) -> str:
"""查询数据库中最新的 paper_date,无数据时回退到 today_str()。""" """查询数据库中最新的 paper_date,无数据时回退到 today_str()。"""
from sqlalchemy import func, select from sqlalchemy import func, select