feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI

- PDF extractor: rewrite from embedded bitmap extraction to caption-based
  page region screenshots. Finds Figure/Table captions via regex, captures the
  page region above (figures) or below (tables) the caption, handles compound
  figures and vector graphics.
- Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for recent
  N days without inserting new papers. Scheduler runs daily 30min after pipeline.
- Admin: add /admin/refresh-upvotes endpoint and dashboard button.
- UI: remove date quick nav, show upvote update time on detail/card pages,
  clean up CSS date-chip styles.
- Utils: add recent_date_strs() helper.
This commit is contained in:
2026-06-09 18:01:01 +08:00
parent b72b5a31bb
commit 1fc6303e09
13 changed files with 460 additions and 311 deletions
+1
View File
@@ -41,6 +41,7 @@ class Settings(BaseSettings):
SCHEDULE_HOUR: int = 4
SCHEDULE_MINUTE: int = 0
APP_WORKERS: int = 1
UPVOTE_REFRESH_DAYS: int = 7 # 刷新最近 N 天论文的 upvotes
# 数据库
DATABASE_URL: str = "sqlite:///data/db/papers.db"
+19 -2
View File
@@ -26,7 +26,7 @@ from app.models import (
)
from app.services.admin import get_admin_stats
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily
from app.services.crawler import crawl_daily, refresh_upvotes
from app.services.pipeline import run_pipeline
from app.services.scheduler import get_scheduler
from app.services.summarizer import summarize_batch, summarize_single
@@ -129,16 +129,20 @@ async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
"""调度器运行状态(JSON)。"""
scheduler = get_scheduler()
next_run = None
upvote_next_run = None
if scheduler:
    # Scan every job instead of stopping at the first hit: breaking out
    # after "daily_pipeline" would leave upvote_next_run unset whenever the
    # upvote job is listed after the pipeline job.
    for job in scheduler.get_jobs():
        if job.id == "daily_pipeline":
            next_run = job.next_run_time
        elif job.id == "upvote_refresh":
            upvote_next_run = job.next_run_time
return {
"enabled": scheduler is not None,
"schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
"timezone": settings.APP_TIMEZONE,
"next_run": next_run.isoformat() if next_run else None,
"upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
}
@@ -159,6 +163,19 @@ async def admin_trigger_pipeline(
return {"status": "success", "message": "流水线执行完成"}
@router.post("/refresh-upvotes")
async def admin_refresh_upvotes(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    days: int | None = Query(
        None,
        ge=1,  # reject 0 / negative windows at the boundary
        description="刷新最近 N 天,默认使用配置值",
    ),
):
    """Manually refresh the upvotes of papers from the most recent N days.

    Args:
        _admin: Admin guard dependency; its value is unused.
        db: Database session injected per request.
        days: Optional size of the refresh window in days (>= 1). When
            omitted, ``refresh_upvotes`` picks its own default.

    Returns:
        The result dict produced by ``refresh_upvotes``.

    Raises:
        HTTPException: 500 when the refresh reports status ``"failed"``.
    """
    result = await refresh_upvotes(db, days=days)
    if result["status"] == "failed":
        # Always send a non-empty detail so the client has something to show.
        raise HTTPException(
            status_code=500,
            detail=result.get("error") or "upvote refresh failed",
        )
    return result
# ── 请求模型 ──────────────────────────────────────────────────────────
+8 -3
View File
@@ -315,11 +315,16 @@ def _link_figures_with_images(
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
# 提取的图片按类型分流,按文件名排序
# 提取的图片按类型分流,按文件名中的编号排序
def _sort_key(name: str) -> tuple[int, int]:
m = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
# 新格式:figure_1.png, table_1.png
m = re.search(r'(?:figure|table)_(\d+)', name)
if m:
return (int(m.group(1)), int(m.group(2)))
return (0, int(m.group(1)))
# 旧格式:page2_img1.png, page5_table1.png
m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
if m2:
return (int(m2.group(1)), int(m2.group(2)))
return (0, 0)
fig_images = sorted(
+1
View File
@@ -106,4 +106,5 @@ def get_admin_stats(db: Session) -> dict:
"next_run": next_run.isoformat() if next_run else None,
"recent_logs": recent_logs,
"active_locks": active_locks,
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
}
+79 -1
View File
@@ -16,7 +16,7 @@ from app.models import (
SummaryState,
SummaryStatus,
)
from app.utils import make_http_client, utc_now
from app.utils import make_http_client, recent_date_strs, utc_now
logger = logging.getLogger(__name__)
@@ -205,3 +205,81 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
log_entry.completed_at = utc_now()
db.commit()
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
async def refresh_upvotes(db: Session, days: int | None = None) -> dict:
    """Re-fetch upvotes for papers of the most recent days; never inserts papers.

    Calls ``fetch_daily`` once per date and passes the payload to
    ``_update_upvotes_only``. A failure for one date is recorded and the
    remaining dates are still processed. The run is tracked in a
    ``CrawlLog`` row with task ``"upvote_refresh"``.

    Args:
        db: Database session used for the log row and the updates.
        days: Number of most recent days to refresh. ``None`` or a value
            below 1 falls back to ``settings.UPVOTE_REFRESH_DAYS``.

    Returns:
        A dict with ``"status"`` (``"success"``, ``"partial"`` when some
        dates failed, ``"failed"`` when every date failed or the run
        aborted), ``"updated"``, and, depending on the outcome, ``"days"``,
        ``"errors"`` (per-date messages or ``None``) and ``"error"``.
    """
    import json

    # None, 0 and negative values all use the configured window; a negative
    # value would otherwise yield an empty date list and crash below.
    if days is None or days < 1:
        days = settings.UPVOTE_REFRESH_DAYS
    date_strs = recent_date_strs(days)
    if not date_strs:
        # Only reachable when the configured default itself is not positive.
        return {
            "status": "failed",
            "updated": 0,
            "days": days,
            "error": f"invalid refresh window: {days} days",
        }

    log_entry = CrawlLog(
        task="upvote_refresh",
        status="running",
        date=date_type.fromisoformat(date_strs[0]),
        started_at=utc_now(),
    )
    db.add(log_entry)
    db.commit()

    total_updated = 0
    errors: list[str] = []

    try:
        for ds in date_strs:
            try:
                raw_papers = await fetch_daily(ds)
                updated = _update_upvotes_only(db, raw_papers)
            except Exception as exc:
                # Reset the session so a failed flush/commit for this date
                # does not make every later date fail as well.
                db.rollback()
                errors.append(f"{ds}: {exc}")
                logger.warning("Failed to refresh upvotes for %s: %s", ds, exc)
            else:
                total_updated += updated
                logger.info("Refreshed upvotes for %s: %d papers", ds, updated)

        if not errors:
            status = "success"
        elif len(errors) < len(date_strs):
            status = "partial"
        else:
            # Nothing succeeded: report a failure rather than "partial".
            status = "failed"

        log_entry.status = status
        log_entry.papers_found = total_updated
        log_entry.details_json = json.dumps({"days": days, "errors": len(errors)})
        if status == "failed":
            log_entry.error = "; ".join(errors)
        log_entry.completed_at = utc_now()
        db.commit()

        result = {
            "status": status,
            "updated": total_updated,
            "days": days,
            "errors": errors or None,
        }
        if status == "failed":
            result["error"] = "; ".join(errors)
        return result
    except Exception as exc:
        logger.exception("Upvote refresh failed")
        # Discard any half-applied state before writing the failure record.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)
        log_entry.completed_at = utc_now()
        db.commit()
        return {"status": "failed", "updated": total_updated, "error": str(exc)}
def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
    """Update upvotes of papers that already exist; unknown papers are skipped.

    Args:
        db: Database session used for the lookup and the commit.
        papers_raw: Raw paper items; each is passed through ``_parse_paper``.

    Returns:
        Number of existing ``Paper`` rows whose ``upvotes`` and
        ``crawled_at`` were updated.
    """
    now = datetime.now(timezone.utc)

    # arxiv_id -> upvote count from the payload (a later duplicate wins).
    upvotes_by_id: dict = {}
    for item in papers_raw:
        meta = _parse_paper(item)
        arxiv_id = meta["arxiv_id"]
        if arxiv_id:
            upvotes_by_id[arxiv_id] = meta["upvotes"]

    if not upvotes_by_id:
        return 0

    # One IN query instead of one SELECT per paper.
    existing_papers = db.execute(
        select(Paper).where(Paper.arxiv_id.in_(list(upvotes_by_id)))
    ).scalars().all()

    for paper in existing_papers:
        paper.upvotes = upvotes_by_id[paper.arxiv_id]
        paper.crawled_at = now

    if existing_papers:
        db.commit()
    return len(existing_papers)
+291 -279
View File
@@ -1,12 +1,12 @@
"""PDF 图片与表格提取 — 从 PDF 中提取嵌入图片和表格截图。
"""PDF 图片与表格提取 — 基于 caption 定位的页面区域截图。
策略:
1. 提取 PDF 中嵌入的图片(图表、插图等),按页面位置排序
2. 检测表格区域,渲染为截图
3. 为每张图/表格提取附近的说明文字(caption),从中识别 Figure N / Table N
4. 根据 caption 内容矫正类型:标注为 "Figure" 的表格区域 → 归为图片
5. 序号匹配兜底:第 N 张图 → Figure N(学术论文图表严格按顺序出现
6. 保存 manifest.json 供后续与 AI 总结的 figures 字段匹配
核心思路:学术论文排版极其规整,Figure caption 在图下方,Table caption 在表格上方。
因此反过来:先找 caption 文字 → 向上/向下截取页面区域 → 渲染为 PNG。
优势(相比提取嵌入位图):
- 复合图表不会被拆成碎片(整块截取)
- 矢量图也能截取(页面渲染包含一切
- 不依赖 find_tables()(纯文本匹配 caption
"""
from __future__ import annotations
@@ -21,124 +21,252 @@ from app.utils import TMP_DIR
logger = logging.getLogger(__name__)
# 最小面积阈值(像素),小于此值的图片视为图标/装饰
_MIN_AREA = 10_000 # ~100x100
_MIN_DIM = 80
# ── 截取区域参数 ───────────────────────────────────────────────────────
# Caption 搜索区域 — Figure caption 在图下方,Table caption 在图上方
_CAPTION_MARGIN = 10 # 贴边距离
_CAPTION_MAX_DISTANCE = 250 # 最远搜索距离
_CAPTION_SIDE_PADDING = 40 # 左右扩展
# Figure: caption 上方搜索图的范围(点)
_FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围
_FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度
_FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度
# Figure/Table 标注正则
_FIGURE_CAPTION_RE = re.compile(
r'\b(?:Fig\.?|Figure)\s*(\d+)\b', re.IGNORECASE
# Table: caption 下方搜索表格的范围
_TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围
_TABLE_MIN_HEIGHT = 30
# caption 左右扩展(双栏论文中 caption 可能比表格窄)
_REGION_SIDE_PADDING = 10
# 表格通常比 caption 文字宽,使用更大的水平扩展
_TABLE_SIDE_PADDING = 60
# 正文行距的 2 倍 ≈ 空白间隙阈值
_CONTENT_GAP_THRESHOLD = 30
# ── Caption 正则 ───────────────────────────────────────────────────────
# 要求以 Figure/Table 开头(避免匹配正文中的 "see Figure 3" 等)
_CAPTION_RE = re.compile(
r'^(?:Fig\.?|Figure)\s+(\d+)\s*[:\.]',
re.IGNORECASE,
)
_TABLE_CAPTION_RE = re.compile(
r'\bTable\s*(\d+)\b', re.IGNORECASE
r'^Table\s+(\d+)\s*[:\.]',
re.IGNORECASE,
)
def _find_captions(doc) -> list[dict]:
    """Scan the whole document and collect every Figure/Table caption.

    Only the first line of each text block is tested, and the pattern must
    match at the start of that line, so blocks that merely mention a figure
    or table further in are not picked up.

    Args:
        doc: Opened PDF document; it is indexed by page number and each page
            provides ``rect`` and ``get_text("blocks")``.

    Returns:
        One dict per caption, in document order, with the keys ``type``
        (``"figure"`` or ``"table"``), ``num``, ``label``, ``page_num``
        (0-based), ``caption_x0/y0/x1/y1`` (bounding box of the caption
        block), ``caption_text`` (full block text), ``page_width`` and
        ``page_height``.
    """
    # (pattern, type, label prefix); the figure pattern is tried first.
    matchers = (
        (_CAPTION_RE, "figure", "Figure"),
        (_TABLE_CAPTION_RE, "table", "Table"),
    )
    captions: list[dict] = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_width = page.rect.width
        page_height = page.rect.height

        for block in page.get_text("blocks"):
            if len(block) < 5:
                continue
            text = str(block[4]).strip()
            if not text:
                continue

            bx0, by0, bx1, by1 = block[:4]
            # Match on the first line only, so extra paragraphs merged into
            # the same block cannot interfere.
            first_line = text.split("\n")[0].strip()

            for pattern, kind, prefix in matchers:
                m = pattern.match(first_line)
                if m is None:
                    continue
                captions.append({
                    "type": kind,
                    "num": int(m.group(1)),
                    "label": f"{prefix} {m.group(1)}",
                    "page_num": page_num,
                    "caption_y0": by0,
                    "caption_y1": by1,
                    "caption_x0": bx0,
                    "caption_x1": bx1,
                    "caption_text": text,
                    "page_width": page_width,
                    "page_height": page_height,
                })
                break

    return captions
def _find_figure_top(page, caption: dict) -> float:
    """Scan upward from a figure caption and return the figure's top edge.

    Strategy:
      1. Collect every content block (text blocks and embedded images) that
         ends above the caption, within ``_FIGURE_MAX_HEIGHT`` of it, and
         overlaps the caption's column horizontally.
      2. Walk those blocks from the caption upward; the first vertical gap
         larger than ``_CONTENT_GAP_THRESHOLD`` marks the top of the figure.
      3. Without such a gap, the figure starts at the top of the topmost
         collected block.
      4. Without any block at all, fall back to ``_FIGURE_DEFAULT_HEIGHT``.

    Text blocks alone are not enough because the figure itself is usually an
    image, which is why ``get_image_info()`` is consulted as well.

    Args:
        page: Page that contains the caption.
        caption: Caption dict; reads ``caption_y0``, ``caption_x0`` and
            ``caption_x1``.

    Returns:
        The y coordinate of the figure's top edge, never below 0 and never
        more than ``_FIGURE_MAX_HEIGHT`` above the caption.
    """
    caption_y = caption["caption_y0"]
    cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING
    cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING
    scan_limit = caption_y - _FIGURE_MAX_HEIGHT

    def _is_candidate(x0: float, y0: float, x1: float, y1: float) -> bool:
        """True when the box ends above the caption, in range, in the same column."""
        return scan_limit < y1 <= caption_y and x1 > cx0 and x0 < cx1

    # Each element: (x0, y0, x1, y1)
    above_blocks: list[tuple[float, float, float, float]] = []

    # 1. Text blocks
    for b in page.get_text("blocks"):
        if len(b) < 5:
            continue
        box = (b[0], b[1], b[2], b[3])
        if _is_candidate(*box):
            above_blocks.append(box)

    # 2. Embedded images: the figure itself is usually an image, not text
    for img_info in page.get_image_info():
        bbox = img_info.get("bbox")
        if bbox is None:
            continue
        # "bbox" is rect_like; positional unpacking works for a plain tuple
        # as well as for a Rect, whereas attribute access (bbox.x0) fails on
        # tuples.
        ix0, iy0, ix1, iy1 = bbox
        if _is_candidate(ix0, iy0, ix1, iy1):
            above_blocks.append((ix0, iy0, ix1, iy1))

    # No content found: use the default height (possibly a pure vector figure)
    if not above_blocks:
        return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)

    # Order by top edge, closest to the caption first.
    above_blocks.sort(key=lambda b: b[1], reverse=True)

    # Typical layout: [body text] ...gap... [figure content] [caption]
    # Default: every block is tightly stacked, so the figure starts at the
    # top of the topmost block.
    figure_top = above_blocks[-1][1]
    prev_bottom = caption_y
    for b in above_blocks:
        # NOTE(review): when even the nearest block is further away than the
        # threshold, the result is just above the caption, i.e. a region
        # barely taller than the caption itself. Confirm this is intended
        # for vector-only figures that sit below body text.
        if prev_bottom - b[3] > _CONTENT_GAP_THRESHOLD:
            # Large gap: the figure begins right below it.
            figure_top = prev_bottom - 5
            break
        # Small gap: this block belongs to (or touches) the figure.
        prev_bottom = b[1]

    # Cap the height, then keep the result on the page.
    if caption_y - figure_top > _FIGURE_MAX_HEIGHT:
        figure_top = caption_y - _FIGURE_MAX_HEIGHT
    return max(0, figure_top)
def _find_table_region(page, caption: dict) -> tuple[float, float, float]:
    """Scan downward from a table caption and return the table's crop bounds.

    Strategy:
      1. Collect the text blocks that start below the caption, within
         ``_TABLE_MAX_HEIGHT`` of it, and overlap the caption's column
         widened by ``_TABLE_SIDE_PADDING``.
      2. Walk them top to bottom; the contiguous run ends at the first
         vertical gap larger than ``_CONTENT_GAP_THRESHOLD``.
      3. Take the horizontal extent from the caption and from the blocks of
         that run only, so text after the gap cannot widen the crop.

    Args:
        page: Page that contains the caption.
        caption: Caption dict; reads ``caption_y1``, ``caption_x0``,
            ``caption_x1``, ``page_height`` and ``page_width``.

    Returns:
        ``(x0, bottom, x1)``: the left, bottom and right edges of the crop
        region. The top edge is chosen by the caller from the caption
        position.
    """
    caption_y = caption["caption_y1"]  # scanning starts at the caption bottom
    caption_x0 = caption["caption_x0"]
    caption_x1 = caption["caption_x1"]
    page_height = caption["page_height"]
    page_width = caption["page_width"]

    # Search a wider column first: tables are often wider than their caption.
    search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING)
    search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING)

    below_blocks: list[tuple[float, float, float, float]] = []
    for b in page.get_text("blocks"):
        if len(b) < 5:
            continue
        bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
        if caption_y < by0 < caption_y + _TABLE_MAX_HEIGHT:
            if bx1 > search_x0 and bx0 < search_x1:
                below_blocks.append((bx0, by0, bx1, by1))

    if not below_blocks:
        # Nothing below the caption: minimal height, caption width.
        return (
            max(0, caption_x0 - _REGION_SIDE_PADDING),
            min(page_height, caption_y + _TABLE_MIN_HEIGHT),
            min(page_width, caption_x1 + _REGION_SIDE_PADDING),
        )

    # Find the contiguous run of blocks directly below the caption.
    below_blocks.sort(key=lambda b: b[1])
    table_blocks: list[tuple[float, float, float, float]] = []
    prev_y = caption_y
    for b in below_blocks:
        if b[1] - prev_y > _CONTENT_GAP_THRESHOLD:
            break
        table_blocks.append(b)
        prev_y = b[3]
    bottom = prev_y + 5  # small margin below the last included block

    # Cap the height.
    if bottom - caption_y > _TABLE_MAX_HEIGHT:
        bottom = caption_y + _TABLE_MAX_HEIGHT

    # Horizontal extent: caption width, widened by the included blocks.
    content_x0 = min([caption_x0] + [b[0] for b in table_blocks])
    content_x1 = max([caption_x1] + [b[2] for b in table_blocks])

    # Add a margin but stay on the page.
    x0 = max(0, content_x0 - _REGION_SIDE_PADDING)
    x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING)

    return (x0, bottom, x1)
def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
"""从 PDF 提取嵌入图片和表格截图,生成 manifest。
"""从 PDF 提取 Figure/Table 截图,生成 manifest。
匹配策略:
1. 提取图片→提取 caption 文字→从中识别 Figure/Table 编号
2. 表格区域若 caption 标注为 "Figure",则重分类为图片
3. 未能从 caption 识别编号的,按(页码, 纵向位置)排序后用序号匹配兜底
策略:找 caption → 定位区域 → 渲染页面截图。
Args:
arxiv_id: 论文 ID
pdf_path: PDF 路径,默认 data/tmp/{arxiv_id}/paper.pdf
Returns:
提取的图片+表格数量
提取的图片数量
"""
import pymupdf
@@ -153,216 +281,100 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
images_dest.mkdir(parents=True, exist_ok=True)
doc = pymupdf.open(str(pdf_path))
captions = _find_captions(doc)
if not captions:
logger.info("No Figure/Table captions found in PDF for %s", arxiv_id)
doc.close()
return 0
# 去重:同一页同一 label 可能匹配到多个 block(如正文引用 "Figure 7"
# 保留每个 (type, num) 的第一个匹配(即真正的 caption)
seen_labels: dict[str, dict] = {}
for cap in captions:
key = cap["label"]
if key not in seen_labels:
seen_labels[key] = cap
unique_captions = list(seen_labels.values())
extracted = 0
seen_hashes: set[int] = set()
manifest: dict[str, dict] = {}
# ── 第一遍:收集所有图片和表格 ──
image_items: list[dict] = []
table_items: list[dict] = []
zoom = 2 # 2x 渲染,保证清晰度
for page_num in range(len(doc)):
page = doc[page_num]
page_height = page.rect.height
for cap in unique_captions:
page = doc[cap["page_num"]]
pw = cap["page_width"]
ph = cap["page_height"]
# 1. 提取嵌入图片
image_list = page.get_images(full=True)
for img_index, img_info in enumerate(image_list):
xref = img_info[0]
try:
pix = pymupdf.Pixmap(doc, xref)
except Exception:
if cap["type"] == "figure":
# Figure: caption 上方是图 → 向上找图的上边界
top = _find_figure_top(page, cap)
bottom = cap["caption_y1"] + 5 # 包含 caption
# 水平范围:caption 宽度 + 边距(图和 caption 通常等宽)
# 但也要考虑图内容的实际宽度
x0 = max(0, cap["caption_x0"] - _REGION_SIDE_PADDING)
x1 = min(pw, cap["caption_x1"] + _REGION_SIDE_PADDING)
height = bottom - top
if height < _FIGURE_MIN_HEIGHT:
logger.debug(
"Figure %s too small (%.0fpt), skipping", cap["label"], height
)
continue
if pix.width < _MIN_DIM or pix.height < _MIN_DIM:
continue
if pix.width * pix.height < _MIN_AREA:
else:
# Table: caption 下方是表格 → 向下找表格的下边界和水平范围
x0, bottom, x1 = _find_table_region(page, cap)
top = max(0, cap["caption_y0"] - 3) # 包含 caption,上边留少许 margin
height = bottom - top
if height < _TABLE_MIN_HEIGHT:
logger.debug(
"Table %s too small (%.0fpt), skipping", cap["label"], height
)
continue
img_hash = hash(pix.tobytes()[:1024])
if img_hash in seen_hashes:
continue
seen_hashes.add(img_hash)
img_rects = page.get_image_rects(xref)
if not img_rects:
continue
bbox = img_rects[0]
if pix.n >= 5:
try:
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
except Exception:
continue
filename = f"page{page_num + 1}_img{img_index + 1}.png"
pix.save(str(images_dest / filename))
extracted += 1
caption_text = _extract_caption_text(page, bbox, page_height)
label = _identify_label(caption_text)
image_items.append({
"filename": filename,
"page": page_num + 1,
"y0": bbox.y0,
"caption_text": caption_text,
"label": label,
})
# 2. 提取表格截图(同时搜索上方 caption,Table 标题通常在表格上方)
# 渲染截取
clip = pymupdf.Rect(x0, top, x1, bottom)
mat = pymupdf.Matrix(zoom, zoom)
try:
tables = page.find_tables()
pix = page.get_pixmap(matrix=mat, clip=clip)
except Exception:
tables = None
logger.debug("Failed to render %s region for %s", cap["label"], arxiv_id)
continue
if tables and tables.tables:
for table_index, table in enumerate(tables.tables):
bbox = table.bbox
if not bbox:
continue
filename = f"{cap['label'].replace(' ', '_').lower()}.png"
pix.save(str(images_dest / filename))
extracted += 1
margin = 5
if hasattr(bbox, 'x0'):
x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
table_rect = bbox
else:
x0, y0, x1, y1 = bbox
table_rect = pymupdf.Rect(x0, y0, x1, y1)
clip_rect = pymupdf.Rect(
x0 - margin, y0 - margin, x1 + margin, y1 + margin
)
zoom = 2
mat = pymupdf.Matrix(zoom, zoom)
try:
pix = page.get_pixmap(matrix=mat, clip=clip_rect)
except Exception:
continue
if pix.width < _MIN_DIM * 2 or pix.height < 30 * 2:
continue
filename = f"page{page_num + 1}_table{table_index + 1}.png"
pix.save(str(images_dest / filename))
extracted += 1
# Table caption 上下都搜(学术论文惯例:Table 标题在上方,但实际排版各异)
caption_text = _extract_caption_text(
page, table_rect, page_height, search_both=True,
)
label = _identify_label(caption_text)
item = {
"filename": filename,
"page": page_num + 1,
"y0": y0,
"caption_text": caption_text,
"label": label,
}
# 关键:caption 标注为 Figure → 重分类为图片
if _is_figure_caption(caption_text):
image_items.append(item)
else:
table_items.append(item)
cap_preview = cap["caption_text"][:200] if cap["caption_text"] else ""
manifest[filename] = {
"page": cap["page_num"] + 1,
"type": cap["type"],
"label": cap["label"],
"caption_text": cap_preview,
"figures" if cap["type"] == "figure" else "tables": [cap["label"]],
}
logger.debug(
"Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s",
cap["label"], cap["page_num"] + 1,
x0, top, x1, bottom, height, filename,
)
doc.close()
# ── 第二遍:矫正 find_tables 的误判 ──
# 如果表格与同页的图片高度重叠(复合图表的子区域),且 caption 不含 "Table"
# 则重分类为图片,归入邻近图片的 label
for t_item in table_items[:]:
t_page = t_item["page"]
t_y0 = t_item["y0"]
same_page_images = [i for i in image_items if i["page"] == t_page]
if not same_page_images:
continue
# 检查是否有重叠的图片
nearby = [
i for i in same_page_images
if abs(i["y0"] - t_y0) < 50
]
if nearby and not (t_item["caption_text"] and _TABLE_CAPTION_RE.search(t_item["caption_text"])):
# 重分类为图片,继承邻近图片的 label
neighbor_label = nearby[0].get("label")
t_item["label"] = neighbor_label
image_items.append(t_item)
table_items.remove(t_item)
# ── 第三遍:按 (page, y0) 排序 → 序号匹配兜底 ──
image_items.sort(key=lambda it: (it["page"], it["y0"]))
table_items.sort(key=lambda it: (it["page"], it["y0"]))
# 统计已通过 caption 确认的 Figure/Table 编号,避免序号重复分配
used_figure_nums: set[int] = set()
used_table_nums: set[int] = set()
for item in image_items:
if item["label"]:
m = _FIGURE_CAPTION_RE.search(item["label"])
if m:
used_figure_nums.add(int(m.group(1)))
for item in table_items:
if item["label"]:
m = _TABLE_CAPTION_RE.search(item["label"])
if m:
used_table_nums.add(int(m.group(1)))
# 为未识别编号的图片分配序号(跳过已占用的编号)
next_fig = 1
for item in image_items:
if item["label"] is None:
while next_fig in used_figure_nums:
next_fig += 1
item["label"] = f"Figure {next_fig}"
used_figure_nums.add(next_fig)
next_tbl = 1
for item in table_items:
if item["label"] is None:
while next_tbl in used_table_nums:
next_tbl += 1
item["label"] = f"Table {next_tbl}"
used_table_nums.add(next_tbl)
# ── 第三遍:构建 manifest ──
manifest: dict[str, dict] = {}
for item in image_items:
manifest[item["filename"]] = {
"page": item["page"],
"type": "image",
"label": item["label"],
"caption_text": item.get("caption_text"),
"figures": [item["label"]],
}
for item in table_items:
manifest[item["filename"]] = {
"page": item["page"],
"type": "table",
"label": item["label"],
"caption_text": item.get("caption_text"),
"tables": [item["label"]],
}
# 保存 manifest
manifest_path = images_dest / "manifest.json"
manifest_path.write_text(
json.dumps(manifest, ensure_ascii=False, indent=2)
)
captioned = sum(
1 for it in image_items + table_items if it["caption_text"]
)
label_matched = sum(
1 for it in image_items + table_items
if it["caption_text"] and _identify_label(it["caption_text"])
)
if extracted > 0:
logger.info(
"Extracted %d items from PDF for %s "
"(%d images, %d tables, %d with captions, %d label-matched)",
extracted, arxiv_id,
len(image_items), len(table_items), captioned, label_matched,
"Extracted %d figure/table screenshots from PDF for %s "
"(from %d captions found, %d unique)",
extracted, arxiv_id, len(captions), len(unique_captions),
)
return extracted
+33
View File
@@ -12,6 +12,7 @@ from zoneinfo import ZoneInfo
from app.config import settings
from app.database import SessionLocal
from app.services.pipeline import run_pipeline
from app.services.crawler import refresh_upvotes
from app.utils import today_str
logger = logging.getLogger(__name__)
@@ -66,6 +67,22 @@ def start_scheduler() -> AsyncIOScheduler | None:
misfire_grace_time=3600, # 允许迟到 1 小时内补执行
)
# Upvote refresh: runs every day 30 minutes after the pipeline.
# A cron minute field only accepts 0-59, so carry the overflow into the hour
# (wrapping past midnight) instead of passing SCHEDULE_MINUTE + 30 directly.
upvote_hour, upvote_minute = divmod(
    settings.SCHEDULE_HOUR * 60 + settings.SCHEDULE_MINUTE + 30, 60
)
upvote_trigger = CronTrigger(
    hour=upvote_hour % 24,
    minute=upvote_minute,
    timezone=tz,
)
scheduler.add_job(
_upvote_refresh,
trigger=upvote_trigger,
id="upvote_refresh",
name="upvote_refresh",
replace_existing=True,
max_instances=1,
misfire_grace_time=3600,
)
scheduler.start()
_scheduler = scheduler
logger.info(
@@ -102,3 +119,19 @@ async def _daily_pipeline() -> None:
logger.exception("Unexpected error in daily pipeline")
finally:
db.close()
async def _upvote_refresh() -> None:
    """Scheduled job: refresh upvotes using a dedicated database session.

    Any exception is logged and swallowed; the session is always closed.
    """
    session: Session = SessionLocal()
    try:
        outcome = await refresh_upvotes(session)
        status = outcome.get("status")
        updated_count = outcome.get("updated", 0)
        logger.info(
            "Upvote refresh completed: status=%s updated=%d",
            status,
            updated_count,
        )
    except Exception:
        logger.exception("Unexpected error in upvote refresh")
    finally:
        session.close()
+5 -15
View File
@@ -138,20 +138,7 @@ a:hover {
}
/* ── Date Quick Nav ─────────────────────────────────────────────── */
.date-quick-nav {
margin-top: 32px;
padding-top: 16px;
border-top: 1px solid var(--border);
font-size: 0.85rem;
color: var(--ink-light);
display: flex;
align-items: center;
gap: 8px;
flex-wrap: wrap;
}
/* ── Chips (shared) ─────────────────────────────────────────────── */
.date-chip,
.tag-chip,
.filter-chip {
display: inline-block;
@@ -162,14 +149,12 @@ a:hover {
font-size: 0.8rem;
color: var(--ink-light);
}
.date-chip:hover,
.tag-chip:hover,
.filter-chip:hover {
border-color: var(--accent);
color: var(--accent);
text-decoration: none;
}
.date-chip.active,
.tag-chip.active,
.filter-chip.active {
background: var(--accent);
@@ -352,6 +337,11 @@ a:hover {
margin-bottom: 12px;
}
.detail-upvote-time {
font-size: 0.78rem;
color: var(--border);
}
.detail-tags {
margin-bottom: 12px;
display: flex;
+12
View File
@@ -33,6 +33,7 @@
<button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
<button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
<button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
<button class="admin-action-btn" onclick="refreshUpvotes()">👍 刷新投票</button>
</div>
<div class="admin-info-grid">
@@ -59,6 +60,10 @@
<span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span>
</div>
{% endif %}
<div class="info-row">
<span class="info-label">投票刷新</span>
<span class="info-value">每日自动刷新最近 {{ stats.upvote_refresh_days | default(7) }} 天</span>
</div>
{% if stats.active_locks %}
<div class="info-row">
<span class="info-label">活跃任务</span>
@@ -181,5 +186,12 @@
.then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); })
.catch(err => showToast("❌ 请求失败"));
}
// Trigger a manual upvote refresh and report the outcome in a toast.
function refreshUpvotes() {
  fetch("/admin/refresh-upvotes", { method: "POST", headers: { "Content-Type": "application/json" } })
    .then(r => {
      if (r.status === 303 || r.status === 401) { window.location.href = "/admin/login"; return null; }
      // Keep the HTTP outcome next to the payload: a failed refresh answers
      // with an error status, not with an `error` field.
      return r.json().then(data => ({ ok: r.ok, data: data || {} }));
    })
    .then(res => {
      if (!res) return;
      const data = res.data;
      // The endpoint raises HTTPException on failure, so the message is
      // expected under `detail`; `error` is still honoured.
      const failure = data.error || data.detail;
      if (!res.ok || failure) {
        const text = typeof failure === "string" ? failure : JSON.stringify(failure || "请求失败");
        showToast("❌ " + text.substring(0, 200));
        return;
      }
      if (data.status === "partial") {
        // Some dates failed: do not present this as a clean success.
        showToast(`⚠️ 部分日期刷新失败,已刷新 ${data.updated || 0} 篇论文投票`);
        return;
      }
      showToast(`✅ 已刷新 ${data.updated || 0} 篇论文投票`);
    })
    .catch(err => showToast("❌ 请求失败"));
}
</script>
{% endblock %}
+3
View File
@@ -22,6 +22,9 @@ endblock %} {% block content %}
>📅 {{ paper.published_at or paper.paper_date }}</span
>
<span class="detail-upvotes">👍 {{ paper.upvotes }}</span>
{% if paper.crawled_at %}
<span class="detail-upvote-time" title="投票数据更新时间">{{ paper.crawled_at.strftime('%m-%d %H:%M') }}</span>
{% endif %}
</div>
{# 标签 #} {% if paper.tags %}
-10
View File
@@ -23,16 +23,6 @@ endblock %} {% block content %}
</div>
{% endif %}
<div class="date-quick-nav">
<span>有数据的日期:</span>
{% for d in available_dates[:10] %}
<a
href="/day/{{ d }}"
class="date-chip {% if d == current_date %}active{% endif %}"
>{{ d }}</a
>
{% endfor %}
</div>
{% endblock %}
{% block scripts %}
+1 -1
View File
@@ -20,7 +20,7 @@
{% endif %}
</a>
</h2>
<span class="paper-upvotes">👍 {{ paper.upvotes }}</span>
<span class="paper-upvotes" title="数据更新于 {{ paper.crawled_at.strftime('%m-%d %H:%M') if paper.crawled_at else '' }}">👍 {{ paper.upvotes }}</span>
{% if variant == 'search' and distances and paper.arxiv_id in distances %}
<span class="similarity-score" title="语义相似度距离">
🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}
+7
View File
@@ -57,6 +57,13 @@ def yesterday_str() -> str:
return yesterday.isoformat()
def recent_date_strs(n: int) -> list[str]:
    """Return ISO date strings for the last ``n`` days, today first.

    "Today" is evaluated in ``settings.APP_TIMEZONE``.
    """
    anchor = datetime.now(ZoneInfo(settings.APP_TIMEZONE)).date()
    days = (anchor - timedelta(days=offset) for offset in range(n))
    return [day.isoformat() for day in days]
def latest_paper_date(db) -> str:
"""查询数据库中最新的 paper_date,无数据时回退到 today_str()。"""
from sqlalchemy import func, select