feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI
- PDF extractor: rewrite from embedded bitmap extraction to caption-based page region screenshots. Finds Figure/Table captions via regex, captures the page region above the caption (figures) or below it (tables), handles compound figures and vector graphics. - Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for the most recent N days without inserting new papers. Scheduler runs it daily, 30 minutes after the pipeline. - Admin: add /admin/refresh-upvotes endpoint and dashboard button. - UI: remove date quick nav, show upvote update time on detail/card pages, clean up CSS date-chip styles. - Utils: add recent_date_strs() helper.
This commit is contained in:
@@ -41,6 +41,7 @@ class Settings(BaseSettings):
|
|||||||
SCHEDULE_HOUR: int = 4
|
SCHEDULE_HOUR: int = 4
|
||||||
SCHEDULE_MINUTE: int = 0
|
SCHEDULE_MINUTE: int = 0
|
||||||
APP_WORKERS: int = 1
|
APP_WORKERS: int = 1
|
||||||
|
UPVOTE_REFRESH_DAYS: int = 7 # 刷新最近 N 天论文的 upvotes
|
||||||
|
|
||||||
# 数据库
|
# 数据库
|
||||||
DATABASE_URL: str = "sqlite:///data/db/papers.db"
|
DATABASE_URL: str = "sqlite:///data/db/papers.db"
|
||||||
|
|||||||
+19
-2
@@ -26,7 +26,7 @@ from app.models import (
|
|||||||
)
|
)
|
||||||
from app.services.admin import get_admin_stats
|
from app.services.admin import get_admin_stats
|
||||||
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
|
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
|
||||||
from app.services.crawler import crawl_daily
|
from app.services.crawler import crawl_daily, refresh_upvotes
|
||||||
from app.services.pipeline import run_pipeline
|
from app.services.pipeline import run_pipeline
|
||||||
from app.services.scheduler import get_scheduler
|
from app.services.scheduler import get_scheduler
|
||||||
from app.services.summarizer import summarize_batch, summarize_single
|
from app.services.summarizer import summarize_batch, summarize_single
|
||||||
@@ -129,16 +129,20 @@ async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
|
|||||||
"""调度器运行状态(JSON)。"""
|
"""调度器运行状态(JSON)。"""
|
||||||
scheduler = get_scheduler()
|
scheduler = get_scheduler()
|
||||||
next_run = None
|
next_run = None
|
||||||
|
upvote_next_run = None
|
||||||
if scheduler:
|
if scheduler:
|
||||||
for job in scheduler.get_jobs():
|
for job in scheduler.get_jobs():
|
||||||
if job.id == "daily_pipeline":
|
if job.id == "daily_pipeline":
|
||||||
next_run = job.next_run_time
|
next_run = job.next_run_time
|
||||||
break
|
elif job.id == "upvote_refresh":
|
||||||
|
upvote_next_run = job.next_run_time
|
||||||
return {
|
return {
|
||||||
"enabled": scheduler is not None,
|
"enabled": scheduler is not None,
|
||||||
"schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
|
"schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
|
||||||
"timezone": settings.APP_TIMEZONE,
|
"timezone": settings.APP_TIMEZONE,
|
||||||
"next_run": next_run.isoformat() if next_run else None,
|
"next_run": next_run.isoformat() if next_run else None,
|
||||||
|
"upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
|
||||||
|
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -159,6 +163,19 @@ async def admin_trigger_pipeline(
|
|||||||
return {"status": "success", "message": "流水线执行完成"}
|
return {"status": "success", "message": "流水线执行完成"}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/refresh-upvotes")
|
||||||
|
async def admin_refresh_upvotes(
|
||||||
|
_admin: None = Depends(verify_admin),
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
days: int | None = Query(None, description="刷新最近 N 天,默认使用配置值"),
|
||||||
|
):
|
||||||
|
"""手动刷新最近 N 天论文的 upvotes。"""
|
||||||
|
result = await refresh_upvotes(db, days=days)
|
||||||
|
if result["status"] == "failed":
|
||||||
|
raise HTTPException(status_code=500, detail=result.get("error"))
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
# ── 请求模型 ──────────────────────────────────────────────────────────
|
# ── 请求模型 ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+8
-3
@@ -315,11 +315,16 @@ def _link_figures_with_images(
|
|||||||
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
|
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
|
||||||
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
|
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
|
||||||
|
|
||||||
# 提取的图片也按类型分流,按文件名排序
|
# 提取的图片按类型分流,按文件名中的编号排序
|
||||||
def _sort_key(name: str) -> tuple[int, int]:
|
def _sort_key(name: str) -> tuple[int, int]:
|
||||||
m = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
|
# 新格式:figure_1.png, table_1.png
|
||||||
|
m = re.search(r'(?:figure|table)_(\d+)', name)
|
||||||
if m:
|
if m:
|
||||||
return (int(m.group(1)), int(m.group(2)))
|
return (0, int(m.group(1)))
|
||||||
|
# 旧格式:page2_img1.png, page5_table1.png
|
||||||
|
m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
|
||||||
|
if m2:
|
||||||
|
return (int(m2.group(1)), int(m2.group(2)))
|
||||||
return (0, 0)
|
return (0, 0)
|
||||||
|
|
||||||
fig_images = sorted(
|
fig_images = sorted(
|
||||||
|
|||||||
@@ -106,4 +106,5 @@ def get_admin_stats(db: Session) -> dict:
|
|||||||
"next_run": next_run.isoformat() if next_run else None,
|
"next_run": next_run.isoformat() if next_run else None,
|
||||||
"recent_logs": recent_logs,
|
"recent_logs": recent_logs,
|
||||||
"active_locks": active_locks,
|
"active_locks": active_locks,
|
||||||
|
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
|
||||||
}
|
}
|
||||||
|
|||||||
+79
-1
@@ -16,7 +16,7 @@ from app.models import (
|
|||||||
SummaryState,
|
SummaryState,
|
||||||
SummaryStatus,
|
SummaryStatus,
|
||||||
)
|
)
|
||||||
from app.utils import make_http_client, utc_now
|
from app.utils import make_http_client, recent_date_strs, utc_now
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -205,3 +205,81 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
|
|||||||
log_entry.completed_at = utc_now()
|
log_entry.completed_at = utc_now()
|
||||||
db.commit()
|
db.commit()
|
||||||
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
|
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
|
||||||
|
|
||||||
|
|
||||||
|
async def refresh_upvotes(db: Session, days: int | None = None) -> dict:
|
||||||
|
"""重新抓取最近 N 天论文的 upvotes,不插入新论文。
|
||||||
|
|
||||||
|
遍历每天调用 HF API,对已有论文仅更新 upvotes 和 crawled_at。
|
||||||
|
"""
|
||||||
|
days = days or settings.UPVOTE_REFRESH_DAYS
|
||||||
|
date_strs = recent_date_strs(days)
|
||||||
|
now = utc_now()
|
||||||
|
|
||||||
|
log_entry = CrawlLog(
|
||||||
|
task="upvote_refresh",
|
||||||
|
status="running",
|
||||||
|
date=date_type.fromisoformat(date_strs[0]),
|
||||||
|
started_at=now,
|
||||||
|
)
|
||||||
|
db.add(log_entry)
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
total_updated = 0
|
||||||
|
errors: list[str] = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
for ds in date_strs:
|
||||||
|
try:
|
||||||
|
raw_papers = await fetch_daily(ds)
|
||||||
|
updated = _update_upvotes_only(db, raw_papers)
|
||||||
|
total_updated += updated
|
||||||
|
logger.info("Refreshed upvotes for %s: %d papers", ds, updated)
|
||||||
|
except Exception as exc:
|
||||||
|
msg = f"{ds}: {exc}"
|
||||||
|
errors.append(msg)
|
||||||
|
logger.warning("Failed to refresh upvotes for %s: %s", ds, exc)
|
||||||
|
|
||||||
|
log_entry.status = "success" if not errors else "partial"
|
||||||
|
log_entry.papers_found = total_updated
|
||||||
|
log_entry.details_json = f'{{"days": {days}, "errors": {len(errors)}}}'
|
||||||
|
log_entry.completed_at = utc_now()
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "success" if not errors else "partial",
|
||||||
|
"updated": total_updated,
|
||||||
|
"days": days,
|
||||||
|
"errors": errors or None,
|
||||||
|
}
|
||||||
|
except Exception as exc:
|
||||||
|
logger.exception("Upvote refresh failed")
|
||||||
|
log_entry.status = "failed"
|
||||||
|
log_entry.error = str(exc)
|
||||||
|
log_entry.completed_at = utc_now()
|
||||||
|
db.commit()
|
||||||
|
return {"status": "failed", "updated": total_updated, "error": str(exc)}
|
||||||
|
|
||||||
|
|
||||||
|
def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
|
||||||
|
"""对已有论文更新 upvotes,跳过不存在的新论文。"""
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
updated = 0
|
||||||
|
|
||||||
|
for item in papers_raw:
|
||||||
|
meta = _parse_paper(item)
|
||||||
|
arxiv_id = meta["arxiv_id"]
|
||||||
|
if not arxiv_id:
|
||||||
|
continue
|
||||||
|
|
||||||
|
existing = db.execute(
|
||||||
|
select(Paper).where(Paper.arxiv_id == arxiv_id)
|
||||||
|
).scalar_one_or_none()
|
||||||
|
|
||||||
|
if existing:
|
||||||
|
existing.upvotes = meta["upvotes"]
|
||||||
|
existing.crawled_at = now
|
||||||
|
updated += 1
|
||||||
|
|
||||||
|
db.commit()
|
||||||
|
return updated
|
||||||
|
|||||||
+291
-279
@@ -1,12 +1,12 @@
|
|||||||
"""PDF 图片与表格提取 — 从 PDF 中提取嵌入图片和表格截图。
|
"""PDF 图片与表格提取 — 基于 caption 定位的页面区域截图。
|
||||||
|
|
||||||
策略:
|
核心思路:学术论文排版极其规整,Figure caption 在图下方,Table caption 在表格上方。
|
||||||
1. 提取 PDF 中嵌入的图片(图表、插图等),按页面位置排序
|
因此反过来:先找 caption 文字 → 向上/向下截取页面区域 → 渲染为 PNG。
|
||||||
2. 检测表格区域,渲染为截图
|
|
||||||
3. 为每张图/表格提取附近的说明文字(caption),从中识别 Figure N / Table N
|
优势(相比提取嵌入位图):
|
||||||
4. 根据 caption 内容矫正类型:标注为 "Figure" 的表格区域 → 归为图片
|
- 复合图表不会被拆成碎片(整块截取)
|
||||||
5. 序号匹配兜底:第 N 张图 → Figure N(学术论文图表严格按顺序出现)
|
- 矢量图也能截取(页面渲染包含一切)
|
||||||
6. 保存 manifest.json 供后续与 AI 总结的 figures 字段匹配
|
- 不依赖 find_tables()(纯文本匹配 caption)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -21,124 +21,252 @@ from app.utils import TMP_DIR
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# 最小面积阈值(像素),小于此值的图片视为图标/装饰
|
# ── 截取区域参数 ───────────────────────────────────────────────────────
|
||||||
_MIN_AREA = 10_000 # ~100x100
|
|
||||||
_MIN_DIM = 80
|
|
||||||
|
|
||||||
# Caption 搜索区域 — Figure caption 在图下方,Table caption 在图上方
|
# Figure: caption 上方搜索图的范围(点)
|
||||||
_CAPTION_MARGIN = 10 # 贴边距离
|
_FIGURE_MAX_HEIGHT = 450 # 最大向上搜索范围
|
||||||
_CAPTION_MAX_DISTANCE = 250 # 最远搜索距离
|
_FIGURE_MIN_HEIGHT = 50 # 最小有效截图高度
|
||||||
_CAPTION_SIDE_PADDING = 40 # 左右扩展
|
_FIGURE_DEFAULT_HEIGHT = 280 # 上方未找到内容块时的默认图高度
|
||||||
|
|
||||||
# Figure/Table 标注正则
|
# Table: caption 下方搜索表格的范围
|
||||||
_FIGURE_CAPTION_RE = re.compile(
|
_TABLE_MAX_HEIGHT = 500 # 最大向下搜索范围
|
||||||
r'\b(?:Fig\.?|Figure)\s*(\d+)\b', re.IGNORECASE
|
_TABLE_MIN_HEIGHT = 30
|
||||||
|
|
||||||
|
# caption 左右扩展(双栏论文中 caption 可能比表格窄)
|
||||||
|
_REGION_SIDE_PADDING = 10
|
||||||
|
# 表格通常比 caption 文字宽,使用更大的水平扩展
|
||||||
|
_TABLE_SIDE_PADDING = 60
|
||||||
|
|
||||||
|
# 正文行距的 2 倍 ≈ 空白间隙阈值
|
||||||
|
_CONTENT_GAP_THRESHOLD = 30
|
||||||
|
|
||||||
|
|
||||||
|
# ── Caption 正则 ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# 要求以 Figure/Table 开头(避免匹配正文中的 "see Figure 3" 等)
|
||||||
|
_CAPTION_RE = re.compile(
|
||||||
|
r'^(?:Fig\.?|Figure)\s+(\d+)\s*[:\.]',
|
||||||
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
_TABLE_CAPTION_RE = re.compile(
|
_TABLE_CAPTION_RE = re.compile(
|
||||||
r'\bTable\s*(\d+)\b', re.IGNORECASE
|
r'^Table\s+(\d+)\s*[:\.]',
|
||||||
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _extract_caption_text(page, bbox, page_height: float, *,
|
def _find_captions(doc) -> list[dict]:
|
||||||
search_above: bool = False,
|
"""扫描整个文档,找到所有 Figure/Table caption 的位置和信息。"""
|
||||||
search_both: bool = False) -> str | None:
|
captions = []
|
||||||
"""从图片/表格附近区域提取 caption 文字。
|
|
||||||
|
|
||||||
search_above=True:搜索上方(Table caption 通常在上)
|
for page_num in range(len(doc)):
|
||||||
默认搜索下方(Figure caption 通常在下)
|
page = doc[page_num]
|
||||||
search_both=True:上下都搜,返回包含 Figure/Table 标注的那边
|
page_width = page.rect.width
|
||||||
"""
|
page_height = page.rect.height
|
||||||
import pymupdf
|
|
||||||
|
|
||||||
x0 = max(0, bbox.x0 - _CAPTION_SIDE_PADDING)
|
|
||||||
x1 = bbox.x1 + _CAPTION_SIDE_PADDING
|
|
||||||
|
|
||||||
def _search(y0: float, y1: float) -> str | None:
|
|
||||||
rect = pymupdf.Rect(x0, y0, x1, y1)
|
|
||||||
blocks = page.get_text("blocks")
|
blocks = page.get_text("blocks")
|
||||||
parts: list[str] = []
|
|
||||||
for block in blocks:
|
for block in blocks:
|
||||||
if len(block) < 5:
|
if len(block) < 5:
|
||||||
continue
|
continue
|
||||||
block_rect = pymupdf.Rect(block[:4])
|
text = str(block[4]).strip()
|
||||||
if block_rect.intersects(rect):
|
if not text:
|
||||||
text = str(block[4]).strip()
|
continue
|
||||||
if text:
|
|
||||||
parts.append(text)
|
|
||||||
if parts:
|
|
||||||
return " ".join(parts)
|
|
||||||
text = page.get_textbox(rect)
|
|
||||||
if text and len(text.strip()) >= 5:
|
|
||||||
return text.strip()
|
|
||||||
return None
|
|
||||||
|
|
||||||
if search_both:
|
bx0, by0, bx1, by1 = block[0], block[1], block[2], block[3]
|
||||||
# 上方
|
# 只取 block 第一行做匹配(避免 block 包含多段文字干扰)
|
||||||
above_y1 = max(0, bbox.y0 - _CAPTION_MARGIN)
|
first_line = text.split("\n")[0].strip()
|
||||||
above_y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE)
|
|
||||||
above = _search(above_y0, above_y1)
|
|
||||||
# 下方
|
|
||||||
below_y0 = bbox.y1 + _CAPTION_MARGIN
|
|
||||||
below_y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE)
|
|
||||||
below = _search(below_y0, below_y1)
|
|
||||||
|
|
||||||
# 优先返回包含 Figure/Table 标注的那边
|
m = _CAPTION_RE.match(first_line)
|
||||||
if above and (_FIGURE_CAPTION_RE.search(above) or _TABLE_CAPTION_RE.search(above)):
|
if m:
|
||||||
return above
|
captions.append({
|
||||||
if below and (_FIGURE_CAPTION_RE.search(below) or _TABLE_CAPTION_RE.search(below)):
|
"type": "figure",
|
||||||
return below
|
"num": int(m.group(1)),
|
||||||
# 否则返回更长的
|
"label": f"Figure {m.group(1)}",
|
||||||
if above and below:
|
"page_num": page_num,
|
||||||
return above if len(above) >= len(below) else below
|
"caption_y0": by0,
|
||||||
return above or below
|
"caption_y1": by1,
|
||||||
|
"caption_x0": bx0,
|
||||||
|
"caption_x1": bx1,
|
||||||
|
"caption_text": text,
|
||||||
|
"page_width": page_width,
|
||||||
|
"page_height": page_height,
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
if search_above:
|
m = _TABLE_CAPTION_RE.match(first_line)
|
||||||
y1 = max(0, bbox.y0 - _CAPTION_MARGIN)
|
if m:
|
||||||
y0 = max(0, bbox.y0 - _CAPTION_MAX_DISTANCE)
|
captions.append({
|
||||||
|
"type": "table",
|
||||||
|
"num": int(m.group(1)),
|
||||||
|
"label": f"Table {m.group(1)}",
|
||||||
|
"page_num": page_num,
|
||||||
|
"caption_y0": by0,
|
||||||
|
"caption_y1": by1,
|
||||||
|
"caption_x0": bx0,
|
||||||
|
"caption_x1": bx1,
|
||||||
|
"caption_text": text,
|
||||||
|
"page_width": page_width,
|
||||||
|
"page_height": page_height,
|
||||||
|
})
|
||||||
|
|
||||||
|
return captions
|
||||||
|
|
||||||
|
|
||||||
|
def _find_figure_top(page, caption: dict) -> float:
|
||||||
|
"""向上扫描页面,找到 Figure 的上边界。
|
||||||
|
|
||||||
|
策略:
|
||||||
|
1. 收集 caption 上方的所有内容块(文本 + 嵌入图片)
|
||||||
|
2. 找到最顶部的内容块作为图的上界
|
||||||
|
3. 检查内容块之间的大间隙(表示图从间隙下方开始)
|
||||||
|
4. 如果没找到任何内容块,使用默认图高度
|
||||||
|
|
||||||
|
注意:只扫描 text blocks 是不够的,因为 figure 本身是图片/矢量图,
|
||||||
|
不会被 get_text("blocks") 返回。必须同时用 get_image_info() 检测嵌入图片。
|
||||||
|
"""
|
||||||
|
caption_y = caption["caption_y0"]
|
||||||
|
cx0 = caption["caption_x0"] - _REGION_SIDE_PADDING
|
||||||
|
cx1 = caption["caption_x1"] + _REGION_SIDE_PADDING
|
||||||
|
|
||||||
|
# 收集 caption 上方、同列范围内的所有内容块
|
||||||
|
# 每个元素: (x0, y0, x1, y1)
|
||||||
|
above_blocks: list[tuple[float, float, float, float]] = []
|
||||||
|
|
||||||
|
# ── 1. 文本块 ──
|
||||||
|
for b in page.get_text("blocks"):
|
||||||
|
if len(b) < 5:
|
||||||
|
continue
|
||||||
|
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
|
||||||
|
if by1 <= caption_y and by1 > caption_y - _FIGURE_MAX_HEIGHT:
|
||||||
|
if bx1 > cx0 and bx0 < cx1:
|
||||||
|
above_blocks.append((bx0, by0, bx1, by1))
|
||||||
|
|
||||||
|
# ── 2. 嵌入图片块 — 关键!figure 本身是图片,不是文本 ──
|
||||||
|
for img_info in page.get_image_info():
|
||||||
|
bbox = img_info.get("bbox")
|
||||||
|
if bbox is None:
|
||||||
|
continue
|
||||||
|
# Rect 对象: x0, y0, x1, y1
|
||||||
|
ix0, iy0, ix1, iy1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
|
||||||
|
if iy1 <= caption_y and iy1 > caption_y - _FIGURE_MAX_HEIGHT:
|
||||||
|
if ix1 > cx0 and ix0 < cx1:
|
||||||
|
above_blocks.append((ix0, iy0, ix1, iy1))
|
||||||
|
|
||||||
|
# ── 没有内容块 → 用默认高度(可能是纯矢量图,如 TikZ/matplotlib PDF) ──
|
||||||
|
if not above_blocks:
|
||||||
|
return max(0, caption_y - _FIGURE_DEFAULT_HEIGHT)
|
||||||
|
|
||||||
|
# ── 找到内容区域的上边界 ──
|
||||||
|
# 按 y 从下到上排序(离 caption 最近的在前)
|
||||||
|
above_blocks.sort(key=lambda b: b[1], reverse=True)
|
||||||
|
|
||||||
|
# 从 caption 向上扫描,找到第一个大间隙以上作为图的上界
|
||||||
|
# 典型结构: [正文段落] ...空白... [图内容(图片/矢量)] [caption]
|
||||||
|
# 空白间隙 ≈ 图的上边界
|
||||||
|
figure_top = above_blocks[-1][1] # 最上面的块顶部(默认兜底)
|
||||||
|
|
||||||
|
prev_bottom = caption_y # 从 caption 顶部开始向上
|
||||||
|
for b in above_blocks:
|
||||||
|
# b = (x0, y0, x1, y1), 我们关心 y 范围
|
||||||
|
gap = prev_bottom - b[3] # b[3] = by1 = 当前块底部
|
||||||
|
if gap > _CONTENT_GAP_THRESHOLD:
|
||||||
|
# 大间隙 → 图上边界在间隙下方
|
||||||
|
figure_top = prev_bottom - 5
|
||||||
|
break
|
||||||
|
# 小间隙 → 当前块属于图的一部分(或紧挨着图),继续向上
|
||||||
|
prev_bottom = b[1] # b[1] = by0 = 当前块顶部
|
||||||
else:
|
else:
|
||||||
y0 = bbox.y1 + _CAPTION_MARGIN
|
# 所有块都紧挨着 → 图从最上面块的顶部开始
|
||||||
y1 = min(page_height, bbox.y1 + _CAPTION_MAX_DISTANCE)
|
figure_top = above_blocks[-1][1]
|
||||||
|
|
||||||
return _search(y0, y1)
|
# 限制最大高度
|
||||||
|
if caption_y - figure_top > _FIGURE_MAX_HEIGHT:
|
||||||
|
figure_top = caption_y - _FIGURE_MAX_HEIGHT
|
||||||
|
|
||||||
|
# 不低于页面顶部
|
||||||
|
figure_top = max(0, figure_top)
|
||||||
|
|
||||||
|
return figure_top
|
||||||
|
|
||||||
|
|
||||||
def _identify_label(caption_text: str | None) -> str | None:
|
def _find_table_region(page, caption: dict) -> tuple[float, float, float, float]:
|
||||||
"""从 caption 文字中识别 Figure N / Table N 编号。"""
|
"""向下扫描页面,找到 Table 的下边界和水平范围。
|
||||||
if not caption_text:
|
|
||||||
return None
|
|
||||||
|
|
||||||
m = _FIGURE_CAPTION_RE.search(caption_text)
|
返回: (x0, bottom, x1) — 裁剪区域的左、下、右边界。
|
||||||
if m:
|
上边界由调用方根据 caption 位置确定。
|
||||||
return f"Figure {m.group(1)}"
|
|
||||||
|
|
||||||
m = _TABLE_CAPTION_RE.search(caption_text)
|
策略:
|
||||||
if m:
|
1. 收集 caption 下方的文本块(表格内容是文本)
|
||||||
return f"Table {m.group(1)}"
|
2. 找到连续内容区域的底部(遇到大间隙时停止)
|
||||||
|
3. 同时检测表格内容的水平范围(表格通常比 caption 宽)
|
||||||
|
"""
|
||||||
|
blocks = page.get_text("blocks")
|
||||||
|
caption_y = caption["caption_y1"] # caption 底部作为扫描起点
|
||||||
|
caption_x0 = caption["caption_x0"]
|
||||||
|
caption_x1 = caption["caption_x1"]
|
||||||
|
page_height = caption["page_height"]
|
||||||
|
page_width = caption["page_width"]
|
||||||
|
|
||||||
return None
|
# 先用较宽的范围收集可能的表格内容块
|
||||||
|
search_x0 = max(0, caption_x0 - _TABLE_SIDE_PADDING)
|
||||||
|
search_x1 = min(page_width, caption_x1 + _TABLE_SIDE_PADDING)
|
||||||
|
|
||||||
|
below_blocks: list[tuple[float, float, float, float]] = []
|
||||||
|
for b in blocks:
|
||||||
|
if len(b) < 5:
|
||||||
|
continue
|
||||||
|
bx0, by0, bx1, by1 = b[0], b[1], b[2], b[3]
|
||||||
|
if by0 > caption_y and by0 < caption_y + _TABLE_MAX_HEIGHT:
|
||||||
|
if bx1 > search_x0 and bx0 < search_x1:
|
||||||
|
below_blocks.append((bx0, by0, bx1, by1))
|
||||||
|
|
||||||
def _is_figure_caption(caption_text: str | None) -> bool:
|
if not below_blocks:
|
||||||
"""判断 caption 是否标注为 Figure(用于矫正 find_tables 的误判)。"""
|
# 没有内容 → 使用默认高度和 caption 宽度
|
||||||
if not caption_text:
|
return (
|
||||||
return False
|
max(0, caption_x0 - _REGION_SIDE_PADDING),
|
||||||
return bool(_FIGURE_CAPTION_RE.search(caption_text))
|
min(page_height, caption_y + _TABLE_MIN_HEIGHT),
|
||||||
|
min(page_width, caption_x1 + _REGION_SIDE_PADDING),
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── 找到连续内容区域的底部 ──
|
||||||
|
below_blocks.sort(key=lambda b: b[1]) # 按 y 升序
|
||||||
|
|
||||||
|
prev_y = caption_y
|
||||||
|
bottom = below_blocks[-1][3] + 5 # 最后一块的底部 + margin
|
||||||
|
|
||||||
|
for b in below_blocks:
|
||||||
|
gap = b[1] - prev_y # b[1] = by0
|
||||||
|
if gap > _CONTENT_GAP_THRESHOLD:
|
||||||
|
bottom = prev_y + 5
|
||||||
|
break
|
||||||
|
prev_y = b[3] # b[3] = by1
|
||||||
|
|
||||||
|
# 限制最大高度
|
||||||
|
if bottom - caption_y > _TABLE_MAX_HEIGHT:
|
||||||
|
bottom = caption_y + _TABLE_MAX_HEIGHT
|
||||||
|
|
||||||
|
# ── 检测表格内容的水平范围 ──
|
||||||
|
# 表格通常比 caption 宽,用内容块的实际宽度
|
||||||
|
content_x0 = min(caption_x0, min(b[0] for b in below_blocks))
|
||||||
|
content_x1 = max(caption_x1, max(b[2] for b in below_blocks))
|
||||||
|
|
||||||
|
# 添加边距,但不超出页面
|
||||||
|
x0 = max(0, content_x0 - _REGION_SIDE_PADDING)
|
||||||
|
x1 = min(page_width, content_x1 + _REGION_SIDE_PADDING)
|
||||||
|
|
||||||
|
return (x0, bottom, x1)
|
||||||
|
|
||||||
|
|
||||||
def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
||||||
"""从 PDF 提取嵌入图片和表格截图,生成 manifest。
|
"""从 PDF 提取 Figure/Table 截图,生成 manifest。
|
||||||
|
|
||||||
匹配策略:
|
策略:找 caption → 定位区域 → 渲染页面截图。
|
||||||
1. 提取图片→提取 caption 文字→从中识别 Figure/Table 编号
|
|
||||||
2. 表格区域若 caption 标注为 "Figure",则重分类为图片
|
|
||||||
3. 未能从 caption 识别编号的,按(页码, 纵向位置)排序后用序号匹配兜底
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
arxiv_id: 论文 ID
|
arxiv_id: 论文 ID
|
||||||
pdf_path: PDF 路径,默认 data/tmp/{arxiv_id}/paper.pdf
|
pdf_path: PDF 路径,默认 data/tmp/{arxiv_id}/paper.pdf
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
提取的图片+表格数量
|
提取的图片数量
|
||||||
"""
|
"""
|
||||||
import pymupdf
|
import pymupdf
|
||||||
|
|
||||||
@@ -153,216 +281,100 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
|
|||||||
images_dest.mkdir(parents=True, exist_ok=True)
|
images_dest.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
doc = pymupdf.open(str(pdf_path))
|
doc = pymupdf.open(str(pdf_path))
|
||||||
|
captions = _find_captions(doc)
|
||||||
|
|
||||||
|
if not captions:
|
||||||
|
logger.info("No Figure/Table captions found in PDF for %s", arxiv_id)
|
||||||
|
doc.close()
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# 去重:同一页同一 label 可能匹配到多个 block(如正文引用 "Figure 7")
|
||||||
|
# 保留每个 (type, num) 的第一个匹配(即真正的 caption)
|
||||||
|
seen_labels: dict[str, dict] = {}
|
||||||
|
for cap in captions:
|
||||||
|
key = cap["label"]
|
||||||
|
if key not in seen_labels:
|
||||||
|
seen_labels[key] = cap
|
||||||
|
|
||||||
|
unique_captions = list(seen_labels.values())
|
||||||
extracted = 0
|
extracted = 0
|
||||||
seen_hashes: set[int] = set()
|
manifest: dict[str, dict] = {}
|
||||||
|
|
||||||
# ── 第一遍:收集所有图片和表格 ──
|
zoom = 2 # 2x 渲染,保证清晰度
|
||||||
image_items: list[dict] = []
|
|
||||||
table_items: list[dict] = []
|
|
||||||
|
|
||||||
for page_num in range(len(doc)):
|
for cap in unique_captions:
|
||||||
page = doc[page_num]
|
page = doc[cap["page_num"]]
|
||||||
page_height = page.rect.height
|
pw = cap["page_width"]
|
||||||
|
ph = cap["page_height"]
|
||||||
|
|
||||||
# 1. 提取嵌入图片
|
if cap["type"] == "figure":
|
||||||
image_list = page.get_images(full=True)
|
# Figure: caption 上方是图 → 向上找图的上边界
|
||||||
for img_index, img_info in enumerate(image_list):
|
top = _find_figure_top(page, cap)
|
||||||
xref = img_info[0]
|
bottom = cap["caption_y1"] + 5 # 包含 caption
|
||||||
try:
|
# 水平范围:caption 宽度 + 边距(图和 caption 通常等宽)
|
||||||
pix = pymupdf.Pixmap(doc, xref)
|
# 但也要考虑图内容的实际宽度
|
||||||
except Exception:
|
x0 = max(0, cap["caption_x0"] - _REGION_SIDE_PADDING)
|
||||||
|
x1 = min(pw, cap["caption_x1"] + _REGION_SIDE_PADDING)
|
||||||
|
|
||||||
|
height = bottom - top
|
||||||
|
if height < _FIGURE_MIN_HEIGHT:
|
||||||
|
logger.debug(
|
||||||
|
"Figure %s too small (%.0fpt), skipping", cap["label"], height
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if pix.width < _MIN_DIM or pix.height < _MIN_DIM:
|
else:
|
||||||
continue
|
# Table: caption 下方是表格 → 向下找表格的下边界和水平范围
|
||||||
if pix.width * pix.height < _MIN_AREA:
|
x0, bottom, x1 = _find_table_region(page, cap)
|
||||||
|
top = max(0, cap["caption_y0"] - 3) # 包含 caption,上边留少许 margin
|
||||||
|
|
||||||
|
height = bottom - top
|
||||||
|
if height < _TABLE_MIN_HEIGHT:
|
||||||
|
logger.debug(
|
||||||
|
"Table %s too small (%.0fpt), skipping", cap["label"], height
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
img_hash = hash(pix.tobytes()[:1024])
|
# 渲染截取
|
||||||
if img_hash in seen_hashes:
|
clip = pymupdf.Rect(x0, top, x1, bottom)
|
||||||
continue
|
mat = pymupdf.Matrix(zoom, zoom)
|
||||||
seen_hashes.add(img_hash)
|
|
||||||
|
|
||||||
img_rects = page.get_image_rects(xref)
|
|
||||||
if not img_rects:
|
|
||||||
continue
|
|
||||||
bbox = img_rects[0]
|
|
||||||
|
|
||||||
if pix.n >= 5:
|
|
||||||
try:
|
|
||||||
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
filename = f"page{page_num + 1}_img{img_index + 1}.png"
|
|
||||||
pix.save(str(images_dest / filename))
|
|
||||||
extracted += 1
|
|
||||||
|
|
||||||
caption_text = _extract_caption_text(page, bbox, page_height)
|
|
||||||
label = _identify_label(caption_text)
|
|
||||||
|
|
||||||
image_items.append({
|
|
||||||
"filename": filename,
|
|
||||||
"page": page_num + 1,
|
|
||||||
"y0": bbox.y0,
|
|
||||||
"caption_text": caption_text,
|
|
||||||
"label": label,
|
|
||||||
})
|
|
||||||
|
|
||||||
# 2. 提取表格截图(同时搜索上方 caption,Table 标题通常在表格上方)
|
|
||||||
try:
|
try:
|
||||||
tables = page.find_tables()
|
pix = page.get_pixmap(matrix=mat, clip=clip)
|
||||||
except Exception:
|
except Exception:
|
||||||
tables = None
|
logger.debug("Failed to render %s region for %s", cap["label"], arxiv_id)
|
||||||
|
continue
|
||||||
|
|
||||||
if tables and tables.tables:
|
filename = f"{cap['label'].replace(' ', '_').lower()}.png"
|
||||||
for table_index, table in enumerate(tables.tables):
|
pix.save(str(images_dest / filename))
|
||||||
bbox = table.bbox
|
extracted += 1
|
||||||
if not bbox:
|
|
||||||
continue
|
|
||||||
|
|
||||||
margin = 5
|
cap_preview = cap["caption_text"][:200] if cap["caption_text"] else ""
|
||||||
if hasattr(bbox, 'x0'):
|
manifest[filename] = {
|
||||||
x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
|
"page": cap["page_num"] + 1,
|
||||||
table_rect = bbox
|
"type": cap["type"],
|
||||||
else:
|
"label": cap["label"],
|
||||||
x0, y0, x1, y1 = bbox
|
"caption_text": cap_preview,
|
||||||
table_rect = pymupdf.Rect(x0, y0, x1, y1)
|
"figures" if cap["type"] == "figure" else "tables": [cap["label"]],
|
||||||
clip_rect = pymupdf.Rect(
|
}
|
||||||
x0 - margin, y0 - margin, x1 + margin, y1 + margin
|
logger.debug(
|
||||||
)
|
"Rendered %s: page %d, region (%.0f,%.0f)-(%.0f,%.0f) h=%.0fpt → %s",
|
||||||
|
cap["label"], cap["page_num"] + 1,
|
||||||
zoom = 2
|
x0, top, x1, bottom, height, filename,
|
||||||
mat = pymupdf.Matrix(zoom, zoom)
|
)
|
||||||
try:
|
|
||||||
pix = page.get_pixmap(matrix=mat, clip=clip_rect)
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if pix.width < _MIN_DIM * 2 or pix.height < 30 * 2:
|
|
||||||
continue
|
|
||||||
|
|
||||||
filename = f"page{page_num + 1}_table{table_index + 1}.png"
|
|
||||||
pix.save(str(images_dest / filename))
|
|
||||||
extracted += 1
|
|
||||||
|
|
||||||
# Table caption 上下都搜(学术论文惯例:Table 标题在上方,但实际排版各异)
|
|
||||||
caption_text = _extract_caption_text(
|
|
||||||
page, table_rect, page_height, search_both=True,
|
|
||||||
)
|
|
||||||
label = _identify_label(caption_text)
|
|
||||||
|
|
||||||
item = {
|
|
||||||
"filename": filename,
|
|
||||||
"page": page_num + 1,
|
|
||||||
"y0": y0,
|
|
||||||
"caption_text": caption_text,
|
|
||||||
"label": label,
|
|
||||||
}
|
|
||||||
|
|
||||||
# 关键:caption 标注为 Figure → 重分类为图片
|
|
||||||
if _is_figure_caption(caption_text):
|
|
||||||
image_items.append(item)
|
|
||||||
else:
|
|
||||||
table_items.append(item)
|
|
||||||
|
|
||||||
doc.close()
|
doc.close()
|
||||||
|
|
||||||
# ── 第二遍:矫正 find_tables 的误判 ──
|
|
||||||
# 如果表格与同页的图片高度重叠(复合图表的子区域),且 caption 不含 "Table",
|
|
||||||
# 则重分类为图片,归入邻近图片的 label
|
|
||||||
for t_item in table_items[:]:
|
|
||||||
t_page = t_item["page"]
|
|
||||||
t_y0 = t_item["y0"]
|
|
||||||
same_page_images = [i for i in image_items if i["page"] == t_page]
|
|
||||||
if not same_page_images:
|
|
||||||
continue
|
|
||||||
# 检查是否有重叠的图片
|
|
||||||
nearby = [
|
|
||||||
i for i in same_page_images
|
|
||||||
if abs(i["y0"] - t_y0) < 50
|
|
||||||
]
|
|
||||||
if nearby and not (t_item["caption_text"] and _TABLE_CAPTION_RE.search(t_item["caption_text"])):
|
|
||||||
# 重分类为图片,继承邻近图片的 label
|
|
||||||
neighbor_label = nearby[0].get("label")
|
|
||||||
t_item["label"] = neighbor_label
|
|
||||||
image_items.append(t_item)
|
|
||||||
table_items.remove(t_item)
|
|
||||||
|
|
||||||
# ── 第三遍:按 (page, y0) 排序 → 序号匹配兜底 ──
|
|
||||||
image_items.sort(key=lambda it: (it["page"], it["y0"]))
|
|
||||||
table_items.sort(key=lambda it: (it["page"], it["y0"]))
|
|
||||||
|
|
||||||
# 统计已通过 caption 确认的 Figure/Table 编号,避免序号重复分配
|
|
||||||
used_figure_nums: set[int] = set()
|
|
||||||
used_table_nums: set[int] = set()
|
|
||||||
for item in image_items:
|
|
||||||
if item["label"]:
|
|
||||||
m = _FIGURE_CAPTION_RE.search(item["label"])
|
|
||||||
if m:
|
|
||||||
used_figure_nums.add(int(m.group(1)))
|
|
||||||
for item in table_items:
|
|
||||||
if item["label"]:
|
|
||||||
m = _TABLE_CAPTION_RE.search(item["label"])
|
|
||||||
if m:
|
|
||||||
used_table_nums.add(int(m.group(1)))
|
|
||||||
|
|
||||||
# 为未识别编号的图片分配序号(跳过已占用的编号)
|
|
||||||
next_fig = 1
|
|
||||||
for item in image_items:
|
|
||||||
if item["label"] is None:
|
|
||||||
while next_fig in used_figure_nums:
|
|
||||||
next_fig += 1
|
|
||||||
item["label"] = f"Figure {next_fig}"
|
|
||||||
used_figure_nums.add(next_fig)
|
|
||||||
|
|
||||||
next_tbl = 1
|
|
||||||
for item in table_items:
|
|
||||||
if item["label"] is None:
|
|
||||||
while next_tbl in used_table_nums:
|
|
||||||
next_tbl += 1
|
|
||||||
item["label"] = f"Table {next_tbl}"
|
|
||||||
used_table_nums.add(next_tbl)
|
|
||||||
|
|
||||||
# ── 第三遍:构建 manifest ──
|
|
||||||
manifest: dict[str, dict] = {}
|
|
||||||
for item in image_items:
|
|
||||||
manifest[item["filename"]] = {
|
|
||||||
"page": item["page"],
|
|
||||||
"type": "image",
|
|
||||||
"label": item["label"],
|
|
||||||
"caption_text": item.get("caption_text"),
|
|
||||||
"figures": [item["label"]],
|
|
||||||
}
|
|
||||||
for item in table_items:
|
|
||||||
manifest[item["filename"]] = {
|
|
||||||
"page": item["page"],
|
|
||||||
"type": "table",
|
|
||||||
"label": item["label"],
|
|
||||||
"caption_text": item.get("caption_text"),
|
|
||||||
"tables": [item["label"]],
|
|
||||||
}
|
|
||||||
|
|
||||||
# 保存 manifest
|
# 保存 manifest
|
||||||
manifest_path = images_dest / "manifest.json"
|
manifest_path = images_dest / "manifest.json"
|
||||||
manifest_path.write_text(
|
manifest_path.write_text(
|
||||||
json.dumps(manifest, ensure_ascii=False, indent=2)
|
json.dumps(manifest, ensure_ascii=False, indent=2)
|
||||||
)
|
)
|
||||||
|
|
||||||
captioned = sum(
|
|
||||||
1 for it in image_items + table_items if it["caption_text"]
|
|
||||||
)
|
|
||||||
label_matched = sum(
|
|
||||||
1 for it in image_items + table_items
|
|
||||||
if it["caption_text"] and _identify_label(it["caption_text"])
|
|
||||||
)
|
|
||||||
|
|
||||||
if extracted > 0:
|
if extracted > 0:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Extracted %d items from PDF for %s "
|
"Extracted %d figure/table screenshots from PDF for %s "
|
||||||
"(%d images, %d tables, %d with captions, %d label-matched)",
|
"(from %d captions found, %d unique)",
|
||||||
extracted, arxiv_id,
|
extracted, arxiv_id, len(captions), len(unique_captions),
|
||||||
len(image_items), len(table_items), captioned, label_matched,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return extracted
|
return extracted
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from zoneinfo import ZoneInfo
|
|||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.database import SessionLocal
|
from app.database import SessionLocal
|
||||||
from app.services.pipeline import run_pipeline
|
from app.services.pipeline import run_pipeline
|
||||||
|
from app.services.crawler import refresh_upvotes
|
||||||
from app.utils import today_str
|
from app.utils import today_str
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -66,6 +67,22 @@ def start_scheduler() -> AsyncIOScheduler | None:
|
|||||||
misfire_grace_time=3600, # 允许迟到 1 小时内补执行
|
misfire_grace_time=3600, # 允许迟到 1 小时内补执行
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Upvote refresh: runs every day 30 minutes after the pipeline job and
# refreshes the upvote counts of recent papers.
# The +30 offset is carried into the hour field: CronTrigger only accepts
# minute values 0-59, so passing SCHEDULE_MINUTE + 30 directly raises
# ValueError whenever SCHEDULE_MINUTE >= 30. The hour wraps past midnight.
upvote_hour_carry, upvote_minute = divmod(settings.SCHEDULE_MINUTE + 30, 60)
upvote_trigger = CronTrigger(
    hour=(settings.SCHEDULE_HOUR + upvote_hour_carry) % 24,
    minute=upvote_minute,
    timezone=tz,
)
scheduler.add_job(
    _upvote_refresh,
    trigger=upvote_trigger,
    id="upvote_refresh",
    name="upvote_refresh",
    replace_existing=True,
    max_instances=1,
    misfire_grace_time=3600,
)
|
||||||
|
|
||||||
scheduler.start()
|
scheduler.start()
|
||||||
_scheduler = scheduler
|
_scheduler = scheduler
|
||||||
logger.info(
|
logger.info(
|
||||||
@@ -102,3 +119,19 @@ async def _daily_pipeline() -> None:
|
|||||||
logger.exception("Unexpected error in daily pipeline")
|
logger.exception("Unexpected error in daily pipeline")
|
||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def _upvote_refresh() -> None:
    """Scheduled job: refresh upvote counts via ``refresh_upvotes``.

    Opens its own database session, logs the status and updated count taken
    from the returned mapping, and always closes the session. Any exception
    is logged with its traceback instead of being propagated.
    """
    session: Session = SessionLocal()
    try:
        outcome = await refresh_upvotes(session)
        status = outcome.get("status")
        updated_count = outcome.get("updated", 0)
        logger.info(
            "Upvote refresh completed: status=%s updated=%d",
            status,
            updated_count,
        )
    except Exception:
        logger.exception("Unexpected error in upvote refresh")
    finally:
        session.close()
|
||||||
|
|||||||
@@ -138,20 +138,7 @@ a:hover {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* ── Date Quick Nav ─────────────────────────────────────────────── */
|
/* ── Date Quick Nav ─────────────────────────────────────────────── */
|
||||||
.date-quick-nav {
|
|
||||||
margin-top: 32px;
|
|
||||||
padding-top: 16px;
|
|
||||||
border-top: 1px solid var(--border);
|
|
||||||
font-size: 0.85rem;
|
|
||||||
color: var(--ink-light);
|
|
||||||
display: flex;
|
|
||||||
align-items: center;
|
|
||||||
gap: 8px;
|
|
||||||
flex-wrap: wrap;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ── Chips (shared) ─────────────────────────────────────────────── */
|
/* ── Chips (shared) ─────────────────────────────────────────────── */
|
||||||
.date-chip,
|
|
||||||
.tag-chip,
|
.tag-chip,
|
||||||
.filter-chip {
|
.filter-chip {
|
||||||
display: inline-block;
|
display: inline-block;
|
||||||
@@ -162,14 +149,12 @@ a:hover {
|
|||||||
font-size: 0.8rem;
|
font-size: 0.8rem;
|
||||||
color: var(--ink-light);
|
color: var(--ink-light);
|
||||||
}
|
}
|
||||||
.date-chip:hover,
|
|
||||||
.tag-chip:hover,
|
.tag-chip:hover,
|
||||||
.filter-chip:hover {
|
.filter-chip:hover {
|
||||||
border-color: var(--accent);
|
border-color: var(--accent);
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
text-decoration: none;
|
text-decoration: none;
|
||||||
}
|
}
|
||||||
.date-chip.active,
|
|
||||||
.tag-chip.active,
|
.tag-chip.active,
|
||||||
.filter-chip.active {
|
.filter-chip.active {
|
||||||
background: var(--accent);
|
background: var(--accent);
|
||||||
@@ -352,6 +337,11 @@ a:hover {
|
|||||||
margin-bottom: 12px;
|
margin-bottom: 12px;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.detail-upvote-time {
|
||||||
|
font-size: 0.78rem;
|
||||||
|
color: var(--border);
|
||||||
|
}
|
||||||
|
|
||||||
.detail-tags {
|
.detail-tags {
|
||||||
margin-bottom: 12px;
|
margin-bottom: 12px;
|
||||||
display: flex;
|
display: flex;
|
||||||
|
|||||||
@@ -33,6 +33,7 @@
|
|||||||
<button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
|
<button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
|
||||||
<button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
|
<button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
|
||||||
<button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
|
<button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
|
||||||
|
<button class="admin-action-btn" onclick="refreshUpvotes()">👍 刷新投票</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="admin-info-grid">
|
<div class="admin-info-grid">
|
||||||
@@ -59,6 +60,10 @@
|
|||||||
<span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span>
|
<span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span>
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
<div class="info-row">
|
||||||
|
<span class="info-label">投票刷新</span>
|
||||||
|
<span class="info-value">每日自动刷新最近 {{ stats.upvote_refresh_days | default(7) }} 天</span>
|
||||||
|
</div>
|
||||||
{% if stats.active_locks %}
|
{% if stats.active_locks %}
|
||||||
<div class="info-row">
|
<div class="info-row">
|
||||||
<span class="info-label">活跃任务</span>
|
<span class="info-label">活跃任务</span>
|
||||||
@@ -181,5 +186,12 @@
|
|||||||
.then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); })
|
.then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); })
|
||||||
.catch(err => showToast("❌ 请求失败"));
|
.catch(err => showToast("❌ 请求失败"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ask the server to refresh paper upvotes and report the outcome in a toast.
function refreshUpvotes() {
  const requestOptions = { method: "POST", headers: { "Content-Type": "application/json" } };

  // On a 303 or 401 response go to the admin login page; otherwise parse the JSON body.
  const parseOrRedirect = (response) => {
    if (response.status === 303 || response.status === 401) {
      window.location.href = "/admin/login";
      return;
    }
    return response.json();
  };

  // `payload` is undefined after a login redirect, in which case no toast is shown.
  const report = (payload) => {
    if (!payload) return;
    const message = payload.error
      ? "❌ " + payload.error.substring(0, 200)
      : `✅ 已刷新 ${payload.updated || 0} 篇论文投票`;
    showToast(message);
  };

  fetch("/admin/refresh-upvotes", requestOptions)
    .then(parseOrRedirect)
    .then(report)
    .catch(() => showToast("❌ 请求失败"));
}
|
||||||
</script>
|
</script>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|||||||
@@ -22,6 +22,9 @@ endblock %} {% block content %}
|
|||||||
>📅 {{ paper.published_at or paper.paper_date }}</span
|
>📅 {{ paper.published_at or paper.paper_date }}</span
|
||||||
>
|
>
|
||||||
<span class="detail-upvotes">👍 {{ paper.upvotes }}</span>
|
<span class="detail-upvotes">👍 {{ paper.upvotes }}</span>
|
||||||
|
{% if paper.crawled_at %}
|
||||||
|
<span class="detail-upvote-time" title="投票数据更新时间">{{ paper.crawled_at.strftime('%m-%d %H:%M') }}</span>
|
||||||
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{# 标签 #} {% if paper.tags %}
|
{# 标签 #} {% if paper.tags %}
|
||||||
|
|||||||
@@ -23,16 +23,6 @@ endblock %} {% block content %}
|
|||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
<div class="date-quick-nav">
|
|
||||||
<span>有数据的日期:</span>
|
|
||||||
{% for d in available_dates[:10] %}
|
|
||||||
<a
|
|
||||||
href="/day/{{ d }}"
|
|
||||||
class="date-chip {% if d == current_date %}active{% endif %}"
|
|
||||||
>{{ d }}</a
|
|
||||||
>
|
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
||||||
{% block scripts %}
|
{% block scripts %}
|
||||||
|
|||||||
@@ -20,7 +20,7 @@
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
</a>
|
</a>
|
||||||
</h2>
|
</h2>
|
||||||
<span class="paper-upvotes">👍 {{ paper.upvotes }}</span>
|
<span class="paper-upvotes" title="数据更新于 {{ paper.crawled_at.strftime('%m-%d %H:%M') if paper.crawled_at else '' }}">👍 {{ paper.upvotes }}</span>
|
||||||
{% if variant == 'search' and distances and paper.arxiv_id in distances %}
|
{% if variant == 'search' and distances and paper.arxiv_id in distances %}
|
||||||
<span class="similarity-score" title="语义相似度距离">
|
<span class="similarity-score" title="语义相似度距离">
|
||||||
🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}
|
🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}
|
||||||
|
|||||||
@@ -57,6 +57,13 @@ def yesterday_str() -> str:
|
|||||||
return yesterday.isoformat()
|
return yesterday.isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def recent_date_strs(n: int) -> list[str]:
    """Return ISO date strings for the most recent ``n`` days, newest first.

    The list starts with today's date in ``settings.APP_TIMEZONE`` and steps
    back one day per entry; it is empty when ``n`` is not positive.
    """
    current_day = datetime.now(ZoneInfo(settings.APP_TIMEZONE)).date()
    one_day = timedelta(days=1)
    return [(current_day - one_day * offset).isoformat() for offset in range(n)]
|
||||||
|
|
||||||
|
|
||||||
def latest_paper_date(db) -> str:
|
def latest_paper_date(db) -> str:
|
||||||
"""查询数据库中最新的 paper_date,无数据时回退到 today_str()。"""
|
"""查询数据库中最新的 paper_date,无数据时回退到 today_str()。"""
|
||||||
from sqlalchemy import func, select
|
from sqlalchemy import func, select
|
||||||
|
|||||||
Reference in New Issue
Block a user