feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI

- PDF extractor: rewrite from embedded bitmap extraction to caption-based
  page region screenshots. Finds Figure/Table captions via regex,截取上方/下方
  page region, handles compound figures and vector graphics.
- Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for recent
  N days without inserting new papers. Scheduler runs daily 30min after pipeline.
- Admin: add /admin/refresh-upvotes endpoint and dashboard button.
- UI: remove date quick nav, show upvote update time on detail/card pages,
  clean up CSS date-chip styles.
- Utils: add recent_date_strs() helper.
This commit is contained in:
2026-06-09 18:01:01 +08:00
parent b72b5a31bb
commit 1fc6303e09
13 changed files with 460 additions and 311 deletions
+12
View File
@@ -33,6 +33,7 @@
<button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
<button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
<button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
<button class="admin-action-btn" onclick="refreshUpvotes()">👍 刷新投票</button>
</div>
<div class="admin-info-grid">
@@ -59,6 +60,10 @@
<span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span>
</div>
{% endif %}
<div class="info-row">
<span class="info-label">投票刷新</span>
<span class="info-value">每日自动刷新最近 {{ stats.upvote_refresh_days | default(7) }} 天</span>
</div>
{% if stats.active_locks %}
<div class="info-row">
<span class="info-label">活跃任务</span>
@@ -181,5 +186,12 @@
.then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : "✅ 流水线已触发"); })
.catch(err => showToast("❌ 请求失败"));
}
function refreshUpvotes() {
fetch("/admin/refresh-upvotes", { method: "POST", headers: { "Content-Type": "application/json" } })
.then(r => { if (r.status===303||r.status===401) { window.location.href="/admin/login"; return; } return r.json(); })
.then(data => { if (data) showToast(data.error ? "❌ " + data.error.substring(0,200) : `✅ 已刷新 ${data.updated || 0} 篇论文投票`); })
.catch(err => showToast("❌ 请求失败"));
}
</script>
{% endblock %}
+3
View File
@@ -22,6 +22,9 @@ endblock %} {% block content %}
>📅 {{ paper.published_at or paper.paper_date }}</span
>
<span class="detail-upvotes">👍 {{ paper.upvotes }}</span>
{% if paper.crawled_at %}
<span class="detail-upvote-time" title="投票数据更新时间">{{ paper.crawled_at.strftime('%m-%d %H:%M') }}</span>
{% endif %}
</div>
{# 标签 #} {% if paper.tags %}
-10
View File
@@ -23,16 +23,6 @@ endblock %} {% block content %}
</div>
{% endif %}
<div class="date-quick-nav">
<span>有数据的日期:</span>
{% for d in available_dates[:10] %}
<a
href="/day/{{ d }}"
class="date-chip {% if d == current_date %}active{% endif %}"
>{{ d }}</a
>
{% endfor %}
</div>
{% endblock %}
{% block scripts %}
+1 -1
View File
@@ -20,7 +20,7 @@
{% endif %}
</a>
</h2>
<span class="paper-upvotes">👍 {{ paper.upvotes }}</span>
<span class="paper-upvotes" title="数据更新于 {{ paper.crawled_at.strftime('%m-%d %H:%M') if paper.crawled_at else '' }}">👍 {{ paper.upvotes }}</span>
{% if variant == 'search' and distances and paper.arxiv_id in distances %}
<span class="similarity-score" title="语义相似度距离">
🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}