feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI

- PDF extractor: rewrite from embedded bitmap extraction to caption-based
  page region screenshots. Finds Figure/Table captions via regex, captures the
  page region above/below the caption, and handles compound figures and
  vector graphics.
- Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for recent
  N days without inserting new papers. Scheduler runs daily 30min after pipeline.
- Admin: add /admin/refresh-upvotes endpoint and dashboard button.
- UI: remove date quick nav, show upvote update time on detail/card pages,
  clean up CSS date-chip styles.
- Utils: add recent_date_strs() helper.
This commit is contained in:
2026-06-09 18:01:01 +08:00
parent b72b5a31bb
commit 1fc6303e09
13 changed files with 460 additions and 311 deletions
+19 -2
View File
@@ -26,7 +26,7 @@ from app.models import (
)
from app.services.admin import get_admin_stats
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily
from app.services.crawler import crawl_daily, refresh_upvotes
from app.services.pipeline import run_pipeline
from app.services.scheduler import get_scheduler
from app.services.summarizer import summarize_batch, summarize_single
@@ -129,16 +129,20 @@ async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
"""调度器运行状态(JSON)。"""
scheduler = get_scheduler()
next_run = None
upvote_next_run = None
if scheduler:
for job in scheduler.get_jobs():
if job.id == "daily_pipeline":
next_run = job.next_run_time
break
elif job.id == "upvote_refresh":
upvote_next_run = job.next_run_time
return {
"enabled": scheduler is not None,
"schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
"timezone": settings.APP_TIMEZONE,
"next_run": next_run.isoformat() if next_run else None,
"upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
}
@@ -159,6 +163,19 @@ async def admin_trigger_pipeline(
return {"status": "success", "message": "流水线执行完成"}
@router.post("/refresh-upvotes")
async def admin_refresh_upvotes(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    days: int | None = Query(None, description="刷新最近 N 天,默认使用配置值"),
):
    """Manually refresh the upvotes of papers from the most recent N days.

    Delegates to ``refresh_upvotes`` with the request's database session and
    the optional ``days`` query parameter (``None`` is passed through as-is).

    Returns:
        The result mapping produced by ``refresh_upvotes`` when its
        ``"status"`` is anything other than ``"failed"``.

    Raises:
        HTTPException: 500 when the result's ``"status"`` is ``"failed"``;
            the detail is the result's ``"error"`` entry, if any.
    """
    outcome = await refresh_upvotes(db, days=days)
    if outcome["status"] != "failed":
        return outcome
    # Surface the refresh failure to the client as a server error.
    raise HTTPException(status_code=500, detail=outcome.get("error"))
# ── 请求模型 ──────────────────────────────────────────────────────────