feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI
- PDF extractor: rewrite from embedded-bitmap extraction to caption-based page-region screenshots. Finds Figure/Table captions via regex, captures the page region above/below the caption, and handles compound figures and vector graphics.
- Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for papers from the most recent N days without inserting new papers. The scheduler runs it daily, 30 minutes after the pipeline.
- Admin: add the /admin/refresh-upvotes endpoint and a dashboard button.
- UI: remove the date quick nav, show the upvote update time on detail/card pages, and clean up the CSS date-chip styles.
- Utils: add the recent_date_strs() helper.
This commit is contained in:
+19
-2
@@ -26,7 +26,7 @@ from app.models import (
|
||||
)
|
||||
from app.services.admin import get_admin_stats
|
||||
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
|
||||
from app.services.crawler import crawl_daily
|
||||
from app.services.crawler import crawl_daily, refresh_upvotes
|
||||
from app.services.pipeline import run_pipeline
|
||||
from app.services.scheduler import get_scheduler
|
||||
from app.services.summarizer import summarize_batch, summarize_single
|
||||
@@ -129,16 +129,20 @@ async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
|
||||
"""调度器运行状态(JSON)。"""
|
||||
scheduler = get_scheduler()
|
||||
next_run = None
|
||||
upvote_next_run = None
|
||||
if scheduler:
|
||||
for job in scheduler.get_jobs():
|
||||
if job.id == "daily_pipeline":
|
||||
next_run = job.next_run_time
|
||||
break
|
||||
elif job.id == "upvote_refresh":
|
||||
upvote_next_run = job.next_run_time
|
||||
return {
|
||||
"enabled": scheduler is not None,
|
||||
"schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
|
||||
"timezone": settings.APP_TIMEZONE,
|
||||
"next_run": next_run.isoformat() if next_run else None,
|
||||
"upvote_next_run": upvote_next_run.isoformat() if upvote_next_run else None,
|
||||
"upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
|
||||
}
|
||||
|
||||
|
||||
@@ -159,6 +163,19 @@ async def admin_trigger_pipeline(
|
||||
return {"status": "success", "message": "流水线执行完成"}
|
||||
|
||||
|
||||
@router.post("/refresh-upvotes")
async def admin_refresh_upvotes(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    days: int | None = Query(None, description="刷新最近 N 天,默认使用配置值"),
):
    """Manually refresh the upvotes of papers from the most recent N days.

    Delegates to ``refresh_upvotes(db, days=days)`` and relays its result
    dict to the caller.

    Args:
        _admin: Unused; present only so the ``verify_admin`` dependency runs.
        db: Database session supplied by the ``get_db`` dependency.
        days: Number of recent days to refresh. ``None`` is forwarded as-is;
            per the parameter description, the configured default is then
            used (resolved inside ``refresh_upvotes`` — confirm there).

    Returns:
        The dict returned by ``refresh_upvotes`` when its ``"status"`` is
        anything other than ``"failed"``.

    Raises:
        HTTPException: 500, with the result's ``"error"`` value as detail,
            when the refresh reports ``"status" == "failed"``.
    """
    outcome = await refresh_upvotes(db, days=days)

    # Success path first; only a "failed" status is turned into an HTTP error.
    if outcome["status"] != "failed":
        return outcome

    raise HTTPException(status_code=500, detail=outcome.get("error"))
|
||||
|
||||
|
||||
# ── 请求模型 ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
+8
-3
@@ -315,11 +315,16 @@ def _link_figures_with_images(
|
||||
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
|
||||
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
|
||||
|
||||
# 提取的图片也按类型分流,按文件名排序
|
||||
# 提取的图片按类型分流,按文件名中的编号排序
|
||||
def _sort_key(name: str) -> tuple[int, int]:
|
||||
m = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
|
||||
# 新格式:figure_1.png, table_1.png
|
||||
m = re.search(r'(?:figure|table)_(\d+)', name)
|
||||
if m:
|
||||
return (int(m.group(1)), int(m.group(2)))
|
||||
return (0, int(m.group(1)))
|
||||
# 旧格式:page2_img1.png, page5_table1.png
|
||||
m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
|
||||
if m2:
|
||||
return (int(m2.group(1)), int(m2.group(2)))
|
||||
return (0, 0)
|
||||
|
||||
fig_images = sorted(
|
||||
|
||||
Reference in New Issue
Block a user