feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI

- PDF extractor: rewrite from embedded bitmap extraction to caption-based
  page region screenshots. Finds Figure/Table captions via regex, crops the
  page region above/below each caption, handles compound figures and vector graphics.
- Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for recent
  N days without inserting new papers. Scheduler runs daily 30min after pipeline.
- Admin: add /admin/refresh-upvotes endpoint and dashboard button.
- UI: remove date quick nav, show upvote update time on detail/card pages,
  clean up CSS date-chip styles.
- Utils: add recent_date_strs() helper.
This commit is contained in:
2026-06-09 18:01:01 +08:00
parent b72b5a31bb
commit 1fc6303e09
13 changed files with 460 additions and 311 deletions
+79 -1
View File
@@ -16,7 +16,7 @@ from app.models import (
SummaryState,
SummaryStatus,
)
from app.utils import make_http_client, utc_now
from app.utils import make_http_client, recent_date_strs, utc_now
logger = logging.getLogger(__name__)
@@ -205,3 +205,81 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
log_entry.completed_at = utc_now()
db.commit()
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
async def refresh_upvotes(db: Session, days: int | None = None) -> dict:
    """Re-fetch upvotes for papers from the most recent ``days`` days.

    No new papers are inserted: for every date returned by
    ``recent_date_strs(days)`` the daily listing is fetched with
    ``fetch_daily`` and handed to ``_update_upvotes_only``, which only
    touches papers that already exist. The run is recorded in a ``CrawlLog``
    row with task ``"upvote_refresh"``.

    Args:
        db: Session used for the log entry and the paper updates.
        days: Number of recent days to refresh. A falsy value (``None`` or
            ``0``) falls back to ``settings.UPVOTE_REFRESH_DAYS``.

    Returns:
        A dict with ``status`` (``"success"``, ``"partial"`` or ``"failed"``),
        ``updated`` and ``days``. Non-failed runs also carry ``errors`` (a
        list of per-day messages, or ``None`` when every day succeeded);
        failed runs carry ``error`` with the exception text.
    """
    days = days or settings.UPVOTE_REFRESH_DAYS
    date_strs = recent_date_strs(days)
    # NOTE(review): assumes recent_date_strs() returns at least one date;
    # an empty list would raise IndexError here before the log row exists.
    log_entry = CrawlLog(
        task="upvote_refresh",
        status="running",
        date=date_type.fromisoformat(date_strs[0]),
        started_at=utc_now(),
    )
    db.add(log_entry)
    db.commit()

    total_updated = 0
    errors: list[str] = []
    try:
        for ds in date_strs:
            try:
                raw_papers = await fetch_daily(ds)
                updated = _update_upvotes_only(db, raw_papers)
            except Exception as exc:
                # A failed flush/commit leaves the session unusable until it
                # is rolled back; rolling back also discards the half-applied
                # (and uncounted) updates of this day so they cannot leak
                # into a later commit.
                db.rollback()
                errors.append(f"{ds}: {exc}")
                logger.warning("Failed to refresh upvotes for %s: %s", ds, exc)
            else:
                total_updated += updated
                logger.info("Refreshed upvotes for %s: %d papers", ds, updated)

        status = "success" if not errors else "partial"
        log_entry.status = status
        log_entry.papers_found = total_updated
        log_entry.details_json = f'{{"days": {days}, "errors": {len(errors)}}}'
        log_entry.completed_at = utc_now()
        db.commit()
        return {
            "status": status,
            "updated": total_updated,
            "days": days,
            "errors": errors or None,
        }
    except Exception as exc:
        logger.exception("Upvote refresh failed")
        # Reset the session first, otherwise recording the failure below
        # would itself fail on the broken transaction.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)
        log_entry.completed_at = utc_now()
        db.commit()
        return {
            "status": "failed",
            "updated": total_updated,
            "days": days,
            "error": str(exc),
        }
def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
    """Update upvotes of already-stored papers; skip papers not in the DB.

    Each raw item is parsed with ``_parse_paper``; items without an
    ``arxiv_id`` are ignored. Matching ``Paper`` rows are loaded with a
    single ``IN`` query instead of one query per item, then ``upvotes`` and
    ``crawled_at`` are overwritten and the session is committed.

    Args:
        db: Session used to look up and update ``Paper`` rows.
        papers_raw: Raw paper dicts as accepted by ``_parse_paper``.

    Returns:
        Number of input items that matched an existing paper (an arxiv id
        that appears twice in ``papers_raw`` is counted twice).
    """
    # NOTE(review): the caller timestamps with utc_now(); confirm whether
    # this helper should use it too for consistency.
    now = datetime.now(timezone.utc)

    # Parse everything up front so a malformed item fails before any row
    # has been modified.
    metas = [meta for meta in map(_parse_paper, papers_raw) if meta["arxiv_id"]]

    updated = 0
    if metas:
        wanted_ids = sorted({meta["arxiv_id"] for meta in metas})
        rows = db.execute(
            select(Paper).where(Paper.arxiv_id.in_(wanted_ids))
        ).scalars().all()
        papers_by_id = {paper.arxiv_id: paper for paper in rows}

        for meta in metas:
            existing = papers_by_id.get(meta["arxiv_id"])
            if existing is None:
                continue
            existing.upvotes = meta["upvotes"]
            existing.crawled_at = now
            updated += 1

    db.commit()
    return updated