feat: refactor PDF extraction to caption-based screenshots, add upvote refresh, clean up UI
- PDF extractor: rewrite from embedded bitmap extraction to caption-based page region screenshots. Finds Figure/Table captions via regex, crops the page region above/below the caption, handles compound figures and vector graphics. - Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for the most recent N days without inserting new papers. Scheduler runs it daily, 30 min after the pipeline. - Admin: add /admin/refresh-upvotes endpoint and dashboard button. - UI: remove date quick nav, show upvote update time on detail/card pages, clean up CSS date-chip styles. - Utils: add recent_date_strs() helper.
This commit is contained in:
+79
-1
@@ -16,7 +16,7 @@ from app.models import (
|
||||
SummaryState,
|
||||
SummaryStatus,
|
||||
)
|
||||
from app.utils import make_http_client, utc_now
|
||||
from app.utils import make_http_client, recent_date_strs, utc_now
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -205,3 +205,81 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
|
||||
log_entry.completed_at = utc_now()
|
||||
db.commit()
|
||||
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
|
||||
|
||||
|
||||
async def refresh_upvotes(db: Session, days: int | None = None) -> dict:
    """Re-fetch upvotes for papers from the most recent ``days`` days.

    Calls ``fetch_daily`` once for every date string returned by
    ``recent_date_strs(days)`` and passes each result to
    ``_update_upvotes_only``, which only touches papers already stored.
    The run is recorded in a ``CrawlLog`` row with ``task="upvote_refresh"``.

    A failure on a single date is logged and collected; the remaining dates
    are still processed.

    Args:
        db: Database session used for the log entry and the paper updates.
        days: Number of recent days to refresh. ``None`` (or ``0``) falls
            back to ``settings.UPVOTE_REFRESH_DAYS``.

    Returns:
        On completion, a dict with ``status`` (``"success"`` when no date
        failed, otherwise ``"partial"``), ``updated``, ``days`` and
        ``errors`` (list of ``"<date>: <error>"`` strings, or ``None``).
        If the run itself blows up, a dict with ``status="failed"``,
        ``updated`` and ``error``.
    """
    import json  # local import so this block does not depend on module-level imports

    days = days or settings.UPVOTE_REFRESH_DAYS
    date_strs = recent_date_strs(days)
    now = utc_now()

    log_entry = CrawlLog(
        task="upvote_refresh",
        status="running",
        date=date_type.fromisoformat(date_strs[0]),
        started_at=now,
    )
    db.add(log_entry)
    db.commit()

    total_updated = 0
    errors: list[str] = []

    try:
        for ds in date_strs:
            try:
                raw_papers = await fetch_daily(ds)
                updated = _update_upvotes_only(db, raw_papers)
            except Exception as exc:
                # A failed flush/commit leaves the session unusable until it
                # is rolled back; without this, every following day (and the
                # final log commit) would fail as well.
                db.rollback()
                errors.append(f"{ds}: {exc}")
                logger.warning("Failed to refresh upvotes for %s: %s", ds, exc)
            else:
                total_updated += updated
                logger.info("Refreshed upvotes for %s: %d papers", ds, updated)

        status = "success" if not errors else "partial"
        log_entry.status = status
        log_entry.papers_found = total_updated
        log_entry.details_json = json.dumps({"days": days, "errors": len(errors)})
        log_entry.completed_at = utc_now()
        db.commit()

        return {
            "status": status,
            "updated": total_updated,
            "days": days,
            "errors": errors or None,
        }
    except Exception as exc:
        logger.exception("Upvote refresh failed")
        # Reset the session first so the failure record itself can be saved.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)
        log_entry.completed_at = utc_now()
        db.commit()
        return {"status": "failed", "updated": total_updated, "error": str(exc)}
|
||||
|
||||
|
||||
def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
    """Update upvotes for papers that already exist; skip unknown papers.

    Each raw item is parsed with ``_parse_paper``. Items without an
    ``arxiv_id`` are ignored, and papers not present in the database are
    left alone (nothing is inserted). Matching rows get their ``upvotes``
    and ``crawled_at`` overwritten, then the session is committed.

    Args:
        db: Database session used for the lookup and the commit.
        papers_raw: Raw paper dicts as accepted by ``_parse_paper``.

    Returns:
        Number of input items that matched an existing paper.
    """
    now = datetime.now(timezone.utc)

    # Parse up front so every candidate can be looked up in a single query
    # instead of one SELECT per paper.
    parsed = [_parse_paper(item) for item in papers_raw]
    parsed = [meta for meta in parsed if meta["arxiv_id"]]

    existing_by_id: dict = {}
    if parsed:
        wanted_ids = list({meta["arxiv_id"] for meta in parsed})
        rows = db.execute(
            select(Paper).where(Paper.arxiv_id.in_(wanted_ids))
        ).scalars()
        # NOTE(review): assumes arxiv_id identifies at most one Paper row - confirm.
        existing_by_id = {paper.arxiv_id: paper for paper in rows}

    updated = 0
    for meta in parsed:
        paper = existing_by_id.get(meta["arxiv_id"])
        if paper is None:
            continue
        paper.upvotes = meta["upvotes"]
        paper.crawled_at = now
        updated += 1

    db.commit()
    return updated
|
||||
|
||||
Reference in New Issue
Block a user