"""爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。""" import logging from datetime import date as date_type, datetime, timezone import httpx from sqlalchemy import select, text from sqlalchemy.orm import Session from app.config import settings from app.models import ( CrawlLog, Paper, PaperAuthor, PaperTag, SummaryState, SummaryStatus, ) from app.utils import make_http_client, recent_date_strs, utc_now logger = logging.getLogger(__name__) async def fetch_daily(target_date: str, top_n: int | None = None) -> list[dict]: """从 HF Daily Papers API 获取指定日期的论文列表。 Args: target_date: YYYY-MM-DD 格式 top_n: 取前 N 篇,默认使用 settings.TOP_N Returns: 论文元数据列表 """ top_n = top_n or settings.TOP_N url = f"{settings.HF_API_BASE}/daily_papers" params = {"date": target_date} async with make_http_client() as client: for attempt in range(1, settings.HTTP_MAX_RETRIES + 1): try: logger.info( "Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt ) resp = await client.get(url, params=params) resp.raise_for_status() data = resp.json() break except (httpx.HTTPError, httpx.HTTPStatusError) as exc: logger.warning( "Fetch failed (attempt %d/%d): %s", attempt, settings.HTTP_MAX_RETRIES, exc, ) if attempt == settings.HTTP_MAX_RETRIES: raise else: data = [] papers = data[:top_n] logger.info( "Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data) ) return papers def _parse_paper(item: dict) -> dict: """从 HF API 响应中提取论文元数据。""" paper_info = item.get("paper", item) arxiv_id = paper_info.get("id", "") published_raw = paper_info.get("publishedAt", "") published_at = None if published_raw: try: published_at = date_type.fromisoformat(published_raw[:10]) except ValueError: pass return { "arxiv_id": arxiv_id, "title_en": paper_info.get("title", ""), "abstract": paper_info.get("abstract", ""), "published_at": published_at, "upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0), "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "", "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "", "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "", "authors": [ a.get("name", a) if isinstance(a, dict) else a for a in paper_info.get("authors", []) ], "tags": [ t.get("name", t) if isinstance(t, dict) else t for t in (paper_info.get("tags") or []) ], } def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[Paper]: """将论文元数据写入数据库。已有论文仅更新可变字段(upvotes 等),不重复插入。""" now = datetime.now(timezone.utc) paper_date_obj = date_type.fromisoformat(paper_date) new_papers: list[Paper] = [] for item in papers_raw: meta = _parse_paper(item) arxiv_id = meta["arxiv_id"] if not arxiv_id: continue existing = db.execute( select(Paper).where(Paper.arxiv_id == arxiv_id) ).scalar_one_or_none() if existing: existing.upvotes = meta["upvotes"] existing.crawled_at = now logger.debug("Updated existing paper: %s", arxiv_id) else: paper = Paper( arxiv_id=arxiv_id, title_en=meta["title_en"], abstract=meta["abstract"], published_at=meta["published_at"], paper_date=paper_date_obj, crawled_at=now, upvotes=meta["upvotes"], hf_url=meta["hf_url"], arxiv_url=meta["arxiv_url"], pdf_url=meta["pdf_url"], ) db.add(paper) db.flush() seen_authors: set[str] = set() for idx, name in enumerate(meta["authors"]): if name and name not in seen_authors: seen_authors.add(name) db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx)) for tag_name in meta["tags"]: if tag_name: db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf")) db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING)) authors_text = ", ".join(meta["authors"]) tags_text = ", ".join(meta["tags"]) db.execute( text( "INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) " "VALUES (:id, :title, :abstract, :authors, :tags)" ), { "id": paper.id, "title": meta["title_en"], "abstract": meta["abstract"] or "", "authors": authors_text, "tags": tags_text, }, ) new_papers.append(paper) logger.debug("Inserted new paper: %s", arxiv_id) db.commit() logger.info( "Upserted %d papers (%d new) for %s", len(papers_raw), len(new_papers), paper_date, ) return new_papers async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict: """完整的抓取流程:获取 + 入库 + 写日志。""" now = utc_now() log_entry = CrawlLog( task="crawl", status="running", date=date_type.fromisoformat(target_date), started_at=now, ) db.add(log_entry) db.commit() try: raw_papers = await fetch_daily(target_date, top_n) new_papers = upsert_papers(db, raw_papers, target_date) log_entry.status = "success" log_entry.papers_found = len(raw_papers) log_entry.papers_new = len(new_papers) log_entry.completed_at = utc_now() db.commit() return { "found": len(raw_papers), "new": len(new_papers), "status": "success", "error": None, } except Exception as exc: logger.exception("Crawl failed for %s", target_date) log_entry.status = "failed" log_entry.error = str(exc) log_entry.completed_at = utc_now() db.commit() return {"found": 0, "new": 0, "status": "failed", "error": str(exc)} async def refresh_upvotes(db: Session, days: int | None = None) -> dict: """重新抓取最近 N 天论文的 upvotes,不插入新论文。 遍历每天调用 HF API,对已有论文仅更新 upvotes 和 crawled_at。 """ days = days or settings.UPVOTE_REFRESH_DAYS date_strs = recent_date_strs(days) now = utc_now() log_entry = CrawlLog( task="upvote_refresh", status="running", date=date_type.fromisoformat(date_strs[0]), started_at=now, ) db.add(log_entry) db.commit() total_updated = 0 errors: list[str] = [] try: for ds in date_strs: try: raw_papers = await fetch_daily(ds) updated = _update_upvotes_only(db, raw_papers) total_updated += updated logger.info("Refreshed upvotes for %s: %d papers", ds, updated) except Exception as exc: msg = f"{ds}: {exc}" errors.append(msg) logger.warning("Failed to refresh upvotes for %s: %s", ds, exc) log_entry.status = "success" if not errors else "partial" log_entry.papers_found = total_updated log_entry.details_json = f'{{"days": {days}, "errors": {len(errors)}}}' log_entry.completed_at = utc_now() db.commit() return { "status": "success" if not errors else "partial", "updated": total_updated, "days": days, "errors": errors or None, } except Exception as exc: logger.exception("Upvote refresh failed") log_entry.status = "failed" log_entry.error = str(exc) log_entry.completed_at = utc_now() db.commit() return {"status": "failed", "updated": total_updated, "error": str(exc)} def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int: """对已有论文更新 upvotes,跳过不存在的新论文。""" now = datetime.now(timezone.utc) updated = 0 for item in papers_raw: meta = _parse_paper(item) arxiv_id = meta["arxiv_id"] if not arxiv_id: continue existing = db.execute( select(Paper).where(Paper.arxiv_id == arxiv_id) ).scalar_one_or_none() if existing: existing.upvotes = meta["upvotes"] existing.crawled_at = now updated += 1 db.commit() return updated