feat: initial project structure

- Add FastAPI app with paper browsing UI and REST API
- Add crawler service and database models
- Add scripts for DB init and manual crawl
- Add docs (api-and-ui, data-model, services)
- Add requirements and project config
2026-06-05 21:56:40 +08:00
commit f1be24ab83
26 changed files with 2557 additions and 0 deletions
@@ -0,0 +1,182 @@
+"""爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。"""
+
+import logging
+from datetime import date as date_type
+from datetime import datetime, timezone
+
+import httpx
+from sqlalchemy import select, text
+from sqlalchemy.orm import Session
+
+from app.config import settings
+from app.models import (
+    CrawlLog,
+    Paper,
+    PaperAuthor,
+    PaperTag,
+    SummaryStatus,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def fetch_daily(target_date: str, top_n: int | None = None) -> list[dict]:
+    """从 HF Daily Papers API 获取指定日期的论文列表。
+
+    Args:
+        target_date: YYYY-MM-DD 格式
+        top_n: 取前 N 篇，默认使用 settings.TOP_N
+
+    Returns:
+        论文元数据列表
+    """
+    top_n = top_n or settings.TOP_N
+    url = f"{settings.HF_API_BASE}/daily_papers"
+    params = {"date": target_date}
+
+    transport = None
+    if settings.http_proxy:
+        transport = httpx.AsyncHTTPTransport(proxy=settings.http_proxy)
+
+    async with httpx.AsyncClient(
+        timeout=settings.HTTP_TIMEOUT_SECONDS,
+        headers={"User-Agent": settings.HTTP_USER_AGENT},
+        transport=transport,
+    ) as client:
+        for attempt in range(1, settings.HTTP_MAX_RETRIES + 1):
+            try:
+                logger.info("Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt)
+                resp = await client.get(url, params=params)
+                resp.raise_for_status()
+                data = resp.json()
+                break
+            except (httpx.HTTPError, httpx.HTTPStatusError) as exc:
+                logger.warning("Fetch failed (attempt %d/%d): %s", attempt, settings.HTTP_MAX_RETRIES, exc)
+                if attempt == settings.HTTP_MAX_RETRIES:
+                    raise
+        else:
+            data = []
+
+    papers = data[:top_n]
+    logger.info("Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data))
+    return papers
+
+
+def _parse_paper(item: dict) -> dict:
+    """从 HF API 响应中提取论文元数据。"""
+    paper_info = item.get("paper", item)
+    arxiv_id = paper_info.get("id", "")
+    published_raw = paper_info.get("publishedAt", "")
+    published_at = None
+    if published_raw:
+        try:
+            published_at = date_type.fromisoformat(published_raw[:10])
+        except ValueError:
+            pass
+    return {
+        "arxiv_id": arxiv_id,
+        "title_en": paper_info.get("title", ""),
+        "abstract": paper_info.get("abstract", ""),
+        "published_at": published_at,
+        "upvotes": item.get("paper", {}).get("upvotes", 0) or item.get("upvotes", 0),
+        "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
+        "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
+        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
+        "authors": [a.get("name", a) if isinstance(a, dict) else a for a in paper_info.get("authors", [])],
+        "tags": [t.get("name", t) if isinstance(t, dict) else t for t in (paper_info.get("tags") or [])],
+    }
+
+
+def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[Paper]:
+    """将论文元数据写入数据库。已有论文仅更新可变字段（upvotes 等），不重复插入。"""
+    now = datetime.now(timezone.utc)
+    paper_date_obj = date_type.fromisoformat(paper_date)
+    new_papers: list[Paper] = []
+
+    for item in papers_raw:
+        meta = _parse_paper(item)
+        arxiv_id = meta["arxiv_id"]
+        if not arxiv_id:
+            continue
+
+        existing = db.execute(
+            select(Paper).where(Paper.arxiv_id == arxiv_id)
+        ).scalar_one_or_none()
+
+        if existing:
+            existing.upvotes = meta["upvotes"]
+            existing.crawled_at = now
+            logger.debug("Updated existing paper: %s", arxiv_id)
+        else:
+            paper = Paper(
+                arxiv_id=arxiv_id,
+                title_en=meta["title_en"],
+                abstract=meta["abstract"],
+                published_at=meta["published_at"],
+                paper_date=paper_date_obj,
+                crawled_at=now,
+                upvotes=meta["upvotes"],
+                hf_url=meta["hf_url"],
+                arxiv_url=meta["arxiv_url"],
+                pdf_url=meta["pdf_url"],
+            )
+            db.add(paper)
+            db.flush()
+
+            for idx, name in enumerate(meta["authors"]):
+                if name:
+                    db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx))
+
+            for tag_name in meta["tags"]:
+                if tag_name:
+                    db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))
+
+            db.add(SummaryStatus(paper_id=paper.id, status="pending"))
+
+            authors_text = ", ".join(meta["authors"])
+            tags_text = ", ".join(meta["tags"])
+            db.execute(
+                text(
+                    "INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
+                    "VALUES (:id, :title, :abstract, :authors, :tags)"
+                ),
+                {"id": paper.id, "title": meta["title_en"], "abstract": meta["abstract"] or "",
+                 "authors": authors_text, "tags": tags_text},
+            )
+
+            new_papers.append(paper)
+            logger.debug("Inserted new paper: %s", arxiv_id)
+
+    db.commit()
+    logger.info("Upserted %d papers (%d new) for %s", len(papers_raw), len(new_papers), paper_date)
+    return new_papers
+
+
+async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict:
+    """完整的抓取流程：获取 + 入库 + 写日志。"""
+    now = datetime.now(timezone.utc)
+    log_entry = CrawlLog(
+        task="crawl",
+        status="running",
+        date=date_type.fromisoformat(target_date),
+        started_at=now,
+    )
+    db.add(log_entry)
+    db.commit()
+
+    try:
+        raw_papers = await fetch_daily(target_date, top_n)
+        new_papers = upsert_papers(db, raw_papers, target_date)
+        log_entry.status = "success"
+        log_entry.papers_found = len(raw_papers)
+        log_entry.papers_new = len(new_papers)
+        log_entry.completed_at = datetime.now(timezone.utc)
+        db.commit()
+        return {"found": len(raw_papers), "new": len(new_papers), "status": "success", "error": None}
+    except Exception as exc:
+        logger.exception("Crawl failed for %s", target_date)
+        log_entry.status = "failed"
+        log_entry.error = str(exc)
+        log_entry.completed_at = datetime.now(timezone.utc)
+        db.commit()
+        return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}