208 lines
6.8 KiB
Python
208 lines
6.8 KiB
Python
"""爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。"""
|
|
|
|
import logging
|
|
from datetime import date as date_type, datetime, timezone
|
|
|
|
import httpx
|
|
from sqlalchemy import select, text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.models import (
|
|
CrawlLog,
|
|
Paper,
|
|
PaperAuthor,
|
|
PaperTag,
|
|
SummaryState,
|
|
SummaryStatus,
|
|
)
|
|
from app.utils import make_http_client, utc_now
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def fetch_daily(target_date: str, top_n: int | None = None) -> list[dict]:
    """Fetch the paper list for one day from the HF Daily Papers API.

    Args:
        target_date: Day to query, formatted as YYYY-MM-DD.
        top_n: Maximum number of entries to return. A falsy value
            (``None`` or ``0``) falls back to ``settings.TOP_N``.

    Returns:
        The first ``top_n`` raw entries of the API response, in API order.

    Raises:
        httpx.HTTPError: If the request still fails on the final attempt.
        ValueError: If the response body is not a JSON array.
    """
    top_n = top_n or settings.TOP_N
    url = f"{settings.HF_API_BASE}/daily_papers"
    params = {"date": target_date}

    # Stays empty only when HTTP_MAX_RETRIES < 1, i.e. no request is made.
    data: list[dict] = []

    async with make_http_client() as client:
        for attempt in range(1, settings.HTTP_MAX_RETRIES + 1):
            try:
                logger.info(
                    "Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt
                )
                resp = await client.get(url, params=params)
                resp.raise_for_status()
                data = resp.json()
                break
            # HTTPStatusError is a subclass of HTTPError, so one class suffices.
            except httpx.HTTPError as exc:
                logger.warning(
                    "Fetch failed (attempt %d/%d): %s",
                    attempt,
                    settings.HTTP_MAX_RETRIES,
                    exc,
                )
                if attempt == settings.HTTP_MAX_RETRIES:
                    raise

    # Slicing below only makes sense for a JSON array; fail with a clear
    # message instead of an opaque TypeError/KeyError on e.g. an error object.
    if not isinstance(data, list):
        raise ValueError(
            f"Unexpected HF Daily Papers payload for {target_date}: "
            f"expected a list, got {type(data).__name__}"
        )

    papers = data[:top_n]
    logger.info(
        "Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data)
    )
    return papers
|
|
|
|
|
|
def _clean_names(entries: object) -> list[str]:
    """Return the non-empty string names contained in *entries*.

    Each entry may be a plain string or a dict carrying a ``"name"`` key.
    Anything else (``None``, dicts without a usable name, non-list input)
    is skipped so callers always receive a ``list[str]``.
    """
    if not isinstance(entries, (list, tuple)):
        return []
    names: list[str] = []
    for entry in entries:
        name = entry.get("name") if isinstance(entry, dict) else entry
        if isinstance(name, str) and name:
            names.append(name)
    return names


def _parse_paper(item: dict) -> dict:
    """Extract paper metadata from one HF API response entry.

    The entry either nests the paper fields under a ``"paper"`` key or
    carries them at the top level; both layouts are accepted.

    Args:
        item: One raw element of the Daily Papers API response.

    Returns:
        A dict with the keys ``arxiv_id``, ``title_en``, ``abstract``,
        ``published_at`` (a ``date`` or ``None``), ``upvotes``, ``hf_url``,
        ``arxiv_url``, ``pdf_url``, ``authors`` and ``tags``. ``authors`` and
        ``tags`` are lists of non-empty strings; the URLs are empty strings
        when no arXiv id is present.
    """
    paper_info = item.get("paper", item)
    if not isinstance(paper_info, dict):
        # ``"paper": null`` (or any non-object) -> use the top-level fields.
        paper_info = item

    arxiv_id = paper_info.get("id") or ""

    published_at = None
    published_raw = paper_info.get("publishedAt")
    if isinstance(published_raw, str) and published_raw:
        try:
            # Only the leading YYYY-MM-DD part of the timestamp is used.
            published_at = date_type.fromisoformat(published_raw[:10])
        except ValueError:
            # Best effort: an unparseable timestamp means "unknown date".
            published_at = None

    # Prefer the nested value, then the top-level one; never return None.
    upvotes = paper_info.get("upvotes") or item.get("upvotes") or 0

    return {
        "arxiv_id": arxiv_id,
        "title_en": paper_info.get("title", ""),
        "abstract": paper_info.get("abstract", ""),
        "published_at": published_at,
        "upvotes": upvotes,
        "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
        "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
        "authors": _clean_names(paper_info.get("authors")),
        "tags": _clean_names(paper_info.get("tags")),
    }
|
|
|
|
|
|
def _insert_paper(
    db: Session, meta: dict, paper_date: date_type, crawled_at: datetime
) -> Paper:
    """Add one new paper with its authors, tags, summary status and FTS row."""
    paper = Paper(
        arxiv_id=meta["arxiv_id"],
        title_en=meta["title_en"],
        abstract=meta["abstract"],
        published_at=meta["published_at"],
        paper_date=paper_date,
        crawled_at=crawled_at,
        upvotes=meta["upvotes"],
        hf_url=meta["hf_url"],
        arxiv_url=meta["arxiv_url"],
        pdf_url=meta["pdf_url"],
    )
    db.add(paper)
    # Flush so paper.id is populated before the child rows reference it.
    db.flush()

    # ``position`` keeps the index from the source list even when duplicate
    # or unusable entries are skipped.
    seen_authors: set[str] = set()
    for position, name in enumerate(meta["authors"]):
        if not isinstance(name, str) or not name or name in seen_authors:
            continue
        seen_authors.add(name)
        db.add(PaperAuthor(paper_id=paper.id, name=name, position=position))

    # De-duplicate tags the same way authors are de-duplicated.
    seen_tags: set[str] = set()
    for tag_name in meta["tags"]:
        if not isinstance(tag_name, str) or not tag_name or tag_name in seen_tags:
            continue
        seen_tags.add(tag_name)
        db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))

    db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))

    # Only string entries can be joined; anything else is left out of the index.
    authors_text = ", ".join(
        name for name in meta["authors"] if isinstance(name, str) and name
    )
    tags_text = ", ".join(
        tag for tag in meta["tags"] if isinstance(tag, str) and tag
    )
    db.execute(
        text(
            "INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
            "VALUES (:id, :title, :abstract, :authors, :tags)"
        ),
        {
            "id": paper.id,
            "title": meta["title_en"],
            "abstract": meta["abstract"] or "",
            "authors": authors_text,
            "tags": tags_text,
        },
    )
    return paper


def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[Paper]:
    """Write paper metadata to the database.

    Papers already present (matched by ``arxiv_id``) only get their mutable
    fields refreshed (``upvotes`` and ``crawled_at``); they are not inserted
    again. Entries without an arXiv id are skipped. The whole batch is
    committed at the end.

    Args:
        db: Session used for all reads and writes.
        papers_raw: Raw entries as returned by the Daily Papers API.
        paper_date: Day the batch belongs to, formatted as YYYY-MM-DD.

    Returns:
        The ``Paper`` objects newly inserted by this call.

    Raises:
        ValueError: If ``paper_date`` is not a valid ISO date.
        Exception: Any error raised while writing; the session is rolled
            back before the error is re-raised.
    """
    now = datetime.now(timezone.utc)
    paper_date_obj = date_type.fromisoformat(paper_date)
    new_papers: list[Paper] = []

    try:
        for item in papers_raw:
            meta = _parse_paper(item)
            arxiv_id = meta["arxiv_id"]
            if not arxiv_id:
                continue

            existing = db.execute(
                select(Paper).where(Paper.arxiv_id == arxiv_id)
            ).scalar_one_or_none()

            if existing:
                existing.upvotes = meta["upvotes"]
                existing.crawled_at = now
                logger.debug("Updated existing paper: %s", arxiv_id)
                continue

            new_papers.append(_insert_paper(db, meta, paper_date_obj, now))
            logger.debug("Inserted new paper: %s", arxiv_id)

        db.commit()
    except Exception:
        # Discard the partially written batch so that a later commit on the
        # same session cannot persist half of it.
        db.rollback()
        raise

    logger.info(
        "Upserted %d papers (%d new) for %s",
        len(papers_raw),
        len(new_papers),
        paper_date,
    )
    return new_papers
|
|
|
|
|
|
async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict:
    """Run the full crawl pipeline: fetch, persist, and write a crawl log.

    A ``CrawlLog`` row with status ``"running"`` is committed first and then
    updated to ``"success"`` or ``"failed"`` once the crawl finishes.

    Args:
        db: Session used for the log entry and the paper writes.
        target_date: Day to crawl, formatted as YYYY-MM-DD.
        top_n: Optional cap on the number of papers, passed to ``fetch_daily``.

    Returns:
        A dict with the keys ``found``, ``new``, ``status`` and ``error``.
        Failures while fetching or persisting are caught and reported here
        with ``status == "failed"`` instead of being raised.

    Raises:
        ValueError: If ``target_date`` is not a valid ISO date (raised before
            any log entry is written).
    """
    now = utc_now()
    log_entry = CrawlLog(
        task="crawl",
        status="running",
        date=date_type.fromisoformat(target_date),
        started_at=now,
    )
    db.add(log_entry)
    db.commit()

    try:
        raw_papers = await fetch_daily(target_date, top_n)
        new_papers = upsert_papers(db, raw_papers, target_date)
        log_entry.status = "success"
        log_entry.papers_found = len(raw_papers)
        log_entry.papers_new = len(new_papers)
        log_entry.completed_at = utc_now()
        db.commit()
    except Exception as exc:
        logger.exception("Crawl failed for %s", target_date)
        # Reset the session first: it may hold partially written papers or be
        # unusable after a failed flush, and neither must leak into the commit
        # that records the failure.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)
        log_entry.completed_at = utc_now()
        db.commit()
        return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}

    return {
        "found": len(raw_papers),
        "new": len(new_papers),
        "status": "success",
        "error": None,
    }
|