Files
daily-paper/app/services/crawler.py
T
Rain-Bus 743d69efd0 refactor: extract admin business logic to services, introduce job queue, add derived index helpers
- Move DB operations from routes/admin.py to services/admin.py (get_logs_context, query_summary_statuses, retry_failed, delete/reset operations)
- Add services/jobs.py with Job/JobEvent-based async job queue (create_job, run_job, enqueue_job)
- Add services/derived.py with FTS5 reindex and paper index deletion helpers
- Refactor scheduler to use job queue instead of direct pipeline calls
- Add heartbeat_at/expires_at to TaskLock for lock health tracking
- Remove DESIGN_REVIEW.md
- Update tests: remove redundant integration tests, add unit tests for new services
2026-06-13 18:31:43 +08:00

273 lines
8.7 KiB
Python

"""爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。"""
import logging
from datetime import date as date_type, datetime, timezone
import httpx
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import settings
from app.models import (
CrawlLog,
Paper,
PaperAuthor,
PaperTag,
SummaryState,
SummaryStatus,
)
from app.services.derived import reindex_paper_fts
from app.utils import make_http_client, recent_date_strs, utc_now
logger = logging.getLogger(__name__)
async def fetch_daily(target_date: str, top_n: int | None = None) -> list[dict]:
    """Fetch the paper list for one date from the HF Daily Papers API.

    The request is attempted up to ``settings.HTTP_MAX_RETRIES`` times. Both
    transport/status errors and an undecodable JSON body count as a failed
    attempt; the error from the final attempt is re-raised to the caller.

    Args:
        target_date: Date in ``YYYY-MM-DD`` format, sent as the ``date``
            query parameter.
        top_n: Keep only the first N entries. A falsy value (``None`` or 0)
            falls back to ``settings.TOP_N``.

    Returns:
        The first ``top_n`` entries of the API response list.

    Raises:
        httpx.HTTPError: If the last attempt fails at the HTTP level
            (this includes ``httpx.HTTPStatusError`` from ``raise_for_status``).
        ValueError: If the last attempt returns a body that is not valid
            JSON, or if the decoded payload is not a list.
    """
    top_n = top_n or settings.TOP_N
    url = f"{settings.HF_API_BASE}/daily_papers"
    params = {"date": target_date}
    max_attempts = settings.HTTP_MAX_RETRIES

    # Remains empty only when the retry budget is zero and no request is sent.
    data = []
    async with make_http_client() as client:
        for attempt in range(1, max_attempts + 1):
            try:
                logger.info(
                    "Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt
                )
                resp = await client.get(url, params=params)
                resp.raise_for_status()
                data = resp.json()
                break
            # HTTPStatusError is a subclass of HTTPError, so HTTPError alone
            # covers both. ValueError covers a body that fails JSON decoding.
            except (httpx.HTTPError, ValueError) as exc:
                logger.warning(
                    "Fetch failed (attempt %d/%d): %s",
                    attempt,
                    max_attempts,
                    exc,
                )
                if attempt == max_attempts:
                    raise

    # Slicing a non-list payload (e.g. an error object) would fail with an
    # unrelated TypeError/KeyError; reject it explicitly instead.
    if not isinstance(data, list):
        raise ValueError(
            f"Unexpected HF Daily Papers payload for {target_date}: "
            f"expected a list, got {type(data).__name__}"
        )

    papers = data[:top_n]
    logger.info(
        "Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data)
    )
    return papers
def _extract_name(entry):
    """Return the name carried by an author/tag entry.

    Dict entries yield their ``"name"`` value, or ``""`` when it is missing
    or null; any other entry is returned unchanged.
    """
    if isinstance(entry, dict):
        return entry.get("name") or ""
    return entry


def _parse_paper(item: dict) -> dict:
    """Extract paper metadata from one HF Daily Papers API entry.

    The paper fields are read from ``item["paper"]`` when that is a dict,
    otherwise from ``item`` itself. Missing or null fields degrade to empty
    values instead of raising.

    Args:
        item: One element of the API response list.

    Returns:
        A dict with the keys ``arxiv_id``, ``title_en``, ``abstract``,
        ``published_at`` (a ``date`` or ``None``), ``upvotes``, ``hf_url``,
        ``arxiv_url``, ``pdf_url``, ``authors`` and ``tags``. The three URLs
        are empty strings when no arXiv id is present. ``authors`` and
        ``tags`` keep one element per input entry so that positions are
        preserved; entries without a name become ``""``.
    """
    paper_info = item.get("paper", item)
    if not isinstance(paper_info, dict):
        # "paper" is null or not an object: read the fields from the entry.
        paper_info = item

    arxiv_id = paper_info.get("id") or ""

    published_at = None
    published_raw = paper_info.get("publishedAt")
    if isinstance(published_raw, str) and published_raw:
        try:
            # Only the leading YYYY-MM-DD part of the timestamp is used.
            published_at = date_type.fromisoformat(published_raw[:10])
        except ValueError:
            # Unparseable date: keep published_at as None.
            pass

    # Prefer the count on the paper object, then the one on the entry; a
    # missing or null count becomes 0 rather than None.
    upvotes = paper_info.get("upvotes") or item.get("upvotes") or 0

    return {
        "arxiv_id": arxiv_id,
        "title_en": paper_info.get("title", ""),
        "abstract": paper_info.get("abstract", ""),
        "published_at": published_at,
        "upvotes": upvotes,
        "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
        "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else "",
        # `or []` guards against an explicit null, same as for tags.
        "authors": [_extract_name(a) for a in (paper_info.get("authors") or [])],
        "tags": [_extract_name(t) for t in (paper_info.get("tags") or [])],
    }
def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[Paper]:
    """Write crawled paper metadata to the database.

    Papers already present (matched by ``arxiv_id``) only get their
    ``upvotes`` and ``crawled_at`` refreshed. New papers are inserted together
    with their authors, tags and a pending ``SummaryStatus`` row, and are
    passed to ``reindex_paper_fts``. Everything is committed once at the end.

    Args:
        db: Session used for all reads and writes.
        papers_raw: Raw API entries, each parsed with ``_parse_paper``.
            Entries without an arXiv id are skipped.
        paper_date: ``YYYY-MM-DD`` date stored as ``paper_date`` on new papers.

    Returns:
        The newly inserted ``Paper`` objects (existing papers are excluded).

    Raises:
        ValueError: If ``paper_date`` is not a valid ISO date.
    """
    now = datetime.now(timezone.utc)
    paper_date_obj = date_type.fromisoformat(paper_date)
    new_papers: list[Paper] = []

    for item in papers_raw:
        meta = _parse_paper(item)
        arxiv_id = meta["arxiv_id"]
        if not arxiv_id:
            continue

        existing = db.execute(
            select(Paper).where(Paper.arxiv_id == arxiv_id)
        ).scalar_one_or_none()
        if existing:
            existing.upvotes = meta["upvotes"]
            existing.crawled_at = now
            logger.debug("Updated existing paper: %s", arxiv_id)
            continue

        paper = Paper(
            arxiv_id=arxiv_id,
            title_en=meta["title_en"],
            abstract=meta["abstract"],
            published_at=meta["published_at"],
            paper_date=paper_date_obj,
            crawled_at=now,
            upvotes=meta["upvotes"],
            hf_url=meta["hf_url"],
            arxiv_url=meta["arxiv_url"],
            pdf_url=meta["pdf_url"],
        )
        db.add(paper)
        # Flush before creating child rows, which reference paper.id.
        db.flush()

        # Each author is stored once. `position` keeps the index from the
        # source list, so skipped entries leave gaps rather than renumbering.
        seen_authors: set[str] = set()
        for idx, name in enumerate(meta["authors"]):
            # Non-string entries are skipped: they cannot be stored as a name
            # and an unhashable one would break the membership test.
            if not isinstance(name, str) or not name or name in seen_authors:
                continue
            seen_authors.add(name)
            db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx))

        # Tags are de-duplicated the same way as authors so a repeated tag in
        # the payload does not produce duplicate rows for one paper.
        seen_tags: set[str] = set()
        for tag_name in meta["tags"]:
            if not isinstance(tag_name, str) or not tag_name or tag_name in seen_tags:
                continue
            seen_tags.add(tag_name)
            db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))

        db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
        reindex_paper_fts(db, paper)
        new_papers.append(paper)
        logger.debug("Inserted new paper: %s", arxiv_id)

    db.commit()
    logger.info(
        "Upserted %d papers (%d new) for %s",
        len(papers_raw),
        len(new_papers),
        paper_date,
    )
    return new_papers
async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict:
    """Run the full crawl for one date: fetch, store, and record a log entry.

    A ``CrawlLog`` row with status ``"running"`` is committed first, then
    updated to ``"success"`` or ``"failed"`` once the crawl finishes. Errors
    raised while fetching or storing are caught and reported in the result
    instead of propagating.

    Args:
        db: Session used for the log entry and the paper upsert.
        target_date: Date in ``YYYY-MM-DD`` format.
        top_n: Forwarded to ``fetch_daily``.

    Returns:
        A dict with ``found`` (entries fetched), ``new`` (papers inserted),
        ``status`` (``"success"`` or ``"failed"``) and ``error`` (the error
        message, or ``None`` on success).

    Raises:
        ValueError: If ``target_date`` is not a valid ISO date; this happens
            before the log entry is created.
    """
    now = utc_now()
    log_entry = CrawlLog(
        task="crawl",
        status="running",
        date=date_type.fromisoformat(target_date),
        started_at=now,
    )
    db.add(log_entry)
    db.commit()

    try:
        raw_papers = await fetch_daily(target_date, top_n)
        new_papers = upsert_papers(db, raw_papers, target_date)
        log_entry.status = "success"
        log_entry.papers_found = len(raw_papers)
        log_entry.papers_new = len(new_papers)
        log_entry.completed_at = utc_now()
        db.commit()
        return {
            "found": len(raw_papers),
            "new": len(new_papers),
            "status": "success",
            "error": None,
        }
    except Exception as exc:
        logger.exception("Crawl failed for %s", target_date)
        # If the failure came from a flush/commit, the session refuses further
        # work until it is rolled back. Roll back first so the failure can
        # actually be recorded; this also discards any half-written papers.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)
        log_entry.completed_at = utc_now()
        db.commit()
        return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
async def refresh_upvotes(db: Session, days: int | None = None) -> dict:
    """Re-fetch upvotes for recent days without inserting new papers.

    For every date string returned by ``recent_date_strs(days)`` the HF API is
    queried and papers already in the database get their ``upvotes`` and
    ``crawled_at`` updated. A failure on one day is recorded and the loop
    continues with the next day. The run is tracked in a ``CrawlLog`` row
    whose date is the first date string in that list.

    Args:
        db: Session used for the log entry and the updates.
        days: Number of days to refresh. A falsy value (``None`` or 0) falls
            back to ``settings.UPVOTE_REFRESH_DAYS``.

    Returns:
        On completion: a dict with ``status`` (``"success"``, or ``"partial"``
        if any day failed), ``updated``, ``days`` and ``errors`` (a list of
        ``"<date>: <message>"`` strings, or ``None``). On an unexpected
        failure: a dict with ``status`` ``"failed"``, ``updated`` and
        ``error``.
    """
    days = days or settings.UPVOTE_REFRESH_DAYS
    date_strs = recent_date_strs(days)
    now = utc_now()
    log_entry = CrawlLog(
        task="upvote_refresh",
        status="running",
        date=date_type.fromisoformat(date_strs[0]),
        started_at=now,
    )
    db.add(log_entry)
    db.commit()

    total_updated = 0
    errors: list[str] = []
    try:
        for ds in date_strs:
            try:
                raw_papers = await fetch_daily(ds)
                updated = _update_upvotes_only(db, raw_papers)
                total_updated += updated
                logger.info("Refreshed upvotes for %s: %d papers", ds, updated)
            except Exception as exc:
                # A failed flush/commit leaves the session unusable until it
                # is rolled back. Reset it so this day's failure does not
                # break the remaining days or the final log update.
                db.rollback()
                msg = f"{ds}: {exc}"
                errors.append(msg)
                logger.warning("Failed to refresh upvotes for %s: %s", ds, exc)

        log_entry.status = "success" if not errors else "partial"
        log_entry.papers_found = total_updated
        log_entry.details_json = f'{{"days": {days}, "errors": {len(errors)}}}'
        log_entry.completed_at = utc_now()
        db.commit()
        return {
            "status": "success" if not errors else "partial",
            "updated": total_updated,
            "days": days,
            "errors": errors or None,
        }
    except Exception as exc:
        logger.exception("Upvote refresh failed")
        # Same reasoning as above: roll back before writing the failure.
        db.rollback()
        log_entry.status = "failed"
        log_entry.error = str(exc)
        log_entry.completed_at = utc_now()
        db.commit()
        return {"status": "failed", "updated": total_updated, "error": str(exc)}
def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
    """Refresh upvotes on papers already stored; unknown papers are ignored.

    Each raw entry is parsed with ``_parse_paper`` and looked up by its arXiv
    id. Matching rows get the new ``upvotes`` value and a fresh ``crawled_at``
    timestamp. All changes are committed once after the loop.

    Args:
        db: Session used for the lookups and the commit.
        papers_raw: Raw API entries.

    Returns:
        The number of stored papers that were updated.
    """
    refreshed_at = datetime.now(timezone.utc)
    touched = 0

    for raw in papers_raw:
        parsed = _parse_paper(raw)
        if not (paper_key := parsed["arxiv_id"]):
            # Entry carries no arXiv id, so there is nothing to match on.
            continue

        lookup = select(Paper).where(Paper.arxiv_id == paper_key)
        row = db.execute(lookup).scalar_one_or_none()
        if not row:
            # Not in the database: this function never inserts.
            continue

        row.upvotes = parsed["upvotes"]
        row.crawled_at = refreshed_at
        touched += 1

    db.commit()
    return touched