feat: add admin dashboard, pipeline service, lightbox, and update dependencies

This commit is contained in:
2026-06-09 09:32:10 +08:00
parent 0d293422ac
commit 32978b3fc5
50 changed files with 4054 additions and 1618 deletions
+10 -8
View File
@@ -1,8 +1,7 @@
"""爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。"""
import logging
from datetime import date as date_type
from datetime import datetime, timezone
from datetime import date as date_type, datetime, timezone
import httpx
from sqlalchemy import select, text
@@ -14,9 +13,10 @@ from app.models import (
Paper,
PaperAuthor,
PaperTag,
SummaryState,
SummaryStatus,
)
from app.utils import make_http_client
from app.utils import make_http_client, utc_now
logger = logging.getLogger(__name__)
@@ -131,15 +131,17 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
db.add(paper)
db.flush()
seen_authors: set[str] = set()
for idx, name in enumerate(meta["authors"]):
if name:
if name and name not in seen_authors:
seen_authors.add(name)
db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx))
for tag_name in meta["tags"]:
if tag_name:
db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))
db.add(SummaryStatus(paper_id=paper.id, status="pending"))
db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
authors_text = ", ".join(meta["authors"])
tags_text = ", ".join(meta["tags"])
@@ -172,7 +174,7 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict:
"""完整的抓取流程:获取 + 入库 + 写日志。"""
now = datetime.now(timezone.utc)
now = utc_now()
log_entry = CrawlLog(
task="crawl",
status="running",
@@ -188,7 +190,7 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
log_entry.status = "success"
log_entry.papers_found = len(raw_papers)
log_entry.papers_new = len(new_papers)
log_entry.completed_at = datetime.now(timezone.utc)
log_entry.completed_at = utc_now()
db.commit()
return {
"found": len(raw_papers),
@@ -200,6 +202,6 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
logger.exception("Crawl failed for %s", target_date)
log_entry.status = "failed"
log_entry.error = str(exc)
log_entry.completed_at = datetime.now(timezone.utc)
log_entry.completed_at = utc_now()
db.commit()
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}