feat: add admin dashboard, pipeline service, lightbox, and update dependencies
This commit is contained in:
+10
-8
@@ -1,8 +1,7 @@
|
||||
"""爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。"""
|
||||
|
||||
import logging
|
||||
from datetime import date as date_type
|
||||
from datetime import datetime, timezone
|
||||
from datetime import date as date_type, datetime, timezone
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select, text
|
||||
@@ -14,9 +13,10 @@ from app.models import (
|
||||
Paper,
|
||||
PaperAuthor,
|
||||
PaperTag,
|
||||
SummaryState,
|
||||
SummaryStatus,
|
||||
)
|
||||
from app.utils import make_http_client
|
||||
from app.utils import make_http_client, utc_now
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -131,15 +131,17 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
|
||||
db.add(paper)
|
||||
db.flush()
|
||||
|
||||
seen_authors: set[str] = set()
|
||||
for idx, name in enumerate(meta["authors"]):
|
||||
if name:
|
||||
if name and name not in seen_authors:
|
||||
seen_authors.add(name)
|
||||
db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx))
|
||||
|
||||
for tag_name in meta["tags"]:
|
||||
if tag_name:
|
||||
db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))
|
||||
|
||||
db.add(SummaryStatus(paper_id=paper.id, status="pending"))
|
||||
db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
|
||||
|
||||
authors_text = ", ".join(meta["authors"])
|
||||
tags_text = ", ".join(meta["tags"])
|
||||
@@ -172,7 +174,7 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
|
||||
|
||||
async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict:
|
||||
"""完整的抓取流程:获取 + 入库 + 写日志。"""
|
||||
now = datetime.now(timezone.utc)
|
||||
now = utc_now()
|
||||
log_entry = CrawlLog(
|
||||
task="crawl",
|
||||
status="running",
|
||||
@@ -188,7 +190,7 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
|
||||
log_entry.status = "success"
|
||||
log_entry.papers_found = len(raw_papers)
|
||||
log_entry.papers_new = len(new_papers)
|
||||
log_entry.completed_at = datetime.now(timezone.utc)
|
||||
log_entry.completed_at = utc_now()
|
||||
db.commit()
|
||||
return {
|
||||
"found": len(raw_papers),
|
||||
@@ -200,6 +202,6 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
|
||||
logger.exception("Crawl failed for %s", target_date)
|
||||
log_entry.status = "failed"
|
||||
log_entry.error = str(exc)
|
||||
log_entry.completed_at = datetime.now(timezone.utc)
|
||||
log_entry.completed_at = utc_now()
|
||||
db.commit()
|
||||
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
|
||||
|
||||
Reference in New Issue
Block a user