feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

This commit is contained in:
2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
+64
View File
@@ -270,3 +270,67 @@ def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
db.commit()
return updated
async def recrawl_single(db: Session, arxiv_id: str) -> dict:
    """Re-fetch the full metadata of a paper that is already stored.

    The HF Daily list for ``paper.paper_date`` is fetched again. When the
    paper is found in that list, every crawled field (title, abstract,
    authors, tags, links, upvotes) is overwritten and the FTS entry is
    rebuilt. A paper that is not in the list for its recorded date cannot
    be re-crawled.

    Args:
        db: Database session; it is committed when the refresh succeeds.
        arxiv_id: arXiv identifier of the paper to refresh.

    Returns:
        ``{"updated": True, "arxiv_id": ..., "date": ...}`` on success.
        Otherwise ``"updated"`` is ``False`` and ``"reason"`` is either
        ``"not_found"`` (no paper with this id in the database) or
        ``"not_in_daily"`` (the paper is absent from the daily list; the
        queried ``"date"`` is included).
    """
    paper = db.execute(
        select(Paper).where(Paper.arxiv_id == arxiv_id)
    ).scalar_one_or_none()
    if not paper:
        return {"updated": False, "reason": "not_found", "arxiv_id": arxiv_id}

    target_date = paper.paper_date.isoformat()
    raw_papers = await fetch_daily(target_date)

    # Parse each entry once (lazily, stopping at the first match) and keep
    # the parsed form of the match instead of parsing it a second time.
    meta = next(
        (
            parsed
            for parsed in map(_parse_paper, raw_papers)
            if parsed["arxiv_id"] == arxiv_id
        ),
        None,
    )
    if meta is None:
        return {
            "updated": False,
            "reason": "not_in_daily",
            "arxiv_id": arxiv_id,
            "date": target_date,
        }

    # Overwrite every scalar field with the freshly crawled values.
    paper.title_en = meta["title_en"]
    paper.abstract = meta["abstract"]
    paper.published_at = meta["published_at"]
    paper.hf_url = meta["hf_url"]
    paper.arxiv_url = meta["arxiv_url"]
    paper.pdf_url = meta["pdf_url"]
    paper.upvotes = meta["upvotes"]
    paper.crawled_at = utc_now()

    # Rebuild authors: drop the old rows, then add the new ones, skipping
    # empty and repeated names. ``position`` keeps the index from the
    # crawled list, so skipped entries leave gaps.
    paper.authors.clear()
    seen_authors: set[str] = set()
    for idx, name in enumerate(meta["authors"]):
        if name and name not in seen_authors:
            seen_authors.add(name)
            db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx))

    # Rebuild tags the same way; repeated tag names are skipped so a paper
    # never ends up with the same tag stored twice.
    paper.tags.clear()
    seen_tags: set[str] = set()
    for tag_name in meta["tags"]:
        if tag_name and tag_name not in seen_tags:
            seen_tags.add(tag_name)
            db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))

    db.flush()
    # The replacement rows were attached through ``paper_id`` rather than
    # appended to the relationships, so the in-memory ``paper.authors`` and
    # ``paper.tags`` collections are still the emptied lists. Expire them so
    # any later read reloads the rows that were just flushed.
    # NOTE(review): reindex_paper_fts presumably reads these relationships;
    # confirm against its implementation.
    db.expire(paper, ["authors", "tags"])
    reindex_paper_fts(db, paper)
    db.commit()

    logger.info("Re-crawled paper %s (full metadata refresh)", arxiv_id)
    return {"updated": True, "arxiv_id": arxiv_id, "date": target_date}