feat: add concurrency safety, caption detection, admin enhancements, and performance improvements
This commit is contained in:
@@ -270,3 +270,67 @@ def _update_upvotes_only(db: Session, papers_raw: list[dict]) -> int:
|
||||
|
||||
db.commit()
|
||||
return updated
|
||||
|
||||
|
||||
async def recrawl_single(db: Session, arxiv_id: str) -> dict:
    """Re-crawl the full metadata of a paper that is already stored.

    The HF Daily listing for ``paper.paper_date`` is fetched again. When the
    paper is found in that listing, every field (title / abstract / authors /
    tags / links / upvotes) is refreshed and its FTS entry is rebuilt. A paper
    that is absent from the listing of its own date cannot be re-crawled.

    Args:
        db: Session used to load the paper and persist the refreshed data.
        arxiv_id: arXiv identifier of the paper to refresh.

    Returns:
        One of:

        * ``{"updated": True, "arxiv_id": ..., "date": ...}`` on success;
        * ``{"updated": False, "reason": "not_found", "arxiv_id": ...}`` when
          no paper with this ``arxiv_id`` is stored;
        * ``{"updated": False, "reason": "not_in_daily", "arxiv_id": ...,
          "date": ...}`` when the fetched listing does not contain the paper.
    """
    paper = db.execute(
        select(Paper).where(Paper.arxiv_id == arxiv_id)
    ).scalar_one_or_none()
    if not paper:
        return {"updated": False, "reason": "not_found", "arxiv_id": arxiv_id}

    target_date = paper.paper_date.isoformat()
    raw_papers = await fetch_daily(target_date)

    # Parse lazily and keep the first matching result, so no entry is parsed
    # twice and parsing stops as soon as the paper is found.
    meta = next(
        (
            parsed
            for parsed in map(_parse_paper, raw_papers)
            if parsed["arxiv_id"] == arxiv_id
        ),
        None,
    )
    if meta is None:
        return {
            "updated": False,
            "reason": "not_in_daily",
            "arxiv_id": arxiv_id,
            "date": target_date,
        }

    # Refresh every scalar field from the freshly parsed metadata.
    paper.title_en = meta["title_en"]
    paper.abstract = meta["abstract"]
    paper.published_at = meta["published_at"]
    paper.hf_url = meta["hf_url"]
    paper.arxiv_url = meta["arxiv_url"]
    paper.pdf_url = meta["pdf_url"]
    paper.upvotes = meta["upvotes"]
    paper.crawled_at = utc_now()

    # NOTE(review): the replacement rows below are added to the session
    # directly instead of being appended to paper.authors / paper.tags, so
    # those in-memory collections presumably stay empty until the instance is
    # refreshed -- confirm reindex_paper_fts does not rely on them.

    # Rebuild authors: drop the old rows, then add the new ones. Duplicate
    # names are skipped; ``position`` keeps the index from the source list.
    paper.authors.clear()
    seen_authors: set[str] = set()
    for idx, name in enumerate(meta["authors"]):
        if name and name not in seen_authors:
            seen_authors.add(name)
            db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx))

    # Rebuild tags the same way (empty tag names are skipped).
    paper.tags.clear()
    for tag_name in meta["tags"]:
        if tag_name:
            db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))

    # Flush first so the FTS rebuild runs against the written rows, then
    # commit everything together.
    db.flush()
    reindex_paper_fts(db, paper)
    db.commit()

    logger.info("Re-crawled paper %s (full metadata refresh)", arxiv_id)
    return {"updated": True, "arxiv_id": arxiv_id, "date": target_date}
|
||||
|
||||
Reference in New Issue
Block a user