refactor: extract admin business logic to services, introduce job queue, add derived index helpers

- Move DB operations from routes/admin.py to services/admin.py (get_logs_context, query_summary_statuses, retry_failed, delete/reset operations)
- Add services/jobs.py with Job/JobEvent-based async job queue (create_job, run_job, enqueue_job)
- Add services/derived.py with FTS5 reindex and paper index deletion helpers
- Refactor scheduler to use job queue instead of direct pipeline calls
- Add heartbeat_at/expires_at to TaskLock for lock health tracking
- Remove DESIGN_REVIEW.md
- Update tests: remove redundant integration tests, add unit tests for new services
This commit is contained in:
2026-06-13 18:31:43 +08:00
parent 21f16e6756
commit 743d69efd0
20 changed files with 1391 additions and 1063 deletions
+324 -3
View File
@@ -1,17 +1,30 @@
"""管理后台服务 — 统计聚合、系统状态。"""
"""管理后台服务 — 统计聚合、系统状态、管理操作"""
from __future__ import annotations
import json
from datetime import date
from pathlib import Path
from typing import Callable
from sqlalchemy import func, select, text
from sqlalchemy.orm import Session
from app.config import settings
from app.models import CrawlLog, Paper, PaperTag, SummaryState, SummaryStatus, TaskLock
from app.models import (
CrawlLog,
DataDeleteJob,
Job,
JobEvent,
Paper,
PaperTag,
SummaryState,
SummaryStatus,
TaskLock,
)
from app.services.derived import delete_paper_indexes
from app.services.scheduler import get_scheduler
from app.utils import PAPERS_DIR, TMP_DIR
from app.utils import PAPERS_DIR, TMP_DIR, utc_now
# admin_papers 排序映射
SORT_MAP = {
@@ -190,3 +203,311 @@ def query_papers(
statuses[paper_id_to_arxiv.get(pid, "")] = st
return papers, total or 0, statuses
def get_scheduler_history(db: Session, limit: int = 10) -> list[CrawlLog]:
    """Return the most recent scheduler run logs, newest first.

    Args:
        db: Active SQLAlchemy session.
        limit: Maximum number of log rows to return.

    Returns:
        ``CrawlLog`` rows whose ``task`` equals ``"scheduler"``, ordered by
        ``started_at`` descending.
    """
    stmt = select(CrawlLog).where(CrawlLog.task == "scheduler")
    stmt = stmt.order_by(CrawlLog.started_at.desc()).limit(limit)
    return db.execute(stmt).scalars().all()
def get_scheduler_status() -> dict:
    """Describe the scheduler's current state.

    Returns:
        A dict with the keys ``enabled``, ``schedule_time``, ``timezone``,
        ``next_run``, ``upvote_next_run`` and ``upvote_refresh_days``. The two
        ``*next_run`` values are ISO-8601 strings, or ``None`` when the
        scheduler is unavailable or the corresponding job has no next run.
    """
    scheduler = get_scheduler()

    # Only the two well-known job ids are of interest; everything else the
    # scheduler reports is ignored.
    next_runs = {"daily_pipeline": None, "upvote_refresh": None}
    if scheduler:
        for job in scheduler.get_jobs():
            if job.id in next_runs:
                next_runs[job.id] = job.next_run_time

    daily_next = next_runs["daily_pipeline"]
    upvote_next = next_runs["upvote_refresh"]
    schedule_time = f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}"

    status = {
        "enabled": scheduler is not None,
        "schedule_time": schedule_time,
        "timezone": settings.APP_TIMEZONE,
        "next_run": daily_next.isoformat() if daily_next else None,
        "upvote_next_run": upvote_next.isoformat() if upvote_next else None,
        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }
    return status
def run_cleanup_now(db: Session, cleanup_func: Callable[[], dict]) -> dict:
    """Run a temp-directory cleanup synchronously and record it as a CrawlLog.

    A ``CrawlLog`` row with task ``"cleanup"`` is committed in the
    ``"running"`` state before the cleanup starts, then updated to
    ``"success"`` or ``"failed"`` afterwards.

    Args:
        db: Active SQLAlchemy session.
        cleanup_func: Zero-argument callable performing the cleanup. Its
            result is read via ``.get`` for ``scanned``, ``removed`` and
            ``errors``.

    Returns:
        Whatever ``cleanup_func`` returned.

    Raises:
        Exception: Any error raised during the cleanup is re-raised after the
            log row has been marked as failed.
    """
    entry = CrawlLog(task="cleanup", status="running", started_at=utc_now())
    db.add(entry)
    db.commit()

    try:
        outcome = cleanup_func()
        entry.status = "success"
        entry.completed_at = utc_now()
        counters = {
            "scanned": outcome.get("scanned", 0),
            "removed": outcome.get("removed", 0),
        }
        entry.details_json = json.dumps(counters, ensure_ascii=False)
        # Per-item errors do not fail the run; they are joined and truncated
        # to 2000 characters on the log row.
        problems = outcome.get("errors")
        if problems:
            entry.error = "; ".join(problems)[:2000]
        db.commit()
        return outcome
    except Exception as exc:
        entry.status = "failed"
        entry.error = str(exc)[:2000]
        entry.completed_at = utc_now()
        db.commit()
        raise
def get_job_detail(db: Session, job_id: int) -> dict | None:
    """Return a background job and its stage events as a JSON-serializable dict.

    Args:
        db: Active SQLAlchemy session.
        job_id: Primary key of the job to look up.

    Returns:
        ``None`` when the job does not exist; otherwise a dict with the job's
        fields (timestamps as ISO-8601 strings, JSON columns decoded) and an
        ``events`` list ordered by ``created_at`` ascending.
    """
    job = db.get(Job, job_id)
    if not job:
        return None

    events_stmt = (
        select(JobEvent)
        .where(JobEvent.job_id == job_id)
        .order_by(JobEvent.created_at.asc())
    )
    event_rows = db.execute(events_stmt).scalars().all()

    serialized_events = []
    for event in event_rows:
        # A missing/empty event payload is reported as None, not {}.
        event_payload = None
        if event.payload_json:
            event_payload = json.loads(event.payload_json)
        serialized_events.append(
            {
                "stage": event.stage,
                "status": event.status,
                "message": event.message,
                "payload": event_payload,
                "created_at": event.created_at.isoformat(),
            }
        )

    # The job payload defaults to {}, while a missing result stays None.
    result = json.loads(job.result_json) if job.result_json else None
    started = job.started_at.isoformat() if job.started_at else None
    completed = job.completed_at.isoformat() if job.completed_at else None

    return {
        "id": job.id,
        "type": job.type,
        "status": job.status,
        "owner": job.owner,
        "payload": json.loads(job.payload_json or "{}"),
        "result": result,
        "error": job.error,
        "created_at": job.created_at.isoformat(),
        "started_at": started,
        "completed_at": completed,
        "events": serialized_events,
    }
def get_logs_context(db: Session, *, page: int, per_page: int) -> dict:
    """Build the template context for the admin logs page.

    Args:
        db: Active SQLAlchemy session.
        page: 1-based page number, applied to both log lists.
        per_page: Page size, applied to both log lists.

    Returns:
        A dict holding one page of crawl logs and delete jobs (both newest
        first by ``started_at``), the paging parameters, the total number of
        papers, and summary-status counts grouped into done / pending / failed.
    """
    offset = (page - 1) * per_page

    def _page_of(model):
        """Fetch one page of ``model`` rows ordered by started_at descending."""
        stmt = (
            select(model)
            .order_by(model.started_at.desc())
            .limit(per_page)
            .offset(offset)
        )
        return db.execute(stmt).scalars().all()

    def _count_statuses(criterion) -> int:
        """Count SummaryStatus rows matching ``criterion`` (0 when none)."""
        stmt = select(func.count(SummaryStatus.id)).where(criterion)
        return db.scalar(stmt) or 0

    crawl_logs = _page_of(CrawlLog)
    delete_jobs = _page_of(DataDeleteJob)

    # The "total" is the number of papers, not the number of status rows.
    summary_total = db.scalar(select(func.count(Paper.id))) or 0
    summary_done = _count_statuses(SummaryStatus.status == SummaryState.DONE)
    summary_pending = _count_statuses(
        SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.PROCESSING])
    )
    summary_failed = _count_statuses(
        SummaryStatus.status.in_(
            [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
        )
    )

    return {
        "crawl_logs": crawl_logs,
        "delete_jobs": delete_jobs,
        "page": page,
        "per_page": per_page,
        "summary_total": summary_total,
        "summary_done": summary_done,
        "summary_pending": summary_pending,
        "summary_failed": summary_failed,
    }
def query_summary_statuses(
    db: Session,
    *,
    status: str,
    page: int,
    per_page: int,
) -> tuple[list[tuple[Paper, SummaryStatus | None]], int]:
    """Query one page of papers together with their summary status.

    Args:
        db: Active SQLAlchemy session.
        status: ``"all"`` for no filtering, ``"none"`` for papers that have no
            ``SummaryStatus`` row, otherwise a value compared for equality
            against ``SummaryStatus.status``.
        page: 1-based page number.
        per_page: Page size.

    Returns:
        ``(rows, total)`` where ``rows`` are ``(Paper, SummaryStatus | None)``
        pairs for the requested page and ``total`` is the number of rows that
        match the filter across all pages.
    """
    query = (
        select(Paper, SummaryStatus)
        .outerjoin(SummaryStatus, SummaryStatus.paper_id == Paper.id)
        # paper_date alone is not unique, and OFFSET/LIMIT over a non-unique
        # sort key may repeat or drop rows between pages. Paper.id makes the
        # ordering total so pagination is stable.
        .order_by(Paper.paper_date.desc(), Paper.id.desc())
    )
    if status == "none":
        query = query.where(SummaryStatus.paper_id.is_(None))
    elif status != "all":
        query = query.where(SummaryStatus.status == status)

    total = db.scalar(select(func.count()).select_from(query.subquery())) or 0
    results = db.execute(query.offset((page - 1) * per_page).limit(per_page)).all()
    return results, total
def serialize_summary_statuses(
    results: list[tuple[Paper, SummaryStatus | None]],
    *,
    total: int,
    page: int,
    per_page: int,
) -> dict:
    """Build the JSON response body for the summary-status list.

    Args:
        results: ``(paper, status)`` pairs; ``status`` may be ``None`` for a
            paper without a status row.
        total: Total number of matching rows across all pages.
        page: Current page number, echoed back.
        per_page: Page size, echoed back.

    Returns:
        A dict with ``items``, ``total``, ``page`` and ``per_page``.
    """

    def _item(paper, ss) -> dict:
        """Serialize one pair, falling back to defaults when there is no status."""
        if ss:
            status_fields = {
                "summary_status": ss.status,
                "retry_count": ss.retry_count,
                "error_type": ss.error_type,
                "error": ss.error,
            }
        else:
            status_fields = {
                "summary_status": "none",
                "retry_count": 0,
                "error_type": None,
                "error": None,
            }
        return {
            "arxiv_id": paper.arxiv_id,
            # Prefer the Chinese title, fall back to the English one.
            "title": paper.title_zh or paper.title_en,
            "paper_date": str(paper.paper_date),
            **status_fields,
        }

    items = [_item(paper, ss) for paper, ss in results]
    return {"items": items, "total": total, "page": page, "per_page": per_page}
def retry_failed_summaries(db: Session) -> int:
    """Reset failed and permanently-failed summary tasks back to pending.

    The status is set to ``PENDING`` and ``error`` / ``error_type`` are
    cleared in a single UPDATE statement.

    Args:
        db: Active SQLAlchemy session; the change is committed before
            returning.

    Returns:
        The number of status rows that were reset, as reported by the UPDATE
        itself. This avoids loading every failed id just to count them and
        keeps the returned number consistent with what was actually changed.
    """
    retryable = [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
    result = db.execute(
        SummaryStatus.__table__.update()
        .where(SummaryStatus.status.in_(retryable))
        .values(status=SummaryState.PENDING, error=None, error_type=None)
    )
    # Read the count before committing so it does not depend on the result
    # object's state after the transaction ends.
    reset_count = result.rowcount
    db.commit()
    return reset_count
def delete_paper_by_arxiv(db: Session, arxiv_id: str) -> bool:
    """Delete a single paper and its derived indexes.

    Args:
        db: Active SQLAlchemy session.
        arxiv_id: arXiv identifier of the paper to delete.

    Returns:
        ``True`` when a paper was found and deleted, ``False`` otherwise.
    """
    target = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id))
    if target:
        # Remember the primary key before the row is gone; the index cleanup
        # needs it.
        primary_key = target.id
        db.delete(target)
        db.commit()

        # The paper deletion is committed first, the index cleanup second.
        delete_paper_indexes(db, paper_id=primary_key, arxiv_id=arxiv_id)
        db.commit()
        return True
    return False
def delete_papers_by_arxiv_ids(db: Session, arxiv_ids: list[str]) -> int:
    """Delete several papers and their derived indexes.

    Args:
        db: Active SQLAlchemy session.
        arxiv_ids: arXiv identifiers of the papers to delete. Identifiers
            without a matching paper are ignored.

    Returns:
        The number of papers that were deleted.
    """
    lookup = select(Paper).where(Paper.arxiv_id.in_(arxiv_ids))
    doomed = db.execute(lookup).scalars().all()

    # Capture the keys up front; they are needed for index cleanup after the
    # rows themselves have been removed.
    removed_keys = [(row.id, row.arxiv_id) for row in doomed]

    for row in doomed:
        db.delete(row)
    db.commit()

    for row_id, row_arxiv_id in removed_keys:
        delete_paper_indexes(db, paper_id=row_id, arxiv_id=row_arxiv_id)
    db.commit()

    return len(removed_keys)
def reset_summaries_pending(db: Session, arxiv_ids: list[str]) -> int:
    """Reset the summary status of the given papers to pending.

    Existing status rows are reset in place; papers without a status row get
    a new ``PENDING`` one.

    Args:
        db: Active SQLAlchemy session; changes are committed before returning.
        arxiv_ids: arXiv identifiers of the papers to reset.

    Returns:
        The number of matching papers (0 when none of the ids exist).
    """
    id_stmt = select(Paper.id).where(Paper.arxiv_id.in_(arxiv_ids))
    paper_ids = db.execute(id_stmt).scalars().all()
    if not paper_ids:
        return 0

    status_stmt = select(SummaryStatus).where(SummaryStatus.paper_id.in_(paper_ids))
    current_rows = db.execute(status_stmt).scalars().all()
    covered_ids = {row.paper_id for row in current_rows}

    # Field values that put a status row back into its pristine pending state.
    pristine = {
        "status": SummaryState.PENDING,
        "quality": None,
        "error": None,
        "error_type": None,
        "raw_output_saved": False,
        "started_at": None,
        "completed_at": None,
    }
    for row in current_rows:
        for field_name, field_value in pristine.items():
            setattr(row, field_name, field_value)

    missing_ids = [pid for pid in paper_ids if pid not in covered_ids]
    for pid in missing_ids:
        db.add(SummaryStatus(paper_id=pid, status=SummaryState.PENDING))

    db.commit()
    return len(paper_ids)
+3 -16
View File
@@ -4,7 +4,7 @@ import logging
from datetime import date as date_type, datetime, timezone
import httpx
from sqlalchemy import select, text
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import settings
@@ -16,6 +16,7 @@ from app.models import (
SummaryState,
SummaryStatus,
)
from app.services.derived import reindex_paper_fts
from app.utils import make_http_client, recent_date_strs, utc_now
logger = logging.getLogger(__name__)
@@ -143,21 +144,7 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
authors_text = ", ".join(meta["authors"])
tags_text = ", ".join(meta["tags"])
db.execute(
text(
"INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
"VALUES (:id, :title, :abstract, :authors, :tags)"
),
{
"id": paper.id,
"title": meta["title_en"],
"abstract": meta["abstract"] or "",
"authors": authors_text,
"tags": tags_text,
},
)
reindex_paper_fts(db, paper)
new_papers.append(paper)
logger.debug("Inserted new paper: %s", arxiv_id)
+140
View File
@@ -0,0 +1,140 @@
"""派生数据维护 — FTS5 / ChromaDB 等可重建索引。"""
from __future__ import annotations
import logging
from sqlalchemy import select, text
from sqlalchemy.orm import Session
from app.models import Paper
logger = logging.getLogger(__name__)
def _summary_text(paper: Paper) -> str:
summary = paper.summary
if not summary:
return ""
parts = [
summary.one_line,
summary.motivation_problem,
summary.motivation_goal,
summary.method_overview,
summary.method_key_idea,
summary.results_main_json,
]
return " ".join(p for p in parts if p)
def delete_fts_paper(db: Session, paper_id: int) -> None:
    """Delete one paper's row from the FTS5 table.

    The ``papers_fts`` row is addressed by ``rowid``, which is the paper's id.
    The statement is executed but not committed here.

    Args:
        db: Active SQLAlchemy session.
        paper_id: Id of the paper whose FTS row should be removed.
    """
    statement = text("DELETE FROM papers_fts WHERE rowid = :paper_id")
    db.execute(statement, {"paper_id": paper_id})
def delete_paper_indexes(db: Session, *, paper_id: int, arxiv_id: str) -> None:
    """Remove every derived index entry for one paper.

    Cleanup is best-effort: a failing step is logged as a warning and does
    not prevent the remaining steps from running or raise to the caller.

    Args:
        db: Active SQLAlchemy session used for the FTS cleanup.
        paper_id: Id of the paper, used for the FTS row.
        arxiv_id: arXiv identifier, used for the ChromaDB entry and in logs.
    """

    def _drop_fts() -> None:
        delete_fts_paper(db, paper_id)

    def _drop_chroma() -> None:
        # Imported lazily so that an import failure is handled like any other
        # cleanup failure.
        from app.services.embedder import delete_paper

        delete_paper(arxiv_id)

    steps = (
        ("Failed to clean FTS index for %s", _drop_fts),
        ("Failed to clean ChromaDB index for %s", _drop_chroma),
    )
    for failure_message, step in steps:
        try:
            step()
        except Exception:
            logger.warning(failure_message, arxiv_id, exc_info=True)
def reindex_paper_fts(db: Session, paper: Paper) -> None:
    """Rebuild one paper's FTS5 row from the paper's current database state.

    Any existing row for the paper is deleted first and a fresh one is
    inserted. Nothing is committed here.

    Args:
        db: Active SQLAlchemy session.
        paper: Paper to index; its authors, tags and summary are read.
    """
    # Authors are listed by position; a missing position sorts as 0.
    ordered_authors = sorted(paper.authors, key=lambda author: author.position or 0)
    authors_joined = ", ".join(author.name for author in ordered_authors)
    tags_joined = ", ".join(tag_row.tag for tag_row in paper.tags)

    delete_fts_paper(db, paper.id)

    insert_statement = text(
        """
            INSERT INTO papers_fts(
                rowid, title_en, title_zh, abstract, authors, tags, summary_text
            )
            VALUES (
                :id, :title_en, :title_zh, :abstract, :authors, :tags, :summary_text
            )
            """
    )
    row_values = {
        "id": paper.id,
        "title_en": paper.title_en or "",
        "title_zh": paper.title_zh or "",
        "abstract": paper.abstract or "",
        "authors": authors_joined,
        "tags": tags_joined,
        "summary_text": _summary_text(paper),
    }
    db.execute(insert_statement, row_values)
def reindex_fts(db: Session, paper_ids: list[int] | None = None) -> dict:
    """Rebuild the FTS5 index for all papers or for a given subset.

    Args:
        db: Active SQLAlchemy session; the work is committed before returning.
        paper_ids: ``None`` rebuilds the whole index (the FTS table is emptied
            first). A list restricts the rebuild to those paper ids; an empty
            list therefore indexes nothing.

    Returns:
        ``{"status": "success", "indexed": <number of papers reindexed>}``.
    """
    # Decide once whether this is a full rebuild. Testing truthiness for the
    # filter but ``is None`` for the wipe made an empty list select every
    # paper without wiping the table first.
    full_rebuild = paper_ids is None

    query = select(Paper)
    if not full_rebuild:
        query = query.where(Paper.id.in_(paper_ids))
    papers = db.execute(query).scalars().all()

    if full_rebuild:
        db.execute(text("DELETE FROM papers_fts"))

    for paper in papers:
        reindex_paper_fts(db, paper)
    db.commit()

    count = len(papers)
    logger.info("FTS reindexed: %d papers", count)
    return {"status": "success", "indexed": count}
def reindex_chroma(db: Session) -> dict:
    """Rebuild the ChromaDB semantic index from the papers in the database.

    Only papers that have a summary are indexed. A failure on one paper is
    logged and collected; the remaining papers are still processed.

    Args:
        db: Active SQLAlchemy session.

    Returns:
        A dict with ``status`` (``"success"`` when there were no errors,
        otherwise ``"partial"``), ``indexed`` (number of papers indexed) and
        ``errors`` (list of ``"<arxiv_id>: <error>"`` strings, or ``None``).
    """
    from app.services.embedder import index_paper

    candidates = db.execute(select(Paper).where(Paper.summary.has())).scalars().all()

    failures: list[str] = []
    indexed = 0
    for paper in candidates:
        try:
            summary = paper.summary
            document = {
                "arxiv_id": paper.arxiv_id,
                "title_zh": paper.title_zh or "",
                "title_en": paper.title_en or "",
                "tags": " ".join(tag_row.tag for tag_row in paper.tags),
                "one_line": summary.one_line if summary else "",
                "motivation_problem": summary.motivation_problem if summary else "",
                "method_key_idea": summary.method_key_idea if summary else "",
                "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
            }
            index_paper(paper.arxiv_id, document)
        except Exception as exc:
            failures.append(f"{paper.arxiv_id}: {exc}")
            logger.warning(
                "Failed to reindex ChromaDB for %s",
                paper.arxiv_id,
                exc_info=True,
            )
        else:
            indexed += 1

    status = "partial" if failures else "success"
    return {
        "status": status,
        "indexed": indexed,
        "errors": failures or None,
    }
+244
View File
@@ -0,0 +1,244 @@
"""统一后台任务系统 — 创建、运行、事件记录、失败恢复。"""
from __future__ import annotations
import json
import logging
from datetime import date, timedelta
from typing import Any
from fastapi import BackgroundTasks
from sqlalchemy import or_, select
from sqlalchemy.orm import Session
from app.config import settings
from app.database import SessionLocal
from app.models import Job, JobEvent, JobEventStatus, JobStatus, TaskLock
from app.utils import truncate_error, utc_now
logger = logging.getLogger(__name__)
STALE_JOB_AFTER = timedelta(hours=6)
def _dumps(value: Any) -> str:
return json.dumps(value, ensure_ascii=False, default=str)
def _loads(value: str | None) -> dict:
if not value:
return {}
try:
data = json.loads(value)
return data if isinstance(data, dict) else {}
except json.JSONDecodeError:
return {}
def create_job(
    db: Session,
    job_type: str,
    *,
    owner: str,
    payload: dict | None = None,
) -> Job:
    """Create and persist the main record of a background job.

    The job is committed in the ``QUEUED`` state and a ``"created"`` event is
    recorded for it.

    Args:
        db: Active SQLAlchemy session.
        job_type: Job type identifier stored on the record.
        owner: Owner label stored on the record.
        payload: Optional job arguments; stored as JSON (``{}`` when omitted).

    Returns:
        The persisted, refreshed ``Job``.
    """
    job_payload = payload or {}

    record = Job(
        type=job_type,
        status=JobStatus.QUEUED,
        owner=owner,
        payload_json=_dumps(job_payload),
        created_at=utc_now(),
    )
    db.add(record)
    db.commit()
    # Reload the row so database-generated values are available to the caller.
    db.refresh(record)

    add_job_event(
        db,
        record,
        stage="created",
        status=JobEventStatus.INFO,
        message=f"Job queued: {job_type}",
        payload=job_payload,
    )
    return record
def add_job_event(
    db: Session,
    job: Job,
    *,
    stage: str,
    status: str,
    message: str | None = None,
    payload: dict | None = None,
) -> None:
    """Append one stage event to a job and refresh the job's heartbeat.

    Args:
        db: Active SQLAlchemy session; the event is committed immediately.
        job: Job the event belongs to.
        stage: Stage name stored on the event.
        status: Event status; stored as its ``str()`` form.
        message: Optional human-readable message.
        payload: Optional event data, stored as JSON. ``None`` is stored as
            NULL rather than as an empty object.
    """
    encoded_payload = None
    if payload is not None:
        encoded_payload = _dumps(payload)

    event = JobEvent(
        job_id=job.id,
        stage=stage,
        status=str(status),
        message=message,
        payload_json=encoded_payload,
        created_at=utc_now(),
    )
    db.add(event)

    # Every recorded event also counts as a sign of life for the job.
    job.heartbeat_at = utc_now()
    db.commit()
def enqueue_job(background_tasks: BackgroundTasks, job_id: int) -> None:
    """Schedule an already-created job on FastAPI's ``BackgroundTasks``.

    Args:
        background_tasks: The request's background task collection.
        job_id: Id of the job to run via ``run_job_by_id``.
    """
    runner = run_job_by_id
    background_tasks.add_task(runner, job_id)
async def run_job_by_id(job_id: int) -> None:
    """Run an existing job using a dedicated database session.

    A new session is opened for the run and always closed afterwards, whether
    or not the run raises.

    Args:
        job_id: Id of the job to run.
    """
    session = SessionLocal()
    try:
        await run_job(session, job_id)
    finally:
        session.close()
async def run_job(db: Session, job_id: int) -> dict:
    """Run a job and write its status, result and error back to the database.

    The job is marked ``RUNNING`` and a ``"run"`` / ``STARTED`` event is
    recorded before the handler is dispatched. On completion the job becomes
    ``SUCCESS`` or ``FAILED`` and a matching event is recorded.

    Args:
        db: Active SQLAlchemy session.
        job_id: Id of the job to run.

    Returns:
        On success, the handler's result if it is a dict, otherwise
        ``{"status": "success", "result": <result>}``. On handler failure,
        ``{"status": "failed", "error": <truncated error>}``; the handler's
        exception is not re-raised.

    Raises:
        ValueError: The job does not exist.
        RuntimeError: The job is already in the ``RUNNING`` state.
    """
    job = db.get(Job, job_id)
    if not job:
        raise ValueError(f"Job not found: {job_id}")
    if job.status == JobStatus.RUNNING:
        raise RuntimeError(f"Job already running: {job_id}")

    payload = _loads(job.payload_json)
    job.status = JobStatus.RUNNING
    job.started_at = utc_now()
    job.heartbeat_at = job.started_at
    db.commit()
    add_job_event(db, job, stage="run", status=JobEventStatus.STARTED)

    try:
        result = await _dispatch_job(db, job, payload)
    except Exception as exc:
        # The handler shares this session. Roll back first so that (a) a
        # session left in a failed state can be used again to record the
        # failure, and (b) the handler's uncommitted partial changes are not
        # committed together with the FAILED status below.
        db.rollback()
        logger.exception("Job failed: id=%s type=%s", job.id, job.type)
        error = truncate_error(exc, limit=4000)
        job.status = JobStatus.FAILED
        job.error = error
        job.completed_at = utc_now()
        db.commit()
        add_job_event(db, job, stage="run", status=JobEventStatus.FAILED, message=error)
        return {"status": "failed", "error": error}

    job.status = JobStatus.SUCCESS
    job.result_json = _dumps(result)
    job.completed_at = utc_now()
    job.error = None
    db.commit()
    add_job_event(
        db,
        job,
        stage="run",
        status=JobEventStatus.SUCCESS,
        # Event payloads are dicts, so a non-dict result is wrapped.
        payload=result if isinstance(result, dict) else {"result": result},
    )
    return result if isinstance(result, dict) else {"status": "success", "result": result}
async def _dispatch_job(db: Session, job: Job, payload: dict) -> dict:
    """Invoke the service function that implements ``job.type``.

    Supported types: ``crawl_daily``, ``pipeline_daily``, ``summarize_batch``,
    ``summarize_one``, ``refresh_upvotes``, ``delete_range``, ``cleanup_tmp``,
    ``reindex_fts`` and ``reindex_chroma``.

    Args:
        db: Active SQLAlchemy session passed on to the handler.
        job: The job being run; its ``type``, ``owner`` and ``id`` are read.
        payload: Decoded job payload holding the handler's arguments.

    Returns:
        Whatever the selected handler returns.

    Raises:
        ValueError: ``job.type`` is not one of the supported types.
        KeyError: A payload key the handler requires is missing.
    """
    # Imported here rather than at module level; every handler module is
    # loaded on each dispatch regardless of the job type.
    from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
    from app.services.crawler import refresh_upvotes
    from app.services.derived import reindex_chroma, reindex_fts
    from app.services.pipeline import run_crawl, run_pipeline
    from app.services.summarizer import summarize_batch, summarize_single

    if job.type == "crawl_daily":
        return await run_crawl(
            db,
            payload["target_date"],
            owner=job.owner or f"job:{job.id}",
            top_n=payload.get("top_n"),
        )
    if job.type == "pipeline_daily":
        return await run_pipeline(
            db,
            payload["target_date"],
            owner=job.owner or f"job:{job.id}",
        )
    if job.type == "summarize_batch":
        return await summarize_batch(
            db,
            pdf_mode=payload.get("pdf_mode", settings.SUMMARY_PDF_MODE),
        )
    if job.type == "summarize_one":
        return await summarize_single(
            db,
            payload["arxiv_id"],
            force=payload.get("force", True),
            pdf_mode=payload.get("pdf_mode", settings.SUMMARY_PDF_MODE),
        )
    if job.type == "refresh_upvotes":
        # Only forward ``days`` when the payload actually provides it, so an
        # empty payload leaves refresh_upvotes on its own default instead of
        # receiving an explicit None.
        days = payload.get("days")
        if days is None:
            return await refresh_upvotes(db)
        return await refresh_upvotes(db, days=days)
    if job.type == "delete_range":
        return await delete_papers_by_date_range(
            db,
            date.fromisoformat(payload["date_start"]),
            date.fromisoformat(payload["date_end"]),
            include_notes=payload.get("include_notes", True),
        )
    if job.type == "cleanup_tmp":
        return cleanup_tmp()
    if job.type == "reindex_fts":
        return reindex_fts(db)
    if job.type == "reindex_chroma":
        return reindex_chroma(db)
    raise ValueError(f"Unsupported job type: {job.type}")
def recover_stale_jobs(db: Session) -> int:
    """Mark overdue running jobs and locks as stale so they cannot hang forever.

    A running job is overdue when it has no heartbeat or its heartbeat is
    older than ``STALE_JOB_AFTER``. A running lock is overdue when it was
    acquired longer ago than ``STALE_JOB_AFTER``.

    Args:
        db: Active SQLAlchemy session; all changes are committed together.

    Returns:
        The combined number of jobs and locks that were marked stale.
    """
    now = utc_now()
    cutoff = now - STALE_JOB_AFTER

    overdue_jobs_stmt = select(Job).where(
        Job.status == JobStatus.RUNNING,
        or_(Job.heartbeat_at.is_(None), Job.heartbeat_at < cutoff),
    )
    stale_jobs = db.execute(overdue_jobs_stmt).scalars().all()

    reason = "Marked stale after process restart or missed heartbeat"
    for job in stale_jobs:
        job.status = JobStatus.STALE
        job.error = reason
        job.completed_at = now
        recovery_event = JobEvent(
            job_id=job.id,
            stage="recovery",
            status=JobEventStatus.FAILED,
            message=reason,
            created_at=now,
        )
        db.add(recovery_event)

    overdue_locks_stmt = select(TaskLock).where(
        TaskLock.status == "running",
        TaskLock.acquired_at < cutoff,
    )
    stale_locks = db.execute(overdue_locks_stmt).scalars().all()
    for lock in stale_locks:
        lock.status = "stale"
        lock.released_at = now

    db.commit()

    recovered = len(stale_jobs) + len(stale_locks)
    if recovered:
        logger.warning("Recovered stale runtime records: %d", recovered)
    return recovered
+12 -2
View File
@@ -7,6 +7,7 @@ from __future__ import annotations
import logging
from datetime import date as date_type
from datetime import timedelta
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
@@ -32,6 +33,8 @@ def acquire_lock(db: Session, task: str, lock_key: str, owner: str) -> TaskLock:
status="running",
owner=owner,
acquired_at=utc_now(),
heartbeat_at=utc_now(),
expires_at=utc_now() + timedelta(hours=6),
)
try:
db.add(lock)
@@ -42,7 +45,12 @@ def acquire_lock(db: Session, task: str, lock_key: str, owner: str) -> TaskLock:
return lock
async def run_crawl(db: Session, target_date: str, owner: str = "admin_crawl") -> dict:
async def run_crawl(
db: Session,
target_date: str,
owner: str = "admin_crawl",
top_n: int | None = None,
) -> dict:
"""执行单次抓取(带防重入锁)。
Args:
@@ -55,7 +63,7 @@ async def run_crawl(db: Session, target_date: str, owner: str = "admin_crawl") -
"""
lock = acquire_lock(db, "crawl", target_date, owner)
try:
return await crawl_daily(db, target_date)
return await crawl_daily(db, target_date, top_n=top_n)
finally:
release_lock(db, lock)
@@ -83,6 +91,8 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
status="running",
owner=owner,
acquired_at=now,
heartbeat_at=now,
expires_at=now + timedelta(hours=6),
)
try:
db.add(lock)
+10 -4
View File
@@ -11,8 +11,7 @@ from zoneinfo import ZoneInfo
from app.config import settings
from app.database import SessionLocal
from app.services.pipeline import run_pipeline
from app.services.crawler import refresh_upvotes
from app.services.jobs import create_job, run_job
from app.utils import today_str
logger = logging.getLogger(__name__)
@@ -112,7 +111,13 @@ async def _daily_pipeline() -> None:
db: Session = SessionLocal()
try:
await run_pipeline(db, today, owner="daily_pipeline")
job = create_job(
db,
"pipeline_daily",
owner="daily_pipeline",
payload={"target_date": today},
)
await run_job(db, job.id)
except RuntimeError:
logger.warning("Daily pipeline already running for %s, skipping", today)
except Exception:
@@ -125,7 +130,8 @@ async def _upvote_refresh() -> None:
"""刷新最近 N 天论文的 upvotes。"""
db: Session = SessionLocal()
try:
result = await refresh_upvotes(db)
job = create_job(db, "refresh_upvotes", owner="upvote_refresh", payload={})
result = await run_job(db, job.id)
logger.info(
"Upvote refresh completed: status=%s updated=%d",
result.get("status"),
+4 -15
View File
@@ -3,8 +3,6 @@
from __future__ import annotations
import logging
from sqlalchemy import text
from sqlalchemy.orm import Session
from app.models import (
@@ -13,6 +11,7 @@ from app.models import (
PaperTag,
SummaryState,
)
from app.services.derived import reindex_paper_fts
from app.services.pdf_downloader import paper_dir
from app.services.schemas import (
SummarySchema,
@@ -75,19 +74,9 @@ def _update_summary_in_db(
db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="ai"))
existing_tag_names.add(tag_name)
# 4. FTS5 更新
summary_text = _build_fts_summary_text(schema)
db.execute(
text(
"UPDATE papers_fts SET title_zh=:title_zh, summary_text=:summary_text "
"WHERE rowid=:paper_id"
),
{
"title_zh": schema.title_zh,
"summary_text": summary_text,
"paper_id": paper.id,
},
)
# 4. FTS5 派生索引
db.flush()
reindex_paper_fts(db, paper)
db.commit()
logger.info("DB updated: paper=%s quality=%s", paper.arxiv_id, quality)