feat: add compare, trends routes, embedder service, and phase5 tests
This commit is contained in:
@@ -1,15 +1,19 @@
|
||||
"""FTS5 全文搜索服务 — 关键词 + 标签筛选,命中片段高亮,分页。"""
|
||||
"""搜索服务 — FTS5 关键词搜索 + ChromaDB 语义搜索,命中片段高亮,分页。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session, joinedload
|
||||
|
||||
from app.config import settings
|
||||
from app.models import Paper
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── 输入清洗 ──────────────────────────────────────────────────────────
|
||||
|
||||
# FTS5 查询语法中的特殊字符,用户输入时需要移除
|
||||
@@ -41,8 +45,9 @@ def search_papers(
|
||||
sort: str = "relevance",
|
||||
page: int = 1,
|
||||
page_size: int = 20,
|
||||
mode: str = "keyword",
|
||||
) -> dict:
|
||||
"""FTS5 搜索论文。
|
||||
"""搜索论文,支持 keyword (FTS5) 和 semantic (ChromaDB) 两种模式。
|
||||
|
||||
返回::
|
||||
{
|
||||
@@ -51,8 +56,14 @@ def search_papers(
|
||||
"total": int,
|
||||
"page": int,
|
||||
"total_pages": int,
|
||||
"distances": dict[str, float], # arxiv_id → distance (仅 semantic)
|
||||
}
|
||||
"""
|
||||
# ── semantic 模式 ──
|
||||
if mode == "semantic" and settings.CHROMA_ENABLED and query:
|
||||
return _search_semantic(db, query, tag, sort, page, page_size)
|
||||
|
||||
# ── keyword 模式(默认)──
|
||||
match_expr = _sanitize_query(query) if query else None
|
||||
|
||||
# ── 无关键词 + 无标签 → 空结果 ──
|
||||
@@ -63,6 +74,7 @@ def search_papers(
|
||||
"total": 0,
|
||||
"page": page,
|
||||
"total_pages": 0,
|
||||
"distances": {},
|
||||
}
|
||||
|
||||
# ── 构建条件性 JOIN 和 WHERE 片段 ──
|
||||
@@ -146,6 +158,75 @@ def _search_with_fts(
|
||||
"total": total,
|
||||
"page": page,
|
||||
"total_pages": math.ceil(total / page_size) if total else 0,
|
||||
"distances": {},
|
||||
}
|
||||
|
||||
|
||||
def _search_semantic(
    db: Session,
    query: str,
    tag: str | None,
    sort: str,
    page: int,
    page_size: int,
) -> dict:
    """Run a ChromaDB semantic search, falling back to keyword search.

    Candidates come from ``search_similar``; the matching ``Paper`` rows are
    then loaded from the database, optionally filtered by ``tag``, ordered by
    candidate rank and paginated in memory.

    Args:
        db: SQLAlchemy session used to load the ``Paper`` rows.
        query: Raw user query text handed to the embedder.
        tag: Optional tag; when given, only papers carrying it are kept.
        sort: Sort mode. Only forwarded to the keyword fallback; semantic
            results are always ordered by candidate rank.
        page: 1-based page number.
        page_size: Number of results per page.

    Returns:
        A dict with the same keys as the keyword search result:
        ``results``, ``snippets`` (always empty here), ``total``, ``page``,
        ``total_pages`` and ``distances`` (arxiv_id -> distance).
    """
    try:
        # Imported inside the guarded region so that a failing embedder
        # import also triggers the keyword fallback.
        from app.services.embedder import search_similar

        # Over-fetch so tag filtering still leaves enough rows to show.
        # NOTE(review): the candidate pool is capped at three pages' worth,
        # so ``total`` never exceeds ``page_size * 3`` in semantic mode.
        top_k = page_size * 3
        candidates = search_similar(query, top_k=top_k)
    except Exception:
        logger.exception("Semantic search failed, falling back to keyword")
        candidates = []

    if not candidates:
        # Delegate to the regular keyword path instead of calling the FTS
        # helper directly: that path sanitizes the query and handles the
        # "nothing searchable left" and tag-only cases, so a query made only
        # of FTS5 special characters is never sent to MATCH unsanitized.
        return search_papers(
            db=db,
            query=query,
            tag=tag,
            sort=sort,
            page=page,
            page_size=page_size,
            mode="keyword",
        )

    # Keep the FIRST occurrence of each arxiv_id so a repeated id cannot
    # overwrite its rank or distance with a later entry.
    # Assumes search_similar returns candidates best-first -- TODO confirm.
    rank: dict[str, int] = {}
    distance_map: dict[str, float] = {}
    for position, candidate in enumerate(candidates):
        arxiv_id = candidate["arxiv_id"]
        if arxiv_id in rank:
            continue
        rank[arxiv_id] = position
        distance_map[arxiv_id] = candidate["distance"]

    # Load the full rows (with relationships) for the candidate ids.
    papers_query = (
        db.query(Paper)
        .filter(Paper.arxiv_id.in_(list(rank)))
        .options(
            joinedload(Paper.authors),
            joinedload(Paper.tags),
            joinedload(Paper.summary_status),
            joinedload(Paper.bookmark),
            joinedload(Paper.reading_status),
        )
    )
    if tag:
        papers_query = papers_query.filter(Paper.tags.any(tag=tag))

    papers = papers_query.all()

    # Restore the candidate order; the SQL IN filter does not preserve it.
    # Unknown ids (not expected, given the filter above) sort last.
    papers.sort(key=lambda p: rank.get(p.arxiv_id, len(rank)))

    # In-memory pagination over the filtered candidate set.
    total = len(papers)
    start = (page - 1) * page_size
    page_papers = papers[start:start + page_size]

    return {
        "results": page_papers,
        "snippets": {},
        "total": total,
        "page": page,
        "total_pages": math.ceil(total / page_size) if total else 0,
        "distances": distance_map,
    }
|
||||
|
||||
|
||||
@@ -187,6 +268,7 @@ def _search_tag_only(
|
||||
"total": total,
|
||||
"page": page,
|
||||
"total_pages": math.ceil(total / page_size) if total else 0,
|
||||
"distances": {},
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user