feat: add concurrency safety, caption detection, admin enhancements, and performance improvements

This commit is contained in:
2026-06-14 22:20:02 +08:00
parent 8f13c31991
commit 29fb20828e
23 changed files with 1782 additions and 114 deletions
+83
View File
@@ -2,6 +2,8 @@
from __future__ import annotations
import threading
import time
from unittest.mock import MagicMock, patch
@@ -154,3 +156,84 @@ class TestEmbeddingApi:
)
result = emb._get_embedding("test")
assert result is None
# ═══════════════════════════════════════════════════════════════════════
# 并发安全:init() 双重检查锁 + 集合访问串行化
# ═══════════════════════════════════════════════════════════════════════
class TestEmbedderConcurrency:
    """Thread-safety tests for the embedder's Chroma handle.

    Post-processing calls ``index_paper`` from several workers through
    ``asyncio.to_thread``, so both ``init()`` and collection access must
    tolerate concurrent callers.
    """

    def test_init_serialized_under_concurrency(self, monkeypatch, tmp_path):
        """Concurrent ``init()`` must construct ``PersistentClient`` only once.

        Regression test for the chromadb ``SharedSystemClient`` cache race.
        Ten threads call ``init()`` at the same time while the fake
        ``PersistentClient`` deliberately sleeps to widen the connection
        window. Before the fix, several threads would enter
        ``_create_system_if_not_exists`` together and mutate the class-level
        cache concurrently; after the fix (double-checked locking) only the
        thread that wins the lock creates the client.
        """
        monkeypatch.setattr(settings, "CHROMA_ENABLED", True)
        monkeypatch.setattr(settings, "CHROMA_DIR", str(tmp_path / "chroma"))
        import app.services.embedder as emb

        emb._chroma.reset()
        # try/finally so the module-level singleton is reset even when an
        # assertion fails; otherwise the mocked client leaks into later tests.
        try:
            counter = {"n": 0}
            counter_lock = threading.Lock()

            def fake_persistent_client(path):
                # Count every construction attempt; the lock keeps the
                # counter itself race-free.
                with counter_lock:
                    counter["n"] += 1
                time.sleep(0.05)  # widen the connect window to amplify races
                client = MagicMock()
                # A failing lookup forces the code under test down its
                # "create collection" path.
                client.get_collection.side_effect = Exception("not exist")
                client.create_collection.return_value = MagicMock()
                return client

            with patch(
                "chromadb.PersistentClient", side_effect=fake_persistent_client
            ):
                threads = [
                    threading.Thread(target=emb._chroma.init) for _ in range(10)
                ]
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()

            assert counter["n"] == 1, f"PersistentClient 应只调一次,实际 {counter['n']}"
            assert emb._chroma._client is not None
        finally:
            emb._chroma.reset()

    def test_index_paper_concurrent_no_error(self, monkeypatch, tmp_path):
        """Concurrent ``index_paper`` calls must all succeed.

        Embedding is expected to run in parallel outside the lock while
        collection writes are serialized; every one of the ten workers must
        finish without raising and produce exactly one ``upsert``.
        """
        monkeypatch.setattr(settings, "CHROMA_ENABLED", True)
        monkeypatch.setattr(settings, "CHROMA_DIR", str(tmp_path / "chroma"))
        import app.services.embedder as emb

        emb._chroma.reset()
        # try/finally so the injected mocks never outlive this test, even on
        # assertion failure.
        try:
            # Skip init() and inject a mock client/collection directly.
            emb._chroma._client = MagicMock()
            col = MagicMock()
            col.count.return_value = 0
            emb._chroma._collection = col

            errors: list[BaseException] = []

            def worker(i: int) -> None:
                try:
                    emb.index_paper(
                        f"id-{i}", {"arxiv_id": f"id-{i}", "title_zh": f"标题{i}"}
                    )
                except BaseException as exc:  # noqa: BLE001 - collect every error
                    errors.append(exc)

            with patch.object(emb, "_get_embedding", return_value=[0.1, 0.2, 0.3]):
                threads = [
                    threading.Thread(target=worker, args=(i,)) for i in range(10)
                ]
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()

            assert errors == []
            assert col.upsert.call_count == 10
        finally:
            emb._chroma.reset()