fix(storage): stop ChromaDB from crashing when reopening an existing palace

This commit is contained in:
Legion345
2026-04-28 13:08:04 -07:00
parent fdfaf017ab
commit d7f4638157
2 changed files with 39 additions and 9 deletions
+13 -9
View File
@@ -8,6 +8,7 @@ from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import chromadb import chromadb
from chromadb.errors import NotFoundError as _ChromaNotFoundError
from .base import ( from .base import (
BaseBackend, BaseBackend,
@@ -1037,15 +1038,18 @@ class ChromaBackend(BaseBackend):
ef_kwargs = {"embedding_function": ef} if ef is not None else {} ef_kwargs = {"embedding_function": ef} if ef is not None else {}
if create: if create:
collection = client.get_or_create_collection( try:
collection_name, collection = client.get_collection(collection_name, **ef_kwargs)
metadata={ except _ChromaNotFoundError:
"hnsw:space": hnsw_space, collection = client.create_collection(
"hnsw:num_threads": 1, collection_name,
**_HNSW_BLOAT_GUARD, metadata={
}, "hnsw:space": hnsw_space,
**ef_kwargs, "hnsw:num_threads": 1,
) **_HNSW_BLOAT_GUARD,
},
**ef_kwargs,
)
else: else:
collection = client.get_collection(collection_name, **ef_kwargs) collection = client.get_collection(collection_name, **ef_kwargs)
_pin_hnsw_threads(collection) _pin_hnsw_threads(collection)
+26
View File
@@ -372,6 +372,32 @@ def test_chroma_backend_create_collection_sets_hnsw_bloat_guard(tmp_path):
assert col.metadata.get("hnsw:sync_threshold") == 50_000 assert col.metadata.get("hnsw:sync_threshold") == 50_000
def test_get_collection_create_true_is_idempotent(tmp_path):
    """Opening the same collection twice with create=True must not crash.

    Regression guard for issue #1089. ChromaDB 1.5.x's Rust bindings SIGSEGV
    when get_or_create_collection receives metadata that differs from what is
    already stored for the collection. The backend therefore tries
    get_collection first and only falls back to create_collection, so the
    metadata-comparison codepath in chromadb_rust_bindings is never reached
    for a collection that already exists.
    """
    palace_dir = str(tmp_path / "palace")
    chroma = ChromaBackend()
    open_kwargs = {"collection_name": "mempalace_drawers", "create": True}

    # First call creates the collection; the second reopens the existing one.
    chroma.get_collection(palace_dir, **open_kwargs)
    reopened = chroma.get_collection(palace_dir, **open_kwargs)

    assert isinstance(reopened, ChromaCollection)
def test_get_collection_create_true_preserves_existing_metadata(tmp_path):
    """Reopening an existing collection with create=True keeps its metadata."""
    palace_dir = str(tmp_path / "palace")
    chroma = ChromaBackend()
    open_kwargs = {"collection_name": "mempalace_drawers", "create": True}

    # Create once, then reopen; the assertions inspect the reopened handle.
    chroma.get_collection(palace_dir, **open_kwargs)
    reopened = chroma.get_collection(palace_dir, **open_kwargs)

    stored_metadata = reopened._collection.metadata
    assert stored_metadata["hnsw:space"] == "cosine"
    assert stored_metadata.get("hnsw:batch_size") == 50_000
def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path): def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
"""Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair.""" """Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
db_path = tmp_path / "chroma.sqlite3" db_path = tmp_path / "chroma.sqlite3"