Merge pull request #1191 from funguf/fix/hnsw-index-bloat-rebased

fix: prevent HNSW index bloat from resize+persist cycles
This commit is contained in:
Igor Lins e Silva
2026-04-27 03:37:57 -03:00
committed by GitHub
3 changed files with 77 additions and 3 deletions
+35 -2
View File
@@ -28,6 +28,31 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"}) _OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS _SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
#
# With default params (batch_size=100, sync_threshold=1000, initial capacity
# 1000), inserting tens of thousands of drawers triggers ~30 index resizes
# and hundreds of persistDirty() calls. persistDirty uses relative seek
# positioning in link_lists.bin; accumulated seek drift across resize cycles
# causes the OS to extend the sparse file with zero-filled regions, each
# cycle compounding the next. Result: link_lists.bin grows into hundreds of
# GB sparse, after which `status`/`search`/`repair` segfault.
#
# Setting large batch and sync thresholds at collection creation defers
# persistence until a single large batch completes, breaking the resize+
# persist feedback loop. Empirically validated on a 39,792-drawer rebuild
# (palace 376 MB, link_lists.bin 0 bytes, no segfault) in 2026-04.
#
# Note: chromadb 1.5.x exposes a `collection.modify(configuration={"hnsw":
# {"batch_size": ..., "sync_threshold": ...}})` retrofit path for already-
# created collections (`UpdateHNSWConfiguration` in chromadb's API), but
# it is deliberately not used here: once link_lists.bin has bloated, the
# index is already corrupt and the only known recovery is a fresh mine.
# Extra HNSW metadata that callers unpack into a collection's ``metadata``
# dict at creation time. Both knobs intentionally carry the same value.
_HNSW_BLOAT_GUARD = dict.fromkeys(
    ("hnsw:batch_size", "hnsw:sync_threshold"),
    50_000,
)
def _validate_where(where: Optional[dict]) -> None: def _validate_where(where: Optional[dict]) -> None:
"""Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``. """Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``.
@@ -1014,7 +1039,11 @@ class ChromaBackend(BaseBackend):
if create: if create:
collection = client.get_or_create_collection( collection = client.get_or_create_collection(
collection_name, collection_name,
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1}, metadata={
"hnsw:space": hnsw_space,
"hnsw:num_threads": 1,
**_HNSW_BLOAT_GUARD,
},
**ef_kwargs, **ef_kwargs,
) )
else: else:
@@ -1064,7 +1093,11 @@ class ChromaBackend(BaseBackend):
ef_kwargs = {"embedding_function": ef} if ef is not None else {} ef_kwargs = {"embedding_function": ef} if ef is not None else {}
collection = self._client(palace_path).create_collection( collection = self._client(palace_path).create_collection(
collection_name, collection_name,
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1}, metadata={
"hnsw:space": hnsw_space,
"hnsw:num_threads": 1,
**_HNSW_BLOAT_GUARD,
},
**ef_kwargs, **ef_kwargs,
) )
return ChromaCollection(collection) return ChromaCollection(collection)
+6 -1
View File
@@ -60,6 +60,7 @@ from .version import __version__ # noqa: E402
from .backends.chroma import ( # noqa: E402 from .backends.chroma import ( # noqa: E402
ChromaBackend, ChromaBackend,
ChromaCollection, ChromaCollection,
_HNSW_BLOAT_GUARD,
_pin_hnsw_threads, _pin_hnsw_threads,
hnsw_capacity_status, hnsw_capacity_status,
) )
@@ -285,7 +286,11 @@ def _get_collection(create=False):
# so the retrofit runs every time _get_collection opens a cache). # so the retrofit runs every time _get_collection opens a cache).
raw = client.get_or_create_collection( raw = client.get_or_create_collection(
_config.collection_name, _config.collection_name,
metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1}, metadata={
"hnsw:space": "cosine",
"hnsw:num_threads": 1,
**_HNSW_BLOAT_GUARD,
},
) )
_pin_hnsw_threads(raw) _pin_hnsw_threads(raw)
_collection_cache = ChromaCollection(raw) _collection_cache = ChromaCollection(raw)
+36
View File
@@ -336,6 +336,42 @@ def test_chroma_backend_creates_collection_with_cosine_distance(tmp_path):
assert col.metadata.get("hnsw:space") == "cosine" assert col.metadata.get("hnsw:space") == "cosine"
def test_chroma_backend_sets_hnsw_bloat_guard_on_creation(tmp_path):
    """Freshly-created collections must persist the HNSW guard from #344.

    Without ``batch_size`` + ``sync_threshold``, mining ~10K+ drawers
    triggers the resize+persist drift that bloats link_lists.bin into
    hundreds of GB sparse and segfaults `status` / `search` / `repair`.
    The guard is applied at collection-creation time so that every fresh
    palace gets it without a runtime retrofit. Checking that both keys
    reach the persisted metadata also covers the #1161 "config silently
    dropped" concern at CI time.
    """
    store_dir = str(tmp_path / "palace")

    # Create the collection through the backend under test.
    ChromaBackend().get_collection(
        store_dir,
        collection_name="mempalace_drawers",
        create=True,
    )

    # Read the metadata back through a plain chromadb client, bypassing
    # the backend wrapper.
    raw_client = chromadb.PersistentClient(path=store_dir)
    persisted = raw_client.get_collection("mempalace_drawers").metadata
    for guard_key in ("hnsw:batch_size", "hnsw:sync_threshold"):
        assert persisted.get(guard_key) == 50_000
def test_chroma_backend_create_collection_sets_hnsw_bloat_guard(tmp_path):
    """The legacy ``create_collection()`` path must apply the same guard."""
    store_dir = str(tmp_path / "palace")
    ChromaBackend().create_collection(store_dir, "mempalace_drawers")

    # Inspect what was actually persisted via a plain chromadb client.
    raw_client = chromadb.PersistentClient(path=store_dir)
    persisted = raw_client.get_collection("mempalace_drawers").metadata
    for guard_key in ("hnsw:batch_size", "hnsw:sync_threshold"):
        assert persisted.get(guard_key) == 50_000
def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path): def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
"""Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair.""" """Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
db_path = tmp_path / "chroma.sqlite3" db_path = tmp_path / "chroma.sqlite3"