fix: prevent HNSW index bloat via batch_size + sync_threshold metadata

Sets `hnsw:batch_size` and `hnsw:sync_threshold` to 50_000 at every
collection-creation call site:

* `mempalace/backends/chroma.py` — `get_collection(create=True)` and the
  legacy `create_collection()` path. Preserves existing `hnsw:space`,
  `hnsw:num_threads=1` (race fix from #976), and `**ef_kwargs`
  (embedding-function plumbing from a4868a3).
* `mempalace/mcp_server.py` — the direct `client.get_or_create_collection`
  path used when a palace is first opened by the MCP server. Without this
  third site, MCP-bootstrapped palaces would skip the guard and could
  still trigger the original bloat.

Without these defaults, mining ~10K+ drawers triggers ~30 HNSW index
resizes and hundreds of persistDirty() calls. persistDirty uses relative
seek positioning in link_lists.bin; seek drift accumulates across resize
cycles and causes the OS to extend the sparse file with zero-filled
regions, with each cycle compounding the next. As a result,
link_lists.bin grows to hundreds of GB (sparse), after which `status`,
`search`, and `repair` all segfault and the palace is unrecoverable.

Empirical validation: with this fix applied, a palace was rebuilt from
scratch from 39,792 drawers across 5 wings. The final palace is 376 MB,
link_lists.bin stays at 0 bytes in both Chroma collection dirs, and
`status` and `search` both return cleanly. The same workload without the
fix bloated the palace to 565 GB sparse (30 GB on disk) and segfaulted
at ~15K drawers.

Migration note: chromadb 1.5.x exposes a
`collection.modify(configuration={"hnsw": {...}})` retrofit path for
already-created collections (`UpdateHNSWConfiguration`), but this PR
doesn't pursue it — by the time link_lists.bin has bloated, the index
is already corrupt and the only known recovery is a fresh mine.

Tests assert both keys land on the persisted collection metadata in
both `ChromaBackend` code paths, which also covers the #1161 "config
silently dropped" concern at CI time. A separate smoke test was used
to verify the metadata round-trips through `chromadb.PersistentClient`
reopen on chromadb 1.5.8.

Closes #344
Supersedes #346

Co-authored-by: robot-rocket-science <robot-rocket-science@users.noreply.github.com>
This commit is contained in:
Fergus Ching
2026-04-25 12:57:32 +00:00
committed by igorls
parent bc5d3fa911
commit 88a53b2ffa
3 changed files with 77 additions and 3 deletions
+35 -2
View File
@@ -28,6 +28,31 @@ _REQUIRED_OPERATORS = frozenset({"$eq", "$ne", "$in", "$nin", "$and", "$or", "$c
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
#
# With default params (batch_size=100, sync_threshold=1000, initial capacity
# 1000), inserting tens of thousands of drawers triggers ~30 index resizes
# and hundreds of persistDirty() calls. persistDirty uses relative seek
# positioning in link_lists.bin; accumulated seek drift across resize cycles
# causes the OS to extend the sparse file with zero-filled regions, each
# cycle compounding the next. Result: link_lists.bin grows into hundreds of
# GB sparse, after which `status`/`search`/`repair` segfault.
#
# Setting large batch and sync thresholds at collection creation defers
# persistence until a single large batch completes, breaking the resize+
# persist feedback loop. Empirically validated on a 39,792-drawer rebuild
# (palace 376 MB, link_lists.bin 0 bytes, no segfault) in 2026-04.
#
# Note: chromadb 1.5.x exposes a `collection.modify(configuration={"hnsw":
# {"batch_size": ..., "sync_threshold": ...}})` retrofit path for already-
# created collections (`UpdateHNSWConfiguration` in chromadb's API), but
# this PR doesn't pursue that — once link_lists.bin has bloated, the index
# is already corrupt and the only known recovery is a fresh mine.
_HNSW_BLOAT_GUARD = {
"hnsw:batch_size": 50_000,
"hnsw:sync_threshold": 50_000,
}
def _validate_where(where: Optional[dict]) -> None:
"""Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``.
@@ -992,7 +1017,11 @@ class ChromaBackend(BaseBackend):
if create:
collection = client.get_or_create_collection(
collection_name,
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
metadata={
"hnsw:space": hnsw_space,
"hnsw:num_threads": 1,
**_HNSW_BLOAT_GUARD,
},
**ef_kwargs,
)
else:
@@ -1042,7 +1071,11 @@ class ChromaBackend(BaseBackend):
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
collection = self._client(palace_path).create_collection(
collection_name,
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
metadata={
"hnsw:space": hnsw_space,
"hnsw:num_threads": 1,
**_HNSW_BLOAT_GUARD,
},
**ef_kwargs,
)
return ChromaCollection(collection)
+6 -1
View File
@@ -60,6 +60,7 @@ from .version import __version__ # noqa: E402
from .backends.chroma import ( # noqa: E402
ChromaBackend,
ChromaCollection,
_HNSW_BLOAT_GUARD,
_pin_hnsw_threads,
hnsw_capacity_status,
)
@@ -285,7 +286,11 @@ def _get_collection(create=False):
# so the retrofit runs every time _get_collection opens a cache).
raw = client.get_or_create_collection(
_config.collection_name,
metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1},
metadata={
"hnsw:space": "cosine",
"hnsw:num_threads": 1,
**_HNSW_BLOAT_GUARD,
},
)
_pin_hnsw_threads(raw)
_collection_cache = ChromaCollection(raw)
+36
View File
@@ -336,6 +336,42 @@ def test_chroma_backend_creates_collection_with_cosine_distance(tmp_path):
assert col.metadata.get("hnsw:space") == "cosine"
def test_chroma_backend_sets_hnsw_bloat_guard_on_creation(tmp_path):
    """Freshly created collections must persist the #344 HNSW guard metadata.

    The guard (``hnsw:batch_size`` and ``hnsw:sync_threshold``) belongs at
    collection-creation time so every new palace gets it without a runtime
    retrofit. Without it, mining ~10K+ drawers triggers the resize+persist
    drift that bloats link_lists.bin into hundreds of GB sparse and segfaults
    `status` / `search` / `repair`. Checking that both keys reach the
    persisted metadata also covers the #1161 "config silently dropped"
    concern at CI time.
    """
    palace_dir = str(tmp_path / "palace")
    ChromaBackend().get_collection(
        palace_dir,
        collection_name="mempalace_drawers",
        create=True,
    )

    # Look the collection up through a separately constructed
    # PersistentClient pointed at the same palace directory.
    col = chromadb.PersistentClient(path=palace_dir).get_collection("mempalace_drawers")
    for key in ("hnsw:batch_size", "hnsw:sync_threshold"):
        assert col.metadata.get(key) == 50_000
def test_chroma_backend_create_collection_sets_hnsw_bloat_guard(tmp_path):
    """The legacy create_collection() path must persist the same HNSW guard keys."""
    palace_dir = str(tmp_path / "palace")
    ChromaBackend().create_collection(palace_dir, "mempalace_drawers")

    # Look the collection up through a separately constructed
    # PersistentClient pointed at the same palace directory.
    col = chromadb.PersistentClient(path=palace_dir).get_collection("mempalace_drawers")
    for key in ("hnsw:batch_size", "hnsw:sync_threshold"):
        assert col.metadata.get(key) == 50_000
def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
"""Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
db_path = tmp_path / "chroma.sqlite3"