From 942aae3ed5408cebc40f6470768c0caeb6d3f848 Mon Sep 17 00:00:00 2001 From: jp Date: Sun, 26 Apr 2026 13:20:14 -0700 Subject: [PATCH] fix(hnsw): address @igorls's #1173 review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore three `_pin_hnsw_threads` tests that the previous integrity-gate commit deleted. The function is still live code on develop (defined at chroma.py:207, called from chroma.py:705 + mcp_server.py), so the deletion left the import unused (ruff F401) and dropped coverage on a function unrelated to this PR's scope. Restored verbatim from main. Plus three nits @igorls flagged: - **Thread-safety doc**: `_quarantined_paths` mutation is lock-free; documented that idempotency of `quarantine_stale_hnsw` is the safety property (concurrent same-palace calls produce a benign redundant rename attempt that no-ops, no need for a lock). - **Pickle protocol assumption**: `_segment_appears_healthy` requires PROTO ≥ 2 (`0x80`). Documented; matches what chromadb writes today, and a future protocol-0/1 emission would conservatively quarantine + lazy-rebuild rather than mis-classify as healthy. - Class-level vs module-level scope: keeping class-level — the conftest reset is the controlled case, and module-level wouldn't remove the foot-gun, just relocate it. Conftest reset documented in the existing comment is the right pattern for test isolation. Style nit (`Path(marker).touch()` vs `open(marker, "a").close()`) deferred — that pattern lives in #1177's territory, not #1173's. 37/37 tests pass on the PR branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- mempalace/backends/chroma.py | 14 +++++++++++ tests/test_backends.py | 49 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py index 034c28b..d43b884 100644 --- a/mempalace/backends/chroma.py +++ b/mempalace/backends/chroma.py @@ -72,6 +72,12 @@ def _segment_appears_healthy(seg_dir: str) -> bool: can execute arbitrary code, and the byte-sniff is sufficient to distinguish a complete write from truncation, zero-fill, or partial-flush corruption. + + Assumes pickle protocol >= 2 (``0x80`` PROTO marker). Matches what + chromadb writes today; if a future chromadb version emits protocol + 0/1 segments, this check would start returning False on healthy + files and quarantine_stale_hnsw would conservatively rename them + out of the way (lazy rebuild on next open recovers). """ meta_path = os.path.join(seg_dir, "index_metadata.pickle") if not os.path.isfile(meta_path): @@ -621,6 +627,14 @@ class ChromaBackend(BaseBackend): # Real runtime drift is still handled — palace-daemon's ``_auto_repair`` # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors, # which bypasses this gate. + # + # Thread-safety: this set is mutated without a lock. Two concurrent + # ``make_client()`` calls for the same palace can both pass the + # membership check and both invoke ``quarantine_stale_hnsw``. That's + # safe because the function is idempotent (mtime check + timestamped + # rename of distinct directories), so the worst-case race produces + # one redundant rename attempt that no-ops. Idempotency is the + # safety property; locking would add cost without correctness gain. _quarantined_paths: set[str] = set() @staticmethod diff --git a/tests/test_backends.py b/tests/test_backends.py index ec4012a..da4f4d8 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -578,3 +578,52 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch assert calls == [palace_a, palace_b] +# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ── + + +def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path): + """Legacy collections (created without num_threads) get the retrofit applied.""" + palace_path = tmp_path / "legacy-palace" + palace_path.mkdir() + + client = chromadb.PersistentClient(path=str(palace_path)) + col = client.create_collection( + "mempalace_drawers", + metadata={"hnsw:space": "cosine"}, # no num_threads — legacy + ) + assert col.configuration_json.get("hnsw", {}).get("num_threads") is None + + _pin_hnsw_threads(col) + + assert col.configuration_json["hnsw"]["num_threads"] == 1 + + +def test_pin_hnsw_threads_swallows_all_errors(): + """Retrofit never raises even when collection.modify explodes.""" + + class _ExplodingCollection: + def modify(self, *args, **kwargs): + raise RuntimeError("boom") + + _pin_hnsw_threads(_ExplodingCollection()) # must not raise + + +def test_get_collection_applies_retrofit_on_existing_palace(tmp_path): + """ChromaBackend.get_collection(create=False) applies the retrofit.""" + palace_path = tmp_path / "palace" + palace_path.mkdir() + + # Simulate a legacy palace: create collection without num_threads + bootstrap_client = chromadb.PersistentClient(path=str(palace_path)) + bootstrap_client.create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"}) + del bootstrap_client # drop reference so a fresh client reopens cleanly + + wrapper = ChromaBackend().get_collection( + str(palace_path), + collection_name="mempalace_drawers", + create=False, + ) + + assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1 + +