merge: develop into hnsw-repair (resolve chroma.py + test_backends.py conflicts)

Develop (post-#1162 lock-plumbing era) refactored the per-open quarantine pass into ChromaBackend._prepare_palace_for_open. This branch's inline-expansion form added quarantine_invalid_hnsw_metadata as a third check, plus a "discard from _quarantined_paths on inode swap" guard so re-opens against a different physical DB re-run quarantine. Resolution merges both: - _prepare_palace_for_open now also calls quarantine_invalid_hnsw_metadata, gated by the same _quarantined_paths set. - _client keeps the inode_changed -> _quarantined_paths.discard() guard before calling the helper, so a fresh inode triggers a fresh pass. - make_client collapses to a single _prepare_palace_for_open() call. - test_backends.py keeps both the pickle (#1285) and shutil (develop) imports — both are used.
2026-05-07 07:48:45 -03:00
parent 0e32b9643c 435f0ad348
commit be6dc033fd
45 changed files with 3380 additions and 165 deletions
@@ -40,8 +40,9 @@ def _patch_mcp_config(monkeypatch, palace_path, tmp_path):

    import mempalace.mcp_server as mcp_mod

+    kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
    monkeypatch.setattr(mcp_mod, "_config", cfg)
-    monkeypatch.setattr(mcp_mod, "_kg", KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3")))
+    monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)


 def _get_rss_mb():
@@ -84,8 +84,9 @@ class TestToolStatusMemoryProfile:

        cfg = MempalaceConfig(config_dir=str(tmp_path / "cfg"))
        monkeypatch.setattr(cfg, "_file_config", {"palace_path": palace_path})
+        kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
        monkeypatch.setattr(mcp_mod, "_config", cfg)
-        monkeypatch.setattr(mcp_mod, "_kg", KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3")))
+        monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)

        from mempalace.mcp_server import tool_status

@@ -1,5 +1,6 @@
 import os
 import pickle
+import shutil
 import sqlite3
 from pathlib import Path

@@ -208,6 +209,52 @@ def test_query_empty_preserves_embeddings_outer_shape_when_requested():
    assert not_requested.embeddings is None


+def test_chroma_close_palace_releases_sqlite_lock_for_reopen(tmp_path):
+    """close_palace must release chromadb's rust-side SQLite file lock so
+    a fresh PersistentClient on the same path after shutil.rmtree can
+    write without hitting SQLITE_READONLY_DBMOVED."""
+    backend = ChromaBackend()
+    palace_path = tmp_path / "palace-a"
+    ref = PalaceRef(id=str(palace_path), local_path=str(palace_path))
+
+    col = backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+    col.upsert(documents=["hello"], ids=["a"], metadatas=[{"k": "v"}])
+
+    backend.close_palace(ref)
+    shutil.rmtree(palace_path)
+
+    col = backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+    col.upsert(documents=["world"], ids=["b"], metadatas=[{"k": "v2"}])
+    assert col.count() == 1
+
+
+def test_chroma_close_releases_all_cached_clients(tmp_path):
+    """close() must release every cached client's SQLite file lock so any
+    of their palace paths can be reopened by a fresh backend in the same
+    process."""
+    backend = ChromaBackend()
+    palace_a = tmp_path / "palace-a"
+    palace_b = tmp_path / "palace-b"
+    ref_a = PalaceRef(id=str(palace_a), local_path=str(palace_a))
+    ref_b = PalaceRef(id=str(palace_b), local_path=str(palace_b))
+
+    for ref in (ref_a, ref_b):
+        backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True).upsert(
+            documents=["x"], ids=["x"], metadatas=[{"k": "v"}]
+        )
+
+    backend.close()
+
+    for path in (palace_a, palace_b):
+        shutil.rmtree(path)
+        ref = PalaceRef(id=str(path), local_path=str(path))
+        fresh = ChromaBackend()
+        col = fresh.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+        col.upsert(documents=["y"], ids=["y"], metadatas=[{"k": "v2"}])
+        assert col.count() == 1
+        fresh.close()
+
+
 def test_chroma_cache_invalidates_when_db_file_missing(tmp_path):
    """A palace rebuild that removes chroma.sqlite3 must drop the stale cache.

@@ -735,9 +782,9 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
    ChromaBackend.make_client(palace_path)
    ChromaBackend.make_client(palace_path)

-    assert calls == [palace_path], (
-        "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
-    )
+    assert calls == [
+        palace_path
+    ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"


 def test_make_client_gates_invalid_metadata_on_first_call(tmp_path, monkeypatch):
@@ -797,6 +844,67 @@ def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch
    assert calls == [palace_a, palace_b]


+# ── _client() cold-start gate (#1121, #1132, #1263) ──────────────────────
+
+
+def test_client_quarantines_corrupt_segment_on_first_open(tmp_path, monkeypatch):
+    """The instance ``_client()`` path must run ``quarantine_stale_hnsw``
+    on first open, mirroring the ``make_client()`` static helper. Before
+    PR #1173's wiring was extended here, CLI mining / search / repair /
+    status all skipped the quarantine pass and would SIGSEGV on a stale
+    HNSW segment (#1121, #1132, #1263)."""
+    now = 1_700_000_000.0
+    palace, seg = _make_palace_with_segment(
+        tmp_path,
+        hnsw_mtime=now - 7200,
+        sqlite_mtime=now,
+        meta_bytes=_CORRUPT_META,
+    )
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    backend = ChromaBackend()
+    try:
+        backend._client(str(palace))
+    finally:
+        backend.close()
+
+    assert not seg.exists(), "_client() should have quarantined the corrupt segment"
+    drift_dirs = [p for p in palace.iterdir() if ".drift-" in p.name]
+    assert len(drift_dirs) == 1
+
+
+def test_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
+    """Repeated ``_client()`` calls for the same palace re-run quarantine
+    at most once — the ``_quarantined_paths`` gate prevents runtime
+    thrash on hot paths (``_client()`` is hit on every backend op)."""
+    palace_path = str(tmp_path / "palace")
+    os.makedirs(palace_path, exist_ok=True)
+    (Path(palace_path) / "chroma.sqlite3").write_text("")
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    calls: list[str] = []
+
+    def _spy(path, stale_seconds=300.0):
+        calls.append(path)
+        return []
+
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
+
+    backend = ChromaBackend()
+    try:
+        backend._client(palace_path)
+        backend._client(palace_path)
+        backend._client(palace_path)
+    finally:
+        backend.close()
+
+    assert (
+        calls == [palace_path]
+    ), "quarantine_stale_hnsw should fire once per palace per process from _client(), not on every call"
+
+
 # ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──


@@ -0,0 +1,321 @@
+"""Tests for ChromaCollection's palace-write-lock integration.
+
+Closes the gap left by ``mine_palace_lock`` only protecting the
+``mempalace mine`` pipeline: MCP/direct writers that call
+``ChromaCollection.add/upsert/update/delete`` must also serialize against
+mine and against each other to avoid the multi-threaded HNSW corruption
+documented in #974/#965.
+
+Property tested:
+
+* ``ChromaCollection(c, palace_path=p)`` wraps every write with
+  ``mine_palace_lock(p)``.
+* Writes raise ``MineAlreadyRunning`` when another holder owns the lock
+  (instead of silently racing into the underlying chromadb call).
+* Re-entrant composition with ``miner.mine()`` does not self-deadlock:
+  ``with mine_palace_lock(p): col.upsert(...)`` runs to completion.
+* ``ChromaCollection(c)`` (no palace_path) preserves legacy no-lock
+  behaviour for tests/callers that build the adapter directly without
+  going through ``ChromaBackend``.
+
+POSIX-only: ``mine_palace_lock`` uses ``fcntl`` on Unix and ``msvcrt`` on
+Windows; the contention semantics differ enough that the cross-process
+tests are skipped on Windows runners.
+"""
+
+from __future__ import annotations
+
+import multiprocessing
+import os
+import time
+
+import pytest
+
+from mempalace.backends.chroma import ChromaCollection
+from mempalace.palace import MineAlreadyRunning, mine_palace_lock
+
+
+def _get_mp_context():
+    """Same start-method picker as test_palace_locks.py."""
+    start_method = "spawn" if os.name == "nt" else "fork"
+    return multiprocessing.get_context(start_method)
+
+
+# ---------------------------------------------------------------------------
+# Fakes
+# ---------------------------------------------------------------------------
+
+
+class _FakeChromaCollection:
+    """Records calls; never blocks. Stand-in for chromadb.Collection."""
+
+    def __init__(self):
+        self.adds: list[dict] = []
+        self.upserts: list[dict] = []
+        self.updates: list[dict] = []
+        self.deletes: list[dict] = []
+
+    def add(self, **kwargs):
+        self.adds.append(kwargs)
+
+    def upsert(self, **kwargs):
+        self.upserts.append(kwargs)
+
+    def update(self, **kwargs):
+        self.updates.append(kwargs)
+
+    def delete(self, **kwargs):
+        self.deletes.append(kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _hold_lock(palace_path: str, ready_flag: str, release_flag: str) -> int:
+    """Acquire ``mine_palace_lock``, signal readiness, wait for release.
+
+    Mirrors the helper in ``test_palace_locks.py`` so the contention
+    semantics match across both test files.
+    """
+    try:
+        with mine_palace_lock(palace_path):
+            open(ready_flag, "w").close()
+            for _ in range(500):
+                if os.path.exists(release_flag):
+                    return 0
+                time.sleep(0.01)
+            return 0
+    except MineAlreadyRunning:
+        return 1
+
+
+# ---------------------------------------------------------------------------
+# Tests — opt-in lock wiring
+# ---------------------------------------------------------------------------
+
+
+def test_palace_path_none_skips_lock(tmp_path, monkeypatch):
+    """Legacy callers (``ChromaCollection(c)``) keep no-lock behaviour.
+
+    A ``ChromaCollection`` built without ``palace_path`` must not touch the
+    lock infrastructure at all. This guards against regressions where a
+    test or third-party caller relies on the historical bare-write path.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    fake = _FakeChromaCollection()
+    col = ChromaCollection(fake)  # no palace_path -> no lock
+
+    # Hold the lock in a child process. Without palace_path, the parent
+    # write must still succeed (the lock does not gate this caller).
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        col.upsert(documents=["doc"], ids=["id-1"])
+        assert fake.upserts == [{"documents": ["doc"], "ids": ["id-1"]}]
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_writer_blocks_during_mine(tmp_path, monkeypatch):
+    """A held ``mine_palace_lock`` causes ``ChromaCollection`` writes to raise.
+
+    This is the property that closes the MCP-bypass gap: when a mine is in
+    flight, MCP/direct writes raise ``MineAlreadyRunning`` rather than
+    silently entering chromadb's write path concurrent with mine.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        fake = _FakeChromaCollection()
+        col = ChromaCollection(fake, palace_path=palace)
+
+        with pytest.raises(MineAlreadyRunning):
+            col.upsert(documents=["doc"], ids=["id-1"])
+        with pytest.raises(MineAlreadyRunning):
+            col.add(documents=["doc"], ids=["id-2"])
+        with pytest.raises(MineAlreadyRunning):
+            col.update(ids=["id-3"], documents=["doc"])
+        with pytest.raises(MineAlreadyRunning):
+            col.delete(ids=["id-4"])
+
+        # The fake must have received NO calls — the lock must gate
+        # before reaching the underlying chromadb layer.
+        assert fake.upserts == []
+        assert fake.adds == []
+        assert fake.updates == []
+        assert fake.deletes == []
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_reentrant_inside_mine_passes_through(tmp_path, monkeypatch):
+    """``ChromaCollection.upsert`` inside ``mine_palace_lock`` does not deadlock.
+
+    ``miner.mine()`` already holds ``mine_palace_lock(palace_path)`` for the
+    full mine pipeline; ``_mine_body`` then calls
+    ``collection.upsert(...)``. With the per-thread re-entrant guard in
+    ``mine_palace_lock``, the inner acquire is a pass-through and the
+    underlying chromadb call runs immediately.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    fake = _FakeChromaCollection()
+    col = ChromaCollection(fake, palace_path=palace)
+
+    with mine_palace_lock(palace):
+        # If the re-entrant guard were missing, this would self-deadlock on
+        # the underlying flock. We rely on pytest-timeout (configured in
+        # pyproject.toml) to enforce this in CI; the assertion just confirms
+        # the call landed.
+        col.upsert(documents=["d"], ids=["i"], metadatas=[{"k": "v"}])
+        col.add(documents=["d2"], ids=["i2"])
+        col.update(ids=["i"], documents=["d-updated"])
+        col.delete(ids=["i2"])
+
+    assert len(fake.upserts) == 1
+    assert len(fake.adds) == 1
+    assert len(fake.updates) == 1
+    assert len(fake.deletes) == 1
+
+
+class _SlowFakeChromaCollection(_FakeChromaCollection):
+    """Fake whose write methods hold the caller for ``hold_seconds``.
+
+    Used to keep ``mine_palace_lock`` acquired long enough for a sibling
+    process to contend deterministically.
+    """
+
+    def __init__(self, hold_seconds: float = 0.3):
+        super().__init__()
+        self._hold = hold_seconds
+
+    def upsert(self, **kwargs):
+        time.sleep(self._hold)
+        super().upsert(**kwargs)
+
+
+def _slow_writer_target(palace_path, tmp_path_str, pid, result_q):
+    """Subprocess target: try a slow upsert, report ok/busy."""
+    os.environ["HOME"] = tmp_path_str
+    # Fresh import inside child so HOME monkeypatch routes the lock dir.
+    from mempalace.backends.chroma import ChromaCollection as _CC
+    from mempalace.palace import MineAlreadyRunning as _MAR
+
+    fake = _SlowFakeChromaCollection(hold_seconds=0.3)
+    col = _CC(fake, palace_path=palace_path)
+    try:
+        col.upsert(documents=[f"d{pid}"], ids=[f"i{pid}"])
+        result_q.put(("ok", pid))
+    except _MAR:
+        result_q.put(("busy", pid))
+
+
+def test_concurrent_writers_serialize(tmp_path, monkeypatch):
+    """Two processes calling ``ChromaCollection.upsert`` against the same
+    palace must be serialized: at most one enters chromadb at a time, the
+    other raises ``MineAlreadyRunning``.
+
+    This is the property that prevents the parallel HNSW insert race that
+    drives #974/#965 — under concurrent MCP write fan-out, exactly one
+    writer reaches chromadb and the rest fail loudly instead of corrupting
+    the index.
+
+    The slow fake holds the lock for 0.3s per writer, large enough for the
+    second process to contend even on slow CI runners.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+
+    ctx = _get_mp_context()
+    result_q = ctx.Queue()
+
+    p1 = ctx.Process(target=_slow_writer_target, args=(palace, str(tmp_path), 1, result_q))
+    p2 = ctx.Process(target=_slow_writer_target, args=(palace, str(tmp_path), 2, result_q))
+    p1.start()
+    # Tiny stagger so p1 wins the race deterministically; without it the
+    # OS scheduler can pick either, which is also a valid outcome but
+    # makes the assertion brittle on slow CI.
+    time.sleep(0.05)
+    p2.start()
+    p1.join(timeout=5)
+    p2.join(timeout=5)
+
+    outcomes = [result_q.get(timeout=1) for _ in range(2)]
+    statuses = sorted(o[0] for o in outcomes)
+    assert statuses == ["busy", "ok"], f"expected one ok + one busy, got {outcomes}"
+
+
+def test_read_path_does_not_acquire_lock(tmp_path, monkeypatch):
+    """``query`` / ``get`` / ``count`` must not be gated by the write lock.
+
+    Read traffic is the dominant workload (semantic search, MCP get, etc.)
+    and serializing it against mine would tank latency for no correctness
+    benefit. This test pins that property: with another process holding
+    the write lock, reads must still complete instantly.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        # _FakeChromaCollection doesn't implement query/get/count; we only
+        # need to confirm the wrapper does not call into mine_palace_lock
+        # for reads, which we assert by observing the wrapped methods are
+        # NOT in ChromaCollection's _write_lock path. A direct check via
+        # source inspection is more honest than mocking the entire chroma
+        # surface here.
+        import inspect
+
+        from mempalace.backends.chroma import ChromaCollection as _CC
+
+        for write_attr in ("add", "upsert", "update", "delete"):
+            src = inspect.getsource(getattr(_CC, write_attr))
+            assert "_write_lock" in src, f"{write_attr} should acquire write lock"
+
+        for read_attr in ("query", "get", "count"):
+            method = getattr(_CC, read_attr, None)
+            if method is None:
+                continue
+            src = inspect.getsource(method)
+            assert (
+                "_write_lock" not in src
+            ), f"{read_attr} must NOT acquire the write lock (read path)"
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
@@ -175,6 +175,61 @@ def test_cmd_init_normalizes_wing_name_for_topics_registry(mock_config_cls, tmp_
        assert mock_register.call_args.kwargs["wing"] == "my_cool_app"


+def test_cmd_init_honors_palace_flag(tmp_path, monkeypatch):
+    """Regression for #1313: ``cmd_init`` must honor ``--palace`` instead of
+    silently writing to ``~/.mempalace``. Mirrors the env-var pattern used
+    by ``cmd_mine`` / ``cmd_status`` / ``mcp_server`` so every downstream
+    read of ``cfg.palace_path`` (Pass 0, ``cfg.init()``, post-init mine)
+    routes to the user-specified location.
+    """
+    project = tmp_path / "project"
+    project.mkdir()
+    palace = tmp_path / "custom_palace"
+
+    # Make sure no leftover env var from another test leaks in — we want to
+    # verify that --palace ALONE drives the resolution. Prime monkeypatch's
+    # undo list with setenv first so that the env var ``cmd_init`` writes
+    # below is rolled back at teardown (``delenv(raising=False)`` on a
+    # missing key registers no undo entry, which would leak into the next
+    # test).
+    monkeypatch.setenv("MEMPALACE_PALACE_PATH", "")
+    monkeypatch.setenv("MEMPAL_PALACE_PATH", "")
+    monkeypatch.delenv("MEMPALACE_PALACE_PATH")
+    monkeypatch.delenv("MEMPAL_PALACE_PATH")
+
+    args = argparse.Namespace(
+        dir=str(project),
+        palace=str(palace),
+        yes=True,
+        auto_mine=False,
+    )
+
+    captured = {}
+
+    def fake_pass_zero(project_dir, palace_dir, llm_provider):
+        # Capture the palace_dir Pass 0 sees — this is the smoking-gun
+        # value for the bug. Pre-fix it was always ~/.mempalace.
+        captured["pass_zero_palace_dir"] = palace_dir
+        return None
+
+    with (
+        patch("mempalace.entity_detector.scan_for_detection", return_value=[]),
+        patch("mempalace.room_detector_local.detect_rooms_local"),
+        patch("mempalace.cli._run_pass_zero", side_effect=fake_pass_zero),
+        patch("mempalace.cli._maybe_run_mine_after_init"),
+    ):
+        cmd_init(args)
+
+    expected = str(palace)
+    # Pass 0 must have been handed the --palace location, not ~/.mempalace.
+    assert captured["pass_zero_palace_dir"] == expected
+    # And the env var must point at the custom palace so any downstream
+    # ``cfg.palace_path`` read in this process resolves correctly too.
+    import os
+
+    assert os.environ.get("MEMPALACE_PALACE_PATH") == os.path.abspath(expected)
+
+
@patch("mempalace.cli.MempalaceConfig")
 def test_cmd_init_with_entities_zero_total(mock_config_cls, tmp_path, capsys):
    """When entities detected but total is 0, prints 'No entities' message."""
@@ -934,7 +989,7 @@ def test_cmd_compress_with_config(mock_config_cls, tmp_path, capsys):

@patch("mempalace.cli.MempalaceConfig")
 def test_cmd_compress_stores_results(mock_config_cls, capsys):
-    """Non-dry-run compress stores to mempalace_compressed collection."""
+    """Non-dry-run compress stores to mempalace_closets collection (#1244)."""
    mock_config_cls.return_value.palace_path = "/fake/palace"
    args = argparse.Namespace(palace=None, wing=None, dry_run=False, config=None)
    mock_col = MagicMock()
@@ -972,6 +1027,53 @@ def test_cmd_compress_stores_results(mock_config_cls, capsys):
    assert "Stored" in out
    assert "Total:" in out
    mock_comp_col.upsert.assert_called_once()
+    # Verify the compress output goes to the closets collection so that
+    # palace.get_closets_collection() / searcher can read it back (#1244).
+    (call_args, _kwargs) = mock_backend.get_or_create_collection.call_args
+    assert (
+        call_args[1] == "mempalace_closets"
+    ), f"compress should write to mempalace_closets, got {call_args[1]!r}"
+    assert "mempalace_closets" in out
+
+
+def test_cmd_compress_output_readable_via_get_closets_collection(tmp_path, capsys):
+    """End-to-end: cmd_compress output must be readable via the same code
+    path palace.py uses (`get_closets_collection`). Regression for #1244."""
+    from mempalace.backends.chroma import ChromaBackend
+    from mempalace.palace import get_closets_collection, get_collection
+
+    palace_path = str(tmp_path / "palace")
+
+    # Seed a drawer in the palace so cmd_compress has something to compress.
+    drawers = get_collection(palace_path, "mempalace_drawers", create=True)
+    drawers.upsert(
+        ids=["drawer-1"],
+        documents=["The quick brown fox jumps over the lazy dog."],
+        metadatas=[{"wing": "test", "room": "demo", "source_file": "fox.txt"}],
+    )
+
+    args = argparse.Namespace(palace=palace_path, wing=None, dry_run=False, config=None)
+    with patch("mempalace.cli.MempalaceConfig") as mock_config_cls:
+        mock_config_cls.return_value.palace_path = palace_path
+        # Use a real ChromaBackend so the write actually lands on disk and
+        # the read-side helper can find it.
+        with patch("mempalace.backends.chroma.ChromaBackend", side_effect=ChromaBackend):
+            cmd_compress(args)
+
+    out = capsys.readouterr().out
+    assert "Stored" in out
+
+    # Now read via the *same* code path palace.py / searcher uses.
+    closets = get_closets_collection(palace_path, create=False)
+    got = closets.get(ids=["drawer-1"], include=["documents", "metadatas"])
+    assert got["ids"] == ["drawer-1"], (
+        "compressed drawer not found in mempalace_closets — "
+        "cmd_compress wrote to the wrong collection (#1244)"
+    )
+    assert got["documents"] and got["documents"][0], "empty compressed doc"
+    meta = got["metadatas"][0]
+    assert meta.get("wing") == "test"
+    assert "compression_ratio" in meta


 def test_cmd_repair_trailing_slash_does_not_recurse():
@@ -985,3 +1087,58 @@ def test_cmd_repair_trailing_slash_does_not_recurse():
    palace_path = os.path.expanduser(args.palace).rstrip(os.sep)
    backup_path = palace_path + ".backup"
    assert not backup_path.startswith(palace_path + os.sep)
+
+
+# ── stdio reconfigure on Windows ─────────────────────────────────────
+
+
+class _ReconfigurableStringIO:
+    def __init__(self):
+        self.reconfigure_calls = []
+
+    def reconfigure(self, **kwargs):
+        self.reconfigure_calls.append(kwargs)
+
+
+def test_reconfigures_stdio_to_utf8_on_windows():
+    """Windows `mempalace` CLI must decode/encode stdio as UTF-8.
+
+    Without this, piped non-ASCII input (`mempalace search ... < q.txt`)
+    or piped non-ASCII output (`mempalace search "..." > out.txt`) is
+    mojibaked through the system ANSI codepage on non-Latin Windows
+    locales (cp1252/cp1251/cp950).
+    """
+    from mempalace.cli import _reconfigure_stdio_utf8_on_windows
+
+    stdin = _ReconfigurableStringIO()
+    stdout = _ReconfigurableStringIO()
+    stderr = _ReconfigurableStringIO()
+    with (
+        patch.object(sys, "platform", "win32"),
+        patch.object(sys, "stdin", stdin),
+        patch.object(sys, "stdout", stdout),
+        patch.object(sys, "stderr", stderr),
+    ):
+        _reconfigure_stdio_utf8_on_windows()
+
+    # Per-stream errors policy: stdin survives bad bytes via
+    # surrogateescape so a redirected non-UTF-8 file does not crash
+    # the read; stdout/stderr use replace so a drawer carrying a
+    # round-tripped surrogate half does not crash mid-print.
+    assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
+    assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+    assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+
+
+def test_reconfigure_stdio_is_noop_off_windows():
+    """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
+    from mempalace.cli import _reconfigure_stdio_utf8_on_windows
+
+    stdin = _ReconfigurableStringIO()
+    with (
+        patch.object(sys, "platform", "linux"),
+        patch.object(sys, "stdin", stdin),
+    ):
+        _reconfigure_stdio_utf8_on_windows()
+
+    assert stdin.reconfigure_calls == []
@@ -296,6 +296,182 @@ class TestRegenerateClosets:
            assert meta.get("generated_by", "").startswith("llm:")
            assert meta.get("normalize_version") == NORMALIZE_VERSION

+    def test_regen_paginates_drawer_fetch(self, tmp_path):
+        """Regression for #1073: drawers_col.get must be paginated at
+        batch_size=5000. A single get(limit=total, ...) on a palace with
+        more than SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) drawers
+        blows up inside chromadb. Matches the miner.status pattern
+        introduced in #851 (see #802, #850, #1073)."""
+        from mempalace import closet_llm as closet_llm_mod
+
+        palace = str(tmp_path / "palace")
+
+        # Build a fake collection: 12_000 drawers across 3 source files,
+        # enough to force 3 batches of batch_size=5000 (5000 + 5000 + 2000).
+        n_drawers = 12_000
+        ids = [f"d{i:05d}" for i in range(n_drawers)]
+        docs = [f"doc body {i}" for i in range(n_drawers)]
+        metas = [
+            {
+                "wing": "w",
+                "room": "r",
+                "source_file": f"/src/file_{i % 3}.md",
+                "entities": "",
+            }
+            for i in range(n_drawers)
+        ]
+
+        get_calls: list = []
+
+        class FakeDrawersCol:
+            def count(self):
+                return n_drawers
+
+            def get(self, limit=None, offset=0, include=None, **kwargs):
+                get_calls.append({"limit": limit, "offset": offset, "include": include})
+                end = min(offset + (limit or n_drawers), n_drawers)
+                return {
+                    "ids": ids[offset:end],
+                    "documents": docs[offset:end],
+                    "metadatas": metas[offset:end],
+                }
+
+        class FakeClosetsCol:
+            """Accept the purge + upsert calls the success path makes."""
+
+            def get(self, *a, **kw):
+                return {"ids": [], "documents": [], "metadatas": []}
+
+            def delete(self, *a, **kw):
+                return None
+
+            def upsert(self, *a, **kw):
+                return None
+
+        fake_drawers = FakeDrawersCol()
+        fake_closets = FakeClosetsCol()
+
+        def fake_urlopen(req, timeout=None):
+            return _FakeResp(
+                {
+                    "choices": [
+                        {"message": {"content": '{"topics":["t1"],"quotes":[],"summary":""}'}}
+                    ],
+                    "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+                }
+            )
+
+        cfg = LLMConfig(endpoint="http://local/v1", model="m")
+
+        with (
+            patch.object(closet_llm_mod, "get_collection", return_value=fake_drawers),
+            patch.object(closet_llm_mod, "get_closets_collection", return_value=fake_closets),
+            patch.object(closet_llm_mod, "purge_file_closets", return_value=None),
+            patch.object(closet_llm_mod, "upsert_closet_lines", return_value=None),
+            patch("urllib.request.urlopen", side_effect=fake_urlopen),
+        ):
+            result = regenerate_closets(palace, cfg=cfg, dry_run=True)
+
+        # Three paginated calls: (limit=5000, offset=0), (5000, 5000), (5000, 10000).
+        assert len(get_calls) == 3, f"expected 3 batched fetches, got {len(get_calls)}"
+        for call in get_calls:
+            assert (
+                call["limit"] == 5000
+            ), f"batch must be 5000 — got {call['limit']} (would risk SQLITE_MAX_VARIABLE_NUMBER)"
+            # include must still request both documents and metadatas
+            assert "documents" in call["include"]
+            assert "metadatas" in call["include"]
+        assert [c["offset"] for c in get_calls] == [0, 5000, 10_000]
+
+        # by_source aggregation must be preserved exactly across batches:
+        # 12_000 drawers, 3 source files → 4_000 drawers each.
+        # dry_run=True short-circuits LLM calls but still walks by_source.
+        assert result.get("processed", 0) == 0  # dry_run
+        # Verify no single call tried to pull more than batch_size.
+        assert max(c["limit"] for c in get_calls) <= 5000
+
+    def test_regen_by_source_aggregates_across_batches(self, tmp_path):
+        """Pagination must not change the by_source grouping — drawers for
+        the same source_file split across batches still land in one group."""
+        from mempalace import closet_llm as closet_llm_mod
+
+        palace = str(tmp_path / "palace")
+
+        # 7_500 drawers, alternating between two source files → forces
+        # splits across the 5000/2500 boundary. Each source ends up with
+        # 3_750 drawers after regrouping.
+        n_drawers = 7_500
+        ids = [f"d{i:05d}" for i in range(n_drawers)]
+        docs = [f"body-{i}" for i in range(n_drawers)]
+        metas = [
+            {
+                "wing": "w",
+                "room": "r",
+                "source_file": f"/src/file_{i % 2}.md",
+                "entities": "",
+            }
+            for i in range(n_drawers)
+        ]
+
+        captured_sources: dict = {}
+
+        class FakeDrawersCol:
+            def count(self):
+                return n_drawers
+
+            def get(self, limit=None, offset=0, include=None, **kwargs):
+                end = min(offset + (limit or n_drawers), n_drawers)
+                return {
+                    "ids": ids[offset:end],
+                    "documents": docs[offset:end],
+                    "metadatas": metas[offset:end],
+                }
+
+        class FakeClosetsCol:
+            def get(self, *a, **kw):
+                return {"ids": [], "documents": [], "metadatas": []}
+
+            def delete(self, *a, **kw):
+                return None
+
+            def upsert(self, *a, **kw):
+                return None
+
+        # Hook _call_llm to inspect what regenerate_closets aggregated
+        # per source before the HTTP boundary.
+        real_call_llm = closet_llm_mod._call_llm
+
+        def spying_call_llm(cfg, source_file, wing, room, content):
+            captured_sources[source_file] = content
+            return (
+                {"topics": ["t"], "quotes": [], "summary": ""},
+                {"prompt_tokens": 1, "completion_tokens": 1},
+            )
+
+        cfg = LLMConfig(endpoint="http://local/v1", model="m")
+
+        with (
+            patch.object(closet_llm_mod, "get_collection", return_value=FakeDrawersCol()),
+            patch.object(closet_llm_mod, "get_closets_collection", return_value=FakeClosetsCol()),
+            patch.object(closet_llm_mod, "purge_file_closets", return_value=None),
+            patch.object(closet_llm_mod, "upsert_closet_lines", return_value=None),
+            patch.object(closet_llm_mod, "_call_llm", side_effect=spying_call_llm),
+        ):
+            regenerate_closets(palace, cfg=cfg)
+
+        # Both sources survived the pagination boundary.
+        assert set(captured_sources.keys()) == {"/src/file_0.md", "/src/file_1.md"}
+        # Each source accumulated exactly 3_750 drawer bodies, concatenated
+        # with the "\n\n" separator the regenerate path uses.
+        for source, content in captured_sources.items():
+            assert content.count("\n\n") == 3_749, (
+                f"{source}: expected 3_750 chunks joined (3_749 separators), "
+                f"got {content.count(chr(10) + chr(10)) + 1}"
+            )
+
+        # Silence unused-var lint.
+        assert real_call_llm is not None
+
    def test_regen_uses_basename_not_split_slash(self, tmp_path, monkeypatch):
        """Regression: the old closet_id base used ``source.split('/')[-1]``
        which silently degrades on Windows paths (``C:\\proj\\a.md`` →
@@ -3,7 +3,13 @@ import json
 import tempfile

 import pytest
-from mempalace.config import MempalaceConfig, normalize_wing_name, sanitize_kg_value, sanitize_name
+from mempalace.config import (
+    MempalaceConfig,
+    normalize_wing_name,
+    sanitize_iso_date,
+    sanitize_kg_value,
+    sanitize_name,
+)


 def test_default_config():
@@ -212,3 +218,69 @@ def test_kg_value_rejects_null_bytes():
 def test_kg_value_rejects_over_length():
    with pytest.raises(ValueError):
        sanitize_kg_value("a" * 129)
+
+
+# --- sanitize_iso_date ---
+
+
+def test_iso_date_rejects_year_only():
+    # Partial dates re-introduce silent empty result sets via lexicographic
+    # TEXT comparison in KG queries (e.g. "2026-01-01" <= "2026" is False).
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026")
+
+
+def test_iso_date_rejects_year_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-03")
+
+
+def test_iso_date_accepts_full_date():
+    assert sanitize_iso_date("2026-03-15") == "2026-03-15"
+
+
+def test_iso_date_passes_through_none():
+    assert sanitize_iso_date(None) is None
+
+
+def test_iso_date_passes_through_empty_string():
+    assert sanitize_iso_date("") == ""
+
+
+def test_iso_date_strips_whitespace():
+    assert sanitize_iso_date("  2026-03-15  ") == "2026-03-15"
+
+
+def test_iso_date_rejects_natural_language():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("March 2026")
+
+
+def test_iso_date_rejects_abbreviated_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("Jan 2025")
+
+
+def test_iso_date_rejects_us_format():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("03/15/2026")
+
+
+def test_iso_date_rejects_invalid_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-13")
+
+
+def test_iso_date_rejects_invalid_day():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-02-32")
+
+
+def test_iso_date_rejects_non_string():
+    with pytest.raises(ValueError):
+        sanitize_iso_date(20260315)
+
+
+def test_iso_date_error_names_field():
+    with pytest.raises(ValueError, match="valid_from"):
+        sanitize_iso_date("yesterday", "valid_from")
@@ -2,6 +2,8 @@

 from unittest.mock import patch

+import pytest
+
 from mempalace.entity_registry import (
    COMMON_ENGLISH_WORDS,
    PERSON_CONTEXT_PATTERNS,
@@ -71,6 +73,50 @@ def test_save_creates_file(tmp_path):
    assert (tmp_path / "entity_registry.json").exists()


+def test_save_is_atomic_does_not_leave_tmp(tmp_path):
+    # Atomic write must not leave the .tmp sidecar file after a successful save.
+    registry = EntityRegistry.load(config_dir=tmp_path)
+    registry.save()
+    leftover = list(tmp_path.glob("entity_registry.json.tmp*"))
+    assert leftover == [], f"atomic write leaked tmp file(s): {leftover}"
+
+
+def test_save_preserves_previous_on_serialization_failure(tmp_path, monkeypatch):
+    # If serialization fails mid-write, the previous registry must remain
+    # intact — this is the whole point of atomic write vs truncating in place.
+    registry = EntityRegistry.load(config_dir=tmp_path)
+    registry.seed(
+        mode="personal",
+        people=[{"name": "Alice", "relationship": "friend", "context": "personal"}],
+        projects=[],
+    )
+    registry.save()
+    target = tmp_path / "entity_registry.json"
+    original = target.read_text(encoding="utf-8")
+
+    # Force os.replace to raise — simulates filesystem full / permission flip
+    # AFTER the temp file is written but BEFORE the rename completes.
+    import os as _os
+
+    real_replace = _os.replace
+
+    def boom(src, dst):
+        raise OSError("simulated rename failure")
+
+    monkeypatch.setattr(_os, "replace", boom)
+    with pytest.raises(OSError):
+        registry.seed(
+            mode="personal",
+            people=[{"name": "Bob", "relationship": "friend", "context": "personal"}],
+            projects=[],
+        )
+        registry.save()
+
+    # Restore os.replace before reading so the assertion can rely on it.
+    monkeypatch.setattr(_os, "replace", real_replace)
+    assert target.read_text(encoding="utf-8") == original
+
+
 # ── seed ────────────────────────────────────────────────────────────────


@@ -286,3 +286,66 @@ class TestCLI:
        assert "similar_name" in out
        # Silence unused import warning.
        _ = (MagicMock, patch, fact_checker)
+
+    def test_reconfigures_stdio_to_utf8_on_windows(self):
+        """Windows fact_checker --stdin must decode payload as UTF-8.
+
+        Without this, Python defaults stdio to the system ANSI codepage
+        (cp1252/cp1251/cp950), which mojibakes non-ASCII text before
+        pattern parsing sees it.
+        """
+        import io
+        import sys
+
+        from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows
+
+        class _ReconfigurableStringIO(io.StringIO):
+            def __init__(self, initial_value=""):
+                super().__init__(initial_value)
+                self.reconfigure_calls = []
+
+            def reconfigure(self, **kwargs):
+                self.reconfigure_calls.append(kwargs)
+
+        stdin = _ReconfigurableStringIO()
+        stdout = _ReconfigurableStringIO()
+        stderr = _ReconfigurableStringIO()
+        with (
+            patch.object(sys, "platform", "win32"),
+            patch.object(sys, "stdin", stdin),
+            patch.object(sys, "stdout", stdout),
+            patch.object(sys, "stderr", stderr),
+        ):
+            _reconfigure_stdio_utf8_on_windows()
+
+        # Per-stream errors policy: stdin uses surrogateescape so a stray
+        # malformed byte from a redirected file does not crash the read,
+        # stdout/stderr use replace so an extracted fact carrying a
+        # surrogate half does not crash mid-print.
+        assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
+        assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+        assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+
+    def test_reconfigure_stdio_is_noop_off_windows(self):
+        """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
+        import io
+        import sys
+
+        from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows
+
+        class _ReconfigurableStringIO(io.StringIO):
+            def __init__(self):
+                super().__init__()
+                self.reconfigure_calls = []
+
+            def reconfigure(self, **kwargs):
+                self.reconfigure_calls.append(kwargs)
+
+        stdin = _ReconfigurableStringIO()
+        with (
+            patch.object(sys, "platform", "linux"),
+            patch.object(sys, "stdin", stdin),
+        ):
+            _reconfigure_stdio_utf8_on_windows()
+
+        assert stdin.reconfigure_calls == []
@@ -8,6 +8,7 @@ from unittest.mock import MagicMock, patch

 import pytest

+import mempalace.hooks_cli as hooks_cli_mod
 from mempalace.hooks_cli import (
    SAVE_INTERVAL,
    _count_human_messages,
@@ -959,3 +960,108 @@ def test_stop_hook_rejects_injected_stop_hook_active(tmp_path):
    # The injected value is not "true"/"1"/"yes", so the hook should NOT pass through.
    # Save must have been attempted.
    assert mock_save.called
+
+
+# --- Absent palace root: hooks must not recreate ~/.mempalace ---
+#
+# When the user removes ~/.mempalace (e.g. `rm -rf`), that is the strongest
+# possible "do not auto-capture" signal. Hooks must short-circuit BEFORE
+# touching disk — including before the log-line that previously triggered
+# STATE_DIR.mkdir() on its own.
+
+
+def _redirect_palace_root(monkeypatch, tmp_path):
+    """Point PALACE_ROOT and STATE_DIR at a tmp location that does NOT exist."""
+    fake_root = tmp_path / "absent-mempalace"
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+    return fake_root
+
+
+def test_hook_stop_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    transcript = tmp_path / "t.jsonl"
+    transcript.write_text("")
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_stop(
+            {"session_id": "absent", "transcript_path": str(transcript), "stop_hook_active": False},
+            "claude-code",
+        )
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_hook_precompact_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    transcript = tmp_path / "t.jsonl"
+    transcript.write_text("")
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_precompact(
+            {"session_id": "absent", "transcript_path": str(transcript)},
+            "claude-code",
+        )
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_hook_session_start_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_session_start({"session_id": "absent"}, "claude-code")
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_log_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    _log("test message")
+    assert not fake_root.exists()
+
+
+def test_existing_dir_proceeds_normally(tmp_path, monkeypatch):
+    """Regression: when PALACE_ROOT exists, hooks must proceed (no short-circuit)."""
+    fake_root = tmp_path / "present-mempalace"
+    fake_root.mkdir()
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+    _log("test message")
+    # _log should have created the state dir under the existing palace root
+    assert (fake_root / "hook_state").exists()
+    assert (fake_root / "hook_state" / "hook.log").is_file()
+
+
+def test_regular_file_at_palace_root_treated_as_absent(tmp_path, monkeypatch):
+    """A regular file at ~/.mempalace must be treated the same as absent.
+
+    ``Path.exists()`` returns True for a regular file, which would let the
+    kill-switch be bypassed and crash later when ``STATE_DIR.mkdir()`` runs
+    on ``NotADirectoryError``. ``_palace_root_exists()`` must use
+    ``is_dir()`` so a stray file (or broken symlink) short-circuits cleanly.
+    """
+    fake_root = tmp_path / "file-not-dir"
+    fake_root.write_text("oops, this is a file not a directory")
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+
+    # _palace_root_exists() is the source of truth — it must return False.
+    assert hooks_cli_mod._palace_root_exists() is False
+
+    # Hooks must short-circuit (return {} on stdout) and not touch disk.
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_session_start({"session_id": "file-at-root"}, "claude-code")
+    assert json.loads(buf.getvalue() or "{}") == {}
+
+    # _log must also short-circuit — it must NOT try to mkdir a path under a
+    # regular file (which would raise NotADirectoryError).
+    _log("test message")  # would raise if not short-circuited
+
+    # The stray file is left untouched; we never try to convert it.
+    assert fake_root.is_file()
+    assert fake_root.read_text() == "oops, this is a file not a directory"
@@ -0,0 +1,234 @@
+"""Tests for ``candidate_strategy="union"`` in ``search_memories``.
+
+The default ``"vector"`` strategy gathers candidates from the vector index
+only. Docs with strong BM25 signal but vector embeddings far from the query
+get skipped — terminology guides looked up by narrative-shaped queries are
+the canonical case.
+
+The ``"union"`` strategy also pulls top-K BM25-only candidates from sqlite
+FTS5 and merges them into the rerank pool. Both signal sources contribute
+candidates; the hybrid rerank picks the best from a richer pool.
+
+Default behavior is unchanged ("vector") — these tests exercise opt-in
+"union" mode.
+"""
+
+from mempalace.palace import get_collection
+from mempalace.searcher import search_memories
+
+
+def _seed_drawers(palace_path):
+    """Seed a corpus where the right doc for one query is BM25-strong but
+    vector-distant.
+
+    D1-D3 are short narrative tickets that semantically cluster around
+    "customer support / order / shipped" vocabulary. D4 is a meta-document
+    of bullet rules ("brand voice") that contains rare keywords like
+    "Absolutely" and "apologize" the query repeats verbatim — strong BM25
+    signal but stylistically far from the narrative tickets.
+    """
+    col = get_collection(palace_path, create=True)
+    col.upsert(
+        ids=["D1", "D2", "D3", "D4"],
+        documents=[
+            "Customer wrote in asking why their order shipped without "
+            "the promo sticker. Standard reply explaining the threshold.",
+            "Order delivery delayed three days; customer requested a "
+            "refund. Support agent processed return via ticket queue.",
+            "Customer asked about the missing freebie; the reply "
+            "explained the campaign mechanics and shipped status.",
+            "Brand voice rules: dry, sturdy, never effusive. "
+            "Never 'Absolutely!' Never apologize for policy — explain it. "
+            "Avoid premium / curated / elevated vocabulary.",
+        ],
+        metadatas=[
+            {"wing": "shop", "room": "support", "source_file": "ticket_D1.md"},
+            {"wing": "shop", "room": "support", "source_file": "ticket_D2.md"},
+            {"wing": "shop", "room": "support", "source_file": "ticket_D3.md"},
+            {"wing": "shop", "room": "guides", "source_file": "brand_voice_D4.md"},
+        ],
+    )
+
+
+_NARRATIVE_QUERY = (
+    "A support agent is drafting a reply to a customer asking why their "
+    "order shipped without a free sticker. Draft the reply, but never say "
+    "'Absolutely!' and do not apologize for policy."
+)
+
+
+class TestCandidateUnion:
+    def test_default_vector_strategy_unchanged(self, tmp_path):
+        """Default behavior must be identical to omitting the parameter."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        without = search_memories(_NARRATIVE_QUERY, palace, n_results=5)
+        with_default = search_memories(
+            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector"
+        )
+        ids_a = [h["source_file"] for h in without["results"]]
+        ids_b = [h["source_file"] for h in with_default["results"]]
+        assert ids_a == ids_b, "explicit candidate_strategy='vector' must match default"
+
+    def test_union_surfaces_bm25_strong_vector_distant_doc(self, tmp_path):
+        """The brand-voice doc has strong BM25 signal for the query but is
+        stylistically far from the narrative tickets. Union mode must
+        retrieve it; vector-only mode is allowed to miss it."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        result = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")
+        ids = [h["source_file"] for h in result["results"]]
+        assert "brand_voice_D4.md" in ids, (
+            "union mode must surface BM25-strong docs even when vector signal "
+            f"is weak; got {ids}"
+        )
+
+    def test_union_preserves_vector_hits(self, tmp_path):
+        """Union mode must not drop docs that vector-only mode finds —
+        the rerank pool grows, it doesn't shrink."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        vector = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector")
+        union = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")
+        vec_ids = {h["source_file"] for h in vector["results"]}
+        union_ids = {h["source_file"] for h in union["results"]}
+        # In a 4-doc corpus with n_results=5, both should return all 4.
+        # The invariant is: union should not lose anything vector found.
+        missing = vec_ids - union_ids
+        assert not missing, f"union dropped docs that vector found: {missing}"
+
+    def test_union_handles_empty_palace(self, tmp_path):
+        """No drawers — union mode should return empty results, not crash."""
+        palace = str(tmp_path / "palace")
+        get_collection(palace, create=True)  # create empty collection
+        result = search_memories("anything", palace, n_results=5, candidate_strategy="union")
+        assert result.get("results", []) == []
+
+    def test_invalid_candidate_strategy_raises(self, tmp_path):
+        """Bad arg should raise rather than silently fall back."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        import pytest
+
+        with pytest.raises(ValueError, match="candidate_strategy"):
+            search_memories("anything", palace, n_results=5, candidate_strategy="bogus")
+
+    def test_invalid_strategy_raises_even_when_vector_disabled(self, tmp_path):
+        """Validation must happen before the ``vector_disabled`` early return —
+        invalid values must fail consistently regardless of routing."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        import pytest
+
+        with pytest.raises(ValueError, match="candidate_strategy"):
+            search_memories(
+                "anything",
+                palace,
+                n_results=5,
+                vector_disabled=True,
+                candidate_strategy="bogus",
+            )
+
+    def test_union_respects_n_results_limit(self, tmp_path):
+        """When the merged candidate set is larger than ``n_results``, the
+        result must be trimmed back to the requested size — the MCP
+        ``limit`` contract depends on this invariant."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        # 4-doc corpus, n_results=2 → union pool can grow to ~8 candidates,
+        # rerank reorders them, but final list must respect the cap.
+        result = search_memories(_NARRATIVE_QUERY, palace, n_results=2, candidate_strategy="union")
+        assert (
+            len(result["results"]) <= 2
+        ), f"union must trim to n_results=2; got {len(result['results'])} results"
+
+    def test_union_skipped_when_max_distance_set(self, tmp_path):
+        """``max_distance`` is a vector-distance threshold; BM25-only
+        candidates have ``distance=None`` and cannot satisfy it. Union
+        must not silently inject them when a strict threshold is set,
+        otherwise the existing ``max_distance`` guarantee regresses."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        # Sanity: without max_distance, union surfaces the BM25-strong doc.
+        unfiltered = search_memories(
+            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
+        )
+        assert "brand_voice_D4.md" in {h["source_file"] for h in unfiltered["results"]}
+
+        # With a tight max_distance, union must NOT inject BM25-only hits —
+        # every returned hit must have a real (non-None) distance.
+        filtered = search_memories(
+            _NARRATIVE_QUERY,
+            palace,
+            n_results=5,
+            candidate_strategy="union",
+            max_distance=0.5,
+        )
+        for h in filtered["results"]:
+            assert h.get("distance") is not None, (
+                f"union under max_distance must not inject BM25-only "
+                f"(distance=None) candidates; offending hit: {h}"
+            )
+            assert h["distance"] <= 0.5, f"hit violates max_distance=0.5: distance={h['distance']}"
+
+    def test_union_dedup_is_chunk_precise_not_basename(self, tmp_path):
+        """Two files with the same basename in different directories must
+        not collide — union must dedup on full path (or chunk-level key),
+        not on basename alone. Otherwise a BM25-strong README from one
+        directory silently shadows a BM25-strong README from another.
+        """
+        palace = str(tmp_path / "palace")
+        col = get_collection(palace, create=True)
+        col.upsert(
+            ids=["A_README", "B_README", "narrative"],
+            documents=[
+                # Both README files share the basename README.md but live
+                # in different directories. Each contains distinctive
+                # terminology a query might surface via BM25.
+                "PROJECT ALPHA: configuration for the Frobnitz subsystem. "
+                "Set FROBNITZ_TIMEOUT=30 to enable widget rotation.",
+                "PROJECT BETA: configuration for the Wibble subsystem. "
+                "Set WIBBLE_THRESHOLD=0.5 to enable signal smoothing.",
+                "Engineers occasionally chat about how the legacy "
+                "subsystems all need their config knobs tweaked.",
+            ],
+            metadatas=[
+                {"wing": "code", "room": "docs", "source_file": "alpha/README.md"},
+                {"wing": "code", "room": "docs", "source_file": "beta/README.md"},
+                {"wing": "code", "room": "docs", "source_file": "chat.md"},
+            ],
+        )
+        # Query that hits BM25 for BOTH READMEs (distinct vocab from each).
+        # Vector-only might pick the chat doc as semantically "closest";
+        # union must surface both READMEs without basename collision.
+        result = search_memories(
+            "FROBNITZ_TIMEOUT WIBBLE_THRESHOLD configuration",
+            palace,
+            n_results=5,
+            candidate_strategy="union",
+        )
+        sources = [h["source_file"] for h in result["results"]]
+        readme_count = sum(1 for s in sources if s == "README.md")
+        assert readme_count >= 2, (
+            f"union must surface both README.md files from different dirs "
+            f"(basename collision would drop one); got sources={sources}"
+        )
+
+
+class TestHybridRankTolerantOfMissingDistance:
+    """``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only
+    candidates injected by union mode."""
+
+    def test_distance_none_scored_as_zero_vector_sim(self):
+        from mempalace.searcher import _hybrid_rank
+
+        results = [
+            {"text": "alpha beta gamma", "distance": 0.2},  # close vector match
+            {"text": "alpha alpha alpha", "distance": None},  # BM25-only — heavy term repetition
+        ]
+        # Query matches "alpha" heavily; the BM25-only candidate with no
+        # vector signal should still rank competitively on BM25 alone.
+        ranked = _hybrid_rank(results, "alpha")
+        assert all("bm25_score" in r for r in ranked), "rerank should add bm25_score"
+        # Both must survive — neither should crash on distance=None.
+        assert len(ranked) == 2
@@ -5,6 +5,8 @@ Covers: entity CRUD, triple CRUD, temporal queries, invalidation,
 timeline, stats, and edge cases (duplicate triples, ID collisions).
 """

+import pytest
+

 class TestEntityOperations:
    def test_add_entity(self, kg):
@@ -45,6 +47,38 @@ class TestTripleOperations:
        tid2 = kg.add_triple("Alice", "works_at", "Acme")
        assert tid1 != tid2  # new triple since old one was closed

+    def test_add_triple_rejects_inverted_interval(self, kg):
+        # valid_to before valid_from would never satisfy
+        # `valid_from <= as_of AND valid_to >= as_of` — silently invisible
+        # to every query. Reject at write time instead.
+        with pytest.raises(ValueError, match="before valid_from"):
+            kg.add_triple(
+                "Alice",
+                "worked_at",
+                "Acme",
+                valid_from="2026-03-01",
+                valid_to="2026-02-01",
+            )
+
+    def test_add_triple_accepts_equal_dates(self, kg):
+        # Same-day intervals are valid (point-in-time facts).
+        tid = kg.add_triple(
+            "Alice",
+            "joined",
+            "Acme",
+            valid_from="2026-03-15",
+            valid_to="2026-03-15",
+        )
+        assert tid.startswith("t_alice_joined_acme_")
+
+    def test_add_triple_allows_only_one_bound(self, kg):
+        # The guard only fires when BOTH bounds are set.
+        tid1 = kg.add_triple("Alice", "knows", "Bob", valid_from="2026-01-01")
+        assert tid1.startswith("t_alice_knows_bob_")
+        kg.invalidate("Alice", "knows", "Bob", ended="2026-02-01")
+        tid2 = kg.add_triple("Alice", "knew", "Bob", valid_to="2026-03-01")
+        assert tid2.startswith("t_alice_knew_bob_")
+

 class TestQueries:
    def test_query_outgoing(self, seeded_kg):
@@ -655,3 +655,72 @@ def test_memory_stack_status_with_palace(tmp_path):

    assert result["total_drawers"] == 42
    assert result["L0_identity"]["exists"] is True
+
+
+# ── Layer1 / Layer2 None-metadata guards ───────────────────────────────
+#
+# Chroma 1.5.x can return ``None`` inside the ``metadatas`` / ``documents``
+# lists for partially-flushed rows. The Layer1.generate() and
+# Layer2.retrieve() loops previously called ``meta.get(...)`` without
+# coercing, raising ``AttributeError: 'NoneType' object has no attribute
+# 'get'`` and blowing up the whole wake-up render. These tests guard that
+# the loops tolerate the None entries and render the rest of the result.
+
+
+def test_layer1_handles_none_metadata():
+    """Layer1.generate tolerates None entries in the metadatas list."""
+    docs = ["important memory", "another memory"]
+    metas = [{"room": "decisions", "source_file": "a.txt"}, None]
+    mock_col = _mock_chromadb_for_layer(docs, metas)
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer1(palace_path="/fake")
+        # Should not raise AttributeError on the None entry.
+        result = layer.generate()
+
+    assert "ESSENTIAL STORY" in result
+    assert "important memory" in result
+
+
+def test_layer1_handles_none_document():
+    """Layer1.generate tolerates None entries in the documents list."""
+    docs = ["first doc", None]
+    metas = [
+        {"room": "r", "source_file": "a.txt"},
+        {"room": "r", "source_file": "b.txt"},
+    ]
+    mock_col = _mock_chromadb_for_layer(docs, metas)
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer1(palace_path="/fake")
+        result = layer.generate()
+
+    assert result  # Render succeeded despite the None document.
+
+
+def test_layer2_handles_none_metadata():
+    """Layer2.retrieve tolerates None entries in the metadatas list."""
+    mock_col = MagicMock()
+    mock_col.get.return_value = {
+        "documents": ["first doc", "second doc"],
+        "metadatas": [{"room": "r", "source_file": "a.txt"}, None],
+    }
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer2(palace_path="/fake")
+        # Should not raise AttributeError on the None entry.
+        result = layer.retrieve()
+
+    assert "L2 — ON-DEMAND" in result
@@ -8,7 +8,9 @@ via monkeypatch to avoid touching real data.

 from datetime import datetime
 import json
+import os
 import sys
+from unittest.mock import MagicMock

 import pytest

@@ -18,7 +20,7 @@ def _patch_mcp_server(monkeypatch, config, kg):
    from mempalace import mcp_server

    monkeypatch.setattr(mcp_server, "_config", config)
-    monkeypatch.setattr(mcp_server, "_kg", kg)
+    monkeypatch.setattr(mcp_server, "_get_kg", lambda: kg)


 def _get_collection(palace_path, create=False):
@@ -146,6 +148,20 @@ class TestHandleRequest:
        )
        assert resp["error"]["code"] == -32601

+    def test_tools_call_missing_params(self):
+        from mempalace.mcp_server import handle_request
+
+        for bad_params in [None, {}, {"arguments": {}}]:
+            resp = handle_request(
+                {
+                    "method": "tools/call",
+                    "id": 15,
+                    "params": bad_params,
+                }
+            )
+            assert resp["error"]["code"] == -32602
+            assert "Invalid params" in resp["error"]["message"]
+
    def test_unknown_method(self):
        from mempalace.mcp_server import handle_request

@@ -188,6 +204,17 @@ class TestHandleRequest:
        resp = handle_request({"method": None, "id": 99, "params": {}})
        assert resp["error"]["code"] == -32601

+    @pytest.mark.parametrize("payload", [None, [], "plain", 42, True])
+    def test_handle_request_invalid_payload_returns_jsonrpc_error(self, payload):
+        from mempalace.mcp_server import handle_request
+
+        resp = handle_request(payload)
+        assert resp == {
+            "jsonrpc": "2.0",
+            "id": None,
+            "error": {"code": -32600, "message": "Invalid Request"},
+        }
+
    def test_tools_call_dispatches(self, monkeypatch, config, palace_path, seeded_kg):
        _patch_mcp_server(monkeypatch, config, seeded_kg)
        from mempalace.mcp_server import handle_request
@@ -495,6 +522,41 @@ class TestWriteTools:
        result = tool_delete_drawer("nonexistent_drawer")
        assert result["success"] is False

+    def test_check_duplicate_handles_none_metadata(self, monkeypatch, config, kg):
+        """tool_check_duplicate must tolerate None entries in the result lists
+        that ChromaDB 1.5.x returns for partially-flushed rows.
+
+        Previously ``meta = results["metadatas"][0][i]`` was unguarded and
+        raised ``AttributeError: 'NoneType' object has no attribute 'get'``
+        the moment the first matching drawer came back with None metadata —
+        surfacing to the MCP client as the uninformative
+        ``"Duplicate check failed"`` because the broad ``except Exception``
+        wrapper swallows the real cause.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        mock_col = MagicMock()
+        mock_col.query.return_value = {
+            "ids": [["d1", "d2"]],
+            "distances": [[0.05, 0.05]],
+            "metadatas": [[{"wing": "w", "room": "r"}, None]],
+            "documents": [["first doc", None]],
+        }
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: mock_col)
+
+        result = mcp_server.tool_check_duplicate("any content", threshold=0.5)
+
+        # Both entries land in matches (above threshold), None ones rendered
+        # with sentinel values rather than crashing the whole response.
+        assert result.get("is_duplicate") is True
+        assert len(result["matches"]) == 2
+        # The None-metadata entry falls back to sentinels.
+        none_entry = result["matches"][1]
+        assert none_entry["wing"] == "?"
+        assert none_entry["room"] == "?"
+        assert none_entry["content"] == ""
+
    def test_check_duplicate(self, monkeypatch, config, palace_path, seeded_collection, kg):
        _patch_mcp_server(monkeypatch, config, kg)
        from mempalace.mcp_server import tool_check_duplicate
@@ -531,6 +593,45 @@ class TestWriteTools:
        result = tool_get_drawer("nonexistent_drawer")
        assert "error" in result

+    def test_get_drawer_does_not_leak_absolute_source_file_path(
+        self, monkeypatch, config, palace_path, collection, kg
+    ):
+        """tool_get_drawer must not expose the absolute filesystem path
+        that the miners write into ``source_file``. Same threat class as
+        the palace_path leak in mempalace_status: in nested-agent or
+        multi-server MCP topologies the client is a separate trust
+        domain, and the directory layout of the host has no documented
+        client-side use. Basename is enough for citation."""
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        secret_dir = "/private/home/alice/secret-research/2026"
+        absolute_source = f"{secret_dir}/notes.md"
+        collection.add(
+            ids=["drawer_leak_probe"],
+            documents=["verbatim drawer body for leak probe"],
+            metadatas=[
+                {
+                    "wing": "research",
+                    "room": "notes",
+                    "source_file": absolute_source,
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-03T00:00:00",
+                }
+            ],
+        )
+
+        from mempalace.mcp_server import tool_get_drawer
+
+        result = tool_get_drawer("drawer_leak_probe")
+        assert result["drawer_id"] == "drawer_leak_probe"
+        assert result["metadata"]["source_file"] == "notes.md"
+        # Defense-in-depth: no field anywhere in the response should
+        # contain the absolute path or its parent directory.
+        serialized = json.dumps(result)
+        assert absolute_source not in serialized
+        assert secret_dir not in serialized
+
    def test_list_drawers(self, monkeypatch, config, palace_path, seeded_collection, kg):
        _patch_mcp_server(monkeypatch, config, kg)
        from mempalace.mcp_server import tool_list_drawers
@@ -650,6 +751,90 @@ class TestKGTools:
            ended="2026-03-01",
        )
        assert result["success"] is True
+        # Regression #1314: response must echo the actual ended date,
+        # not silently drop it and return the literal string "today".
+        assert result["ended"] == "2026-03-01"
+
+    def test_kg_add_forwards_valid_to(self, monkeypatch, config, palace_path, kg):
+        """Regression #1314 case 1: valid_to must round-trip through kg_add."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace.mcp_server import tool_kg_add
+
+        result = tool_kg_add(
+            subject="_test_temporal",
+            predicate="had_value",
+            object="probe",
+            valid_from="2026-01-01",
+            valid_to="2026-04-28",
+        )
+        assert result["success"] is True
+
+        facts = kg.query_entity("_test_temporal")
+        assert len(facts) == 1
+        assert facts[0]["valid_from"] == "2026-01-01"
+        assert facts[0]["valid_to"] == "2026-04-28"
+        # An already-ended fact must not be reported as still current.
+        assert facts[0]["current"] is False
+
+    def test_kg_add_forwards_source_provenance(self, monkeypatch, config, palace_path, kg):
+        """Regression #1314 case 3: source_file / source_drawer_id reach storage."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace.mcp_server import tool_kg_add
+
+        result = tool_kg_add(
+            subject="operating-verb",
+            predicate="candidate",
+            object="husbandry",
+            valid_from="2026-04-28",
+            source_closet="closet-42",
+            source_file="docs/decisions.md",
+            source_drawer_id="drawer_abc123",
+        )
+        assert result["success"] is True
+
+        triple_id = result["triple_id"]
+        # Read raw row to verify all provenance columns persisted.
+        with kg._lock:
+            row = (
+                kg._conn()
+                .execute(
+                    "SELECT source_closet, source_file, source_drawer_id FROM triples WHERE id = ?",
+                    (triple_id,),
+                )
+                .fetchone()
+            )
+        assert row is not None
+        assert row["source_closet"] == "closet-42"
+        assert row["source_file"] == "docs/decisions.md"
+        assert row["source_drawer_id"] == "drawer_abc123"
+
+    def test_kg_invalidate_returns_actual_ended_date(
+        self, monkeypatch, config, palace_path, seeded_kg
+    ):
+        """Regression #1314 case 2: response reports the resolved date, not 'today'."""
+        from datetime import date as _date
+
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_invalidate
+
+        # Caller-supplied date round-trips into the response.
+        explicit = tool_kg_invalidate(
+            subject="Max",
+            predicate="does",
+            object="swimming",
+            ended="2026-04-28",
+        )
+        assert explicit["ended"] == "2026-04-28"
+
+        # Caller-omitted date resolves to today's ISO date — never the
+        # literal string "today" the buggy implementation used to return.
+        implicit = tool_kg_invalidate(
+            subject="Max",
+            predicate="loves",
+            object="Chess",
+        )
+        assert implicit["ended"] != "today"
+        assert implicit["ended"] == _date.today().isoformat()

    def test_kg_timeline(self, monkeypatch, config, palace_path, seeded_kg):
        _patch_mcp_server(monkeypatch, config, seeded_kg)
@@ -665,6 +850,59 @@ class TestKGTools:
        result = tool_kg_stats()
        assert result["entities"] >= 4

+    # --- Date validation at the MCP boundary (issue #1164) ---
+
+    def test_kg_add_rejects_invalid_valid_from(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace.mcp_server import tool_kg_add
+
+        result = tool_kg_add(
+            subject="Alice",
+            predicate="likes",
+            object="coffee",
+            valid_from="Jan 2025",
+        )
+        assert result["success"] is False
+        assert "valid_from" in result["error"]
+        assert "ISO-8601" in result["error"]
+
+    def test_kg_query_rejects_invalid_as_of(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_query
+
+        result = tool_kg_query(entity="Max", as_of="March 2026")
+        assert "error" in result
+        assert "as_of" in result["error"]
+
+    def test_kg_invalidate_rejects_invalid_ended(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_invalidate
+
+        result = tool_kg_invalidate(
+            subject="Max",
+            predicate="does",
+            object="chess",
+            ended="yesterday",
+        )
+        assert result["success"] is False
+        assert "ended" in result["error"]
+
+    def test_kg_query_rejects_partial_iso_dates(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_query
+
+        # Partial ISO dates are rejected: KG queries compare TEXT dates
+        # lexicographically, so "2026-01-01" <= "2026" is False, which
+        # silently excludes facts. Reject at the boundary — only YYYY-MM-DD
+        # produces correct results.
+        for value in ("2026", "2026-03"):
+            result = tool_kg_query(entity="Max", as_of=value)
+            assert "error" in result, f"accepted partial date {value!r}: {result}"
+
+        # Full ISO-8601 dates still pass.
+        result = tool_kg_query(entity="Max", as_of="2026-03-15")
+        assert "error" not in result, f"rejected valid date: {result}"
+

 # ── Diary Tools ─────────────────────────────────────────────────────────

@@ -682,7 +920,8 @@ class TestDiaryTools:
            topic="architecture",
        )
        assert w["success"] is True
-        assert w["agent"] == "TestAgent"
+        # agent_name is normalized to lowercase on write (#1243).
+        assert w["agent"] == "testagent"

        r = tool_diary_read(agent_name="TestAgent")
        assert r["total"] == 1
@@ -774,6 +1013,50 @@ class TestDiaryTools:
        assert r_scoped["total"] == 1
        assert r_scoped["entries"][0]["content"] == "project-wing entry"

+    def test_diary_read_case_insensitive_agent(self, monkeypatch, config, palace_path, kg):
+        """Regression for #1243: diary_read must be case-insensitive over
+        agent_name. Writing as "Claude" and reading as "claude" (or vice
+        versa) must surface the same entries — sanitize_name preserved
+        case, which silently dropped reads when the agent name's casing
+        differed from the write."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace.mcp_server import tool_diary_read, tool_diary_write
+
+        # Write as "Claude" → read as "claude" should match.
+        w1 = tool_diary_write(
+            agent_name="Claude",
+            entry="entry written as Claude",
+            topic="general",
+        )
+        assert w1["success"]
+
+        r1 = tool_diary_read(agent_name="claude")
+        assert "entries" in r1, r1
+        contents1 = {e["content"] for e in r1["entries"]}
+        assert "entry written as Claude" in contents1
+
+        # Write as "CLAUDE" → read as "Claude" should also match the
+        # same agent. After normalization both writes target the same
+        # lowercase agent identity, so both entries are returned.
+        w2 = tool_diary_write(
+            agent_name="CLAUDE",
+            entry="entry written as CLAUDE",
+            topic="general",
+        )
+        assert w2["success"]
+
+        r2 = tool_diary_read(agent_name="Claude")
+        contents2 = {e["content"] for e in r2["entries"]}
+        assert "entry written as Claude" in contents2
+        assert "entry written as CLAUDE" in contents2
+
+        # The stored agent metadata is the lowercase form, and the
+        # default wing is derived from that lowercase form too.
+        assert w1["agent"] == "claude"
+        assert w2["agent"] == "claude"
+

 # ── Cache Invalidation (inode/mtime) ──────────────────────────────────

@@ -919,3 +1202,352 @@ class TestCacheInvalidation:
        col2 = mcp_server._get_collection(create=True)
        assert col2 is not None
        assert calls == [], f"get_or_create_collection was called: {calls}"
+
+    def test_get_collection_passes_embedding_function(self, monkeypatch, config, palace_path, kg):
+        """Regression for #1299.
+
+        ``mcp_server._get_collection`` must pass ``embedding_function=`` into
+        both ``client.get_collection`` and ``client.create_collection``,
+        mirroring ``ChromaBackend.get_collection``. Without it, ChromaDB 1.x
+        falls back to its built-in ``DefaultEmbeddingFunction`` (whose lazy
+        ONNX provider selection has SIGSEGV'd on python 3.14 + Apple Silicon),
+        and writers/readers can disagree with the miner about which EF is
+        bound to the collection. The miner / Stop hook ingest path routes
+        through ``ChromaBackend.get_collection`` which does this correctly;
+        the MCP server must match.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        client = mcp_server._get_client()
+        client_cls = type(client)
+        captured: dict[str, list[dict]] = {"get": [], "create": []}
+        real_get = client_cls.get_collection
+        real_create = client_cls.create_collection
+
+        def _spy_get(self, name, **kwargs):
+            captured["get"].append(dict(kwargs))
+            return real_get(self, name, **kwargs)
+
+        def _spy_create(self, name, **kwargs):
+            captured["create"].append(dict(kwargs))
+            return real_create(self, name, **kwargs)
+
+        monkeypatch.setattr(client_cls, "get_collection", _spy_get)
+        monkeypatch.setattr(client_cls, "create_collection", _spy_create)
+        mcp_server._collection_cache = None
+
+        col = mcp_server._get_collection(create=True)
+        assert col is not None
+
+        all_calls = captured["get"] + captured["create"]
+        assert all_calls, "expected get_collection or create_collection to be called"
+        for kwargs in all_calls:
+            assert (
+                "embedding_function" in kwargs
+            ), f"missing embedding_function= in chromadb call: {kwargs}"
+            assert kwargs["embedding_function"] is not None
+
+        # Same expectation on the create=False (cache-miss) reopen path.
+        mcp_server._collection_cache = None
+        captured["get"].clear()
+        captured["create"].clear()
+        col2 = mcp_server._get_collection()
+        assert col2 is not None
+        assert captured["get"], "expected get_collection on cache-miss reopen"
+        for kwargs in captured["get"]:
+            assert "embedding_function" in kwargs
+            assert kwargs["embedding_function"] is not None
+
+    def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg):
+        """Regression: a transient failure inside _get_collection must trigger
+        one retry after clearing the client/collection caches, not silently
+        return None.
+
+        Before this fix, a stale chromadb handle (e.g. the rust bindings
+        invalidating after an out-of-band write) would raise inside the
+        single ``try`` block, get swallowed by ``except Exception: return
+        None``, and every subsequent tool call would hit the same poisoned
+        cache returning None. The retry forces ``_get_client()`` to rebuild
+        the client (which re-runs ``quarantine_stale_hnsw`` per #1322), so
+        the second attempt heals the common stale-handle case.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace import mcp_server
+
+        # Force a cold cache so the first call goes through the open path.
+        mcp_server._client_cache = None
+        mcp_server._collection_cache = None
+
+        real_get_client = mcp_server._get_client
+        attempts = {"count": 0}
+
+        def flaky_get_client():
+            attempts["count"] += 1
+            if attempts["count"] == 1:
+                raise RuntimeError("simulated transient chromadb failure")
+            return real_get_client()
+
+        monkeypatch.setattr(mcp_server, "_get_client", flaky_get_client)
+
+        col = mcp_server._get_collection()
+
+        # Both attempts ran and the second succeeded.
+        assert attempts["count"] == 2
+        assert col is not None
+
+    def test_get_collection_returns_none_after_two_failures(
+        self, monkeypatch, config, palace_path, kg
+    ):
+        """If both attempts fail, return None (matches the prior contract for
+        permanent failures — only the transient case is now self-healing)."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace import mcp_server
+
+        mcp_server._client_cache = None
+        mcp_server._collection_cache = None
+
+        attempts = {"count": 0}
+
+        def always_fails():
+            attempts["count"] += 1
+            raise RuntimeError("permanent chromadb failure")
+
+        monkeypatch.setattr(mcp_server, "_get_client", always_fails)
+
+        col = mcp_server._get_collection()
+
+        assert attempts["count"] == 2
+        assert col is None
+
+
+class TestKGLazyCache:
+    """Lazy per-path KnowledgeGraph cache (issue #1136)."""
+
+    def test_lazy_init_no_import_side_effect(self, tmp_path):
+        """Importing mcp_server must not create knowledge_graph.sqlite3.
+
+        Runs in a fresh subprocess with HOME pointed at tmp_path so the
+        assertion targets a clean filesystem, independent of conftest's
+        session-level HOME patch.
+        """
+        import subprocess
+        import sys
+
+        kg_file = tmp_path / ".mempalace" / "knowledge_graph.sqlite3"
+        env = {k: v for k, v in os.environ.items() if not k.startswith("MEMPAL")}
+        env["HOME"] = str(tmp_path)
+        env["USERPROFILE"] = str(tmp_path)
+        result = subprocess.run(
+            [sys.executable, "-c", "import mempalace.mcp_server"],
+            env=env,
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"import failed: {result.stderr}"
+        assert not kg_file.exists(), f"import created sqlite file at {kg_file} as a side effect"
+
+    def test_get_kg_returns_same_instance(self, tmp_path, monkeypatch):
+        """Two calls with the same resolved path return the same KG."""
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_path))
+
+        kg1 = mcp_server._get_kg()
+        kg2 = mcp_server._get_kg()
+        assert kg1 is kg2
+        assert len(mcp_server._kg_by_path) == 1
+
+    def test_get_kg_different_paths_different_instances(self, tmp_path, monkeypatch):
+        """Different palace paths map to different KG instances."""
+        from mempalace import mcp_server
+
+        tmp_a = tmp_path / "a"
+        tmp_b = tmp_path / "b"
+        tmp_a.mkdir()
+        tmp_b.mkdir()
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        kg_a = mcp_server._get_kg()
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_b))
+        kg_b = mcp_server._get_kg()
+
+        assert kg_a is not kg_b
+        assert len(mcp_server._kg_by_path) == 2
+
+    def test_multi_tenant_env_switch(self, tmp_path, monkeypatch):
+        """The issue #1136 acceptance scenario.
+
+        Rotating MEMPALACE_PALACE_PATH between MCP tool calls must route
+        each call to the correct tenant's KG sqlite file.
+        """
+        from mempalace import mcp_server
+
+        tmp_a = tmp_path / "tenant_a"
+        tmp_b = tmp_path / "tenant_b"
+        tmp_a.mkdir()
+        tmp_b.mkdir()
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        add_result = mcp_server.tool_kg_add(
+            subject="alice_secret",
+            predicate="owns",
+            object="repo_a",
+        )
+        assert add_result.get("success") is True, add_result
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_b))
+        query_b = mcp_server.tool_kg_query(entity="alice_secret")
+        assert query_b.get("count", 0) == 0, f"tenant B leaked tenant A's fact: {query_b}"
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        query_a = mcp_server.tool_kg_query(entity="alice_secret")
+        assert query_a.get("count", 0) >= 1, f"tenant A lost its own fact: {query_a}"
+
+    def test_cache_thread_safe(self, tmp_path, monkeypatch):
+        """Concurrent _get_kg() for the same path yields one instance."""
+        import concurrent.futures
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_path))
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
+            results = list(pool.map(lambda _: mcp_server._get_kg(), range(16)))
+
+        ids = {id(kg) for kg in results}
+        assert len(ids) == 1, f"expected 1 unique instance, got {len(ids)}"
+        assert len(mcp_server._kg_by_path) == 1
+
+    def test_tool_reconnect_drains_kg_cache(self, monkeypatch):
+        """``tool_reconnect`` must close cached KG instances and clear the dict.
+
+        Without this, an external replacement of ``knowledge_graph.sqlite3``
+        leaves the server pinned to a stale ``sqlite3.Connection``.
+        """
+        from mempalace import mcp_server
+
+        class _FakeKG:
+            def __init__(self):
+                self.closed = False
+
+            def close(self):
+                self.closed = True
+
+        fake_a = _FakeKG()
+        fake_b = _FakeKG()
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {"/a": fake_a, "/b": fake_b})
+        # Bypass real ChromaDB so the test isolates KG-cache behaviour.
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: None)
+
+        mcp_server.tool_reconnect()
+
+        assert fake_a.closed is True
+        assert fake_b.closed is True
+        assert mcp_server._kg_by_path == {}
+
+    def test_tool_reconnect_swallows_kg_close_errors(self, monkeypatch):
+        """A failing ``close()`` on one cached KG must not block cache clearing."""
+        from mempalace import mcp_server
+
+        class _BoomKG:
+            def close(self):
+                raise RuntimeError("boom")
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {"/a": _BoomKG()})
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: None)
+
+        mcp_server.tool_reconnect()
+
+        assert mcp_server._kg_by_path == {}
+
+    def test_call_kg_retries_after_concurrent_close(self, monkeypatch):
+        """A KG closed mid-handler must trigger a one-shot retry with a fresh
+        instance — not surface a -32000 to the MCP client."""
+        import sqlite3 as _sqlite3
+
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        class _ClosedKG:
+            def query_entity(self, entity, **kwargs):
+                raise _sqlite3.ProgrammingError("Cannot operate on a closed database")
+
+        class _FreshKG:
+            def query_entity(self, entity, **kwargs):
+                return [{"entity": entity}]
+
+        cache = {os.path.abspath(path): _ClosedKG()}
+        monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
+
+        # Second _get_kg() call (after the cache eviction) constructs a new
+        # KG. Patch the constructor so we don't open a real sqlite file.
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FreshKG())
+
+        result = mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert result == [{"entity": "Alice"}]
+        # The closed instance must be evicted; the fresh one must be cached.
+        assert isinstance(cache[os.path.abspath(path)], _FreshKG)
+
+    def test_call_kg_does_not_retry_on_other_errors(self, monkeypatch):
+        """Non-ProgrammingError exceptions must propagate without retry —
+        we don't want the retry guard masking real bugs."""
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        calls = {"count": 0}
+
+        class _FailingKG:
+            def query_entity(self, entity, **kwargs):
+                calls["count"] += 1
+                raise ValueError("bad input")
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {os.path.abspath(path): _FailingKG()})
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FailingKG())
+
+        with pytest.raises(ValueError, match="bad input"):
+            mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert calls["count"] == 1, "non-ProgrammingError must not trigger retry"
+
+    def test_call_kg_gives_up_after_one_retry(self, monkeypatch):
+        """If the second attempt also hits a closed DB, give up rather than
+        loop forever — a sustained close-stream is a different bug."""
+        import sqlite3 as _sqlite3
+
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        calls = {"count": 0}
+
+        class _AlwaysClosedKG:
+            def query_entity(self, entity, **kwargs):
+                calls["count"] += 1
+                raise _sqlite3.ProgrammingError("closed again")
+
+        cache = {}
+        monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _AlwaysClosedKG())
+
+        with pytest.raises(_sqlite3.ProgrammingError):
+            mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert calls["count"] == 2, "expected exactly one retry beyond the initial attempt"
@@ -135,19 +135,77 @@ def test_different_palaces_dont_conflict(tmp_path, monkeypatch):


 def test_palace_path_is_normalized(tmp_path, monkeypatch):
-    """Relative and absolute forms of the same path must use the same lock."""
+    """Relative and absolute forms of the same path must use the same lock.
+
+    Cross-process variant: a child holds the absolute form, a relative form
+    in the parent must hash to the same lock key and raise
+    ``MineAlreadyRunning``. (The same-thread case is now a re-entrant
+    pass-through by design — see ``test_reentrant_same_thread_passes_through``
+    — so we exercise the normalization invariant across a process boundary
+    where re-entrance does not apply.)
+    """
    monkeypatch.setenv("HOME", str(tmp_path))
    monkeypatch.chdir(tmp_path)
    os.makedirs(tmp_path / "palace", exist_ok=True)
    absolute = str(tmp_path / "palace")
-    relative = "palace"
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")

-    # Hold the lock with the absolute form; attempting to re-acquire with
-    # the relative form (which resolves to the same absolute path) must fail.
-    with mine_palace_lock(absolute):
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(absolute, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock in time"
+
+        # Parent holds CWD = tmp_path so "palace" is the same on-disk dir as
+        # the absolute form. The lock key is sha256(realpath+normcase) so the
+        # two forms must collide.
        with pytest.raises(MineAlreadyRunning):
-            with mine_palace_lock(relative):
+            with mine_palace_lock("palace"):
                pytest.fail("normalized path collision should have raised")
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_reentrant_same_thread_passes_through(tmp_path, monkeypatch):
+    """Same thread re-acquiring the same palace lock must not deadlock or raise.
+
+    This is the invariant that makes ``ChromaCollection`` write methods (which
+    take ``mine_palace_lock`` for MCP/direct-writer protection) compose with
+    ``miner.mine()`` (which already holds the lock for the entire mine
+    pipeline). Without the per-thread re-entrant guard the inner acquire
+    would self-deadlock on the outer flock.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    with mine_palace_lock(palace):
+        # Re-enter from the same thread — must yield without raising or hanging.
+        with mine_palace_lock(palace):
+            pass
+        # After the inner exits, the outer is still held: confirm via a
+        # subprocess that tries to acquire and reports back.
+        ctx = _get_mp_context()
+        result_q = ctx.Queue()
+        child = ctx.Process(target=_try_acquire_expect_busy, args=(palace, result_q))
+        child.start()
+        child.join(timeout=5)
+        assert (
+            result_q.get(timeout=1) == "busy"
+        ), "outer lock should still be held by parent after inner re-entrant exit"
+
+
+def _try_acquire_expect_busy(palace_path, result_q):
+    """Helper: try to acquire, push 'busy' (raised) or 'free' (acquired) into queue."""
+    try:
+        with mine_palace_lock(palace_path):
+            result_q.put("free")
+    except MineAlreadyRunning:
+        result_q.put("busy")


 def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):
@@ -120,6 +120,65 @@ class TestSearchMemories:
        assert none_hit["wing"] == "unknown"
        assert none_hit["room"] == "unknown"

+    def test_effective_distance_clamped_to_valid_cosine_range(self):
+        """A strong closet boost (up to 0.40) applied to a low-distance drawer
+        can drive ``dist - boost`` negative. That violates the cosine-distance
+        invariant ``[0, 2]``: the API returns ``similarity > 1.0`` and the
+        internal ``_sort_key`` sinks below ordinary positive distances,
+        inverting the ranking so the best hybrid matches sort last.
+
+        With the clamp, ``effective_distance`` stays in ``[0, 2]``,
+        ``similarity`` stays in ``[0, 1]``, and the sort order is stable.
+        """
+        # Drawer a.md gets a tiny base distance (0.08) — nearly exact match.
+        # Drawer b.md gets a larger base distance (0.35).
+        drawers_col = MagicMock()
+        drawers_col.query.return_value = {
+            "documents": [["doc-a", "doc-b"]],
+            "metadatas": [
+                [
+                    {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
+                    {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
+                ]
+            ],
+            "distances": [[0.08, 0.35]],
+            "ids": [["d-a", "d-b"]],
+        }
+        # A strong closet at rank 0 points at a.md → boost = 0.40,
+        # which exceeds a.md's base distance and would go negative without
+        # the clamp. No closet for b.md.
+        closets_col = MagicMock()
+        closets_col.query.return_value = {
+            "documents": [["closet-preview-a"]],
+            "metadatas": [[{"source_file": "a.md"}]],
+            "distances": [[0.2]],  # within CLOSET_DISTANCE_CAP (1.5)
+            "ids": [["c-a"]],
+        }
+
+        with (
+            patch("mempalace.searcher.get_collection", return_value=drawers_col),
+            patch("mempalace.searcher.get_closets_collection", return_value=closets_col),
+        ):
+            result = search_memories("query", "/fake/path", n_results=5)
+
+        hits = result["results"]
+        assert hits, "should return results"
+
+        # Invariants on every hit.
+        for h in hits:
+            assert (
+                0.0 <= h["similarity"] <= 1.0
+            ), f"similarity out of range: {h['similarity']} for {h['source_file']}"
+            assert 0.0 <= h["effective_distance"] <= 2.0, (
+                f"effective_distance out of range: {h['effective_distance']} "
+                f"for {h['source_file']}"
+            )
+
+        # With the clamp, the closet-boosted a.md still ranks ahead of b.md —
+        # the boost still wins, but it no longer flips the ranking.
+        assert hits[0]["source_file"] == "a.md"
+        assert hits[0]["matched_via"] == "drawer+closet"
+

 # ── BM25 internals: None / empty document safety ─────────────────────