feat(normalize): auto-rebuild stale drawers via NORMALIZE_VERSION schema gate

Without this, the strip_noise improvement only helps new mines. Every
user who had already mined Claude Code JSONL sessions would keep their
noise-polluted drawers forever, because convo_miner's file_already_mined
skip short-circuits before re-processing.

Adds a versioned schema gate so upgrades propagate silently:

- palace.NORMALIZE_VERSION=2 — bumped when the normalization pipeline
  changes shape (this PR's strip_noise is the v1→v2 bump).
- file_already_mined now returns False if the stored normalize_version
  is missing or less than current, triggering a rebuild on next mine.
- Both miners stamp drawers with the current normalize_version.
- convo_miner now purges stale drawers before inserting fresh chunks
  (mirrors miner.py's existing delete+insert), extracted into
  _file_convo_chunks helper to keep mine_convos under ruff's C901 limit.

User experience: upgrade mempalace, run `mempalace mine` as usual, old
noisy drawers get silently replaced with clean ones. No erase needed,
no "you need to rebuild" changelog footgun.

Tests:
- test_file_already_mined_returns_false_for_stale_normalize_version —
  pins the version gate contract for missing/v1/current.
- test_add_drawer_stamps_normalize_version — fresh project-miner drawers
  carry the field.
- test_mine_convos_rebuilds_stale_drawers_after_schema_bump — end-to-end
  proof that a pre-v2 palace gets silently cleaned on next mine, with
  orphan drawers purged and NOT skipped.

Existing test_file_already_mined_check_mtime updated to include the
new field; all other tests unaffected.
This commit is contained in:
Igor Lins e Silva
2026-04-13 16:20:55 -03:00
parent ca2598a9f6
commit 7e5eeda9a5
5 changed files with 253 additions and 38 deletions
+83
View File
@@ -75,3 +75,86 @@ def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys):
assert "Files skipped (already filed): 1" in out2
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
def test_mine_convos_rebuilds_stale_drawers_after_schema_bump(capsys):
    """Re-mining replaces drawers stamped with an older normalize_version.

    Scenario:
      1. Mine a small conversation file and check every drawer is stamped
         with the current ``NORMALIZE_VERSION``.
      2. Downgrade those drawers in place (version 1, placeholder content)
         and add one extra orphan drawer for the same source file.
      3. Mine again and check the file is NOT reported as skipped, the
         orphan and the placeholder content are gone, and the refiled
         drawers carry the current version.

    This is what lets a normalization upgrade reach existing corpora: the
    user just runs ``mempalace mine`` again, with no manual erase.
    """
    from mempalace.palace import NORMALIZE_VERSION

    tmpdir = tempfile.mkdtemp()
    try:
        convo_path = Path(tmpdir) / "chat.txt"
        convo_path.write_text(
            "> What is memory?\nMemory is persistence.\n\n"
            "> Why does it matter?\nIt enables continuity.\n\n"
            "> How do we build it?\nWith structured storage.\n"
        )
        palace_path = os.path.join(tmpdir, "palace")

        # First mine: drawers should be stamped with the current version.
        mine_convos(tmpdir, palace_path, wing="test")
        capsys.readouterr()  # discard first-pass output

        client = chromadb.PersistentClient(path=palace_path)
        col = client.get_collection("mempalace_drawers")
        # Drawers are looked up by the resolved absolute path of the source.
        resolved = str(Path(tmpdir).resolve() / "chat.txt")
        first_pass = col.get(where={"source_file": resolved})
        first_ids = set(first_pass["ids"])
        assert first_ids, "first mine should produce drawers"
        for meta in first_pass["metadatas"]:
            assert meta.get("normalize_version") == NORMALIZE_VERSION

        # Simulate pre-v2 drawers: same ids, older version stamp, and
        # recognizable placeholder content so leftovers are detectable.
        stale_metas = [
            {**meta, "normalize_version": 1} for meta in first_pass["metadatas"]
        ]
        col.update(
            ids=list(first_pass["ids"]),
            documents=["STALE NOISE"] * len(first_pass["ids"]),
            metadatas=stale_metas,
        )

        # An extra drawer for the same source that a fresh mine would never
        # regenerate; it must be purged rather than left behind.
        col.add(
            ids=["orphan_drawer"],
            documents=["OLD ORPHAN"],
            metadatas=[
                {
                    "wing": "test",
                    "room": "default",
                    "source_file": resolved,
                    "chunk_index": 999,
                    "normalize_version": 1,
                }
            ],
        )
        # Drop our references before the miner opens the palace again.
        del col, client

        # Second mine: the version gate should trigger a rebuild.
        mine_convos(tmpdir, palace_path, wing="test")
        out = capsys.readouterr().out
        assert (
            "Files skipped (already filed): 0" in out
        ), "stale drawers should force a rebuild, not a skip"

        client = chromadb.PersistentClient(path=palace_path)
        col = client.get_collection("mempalace_drawers")
        rebuilt = col.get(where={"source_file": resolved})

        # Guard against a vacuous pass: the checks below iterate over the
        # rebuilt drawers and would all succeed on an empty result, i.e. if
        # the miner purged the stale drawers but refiled nothing.
        assert rebuilt["ids"], "rebuild should refile drawers, not just purge them"

        # Orphan is gone
        assert "orphan_drawer" not in rebuilt["ids"]
        # No stale content survived
        assert all("STALE NOISE" not in d for d in rebuilt["documents"])
        assert all("OLD ORPHAN" not in d for d in rebuilt["documents"])
        # All rebuilt drawers carry the current version
        for meta in rebuilt["metadatas"]:
            assert meta.get("normalize_version") == NORMALIZE_VERSION
        del col, client
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)