feat(normalize): auto-rebuild stale drawers via NORMALIZE_VERSION schema gate

Without this, the strip_noise improvement only helps new mines. Every
user who had already mined Claude Code JSONL sessions would keep their
noise-polluted drawers forever, because convo_miner's file_already_mined
skip short-circuits before re-processing.

Adds a versioned schema gate so upgrades propagate silently:

- palace.NORMALIZE_VERSION=2 — bumped when the normalization pipeline
  changes shape (this PR's strip_noise is the v1→v2 bump).
- file_already_mined now returns False if the stored normalize_version
  is missing or less than current, triggering a rebuild on next mine.
- Both miners stamp drawers with the current normalize_version.
- convo_miner now purges stale drawers before inserting fresh chunks
  (mirrors miner.py's existing delete+insert), extracted into
  _file_convo_chunks helper to keep mine_convos under ruff's C901 limit.

User experience: upgrade mempalace, run `mempalace mine` as usual, old
noisy drawers get silently replaced with clean ones. No erase needed,
no "you need to rebuild" changelog footgun.

Tests:
- test_file_already_mined_returns_false_for_stale_normalize_version —
  pins the version gate contract for missing/v1/current.
- test_add_drawer_stamps_normalize_version — fresh project-miner drawers
  carry the field.
- test_mine_convos_rebuilds_stale_drawers_after_schema_bump — end-to-end
  proof that a pre-v2 palace gets silently cleaned on next mine, with
  orphan drawers purged and NOT skipped.

Existing test_file_already_mined_check_mtime updated to include the
new field; all other tests unaffected.
This commit is contained in:
Igor Lins e Silva
2026-04-13 16:20:55 -03:00
parent ca2598a9f6
commit 7e5eeda9a5
5 changed files with 253 additions and 38 deletions
+90 -4
View File
@@ -7,7 +7,7 @@ import chromadb
import yaml
from mempalace.miner import mine, scan_project, status
from mempalace.palace import file_already_mined
from mempalace.palace import NORMALIZE_VERSION, file_already_mined
def write_file(path: Path, content: str):
@@ -227,11 +227,17 @@ def test_file_already_mined_check_mtime():
assert file_already_mined(col, test_file) is False
assert file_already_mined(col, test_file, check_mtime=True) is False
# Add it with mtime
# Add it with mtime + current normalize_version
col.add(
ids=["d1"],
documents=["hello world"],
metadatas=[{"source_file": test_file, "source_mtime": str(mtime)}],
metadatas=[
{
"source_file": test_file,
"source_mtime": str(mtime),
"normalize_version": NORMALIZE_VERSION,
}
],
)
# Already mined (no mtime check)
@@ -253,7 +259,12 @@ def test_file_already_mined_check_mtime():
col.add(
ids=["d2"],
documents=["other"],
metadatas=[{"source_file": "/fake/no_mtime.txt"}],
metadatas=[
{
"source_file": "/fake/no_mtime.txt",
"normalize_version": NORMALIZE_VERSION,
}
],
)
assert file_already_mined(col, "/fake/no_mtime.txt", check_mtime=True) is False
finally:
@@ -296,3 +307,78 @@ def test_status_missing_palace_does_not_create_empty_collection(tmp_path, capsys
out = capsys.readouterr().out
assert "No palace found" in out
assert not palace_path.exists()
# ── normalize_version schema gate ───────────────────────────────────────
#
# When the normalization pipeline changes shape (e.g., strip_noise lands),
# `NORMALIZE_VERSION` is bumped so pre-existing drawers can be silently
# rebuilt on the next mine. These tests pin that contract.
def test_file_already_mined_returns_false_for_stale_normalize_version():
"""Pre-v2 drawers (no field, or older integer) must not short-circuit."""
tmpdir = tempfile.mkdtemp()
try:
palace_path = os.path.join(tmpdir, "palace")
os.makedirs(palace_path)
client = chromadb.PersistentClient(path=palace_path)
col = client.get_or_create_collection("mempalace_drawers")
# Pre-v2 drawer: no normalize_version field at all
col.add(
ids=["d_old"],
documents=["old"],
metadatas=[{"source_file": "/fake/old.jsonl"}],
)
assert file_already_mined(col, "/fake/old.jsonl") is False
# Explicitly older version
col.add(
ids=["d_v1"],
documents=["v1"],
metadatas=[{"source_file": "/fake/v1.jsonl", "normalize_version": 1}],
)
assert file_already_mined(col, "/fake/v1.jsonl") is False
# Current version — short-circuits
col.add(
ids=["d_current"],
documents=["cur"],
metadatas=[
{
"source_file": "/fake/current.jsonl",
"normalize_version": NORMALIZE_VERSION,
}
],
)
assert file_already_mined(col, "/fake/current.jsonl") is True
finally:
del col, client
shutil.rmtree(tmpdir, ignore_errors=True)
def test_add_drawer_stamps_normalize_version(tmp_path):
"""Fresh drawers carry the current schema version so future upgrades work."""
from mempalace.miner import add_drawer
palace_path = tmp_path / "palace"
palace_path.mkdir()
client = chromadb.PersistentClient(path=str(palace_path))
col = client.get_or_create_collection("mempalace_drawers")
try:
added = add_drawer(
collection=col,
wing="test",
room="notes",
content="hello",
source_file=str(tmp_path / "src.md"),
chunk_index=0,
agent="unit",
)
assert added is True
stored = col.get(limit=1)
meta = stored["metadatas"][0]
assert meta["normalize_version"] == NORMALIZE_VERSION
finally:
del col, client