feat(normalize): auto-rebuild stale drawers via NORMALIZE_VERSION schema gate
Without this, the strip_noise improvement only helps new mines. Every user who had already mined Claude Code JSONL sessions would keep their noise-polluted drawers forever, because convo_miner's file_already_mined skip short-circuits before re-processing. Adds a versioned schema gate so upgrades propagate silently: - palace.NORMALIZE_VERSION=2 — bumped when the normalization pipeline changes shape (this PR's strip_noise is the v1→v2 bump). - file_already_mined now returns False if the stored normalize_version is missing or less than current, triggering a rebuild on next mine. - Both miners stamp drawers with the current normalize_version. - convo_miner now purges stale drawers before inserting fresh chunks (mirrors miner.py's existing delete+insert), extracted into _file_convo_chunks helper to keep mine_convos under ruff's C901 limit. User experience: upgrade mempalace, run `mempalace mine` as usual, old noisy drawers get silently replaced with clean ones. No erase needed, no "you need to rebuild" changelog footgun. Tests: - test_file_already_mined_returns_false_for_stale_normalize_version — pins the version gate contract for missing/v1/current. - test_add_drawer_stamps_normalize_version — fresh project-miner drawers carry the field. - test_mine_convos_rebuilds_stale_drawers_after_schema_bump — end-to-end proof that a pre-v2 palace gets silently cleaned on next mine, with orphan drawers purged and NOT skipped. Existing test_file_already_mined_check_mtime updated to include the new field; all other tests unaffected.
This commit is contained in:
+90
-4
@@ -7,7 +7,7 @@ import chromadb
|
||||
import yaml
|
||||
|
||||
from mempalace.miner import mine, scan_project, status
|
||||
from mempalace.palace import file_already_mined
|
||||
from mempalace.palace import NORMALIZE_VERSION, file_already_mined
|
||||
|
||||
|
||||
def write_file(path: Path, content: str):
|
||||
@@ -227,11 +227,17 @@ def test_file_already_mined_check_mtime():
|
||||
assert file_already_mined(col, test_file) is False
|
||||
assert file_already_mined(col, test_file, check_mtime=True) is False
|
||||
|
||||
# Add it with mtime
|
||||
# Add it with mtime + current normalize_version
|
||||
col.add(
|
||||
ids=["d1"],
|
||||
documents=["hello world"],
|
||||
metadatas=[{"source_file": test_file, "source_mtime": str(mtime)}],
|
||||
metadatas=[
|
||||
{
|
||||
"source_file": test_file,
|
||||
"source_mtime": str(mtime),
|
||||
"normalize_version": NORMALIZE_VERSION,
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
# Already mined (no mtime check)
|
||||
@@ -253,7 +259,12 @@ def test_file_already_mined_check_mtime():
|
||||
col.add(
|
||||
ids=["d2"],
|
||||
documents=["other"],
|
||||
metadatas=[{"source_file": "/fake/no_mtime.txt"}],
|
||||
metadatas=[
|
||||
{
|
||||
"source_file": "/fake/no_mtime.txt",
|
||||
"normalize_version": NORMALIZE_VERSION,
|
||||
}
|
||||
],
|
||||
)
|
||||
assert file_already_mined(col, "/fake/no_mtime.txt", check_mtime=True) is False
|
||||
finally:
|
||||
@@ -296,3 +307,78 @@ def test_status_missing_palace_does_not_create_empty_collection(tmp_path, capsys
|
||||
out = capsys.readouterr().out
|
||||
assert "No palace found" in out
|
||||
assert not palace_path.exists()
|
||||
|
||||
|
||||
# ── normalize_version schema gate ───────────────────────────────────────
|
||||
#
|
||||
# When the normalization pipeline changes shape (e.g., strip_noise lands),
|
||||
# `NORMALIZE_VERSION` is bumped so pre-existing drawers can be silently
|
||||
# rebuilt on the next mine. These tests pin that contract.
|
||||
|
||||
|
||||
def test_file_already_mined_returns_false_for_stale_normalize_version():
|
||||
"""Pre-v2 drawers (no field, or older integer) must not short-circuit."""
|
||||
tmpdir = tempfile.mkdtemp()
|
||||
try:
|
||||
palace_path = os.path.join(tmpdir, "palace")
|
||||
os.makedirs(palace_path)
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
col = client.get_or_create_collection("mempalace_drawers")
|
||||
|
||||
# Pre-v2 drawer: no normalize_version field at all
|
||||
col.add(
|
||||
ids=["d_old"],
|
||||
documents=["old"],
|
||||
metadatas=[{"source_file": "/fake/old.jsonl"}],
|
||||
)
|
||||
assert file_already_mined(col, "/fake/old.jsonl") is False
|
||||
|
||||
# Explicitly older version
|
||||
col.add(
|
||||
ids=["d_v1"],
|
||||
documents=["v1"],
|
||||
metadatas=[{"source_file": "/fake/v1.jsonl", "normalize_version": 1}],
|
||||
)
|
||||
assert file_already_mined(col, "/fake/v1.jsonl") is False
|
||||
|
||||
# Current version — short-circuits
|
||||
col.add(
|
||||
ids=["d_current"],
|
||||
documents=["cur"],
|
||||
metadatas=[
|
||||
{
|
||||
"source_file": "/fake/current.jsonl",
|
||||
"normalize_version": NORMALIZE_VERSION,
|
||||
}
|
||||
],
|
||||
)
|
||||
assert file_already_mined(col, "/fake/current.jsonl") is True
|
||||
finally:
|
||||
del col, client
|
||||
shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
|
||||
|
||||
def test_add_drawer_stamps_normalize_version(tmp_path):
|
||||
"""Fresh drawers carry the current schema version so future upgrades work."""
|
||||
from mempalace.miner import add_drawer
|
||||
|
||||
palace_path = tmp_path / "palace"
|
||||
palace_path.mkdir()
|
||||
client = chromadb.PersistentClient(path=str(palace_path))
|
||||
col = client.get_or_create_collection("mempalace_drawers")
|
||||
try:
|
||||
added = add_drawer(
|
||||
collection=col,
|
||||
wing="test",
|
||||
room="notes",
|
||||
content="hello",
|
||||
source_file=str(tmp_path / "src.md"),
|
||||
chunk_index=0,
|
||||
agent="unit",
|
||||
)
|
||||
assert added is True
|
||||
stored = col.get(limit=1)
|
||||
meta = stored["metadatas"][0]
|
||||
assert meta["normalize_version"] == NORMALIZE_VERSION
|
||||
finally:
|
||||
del col, client
|
||||
|
||||
Reference in New Issue
Block a user