7e5eeda9a5
Without this, the strip_noise improvement only helps new mines. Every user who had already mined Claude Code JSONL sessions would keep their noise-polluted drawers forever, because convo_miner's file_already_mined skip short-circuits before re-processing. Adds a versioned schema gate so upgrades propagate silently: - palace.NORMALIZE_VERSION=2 — bumped when the normalization pipeline changes shape (this PR's strip_noise is the v1→v2 bump). - file_already_mined now returns False if the stored normalize_version is missing or less than current, triggering a rebuild on next mine. - Both miners stamp drawers with the current normalize_version. - convo_miner now purges stale drawers before inserting fresh chunks (mirrors miner.py's existing delete+insert), extracted into _file_convo_chunks helper to keep mine_convos under ruff's C901 limit. User experience: upgrade mempalace, run `mempalace mine` as usual, old noisy drawers get silently replaced with clean ones. No erase needed, no "you need to rebuild" changelog footgun. Tests: - test_file_already_mined_returns_false_for_stale_normalize_version — pins the version gate contract for missing/v1/current. - test_add_drawer_stamps_normalize_version — fresh project-miner drawers carry the field. - test_mine_convos_rebuilds_stale_drawers_after_schema_bump — end-to-end proof that a pre-v2 palace gets silently cleaned on next mine, with orphan drawers purged and NOT skipped. Existing test_file_already_mined_check_mtime updated to include the new field; all other tests unaffected.
161 lines
6.1 KiB
Python
161 lines
6.1 KiB
Python
import os
|
|
import tempfile
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
import chromadb
|
|
|
|
from mempalace.convo_miner import mine_convos
|
|
from mempalace.palace import file_already_mined
|
|
|
|
|
|
def test_convo_mining():
    """Mine a small quoted-exchange transcript and check it is stored and queryable.

    Writes a three-exchange ``chat.txt`` into a scratch directory, runs
    ``mine_convos`` against it, then opens the resulting palace and asserts
    that at least two drawers exist and that a text query returns a document.
    """
    tmpdir = tempfile.mkdtemp()
    # mkdtemp() leaves cleanup to the caller; the try/finally guarantees the
    # scratch directory is removed even when mining or an assertion fails.
    try:
        with open(os.path.join(tmpdir, "chat.txt"), "w") as f:
            f.write(
                "> What is memory?\nMemory is persistence.\n\n> Why does it matter?\nIt enables continuity.\n\n> How do we build it?\nWith structured storage.\n"
            )

        palace_path = os.path.join(tmpdir, "palace")
        mine_convos(tmpdir, palace_path, wing="test_convos")

        client = chromadb.PersistentClient(path=palace_path)
        col = client.get_collection("mempalace_drawers")
        assert col.count() >= 2

        # Verify search works
        results = col.query(query_texts=["memory persistence"], n_results=1)
        assert len(results["documents"][0]) > 0
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
|
|
|
|
|
|
def test_mine_convos_does_not_reprocess_short_files(capsys):
    """A file too short to chunk is marked as mined and skipped on the next run."""
    workdir = tempfile.mkdtemp()
    try:
        # Content this small cannot yield any chunks.
        tiny_file = Path(workdir) / "tiny.txt"
        tiny_file.write_text("hi")

        palace_path = os.path.join(workdir, "palace")

        # Run 1: the file is processed and a sentinel is recorded for it.
        mine_convos(workdir, palace_path, wing="test")
        capsys.readouterr()  # discard first-run output

        # The sentinel is keyed by the resolved path (macOS maps /var to
        # /private/var), so resolve before asking whether it was mined.
        resolved_file = str(Path(workdir).resolve() / "tiny.txt")
        drawers = chromadb.PersistentClient(path=palace_path).get_collection(
            "mempalace_drawers"
        )
        assert file_already_mined(drawers, resolved_file)

        # Run 2: the same file must be reported as skipped.
        mine_convos(workdir, palace_path, wing="test")
        second_run = capsys.readouterr()
        assert "Files skipped (already filed): 1" in second_run.out
    finally:
        shutil.rmtree(workdir, ignore_errors=True)
|
|
|
|
|
|
def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys):
    """A file that yields zero exchange chunks is still skipped on the second mine."""
    workdir = tempfile.mkdtemp()
    try:
        # Long enough to clear the minimum size, but with no "> " lines there
        # are no exchange markers, so no exchange chunks come out of it.
        filler = "This is a plain paragraph without any exchange markers. " * 5
        (Path(workdir) / "no_exchanges.txt").write_text(filler)

        palace_path = os.path.join(workdir, "palace")

        # Mine twice; captured output covers both runs, and the skip line can
        # only come from the second one.
        for _ in range(2):
            mine_convos(workdir, palace_path, wing="test")

        captured = capsys.readouterr()
        assert "Files skipped (already filed): 1" in captured.out
    finally:
        shutil.rmtree(workdir, ignore_errors=True)
|
|
|
|
|
|
def test_mine_convos_rebuilds_stale_drawers_after_schema_bump(capsys):
    """Drawers stamped with an older normalize_version are purged and refiled.

    The next mine replaces them without any manual erase. This is how the
    strip_noise upgrade reaches existing corpora: users simply run
    `mempalace mine` again and old noise-filled drawers are swapped for
    clean ones.
    """
    from mempalace.palace import NORMALIZE_VERSION

    workdir = tempfile.mkdtemp()
    try:
        transcript = (
            "> What is memory?\nMemory is persistence.\n\n"
            "> Why does it matter?\nIt enables continuity.\n\n"
            "> How do we build it?\nWith structured storage.\n"
        )
        (Path(workdir) / "chat.txt").write_text(transcript)
        palace_path = os.path.join(workdir, "palace")

        # Initial mine: drawers should be stamped with the current version.
        mine_convos(workdir, palace_path, wing="test")
        capsys.readouterr()

        client = chromadb.PersistentClient(path=palace_path)
        col = client.get_collection("mempalace_drawers")
        resolved = str(Path(workdir).resolve() / "chat.txt")
        first_pass = col.get(where={"source_file": resolved})
        drawer_ids = list(first_pass["ids"])
        assert set(drawer_ids), "first mine should produce drawers"
        assert all(
            meta.get("normalize_version") == NORMALIZE_VERSION
            for meta in first_pass["metadatas"]
        )

        # Fake a pre-v2 palace: downgrade each drawer's version and overwrite
        # its text with recognisable "noise" so a rebuild is observable.
        stale_metas = [
            {**meta, "normalize_version": 1} for meta in first_pass["metadatas"]
        ]
        col.update(
            ids=drawer_ids,
            documents=["STALE NOISE" for _ in drawer_ids],
            metadatas=stale_metas,
        )

        # One extra drawer for the same source file that no fresh chunk will
        # overwrite; the rebuild is expected to purge it too.
        orphan_meta = {
            "wing": "test",
            "room": "default",
            "source_file": resolved,
            "chunk_index": 999,
            "normalize_version": 1,
        }
        col.add(
            ids=["orphan_drawer"],
            documents=["OLD ORPHAN"],
            metadatas=[orphan_meta],
        )
        del col, client

        # Re-mine: the version gate should force a rebuild rather than a skip.
        mine_convos(workdir, palace_path, wing="test")
        out = capsys.readouterr().out
        assert (
            "Files skipped (already filed): 0" in out
        ), "stale drawers should force a rebuild, not a skip"

        client = chromadb.PersistentClient(path=palace_path)
        col = client.get_collection("mempalace_drawers")
        rebuilt = col.get(where={"source_file": resolved})

        # The orphan drawer was removed.
        assert "orphan_drawer" not in rebuilt["ids"]

        # None of the planted stale text is left behind.
        for doc in rebuilt["documents"]:
            assert "STALE NOISE" not in doc
            assert "OLD ORPHAN" not in doc

        # Every surviving drawer carries the current version stamp.
        assert all(
            meta.get("normalize_version") == NORMALIZE_VERSION
            for meta in rebuilt["metadatas"]
        )
        del col, client
    finally:
        shutil.rmtree(workdir, ignore_errors=True)
|