2026-04-04 18:33:42 -07:00
|
|
|
import os
|
|
|
|
|
import tempfile
|
|
|
|
|
import shutil
|
2026-04-13 02:25:34 +05:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
2026-04-04 18:33:42 -07:00
|
|
|
import chromadb
|
2026-04-13 02:25:34 +05:00
|
|
|
|
2026-04-04 18:33:42 -07:00
|
|
|
from mempalace.convo_miner import mine_convos
|
2026-04-13 02:25:34 +05:00
|
|
|
from mempalace.palace import file_already_mined
|
2026-04-04 18:33:42 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_convo_mining():
    """Mine a small transcript and check that it is stored and searchable.

    Writes a three-exchange conversation file (``> question`` followed by an
    answer) into a scratch directory, runs ``mine_convos`` on that directory,
    and then verifies through a fresh chromadb client that the
    ``mempalace_drawers`` collection holds at least two entries and that a
    text query returns at least one document.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        with open(os.path.join(tmpdir, "chat.txt"), "w") as f:
            f.write(
                "> What is memory?\nMemory is persistence.\n\n> Why does it matter?\nIt enables continuity.\n\n> How do we build it?\nWith structured storage.\n"
            )

        # The palace lives inside the scratch directory so one rmtree removes
        # both the input file and the database.
        palace_path = os.path.join(tmpdir, "palace")
        mine_convos(tmpdir, palace_path, wing="test_convos")

        client = chromadb.PersistentClient(path=palace_path)
        col = client.get_collection("mempalace_drawers")
        assert col.count() >= 2

        # Verify search works
        results = col.query(query_texts=["memory persistence"], n_results=1)
        assert len(results["documents"][0]) > 0
    finally:
        # Clean up even when an assertion or the miner itself fails, so a
        # failing run does not leave the scratch directory behind.
        shutil.rmtree(tmpdir, ignore_errors=True)
|
2026-04-13 02:25:34 +05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_mine_convos_does_not_reprocess_short_files(capsys):
    """A file under MIN_CHUNK_SIZE is marked with a sentinel and skipped on re-run."""
    workdir = tempfile.mkdtemp()
    try:
        # Too short to yield any chunks.
        Path(workdir, "tiny.txt").write_text("hi")
        palace_dir = os.path.join(workdir, "palace")

        # Initial pass: the file is processed and its sentinel recorded.
        mine_convos(workdir, palace_dir, wing="test")
        capsys.readouterr()  # discard first-run output

        # The sentinel is looked up by resolved path (on macOS /var resolves
        # to /private/var).
        mined_path = str(Path(workdir).resolve().joinpath("tiny.txt"))
        drawers = chromadb.PersistentClient(path=palace_dir).get_collection(
            "mempalace_drawers"
        )
        assert file_already_mined(drawers, mined_path)

        # Repeat pass: the same file must now be reported as skipped.
        mine_convos(workdir, palace_dir, wing="test")
        second_run = capsys.readouterr()
        assert "Files skipped (already filed): 1" in second_run.out
    finally:
        shutil.rmtree(workdir, ignore_errors=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys):
    """Files that normalize but produce 0 exchange chunks get a sentinel.

    Runs ``mine_convos`` twice over the same directory and checks that the
    second run's summary reports the file as already filed.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        # Content long enough to pass MIN_CHUNK_SIZE but with no exchange markers
        # (no "> " lines), so chunk_exchanges returns []
        with open(os.path.join(tmpdir, "no_exchanges.txt"), "w") as f:
            f.write("This is a plain paragraph without any exchange markers. " * 5)

        palace_path = os.path.join(tmpdir, "palace")

        # First run -- file is processed (sentinel written)
        mine_convos(tmpdir, palace_path, wing="test")
        # Drain first-run output so the assertion below inspects only what
        # the second run printed.
        capsys.readouterr()

        # Second run -- file should be skipped
        mine_convos(tmpdir, palace_path, wing="test")
        out2 = capsys.readouterr().out
        assert "Files skipped (already filed): 1" in out2
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
|