3d529e7028
Agent-Logs-Url: https://github.com/MemPalace/mempalace/sessions/3213a67a-6871-4bb2-9ae0-23fa11001a22 Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
150 lines
5.7 KiB
Python
150 lines
5.7 KiB
Python
"""Unit tests for convo_miner pure functions (no chromadb needed)."""
|
|
|
|
import contextlib
|
|
|
|
from mempalace.convo_miner import (
|
|
_file_chunks_locked,
|
|
chunk_exchanges,
|
|
detect_convo_room,
|
|
scan_convos,
|
|
)
|
|
|
|
|
|
class TestChunkExchanges:
|
|
def test_exchange_chunking(self):
|
|
content = (
|
|
"> What is memory?\n"
|
|
"Memory is persistence of information over time.\n\n"
|
|
"> Why does it matter?\n"
|
|
"It enables continuity across sessions and conversations.\n\n"
|
|
"> How do we build it?\n"
|
|
"With structured storage and retrieval mechanisms.\n"
|
|
)
|
|
chunks = chunk_exchanges(content)
|
|
assert len(chunks) >= 2
|
|
assert all("content" in c and "chunk_index" in c for c in chunks)
|
|
|
|
def test_paragraph_fallback(self):
|
|
"""Content without '>' lines falls back to paragraph chunking."""
|
|
content = (
|
|
"This is a long paragraph about memory systems. " * 10 + "\n\n"
|
|
"This is another paragraph about storage. " * 10 + "\n\n"
|
|
"And a third paragraph about retrieval. " * 10
|
|
)
|
|
chunks = chunk_exchanges(content)
|
|
assert len(chunks) >= 2
|
|
|
|
def test_paragraph_line_group_fallback(self):
|
|
"""Long content with no paragraph breaks chunks by line groups."""
|
|
lines = [f"Line {i}: some content that is meaningful" for i in range(60)]
|
|
content = "\n".join(lines)
|
|
chunks = chunk_exchanges(content)
|
|
assert len(chunks) >= 1
|
|
|
|
def test_empty_content(self):
|
|
chunks = chunk_exchanges("")
|
|
assert chunks == []
|
|
|
|
def test_short_content_skipped(self):
|
|
chunks = chunk_exchanges("> hi\nbye")
|
|
# Too short to produce chunks (below MIN_CHUNK_SIZE)
|
|
assert isinstance(chunks, list)
|
|
|
|
def test_long_ai_response_not_truncated(self):
|
|
"""AI responses longer than 8 lines must be stored in full (verbatim principle)."""
|
|
lines = [f"Step {i}: important detail that must be stored" for i in range(1, 14)]
|
|
content = "> How do I implement authentication?\n" + "\n".join(lines)
|
|
chunks = chunk_exchanges(content)
|
|
assert len(chunks) >= 1
|
|
stored = chunks[0]["content"]
|
|
# All 13 lines must be present — none silently dropped
|
|
for i in range(1, 14):
|
|
assert f"Step {i}:" in stored, f"Step {i} was truncated and not stored"
|
|
|
|
|
|
class TestDetectConvoRoom:
|
|
def test_technical_room(self):
|
|
content = "Let me debug this python function and fix the code error in the api"
|
|
assert detect_convo_room(content) == "technical"
|
|
|
|
def test_planning_room(self):
|
|
content = "We need to plan the roadmap for the next sprint and set milestone deadlines"
|
|
assert detect_convo_room(content) == "planning"
|
|
|
|
def test_architecture_room(self):
|
|
content = "The architecture uses a service layer with component interface and module design"
|
|
assert detect_convo_room(content) == "architecture"
|
|
|
|
def test_decisions_room(self):
|
|
content = "We decided to switch and migrated to the new framework after we chose it"
|
|
assert detect_convo_room(content) == "decisions"
|
|
|
|
def test_general_fallback(self):
|
|
content = "Hello, how are you doing today? The weather is nice."
|
|
assert detect_convo_room(content) == "general"
|
|
|
|
|
|
class TestScanConvos:
|
|
def test_scan_finds_txt_and_md(self, tmp_path):
|
|
(tmp_path / "chat.txt").write_text("hello", encoding="utf-8")
|
|
(tmp_path / "notes.md").write_text("world", encoding="utf-8")
|
|
(tmp_path / "image.png").write_bytes(b"fake")
|
|
files = scan_convos(str(tmp_path))
|
|
extensions = {f.suffix for f in files}
|
|
assert ".txt" in extensions
|
|
assert ".md" in extensions
|
|
assert ".png" not in extensions
|
|
|
|
def test_scan_skips_git_dir(self, tmp_path):
|
|
git_dir = tmp_path / ".git"
|
|
git_dir.mkdir()
|
|
(git_dir / "config.txt").write_text("git stuff", encoding="utf-8")
|
|
(tmp_path / "chat.txt").write_text("hello", encoding="utf-8")
|
|
files = scan_convos(str(tmp_path))
|
|
assert len(files) == 1
|
|
|
|
def test_scan_skips_meta_json(self, tmp_path):
|
|
(tmp_path / "chat.meta.json").write_text("{}", encoding="utf-8")
|
|
(tmp_path / "chat.json").write_text("{}", encoding="utf-8")
|
|
files = scan_convos(str(tmp_path))
|
|
names = [f.name for f in files]
|
|
assert "chat.json" in names
|
|
assert "chat.meta.json" not in names
|
|
|
|
def test_scan_empty_dir(self, tmp_path):
|
|
files = scan_convos(str(tmp_path))
|
|
assert files == []
|
|
|
|
|
|
class TestFileChunksLocked:
|
|
def test_uses_bounded_upsert_batches(self, monkeypatch):
|
|
import mempalace.convo_miner as convo_miner
|
|
|
|
class FakeCol:
|
|
def __init__(self):
|
|
self.batch_sizes = []
|
|
|
|
def delete(self, *args, **kwargs):
|
|
pass
|
|
|
|
def upsert(self, documents, ids, metadatas):
|
|
self.batch_sizes.append(len(documents))
|
|
|
|
chunks = [{"content": f"chunk {i} " * 20, "chunk_index": i} for i in range(5)]
|
|
col = FakeCol()
|
|
monkeypatch.setattr(convo_miner, "DRAWER_UPSERT_BATCH_SIZE", 2)
|
|
monkeypatch.setattr(
|
|
convo_miner, "file_already_mined", lambda collection, source_file: False
|
|
)
|
|
monkeypatch.setattr(convo_miner, "mine_lock", lambda source_file: contextlib.nullcontext())
|
|
monkeypatch.setattr(convo_miner, "_detect_hall_cached", lambda content: "conversations")
|
|
|
|
drawers, room_counts, skipped = _file_chunks_locked(
|
|
col, "chat.txt", chunks, "wing", "general", "agent", "exchange"
|
|
)
|
|
|
|
assert drawers == 5
|
|
assert dict(room_counts) == {}
|
|
assert skipped is False
|
|
assert col.batch_sizes == [2, 2, 1]
|