267a644f4f
Prerequisite for RFC 001 (plugin spec, #743). Removes every direct `import chromadb` outside the ChromaDB backend itself so the core modules depend only on the backend abstraction layer. Extends ChromaBackend with make_client, get_or_create_collection, delete_collection, create_collection, and backend_version. Adds update() to the BaseCollection contract. Non-backend callers (mcp_server, dedup, repair, migrate, cli) now go through the abstraction; tests patch ChromaBackend instead of chromadb. With this landed, the RFC 001 spec can be enforced and PalaceStore (#643) can ship as a plugin without touching core modules.
272 lines
8.9 KiB
Python
272 lines
8.9 KiB
Python
"""Tests for mempalace.dedup — near-duplicate drawer detection and removal."""
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
|
|
from mempalace import dedup
|
|
|
|
|
|
# ── get_source_groups ─────────────────────────────────────────────────
|
|
|
|
|
|
def test_get_source_groups_basic():
|
|
col = MagicMock()
|
|
col.count.return_value = 5
|
|
col.get.side_effect = [
|
|
{
|
|
"ids": ["d1", "d2", "d3", "d4", "d5"],
|
|
"metadatas": [
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
],
|
|
},
|
|
{"ids": []},
|
|
]
|
|
groups = dedup.get_source_groups(col, min_count=5)
|
|
assert "a.txt" in groups
|
|
assert len(groups["a.txt"]) == 5
|
|
|
|
|
|
def test_get_source_groups_below_min():
|
|
col = MagicMock()
|
|
col.count.return_value = 2
|
|
col.get.side_effect = [
|
|
{
|
|
"ids": ["d1", "d2"],
|
|
"metadatas": [
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
],
|
|
},
|
|
{"ids": []},
|
|
]
|
|
groups = dedup.get_source_groups(col, min_count=5)
|
|
assert len(groups) == 0
|
|
|
|
|
|
def test_get_source_groups_source_filter():
|
|
col = MagicMock()
|
|
col.count.return_value = 6
|
|
col.get.side_effect = [
|
|
{
|
|
"ids": ["d1", "d2", "d3", "d4", "d5", "d6"],
|
|
"metadatas": [
|
|
{"source_file": "project_a.txt"},
|
|
{"source_file": "project_a.txt"},
|
|
{"source_file": "project_a.txt"},
|
|
{"source_file": "project_a.txt"},
|
|
{"source_file": "project_a.txt"},
|
|
{"source_file": "other.txt"},
|
|
],
|
|
},
|
|
{"ids": []},
|
|
]
|
|
groups = dedup.get_source_groups(col, min_count=5, source_pattern="project_a")
|
|
assert "project_a.txt" in groups
|
|
assert "other.txt" not in groups
|
|
|
|
|
|
def test_get_source_groups_wing_filter():
|
|
col = MagicMock()
|
|
col.count.return_value = 5
|
|
col.get.side_effect = [
|
|
{
|
|
"ids": ["d1", "d2", "d3", "d4", "d5"],
|
|
"metadatas": [
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
],
|
|
},
|
|
{"ids": []},
|
|
]
|
|
dedup.get_source_groups(col, min_count=5, wing="my_wing")
|
|
# Verify where filter was passed
|
|
first_call = col.get.call_args_list[0]
|
|
assert first_call.kwargs.get("where") == {"wing": "my_wing"}
|
|
|
|
|
|
def test_get_source_groups_missing_source_file():
|
|
col = MagicMock()
|
|
col.count.return_value = 5
|
|
col.get.side_effect = [
|
|
{
|
|
"ids": ["d1", "d2", "d3", "d4", "d5"],
|
|
"metadatas": [{}, {}, {}, {}, {}],
|
|
},
|
|
{"ids": []},
|
|
]
|
|
groups = dedup.get_source_groups(col, min_count=5)
|
|
assert "unknown" in groups
|
|
|
|
|
|
# ── dedup_source_group ────────────────────────────────────────────────
|
|
|
|
|
|
def test_dedup_source_group_all_unique():
|
|
col = MagicMock()
|
|
col.get.return_value = {
|
|
"ids": ["d1", "d2"],
|
|
"documents": ["long document one content here", "different document two here"],
|
|
"metadatas": [{"wing": "a"}, {"wing": "a"}],
|
|
}
|
|
col.query.return_value = {
|
|
"ids": [["d1"]],
|
|
"distances": [[0.8]], # far apart = unique
|
|
}
|
|
kept, deleted = dedup.dedup_source_group(col, ["d1", "d2"], threshold=0.15, dry_run=True)
|
|
assert len(kept) == 2
|
|
assert len(deleted) == 0
|
|
|
|
|
|
def test_dedup_source_group_with_duplicate():
|
|
col = MagicMock()
|
|
col.get.return_value = {
|
|
"ids": ["d1", "d2"],
|
|
"documents": [
|
|
"long document content that is fairly long",
|
|
"long document content that is fairly long",
|
|
],
|
|
"metadatas": [{"wing": "a"}, {"wing": "a"}],
|
|
}
|
|
col.query.return_value = {
|
|
"ids": [["d1"]],
|
|
"distances": [[0.05]], # very close = duplicate
|
|
}
|
|
kept, deleted = dedup.dedup_source_group(col, ["d1", "d2"], threshold=0.15, dry_run=True)
|
|
assert len(kept) == 1
|
|
assert len(deleted) == 1
|
|
|
|
|
|
def test_dedup_source_group_short_docs_deleted():
|
|
col = MagicMock()
|
|
col.get.return_value = {
|
|
"ids": ["d1", "d2"],
|
|
"documents": ["long enough document to keep in the palace", "tiny"],
|
|
"metadatas": [{"wing": "a"}, {"wing": "a"}],
|
|
}
|
|
kept, deleted = dedup.dedup_source_group(col, ["d1", "d2"], threshold=0.15, dry_run=True)
|
|
assert "d2" in deleted # too short
|
|
|
|
|
|
def test_dedup_source_group_empty_doc_deleted():
|
|
col = MagicMock()
|
|
col.get.return_value = {
|
|
"ids": ["d1", "d2"],
|
|
"documents": ["real document content here that is long enough", None],
|
|
"metadatas": [{"wing": "a"}, {"wing": "a"}],
|
|
}
|
|
kept, deleted = dedup.dedup_source_group(col, ["d1", "d2"], threshold=0.15, dry_run=True)
|
|
assert "d2" in deleted
|
|
|
|
|
|
def test_dedup_source_group_live_deletes():
|
|
col = MagicMock()
|
|
col.get.return_value = {
|
|
"ids": ["d1", "d2"],
|
|
"documents": ["long document content here enough", "long document content here enough"],
|
|
"metadatas": [{"wing": "a"}, {"wing": "a"}],
|
|
}
|
|
col.query.return_value = {
|
|
"ids": [["d1"]],
|
|
"distances": [[0.05]],
|
|
}
|
|
kept, deleted = dedup.dedup_source_group(col, ["d1", "d2"], threshold=0.15, dry_run=False)
|
|
col.delete.assert_called_once()
|
|
|
|
|
|
def test_dedup_source_group_query_failure_keeps():
|
|
col = MagicMock()
|
|
col.get.return_value = {
|
|
"ids": ["d1", "d2"],
|
|
"documents": [
|
|
"long document one content here enough",
|
|
"long document two content here enough",
|
|
],
|
|
"metadatas": [{"wing": "a"}, {"wing": "a"}],
|
|
}
|
|
col.query.side_effect = Exception("query failed")
|
|
kept, deleted = dedup.dedup_source_group(col, ["d1", "d2"], threshold=0.15, dry_run=True)
|
|
assert len(kept) == 2 # both kept on error
|
|
|
|
|
|
# ── show_stats ────────────────────────────────────────────────────────
|
|
|
|
|
|
def _install_mock_backend(mock_backend_cls, collection):
|
|
mock_backend = MagicMock()
|
|
mock_backend.get_collection.return_value = collection
|
|
mock_backend_cls.return_value = mock_backend
|
|
return mock_backend
|
|
|
|
|
|
@patch("mempalace.dedup.ChromaBackend")
|
|
def test_show_stats(mock_backend_cls, tmp_path):
|
|
mock_col = MagicMock()
|
|
mock_col.count.return_value = 5
|
|
mock_col.get.side_effect = [
|
|
{
|
|
"ids": ["d1", "d2", "d3", "d4", "d5"],
|
|
"metadatas": [
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
{"source_file": "a.txt"},
|
|
],
|
|
},
|
|
{"ids": []},
|
|
]
|
|
_install_mock_backend(mock_backend_cls, mock_col)
|
|
|
|
dedup.show_stats(palace_path=str(tmp_path)) # should not raise
|
|
|
|
|
|
# ── dedup_palace ──────────────────────────────────────────────────────
|
|
|
|
|
|
@patch("mempalace.dedup.dedup_source_group")
|
|
@patch("mempalace.dedup.get_source_groups")
|
|
@patch("mempalace.dedup.ChromaBackend")
|
|
def test_dedup_palace_dry_run(mock_backend_cls, mock_groups, mock_dedup_group, tmp_path):
|
|
mock_col = MagicMock()
|
|
mock_col.count.return_value = 10
|
|
_install_mock_backend(mock_backend_cls, mock_col)
|
|
|
|
mock_groups.return_value = {"a.txt": ["d1", "d2", "d3", "d4", "d5"]}
|
|
mock_dedup_group.return_value = (["d1", "d2", "d3"], ["d4", "d5"])
|
|
|
|
dedup.dedup_palace(palace_path=str(tmp_path), dry_run=True)
|
|
mock_dedup_group.assert_called_once()
|
|
|
|
|
|
@patch("mempalace.dedup.dedup_source_group")
|
|
@patch("mempalace.dedup.get_source_groups")
|
|
@patch("mempalace.dedup.ChromaBackend")
|
|
def test_dedup_palace_with_wing(mock_backend_cls, mock_groups, mock_dedup_group, tmp_path):
|
|
mock_col = MagicMock()
|
|
mock_col.count.return_value = 10
|
|
_install_mock_backend(mock_backend_cls, mock_col)
|
|
|
|
mock_groups.return_value = {}
|
|
dedup.dedup_palace(palace_path=str(tmp_path), wing="test_wing", dry_run=True)
|
|
mock_groups.assert_called_once_with(mock_col, 5, None, wing="test_wing")
|
|
|
|
|
|
@patch("mempalace.dedup.dedup_source_group")
|
|
@patch("mempalace.dedup.get_source_groups")
|
|
@patch("mempalace.dedup.ChromaBackend")
|
|
def test_dedup_palace_no_groups(mock_backend_cls, mock_groups, mock_dedup_group, tmp_path):
|
|
mock_col = MagicMock()
|
|
mock_col.count.return_value = 3
|
|
_install_mock_backend(mock_backend_cls, mock_col)
|
|
|
|
mock_groups.return_value = {}
|
|
dedup.dedup_palace(palace_path=str(tmp_path), dry_run=True)
|
|
mock_dedup_group.assert_not_called()
|