fix: guard None metadata/doc in tool_check_duplicate and Layer1/Layer2

Chroma 1.5.x can return ``None`` inside the ``metadatas`` / ``documents`` lists of a query/get result for partially-flushed rows. The codebase already has a systemic None-guard pattern (merged #999, #1013, #1019) but three call sites were still unguarded: * ``mcp_server.tool_check_duplicate`` (``mcp_server.py:487-488``) — ``meta = results["metadatas"][0][i]`` followed by ``meta.get(...)`` raises ``AttributeError: 'NoneType' object has no attribute 'get'``. The broad ``except Exception`` wrapper (line 504) swallows it and returns an uninformative ``"Duplicate check failed"``. * ``layers.Layer1.generate`` (``layers.py:126``) — iterates ``zip(docs, metas)`` and calls ``meta.get(key)`` in the importance loop. A single None metadata blows up the entire wake-up render. * ``layers.Layer2.retrieve`` (``layers.py:224``) — same pattern, same crash path for the on-demand render. Apply the same ``meta = meta or {}`` / ``doc = doc or ""`` idiom used by the merged guards in the search path. Three-line additions, no behaviour change on well-formed results. Tests added: * ``test_check_duplicate_handles_none_metadata`` — mocks the collection query to return ``None`` for one metadata and document, asserts the call does not crash and the sentinel-rendered entry has wing/room "?" and empty content. * ``test_layer1_handles_none_metadata`` / ``_handles_none_document`` * ``test_layer2_handles_none_metadata`` Relationship to other open PRs: * **#1019** guarded ``searcher.py`` loops. This PR extends the same guard to the three call sites #1019 did not touch. * **#979** fixed ``tool_check_duplicate`` negative similarity but left the None-metadata path unguarded. * Does not overlap **#1013** (``Layer3.search_raw``) or **#999**.
2026-04-19 11:13:50 +03:00
parent 109d7f267c
commit 4949aab68b
4 changed files with 113 additions and 2 deletions
@@ -124,6 +124,8 @@ class Layer1:
        # Score each drawer: prefer high importance, recent filing
        scored = []
        for doc, meta in zip(docs, metas):
            meta = meta or {}
            doc = doc or ""
            importance = 3
            # Try multiple metadata keys that might carry weight info
            for key in ("importance", "emotional_weight", "weight"):
@@ -222,6 +224,8 @@ class Layer2:
        lines = [f"## L2 — ON-DEMAND ({len(docs)} drawers)"]
        for doc, meta in zip(docs[:n_results], metas[:n_results]):
            meta = meta or {}
            doc = doc or ""
            room_name = meta.get("room", "?")
            source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
            snippet = doc.strip().replace("\n", " ")
@@ -484,8 +484,10 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
                dist = results["distances"][0][i]
                similarity = round(1 - dist, 3)
                if similarity >= threshold:
-                    meta = results["metadatas"][0][i]
+                    # Chroma 1.5.x can return None for partially-flushed rows;
-                    doc = results["documents"][0][i]
+                    # coerce to empty sentinels so downstream .get() is safe.
                    meta = results["metadatas"][0][i] or {}
                    doc = results["documents"][0][i] or ""
                    duplicates.append(
                        {
                            "id": drawer_id,
@@ -655,3 +655,72 @@ def test_memory_stack_status_with_palace(tmp_path):
    assert result["total_drawers"] == 42
    assert result["L0_identity"]["exists"] is True
 # ── Layer1 / Layer2 None-metadata guards ───────────────────────────────
 #
 # Chroma 1.5.x can return ``None`` inside the ``metadatas`` / ``documents``
 # lists for partially-flushed rows. The Layer1.generate() and
 # Layer2.retrieve() loops previously called ``meta.get(...)`` without
 # coercing, raising ``AttributeError: 'NoneType' object has no attribute
 # 'get'`` and blowing up the whole wake-up render. These tests guard that
 # the loops tolerate the None entries and render the rest of the result.
 def test_layer1_handles_none_metadata():
    """Layer1.generate tolerates None entries in the metadatas list."""
    docs = ["important memory", "another memory"]
    metas = [{"room": "decisions", "source_file": "a.txt"}, None]
    mock_col = _mock_chromadb_for_layer(docs, metas)
    with (
        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
        patch("mempalace.layers._get_collection", return_value=mock_col),
    ):
        mock_cfg.return_value.palace_path = "/fake"
        layer = Layer1(palace_path="/fake")
        # Should not raise AttributeError on the None entry.
        result = layer.generate()
    assert "ESSENTIAL STORY" in result
    assert "important memory" in result
 def test_layer1_handles_none_document():
    """Layer1.generate tolerates None entries in the documents list."""
    docs = ["first doc", None]
    metas = [
        {"room": "r", "source_file": "a.txt"},
        {"room": "r", "source_file": "b.txt"},
    ]
    mock_col = _mock_chromadb_for_layer(docs, metas)
    with (
        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
        patch("mempalace.layers._get_collection", return_value=mock_col),
    ):
        mock_cfg.return_value.palace_path = "/fake"
        layer = Layer1(palace_path="/fake")
        result = layer.generate()
    assert result  # Render succeeded despite the None document.
 def test_layer2_handles_none_metadata():
    """Layer2.retrieve tolerates None entries in the metadatas list."""
    mock_col = MagicMock()
    mock_col.get.return_value = {
        "documents": ["first doc", "second doc"],
        "metadatas": [{"room": "r", "source_file": "a.txt"}, None],
    }
    with (
        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
        patch("mempalace.layers._get_collection", return_value=mock_col),
    ):
        mock_cfg.return_value.palace_path = "/fake"
        layer = Layer2(palace_path="/fake")
        # Should not raise AttributeError on the None entry.
        result = layer.retrieve()
    assert "L2 — ON-DEMAND" in result
@@ -9,6 +9,7 @@ via monkeypatch to avoid touching real data.
 from datetime import datetime
 import json
 import sys
 from unittest.mock import MagicMock
 import pytest
@@ -495,6 +496,41 @@ class TestWriteTools:
        result = tool_delete_drawer("nonexistent_drawer")
        assert result["success"] is False
    def test_check_duplicate_handles_none_metadata(self, monkeypatch, config, kg):
        """tool_check_duplicate must tolerate None entries in the result lists
        that ChromaDB 1.5.x returns for partially-flushed rows.
        Previously ``meta = results["metadatas"][0][i]`` was unguarded and
        raised ``AttributeError: 'NoneType' object has no attribute 'get'``
        the moment the first matching drawer came back with None metadata —
        surfacing to the MCP client as the uninformative
        ``"Duplicate check failed"`` because the broad ``except Exception``
        wrapper swallows the real cause.
        """
        _patch_mcp_server(monkeypatch, config, kg)
        from mempalace import mcp_server
        mock_col = MagicMock()
        mock_col.query.return_value = {
            "ids": [["d1", "d2"]],
            "distances": [[0.05, 0.05]],
            "metadatas": [[{"wing": "w", "room": "r"}, None]],
            "documents": [["first doc", None]],
        }
        monkeypatch.setattr(mcp_server, "_get_collection", lambda: mock_col)
        result = mcp_server.tool_check_duplicate("any content", threshold=0.5)
        # Both entries land in matches (above threshold), None ones rendered
        # with sentinel values rather than crashing the whole response.
        assert result.get("is_duplicate") is True
        assert len(result["matches"]) == 2
        # The None-metadata entry falls back to sentinels.
        none_entry = result["matches"][1]
        assert none_entry["wing"] == "?"
        assert none_entry["room"] == "?"
        assert none_entry["content"] == ""
    def test_check_duplicate(self, monkeypatch, config, palace_path, seeded_collection, kg):
        _patch_mcp_server(monkeypatch, config, kg)
        from mempalace.mcp_server import tool_check_duplicate