tests/test_sweeper.py

"""TDD: tandem sweeper that catches what the primary miner missed.

The primary miner (miner.py / convo_miner.py) runs at file granularity
and can drop data (size caps, silent OSError, dedup false-positives).
The sweeper is a second miner that works at MESSAGE granularity,
using timestamp as the coordination cursor.

For each session in the transcript directory:
  1. Look up max(timestamp) across all drawers with matching session_id
  2. Stream the jsonl, yielding only user/assistant messages after the cursor
  3. Write one small drawer per message with:
       session_id, uuid, timestamp, role, content
  4. Idempotent: re-running sweeps should find nothing new on a complete palace.

This test file is TDD — written BEFORE mempalace/sweeper.py exists.
"""

import json
import tempfile
from pathlib import Path

import pytest


@pytest.fixture
def mock_claude_jsonl(tmp_path):
    """Write a Claude Code-shaped transcript to disk and return its path.

    The file, ``session_abc.jsonl``, holds one JSON object per line: two
    user/assistant exchanges for session ``abc``, interleaved with a
    ``progress`` record and a ``file-history-snapshot`` record, neither of
    which carries a ``message`` payload.
    """

    def _turn(role, stamp, uuid, content):
        # Key order is fixed so the serialized lines stay stable.
        return {
            "type": role,
            "timestamp": stamp,
            "sessionId": "abc",
            "uuid": uuid,
            "message": {"role": role, "content": content},
        }

    def _text(body):
        # Assistant replies carry a list of typed content blocks.
        return [{"type": "text", "text": body}]

    records = [
        # Noise: progress event without a message payload.
        {"type": "progress", "timestamp": "2026-04-18T10:00:00Z",
         "sessionId": "abc", "uuid": "p-1"},
        # First user/assistant exchange.
        _turn("user", "2026-04-18T10:00:05Z", "u-1",
              "What's the capital of France?"),
        _turn("assistant", "2026-04-18T10:00:06Z", "a-1", _text("Paris.")),
        # Noise: file-history-snapshot.
        {"type": "file-history-snapshot", "messageId": "abc-snap"},
        # Second user/assistant exchange.
        _turn("user", "2026-04-18T10:01:00Z", "u-2", "And of Germany?"),
        _turn("assistant", "2026-04-18T10:01:01Z", "a-2", _text("Berlin.")),
    ]

    transcript = tmp_path / "session_abc.jsonl"
    # One JSON document per line, each newline-terminated.
    transcript.write_text("".join(json.dumps(rec) + "\n" for rec in records))
    return transcript


class TestSweeperParsing:
    """Checks on ``parse_claude_jsonl`` output for the mock transcript."""

    def test_parse_yields_only_user_and_assistant(self, mock_claude_jsonl):
        """Only user/assistant records are yielded, in file order."""
        from mempalace.sweeper import parse_claude_jsonl

        parsed = list(parse_claude_jsonl(str(mock_claude_jsonl)))
        roles = [rec["role"] for rec in parsed]
        assert roles == ["user", "assistant", "user", "assistant"], (
            f"Expected 4 user/assistant in order, got {roles}. "
            "Noise records (progress, file-history-snapshot) must be "
            "filtered out."
        )

    def test_parse_extracts_session_id_and_timestamp(self, mock_claude_jsonl):
        """The first record exposes session_id, timestamp and uuid."""
        from mempalace.sweeper import parse_claude_jsonl

        parsed = list(parse_claude_jsonl(str(mock_claude_jsonl)))
        head = parsed[0]
        assert head["session_id"] == "abc"
        assert head["timestamp"] == "2026-04-18T10:00:05Z"
        assert head["uuid"] == "u-1"

    def test_parse_normalizes_assistant_content_list_to_text(self, mock_claude_jsonl):
        """The assistant's list-of-blocks content contains the reply text."""
        from mempalace.sweeper import parse_claude_jsonl

        parsed = list(parse_claude_jsonl(str(mock_claude_jsonl)))
        # Index 1 is the first assistant reply in the mock transcript.
        reply = parsed[1]
        assert reply["role"] == "assistant"
        assert "Paris" in reply["content"], (
            "Assistant content blocks must be flattened to text; "
            f"got: {reply['content']!r}"
        )


class TestSweeperTandem:
    """Checks on ``sweep``'s reported ``drawers_added`` across repeated runs."""

    def test_sweep_empty_palace_ingests_all_messages(self, mock_claude_jsonl, tmp_path):
        """A first sweep into a fresh palace path reports all 4 messages."""
        from mempalace.sweeper import sweep

        palace_dir = str(tmp_path / "palace")
        outcome = sweep(str(mock_claude_jsonl), palace_dir)
        assert outcome["drawers_added"] == 4, (
            "Empty palace: all 4 user/assistant messages should ingest. "
            f"Got drawers_added={outcome['drawers_added']}."
        )

    def test_sweep_is_idempotent(self, mock_claude_jsonl, tmp_path):
        """Sweeping the same unchanged file twice must add nothing new."""
        from mempalace.sweeper import sweep

        transcript = str(mock_claude_jsonl)
        palace_dir = str(tmp_path / "palace")
        initial = sweep(transcript, palace_dir)
        repeat = sweep(transcript, palace_dir)
        assert initial["drawers_added"] == 4
        assert repeat["drawers_added"] == 0, (
            "Second sweep must be a no-op on unchanged data. "
            f"Got drawers_added={repeat['drawers_added']} — "
            "cursor logic is broken."
        )

    def test_sweep_resumes_from_cursor(self, tmp_path):
        """After the transcript grows, a second sweep reports only the
        newly appended messages."""
        from mempalace.sweeper import sweep

        def _turn(role, stamp, uuid, content):
            # Same record layout for every line of session "s1".
            return {
                "type": role,
                "timestamp": stamp,
                "sessionId": "s1",
                "uuid": uuid,
                "message": {"role": role, "content": content},
            }

        def _text(body):
            return [{"type": "text", "text": body}]

        initial_batch = [
            _turn("user", "2026-04-18T09:00:00Z", "u1", "first"),
            _turn("assistant", "2026-04-18T09:00:01Z", "a1", _text("one")),
        ]
        transcript = tmp_path / "session.jsonl"
        transcript.write_text(
            "".join(json.dumps(rec) + "\n" for rec in initial_batch)
        )

        palace_dir = str(tmp_path / "palace")
        initial = sweep(str(transcript), palace_dir)
        assert initial["drawers_added"] == 2

        # Append one more user/assistant exchange (two messages) with
        # later timestamps, simulating live session growth.
        appended_batch = [
            _turn("user", "2026-04-18T09:05:00Z", "u2", "second"),
            _turn("assistant", "2026-04-18T09:05:01Z", "a2", _text("two")),
        ]
        with transcript.open("a") as handle:
            handle.writelines(json.dumps(rec) + "\n" for rec in appended_batch)

        resumed = sweep(str(transcript), palace_dir)
        assert resumed["drawers_added"] == 2, (
            "Second sweep should pick up only the 2 new exchanges, "
            f"got {resumed['drawers_added']}. Cursor (max-timestamp) "
            "coordination is broken."
        )


class TestSweeperDrawerMetadata:
    """Every swept drawer must expose the metadata keys checked here:
    session_id, timestamp, message_uuid and role."""

    def test_drawer_has_session_id_and_timestamp_metadata(
            self, mock_claude_jsonl, tmp_path):
        """Sweep the mock transcript, then inspect each stored drawer's
        metadata dict."""
        from mempalace.sweeper import sweep
        from mempalace.palace import get_collection

        palace_dir = str(tmp_path / "palace")
        sweep(str(mock_claude_jsonl), palace_dir)

        # Read the metadata back through the palace collection.
        collection = get_collection(palace_dir, create=False)
        metas = collection.get(include=["metadatas"])["metadatas"]
        assert metas, "No drawers written"

        allowed_roles = ("user", "assistant")
        for meta in metas:
            assert meta.get("session_id") == "abc", (
                f"Drawer missing session_id metadata: {meta}"
            )
            assert meta.get("timestamp"), (
                f"Drawer missing timestamp metadata: {meta}"
            )
            assert meta.get("message_uuid"), (
                f"Drawer missing message_uuid metadata: {meta}"
            )
            assert meta.get("role") in allowed_roles, (
                f"Drawer missing or wrong role metadata: {meta}"
            )
Add tandem sweeper: message-level safety net for dropped transcripts 2026-04-18 07:51:10 -07:00			`"""TDD: tandem sweeper that catches what the primary miner missed.`

			`The primary miner (miner.py / convo_miner.py) runs at file granularity`
			`and can drop data (size caps, silent OSError, dedup false-positives).`
			`The sweeper is a second miner that works at MESSAGE granularity,`
			`using timestamp as the coordination cursor.`

			`For each session in the transcript directory:`
			`1. Look up max(timestamp) across all drawers with matching session_id`
			`2. Stream the jsonl, yielding only user/assistant messages after the cursor`
			`3. Write one small drawer per message with:`
			`session_id, uuid, timestamp, role, content`
			`4. Idempotent: re-running sweeps should find nothing new on a complete palace.`

			`This test file is TDD — written BEFORE mempalace/sweeper.py exists.`
			`"""`

			`import json`
			`import tempfile`
			`from pathlib import Path`

			`import pytest`


			`@pytest.fixture`
			`def mock_claude_jsonl(tmp_path):`
			`"""Real Claude Code jsonl shape: user/assistant records among progress noise."""`
			`path = tmp_path / "session_abc.jsonl"`
			`lines = [`
			`# Noise: progress event, no message`
			`{"type": "progress", "timestamp": "2026-04-18T10:00:00Z",`
			`"sessionId": "abc", "uuid": "p-1"},`
			`# User message`
			`{"type": "user", "timestamp": "2026-04-18T10:00:05Z",`
			`"sessionId": "abc", "uuid": "u-1",`
			`"message": {"role": "user", "content": "What's the capital of France?"}},`
			`# Assistant reply`
			`{"type": "assistant", "timestamp": "2026-04-18T10:00:06Z",`
			`"sessionId": "abc", "uuid": "a-1",`
			`"message": {"role": "assistant",`
			`"content": [{"type": "text", "text": "Paris."}]}},`
			`# Noise: file-history-snapshot`
			`{"type": "file-history-snapshot", "messageId": "abc-snap"},`
			`# Second user/assistant exchange`
			`{"type": "user", "timestamp": "2026-04-18T10:01:00Z",`
			`"sessionId": "abc", "uuid": "u-2",`
			`"message": {"role": "user", "content": "And of Germany?"}},`
			`{"type": "assistant", "timestamp": "2026-04-18T10:01:01Z",`
			`"sessionId": "abc", "uuid": "a-2",`
			`"message": {"role": "assistant",`
			`"content": [{"type": "text", "text": "Berlin."}]}},`
			`]`
			`path.write_text("\n".join(json.dumps(x) for x in lines) + "\n")`
			`return path`


			`class TestSweeperParsing:`
			`def test_parse_yields_only_user_and_assistant(self, mock_claude_jsonl):`
			`from mempalace.sweeper import parse_claude_jsonl`
			`records = list(parse_claude_jsonl(str(mock_claude_jsonl)))`
			`roles = [r["role"] for r in records]`
			`assert roles == ["user", "assistant", "user", "assistant"], (`
			`f"Expected 4 user/assistant in order, got {roles}. "`
			`"Noise records (progress, file-history-snapshot) must be "`
			`"filtered out."`
			`)`

			`def test_parse_extracts_session_id_and_timestamp(self, mock_claude_jsonl):`
			`from mempalace.sweeper import parse_claude_jsonl`
			`records = list(parse_claude_jsonl(str(mock_claude_jsonl)))`
			`first = records[0]`
			`assert first["session_id"] == "abc"`
			`assert first["timestamp"] == "2026-04-18T10:00:05Z"`
			`assert first["uuid"] == "u-1"`

			`def test_parse_normalizes_assistant_content_list_to_text(self, mock_claude_jsonl):`
			`from mempalace.sweeper import parse_claude_jsonl`
			`records = list(parse_claude_jsonl(str(mock_claude_jsonl)))`
			`assistant_rec = records[1]`
			`assert assistant_rec["role"] == "assistant"`
			`assert "Paris" in assistant_rec["content"], (`
			`f"Assistant content blocks must be flattened to text; "`
			`f"got: {assistant_rec['content']!r}"`
			`)`


			`class TestSweeperTandem:`
			`"""The sweeper coordinates with other miners via max(timestamp)."""`

			`def test_sweep_empty_palace_ingests_all_messages(self, mock_claude_jsonl, tmp_path):`
			`from mempalace.sweeper import sweep`
			`palace_path = str(tmp_path / "palace")`
			`result = sweep(str(mock_claude_jsonl), palace_path)`
			`assert result["drawers_added"] == 4, (`
			`f"Empty palace: all 4 user/assistant messages should ingest. "`
			`f"Got drawers_added={result['drawers_added']}."`
			`)`

			`def test_sweep_is_idempotent(self, mock_claude_jsonl, tmp_path):`
			`"""Running the sweep twice must not duplicate drawers."""`
			`from mempalace.sweeper import sweep`
			`palace_path = str(tmp_path / "palace")`
			`first = sweep(str(mock_claude_jsonl), palace_path)`
			`second = sweep(str(mock_claude_jsonl), palace_path)`
			`assert first["drawers_added"] == 4`
			`assert second["drawers_added"] == 0, (`
			`f"Second sweep must be a no-op on unchanged data. "`
			`f"Got drawers_added={second['drawers_added']} — "`
			`"cursor logic is broken."`
			`)`

			`def test_sweep_resumes_from_cursor(self, tmp_path):`
			`"""If half the messages are already in the palace, sweep picks up`
			`only the later half."""`
			`from mempalace.sweeper import sweep`

			`jsonl_path = tmp_path / "session.jsonl"`
			`lines = [`
			`{"type": "user", "timestamp": "2026-04-18T09:00:00Z",`
			`"sessionId": "s1", "uuid": "u1",`
			`"message": {"role": "user", "content": "first"}},`
			`{"type": "assistant", "timestamp": "2026-04-18T09:00:01Z",`
			`"sessionId": "s1", "uuid": "a1",`
			`"message": {"role": "assistant",`
			`"content": [{"type": "text", "text": "one"}]}},`
			`]`
			`jsonl_path.write_text("\n".join(json.dumps(x) for x in lines) + "\n")`

			`palace_path = str(tmp_path / "palace")`
			`first = sweep(str(jsonl_path), palace_path)`
			`assert first["drawers_added"] == 2`

			`# Append two more exchanges simulating live session growth.`
			`more_lines = [`
			`{"type": "user", "timestamp": "2026-04-18T09:05:00Z",`
			`"sessionId": "s1", "uuid": "u2",`
			`"message": {"role": "user", "content": "second"}},`
			`{"type": "assistant", "timestamp": "2026-04-18T09:05:01Z",`
			`"sessionId": "s1", "uuid": "a2",`
			`"message": {"role": "assistant",`
			`"content": [{"type": "text", "text": "two"}]}},`
			`]`
			`with open(jsonl_path, "a") as f:`
			`for x in more_lines:`
			`f.write(json.dumps(x) + "\n")`

			`second = sweep(str(jsonl_path), palace_path)`
			`assert second["drawers_added"] == 2, (`
			`f"Second sweep should pick up only the 2 new exchanges, "`
			`f"got {second['drawers_added']}. Cursor (max-timestamp) "`
			`"coordination is broken."`
			`)`


			`class TestSweeperDrawerMetadata:`
			`"""Each drawer must carry the metadata the tandem-miner coordination`
			`depends on: session_id, timestamp, uuid, role."""`

			`def test_drawer_has_session_id_and_timestamp_metadata(`
			`self, mock_claude_jsonl, tmp_path):`
			`from mempalace.sweeper import sweep`
			`from mempalace.palace import get_collection`

			`palace_path = str(tmp_path / "palace")`
			`sweep(str(mock_claude_jsonl), palace_path)`

			`col = get_collection(palace_path, create=False)`
			`data = col.get(include=["metadatas"])`
			`metas = data["metadatas"]`
			`assert metas, "No drawers written"`

			`for m in metas:`
			`assert m.get("session_id") == "abc", (`
			`f"Drawer missing session_id metadata: {m}"`
			`)`
			`assert m.get("timestamp"), (`
			`f"Drawer missing timestamp metadata: {m}"`
			`)`
			`assert m.get("message_uuid"), (`
			`f"Drawer missing message_uuid metadata: {m}"`
			`)`
			`assert m.get("role") in ("user", "assistant"), (`
			`f"Drawer missing or wrong role metadata: {m}"`
			`)`