Harden sweeper for production: verbatim tool blocks, full session_id, logged failures

Four changes on top of the proposal's initial sweeper draft, driven by
the CLAUDE.md design principles:

1. Drop the 500-char truncation on tool_use / tool_result content in
   _flatten_content. The "verbatim always" principle forbids lossy
   compression of user-adjacent data; a long code-edit diff handed to
   the assistant must round-trip intact. Unknown block types now also
   serialize their full payload instead of just a type marker. New test
   test_parse_preserves_tool_blocks_verbatim covers a 5000-char input.

2. Use the full session_id in drawer IDs (not session_id[:12]). Rules
   out cross-session collisions if a transcript source ever uses
   non-UUID session identifiers or shared prefixes.

3. Replace silent `except Exception: return None` in get_palace_cursor
   with a logger.warning — the exact anti-pattern this PR otherwise
   criticizes in miner.py. The fallback behavior is still safe
   (deterministic IDs make a missed cursor recover on the next run),
   but the failure is now discoverable.

4. sweep_directory now collects per-file failures into the result dict
   and the CLI exits non-zero when any file failed, so a partial-sweep
   outcome is visible rather than swallowed.

Co-Authored-By: MSL <232237854+milla-jovovich@users.noreply.github.com>
This commit is contained in:
Igor Lins e Silva
2026-04-18 12:58:33 -03:00
parent fed69935d3
commit 29ce7c7135
4 changed files with 174 additions and 70 deletions
+7
View File
@@ -170,6 +170,13 @@ def cmd_sweep(args):
f"+{result['drawers_added']} drawers, " f"+{result['drawers_added']} drawers, "
f"{result['drawers_skipped']} already present." f"{result['drawers_skipped']} already present."
) )
failures = result.get("failures") or []
if failures:
print(
f"{len(failures)} file(s) failed to sweep — see stderr / logs for details.",
file=sys.stderr,
)
sys.exit(2)
else: else:
print(f" ✗ Not a file or directory: {target}", file=sys.stderr) print(f" ✗ Not a file or directory: {target}", file=sys.stderr)
sys.exit(1) sys.exit(1)
+44 -17
View File
@@ -26,6 +26,7 @@ Usage:
from __future__ import annotations from __future__ import annotations
import json import json
import logging
import sys import sys
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@@ -33,16 +34,20 @@ from typing import Iterator, Optional
from .palace import get_collection from .palace import get_collection
logger = logging.getLogger(__name__)
# ── JSONL parsing ──────────────────────────────────────────────────── # ── JSONL parsing ────────────────────────────────────────────────────
def _flatten_content(content) -> str: def _flatten_content(content) -> str:
"""Normalize Claude Code's message content to a plain string. """Normalize Claude Code's message content to a plain string.
User messages are strings already; assistant messages are a list of User messages are strings already; assistant messages are a list of
content blocks like [{"type": "text", "text": "..."}, {"type": content blocks like [{"type": "text", "text": "..."}, {"type":
"tool_use", ...}]. We keep text blocks verbatim and describe non-text "tool_use", ...}]. All blocks are preserved verbatim — the design
blocks as a marker so the drawer carries a faithful record. principle is "verbatim always", so tool inputs and results are
serialized in full, never truncated.
""" """
if isinstance(content, str): if isinstance(content, str):
return content return content
@@ -57,14 +62,12 @@ def _flatten_content(content) -> str:
elif btype == "tool_use": elif btype == "tool_use":
parts.append( parts.append(
f"[tool_use: {block.get('name', '?')} " f"[tool_use: {block.get('name', '?')} "
f"input={json.dumps(block.get('input', {}), default=str)[:500]}]" f"input={json.dumps(block.get('input', {}), default=str)}]"
) )
elif btype == "tool_result": elif btype == "tool_result":
parts.append( parts.append(f"[tool_result: {json.dumps(block.get('content', ''), default=str)}]")
f"[tool_result: {json.dumps(block.get('content', ''), default=str)[:500]}]"
)
else: else:
parts.append(f"[{btype}]") parts.append(f"[{btype}: {json.dumps(block, default=str)}]")
return "\n".join(p for p in parts if p) return "\n".join(p for p in parts if p)
return str(content) return str(content)
@@ -127,19 +130,32 @@ def parse_claude_jsonl(path: str) -> Iterator[dict]:
# ── Cursor resolution ──────────────────────────────────────────────── # ── Cursor resolution ────────────────────────────────────────────────
def get_palace_cursor(collection, session_id: str) -> Optional[str]: def get_palace_cursor(collection, session_id: str) -> Optional[str]:
"""Return the max timestamp of drawers for this session_id, or None. """Return the max timestamp of drawers for this session_id, or None.
ISO-8601 strings compare lexically in the right order, so we don't ISO-8601 strings compare lexically in the right order, so we don't
need to parse them. Query scans metadatas for the session (ChromaDB need to parse them. Query scans metadatas for the session via the
where-filter), then reduces. backend's where-filter, then reduces.
Backend errors are logged at WARNING and surface as a `None` cursor —
which makes the caller treat the session as empty and ingest every
message. That's intentional: a no-cursor sweep is recovered from on
the next run by deterministic drawer IDs, so a degraded cursor never
causes silent data loss.
""" """
try: try:
data = collection.get( data = collection.get(
where={"session_id": session_id}, where={"session_id": session_id},
include=["metadatas"], include=["metadatas"],
) )
except Exception: except Exception as exc:
logger.warning(
"sweeper: cursor lookup failed for session_id=%s (%s); "
"treating as empty — drawers will be re-upserted idempotently.",
session_id,
exc,
)
return None return None
metas = data.get("metadatas") or [] metas = data.get("metadatas") or []
timestamps = [m.get("timestamp") for m in metas if m and m.get("timestamp")] timestamps = [m.get("timestamp") for m in metas if m and m.get("timestamp")]
@@ -150,13 +166,18 @@ def get_palace_cursor(collection, session_id: str) -> Optional[str]:
# ── Sweep ──────────────────────────────────────────────────────────── # ── Sweep ────────────────────────────────────────────────────────────
def _drawer_id_for_message(session_id: str, message_uuid: str) -> str: def _drawer_id_for_message(session_id: str, message_uuid: str) -> str:
"""Deterministic drawer ID so upserts at the same message are no-ops.""" """Deterministic drawer ID so upserts at the same message are no-ops.
return f"sweep_{session_id[:12]}_{message_uuid}"
Uses the full session_id (not a prefix) to avoid any cross-session
collision risk if a transcript source ever uses non-UUID session
identifiers or shares prefixes across sessions.
"""
return f"sweep_{session_id}_{message_uuid}"
def sweep(jsonl_path: str, palace_path: str, def sweep(jsonl_path: str, palace_path: str, source_label: Optional[str] = None) -> dict:
source_label: Optional[str] = None) -> dict:
"""Ingest every user/assistant message not already represented. """Ingest every user/assistant message not already represented.
For each message in the jsonl: For each message in the jsonl:
@@ -241,23 +262,29 @@ def sweep_directory(dir_path: str, palace_path: str) -> dict:
total_skipped = 0 total_skipped = 0
per_file = [] per_file = []
failures: list[dict] = []
for f in files: for f in files:
try: try:
result = sweep(str(f), palace_path, source_label=str(f)) result = sweep(str(f), palace_path, source_label=str(f))
except Exception as exc: except Exception as exc:
print(f" sweep failed on {f}: {exc}", file=sys.stderr) logger.error("sweeper: sweep failed on %s: %s", f, exc)
print(f" \u26a0 sweep failed on {f}: {exc}", file=sys.stderr)
failures.append({"file": str(f), "error": str(exc)})
continue continue
total_added += result["drawers_added"] total_added += result["drawers_added"]
total_skipped += result["drawers_skipped"] total_skipped += result["drawers_skipped"]
per_file.append({ per_file.append(
{
"file": str(f), "file": str(f),
"added": result["drawers_added"], "added": result["drawers_added"],
"skipped": result["drawers_skipped"], "skipped": result["drawers_skipped"],
}) }
)
return { return {
"files_processed": len(per_file), "files_processed": len(per_file),
"drawers_added": total_added, "drawers_added": total_added,
"drawers_skipped": total_skipped, "drawers_skipped": total_skipped,
"per_file": per_file, "per_file": per_file,
"failures": failures,
} }
+2
View File
@@ -108,9 +108,11 @@ class TestJsonlNotSilentlySkipped:
def fake_stat(self, *args, **kwargs): def fake_stat(self, *args, **kwargs):
result = real_stat(self, *args, **kwargs) result = real_stat(self, *args, **kwargs)
if self.name == "big_transcript.jsonl": if self.name == "big_transcript.jsonl":
class _FakeStat: class _FakeStat:
st_size = fake_size st_size = fake_size
st_mode = result.st_mode st_mode = result.st_mode
return _FakeStat() return _FakeStat()
return result return result
+117 -49
View File
@@ -16,8 +16,6 @@ This test file is TDD — written BEFORE mempalace/sweeper.py exists.
""" """
import json import json
import tempfile
from pathlib import Path
import pytest import pytest
@@ -28,27 +26,45 @@ def mock_claude_jsonl(tmp_path):
path = tmp_path / "session_abc.jsonl" path = tmp_path / "session_abc.jsonl"
lines = [ lines = [
# Noise: progress event, no message # Noise: progress event, no message
{"type": "progress", "timestamp": "2026-04-18T10:00:00Z", {
"sessionId": "abc", "uuid": "p-1"}, "type": "progress",
"timestamp": "2026-04-18T10:00:00Z",
"sessionId": "abc",
"uuid": "p-1",
},
# User message # User message
{"type": "user", "timestamp": "2026-04-18T10:00:05Z", {
"sessionId": "abc", "uuid": "u-1", "type": "user",
"message": {"role": "user", "content": "What's the capital of France?"}}, "timestamp": "2026-04-18T10:00:05Z",
"sessionId": "abc",
"uuid": "u-1",
"message": {"role": "user", "content": "What's the capital of France?"},
},
# Assistant reply # Assistant reply
{"type": "assistant", "timestamp": "2026-04-18T10:00:06Z", {
"sessionId": "abc", "uuid": "a-1", "type": "assistant",
"message": {"role": "assistant", "timestamp": "2026-04-18T10:00:06Z",
"content": [{"type": "text", "text": "Paris."}]}}, "sessionId": "abc",
"uuid": "a-1",
"message": {"role": "assistant", "content": [{"type": "text", "text": "Paris."}]},
},
# Noise: file-history-snapshot # Noise: file-history-snapshot
{"type": "file-history-snapshot", "messageId": "abc-snap"}, {"type": "file-history-snapshot", "messageId": "abc-snap"},
# Second user/assistant exchange # Second user/assistant exchange
{"type": "user", "timestamp": "2026-04-18T10:01:00Z", {
"sessionId": "abc", "uuid": "u-2", "type": "user",
"message": {"role": "user", "content": "And of Germany?"}}, "timestamp": "2026-04-18T10:01:00Z",
{"type": "assistant", "timestamp": "2026-04-18T10:01:01Z", "sessionId": "abc",
"sessionId": "abc", "uuid": "a-2", "uuid": "u-2",
"message": {"role": "assistant", "message": {"role": "user", "content": "And of Germany?"},
"content": [{"type": "text", "text": "Berlin."}]}}, },
{
"type": "assistant",
"timestamp": "2026-04-18T10:01:01Z",
"sessionId": "abc",
"uuid": "a-2",
"message": {"role": "assistant", "content": [{"type": "text", "text": "Berlin."}]},
},
] ]
path.write_text("\n".join(json.dumps(x) for x in lines) + "\n") path.write_text("\n".join(json.dumps(x) for x in lines) + "\n")
return path return path
@@ -57,6 +73,7 @@ def mock_claude_jsonl(tmp_path):
class TestSweeperParsing: class TestSweeperParsing:
def test_parse_yields_only_user_and_assistant(self, mock_claude_jsonl): def test_parse_yields_only_user_and_assistant(self, mock_claude_jsonl):
from mempalace.sweeper import parse_claude_jsonl from mempalace.sweeper import parse_claude_jsonl
records = list(parse_claude_jsonl(str(mock_claude_jsonl))) records = list(parse_claude_jsonl(str(mock_claude_jsonl)))
roles = [r["role"] for r in records] roles = [r["role"] for r in records]
assert roles == ["user", "assistant", "user", "assistant"], ( assert roles == ["user", "assistant", "user", "assistant"], (
@@ -67,6 +84,7 @@ class TestSweeperParsing:
def test_parse_extracts_session_id_and_timestamp(self, mock_claude_jsonl): def test_parse_extracts_session_id_and_timestamp(self, mock_claude_jsonl):
from mempalace.sweeper import parse_claude_jsonl from mempalace.sweeper import parse_claude_jsonl
records = list(parse_claude_jsonl(str(mock_claude_jsonl))) records = list(parse_claude_jsonl(str(mock_claude_jsonl)))
first = records[0] first = records[0]
assert first["session_id"] == "abc" assert first["session_id"] == "abc"
@@ -75,12 +93,52 @@ class TestSweeperParsing:
def test_parse_normalizes_assistant_content_list_to_text(self, mock_claude_jsonl): def test_parse_normalizes_assistant_content_list_to_text(self, mock_claude_jsonl):
from mempalace.sweeper import parse_claude_jsonl from mempalace.sweeper import parse_claude_jsonl
records = list(parse_claude_jsonl(str(mock_claude_jsonl))) records = list(parse_claude_jsonl(str(mock_claude_jsonl)))
assistant_rec = records[1] assistant_rec = records[1]
assert assistant_rec["role"] == "assistant" assert assistant_rec["role"] == "assistant"
assert "Paris" in assistant_rec["content"], ( assert (
f"Assistant content blocks must be flattened to text; " "Paris" in assistant_rec["content"]
f"got: {assistant_rec['content']!r}" ), f"Assistant content blocks must be flattened to text; got: {assistant_rec['content']!r}"
def test_parse_preserves_tool_blocks_verbatim(self, tmp_path):
"""Per the design principle "verbatim always", tool_use and
tool_result blocks must NOT be truncated. A long tool input
(e.g. a large diff handed to a code-edit tool) must round-trip
in full, otherwise we silently lose user-adjacent data.
"""
import json as _json
from mempalace.sweeper import parse_claude_jsonl
big_input = {"diff": "x" * 5000} # well past the old 500-char cap
path = tmp_path / "session_tools.jsonl"
path.write_text(
_json.dumps(
{
"type": "assistant",
"timestamp": "2026-04-18T10:00:00Z",
"sessionId": "tools-1",
"uuid": "a-tool",
"message": {
"role": "assistant",
"content": [
{"type": "tool_use", "name": "Edit", "input": big_input},
],
},
}
)
+ "\n"
)
records = list(parse_claude_jsonl(str(path)))
assert len(records) == 1
content = records[0]["content"]
# The full 5000-char value must be present — no truncation marker,
# no [:500] slice. Look for the raw string in the serialized form.
assert big_input["diff"] in content, (
"tool_use input was truncated. The verbatim guarantee requires "
f"the full payload to round-trip. Got len={len(content)}."
) )
@@ -89,6 +147,7 @@ class TestSweeperTandem:
def test_sweep_empty_palace_ingests_all_messages(self, mock_claude_jsonl, tmp_path): def test_sweep_empty_palace_ingests_all_messages(self, mock_claude_jsonl, tmp_path):
from mempalace.sweeper import sweep from mempalace.sweeper import sweep
palace_path = str(tmp_path / "palace") palace_path = str(tmp_path / "palace")
result = sweep(str(mock_claude_jsonl), palace_path) result = sweep(str(mock_claude_jsonl), palace_path)
assert result["drawers_added"] == 4, ( assert result["drawers_added"] == 4, (
@@ -99,6 +158,7 @@ class TestSweeperTandem:
def test_sweep_is_idempotent(self, mock_claude_jsonl, tmp_path): def test_sweep_is_idempotent(self, mock_claude_jsonl, tmp_path):
"""Running the sweep twice must not duplicate drawers.""" """Running the sweep twice must not duplicate drawers."""
from mempalace.sweeper import sweep from mempalace.sweeper import sweep
palace_path = str(tmp_path / "palace") palace_path = str(tmp_path / "palace")
first = sweep(str(mock_claude_jsonl), palace_path) first = sweep(str(mock_claude_jsonl), palace_path)
second = sweep(str(mock_claude_jsonl), palace_path) second = sweep(str(mock_claude_jsonl), palace_path)
@@ -116,13 +176,20 @@ class TestSweeperTandem:
jsonl_path = tmp_path / "session.jsonl" jsonl_path = tmp_path / "session.jsonl"
lines = [ lines = [
{"type": "user", "timestamp": "2026-04-18T09:00:00Z", {
"sessionId": "s1", "uuid": "u1", "type": "user",
"message": {"role": "user", "content": "first"}}, "timestamp": "2026-04-18T09:00:00Z",
{"type": "assistant", "timestamp": "2026-04-18T09:00:01Z", "sessionId": "s1",
"sessionId": "s1", "uuid": "a1", "uuid": "u1",
"message": {"role": "assistant", "message": {"role": "user", "content": "first"},
"content": [{"type": "text", "text": "one"}]}}, },
{
"type": "assistant",
"timestamp": "2026-04-18T09:00:01Z",
"sessionId": "s1",
"uuid": "a1",
"message": {"role": "assistant", "content": [{"type": "text", "text": "one"}]},
},
] ]
jsonl_path.write_text("\n".join(json.dumps(x) for x in lines) + "\n") jsonl_path.write_text("\n".join(json.dumps(x) for x in lines) + "\n")
@@ -132,13 +199,20 @@ class TestSweeperTandem:
# Append two more exchanges simulating live session growth. # Append two more exchanges simulating live session growth.
more_lines = [ more_lines = [
{"type": "user", "timestamp": "2026-04-18T09:05:00Z", {
"sessionId": "s1", "uuid": "u2", "type": "user",
"message": {"role": "user", "content": "second"}}, "timestamp": "2026-04-18T09:05:00Z",
{"type": "assistant", "timestamp": "2026-04-18T09:05:01Z", "sessionId": "s1",
"sessionId": "s1", "uuid": "a2", "uuid": "u2",
"message": {"role": "assistant", "message": {"role": "user", "content": "second"},
"content": [{"type": "text", "text": "two"}]}}, },
{
"type": "assistant",
"timestamp": "2026-04-18T09:05:01Z",
"sessionId": "s1",
"uuid": "a2",
"message": {"role": "assistant", "content": [{"type": "text", "text": "two"}]},
},
] ]
with open(jsonl_path, "a") as f: with open(jsonl_path, "a") as f:
for x in more_lines: for x in more_lines:
@@ -156,8 +230,7 @@ class TestSweeperDrawerMetadata:
"""Each drawer must carry the metadata the tandem-miner coordination """Each drawer must carry the metadata the tandem-miner coordination
depends on: session_id, timestamp, uuid, role.""" depends on: session_id, timestamp, uuid, role."""
def test_drawer_has_session_id_and_timestamp_metadata( def test_drawer_has_session_id_and_timestamp_metadata(self, mock_claude_jsonl, tmp_path):
self, mock_claude_jsonl, tmp_path):
from mempalace.sweeper import sweep from mempalace.sweeper import sweep
from mempalace.palace import get_collection from mempalace.palace import get_collection
@@ -170,15 +243,10 @@ class TestSweeperDrawerMetadata:
assert metas, "No drawers written" assert metas, "No drawers written"
for m in metas: for m in metas:
assert m.get("session_id") == "abc", ( assert m.get("session_id") == "abc", f"Drawer missing session_id metadata: {m}"
f"Drawer missing session_id metadata: {m}" assert m.get("timestamp"), f"Drawer missing timestamp metadata: {m}"
) assert m.get("message_uuid"), f"Drawer missing message_uuid metadata: {m}"
assert m.get("timestamp"), ( assert m.get("role") in (
f"Drawer missing timestamp metadata: {m}" "user",
) "assistant",
assert m.get("message_uuid"), ( ), f"Drawer missing or wrong role metadata: {m}"
f"Drawer missing message_uuid metadata: {m}"
)
assert m.get("role") in ("user", "assistant"), (
f"Drawer missing or wrong role metadata: {m}"
)