diff --git a/mempalace/normalize.py b/mempalace/normalize.py index 256a5e9..f2b8173 100644 --- a/mempalace/normalize.py +++ b/mempalace/normalize.py @@ -22,20 +22,40 @@ from typing import Optional # ─── Noise stripping ───────────────────────────────────────────────────── -# Claude Code and other tools inject system tags, hook output, UI chrome, -# and tool-call JSON into transcripts. These waste drawer space and pollute -# search results. Strip them before filing. +# Claude Code and other tools inject system tags, hook output, and UI chrome +# into transcripts. These waste drawer space and pollute search results. +# +# Verbatim is sacred — every pattern here is anchored to line boundaries and +# refuses to cross blank lines, so a stray unclosed tag in one message can +# never eat content from neighboring messages. When in doubt, leave text +# alone. -_NOISE_TAG_PATTERNS = [ - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), - re.compile(r"]*>.*?", re.DOTALL), -] +_NOISE_TAGS = ( + "system-reminder", + "command-message", + "command-name", + "task-notification", + "user-prompt-submit-hook", + "hook_output", +) -_NOISE_STRINGS = [ + +def _tag_pattern(name: str) -> "re.Pattern[str]": + # Opening tag must begin a line (optionally after a `> ` blockquote marker, + # since _messages_to_transcript prefixes lines with `> `). Body is lazy but + # forbidden from crossing a blank line, so a dangling open tag can't span + # multiple messages. Closing tag eats optional trailing whitespace + newline. + return re.compile( + rf"(?m)^(?:> )?<{name}(?:\s[^>]*)?>" rf"(?:(?!\n\s*\n)[\s\S])*?" rf"[ \t]*\n?" + ) + + +_NOISE_TAG_PATTERNS = [_tag_pattern(t) for t in _NOISE_TAGS] + +# Strings that identify an entire noise line when found at its start. +# Matched case-sensitively and anchored to line-start so user prose mentioning +# e.g. "current time:" in a sentence is untouched. +_NOISE_LINE_PREFIXES = ( "CURRENT TIME:", "VERIFIED FACTS (do not contradict)", "AGENT SPECIALIZATION:", @@ -46,20 +66,39 @@ _NOISE_STRINGS = [ "Auto-save reminder...", "Checking pipeline...", "MemPalace auto-save checkpoint.", +) + +_NOISE_LINE_PATTERNS = [ + re.compile(rf"(?m)^(?:> )?{re.escape(p)}.*\n?") for p in _NOISE_LINE_PREFIXES ] +# Claude Code TUI hook-run chrome, e.g. "Ran 2 Stop hook", "Ran 1 PreCompact hook". +# Line-anchored, case-sensitive, explicit hook names — prose like +# "our CI has a stop hook" stays intact. +_HOOK_LINE_RE = re.compile( + r"(?m)^(?:> )?Ran \d+ (?:Stop|PreCompact|PreToolUse|PostToolUse|UserPromptSubmit|Notification|SessionStart|SessionEnd) hook[s]?.*\n?" +) + +# "… +N lines" collapsed-output marker, line-anchored. +_COLLAPSED_LINES_RE = re.compile(r"(?m)^(?:> )?…\s*\+\d+ lines.*\n?") + def strip_noise(text: str) -> str: - """Remove system tags, hook output, and Claude Code UI chrome from text.""" + """Remove system tags, hook output, and Claude Code UI chrome from text. + + All patterns are line-anchored. User prose that happens to mention these + strings inline (e.g., documenting them) is preserved verbatim. + """ for pat in _NOISE_TAG_PATTERNS: text = pat.sub("", text) - for noise in _NOISE_STRINGS: - text = text.replace(noise, "") - # Strip Claude Code UI chrome - text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text) - text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE) - text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text) - # Collapse runs of blank lines + for pat in _NOISE_LINE_PATTERNS: + text = pat.sub("", text) + text = _HOOK_LINE_RE.sub("", text) + text = _COLLAPSED_LINES_RE.sub("", text) + # Strip the Claude Code collapsed-output chrome "[N tokens] (ctrl+o to expand)". + # Narrow shape — a bare "(ctrl+o to expand)" in user prose stays intact. + text = re.sub(r"\s*\[\d+\s+tokens?\]\s*\(ctrl\+o to expand\)", "", text) + # Collapse runs of blank lines created by the removals text = re.sub(r"\n{4,}", "\n\n\n", text) return text.strip() @@ -84,23 +123,21 @@ def normalize(filepath: str) -> str: if not content.strip(): return content - # Already has > markers — pass through (strip noise but preserve trailing newline) + # Already has > markers — pass through unchanged. lines = content.split("\n") if sum(1 for line in lines if line.strip().startswith(">")) >= 3: - cleaned = strip_noise(content) - # Preserve trailing newline if original had one - if content.endswith("\n") and not cleaned.endswith("\n"): - cleaned += "\n" - return cleaned + return content - # Try JSON normalization + # Try JSON normalization. strip_noise is applied inside the Claude Code + # JSONL parser (the only format that injects system tags/hook chrome); + # other formats pass through verbatim. ext = Path(filepath).suffix.lower() if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["): normalized = _try_normalize_json(content) if normalized: - return strip_noise(normalized) + return normalized - return strip_noise(content) + return content def _try_normalize_json(content: str) -> Optional[str]: @@ -160,6 +197,10 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]: isinstance(b, dict) and b.get("type") == "tool_result" for b in msg_content ) text = _extract_content(msg_content, tool_use_map=tool_use_map) + # Strip Claude Code system-injected noise per message, never across + # message boundaries — prevents span-eating. + if text: + text = strip_noise(text) if text: if is_tool_only and messages and messages[-1][0] == "assistant": # Append tool results to the previous assistant message @@ -169,6 +210,8 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]: messages.append(("user", text)) elif msg_type == "assistant": text = _extract_content(msg_content, tool_use_map=tool_use_map) + if text: + text = strip_noise(text) if text: # If previous message is also assistant (multi-turn tool loop), # merge into the same assistant turn diff --git a/tests/test_normalize.py b/tests/test_normalize.py index 7f0652a..53fc933 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -13,6 +13,7 @@ from mempalace.normalize import ( _try_normalize_json, _try_slack_json, normalize, + strip_noise, ) @@ -1048,3 +1049,148 @@ def test_normalize_rejects_large_file(): assert False, "Should have raised IOError" except IOError as e: assert "too large" in str(e).lower() + + +# ── strip_noise() — verbatim-safety boundary tests ───────────────────── +# +# The "Verbatim always" design principle requires that we never delete +# user-authored text. These tests pin down the boundary between system +# noise (which we strip) and user prose that happens to mention the same +# strings (which must survive untouched). + + +class TestStripNoisePreservesUserContent: + """User prose that mentions noise strings inline must be preserved.""" + + def test_user_discusses_stop_hook_in_prose(self): + # Regression: original regex with IGNORECASE + `.*\n?` ate the second + # sentence from real user commentary. + text = ( + "> User:\n" + "> Our CI has a stop hook that rejects merges after 5pm. " + "Ran 2 stop hooks last week.\n" + "> Assistant:\n" + "> Got it." + ) + assert strip_noise(text) == text.strip() + + def test_user_mentions_system_reminder_inline(self): + # Inline tags inside user prose (e.g. documenting + # Claude Code behavior) must not be stripped. + text = ( + "> User:\n" + "> Here is what Claude Code emits: " + "Auto-save reminder..." + " — I want to ignore it." + ) + assert strip_noise(text) == text.strip() + + def test_ctrl_o_hint_in_prose_preserved(self): + # Regression: original `.*\(ctrl\+o to expand\).*\n?` nuked the whole + # line whenever a user documented the TUI shortcut. + text = ( + "> User:\n" + "> In the TUI you hit (ctrl+o to expand) to see more. " + "That is the shortcut I want to document." + ) + assert strip_noise(text) == text.strip() + + def test_current_time_inline_in_prose(self): + text = "> User:\n> At CURRENT TIME: the meeting starts, not before." + assert strip_noise(text) == text.strip() + + def test_plus_n_lines_marker_inline(self): + text = "> User:\n> The log showed … +50 lines of stack trace, useful." + assert strip_noise(text) == text.strip() + + def test_dangling_open_tag_does_not_span_messages(self): + # THE span-eating bug: a stray unclosed in one + # message must NOT merge with a closing tag in another message and + # silently delete everything in between. + text = ( + "> User 1: normal content A\n" + "> Assistant: reply\n" + "> User 2: more content tail" + ) + out = strip_noise(text) + assert "Assistant: reply" in out + assert "User 2: more content" in out + assert "User 1: normal content" in out + + +class TestStripNoiseRemovesSystemChrome: + """System-injected noise with standalone/line-anchored shape must be stripped.""" + + def test_strips_line_anchored_system_reminder_block(self): + text = ( + "> User:\n" + "\n" + "Auto-save reminder...\n" + "\n" + "> Real message." + ) + out = strip_noise(text) + assert "system-reminder" not in out + assert "Auto-save reminder" not in out + assert "Real message." in out + + def test_strips_system_reminder_with_blockquote_prefix(self): + # _messages_to_transcript prefixes lines with "> ", so the line + # anchor must also accept that shape. + text = "> User:\n" "> Injected noise\n" "> Real message." + out = strip_noise(text) + assert "Injected noise" not in out + assert "Real message." in out + + def test_strips_standalone_ran_hook_line(self): + text = "Ran 2 Stop hook\n> User: real content" + out = strip_noise(text) + assert "Ran 2 Stop hook" not in out + assert "real content" in out + + def test_strips_known_hook_names(self): + for hook in ("Stop", "PreCompact", "PreToolUse", "PostToolUse", "UserPromptSubmit"): + text = f"Ran 1 {hook} hook\n> User: content" + assert hook not in strip_noise(text) + + def test_strips_current_time_standalone(self): + text = "CURRENT TIME: 2026-04-13 10:00 UTC\n> User: Hello" + out = strip_noise(text) + assert "CURRENT TIME" not in out + assert "Hello" in out + + def test_strips_collapsed_lines_marker(self): + text = "… +42 lines\n> User: Hello" + out = strip_noise(text) + assert "+42 lines" not in out + assert "Hello" in out + + def test_strips_token_count_ctrl_o_chrome(self): + # Claude Code's actual collapsed-output chrome: "[N tokens] (ctrl+o to expand)" + text = "> Assistant: some output [5 tokens] (ctrl+o to expand)\n> User: ok" + out = strip_noise(text) + assert "(ctrl+o to expand)" not in out + assert "[5 tokens]" not in out + assert "some output" in out + + def test_strips_each_known_noise_tag(self): + for tag in ( + "system-reminder", + "command-message", + "command-name", + "task-notification", + "user-prompt-submit-hook", + "hook_output", + ): + text = f"> User:\n<{tag}>junk\n> Real." + out = strip_noise(text) + assert tag not in out, f"{tag} leaked into output" + assert "Real." in out + + def test_collapses_excessive_blank_lines(self): + text = "line one\n\n\n\n\n\nline two" + out = strip_noise(text) + assert "line one" in out + assert "line two" in out + # Should collapse to no more than 3 newlines + assert "\n\n\n\n" not in out