From ca2598a9f69247429c367217eaf167c9d9c824da Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Mon, 13 Apr 2026 16:11:03 -0300
Subject: [PATCH] fix(normalize): make strip_noise verbatim-safe and scope it
to Claude Code JSONL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The initial strip_noise() regressed on three fronts when audited against
adversarial user content — each verified with executable repros against
the cherry-picked code:
1. `<system-reminder>.*?</system-reminder>` with re.DOTALL span-ate across
messages: one stray unclosed `<system-reminder>` anywhere in a session
merged with the next closing tag, silently deleting everything between
them (including full assistant replies).
2. `.*\(ctrl\+o to expand\).*\n?` nuked entire lines of user prose
whenever a user happened to document the TUI shortcut.
3. `Ran \d+ (?:stop|pre|post)\s*hook.*` with IGNORECASE ate the
second sentence from "our CI has a stop hook ... Ran 2 stop hooks
last week" — legitimate user commentary.
These are unambiguous violations of the project's "Verbatim always"
design principle.
Fixes:
- All tag patterns are now line-anchored (`(?m)^(?:> )?`) and their
body forbids crossing a blank line (`(?:(?!\n\s*\n)[\s\S])*?`), so a
dangling open tag cannot eat neighboring messages.
- `_NOISE_LINE_PREFIXES` are line-anchored and case-sensitive — user
prose mentioning "CURRENT TIME:" mid-sentence is preserved.
- Hook-run chrome requires `(?m)^`, explicit hook names (Stop,
PreCompact, PreToolUse, etc.), and no IGNORECASE.
- "… +N lines" is line-anchored.
- "(ctrl+o to expand)" only matches Claude Code's actual collapsed-
output chrome shape `[N tokens] (ctrl+o to expand)`; a bare
parenthetical in user prose stays intact.
Scope:
- `strip_noise()` is no longer called on every normalization path.
Only `_try_claude_code_jsonl` invokes it, per-extracted-message — so
Claude.ai exports, ChatGPT exports, Slack JSON, Codex JSONL, and
plain text with `>` markers pass through fully verbatim. Per-message
application also makes span-eating structurally impossible.
Tests:
- 15 new tests in test_normalize.py pin the boundary: 6 guard user
content that must survive (each of the adversarial repros), 9 assert
real system chrome is still stripped. All pass; full suite 702 pass
(2 failures are the unrelated pre-existing version.py bug, cleared
by #820).
Known limitation (not fixed here): convo_miner.py does not delete
drawers on re-mine, so transcripts mined before this PR keep noise-
filled drawers until the user manually erases + re-mines. Proper fix
needs a schema-version field on drawer metadata + re-mine trigger —
out of scope for this PR.
---
mempalace/normalize.py | 101 +++++++++++++++++++--------
tests/test_normalize.py | 146 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 218 insertions(+), 29 deletions(-)
diff --git a/mempalace/normalize.py b/mempalace/normalize.py
index 256a5e9..f2b8173 100644
--- a/mempalace/normalize.py
+++ b/mempalace/normalize.py
@@ -22,20 +22,40 @@ from typing import Optional
# ─── Noise stripping ─────────────────────────────────────────────────────
-# Claude Code and other tools inject system tags, hook output, UI chrome,
-# and tool-call JSON into transcripts. These waste drawer space and pollute
-# search results. Strip them before filing.
+# Claude Code and other tools inject system tags, hook output, and UI chrome
+# into transcripts. These waste drawer space and pollute search results.
+#
+# Verbatim is sacred — every pattern here is anchored to line boundaries and
+# refuses to cross blank lines, so a stray unclosed tag in one message can
+# never eat content from neighboring messages. When in doubt, leave text
+# alone.
-_NOISE_TAG_PATTERNS = [
- re.compile(r"]*>.*?", re.DOTALL),
- re.compile(r"]*>.*?", re.DOTALL),
- re.compile(r"]*>.*?", re.DOTALL),
- re.compile(r"]*>.*?", re.DOTALL),
- re.compile(r"]*>.*?", re.DOTALL),
- re.compile(r"]*>.*?", re.DOTALL),
-]
+_NOISE_TAGS = (
+ "system-reminder",
+ "command-message",
+ "command-name",
+ "task-notification",
+ "user-prompt-submit-hook",
+ "hook_output",
+)
-_NOISE_STRINGS = [
+
+def _tag_pattern(name: str) -> "re.Pattern[str]":
+ # Opening tag must begin a line (optionally after a `> ` blockquote marker,
+ # since _messages_to_transcript prefixes lines with `> `). Body is lazy but
+ # forbidden from crossing a blank line, so a dangling open tag can't span
+ # multiple messages. Closing tag eats optional trailing whitespace + newline.
+ return re.compile(
+ rf"(?m)^(?:> )?<{name}(?:\s[^>]*)?>" rf"(?:(?!\n\s*\n)[\s\S])*?" rf"{name}>[ \t]*\n?"
+ )
+
+
+_NOISE_TAG_PATTERNS = [_tag_pattern(t) for t in _NOISE_TAGS]
+
+# Strings that identify an entire noise line when found at its start.
+# Matched case-sensitively and anchored to line-start so user prose mentioning
+# e.g. "current time:" in a sentence is untouched.
+_NOISE_LINE_PREFIXES = (
"CURRENT TIME:",
"VERIFIED FACTS (do not contradict)",
"AGENT SPECIALIZATION:",
@@ -46,20 +66,39 @@ _NOISE_STRINGS = [
"Auto-save reminder...",
"Checking pipeline...",
"MemPalace auto-save checkpoint.",
+)
+
+_NOISE_LINE_PATTERNS = [
+ re.compile(rf"(?m)^(?:> )?{re.escape(p)}.*\n?") for p in _NOISE_LINE_PREFIXES
]
+# Claude Code TUI hook-run chrome, e.g. "Ran 2 Stop hook", "Ran 1 PreCompact hook".
+# Line-anchored, case-sensitive, explicit hook names — prose like
+# "our CI has a stop hook" stays intact.
+_HOOK_LINE_RE = re.compile(
+ r"(?m)^(?:> )?Ran \d+ (?:Stop|PreCompact|PreToolUse|PostToolUse|UserPromptSubmit|Notification|SessionStart|SessionEnd) hook[s]?.*\n?"
+)
+
+# "… +N lines" collapsed-output marker, line-anchored.
+_COLLAPSED_LINES_RE = re.compile(r"(?m)^(?:> )?…\s*\+\d+ lines.*\n?")
+
def strip_noise(text: str) -> str:
- """Remove system tags, hook output, and Claude Code UI chrome from text."""
+ """Remove system tags, hook output, and Claude Code UI chrome from text.
+
+ Every pattern is line-anchored except the "[N tokens] (ctrl+o to expand)" chrome,
+ which is matched by shape. Other inline mentions in user prose are preserved verbatim.
+ """
for pat in _NOISE_TAG_PATTERNS:
text = pat.sub("", text)
- for noise in _NOISE_STRINGS:
- text = text.replace(noise, "")
- # Strip Claude Code UI chrome
- text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text)
- text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE)
- text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text)
- # Collapse runs of blank lines
+ for pat in _NOISE_LINE_PATTERNS:
+ text = pat.sub("", text)
+ text = _HOOK_LINE_RE.sub("", text)
+ text = _COLLAPSED_LINES_RE.sub("", text)
+ # Strip the Claude Code collapsed-output chrome "[N tokens] (ctrl+o to expand)".
+ # Narrow shape — a bare "(ctrl+o to expand)" in user prose stays intact.
+ text = re.sub(r"\s*\[\d+\s+tokens?\]\s*\(ctrl\+o to expand\)", "", text)
+ # Collapse runs of blank lines created by the removals
text = re.sub(r"\n{4,}", "\n\n\n", text)
return text.strip()
@@ -84,23 +123,21 @@ def normalize(filepath: str) -> str:
if not content.strip():
return content
- # Already has > markers — pass through (strip noise but preserve trailing newline)
+ # Already has > markers — pass through unchanged.
lines = content.split("\n")
if sum(1 for line in lines if line.strip().startswith(">")) >= 3:
- cleaned = strip_noise(content)
- # Preserve trailing newline if original had one
- if content.endswith("\n") and not cleaned.endswith("\n"):
- cleaned += "\n"
- return cleaned
+ return content
- # Try JSON normalization
+ # Try JSON normalization. strip_noise is applied inside the Claude Code
+ # JSONL parser (the only format that injects system tags/hook chrome);
+ # other formats pass through verbatim.
ext = Path(filepath).suffix.lower()
if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["):
normalized = _try_normalize_json(content)
if normalized:
- return strip_noise(normalized)
+ return normalized
- return strip_noise(content)
+ return content
def _try_normalize_json(content: str) -> Optional[str]:
@@ -160,6 +197,10 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
isinstance(b, dict) and b.get("type") == "tool_result" for b in msg_content
)
text = _extract_content(msg_content, tool_use_map=tool_use_map)
+ # Strip Claude Code system-injected noise per message, never across
+ # message boundaries — prevents span-eating.
+ if text:
+ text = strip_noise(text)
if text:
if is_tool_only and messages and messages[-1][0] == "assistant":
# Append tool results to the previous assistant message
@@ -169,6 +210,8 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
messages.append(("user", text))
elif msg_type == "assistant":
text = _extract_content(msg_content, tool_use_map=tool_use_map)
+ if text:
+ text = strip_noise(text)
if text:
# If previous message is also assistant (multi-turn tool loop),
# merge into the same assistant turn
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
index 7f0652a..53fc933 100644
--- a/tests/test_normalize.py
+++ b/tests/test_normalize.py
@@ -13,6 +13,7 @@ from mempalace.normalize import (
_try_normalize_json,
_try_slack_json,
normalize,
+ strip_noise,
)
@@ -1048,3 +1049,148 @@ def test_normalize_rejects_large_file():
assert False, "Should have raised IOError"
except IOError as e:
assert "too large" in str(e).lower()
+
+
+# ── strip_noise() — verbatim-safety boundary tests ─────────────────────
+#
+# The "Verbatim always" design principle requires that we never delete
+# user-authored text. These tests pin down the boundary between system
+# noise (which we strip) and user prose that happens to mention the same
+# strings (which must survive untouched).
+
+
+class TestStripNoisePreservesUserContent:
+ """User prose that mentions noise strings inline must be preserved."""
+
+ def test_user_discusses_stop_hook_in_prose(self):
+ # Regression: original regex with IGNORECASE + `.*\n?` ate the second
+ # sentence from real user commentary.
+ text = (
+ "> User:\n"
+ "> Our CI has a stop hook that rejects merges after 5pm. "
+ "Ran 2 stop hooks last week.\n"
+ "> Assistant:\n"
+ "> Got it."
+ )
+ assert strip_noise(text) == text.strip()
+
+ def test_user_mentions_system_reminder_inline(self):
+ # Inline <system-reminder> tags inside user prose (e.g. documenting
+ # Claude Code behavior) must not be stripped.
+ text = (
+ "> User:\n"
+ "> Here is what Claude Code emits: "
+ "Auto-save reminder..."
+ " — I want to ignore it."
+ )
+ assert strip_noise(text) == text.strip()
+
+ def test_ctrl_o_hint_in_prose_preserved(self):
+ # Regression: original `.*\(ctrl\+o to expand\).*\n?` nuked the whole
+ # line whenever a user documented the TUI shortcut.
+ text = (
+ "> User:\n"
+ "> In the TUI you hit (ctrl+o to expand) to see more. "
+ "That is the shortcut I want to document."
+ )
+ assert strip_noise(text) == text.strip()
+
+ def test_current_time_inline_in_prose(self):
+ text = "> User:\n> At CURRENT TIME: the meeting starts, not before."
+ assert strip_noise(text) == text.strip()
+
+ def test_plus_n_lines_marker_inline(self):
+ text = "> User:\n> The log showed … +50 lines of stack trace, useful."
+ assert strip_noise(text) == text.strip()
+
+ def test_dangling_open_tag_does_not_span_messages(self):
+ # THE span-eating bug: a stray unclosed <system-reminder> in one
+ # message must NOT merge with a closing tag in another message and
+ # silently delete everything in between.
+ text = (
+ "> User 1: normal content A\n"
+ "> Assistant: reply\n"
+ "> User 2: more content tail"
+ )
+ out = strip_noise(text)
+ assert "Assistant: reply" in out
+ assert "User 2: more content" in out
+ assert "User 1: normal content" in out
+
+
+class TestStripNoiseRemovesSystemChrome:
+ """System-injected noise with standalone/line-anchored shape must be stripped."""
+
+ def test_strips_line_anchored_system_reminder_block(self):
+ text = (
+ "> User:\n"
+ "\n"
+ "Auto-save reminder...\n"
+ "\n"
+ "> Real message."
+ )
+ out = strip_noise(text)
+ assert "system-reminder" not in out
+ assert "Auto-save reminder" not in out
+ assert "Real message." in out
+
+ def test_strips_system_reminder_with_blockquote_prefix(self):
+ # _messages_to_transcript prefixes lines with "> ", so the line
+ # anchor must also accept that shape.
+ text = "> User:\n" "> Injected noise\n" "> Real message."
+ out = strip_noise(text)
+ assert "Injected noise" not in out
+ assert "Real message." in out
+
+ def test_strips_standalone_ran_hook_line(self):
+ text = "Ran 2 Stop hook\n> User: real content"
+ out = strip_noise(text)
+ assert "Ran 2 Stop hook" not in out
+ assert "real content" in out
+
+ def test_strips_known_hook_names(self):
+ for hook in ("Stop", "PreCompact", "PreToolUse", "PostToolUse", "UserPromptSubmit"):
+ text = f"Ran 1 {hook} hook\n> User: content"
+ assert hook not in strip_noise(text)
+
+ def test_strips_current_time_standalone(self):
+ text = "CURRENT TIME: 2026-04-13 10:00 UTC\n> User: Hello"
+ out = strip_noise(text)
+ assert "CURRENT TIME" not in out
+ assert "Hello" in out
+
+ def test_strips_collapsed_lines_marker(self):
+ text = "… +42 lines\n> User: Hello"
+ out = strip_noise(text)
+ assert "+42 lines" not in out
+ assert "Hello" in out
+
+ def test_strips_token_count_ctrl_o_chrome(self):
+ # Claude Code's actual collapsed-output chrome: "[N tokens] (ctrl+o to expand)"
+ text = "> Assistant: some output [5 tokens] (ctrl+o to expand)\n> User: ok"
+ out = strip_noise(text)
+ assert "(ctrl+o to expand)" not in out
+ assert "[5 tokens]" not in out
+ assert "some output" in out
+
+ def test_strips_each_known_noise_tag(self):
+ for tag in (
+ "system-reminder",
+ "command-message",
+ "command-name",
+ "task-notification",
+ "user-prompt-submit-hook",
+ "hook_output",
+ ):
+ text = f"> User:\n<{tag}>junk{tag}>\n> Real."
+ out = strip_noise(text)
+ assert tag not in out, f"{tag} leaked into output"
+ assert "Real." in out
+
+ def test_collapses_excessive_blank_lines(self):
+ text = "line one\n\n\n\n\n\nline two"
+ out = strip_noise(text)
+ assert "line one" in out
+ assert "line two" in out
+ # Should collapse to no more than 3 newlines
+ assert "\n\n\n\n" not in out