From 9b99c136ee13c1dc97f3c20bb2f59f4464d7284d Mon Sep 17 00:00:00 2001
From: MSL <232237854+milla-jovovich@users.noreply.github.com>
Date: Mon, 13 Apr 2026 01:55:25 -0700
Subject: [PATCH 1/3] fix: strip system tags, hook output, and Claude UI chrome
 from drawers

normalize.py now strips before filing:
- <system-reminder>, <command-message>, <command-name> tags
- <task-notification>, <user-prompt-submit-hook>, <hook_output> tags
- Hook status messages (CURRENT TIME, Checking verified facts, etc.)
- Claude Code UI chrome (ctrl+o to expand, progress bars, etc.)
- Collapsed runs of blank lines

This noise was going straight into drawers, wasting storage space
and polluting search results. strip_noise() runs on all normalized
output regardless of input format (JSONL, JSON, plain text).

689/689 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 mempalace/normalize.py | 56 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 4 deletions(-)
diff --git a/mempalace/normalize.py b/mempalace/normalize.py
index e599df9..256a5e9 100644
--- a/mempalace/normalize.py
+++ b/mempalace/normalize.py
@@ -16,10 +16,54 @@ No API key. No internet. Everything local.
 
 import json
 import os
+import re
 from pathlib import Path
 from typing import Optional
 
 
+# ─── Noise stripping ─────────────────────────────────────────────────────
+# Claude Code and other tools inject system tags, hook output, UI chrome,
+# and tool-call JSON into transcripts. These waste drawer space and pollute
+# search results. Strip them before filing.
+
+_NOISE_TAG_PATTERNS = [
+    re.compile(r"<system-reminder[^>]*>.*?</system-reminder>", re.DOTALL),
+    re.compile(r"<command-message[^>]*>.*?</command-message>", re.DOTALL),
+    re.compile(r"<command-name[^>]*>.*?</command-name>", re.DOTALL),
+    re.compile(r"<task-notification[^>]*>.*?</task-notification>", re.DOTALL),
+    re.compile(r"<user-prompt-submit-hook[^>]*>.*?</user-prompt-submit-hook>", re.DOTALL),
+    re.compile(r"<hook_output[^>]*>.*?</hook_output>", re.DOTALL),
+]
+
+_NOISE_STRINGS = [
+    "CURRENT TIME:",
+    "VERIFIED FACTS (do not contradict)",
+    "AGENT SPECIALIZATION:",
+    "Checking verified facts...",
+    "Injecting timestamp...",
+    "Starting background pipeline...",
+    "Checking emotional weights...",
+    "Auto-save reminder...",
+    "Checking pipeline...",
+    "MemPalace auto-save checkpoint.",
+]
+
+
+def strip_noise(text: str) -> str:
+    """Remove system tags, hook output, and Claude Code UI chrome from text."""
+    for pat in _NOISE_TAG_PATTERNS:
+        text = pat.sub("", text)
+    for noise in _NOISE_STRINGS:
+        text = text.replace(noise, "")
+    # Strip Claude Code UI chrome
+    text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text)
+    text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE)
+    text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text)
+    # Collapse runs of blank lines
+    text = re.sub(r"\n{4,}", "\n\n\n", text)
+    return text.strip()
+
+
 def normalize(filepath: str) -> str:
     """
     Load a file and normalize to transcript format if it's a chat export.
@@ -40,19 +84,23 @@ def normalize(filepath: str) -> str:
     if not content.strip():
         return content
 
-    # Already has > markers — pass through
+    # Already has > markers — pass through (strip noise but preserve trailing newline)
     lines = content.split("\n")
     if sum(1 for line in lines if line.strip().startswith(">")) >= 3:
-        return content
+        cleaned = strip_noise(content)
+        # Preserve trailing newline if original had one
+        if content.endswith("\n") and not cleaned.endswith("\n"):
+            cleaned += "\n"
+        return cleaned
 
     # Try JSON normalization
     ext = Path(filepath).suffix.lower()
     if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["):
         normalized = _try_normalize_json(content)
         if normalized:
-            return normalized
+            return strip_noise(normalized)
 
-    return content
+    return strip_noise(content)
 
 
 def _try_normalize_json(content: str) -> Optional[str]:

From ca2598a9f69247429c367217eaf167c9d9c824da Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Mon, 13 Apr 2026 16:11:03 -0300
Subject: [PATCH 2/3] fix(normalize): make strip_noise verbatim-safe and scope
 it to Claude Code JSONL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The initial strip_noise() regressed on three fronts when audited against
adversarial user content — each verified with executable repros against
the cherry-picked code:

  1. `<tag>.*?</tag>` with re.DOTALL span-ate across messages: one
     stray unclosed <system-reminder> anywhere in a session merged with
     the next closing tag, silently deleting everything between them
     (including full assistant replies).
  2. `.*\(ctrl\+o to expand\).*\n?` nuked entire lines of user prose
     whenever a user happened to document the TUI shortcut.
  3. `Ran \d+ (?:stop|pre|post)\s*hook.*` with IGNORECASE ate the
     second sentence from "our CI has a stop hook ... Ran 2 stop hooks
     last week" — legitimate user commentary.

These are unambiguous violations of the project's "Verbatim always"
design principle.

Fixes:

- All tag patterns are now line-anchored (`(?m)^(?:> )?<tag>`) and their
  body forbids crossing a blank line (`(?:(?!\n\s*\n)[\s\S])*?`), so a
  dangling open tag cannot eat neighboring messages.
- `_NOISE_LINE_PREFIXES` are line-anchored and case-sensitive — user
  prose mentioning "CURRENT TIME:" mid-sentence is preserved.
- Hook-run chrome requires `(?m)^`, explicit hook names (Stop,
  PreCompact, PreToolUse, etc.), and no IGNORECASE.
- "… +N lines" is line-anchored.
- "(ctrl+o to expand)" only matches Claude Code's actual collapsed-
  output chrome shape `[N tokens] (ctrl+o to expand)`; a bare
  parenthetical in user prose stays intact.

Scope:

- `strip_noise()` is no longer called on every normalization path.
  Only `_try_claude_code_jsonl` invokes it, per-extracted-message — so
  Claude.ai exports, ChatGPT exports, Slack JSON, Codex JSONL, and
  plain text with `>` markers pass through fully verbatim. Per-message
  application also makes span-eating structurally impossible.

Tests:

- 15 new tests in test_normalize.py pin the boundary: 6 guard user
  content that must survive (each of the adversarial repros), 9 assert
  real system chrome is still stripped. All pass; full suite 702 pass
  (2 failures are the unrelated pre-existing version.py bug, cleared
  by #820).

Known limitation (not fixed here): convo_miner.py does not delete
drawers on re-mine, so transcripts mined before this PR keep noise-
filled drawers until the user manually erases + re-mines. Proper fix
needs a schema-version field on drawer metadata + re-mine trigger —
out of scope for this PR.
---
 mempalace/normalize.py  | 101 +++++++++++++++++++--------
 tests/test_normalize.py | 146 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 218 insertions(+), 29 deletions(-)

diff --git a/mempalace/normalize.py b/mempalace/normalize.py
index 256a5e9..f2b8173 100644
--- a/mempalace/normalize.py
+++ b/mempalace/normalize.py
@@ -22,20 +22,40 @@ from typing import Optional
 
 
 # ─── Noise stripping ─────────────────────────────────────────────────────
-# Claude Code and other tools inject system tags, hook output, UI chrome,
-# and tool-call JSON into transcripts. These waste drawer space and pollute
-# search results. Strip them before filing.
+# Claude Code and other tools inject system tags, hook output, and UI chrome
+# into transcripts. These waste drawer space and pollute search results.
+#
+# Verbatim is sacred — every pattern here is anchored to line boundaries and
+# refuses to cross blank lines, so a stray unclosed tag in one message can
+# never eat content from neighboring messages. When in doubt, leave text
+# alone.
 
-_NOISE_TAG_PATTERNS = [
-    re.compile(r"<system-reminder[^>]*>.*?</system-reminder>", re.DOTALL),
-    re.compile(r"<command-message[^>]*>.*?</command-message>", re.DOTALL),
-    re.compile(r"<command-name[^>]*>.*?</command-name>", re.DOTALL),
-    re.compile(r"<task-notification[^>]*>.*?</task-notification>", re.DOTALL),
-    re.compile(r"<user-prompt-submit-hook[^>]*>.*?</user-prompt-submit-hook>", re.DOTALL),
-    re.compile(r"<hook_output[^>]*>.*?</hook_output>", re.DOTALL),
-]
+_NOISE_TAGS = (
+    "system-reminder",
+    "command-message",
+    "command-name",
+    "task-notification",
+    "user-prompt-submit-hook",
+    "hook_output",
+)
 
-_NOISE_STRINGS = [
+
+def _tag_pattern(name: str) -> "re.Pattern[str]":
+    # Opening tag must begin a line (optionally after a `> ` blockquote marker,
+    # since _messages_to_transcript prefixes lines with `> `). Body is lazy but
+    # forbidden from crossing a blank line, so a dangling open tag can't span
+    # multiple messages. Closing tag eats optional trailing whitespace + newline.
+    return re.compile(
+        rf"(?m)^(?:> )?<{name}(?:\s[^>]*)?>" rf"(?:(?!\n\s*\n)[\s\S])*?" rf"</{name}>[ \t]*\n?"
+    )
+
+
+_NOISE_TAG_PATTERNS = [_tag_pattern(t) for t in _NOISE_TAGS]
+
+# Strings that identify an entire noise line when found at its start.
+# Matched case-sensitively and anchored to line-start so user prose mentioning
+# e.g. "current time:" in a sentence is untouched.
+_NOISE_LINE_PREFIXES = (
     "CURRENT TIME:",
     "VERIFIED FACTS (do not contradict)",
     "AGENT SPECIALIZATION:",
@@ -46,20 +66,39 @@ _NOISE_STRINGS = [
     "Auto-save reminder...",
     "Checking pipeline...",
     "MemPalace auto-save checkpoint.",
+)
+
+_NOISE_LINE_PATTERNS = [
+    re.compile(rf"(?m)^(?:> )?{re.escape(p)}.*\n?") for p in _NOISE_LINE_PREFIXES
 ]
 
+# Claude Code TUI hook-run chrome, e.g. "Ran 2 Stop hook", "Ran 1 PreCompact hook".
+# Line-anchored, case-sensitive, explicit hook names — prose like
+# "our CI has a stop hook" stays intact.
+_HOOK_LINE_RE = re.compile(
+    r"(?m)^(?:> )?Ran \d+ (?:Stop|PreCompact|PreToolUse|PostToolUse|UserPromptSubmit|Notification|SessionStart|SessionEnd) hook[s]?.*\n?"
+)
+
+# "… +N lines" collapsed-output marker, line-anchored.
+_COLLAPSED_LINES_RE = re.compile(r"(?m)^(?:> )?…\s*\+\d+ lines.*\n?")
+
 
 def strip_noise(text: str) -> str:
-    """Remove system tags, hook output, and Claude Code UI chrome from text."""
+    """Remove system tags, hook output, and Claude Code UI chrome from text.
+
+    All patterns are line-anchored. User prose that happens to mention these
+    strings inline (e.g., documenting them) is preserved verbatim.
+    """
     for pat in _NOISE_TAG_PATTERNS:
         text = pat.sub("", text)
-    for noise in _NOISE_STRINGS:
-        text = text.replace(noise, "")
-    # Strip Claude Code UI chrome
-    text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text)
-    text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE)
-    text = re.sub(r"…\s*\+\d+ lines.*\n?", "", text)
-    # Collapse runs of blank lines
+    for pat in _NOISE_LINE_PATTERNS:
+        text = pat.sub("", text)
+    text = _HOOK_LINE_RE.sub("", text)
+    text = _COLLAPSED_LINES_RE.sub("", text)
+    # Strip the Claude Code collapsed-output chrome "[N tokens] (ctrl+o to expand)".
+    # Narrow shape — a bare "(ctrl+o to expand)" in user prose stays intact.
+    text = re.sub(r"\s*\[\d+\s+tokens?\]\s*\(ctrl\+o to expand\)", "", text)
+    # Collapse runs of blank lines created by the removals
     text = re.sub(r"\n{4,}", "\n\n\n", text)
     return text.strip()
 
@@ -84,23 +123,21 @@ def normalize(filepath: str) -> str:
     if not content.strip():
         return content
 
-    # Already has > markers — pass through (strip noise but preserve trailing newline)
+    # Already has > markers — pass through unchanged.
     lines = content.split("\n")
     if sum(1 for line in lines if line.strip().startswith(">")) >= 3:
-        cleaned = strip_noise(content)
-        # Preserve trailing newline if original had one
-        if content.endswith("\n") and not cleaned.endswith("\n"):
-            cleaned += "\n"
-        return cleaned
+        return content
 
-    # Try JSON normalization
+    # Try JSON normalization. strip_noise is applied inside the Claude Code
+    # JSONL parser (the only format that injects system tags/hook chrome);
+    # other formats pass through verbatim.
     ext = Path(filepath).suffix.lower()
     if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["):
         normalized = _try_normalize_json(content)
         if normalized:
-            return strip_noise(normalized)
+            return normalized
 
-    return strip_noise(content)
+    return content
 
 
 def _try_normalize_json(content: str) -> Optional[str]:
@@ -160,6 +197,10 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
                 isinstance(b, dict) and b.get("type") == "tool_result" for b in msg_content
             )
             text = _extract_content(msg_content, tool_use_map=tool_use_map)
+            # Strip Claude Code system-injected noise per message, never across
+            # message boundaries — prevents span-eating.
+            if text:
+                text = strip_noise(text)
             if text:
                 if is_tool_only and messages and messages[-1][0] == "assistant":
                     # Append tool results to the previous assistant message
@@ -169,6 +210,8 @@ def _try_claude_code_jsonl(content: str) -> Optional[str]:
                     messages.append(("user", text))
         elif msg_type == "assistant":
             text = _extract_content(msg_content, tool_use_map=tool_use_map)
+            if text:
+                text = strip_noise(text)
             if text:
                 # If previous message is also assistant (multi-turn tool loop),
                 # merge into the same assistant turn
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
index 7f0652a..53fc933 100644
--- a/tests/test_normalize.py
+++ b/tests/test_normalize.py
@@ -13,6 +13,7 @@ from mempalace.normalize import (
     _try_normalize_json,
     _try_slack_json,
     normalize,
+    strip_noise,
 )
 
 
@@ -1048,3 +1049,148 @@ def test_normalize_rejects_large_file():
             assert False, "Should have raised IOError"
         except IOError as e:
             assert "too large" in str(e).lower()
+
+
+# ── strip_noise() — verbatim-safety boundary tests ─────────────────────
+#
+# The "Verbatim always" design principle requires that we never delete
+# user-authored text. These tests pin down the boundary between system
+# noise (which we strip) and user prose that happens to mention the same
+# strings (which must survive untouched).
+
+
+class TestStripNoisePreservesUserContent:
+    """User prose that mentions noise strings inline must be preserved."""
+
+    def test_user_discusses_stop_hook_in_prose(self):
+        # Regression: original regex with IGNORECASE + `.*\n?` ate the second
+        # sentence from real user commentary.
+        text = (
+            "> User:\n"
+            "> Our CI has a stop hook that rejects merges after 5pm. "
+            "Ran 2 stop hooks last week.\n"
+            "> Assistant:\n"
+            "> Got it."
+        )
+        assert strip_noise(text) == text.strip()
+
+    def test_user_mentions_system_reminder_inline(self):
+        # Inline <system-reminder> tags inside user prose (e.g. documenting
+        # Claude Code behavior) must not be stripped.
+        text = (
+            "> User:\n"
+            "> Here is what Claude Code emits: "
+            "<system-reminder>Auto-save reminder...</system-reminder>"
+            " — I want to ignore it."
+        )
+        assert strip_noise(text) == text.strip()
+
+    def test_ctrl_o_hint_in_prose_preserved(self):
+        # Regression: original `.*\(ctrl\+o to expand\).*\n?` nuked the whole
+        # line whenever a user documented the TUI shortcut.
+        text = (
+            "> User:\n"
+            "> In the TUI you hit (ctrl+o to expand) to see more. "
+            "That is the shortcut I want to document."
+        )
+        assert strip_noise(text) == text.strip()
+
+    def test_current_time_inline_in_prose(self):
+        text = "> User:\n> At CURRENT TIME: the meeting starts, not before."
+        assert strip_noise(text) == text.strip()
+
+    def test_plus_n_lines_marker_inline(self):
+        text = "> User:\n> The log showed … +50 lines of stack trace, useful."
+        assert strip_noise(text) == text.strip()
+
+    def test_dangling_open_tag_does_not_span_messages(self):
+        # THE span-eating bug: a stray unclosed <system-reminder> in one
+        # message must NOT merge with a closing tag in another message and
+        # silently delete everything in between.
+        text = (
+            "> User 1: normal content <system-reminder>A\n"
+            "> Assistant: reply\n"
+            "> User 2: more content</system-reminder> tail"
+        )
+        out = strip_noise(text)
+        assert "Assistant: reply" in out
+        assert "User 2: more content" in out
+        assert "User 1: normal content" in out
+
+
+class TestStripNoiseRemovesSystemChrome:
+    """System-injected noise with standalone/line-anchored shape must be stripped."""
+
+    def test_strips_line_anchored_system_reminder_block(self):
+        text = (
+            "> User:\n"
+            "<system-reminder>\n"
+            "Auto-save reminder...\n"
+            "</system-reminder>\n"
+            "> Real message."
+        )
+        out = strip_noise(text)
+        assert "system-reminder" not in out
+        assert "Auto-save reminder" not in out
+        assert "Real message." in out
+
+    def test_strips_system_reminder_with_blockquote_prefix(self):
+        # _messages_to_transcript prefixes lines with "> ", so the line
+        # anchor must also accept that shape.
+        text = "> User:\n" "> <system-reminder>Injected noise</system-reminder>\n" "> Real message."
+        out = strip_noise(text)
+        assert "Injected noise" not in out
+        assert "Real message." in out
+
+    def test_strips_standalone_ran_hook_line(self):
+        text = "Ran 2 Stop hook\n> User: real content"
+        out = strip_noise(text)
+        assert "Ran 2 Stop hook" not in out
+        assert "real content" in out
+
+    def test_strips_known_hook_names(self):
+        for hook in ("Stop", "PreCompact", "PreToolUse", "PostToolUse", "UserPromptSubmit"):
+            text = f"Ran 1 {hook} hook\n> User: content"
+            assert hook not in strip_noise(text)
+
+    def test_strips_current_time_standalone(self):
+        text = "CURRENT TIME: 2026-04-13 10:00 UTC\n> User: Hello"
+        out = strip_noise(text)
+        assert "CURRENT TIME" not in out
+        assert "Hello" in out
+
+    def test_strips_collapsed_lines_marker(self):
+        text = "… +42 lines\n> User: Hello"
+        out = strip_noise(text)
+        assert "+42 lines" not in out
+        assert "Hello" in out
+
+    def test_strips_token_count_ctrl_o_chrome(self):
+        # Claude Code's actual collapsed-output chrome: "[N tokens] (ctrl+o to expand)"
+        text = "> Assistant: some output [5 tokens] (ctrl+o to expand)\n> User: ok"
+        out = strip_noise(text)
+        assert "(ctrl+o to expand)" not in out
+        assert "[5 tokens]" not in out
+        assert "some output" in out
+
+    def test_strips_each_known_noise_tag(self):
+        for tag in (
+            "system-reminder",
+            "command-message",
+            "command-name",
+            "task-notification",
+            "user-prompt-submit-hook",
+            "hook_output",
+        ):
+            text = f"> User:\n<{tag}>junk</{tag}>\n> Real."
+            out = strip_noise(text)
+            assert tag not in out, f"{tag} leaked into output"
+            assert "Real." in out
+
+    def test_collapses_excessive_blank_lines(self):
+        text = "line one\n\n\n\n\n\nline two"
+        out = strip_noise(text)
+        assert "line one" in out
+        assert "line two" in out
+        # Should collapse to no more than 3 newlines
+        assert "\n\n\n\n" not in out

From 7e5eeda9a5c22168719067d15af8b2424662f586 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Mon, 13 Apr 2026 16:20:55 -0300
Subject: [PATCH 3/3] feat(normalize): auto-rebuild stale drawers via
 NORMALIZE_VERSION schema gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this, the strip_noise improvement only helps new mines. Every
user who had already mined Claude Code JSONL sessions would keep their
noise-polluted drawers forever, because convo_miner's file_already_mined
skip short-circuits before re-processing.

Adds a versioned schema gate so upgrades propagate silently:

- palace.NORMALIZE_VERSION=2 — bumped when the normalization pipeline
  changes shape (this PR's strip_noise is the v1→v2 bump).
- file_already_mined now returns False if the stored normalize_version
  is missing or less than current, triggering a rebuild on next mine.
- Both miners stamp drawers with the current normalize_version.
- convo_miner now purges stale drawers before inserting fresh chunks
  (mirrors miner.py's existing delete+insert), extracted into
  _file_convo_chunks helper to keep mine_convos under ruff's C901 limit.

User experience: upgrade mempalace, run `mempalace mine` as usual, old
noisy drawers get silently replaced with clean ones. No erase needed,
no "you need to rebuild" changelog footgun.

Tests:
- test_file_already_mined_returns_false_for_stale_normalize_version —
  pins the version gate contract for missing/v1/current.
- test_add_drawer_stamps_normalize_version — fresh project-miner drawers
  carry the field.
- test_mine_convos_rebuilds_stale_drawers_after_schema_bump — end-to-end
  proof that a pre-v2 palace gets silently cleaned on next mine, with
  orphan drawers purged and NOT skipped.

Existing test_file_already_mined_check_mtime updated to include the
new field; all other tests unaffected.
---
 mempalace/convo_miner.py  | 83 ++++++++++++++++++++++------------
 mempalace/miner.py        |  3 +-
 mempalace/palace.py       | 28 ++++++++++--
 tests/test_convo_miner.py | 83 ++++++++++++++++++++++++++++++++++
 tests/test_miner.py       | 94 +++++++++++++++++++++++++++++++++++++--
 5 files changed, 253 insertions(+), 38 deletions(-)

diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py
index d406073..663f1a0 100644
--- a/mempalace/convo_miner.py
+++ b/mempalace/convo_miner.py
@@ -16,7 +16,7 @@ from datetime import datetime
 from collections import defaultdict
 
 from .normalize import normalize
-from .palace import SKIP_DIRS, get_collection, file_already_mined
+from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection
 
 
 # File types that might contain conversations
@@ -51,6 +51,7 @@ def _register_file(collection, source_file: str, wing: str, agent: str):
                 "added_by": agent,
                 "filed_at": datetime.now().isoformat(),
                 "ingest_mode": "registry",
+                "normalize_version": NORMALIZE_VERSION,
             }
         ],
     )
@@ -272,6 +273,52 @@ def scan_convos(convo_dir: str) -> list:
 # =============================================================================
 
 
+def _file_convo_chunks(collection, source_file, chunks, wing, room, agent, extract_mode):
+    """Purge stale drawers for ``source_file`` then upsert fresh chunks.
+
+    Returns (drawers_added, room_counts_delta).
+    """
+    # Purge stale drawers first. When the normalize schema bumps,
+    # file_already_mined() returns False for pre-v2 drawers and we land
+    # here — clean them out so the source doesn't end up with a mix of
+    # old-noise and new-clean drawers.
+    try:
+        collection.delete(where={"source_file": source_file})
+    except Exception:
+        pass
+
+    room_counts_delta: dict = defaultdict(int)
+    drawers_added = 0
+    for chunk in chunks:
+        chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
+        if extract_mode == "general":
+            room_counts_delta[chunk_room] += 1
+        drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
+        try:
+            collection.upsert(
+                documents=[chunk["content"]],
+                ids=[drawer_id],
+                metadatas=[
+                    {
+                        "wing": wing,
+                        "room": chunk_room,
+                        "source_file": source_file,
+                        "chunk_index": chunk["chunk_index"],
+                        "added_by": agent,
+                        "filed_at": datetime.now().isoformat(),
+                        "ingest_mode": "convos",
+                        "extract_mode": extract_mode,
+                        "normalize_version": NORMALIZE_VERSION,
+                    }
+                ],
+            )
+            drawers_added += 1
+        except Exception as e:
+            if "already exists" not in str(e).lower():
+                raise
+    return drawers_added, room_counts_delta
+
+
 def mine_convos(
     convo_dir: str,
     palace_path: str,
@@ -375,34 +422,12 @@ def mine_convos(
         if extract_mode != "general":
             room_counts[room] += 1
 
-        # File each chunk
-        drawers_added = 0
-        for chunk in chunks:
-            chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
-            if extract_mode == "general":
-                room_counts[chunk_room] += 1
-            drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
-            try:
-                collection.upsert(
-                    documents=[chunk["content"]],
-                    ids=[drawer_id],
-                    metadatas=[
-                        {
-                            "wing": wing,
-                            "room": chunk_room,
-                            "source_file": source_file,
-                            "chunk_index": chunk["chunk_index"],
-                            "added_by": agent,
-                            "filed_at": datetime.now().isoformat(),
-                            "ingest_mode": "convos",
-                            "extract_mode": extract_mode,
-                        }
-                    ],
-                )
-                drawers_added += 1
-            except Exception as e:
-                if "already exists" not in str(e).lower():
-                    raise
+        # Purge stale drawers + file fresh chunks.
+        drawers_added, room_delta = _file_convo_chunks(
+            collection, source_file, chunks, wing, room, agent, extract_mode
+        )
+        for r, n in room_delta.items():
+            room_counts[r] += n
 
         total_drawers += drawers_added
         print(f"  ✓ [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers_added}")
diff --git a/mempalace/miner.py b/mempalace/miner.py
index 22c8af3..49e0d25 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -15,7 +15,7 @@ from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
 
-from .palace import SKIP_DIRS, get_collection, file_already_mined
+from .palace import NORMALIZE_VERSION, SKIP_DIRS, file_already_mined, get_collection
 
 READABLE_EXTENSIONS = {
     ".txt",
@@ -381,6 +381,7 @@ def add_drawer(
             "chunk_index": chunk_index,
             "added_by": agent,
             "filed_at": datetime.now().isoformat(),
+            "normalize_version": NORMALIZE_VERSION,
         }
         # Store file mtime so we can detect modifications later.
         try:
diff --git a/mempalace/palace.py b/mempalace/palace.py
index 948fecc..9cfb55e 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -36,6 +36,16 @@ SKIP_DIRS = {
 
 _DEFAULT_BACKEND = ChromaBackend()
 
+# Schema version for drawer normalization. Bump when the normalization
+# pipeline changes in a way that existing drawers should be rebuilt to pick up
+# (e.g., new noise-stripping rules). `file_already_mined` treats drawers with
+# a missing or stale `normalize_version` as "not mined", so the next mine pass
+# silently rebuilds them — users don't need to manually erase + re-mine.
+#
+# v2 (2026-04): introduced strip_noise() for Claude Code JSONL; previous
+#               drawers stored system tags / hook chrome verbatim.
+NORMALIZE_VERSION = 2
+
 
 def get_collection(
     palace_path: str,
@@ -53,16 +63,26 @@ def get_collection(
 def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
     """Check if a file has already been filed in the palace.
 
-    When check_mtime=True (used by project miner), returns False if the file
-    has been modified since it was last mined, so it gets re-mined.
-    When check_mtime=False (used by convo miner), just checks existence.
+    Returns False (so the file gets re-mined) when:
+      - no drawers exist for this source_file
+      - the stored `normalize_version` is missing or older than the current
+        schema (triggers silent rebuild after a normalization upgrade)
+      - `check_mtime=True` and the file's mtime differs from the stored one
+
+    When check_mtime=True (used by project miner), also re-mines on content
+    change. When check_mtime=False (used by convo miner), transcripts are
+    assumed immutable, so only the version gate triggers a rebuild.
     """
     try:
         results = collection.get(where={"source_file": source_file}, limit=1)
         if not results.get("ids"):
             return False
+        stored_meta = results.get("metadatas", [{}])[0] or {}
+        # Pre-v2 drawers have no version field — treat them as stale.
+        stored_version = stored_meta.get("normalize_version", 1)
+        if stored_version < NORMALIZE_VERSION:
+            return False
         if check_mtime:
-            stored_meta = results.get("metadatas", [{}])[0]
             stored_mtime = stored_meta.get("source_mtime")
             if stored_mtime is None:
                 return False
diff --git a/tests/test_convo_miner.py b/tests/test_convo_miner.py
index f5074b4..166644b 100644
--- a/tests/test_convo_miner.py
+++ b/tests/test_convo_miner.py
@@ -75,3 +75,86 @@ def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys):
         assert "Files skipped (already filed): 1" in out2
     finally:
         shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_mine_convos_rebuilds_stale_drawers_after_schema_bump(capsys):
+    """When stored drawers have an older normalize_version, the next mine
+    silently purges them and refiles — no manual erase required.
+
+    This is what makes the strip_noise upgrade apply to existing corpora:
+    users just run `mempalace mine` again and old noise-filled drawers get
+    replaced with clean ones."""
+    from mempalace.palace import NORMALIZE_VERSION
+
+    tmpdir = tempfile.mkdtemp()
+    try:
+        convo_path = Path(tmpdir) / "chat.txt"
+        convo_path.write_text(
+            "> What is memory?\nMemory is persistence.\n\n"
+            "> Why does it matter?\nIt enables continuity.\n\n"
+            "> How do we build it?\nWith structured storage.\n"
+        )
+        palace_path = os.path.join(tmpdir, "palace")
+
+        # First mine — stamps drawers with NORMALIZE_VERSION
+        mine_convos(tmpdir, palace_path, wing="test")
+        capsys.readouterr()
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_collection("mempalace_drawers")
+        resolved = str(Path(tmpdir).resolve() / "chat.txt")
+        first_pass = col.get(where={"source_file": resolved})
+        first_ids = set(first_pass["ids"])
+        assert first_ids, "first mine should produce drawers"
+        for meta in first_pass["metadatas"]:
+            assert meta.get("normalize_version") == NORMALIZE_VERSION
+
+        # Simulate pre-v2 drawers: rewrite metadata to an older version,
+        # and replace content with "noise" so we can see it get cleaned up.
+        stale_metas = []
+        for meta in first_pass["metadatas"]:
+            stale = dict(meta)
+            stale["normalize_version"] = 1
+            stale_metas.append(stale)
+        col.update(
+            ids=list(first_pass["ids"]),
+            documents=["STALE NOISE"] * len(first_pass["ids"]),
+            metadatas=stale_metas,
+        )
+        # Add an extra orphan drawer that should also be purged.
+        col.add(
+            ids=["orphan_drawer"],
+            documents=["OLD ORPHAN"],
+            metadatas=[
+                {
+                    "wing": "test",
+                    "room": "default",
+                    "source_file": resolved,
+                    "chunk_index": 999,
+                    "normalize_version": 1,
+                }
+            ],
+        )
+        del col, client
+
+        # Second mine — version gate should trigger rebuild
+        mine_convos(tmpdir, palace_path, wing="test")
+        out = capsys.readouterr().out
+        assert (
+            "Files skipped (already filed): 0" in out
+        ), "stale drawers should force a rebuild, not a skip"
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_collection("mempalace_drawers")
+        rebuilt = col.get(where={"source_file": resolved})
+        # Orphan is gone
+        assert "orphan_drawer" not in rebuilt["ids"]
+        # No stale content survived
+        assert all("STALE NOISE" not in d for d in rebuilt["documents"])
+        assert all("OLD ORPHAN" not in d for d in rebuilt["documents"])
+        # All rebuilt drawers carry the current version
+        for meta in rebuilt["metadatas"]:
+            assert meta.get("normalize_version") == NORMALIZE_VERSION
+        del col, client
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
diff --git a/tests/test_miner.py b/tests/test_miner.py
index ea2f2a9..020d5bd 100644
--- a/tests/test_miner.py
+++ b/tests/test_miner.py
@@ -7,7 +7,7 @@ import chromadb
 import yaml
 
 from mempalace.miner import mine, scan_project, status
-from mempalace.palace import file_already_mined
+from mempalace.palace import NORMALIZE_VERSION, file_already_mined
 
 
 def write_file(path: Path, content: str):
@@ -227,11 +227,17 @@ def test_file_already_mined_check_mtime():
         assert file_already_mined(col, test_file) is False
         assert file_already_mined(col, test_file, check_mtime=True) is False
 
-        # Add it with mtime
+        # Add it with mtime + current normalize_version
         col.add(
             ids=["d1"],
             documents=["hello world"],
-            metadatas=[{"source_file": test_file, "source_mtime": str(mtime)}],
+            metadatas=[
+                {
+                    "source_file": test_file,
+                    "source_mtime": str(mtime),
+                    "normalize_version": NORMALIZE_VERSION,
+                }
+            ],
         )
 
         # Already mined (no mtime check)
@@ -253,7 +259,12 @@ def test_file_already_mined_check_mtime():
         col.add(
             ids=["d2"],
             documents=["other"],
-            metadatas=[{"source_file": "/fake/no_mtime.txt"}],
+            metadatas=[
+                {
+                    "source_file": "/fake/no_mtime.txt",
+                    "normalize_version": NORMALIZE_VERSION,
+                }
+            ],
         )
         assert file_already_mined(col, "/fake/no_mtime.txt", check_mtime=True) is False
     finally:
@@ -296,3 +307,78 @@ def test_status_missing_palace_does_not_create_empty_collection(tmp_path, capsys
     out = capsys.readouterr().out
     assert "No palace found" in out
     assert not palace_path.exists()
+
+
+# ── normalize_version schema gate ───────────────────────────────────────
+#
+# When the normalization pipeline changes shape (e.g., strip_noise lands),
+# `NORMALIZE_VERSION` is bumped so pre-existing drawers can be silently
+# rebuilt on the next mine. These tests pin that contract.
+
+
+def test_file_already_mined_returns_false_for_stale_normalize_version():
+    """Pre-v2 drawers (no field, or older integer) must not short-circuit."""
+    tmpdir = tempfile.mkdtemp()
+    try:
+        palace_path = os.path.join(tmpdir, "palace")
+        os.makedirs(palace_path)
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection("mempalace_drawers")
+
+        # Pre-v2 drawer: no normalize_version field at all
+        col.add(
+            ids=["d_old"],
+            documents=["old"],
+            metadatas=[{"source_file": "/fake/old.jsonl"}],
+        )
+        assert file_already_mined(col, "/fake/old.jsonl") is False
+
+        # Explicitly older version
+        col.add(
+            ids=["d_v1"],
+            documents=["v1"],
+            metadatas=[{"source_file": "/fake/v1.jsonl", "normalize_version": 1}],
+        )
+        assert file_already_mined(col, "/fake/v1.jsonl") is False
+
+        # Current version — short-circuits
+        col.add(
+            ids=["d_current"],
+            documents=["cur"],
+            metadatas=[
+                {
+                    "source_file": "/fake/current.jsonl",
+                    "normalize_version": NORMALIZE_VERSION,
+                }
+            ],
+        )
+        assert file_already_mined(col, "/fake/current.jsonl") is True
+    finally:
+        del col, client
+        shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_add_drawer_stamps_normalize_version(tmp_path):
+    """Fresh drawers carry the current schema version so future upgrades work."""
+    from mempalace.miner import add_drawer
+
+    palace_path = tmp_path / "palace"
+    palace_path.mkdir()
+    client = chromadb.PersistentClient(path=str(palace_path))
+    col = client.get_or_create_collection("mempalace_drawers")
+    try:
+        added = add_drawer(
+            collection=col,
+            wing="test",
+            room="notes",
+            content="hello",
+            source_file=str(tmp_path / "src.md"),
+            chunk_index=0,
+            agent="unit",
+        )
+        assert added is True
+        stored = col.get(limit=1)
+        meta = stored["metadatas"][0]
+        assert meta["normalize_version"] == NORMALIZE_VERSION
+    finally:
+        del col, client