fix: strip system tags, hook output, and Claude UI chrome from drawers

normalize.py now strips before filing:
- <system-reminder>, <command-message>, <command-name> tags
- <task-notification>, <user-prompt-submit-hook>, <hook_output> tags
- Hook status messages (CURRENT TIME, Checking verified facts, etc.)
- Claude Code UI chrome (ctrl+o to expand, progress bars, etc.)
- Excess blank lines (runs of three or more are collapsed to two)

This noise was going straight into drawers, wasting storage space
and polluting search results. strip_noise() runs on all normalized
output regardless of input format (JSONL, JSON, plain text).

689/689 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
MSL
2026-04-13 01:55:25 -07:00
committed by Igor Lins e Silva
parent 6614b9b4e7
commit 9b99c136ee
+52 -4
View File
@@ -16,10 +16,54 @@ No API key. No internet. Everything local.
import json import json
import os import os
import re
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
# ─── Noise stripping ─────────────────────────────────────────────────────
# Claude Code and other tools inject system tags, hook output, UI chrome,
# and tool-call JSON into transcripts. These waste drawer space and pollute
# search results. Strip them before filing.
# Paired tags whose entire contents (possibly spanning several lines, hence
# DOTALL) are removed. The non-greedy ``.*?`` stops at the first closing tag
# so that two separate tagged spans do not swallow the text between them.
_NOISE_TAG_PATTERNS = [
    re.compile(r"<system-reminder[^>]*>.*?</system-reminder>", re.DOTALL),
    re.compile(r"<command-message[^>]*>.*?</command-message>", re.DOTALL),
    re.compile(r"<command-name[^>]*>.*?</command-name>", re.DOTALL),
    re.compile(r"<task-notification[^>]*>.*?</task-notification>", re.DOTALL),
    re.compile(r"<user-prompt-submit-hook[^>]*>.*?</user-prompt-submit-hook>", re.DOTALL),
    re.compile(r"<hook_output[^>]*>.*?</hook_output>", re.DOTALL),
]

# Literal hook/status labels removed wherever they occur. Only the label
# itself is deleted; any text that follows it on the same line is kept.
_NOISE_STRINGS = [
    "CURRENT TIME:",
    "VERIFIED FACTS (do not contradict)",
    "AGENT SPECIALIZATION:",
    "Checking verified facts...",
    "Injecting timestamp...",
    "Starting background pipeline...",
    "Checking emotional weights...",
    "Auto-save reminder...",
    "Checking pipeline...",
    "MemPalace auto-save checkpoint.",
]


def strip_noise(text: str) -> str:
    """Remove system tags, hook output, and Claude Code UI chrome from text.

    Processing order:

    1. Delete every tagged span matched by ``_NOISE_TAG_PATTERNS``.
    2. Delete every literal label listed in ``_NOISE_STRINGS``.
    3. Delete UI chrome: lines containing ``(ctrl+o to expand)``,
       ``Ran N stop/pre/post hook...`` messages, and ``+N lines`` markers.
    4. Collapse runs of four or more newlines down to three.
    5. Strip leading and trailing whitespace from the result.

    Args:
        text: Transcript text to clean.

    Returns:
        The cleaned text, with surrounding whitespace removed.
    """
    for pat in _NOISE_TAG_PATTERNS:
        text = pat.sub("", text)
    for noise in _NOISE_STRINGS:
        text = text.replace(noise, "")

    # Strip Claude Code UI chrome.
    text = re.sub(r".*\(ctrl\+o to expand\).*\n?", "", text)
    text = re.sub(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", "", text, flags=re.IGNORECASE)

    # "+N lines" markers are removed in two passes so that neighbouring lines
    # are never glued together:
    #   1. a marker that starts its line takes the whole line, newline included;
    #   2. a marker that follows other text is cut to end-of-line, but the
    #      newline stays so the next line remains separate.
    # Only horizontal whitespace is consumed before the marker; matching
    # ``\s*`` there would also swallow the preceding line break.
    text = re.sub(r"^[ \t]*\+\d+ lines.*\n?", "", text, flags=re.MULTILINE)
    text = re.sub(r"[ \t]*\+\d+ lines.*", "", text)

    # Collapse runs of blank lines.
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    return text.strip()
def normalize(filepath: str) -> str: def normalize(filepath: str) -> str:
""" """
Load a file and normalize to transcript format if it's a chat export. Load a file and normalize to transcript format if it's a chat export.
@@ -40,19 +84,23 @@ def normalize(filepath: str) -> str:
if not content.strip(): if not content.strip():
return content return content
# Already has > markers — pass through # Already has > markers — pass through (strip noise but preserve trailing newline)
lines = content.split("\n") lines = content.split("\n")
if sum(1 for line in lines if line.strip().startswith(">")) >= 3: if sum(1 for line in lines if line.strip().startswith(">")) >= 3:
return content cleaned = strip_noise(content)
# Preserve trailing newline if original had one
if content.endswith("\n") and not cleaned.endswith("\n"):
cleaned += "\n"
return cleaned
# Try JSON normalization # Try JSON normalization
ext = Path(filepath).suffix.lower() ext = Path(filepath).suffix.lower()
if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["): if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["):
normalized = _try_normalize_json(content) normalized = _try_normalize_json(content)
if normalized: if normalized:
return normalized return strip_noise(normalized)
return content return strip_noise(content)
def _try_normalize_json(content: str) -> Optional[str]: def _try_normalize_json(content: str) -> Optional[str]: