fix: strip system tags, hook output, and Claude UI chrome from drawers
normalize.py now strips the following before filing:
- <system-reminder>, <command-message>, <command-name> tags
- <task-notification>, <user-prompt-submit-hook>, <hook_output> tags
- Hook status messages (CURRENT TIME, Checking verified facts, etc.)
- Claude Code UI chrome (ctrl+o to expand, progress bars, etc.)

It also collapses runs of blank lines.

This noise was going straight into drawers, wasting storage space and polluting search results. strip_noise() runs on all normalized output regardless of input format (JSONL, JSON, plain text).

689/689 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+52
-4
@@ -16,10 +16,54 @@ No API key. No internet. Everything local.
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Noise stripping ─────────────────────────────────────────────────────
|
||||||
|
# Claude Code and other tools inject system tags, hook output, UI chrome,
|
||||||
|
# and tool-call JSON into transcripts. These waste drawer space and pollute
|
||||||
|
# search results. Strip them before filing.
|
||||||
|
|
||||||
|
# Paired tags whose entire contents (possibly spanning several lines) are
# removed. The match is non-greedy, so each opening tag pairs with the
# nearest matching closing tag.
_NOISE_TAG_PATTERNS = [
    re.compile(r"<system-reminder[^>]*>.*?</system-reminder>", re.DOTALL),
    re.compile(r"<command-message[^>]*>.*?</command-message>", re.DOTALL),
    re.compile(r"<command-name[^>]*>.*?</command-name>", re.DOTALL),
    re.compile(r"<task-notification[^>]*>.*?</task-notification>", re.DOTALL),
    re.compile(r"<user-prompt-submit-hook[^>]*>.*?</user-prompt-submit-hook>", re.DOTALL),
    re.compile(r"<hook_output[^>]*>.*?</hook_output>", re.DOTALL),
]

# Literal hook/status markers. Only the marker text itself is deleted; any
# other text on the same line is left in place.
_NOISE_STRINGS = [
    "CURRENT TIME:",
    "VERIFIED FACTS (do not contradict)",
    "AGENT SPECIALIZATION:",
    "Checking verified facts...",
    "Injecting timestamp...",
    "Starting background pipeline...",
    "Checking emotional weights...",
    "Auto-save reminder...",
    "Checking pipeline...",
    "MemPalace auto-save checkpoint.",
]

# Claude Code UI chrome. Each pattern consumes through the end of the line
# (including the newline, when present). They are applied in this order.
_UI_CHROME_PATTERNS = [
    # Whole line containing the "(ctrl+o to expand)" hint.
    re.compile(r".*\(ctrl\+o to expand\).*\n?"),
    # "Ran N stop/pre/post hook(s)" status, from the match to end of line.
    re.compile(r"Ran \d+ (?:stop|pre|post)\s*hook.*\n?", re.IGNORECASE),
    # "… +N lines" truncation marker, from the ellipsis to end of line.
    re.compile(r"…\s*\+\d+ lines.*\n?"),
]

# A newline followed by three or more blank lines. Lines holding only spaces
# or tabs count as blank: deleting a status marker from an indented line
# leaves exactly such a line behind, and it should still be collapsed.
_BLANK_RUN_PATTERN = re.compile(r"\n(?:[ \t]*\n){3,}")


def strip_noise(text: str) -> str:
    """Remove system tags, hook output, and Claude Code UI chrome from text.

    Steps, in order:

    1. Delete every tag block matched by ``_NOISE_TAG_PATTERNS``.
    2. Delete every literal occurrence of the ``_NOISE_STRINGS`` markers.
    3. Delete the UI-chrome fragments matched by ``_UI_CHROME_PATTERNS``.
    4. Collapse runs of four or more consecutive line breaks (ignoring
       space/tab-only lines in between) down to three.

    Args:
        text: Transcript text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    for tag_pattern in _NOISE_TAG_PATTERNS:
        text = tag_pattern.sub("", text)

    for marker in _NOISE_STRINGS:
        text = text.replace(marker, "")

    for chrome_pattern in _UI_CHROME_PATTERNS:
        text = chrome_pattern.sub("", text)

    text = _BLANK_RUN_PATTERN.sub("\n\n\n", text)
    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def normalize(filepath: str) -> str:
|
def normalize(filepath: str) -> str:
|
||||||
"""
|
"""
|
||||||
Load a file and normalize to transcript format if it's a chat export.
|
Load a file and normalize to transcript format if it's a chat export.
|
||||||
@@ -40,19 +84,23 @@ def normalize(filepath: str) -> str:
|
|||||||
if not content.strip():
|
if not content.strip():
|
||||||
return content
|
return content
|
||||||
|
|
||||||
# Already has > markers — pass through
|
# Already has > markers — pass through (strip noise but preserve trailing newline)
|
||||||
lines = content.split("\n")
|
lines = content.split("\n")
|
||||||
if sum(1 for line in lines if line.strip().startswith(">")) >= 3:
|
if sum(1 for line in lines if line.strip().startswith(">")) >= 3:
|
||||||
return content
|
cleaned = strip_noise(content)
|
||||||
|
# Preserve trailing newline if original had one
|
||||||
|
if content.endswith("\n") and not cleaned.endswith("\n"):
|
||||||
|
cleaned += "\n"
|
||||||
|
return cleaned
|
||||||
|
|
||||||
# Try JSON normalization
|
# Try JSON normalization
|
||||||
ext = Path(filepath).suffix.lower()
|
ext = Path(filepath).suffix.lower()
|
||||||
if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["):
|
if ext in (".json", ".jsonl") or content.strip()[:1] in ("{", "["):
|
||||||
normalized = _try_normalize_json(content)
|
normalized = _try_normalize_json(content)
|
||||||
if normalized:
|
if normalized:
|
||||||
return normalized
|
return strip_noise(normalized)
|
||||||
|
|
||||||
return content
|
return strip_noise(content)
|
||||||
|
|
||||||
|
|
||||||
def _try_normalize_json(content: str) -> Optional[str]:
|
def _try_normalize_json(content: str) -> Optional[str]:
|
||||||
|
|||||||
Reference in New Issue
Block a user