MemPalace: palace architecture, AAAK compression, knowledge graph

The memory system:
- Palace structure: Wings (people/projects) → Rooms (topics) → Closets (AAAK compressed) → Drawers (verbatim transcripts)
- Halls connect related rooms within a wing
- Tunnels cross-reference rooms across wings
- AAAK: 30x lossless compression dialect for AI agents
- Knowledge graph: temporal entity-relationship triples (SQLite)
- Palace graph: room-based navigation with tunnel detection
- MCP server: 19 tools — search, graph traversal, agent diary, AAAK auto-teach
- Onboarding: guided setup generates wing config + AAAK entity registry
- Contradiction detection: catches wrong pronouns, names, ages
- Auto-save hooks for Claude Code

96.6% Recall@5 on LongMemEval — highest zero-API score published. 100% with optional Haiku rerank (500/500). Local. Free. No API key required.
2026-04-04 18:16:04 -07:00
commit 068dbd9a7b
39 changed files with 9210 additions and 0 deletions
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+"""
+spellcheck.py — Spell-correct user messages before palace filing.
+
+Preserves:
+  - Technical terms (words with digits, hyphens, underscores)
+  - CamelCase and ALL_CAPS identifiers
+  - Known entity names (from EntityRegistry if available)
+  - URLs and file paths
+  - Words shorter than 4 chars (common abbreviations, pronouns, etc.)
+  - Proper nouns already capitalized in context
+
+Corrects:
+  - Genuine typos in lowercase, flowing text
+  - Common fat-finger words (knoe → know; digit-bearing tokens like 3am stay as-is)
+
+Usage:
+    from mempalace.spellcheck import spellcheck_user_text
+    corrected = spellcheck_user_text("lsresdy knoe the question befor")
+    # → "already know the question before"  (best effort)
+"""
+
+import re
+from pathlib import Path
+from typing import Optional
+
+# Lazy-load autocorrect — not everyone has it installed.
+# Both names are filled in by _get_speller() the first time it runs.
+_speller = None  # Speller instance once the import has succeeded, else None
+_autocorrect_available = None  # None = not probed yet; True/False = import outcome
+
+# System word list — loaded once, used to skip already-valid words
+_system_words: Optional[set] = None  # None until _get_system_words() caches a set
+_SYSTEM_DICT = Path("/usr/share/dict/words")
+
+
+def _get_speller():
+    """Return the shared autocorrect ``Speller``, or None when unavailable.
+
+    The import is attempted at most once per process; the outcome is
+    remembered in ``_autocorrect_available`` so later calls are cheap.
+    """
+    global _speller, _autocorrect_available
+
+    # Already probed: answer from the cached state.
+    if _autocorrect_available is not None:
+        return _speller if _autocorrect_available else None
+
+    try:
+        from autocorrect import Speller
+
+        _speller = Speller(lang="en")
+    except ImportError:
+        # Optional dependency is missing — remember that and pass through.
+        _autocorrect_available = False
+        return None
+
+    _autocorrect_available = True
+    return _speller
+
+
+def _get_system_words() -> set:
+    """Load the system word list once and cache it.
+
+    Returns:
+        A set of lowercased, whitespace-stripped entries read from
+        ``_SYSTEM_DICT``. An empty set is returned (and cached) when the
+        file is missing or cannot be read, so callers simply lose the
+        "already a valid word" shortcut instead of crashing.
+    """
+    global _system_words
+    if _system_words is None:
+        try:
+            # Explicit encoding keeps the result independent of the locale;
+            # undecodable bytes are dropped rather than aborting the load.
+            with open(_SYSTEM_DICT, encoding="utf-8", errors="ignore") as f:
+                entries = (line.strip() for line in f)
+                _system_words = {entry.lower() for entry in entries if entry}
+        except OSError:
+            # Missing file, directory, permission problem: the word list is
+            # an optional optimisation, so degrade to "no known words".
+            _system_words = set()
+    return _system_words
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Patterns that mark a token as "don't touch this"
+# ─────────────────────────────────────────────────────────────────────────────
+
+# Matches any token with a digit anywhere in it: 3am, bge-large-v1.5, top-10
+_HAS_DIGIT = re.compile(r"\d")
+
+# CamelCase: an uppercase letter, a run of lowercase letters, then another
+# uppercase letter, anywhere in the token (ChromaDB, MemPalace, LongMemEval)
+_IS_CAMEL = re.compile(r"[A-Z][a-z]+[A-Z]")
+
+# ALL_CAPS: the whole token consists of uppercase letters and/or the listed
+# symbols (NDCG, MAX_RESULTS). The class holds no digits, so a token such as
+# R@5 is not matched here — it is caught by _HAS_DIGIT instead.
+_IS_ALLCAPS = re.compile(r"^[A-Z_@#$%^&*()+=\[\]{}|<>?.:/\\]+$")
+
+# Technical token: contains hyphens or underscores (bge-large, train_test)
+_IS_TECHNICAL = re.compile(r"[-_]")
+
+# URL-like or file-path-like: http(s)://, www., /Users/ or ~/ anywhere in the
+# token, or a trailing dot followed by 2-4 letters (case-insensitive)
+_IS_URL = re.compile(r"https?://|www\.|/Users/|~/|\.[a-z]{2,4}$", re.IGNORECASE)
+
+# Code fences / markdown markup: backtick, *, _, #, braces, square brackets,
+# backslash.
+# NOTE(review): despite the name, no emoji code points are matched here.
+_IS_CODE_OR_EMOJI = re.compile(r"[`*_#{}[\]\\]")
+
+# Very short tokens — skip (I, a, ok, my, etc.). Tokens shorter than this are
+# never corrected, which also avoids ambiguous 3-char typos like "kno" that
+# autocorrect resolves as "no" rather than "know".
+_MIN_LENGTH = 4
+
+
+def _should_skip(token: str, known_names: set) -> bool:
+    """Decide whether ``token`` must be passed through uncorrected.
+
+    A token is protected when it is shorter than ``_MIN_LENGTH``, when any
+    of the "don't touch" patterns fires (digits, CamelCase, ALL_CAPS,
+    hyphen/underscore, URL/path, markup characters), or when its lowercase
+    form is one of ``known_names``.
+    """
+    if len(token) < _MIN_LENGTH:
+        return True
+
+    # _IS_ALLCAPS is anchored and applied with match(); the rest scan the
+    # whole token with search().
+    protectors = (
+        _HAS_DIGIT.search,
+        _IS_CAMEL.search,
+        _IS_ALLCAPS.match,
+        _IS_TECHNICAL.search,
+        _IS_URL.search,
+        _IS_CODE_OR_EMOJI.search,
+    )
+    if any(probe(token) for probe in protectors):
+        return True
+
+    # Known proper names (entity registry)
+    return token.lower() in known_names
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Load known entity names from registry (optional, best-effort)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def _load_known_names() -> set:
+    """Collect lowercase canonical names and aliases from EntityRegistry.
+
+    Best-effort: any failure (registry module missing, load error, entry of
+    an unexpected shape) yields an empty set instead of an exception.
+    """
+    try:
+        from mempalace.entity_registry import EntityRegistry
+
+        records = EntityRegistry.load()._data.get("entities", {}).values()
+        collected = set()
+        for record in records:
+            collected.add(record.get("canonical", "").lower())
+            collected.update(alias.lower() for alias in record.get("aliases", []))
+        return collected
+    except Exception:
+        return set()
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Edit distance — used to guard against over-aggressive autocorrect
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def _edit_distance(a: str, b: str) -> int:
+    """Return the Levenshtein distance between ``a`` and ``b``.
+
+    Insertions, deletions and substitutions each cost 1. The table is
+    computed with a single row that is updated in place.
+    """
+    if a == b:
+        return 0
+    if not a or not b:
+        # One side is empty: the distance is the length of the other side.
+        return max(len(a), len(b))
+
+    row = list(range(len(b) + 1))
+    for i, ch_a in enumerate(a, start=1):
+        diagonal = row[0]  # value of the previous row at column j - 1
+        row[0] = i
+        for j, ch_b in enumerate(b, start=1):
+            above = row[j]  # previous row, same column
+            row[j] = min(above + 1, row[j - 1] + 1, diagonal + (ch_a != ch_b))
+            diagonal = above
+    return row[-1]
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Core correction
+# ─────────────────────────────────────────────────────────────────────────────
+
+# Tokenize on whitespace: each match is one maximal run of non-whitespace
+# characters, so punctuation stays attached to its word and the whitespace
+# between tokens is never part of a match.
+_TOKEN_RE = re.compile(r"(\S+)")
+
+
+def spellcheck_user_text(text: str, known_names: Optional[set] = None) -> str:
+    """
+    Return ``text`` with likely typos corrected, token by token.
+
+    Args:
+        text: Raw user message text.
+        known_names: Lowercase names/terms that must never be altered. When
+                     None, the set is obtained from _load_known_names().
+
+    Returns:
+        The corrected text with all whitespace preserved. The input is
+        returned unchanged when autocorrect is not installed.
+    """
+    speller = _get_speller()
+    if speller is None:
+        return text  # autocorrect not installed — pass through unchanged
+
+    protected = _load_known_names() if known_names is None else known_names
+    dictionary = _get_system_words()
+
+    def _repair(match):
+        raw = match.group(0)
+        # Check the word without its trailing punctuation; reattach it later.
+        core = raw.rstrip(".,!?;:'\")")
+        tail = raw[len(core) :]
+
+        leave_alone = (
+            not core
+            or _should_skip(core, protected)
+            # Capitalized words are likely proper nouns.
+            or core[0].isupper()
+            # Already valid English — prevents "coherently" → "inherently".
+            or core.lower() in dictionary
+        )
+        if leave_alone:
+            return raw
+
+        suggestion = speller(core)
+        if suggestion == core:
+            return raw
+
+        # Safety net: reject suggestions that drift too far from what was
+        # typed (2 edits for words of up to 7 chars, 3 for longer ones).
+        budget = 3 if len(core) > 7 else 2
+        if _edit_distance(core, suggestion) > budget:
+            return raw
+
+        return suggestion + tail
+
+    return _TOKEN_RE.sub(_repair, text)
+
+
+def spellcheck_transcript_line(line: str) -> str:
+    """
+    Spell-correct a single transcript line.
+
+    Only lines whose first non-whitespace character is '>' (user turns) are
+    touched; every other line, and a '>' line with no message, is returned
+    unchanged. The indentation, the '>' marker and any whitespace after it
+    are preserved exactly.
+
+    Args:
+        line: One line of the transcript, without its newline.
+
+    Returns:
+        The line with its message part passed through spellcheck_user_text().
+    """
+    body = line.lstrip()
+    if not body.startswith(">"):
+        return line
+
+    # The message starts after the marker and whatever whitespace follows it.
+    # Measuring the prefix this way (instead of assuming exactly "> ") keeps
+    # a line written as ">word" from losing its first letter to the prefix.
+    message = body[1:].lstrip()
+    if not message:
+        return line
+
+    prefix = line[: len(line) - len(message)]
+    return prefix + spellcheck_user_text(message)
+
+
+def spellcheck_transcript(content: str) -> str:
+    """
+    Spell-correct every user turn of a complete transcript.
+
+    The text is split on newlines, each line is passed through
+    spellcheck_transcript_line() (which only alters lines starting with
+    '>'), and the lines are joined back with newlines.
+    """
+    corrected_lines = map(spellcheck_transcript_line, content.split("\n"))
+    return "\n".join(corrected_lines)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Quick test
+# ─────────────────────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # Manual smoke run: print each sample and, when it changed, its correction.
+    samples = [
+        "lsresdy knoe the question befor",
+        "isn't there meny diferent benchmarks tesing questions?",
+        "also can you pleese spell chekc my questions befroe storing",
+        "it's realy hard for me to writte coherently at 3am",
+        "Mempalace cant be fine-tunned if you alredy kno the question",
+        # Should NOT change these:
+        "ChromaDB bge-large-en-v1.5 NDCG@10 R@5",
+        "Riley picked up Sam from school",
+        "hybrid_v4 top-k=50 longmemeval_bench.py",
+    ]
+    demo_names = {"riley", "sam", "mempalace"}
+    marker = " ← CHANGED"
+
+    print("Spell-check test\n" + "=" * 50)
+    for sample in samples:
+        outcome = spellcheck_user_text(sample, known_names=demo_names)
+        print(f"\nIN:  {sample}")
+        if outcome == sample:
+            print("OUT: (unchanged)")
+        else:
+            print(f"OUT: {outcome}{marker}")