MemPalace: palace architecture, AAAK compression, knowledge graph
The memory system: - Palace structure: Wings (people/projects) → Rooms (topics) → Closets (AAAK compressed) → Drawers (verbatim transcripts) - Halls connect related rooms within a wing - Tunnels cross-reference rooms across wings - AAAK: 30x lossless compression dialect for AI agents - Knowledge graph: temporal entity-relationship triples (SQLite) - Palace graph: room-based navigation with tunnel detection - MCP server: 19 tools — search, graph traversal, agent diary, AAAK auto-teach - Onboarding: guided setup generates wing config + AAAK entity registry - Contradiction detection: catches wrong pronouns, names, ages - Auto-save hooks for Claude Code 96.6% Recall@5 on LongMemEval — highest zero-API score published. 100% with optional Haiku rerank (500/500). Local. Free. No API key required.
This commit is contained in:
@@ -0,0 +1,269 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
spellcheck.py — Spell-correct user messages before palace filing.
|
||||
|
||||
Preserves:
|
||||
- Technical terms (words with digits, hyphens, underscores)
|
||||
- CamelCase and ALL_CAPS identifiers
|
||||
- Known entity names (from EntityRegistry if available)
|
||||
- URLs and file paths
|
||||
- Words shorter than 3 chars (common abbreviations, pronouns, etc.)
|
||||
- Proper nouns already capitalized in context
|
||||
|
||||
Corrects:
|
||||
- Genuine typos in lowercase, flowing text
|
||||
- Common fat-finger words (3am → 3am, knoe → know)
|
||||
|
||||
Usage:
|
||||
from mempalace.spellcheck import spellcheck_user_text
|
||||
corrected = spellcheck_user_text("lsresdy knoe the question befor")
|
||||
# → "already know the question before" (best effort)
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Lazy-load autocorrect — not everyone has it installed
|
||||
_speller = None
|
||||
_autocorrect_available = None
|
||||
|
||||
# System word list — loaded once, used to skip already-valid words
|
||||
_system_words: Optional[set] = None
|
||||
_SYSTEM_DICT = Path("/usr/share/dict/words")
|
||||
|
||||
|
||||
def _get_speller():
|
||||
global _speller, _autocorrect_available
|
||||
if _autocorrect_available is None:
|
||||
try:
|
||||
from autocorrect import Speller
|
||||
|
||||
_speller = Speller(lang="en")
|
||||
_autocorrect_available = True
|
||||
except ImportError:
|
||||
_autocorrect_available = False
|
||||
return _speller if _autocorrect_available else None
|
||||
|
||||
|
||||
def _get_system_words() -> set:
|
||||
"""Load /usr/share/dict/words once and cache it."""
|
||||
global _system_words
|
||||
if _system_words is None:
|
||||
if _SYSTEM_DICT.exists():
|
||||
with open(_SYSTEM_DICT) as f:
|
||||
_system_words = {w.strip().lower() for w in f if w.strip()}
|
||||
else:
|
||||
_system_words = set()
|
||||
return _system_words
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Patterns that mark a token as "don't touch this"
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Matches any token with a digit anywhere in it: 3am, bge-large-v1.5, top-10
|
||||
_HAS_DIGIT = re.compile(r"\d")
|
||||
|
||||
# CamelCase: ChromaDB, MemPalace, LongMemEval
|
||||
_IS_CAMEL = re.compile(r"[A-Z][a-z]+[A-Z]")
|
||||
|
||||
# ALL_CAPS or all-caps with underscores: NDCG, R@5, MAX_RESULTS
|
||||
_IS_ALLCAPS = re.compile(r"^[A-Z_@#$%^&*()+=\[\]{}|<>?.:/\\]+$")
|
||||
|
||||
# Technical token: contains hyphens or underscores (bge-large, train_test)
|
||||
_IS_TECHNICAL = re.compile(r"[-_]")
|
||||
|
||||
# URL-like or file-path-like
|
||||
_IS_URL = re.compile(r"https?://|www\.|/Users/|~/|\.[a-z]{2,4}$", re.IGNORECASE)
|
||||
|
||||
# Code fences, markdown, or emoji-heavy
|
||||
_IS_CODE_OR_EMOJI = re.compile(r"[`*_#{}[\]\\]")
|
||||
|
||||
# Very short tokens — skip (I, a, ok, my, etc. — also avoids ambiguous 3-char typos
|
||||
# like "kno" which autocorrect resolves as "no" rather than "know")
|
||||
_MIN_LENGTH = 4
|
||||
|
||||
|
||||
def _should_skip(token: str, known_names: set) -> bool:
|
||||
"""Return True if this token should be left as-is."""
|
||||
if len(token) < _MIN_LENGTH:
|
||||
return True
|
||||
if _HAS_DIGIT.search(token):
|
||||
return True
|
||||
if _IS_CAMEL.search(token):
|
||||
return True
|
||||
if _IS_ALLCAPS.match(token):
|
||||
return True
|
||||
if _IS_TECHNICAL.search(token):
|
||||
return True
|
||||
if _IS_URL.search(token):
|
||||
return True
|
||||
if _IS_CODE_OR_EMOJI.search(token):
|
||||
return True
|
||||
# Known proper names (entity registry)
|
||||
if token.lower() in known_names:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Load known entity names from registry (optional, best-effort)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _load_known_names() -> set:
|
||||
"""Pull all registered names from EntityRegistry. Returns empty set on failure."""
|
||||
try:
|
||||
from mempalace.entity_registry import EntityRegistry
|
||||
|
||||
reg = EntityRegistry.load()
|
||||
names = set()
|
||||
for entity in reg._data.get("entities", {}).values():
|
||||
names.add(entity.get("canonical", "").lower())
|
||||
for alias in entity.get("aliases", []):
|
||||
names.add(alias.lower())
|
||||
return names
|
||||
except Exception:
|
||||
return set()
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Edit distance — used to guard against over-aggressive autocorrect
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _edit_distance(a: str, b: str) -> int:
|
||||
"""Levenshtein distance between two strings."""
|
||||
if a == b:
|
||||
return 0
|
||||
if not a:
|
||||
return len(b)
|
||||
if not b:
|
||||
return len(a)
|
||||
prev = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a, 1):
|
||||
curr = [i]
|
||||
for j, cb in enumerate(b, 1):
|
||||
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
|
||||
prev = curr
|
||||
return prev[-1]
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Core correction
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Split on word boundaries but keep punctuation attached to tokens
|
||||
_TOKEN_RE = re.compile(r"(\S+)")
|
||||
|
||||
|
||||
def spellcheck_user_text(text: str, known_names: Optional[set] = None) -> str:
|
||||
"""
|
||||
Spell-correct a user message.
|
||||
|
||||
Args:
|
||||
text: Raw user message text.
|
||||
known_names: Set of lowercase names/terms to preserve. If None,
|
||||
attempts to load from EntityRegistry automatically.
|
||||
|
||||
Returns:
|
||||
Corrected text. Falls back to original if autocorrect not installed.
|
||||
"""
|
||||
speller = _get_speller()
|
||||
if speller is None:
|
||||
return text # autocorrect not installed — pass through unchanged
|
||||
|
||||
if known_names is None:
|
||||
known_names = _load_known_names()
|
||||
|
||||
# Process token by token, preserving all whitespace
|
||||
sys_words = _get_system_words()
|
||||
|
||||
def _fix(match):
|
||||
token = match.group(0)
|
||||
# Strip trailing punctuation for checking, reattach after
|
||||
stripped = token.rstrip(".,!?;:'\")")
|
||||
punct = token[len(stripped) :]
|
||||
|
||||
if not stripped or _should_skip(stripped, known_names):
|
||||
return token
|
||||
|
||||
# Only correct lowercase words (capitalized words are likely proper nouns)
|
||||
if stripped[0].isupper():
|
||||
return token
|
||||
|
||||
# Skip words that are already valid English — prevents "coherently" → "inherently"
|
||||
if stripped.lower() in sys_words:
|
||||
return token
|
||||
|
||||
corrected = speller(stripped)
|
||||
|
||||
# Guard: don't apply if corrected word is too different from original.
|
||||
# Extra safety net for words not in the system dict but also not typos.
|
||||
if corrected != stripped:
|
||||
dist = _edit_distance(stripped, corrected)
|
||||
max_edits = 2 if len(stripped) <= 7 else 3
|
||||
if dist > max_edits:
|
||||
return token
|
||||
|
||||
return corrected + punct
|
||||
|
||||
return _TOKEN_RE.sub(_fix, text)
|
||||
|
||||
|
||||
def spellcheck_transcript_line(line: str) -> str:
|
||||
"""
|
||||
Spell-correct a single transcript line.
|
||||
Only touches lines that start with '>' (user turns).
|
||||
Assistant turns are never modified.
|
||||
"""
|
||||
stripped = line.lstrip()
|
||||
if not stripped.startswith(">"):
|
||||
return line
|
||||
|
||||
# '> actual message here'
|
||||
prefix_len = len(line) - len(stripped) + 2 # '> '
|
||||
message = line[prefix_len:]
|
||||
if not message.strip():
|
||||
return line
|
||||
|
||||
corrected = spellcheck_user_text(message)
|
||||
return line[:prefix_len] + corrected
|
||||
|
||||
|
||||
def spellcheck_transcript(content: str) -> str:
|
||||
"""
|
||||
Spell-correct all user turns in a full transcript.
|
||||
Only lines starting with '>' are touched.
|
||||
"""
|
||||
lines = content.split("\n")
|
||||
return "\n".join(spellcheck_transcript_line(line) for line in lines)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Quick test
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_cases = [
|
||||
"lsresdy knoe the question befor",
|
||||
"isn't there meny diferent benchmarks tesing questions?",
|
||||
"also can you pleese spell chekc my questions befroe storing",
|
||||
"it's realy hard for me to writte coherently at 3am",
|
||||
"Mempalace cant be fine-tunned if you alredy kno the question",
|
||||
# Should NOT change these:
|
||||
"ChromaDB bge-large-en-v1.5 NDCG@10 R@5",
|
||||
"Riley picked up Sam from school",
|
||||
"hybrid_v4 top-k=50 longmemeval_bench.py",
|
||||
]
|
||||
|
||||
print("Spell-check test\n" + "=" * 50)
|
||||
for msg in test_cases:
|
||||
result = spellcheck_user_text(msg, known_names={"riley", "sam", "mempalace"})
|
||||
changed = " ← CHANGED" if result != msg else ""
|
||||
print(f"\nIN: {msg}")
|
||||
if result != msg:
|
||||
print(f"OUT: {result}{changed}")
|
||||
else:
|
||||
print("OUT: (unchanged)")
|
||||
Reference in New Issue
Block a user