From 39c14be1136319f4471880c19a7510b3e5aee0f4 Mon Sep 17 00:00:00 2001 From: bensig Date: Tue, 7 Apr 2026 14:14:31 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20honest=20AAAK=20stats=20=E2=80=94=20word?= =?UTF-8?q?-based=20token=20estimator,=20lossy=20labels?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace len(text)//3 token heuristic with word-based estimate (~1.3 tokens/word) - Old heuristic inflated compression ratios by ~3-5x - Update docstrings: "compression" → "lossy summarization" - Update module docstring to clarify AAAK is NOT lossless - compression_stats() now returns honest field names and a note - CLI output labels ratios as lossy Fixes #43 --- mempalace/dialect.py | 69 ++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/mempalace/dialect.py b/mempalace/dialect.py index dc9dd69..e900965 100644 --- a/mempalace/dialect.py +++ b/mempalace/dialect.py @@ -1,13 +1,19 @@ #!/usr/bin/env python3 """ -AAAK Dialect -- Compressed Symbolic Memory Language +AAAK Dialect -- Structured Symbolic Summary Format ==================================================== -A structured symbolic format that ANY LLM reads natively at ~30x compression. -Not latent vectors. Not English prose. A universal memory compression dialect. +A lossy summarization format that extracts entities, topics, key sentences, +emotions, and flags from plain text into a compact structured representation. +Any LLM reads it natively — no decoder required. Works with: Claude, ChatGPT, Gemini, Llama, Mistral -- any model that reads text. +NOTE: AAAK is NOT lossless compression. The original text cannot be reconstructed +from AAAK output. It is a structured summary layer (closets) that points to the +original verbatim content (drawers). The 96.6% benchmark score is from raw mode, +not AAAK mode. + Adapted for mempalace: works standalone on plain text and ChromaDB drawers. No dependency on palace.py or layers.py. @@ -538,19 +544,19 @@ class Dialect: def compress(self, text: str, metadata: dict = None) -> str: """ - Compress plain text into AAAK Dialect format. + Summarize plain text into AAAK Dialect format. - This is the primary method for mempalace: takes any text content - (drawer content, transcript chunk, note) and returns a compressed - symbolic representation. + Extracts entities, topics, a key sentence, emotions, and flags + from the input text. This is lossy — the original text cannot be + reconstructed from the output. Args: - text: Plain text content to compress + text: Plain text content to summarize metadata: Optional dict with keys like 'source_file', 'wing', 'room', 'date', etc. Returns: - AAAK-compressed string (~30x smaller than input) + AAAK-formatted summary string """ metadata = metadata or {} @@ -930,19 +936,34 @@ class Dialect: @staticmethod def count_tokens(text: str) -> int: - """Rough token count (1 token ~ 3 chars for structured text).""" - return len(text) // 3 + """Estimate token count using word-based heuristic (~1.3 tokens per word). + + This is an approximation. For accurate counts, use a real tokenizer + like tiktoken. The old len(text)//3 heuristic was wildly inaccurate + and made AAAK compression ratios look much better than reality. + """ + words = text.split() + # Most English words tokenize to 1-2 tokens; punctuation and + # special chars in AAAK (|, +, :) each cost a token. + # ~1.3 tokens/word is a conservative average. + return max(1, int(len(words) * 1.3)) def compression_stats(self, original_text: str, compressed: str) -> dict: - """Get compression statistics for a text->AAAK conversion.""" + """Get size comparison stats for a text->AAAK conversion. + + NOTE: AAAK is lossy summarization, not compression. The "ratio" + reflects how much shorter the summary is, not a compression ratio + in the traditional sense — information is lost. + """ orig_tokens = self.count_tokens(original_text) comp_tokens = self.count_tokens(compressed) return { - "original_tokens": orig_tokens, - "compressed_tokens": comp_tokens, - "ratio": orig_tokens / max(comp_tokens, 1), + "original_tokens_est": orig_tokens, + "summary_tokens_est": comp_tokens, + "size_ratio": round(orig_tokens / max(comp_tokens, 1), 1), "original_chars": len(original_text), - "compressed_chars": len(compressed), + "summary_chars": len(compressed), + "note": "Estimates only. Use tiktoken for accurate counts. AAAK is lossy.", } @@ -1021,9 +1042,9 @@ if __name__ == "__main__": encoded = dialect.encode_file(data) stats = dialect.compression_stats(json_str, encoded) print("=== COMPRESSION STATS ===") - print(f"JSON: ~{stats['original_tokens']:,} tokens") - print(f"AAAK: ~{stats['compressed_tokens']:,} tokens") - print(f"Ratio: {stats['ratio']:.0f}x") + print(f"JSON: ~{stats['original_tokens_est']:,} tokens (est)") + print(f"AAAK: ~{stats['summary_tokens_est']:,} tokens (est)") + print(f"Ratio: {stats['size_ratio']}x (lossy — information is lost)") print() print("=== AAAK DIALECT OUTPUT ===") print(encoded) @@ -1043,8 +1064,12 @@ if __name__ == "__main__": text = " ".join(args) compressed = dialect.compress(text) stats = dialect.compression_stats(text, compressed) - print(f"Original: ~{stats['original_tokens']} tokens ({stats['original_chars']} chars)") - print(f"AAAK: ~{stats['compressed_tokens']} tokens ({stats['compressed_chars']} chars)") - print(f"Ratio: {stats['ratio']:.1f}x") + print( + f"Original: ~{stats['original_tokens_est']} tokens est ({stats['original_chars']} chars)" + ) + print( + f"AAAK: ~{stats['summary_tokens_est']} tokens est ({stats['summary_chars']} chars)" + ) + print(f"Ratio: {stats['size_ratio']}x (lossy summary, not lossless compression)") print() print(compressed)