Merge pull request #147 from milla-jovovich/fix/aaak-honest-stats
fix: honest AAAK stats — word-based token estimator, lossy labels
This commit is contained in:
+47
-22
@@ -1,13 +1,19 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
AAAK Dialect -- Compressed Symbolic Memory Language
|
AAAK Dialect -- Structured Symbolic Summary Format
|
||||||
====================================================
|
====================================================
|
||||||
|
|
||||||
A structured symbolic format that ANY LLM reads natively at ~30x compression.
|
A lossy summarization format that extracts entities, topics, key sentences,
|
||||||
Not latent vectors. Not English prose. A universal memory compression dialect.
|
emotions, and flags from plain text into a compact structured representation.
|
||||||
|
Any LLM reads it natively — no decoder required.
|
||||||
|
|
||||||
Works with: Claude, ChatGPT, Gemini, Llama, Mistral -- any model that reads text.
|
Works with: Claude, ChatGPT, Gemini, Llama, Mistral -- any model that reads text.
|
||||||
|
|
||||||
|
NOTE: AAAK is NOT lossless compression. The original text cannot be reconstructed
|
||||||
|
from AAAK output. It is a structured summary layer (closets) that points to the
|
||||||
|
original verbatim content (drawers). The 96.6% benchmark score is from raw mode,
|
||||||
|
not AAAK mode.
|
||||||
|
|
||||||
Adapted for mempalace: works standalone on plain text and ChromaDB drawers.
|
Adapted for mempalace: works standalone on plain text and ChromaDB drawers.
|
||||||
No dependency on palace.py or layers.py.
|
No dependency on palace.py or layers.py.
|
||||||
|
|
||||||
@@ -538,19 +544,19 @@ class Dialect:
|
|||||||
|
|
||||||
def compress(self, text: str, metadata: dict = None) -> str:
|
def compress(self, text: str, metadata: dict = None) -> str:
|
||||||
"""
|
"""
|
||||||
Compress plain text into AAAK Dialect format.
|
Summarize plain text into AAAK Dialect format.
|
||||||
|
|
||||||
This is the primary method for mempalace: takes any text content
|
Extracts entities, topics, a key sentence, emotions, and flags
|
||||||
(drawer content, transcript chunk, note) and returns a compressed
|
from the input text. This is lossy — the original text cannot be
|
||||||
symbolic representation.
|
reconstructed from the output.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Plain text content to compress
|
text: Plain text content to summarize
|
||||||
metadata: Optional dict with keys like 'source_file', 'wing',
|
metadata: Optional dict with keys like 'source_file', 'wing',
|
||||||
'room', 'date', etc.
|
'room', 'date', etc.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
AAAK-compressed string (~30x smaller than input)
|
AAAK-formatted summary string
|
||||||
"""
|
"""
|
||||||
metadata = metadata or {}
|
metadata = metadata or {}
|
||||||
|
|
||||||
@@ -930,19 +936,34 @@ class Dialect:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def count_tokens(text: str) -> int:
|
def count_tokens(text: str) -> int:
|
||||||
"""Rough token count (1 token ~ 3 chars for structured text)."""
|
"""Estimate token count using word-based heuristic (~1.3 tokens per word).
|
||||||
return len(text) // 3
|
|
||||||
|
This is an approximation. For accurate counts, use a real tokenizer
|
||||||
|
like tiktoken. The old len(text)//3 heuristic was wildly inaccurate
|
||||||
|
and made AAAK compression ratios look much better than reality.
|
||||||
|
"""
|
||||||
|
words = text.split()
|
||||||
|
# Most English words tokenize to 1-2 tokens; punctuation and
|
||||||
|
# special chars in AAAK (|, +, :) each cost a token.
|
||||||
|
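# ~1.3 tokens/word is a rough average for English prose; splitting on whitespace can undercount punctuation-dense AAAK output, so it is not a conservative bound there.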
|
||||||
|
return max(1, int(len(words) * 1.3))
|
||||||
|
|
||||||
def compression_stats(self, original_text: str, compressed: str) -> dict:
|
def compression_stats(self, original_text: str, compressed: str) -> dict:
|
||||||
"""Get compression statistics for a text->AAAK conversion."""
|
"""Get size comparison stats for a text->AAAK conversion.
|
||||||
|
|
||||||
|
NOTE: AAAK is lossy summarization, not compression. The "ratio"
|
||||||
|
reflects how much shorter the summary is, not a compression ratio
|
||||||
|
in the traditional sense — information is lost.
|
||||||
|
"""
|
||||||
orig_tokens = self.count_tokens(original_text)
|
orig_tokens = self.count_tokens(original_text)
|
||||||
comp_tokens = self.count_tokens(compressed)
|
comp_tokens = self.count_tokens(compressed)
|
||||||
return {
|
return {
|
||||||
"original_tokens": orig_tokens,
|
"original_tokens_est": orig_tokens,
|
||||||
"compressed_tokens": comp_tokens,
|
"summary_tokens_est": comp_tokens,
|
||||||
"ratio": orig_tokens / max(comp_tokens, 1),
|
"size_ratio": round(orig_tokens / max(comp_tokens, 1), 1),
|
||||||
"original_chars": len(original_text),
|
"original_chars": len(original_text),
|
||||||
"compressed_chars": len(compressed),
|
"summary_chars": len(compressed),
|
||||||
|
"note": "Estimates only. Use tiktoken for accurate counts. AAAK is lossy.",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1021,9 +1042,9 @@ if __name__ == "__main__":
|
|||||||
encoded = dialect.encode_file(data)
|
encoded = dialect.encode_file(data)
|
||||||
stats = dialect.compression_stats(json_str, encoded)
|
stats = dialect.compression_stats(json_str, encoded)
|
||||||
print("=== COMPRESSION STATS ===")
|
print("=== COMPRESSION STATS ===")
|
||||||
print(f"JSON: ~{stats['original_tokens']:,} tokens")
|
print(f"JSON: ~{stats['original_tokens_est']:,} tokens (est)")
|
||||||
print(f"AAAK: ~{stats['compressed_tokens']:,} tokens")
|
print(f"AAAK: ~{stats['summary_tokens_est']:,} tokens (est)")
|
||||||
print(f"Ratio: {stats['ratio']:.0f}x")
|
print(f"Ratio: {stats['size_ratio']}x (lossy — information is lost)")
|
||||||
print()
|
print()
|
||||||
print("=== AAAK DIALECT OUTPUT ===")
|
print("=== AAAK DIALECT OUTPUT ===")
|
||||||
print(encoded)
|
print(encoded)
|
||||||
@@ -1043,8 +1064,12 @@ if __name__ == "__main__":
|
|||||||
text = " ".join(args)
|
text = " ".join(args)
|
||||||
compressed = dialect.compress(text)
|
compressed = dialect.compress(text)
|
||||||
stats = dialect.compression_stats(text, compressed)
|
stats = dialect.compression_stats(text, compressed)
|
||||||
print(f"Original: ~{stats['original_tokens']} tokens ({stats['original_chars']} chars)")
|
print(
|
||||||
print(f"AAAK: ~{stats['compressed_tokens']} tokens ({stats['compressed_chars']} chars)")
|
f"Original: ~{stats['original_tokens_est']} tokens est ({stats['original_chars']} chars)"
|
||||||
print(f"Ratio: {stats['ratio']:.1f}x")
|
)
|
||||||
|
print(
|
||||||
|
f"AAAK: ~{stats['summary_tokens_est']} tokens est ({stats['summary_chars']} chars)"
|
||||||
|
)
|
||||||
|
print(f"Ratio: {stats['size_ratio']}x (lossy summary, not lossless compression)")
|
||||||
print()
|
print()
|
||||||
print(compressed)
|
print(compressed)
|
||||||
|
|||||||
Reference in New Issue
Block a user