b214aced90
Move all entity-detection lexical patterns (person verbs, pronouns,
dialogue markers, project verbs, stopwords, candidate character class)
out of hardcoded module-level constants and into the entity section of
each locale's JSON in mempalace/i18n/. Adds a languages parameter to
every public function so callers union patterns across the desired
locales. The default stays ("en",), so all existing callers and tests
behave unchanged.
Also adds:
- get_entity_patterns(langs) helper in mempalace/i18n/ that merges
  patterns across requested languages, dedupes lists, unions stopwords,
  and falls back to English for unknown locales
- MempalaceConfig.entity_languages property + setter, with env var
  override (MEMPALACE_ENTITY_LANGUAGES, comma-separated)
- mempalace init --lang en,pt-br flag (persists to config.json)
- Per-language candidate_pattern so non-Latin scripts (Cyrillic,
  Devanagari, CJK) can register their own character classes instead of
  being silently dropped by the ASCII-only [A-Z][a-z]+ default
- _build_patterns LRU cache keyed by (name, languages) so multi-language
  callers don't poison each other's cache slots
Why now: the open language PRs (#760 ru, #773 hi, #778 id, #907 it) only
add CLI strings via mempalace/i18n/. PR #156 (pt-br) is the first that
needed entity_detector changes and inlined a _PTBR variant of every
constant. That doesn't scale past 2-3 languages — every text gets
checked against every language's patterns regardless of relevance, and
candidate extraction still drops accented and non-Latin names.
This PR sets the standard so future locale contributors only edit one
JSON file (no Python changes), and entity detection scales linearly
with how many languages a user actually enabled, not how many ship.
147 lines · 6.4 KiB · JSON
{
  "lang": "en",
  "label": "English",
  "terms": {
    "palace": "palace",
    "wing": "wing",
    "hall": "hall",
    "closet": "closet",
    "drawer": "drawer",
    "mine": "mine",
    "search": "search",
    "status": "status",
    "init": "init",
    "repair": "repair",
    "migrate": "migrate",
    "entity": "entity",
    "topic": "topic"
  },
  "cli": {
    "mine_start": "Mining {path}...",
    "mine_complete": "Done. {closets} closets, {drawers} drawers created.",
    "mine_skip": "Already mined. Use --force to re-mine.",
    "search_no_results": "No results for: {query}",
    "search_results": "Found {count} results:",
    "status_palace": "Palace: {path}",
    "status_wings": "{count} wings",
    "status_closets": "{count} closets",
    "status_drawers": "{count} drawers",
    "init_complete": "Palace initialized at {path}",
    "init_exists": "Palace already exists at {path}",
    "repair_complete": "Repair complete. {fixed} issues fixed.",
    "migrate_complete": "Migration complete.",
    "no_palace": "No palace found. Run: mempalace init <dir>"
  },
  "aaak": {
    "instruction": "Compress to index format. Hyphens between words, pipes between concepts. Drop articles and filler. Keep names and numbers exact."
  },
  "regex": {
    "topic_pattern": "[A-Z][a-z]{2,}|[A-Za-z][A-Za-z0-9_]{2,}",
    "stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false",
    "quote_pattern": "\"([^\"]{20,200})\"",
    "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
  },
  "entity": {
    "candidate_pattern": "[A-Z][a-z]{1,19}",
    "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
    "person_verb_patterns": [
      "\\b{name}\\s+said\\b",
      "\\b{name}\\s+asked\\b",
      "\\b{name}\\s+told\\b",
      "\\b{name}\\s+replied\\b",
      "\\b{name}\\s+laughed\\b",
      "\\b{name}\\s+smiled\\b",
      "\\b{name}\\s+cried\\b",
      "\\b{name}\\s+felt\\b",
      "\\b{name}\\s+thinks?\\b",
      "\\b{name}\\s+wants?\\b",
      "\\b{name}\\s+loves?\\b",
      "\\b{name}\\s+hates?\\b",
      "\\b{name}\\s+knows?\\b",
      "\\b{name}\\s+decided\\b",
      "\\b{name}\\s+pushed\\b",
      "\\b{name}\\s+wrote\\b",
      "\\bhey\\s+{name}\\b",
      "\\bthanks?\\s+{name}\\b",
      "\\bhi\\s+{name}\\b",
      "\\bdear\\s+{name}\\b"
    ],
    "pronoun_patterns": [
      "\\bshe\\b",
      "\\bher\\b",
      "\\bhers\\b",
      "\\bhe\\b",
      "\\bhim\\b",
      "\\bhis\\b",
      "\\bthey\\b",
      "\\bthem\\b",
      "\\btheir\\b"
    ],
    "dialogue_patterns": [
      "^>\\s*{name}[:\\s]",
      "^{name}:\\s",
      "^\\[{name}\\]",
      "\"{name}\\s+said"
    ],
    "direct_address_pattern": "\\bhey\\s+{name}\\b|\\bthanks?\\s+{name}\\b|\\bhi\\s+{name}\\b",
    "project_verb_patterns": [
      "\\bbuilding\\s+{name}\\b",
      "\\bbuilt\\s+{name}\\b",
      "\\bship(?:ping|ped)?\\s+{name}\\b",
      "\\blaunch(?:ing|ed)?\\s+{name}\\b",
      "\\bdeploy(?:ing|ed)?\\s+{name}\\b",
      "\\binstall(?:ing|ed)?\\s+{name}\\b",
      "\\bthe\\s+{name}\\s+architecture\\b",
      "\\bthe\\s+{name}\\s+pipeline\\b",
      "\\bthe\\s+{name}\\s+system\\b",
      "\\bthe\\s+{name}\\s+repo\\b",
      "\\b{name}\\s+v\\d+\\b",
      "\\b{name}\\.py\\b",
      "\\b{name}-core\\b",
      "\\b{name}-local\\b",
      "\\bimport\\s+{name}\\b",
      "\\bpip\\s+install\\s+{name}\\b"
    ],
    "stopwords": [
      "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
      "for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
      "be", "been", "being", "have", "has", "had", "do", "does", "did",
      "will", "would", "could", "should", "may", "might", "must", "shall", "can",
      "this", "that", "these", "those", "it", "its", "they", "them", "their",
      "we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her",
      "who", "what", "when", "where", "why", "how", "which",
      "if", "then", "so", "not", "no", "yes", "ok", "okay",
      "just", "very", "really", "also", "already", "still", "even", "only",
      "here", "there", "now", "too", "up", "out", "about", "like",
      "use", "get", "got", "make", "made", "take", "put", "come", "go", "see",
      "know", "think", "true", "false", "none", "null", "new", "old", "all", "any", "some",
      "return", "print", "def", "class", "import",
      "step", "usage", "run", "check", "find", "add", "set", "list",
      "args", "dict", "str", "int", "bool", "path", "file", "type", "name",
      "note", "example", "option", "result", "error", "warning", "info",
      "every", "each", "more", "less", "next", "last", "first", "second",
      "stack", "layer", "mode", "test", "stop", "start", "copy", "move",
      "source", "target", "output", "input", "data", "item", "key", "value",
      "returns", "raises", "yields", "self", "cls", "kwargs",
      "world", "well", "want", "topic", "choose", "social", "cars", "phones",
      "healthcare", "ex", "machina", "deus", "human", "humans", "people",
      "things", "something", "nothing", "everything", "anything", "someone",
      "everyone", "anyone", "way", "time", "day", "life", "place", "thing",
      "part", "kind", "sort", "case", "point", "idea", "fact", "sense",
      "question", "answer", "reason", "number", "version", "system",
      "hey", "hi", "hello", "thanks", "thank", "right", "let",
      "click", "hit", "press", "tap", "drag", "drop", "open", "close",
      "save", "load", "launch", "install", "download", "upload", "scroll",
      "select", "enter", "submit", "cancel", "confirm", "delete", "paste",
      "write", "read", "search", "show", "hide",
      "desktop", "documents", "downloads", "users", "home", "library",
      "applications", "preferences", "settings", "terminal",
      "actor", "vector", "remote", "control", "duration", "fetch",
      "agents", "tools", "others", "guards", "ethics", "regulation",
      "learning", "thinking", "memory", "language", "intelligence",
      "technology", "society", "culture", "future", "history", "science",
      "model", "models", "network", "networks", "training", "inference"
    ]
  }
}