refactor(entity_detector): make multi-language extensible via i18n JSON

Move all entity-detection lexical patterns (person verbs, pronouns, dialogue markers, project verbs, stopwords, candidate character class) out of hardcoded module-level constants and into the entity section of each locale's JSON in mempalace/i18n/. Adds a languages parameter to every public function so callers union patterns across the desired locales. The default stays ("en",), so all existing callers and tests behave as before.

Also adds:
- get_entity_patterns(langs) helper in mempalace/i18n/ that merges patterns across requested languages, dedupes lists, unions stopwords, skips locales without entity data, and falls back to English when none of the requested locales provides any
- MempalaceConfig.entity_languages property + setter, with env var override (MEMPALACE_ENTITY_LANGUAGES, comma-separated)
- mempalace init --lang en,pt-br flag (persists to config.json)
- Per-language candidate_pattern so non-Latin scripts (Cyrillic, Devanagari, CJK) can register their own character classes instead of being silently dropped by the ASCII-only [A-Z][a-z]+ default
- _build_patterns LRU cache keyed by (name, languages) so multi-language callers don't poison each other's cache slots

Why now: the open language PRs (#760 ru, #773 hi, #778 id, #907 it) only add CLI strings via mempalace/i18n/. PR #156 (pt-br) is the first that needed entity_detector changes and inlined a _PTBR variant of every constant. That doesn't scale past 2-3 languages — every text gets checked against every language's patterns regardless of relevance, and candidate extraction still drops accented and non-Latin names. This PR sets the standard so future locale contributors only edit one JSON file (no Python changes), and entity detection scales linearly with how many languages a user actually enabled, not how many ship.
2026-04-15 08:52:42 -03:00
parent 56b6a6360f
commit b214aced90
7 changed files with 641 additions and 421 deletions
@@ -7,6 +7,10 @@ Usage:
    print(t("cli.mine_start", path="/docs"))  # "Extraction de /docs..."
    print(t("terms.wing"))    # "aile"
    print(t("aaak.instruction"))  # AAAK compression instruction in French
+
+Each locale JSON may include an ``entity`` section with patterns used by
+``mempalace.entity_detector``. See ``get_entity_patterns`` for the merge rules
+and the README section "Adding a new language" for the schema.
 """

 import json
@@ -16,6 +20,9 @@ _LANG_DIR = Path(__file__).parent
 _strings: dict = {}
 _current_lang: str = "en"

+# Cache: tuple(langs) -> merged entity pattern dict
+_entity_cache: dict = {}
+

 def available_languages() -> list[str]:
    """Return list of available language codes."""
@@ -72,5 +79,112 @@ def get_regex() -> dict:
    return _strings.get("regex", {})


+def _load_entity_section(lang: str) -> dict:
+    """Return one locale's raw ``entity`` section, or {} if absent or unreadable."""
+    lang_file = _LANG_DIR / f"{lang}.json"
+    if not lang_file.exists():
+        return {}
+    try:
+        data = json.loads(lang_file.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, OSError):
+        return {}  # an unreadable or malformed locale file counts as "no data"
+    return data.get("entity", {}) or {}  # ``or {}`` maps a null/falsy section to {}
+
+
+def get_entity_patterns(languages=("en",)) -> dict:
+    """Return merged entity detection patterns for the requested languages.
+
+    Entity detection patterns live under each locale's ``entity`` section.
+    This function merges them into a single dict for consumption by
+    ``mempalace.entity_detector``.
+
+    Merge rules:
+      - List fields (person_verb_patterns, pronoun_patterns, dialogue_patterns,
+        project_verb_patterns) are concatenated in the order of ``languages``,
+        with duplicates removed while preserving first occurrence.
+      - ``stopwords`` are lowercased, unioned across all languages, and
+        returned as a sorted list.
+      - ``candidate_patterns`` and ``multi_word_patterns`` are returned as
+        lists (one per language) since they use different character classes;
+        callers run each pattern independently and union the matches.
+      - ``direct_address_patterns`` is a list of the per-language
+        ``direct_address_pattern`` alternations (each applied separately).
+
+    If ``languages`` is empty or no requested language declares entity data,
+    English is used as a fallback. Results are cached per language tuple and
+    the cached dict itself is returned, so callers must not mutate it.
+    """
+    if not languages:
+        languages = ("en",)
+    key = tuple(languages)  # NOTE(review): a bare str like "en" becomes ("e", "n")
+    if key in _entity_cache:
+        return _entity_cache[key]
+
+    candidate_patterns: list[str] = []
+    multi_word_patterns: list[str] = []
+    person_verbs: list[str] = []
+    pronouns: list[str] = []
+    dialogue: list[str] = []
+    direct_address: list[str] = []
+    project_verbs: list[str] = []
+    stopwords: set = set()
+
+    found_any = False
+    for lang in languages:
+        section = _load_entity_section(lang)
+        if not section:
+            continue  # locale file missing/unreadable or it has no entity data
+        found_any = True
+        if section.get("candidate_pattern"):
+            candidate_patterns.append(section["candidate_pattern"])
+        if section.get("multi_word_pattern"):
+            multi_word_patterns.append(section["multi_word_pattern"])
+        if section.get("direct_address_pattern"):
+            direct_address.append(section["direct_address_pattern"])
+        person_verbs.extend(section.get("person_verb_patterns", []))
+        pronouns.extend(section.get("pronoun_patterns", []))
+        dialogue.extend(section.get("dialogue_patterns", []))
+        project_verbs.extend(section.get("project_verb_patterns", []))
+        stopwords.update(w.lower() for w in section.get("stopwords", []))
+
+    if not found_any:
+        # Fallback: none of the requested languages had entity data; use English
+        section = _load_entity_section("en")
+        if section.get("candidate_pattern"):
+            candidate_patterns.append(section["candidate_pattern"])
+        if section.get("multi_word_pattern"):
+            multi_word_patterns.append(section["multi_word_pattern"])
+        if section.get("direct_address_pattern"):
+            direct_address.append(section["direct_address_pattern"])
+        person_verbs.extend(section.get("person_verb_patterns", []))
+        pronouns.extend(section.get("pronoun_patterns", []))
+        dialogue.extend(section.get("dialogue_patterns", []))
+        project_verbs.extend(section.get("project_verb_patterns", []))
+        stopwords.update(w.lower() for w in section.get("stopwords", []))
+
+    merged = {
+        "candidate_patterns": candidate_patterns,
+        "multi_word_patterns": multi_word_patterns,
+        "person_verb_patterns": _dedupe(person_verbs),
+        "pronoun_patterns": _dedupe(pronouns),
+        "dialogue_patterns": _dedupe(dialogue),
+        "direct_address_patterns": direct_address,
+        "project_verb_patterns": _dedupe(project_verbs),
+        "stopwords": sorted(stopwords),
+    }
+    _entity_cache[key] = merged
+    return merged
+
+
+def _dedupe(items: list) -> list:
+    """Return a new list of ``items`` deduplicated in first-seen order (items must be hashable)."""
+    seen = set()
+    out = []
+    for item in items:
+        if item not in seen:
+            seen.add(item)
+            out.append(item)
+    return out
+
+
 # Auto-load English on import
 load_lang("en")
@@ -40,5 +40,107 @@
    "stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false",
    "quote_pattern": "\"([^\"]{20,200})\"",
    "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
+  },
+  "entity": {
+    "candidate_pattern": "[A-Z][a-z]{1,19}",
+    "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
+    "person_verb_patterns": [
+      "\\b{name}\\s+said\\b",
+      "\\b{name}\\s+asked\\b",
+      "\\b{name}\\s+told\\b",
+      "\\b{name}\\s+replied\\b",
+      "\\b{name}\\s+laughed\\b",
+      "\\b{name}\\s+smiled\\b",
+      "\\b{name}\\s+cried\\b",
+      "\\b{name}\\s+felt\\b",
+      "\\b{name}\\s+thinks?\\b",
+      "\\b{name}\\s+wants?\\b",
+      "\\b{name}\\s+loves?\\b",
+      "\\b{name}\\s+hates?\\b",
+      "\\b{name}\\s+knows?\\b",
+      "\\b{name}\\s+decided\\b",
+      "\\b{name}\\s+pushed\\b",
+      "\\b{name}\\s+wrote\\b",
+      "\\bhey\\s+{name}\\b",
+      "\\bthanks?\\s+{name}\\b",
+      "\\bhi\\s+{name}\\b",
+      "\\bdear\\s+{name}\\b"
+    ],
+    "pronoun_patterns": [
+      "\\bshe\\b",
+      "\\bher\\b",
+      "\\bhers\\b",
+      "\\bhe\\b",
+      "\\bhim\\b",
+      "\\bhis\\b",
+      "\\bthey\\b",
+      "\\bthem\\b",
+      "\\btheir\\b"
+    ],
+    "dialogue_patterns": [
+      "^>\\s*{name}[:\\s]",
+      "^{name}:\\s",
+      "^\\[{name}\\]",
+      "\"{name}\\s+said"
+    ],
+    "direct_address_pattern": "\\bhey\\s+{name}\\b|\\bthanks?\\s+{name}\\b|\\bhi\\s+{name}\\b",
+    "project_verb_patterns": [
+      "\\bbuilding\\s+{name}\\b",
+      "\\bbuilt\\s+{name}\\b",
+      "\\bship(?:ping|ped)?\\s+{name}\\b",
+      "\\blaunch(?:ing|ed)?\\s+{name}\\b",
+      "\\bdeploy(?:ing|ed)?\\s+{name}\\b",
+      "\\binstall(?:ing|ed)?\\s+{name}\\b",
+      "\\bthe\\s+{name}\\s+architecture\\b",
+      "\\bthe\\s+{name}\\s+pipeline\\b",
+      "\\bthe\\s+{name}\\s+system\\b",
+      "\\bthe\\s+{name}\\s+repo\\b",
+      "\\b{name}\\s+v\\d+\\b",
+      "\\b{name}\\.py\\b",
+      "\\b{name}-core\\b",
+      "\\b{name}-local\\b",
+      "\\bimport\\s+{name}\\b",
+      "\\bpip\\s+install\\s+{name}\\b"
+    ],
+    "stopwords": [
+      "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
+      "for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
+      "be", "been", "being", "have", "has", "had", "do", "does", "did",
+      "will", "would", "could", "should", "may", "might", "must", "shall", "can",
+      "this", "that", "these", "those", "it", "its", "they", "them", "their",
+      "we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her",
+      "who", "what", "when", "where", "why", "how", "which",
+      "if", "then", "so", "not", "no", "yes", "ok", "okay",
+      "just", "very", "really", "also", "already", "still", "even", "only",
+      "here", "there", "now", "too", "up", "out", "about", "like",
+      "use", "get", "got", "make", "made", "take", "put", "come", "go", "see",
+      "know", "think", "true", "false", "none", "null", "new", "old", "all", "any", "some",
+      "return", "print", "def", "class", "import",
+      "step", "usage", "run", "check", "find", "add", "set", "list",
+      "args", "dict", "str", "int", "bool", "path", "file", "type", "name",
+      "note", "example", "option", "result", "error", "warning", "info",
+      "every", "each", "more", "less", "next", "last", "first", "second",
+      "stack", "layer", "mode", "test", "stop", "start", "copy", "move",
+      "source", "target", "output", "input", "data", "item", "key", "value",
+      "returns", "raises", "yields", "self", "cls", "kwargs",
+      "world", "well", "want", "topic", "choose", "social", "cars", "phones",
+      "healthcare", "ex", "machina", "deus", "human", "humans", "people",
+      "things", "something", "nothing", "everything", "anything", "someone",
+      "everyone", "anyone", "way", "time", "day", "life", "place", "thing",
+      "part", "kind", "sort", "case", "point", "idea", "fact", "sense",
+      "question", "answer", "reason", "number", "version", "system",
+      "hey", "hi", "hello", "thanks", "thank", "right", "let",
+      "click", "hit", "press", "tap", "drag", "drop", "open", "close",
+      "save", "load", "launch", "install", "download", "upload", "scroll",
+      "select", "enter", "submit", "cancel", "confirm", "delete", "paste",
+      "write", "read", "search", "show", "hide",
+      "desktop", "documents", "downloads", "users", "home", "library",
+      "applications", "preferences", "settings", "terminal",
+      "actor", "vector", "remote", "control", "duration", "fetch",
+      "agents", "tools", "others", "guards", "ethics", "regulation",
+      "learning", "thinking", "memory", "language", "intelligence",
+      "technology", "society", "culture", "future", "history", "science",
+      "model", "models", "network", "networks", "training", "inference"
+    ]
  }
 }