refactor(entity_detector): make multi-language extensible via i18n JSON

Move all entity-detection lexical patterns (person verbs, pronouns, dialogue markers, project verbs, stopwords, candidate character class) out of hardcoded module-level constants and into the entity section of each locale's JSON in mempalace/i18n/. Adds a languages parameter to every public function so callers union patterns across the desired locales. The default stays ("en",), so all existing callers and tests behave unchanged. Also adds: - get_entity_patterns(langs) helper in mempalace/i18n/ that merges patterns across requested languages, dedupes lists, unions stopwords, and falls back to English for unknown locales - MempalaceConfig.entity_languages property + setter, with env var override (MEMPALACE_ENTITY_LANGUAGES, comma-separated) - mempalace init --lang en,pt-br flag (persists to config.json) - Per-language candidate_pattern so non-Latin scripts (Cyrillic, Devanagari, CJK) can register their own character classes instead of being silently dropped by the ASCII-only [A-Z][a-z]+ default - _build_patterns LRU cache keyed by (name, languages) so multi-language callers don't poison each other's cache slots Why now: the open language PRs (#760 ru, #773 hi, #778 id, #907 it) only add CLI strings via mempalace/i18n/. PR #156 (pt-br) is the first that needed entity_detector changes and inlined a _PTBR variant of every constant. That doesn't scale past 2-3 languages — every text gets checked against every language's patterns regardless of relevance, and candidate extraction still drops accented and non-Latin names. This PR sets the standard so future locale contributors only edit one JSON file (no Python changes), and entity detection scales linearly with how many languages a user actually enabled, not how many ship.
2026-04-15 08:52:42 -03:00
parent 56b6a6360f
commit b214aced90
7 changed files with 641 additions and 421 deletions
@@ -9,9 +9,21 @@ Two-pass approach:
 Used by mempalace init before mining begins.
 The confirmed entity map feeds the miner as the taxonomy.

+Multi-language support:
+    All lexical patterns (person verbs, pronouns, dialogue markers, project
+    verbs, stopwords, and the candidate-extraction character class) live in
+    the ``entity`` section of ``mempalace/i18n/<lang>.json``. Every public
+    function accepts a ``languages`` tuple and applies the union of the
+    requested locales' patterns. The default is ``("en",)`` — existing
+    English-only callers behave exactly as before.
+
+    To add a new language: add an ``entity`` section to that locale's JSON.
+    No code changes required.
+
 Usage:
-    from entity_detector import detect_entities, confirm_entities
-    candidates = detect_entities(file_paths)
+    from mempalace.entity_detector import detect_entities, confirm_entities
+    candidates = detect_entities(file_paths)                    # English only
+    candidates = detect_entities(paths, languages=("en", "pt-br"))
    confirmed = confirm_entities(candidates)  # interactive review
 """

@@ -21,382 +33,46 @@ import functools
 from pathlib import Path
 from collections import defaultdict

+from mempalace.i18n import get_entity_patterns

-# ==================== SIGNAL PATTERNS ====================

-# Person signals — things people do
-PERSON_VERB_PATTERNS = [
-    r"\b{name}\s+said\b",
-    r"\b{name}\s+asked\b",
-    r"\b{name}\s+told\b",
-    r"\b{name}\s+replied\b",
-    r"\b{name}\s+laughed\b",
-    r"\b{name}\s+smiled\b",
-    r"\b{name}\s+cried\b",
-    r"\b{name}\s+felt\b",
-    r"\b{name}\s+thinks?\b",
-    r"\b{name}\s+wants?\b",
-    r"\b{name}\s+loves?\b",
-    r"\b{name}\s+hates?\b",
-    r"\b{name}\s+knows?\b",
-    r"\b{name}\s+decided\b",
-    r"\b{name}\s+pushed\b",
-    r"\b{name}\s+wrote\b",
-    r"\bhey\s+{name}\b",
-    r"\bthanks?\s+{name}\b",
-    r"\bhi\s+{name}\b",
-    r"\bdear\s+{name}\b",
-]
+# ==================== LANGUAGE-AWARE PATTERN LOADING ====================

-# Person signals — pronouns resolving nearby
-PRONOUN_PATTERNS = [
-    r"\bshe\b",
-    r"\bher\b",
-    r"\bhers\b",
-    r"\bhe\b",
-    r"\bhim\b",
-    r"\bhis\b",
-    r"\bthey\b",
-    r"\bthem\b",
-    r"\btheir\b",
-]

-PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE)
+def _normalize_langs(languages) -> tuple:
+    """Coerce a language input into a non-empty hashable tuple."""
+    if not languages:
+        return ("en",)
+    if isinstance(languages, str):
+        return (languages,)
+    return tuple(languages)

-# Person signals — dialogue markers
-DIALOGUE_PATTERNS = [
-    r"^>\s*{name}[:\s]",  # > Speaker: ...
-    r"^{name}:\s",  # Speaker: ...
-    r"^\[{name}\]",  # [Speaker]
-    r'"{name}\s+said',
-]

-# Project signals — things projects have/do
-PROJECT_VERB_PATTERNS = [
-    r"\bbuilding\s+{name}\b",
-    r"\bbuilt\s+{name}\b",
-    r"\bship(?:ping|ped)?\s+{name}\b",
-    r"\blaunch(?:ing|ed)?\s+{name}\b",
-    r"\bdeploy(?:ing|ed)?\s+{name}\b",
-    r"\binstall(?:ing|ed)?\s+{name}\b",
-    r"\bthe\s+{name}\s+architecture\b",
-    r"\bthe\s+{name}\s+pipeline\b",
-    r"\bthe\s+{name}\s+system\b",
-    r"\bthe\s+{name}\s+repo\b",
-    r"\b{name}\s+v\d+\b",  # MemPal v2
-    r"\b{name}\.py\b",  # mempalace.py
-    r"\b{name}-core\b",  # mempal-core (hyphen only, not underscore)
-    r"\b{name}-local\b",
-    r"\bimport\s+{name}\b",
-    r"\bpip\s+install\s+{name}\b",
-]
+@functools.lru_cache(maxsize=32)
+def _get_stopwords(languages: tuple) -> frozenset:
+    """Return the union of stopwords across the given languages."""
+    patterns = get_entity_patterns(languages)
+    return frozenset(patterns["stopwords"])

-# Words that are almost certainly NOT entities
-STOPWORDS = {
-    "the",
-    "a",
-    "an",
-    "and",
-    "or",
-    "but",
-    "in",
-    "on",
-    "at",
-    "to",
-    "for",
-    "of",
-    "with",
-    "by",
-    "from",
-    "as",
-    "is",
-    "was",
-    "are",
-    "were",
-    "be",
-    "been",
-    "being",
-    "have",
-    "has",
-    "had",
-    "do",
-    "does",
-    "did",
-    "will",
-    "would",
-    "could",
-    "should",
-    "may",
-    "might",
-    "must",
-    "shall",
-    "can",
-    "this",
-    "that",
-    "these",
-    "those",
-    "it",
-    "its",
-    "they",
-    "them",
-    "their",
-    "we",
-    "our",
-    "you",
-    "your",
-    "i",
-    "my",
-    "me",
-    "he",
-    "she",
-    "his",
-    "her",
-    "who",
-    "what",
-    "when",
-    "where",
-    "why",
-    "how",
-    "which",
-    "if",
-    "then",
-    "so",
-    "not",
-    "no",
-    "yes",
-    "ok",
-    "okay",
-    "just",
-    "very",
-    "really",
-    "also",
-    "already",
-    "still",
-    "even",
-    "only",
-    "here",
-    "there",
-    "now",
-    "then",
-    "too",
-    "up",
-    "out",
-    "about",
-    "like",
-    "use",
-    "get",
-    "got",
-    "make",
-    "made",
-    "take",
-    "put",
-    "come",
-    "go",
-    "see",
-    "know",
-    "think",
-    "true",
-    "false",
-    "none",
-    "null",
-    "new",
-    "old",
-    "all",
-    "any",
-    "some",
-    "true",
-    "false",
-    "return",
-    "print",
-    "def",
-    "class",
-    "import",
-    "from",
-    # Common capitalized words in prose that aren't entities
-    "step",
-    "usage",
-    "run",
-    "check",
-    "find",
-    "add",
-    "get",
-    "set",
-    "list",
-    "args",
-    "dict",
-    "str",
-    "int",
-    "bool",
-    "path",
-    "file",
-    "type",
-    "name",
-    "note",
-    "example",
-    "option",
-    "result",
-    "error",
-    "warning",
-    "info",
-    "every",
-    "each",
-    "more",
-    "less",
-    "next",
-    "last",
-    "first",
-    "second",
-    "stack",
-    "layer",
-    "mode",
-    "test",
-    "stop",
-    "start",
-    "copy",
-    "move",
-    "source",
-    "target",
-    "output",
-    "input",
-    "data",
-    "item",
-    "key",
-    "value",
-    "returns",
-    "raises",
-    "yields",
-    "none",
-    "self",
-    "cls",
-    "kwargs",
-    # Common sentence-starting / abstract words that aren't entities
-    "world",
-    "well",
-    "want",
-    "topic",
-    "choose",
-    "social",
-    "cars",
-    "phones",
-    "healthcare",
-    "ex",
-    "machina",
-    "deus",
-    "human",
-    "humans",
-    "people",
-    "things",
-    "something",
-    "nothing",
-    "everything",
-    "anything",
-    "someone",
-    "everyone",
-    "anyone",
-    "way",
-    "time",
-    "day",
-    "life",
-    "place",
-    "thing",
-    "part",
-    "kind",
-    "sort",
-    "case",
-    "point",
-    "idea",
-    "fact",
-    "sense",
-    "question",
-    "answer",
-    "reason",
-    "number",
-    "version",
-    "system",
-    # Greetings and filler words at sentence starts
-    "hey",
-    "hi",
-    "hello",
-    "thanks",
-    "thank",
-    "right",
-    "let",
-    "ok",
-    # UI/action words that appear in how-to content
-    "click",
-    "hit",
-    "press",
-    "tap",
-    "drag",
-    "drop",
-    "open",
-    "close",
-    "save",
-    "load",
-    "launch",
-    "install",
-    "download",
-    "upload",
-    "scroll",
-    "select",
-    "enter",
-    "submit",
-    "cancel",
-    "confirm",
-    "delete",
-    "copy",
-    "paste",
-    "type",
-    "write",
-    "read",
-    "search",
-    "find",
-    "show",
-    "hide",
-    # Common filesystem/technical capitalized words
-    "desktop",
-    "documents",
-    "downloads",
-    "users",
-    "home",
-    "library",
-    "applications",
-    "system",
-    "preferences",
-    "settings",
-    "terminal",
-    # Abstract/topic words
-    "actor",
-    "vector",
-    "remote",
-    "control",
-    "duration",
-    "fetch",
-    # Abstract concepts that appear as subjects but aren't entities
-    "agents",
-    "tools",
-    "others",
-    "guards",
-    "ethics",
-    "regulation",
-    "learning",
-    "thinking",
-    "memory",
-    "language",
-    "intelligence",
-    "technology",
-    "society",
-    "culture",
-    "future",
-    "history",
-    "science",
-    "model",
-    "models",
-    "network",
-    "networks",
-    "training",
-    "inference",
-}
+
+# ==================== BACKWARD-COMPAT MODULE CONSTANTS ====================
+#
+# These mirror the old module-level constants so existing imports keep working.
+# They reflect the English defaults and are populated at import time from
+# ``mempalace/i18n/en.json``. Callers that need multi-language behavior should
+# pass the ``languages`` parameter to the public functions below.
+
+_EN = get_entity_patterns(("en",))
+
+PERSON_VERB_PATTERNS = list(_EN["person_verb_patterns"])
+PRONOUN_PATTERNS = list(_EN["pronoun_patterns"])
+PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE) if PRONOUN_PATTERNS else None
+DIALOGUE_PATTERNS = list(_EN["dialogue_patterns"])
+PROJECT_VERB_PATTERNS = list(_EN["project_verb_patterns"])
+STOPWORDS = set(_EN["stopwords"])
+
+
+# ==================== EXTENSION POINTS (not language-scoped) ====================

 # For entity detection — prose only, no code files
 # Code files have too many capitalized names (classes, functions) that aren't entities
@@ -443,56 +119,107 @@ SKIP_DIRS = {
 # ==================== CANDIDATE EXTRACTION ====================


-def extract_candidates(text: str) -> dict:
+def extract_candidates(text: str, languages=("en",)) -> dict:
    """
    Extract all capitalized proper noun candidates from text.
    Returns {name: frequency} for names appearing 3+ times.
-    """
-    # Find all capitalized words (not at sentence start — harder, so we use frequency as filter)
-    raw = re.findall(r"\b([A-Z][a-z]{1,19})\b", text)

-    counts = defaultdict(int)
-    for word in raw:
-        if word.lower() not in STOPWORDS and len(word) > 1:
+    Each language contributes its own character-class pattern (e.g. ASCII
+    for English, Latin+diacritics for pt-br, Cyrillic for Russian,
+    Devanagari for Hindi). Matches from all languages are unioned.
+    """
+    langs = _normalize_langs(languages)
+    patterns = get_entity_patterns(langs)
+    stopwords = _get_stopwords(langs)
+
+    counts: defaultdict = defaultdict(int)
+
+    # Single-word candidates — one pattern per language
+    for raw_pat in patterns["candidate_patterns"]:
+        try:
+            rx = re.compile(rf"\b({raw_pat})\b")
+        except re.error:
+            continue
+        for word in rx.findall(text):
+            if word.lower() in stopwords:
+                continue
+            if len(word) < 2:
+                continue
            counts[word] += 1

-    # Also find multi-word proper nouns (e.g. "Memory Palace", "Claude Code")
-    multi = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text)
-    for phrase in multi:
-        if not any(w.lower() in STOPWORDS for w in phrase.split()):
+    # Multi-word candidates — one pattern per language
+    for raw_pat in patterns["multi_word_patterns"]:
+        try:
+            rx = re.compile(rf"\b({raw_pat})\b")
+        except re.error:
+            continue
+        for phrase in rx.findall(text):
+            if any(w.lower() in stopwords for w in phrase.split()):
+                continue
            counts[phrase] += 1

-    # Filter: must appear at least 3 times to be a candidate
    return {name: count for name, count in counts.items() if count >= 3}


 # ==================== SIGNAL SCORING ====================


-@functools.lru_cache(maxsize=128)
-def _build_patterns(name: str) -> dict:
-    """Pre-compile all regex patterns for a single entity name."""
+@functools.lru_cache(maxsize=256)
+def _build_patterns(name: str, languages: tuple = ("en",)) -> dict:
+    """Pre-compile all regex patterns for a single entity name, per language set."""
    n = re.escape(name)
+    langs = _normalize_langs(languages)
+    sources = get_entity_patterns(langs)
+
+    def _compile_each(raw_patterns, flags=re.IGNORECASE):
+        compiled = []
+        for p in raw_patterns:
+            try:
+                compiled.append(re.compile(p.format(name=n), flags))
+            except (re.error, KeyError, IndexError):
+                continue
+        return compiled
+
+    direct_sources = sources.get("direct_address_patterns") or []
+    direct_compiled = []
+    for raw in direct_sources:
+        try:
+            direct_compiled.append(re.compile(raw.format(name=n), re.IGNORECASE))
+        except (re.error, KeyError, IndexError):
+            continue
+
    return {
-        "dialogue": [
-            re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in DIALOGUE_PATTERNS
-        ],
-        "person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in PERSON_VERB_PATTERNS],
-        "project_verbs": [
-            re.compile(p.format(name=n), re.IGNORECASE) for p in PROJECT_VERB_PATTERNS
-        ],
-        "direct": re.compile(rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b", re.IGNORECASE),
+        "dialogue": _compile_each(sources["dialogue_patterns"], re.MULTILINE | re.IGNORECASE),
+        "person_verbs": _compile_each(sources["person_verb_patterns"]),
+        "project_verbs": _compile_each(sources["project_verb_patterns"]),
+        "direct": direct_compiled,
        "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
        "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
    }


-def score_entity(name: str, text: str, lines: list) -> dict:
+@functools.lru_cache(maxsize=32)
+def _pronoun_re(languages: tuple):
+    """Compile a combined pronoun regex for the given languages."""
+    langs = _normalize_langs(languages)
+    patterns = get_entity_patterns(langs)
+    pronouns = patterns.get("pronoun_patterns") or []
+    if not pronouns:
+        return None
+    try:
+        return re.compile("|".join(pronouns), re.IGNORECASE)
+    except re.error:
+        return None
+
+
+def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:
    """
    Score a candidate entity as person vs project.
    Returns scores and the signals that fired.
    """
-    patterns = _build_patterns(name)
+    langs = _normalize_langs(languages)
+    patterns = _build_patterns(name, langs)
+    pronoun_re = _pronoun_re(langs)
    person_score = 0
    project_score = 0
    person_signals = []
@@ -515,22 +242,25 @@ def score_entity(name: str, text: str, lines: list) -> dict:
            person_signals.append(f"'{name} ...' action ({matches}x)")

    # Pronoun proximity — pronouns within 3 lines of the name
-    name_lower = name.lower()
-    name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
-    pronoun_hits = 0
-    for idx in name_line_indices:
-        window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
-        if PRONOUN_RE.search(window_text):
-            pronoun_hits += 1
-    if pronoun_hits > 0:
-        person_score += pronoun_hits * 2
-        person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
+    if pronoun_re is not None:
+        name_lower = name.lower()
+        name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
+        pronoun_hits = 0
+        for idx in name_line_indices:
+            window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
+            if pronoun_re.search(window_text):
+                pronoun_hits += 1
+        if pronoun_hits > 0:
+            person_score += pronoun_hits * 2
+            person_signals.append(f"pronoun nearby ({pronoun_hits}x)")

    # Direct address
-    direct = len(patterns["direct"].findall(text))
-    if direct > 0:
-        person_score += direct * 4
-        person_signals.append(f"addressed directly ({direct}x)")
+    direct_hits = 0
+    for rx in patterns["direct"]:
+        direct_hits += len(rx.findall(text))
+    if direct_hits > 0:
+        person_score += direct_hits * 4
+        person_signals.append(f"addressed directly ({direct_hits}x)")

    # --- Project signals ---

@@ -631,13 +361,15 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
 # ==================== MAIN DETECT ====================


-def detect_entities(file_paths: list, max_files: int = 10) -> dict:
+def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> dict:
    """
    Scan files and detect entity candidates.

    Args:
        file_paths: List of Path objects to scan
        max_files: Max files to read (for speed)
+        languages: Tuple of language codes whose entity patterns should be
+            applied (union). Defaults to ``("en",)``.

    Returns:
        {
@@ -646,6 +378,8 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
            "uncertain":[...entity dicts...],
        }
    """
+    langs = _normalize_langs(languages)
+
    # Collect text from files
    all_text = []
    all_lines = []
@@ -668,7 +402,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
    combined_text = "\n".join(all_text)

    # Extract candidates
-    candidates = extract_candidates(combined_text)
+    candidates = extract_candidates(combined_text, languages=langs)

    if not candidates:
        return {"people": [], "projects": [], "uncertain": []}
@@ -679,7 +413,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
    uncertain = []

    for name, frequency in sorted(candidates.items(), key=lambda x: x[1], reverse=True):
-        scores = score_entity(name, combined_text, all_lines)
+        scores = score_entity(name, combined_text, all_lines, languages=langs)
        entity = classify_entity(name, frequency, scores)

        if entity["type"] == "person":
@@ -843,13 +577,14 @@ if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
-        print("Usage: python entity_detector.py <directory>")
+        print("Usage: python entity_detector.py <directory> [lang1,lang2,...]")
        sys.exit(1)

    project_dir = sys.argv[1]
-    print(f"Scanning: {project_dir}")
+    langs = tuple(sys.argv[2].split(",")) if len(sys.argv) >= 3 else ("en",)
+    print(f"Scanning: {project_dir} (languages: {', '.join(langs)})")
    files = scan_for_detection(project_dir)
    print(f"Reading {len(files)} files...")
-    detected = detect_entities(files)
+    detected = detect_entities(files, languages=langs)
    confirmed = confirm_entities(detected)
    print("Confirmed entities:", confirmed)