Merge pull request #911 from MemPalace/refactor/entity-detector-i18n

refactor(entity_detector): make multi-language extensible via i18n JSON
2026-04-15 09:40:36 -03:00
parent 56b6a6360f c722c91e2a
commit 3bac3654c4
7 changed files with 646 additions and 421 deletions
@@ -73,12 +73,25 @@ def cmd_init(args):
    from .entity_detector import scan_for_detection, detect_entities, confirm_entities
    from .room_detector_local import detect_rooms_local
    cfg = MempalaceConfig()
    # Resolve entity-detection languages: --lang overrides config.
    lang_arg = getattr(args, "lang", None)
    if lang_arg:
        languages = [s.strip() for s in lang_arg.split(",") if s.strip()] or ["en"]
        cfg.set_entity_languages(languages)
    else:
        languages = cfg.entity_languages
    languages_tuple = tuple(languages)
    # Pass 1: auto-detect people and projects from file content
    print(f"\n  Scanning for entities in: {args.dir}")
    if languages_tuple != ("en",):
        print(f"  Languages: {', '.join(languages_tuple)}")
    files = scan_for_detection(args.dir)
    if files:
        print(f"  Reading {len(files)} files...")
-        detected = detect_entities(files)
+        detected = detect_entities(files, languages=languages_tuple)
        total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
        if total > 0:
            confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
@@ -93,7 +106,7 @@ def cmd_init(args):
    # Pass 2: detect rooms from folder structure
    detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False))
-    MempalaceConfig().init()
+    cfg.init()
    # Pass 3: protect git repos from accidentally committing per-project files
    _ensure_mempalace_files_gitignored(args.dir)
@@ -478,6 +491,16 @@ def main():
        action="store_true",
        help="Auto-accept all detected entities (non-interactive)",
    )
    p_init.add_argument(
        "--lang",
        default=None,
        help=(
            "Comma-separated language codes for entity detection "
            "(e.g. 'en' or 'en,pt-br'). Defaults to value from config "
            "(MEMPALACE_ENTITY_LANGUAGES env var or config.json), or 'en'. "
            "When given, the value is also persisted to config.json."
        ),
    )
    # mine
    p_mine = sub.add_parser("mine", help="Mine files into the palace")
@@ -197,6 +197,42 @@ class MempalaceConfig:
        """Mapping of hall names to keyword lists."""
        return self._file_config.get("hall_keywords", DEFAULT_HALL_KEYWORDS)
    @property
    def entity_languages(self):
        """Languages whose entity-detection patterns should be applied.

        Resolution order:

        1. The ``MEMPALACE_ENTITY_LANGUAGES`` env var, or the
           ``MEMPAL_ENTITY_LANGUAGES`` spelling when the first is unset or
           empty (comma-separated codes).
        2. The ``entity_languages`` list in ``config.json``.
        3. ``["en"]``.

        Entries from both sources are whitespace-stripped and blank entries
        are dropped, so a hand-edited ``config.json`` is treated the same way
        as the env var.

        Returns:
            A new, non-empty list of language-code strings.
        """
        env_val = os.environ.get("MEMPALACE_ENTITY_LANGUAGES") or os.environ.get(
            "MEMPAL_ENTITY_LANGUAGES"
        )
        if env_val:
            return [code.strip() for code in env_val.split(",") if code.strip()] or ["en"]

        configured = self._file_config.get("entity_languages")
        if isinstance(configured, list):
            # Normalize exactly like the env-var path: a padded or empty code
            # would otherwise be handed on verbatim to the pattern loader.
            cleaned = [str(code).strip() for code in configured]
            cleaned = [code for code in cleaned if code]
            if cleaned:
                return cleaned
        return ["en"]
    def set_entity_languages(self, languages):
        """Persist the entity-detection language list to ``config.json``.

        Args:
            languages: Iterable of language-code strings. A bare string is
                treated as one code instead of being iterated character by
                character; ``None`` or an empty iterable means English.

        Returns:
            The normalized list that was stored in the in-memory config:
            whitespace stripped, blank entries dropped, ``["en"]`` when
            nothing usable remains.
        """
        if isinstance(languages, str):
            languages = [languages]
        normalized = [s.strip() for s in (languages or ()) if s and s.strip()]
        if not normalized:
            normalized = ["en"]
        self._file_config["entity_languages"] = normalized
        self._config_dir.mkdir(parents=True, exist_ok=True)
        # Writing the file is best-effort: the in-memory value above is
        # already updated, so a write failure must not abort the caller.
        try:
            with open(self._config_file, "w", encoding="utf-8") as f:
                json.dump(self._file_config, f, indent=2, ensure_ascii=False)
        except OSError:
            pass
        # Restrict the file to owner read/write; ignored where the platform
        # or filesystem does not support it (or the write above failed).
        try:
            self._config_file.chmod(0o600)
        except (OSError, NotImplementedError):
            pass
        return normalized
    @property
    def hook_silent_save(self):
        """Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
@@ -9,9 +9,21 @@ Two-pass approach:
 Used by mempalace init before mining begins.
 The confirmed entity map feeds the miner as the taxonomy.
 Multi-language support:
    All lexical patterns (person verbs, pronouns, dialogue markers, project
    verbs, stopwords, and the candidate-extraction character class) live in
    the ``entity`` section of ``mempalace/i18n/<lang>.json``. Every public
    function accepts a ``languages`` tuple and applies the union of the
    requested locales' patterns. The default is ``("en",)`` — existing
    English-only callers behave exactly as before.
    To add a new language: add an ``entity`` section to that locale's JSON.
    No code changes required.
 Usage:
-    from entity_detector import detect_entities, confirm_entities
+    from mempalace.entity_detector import detect_entities, confirm_entities
-    candidates = detect_entities(file_paths)
+    candidates = detect_entities(file_paths)                    # English only
    candidates = detect_entities(paths, languages=("en", "pt-br"))
    confirmed = confirm_entities(candidates)  # interactive review
 """
@@ -21,382 +33,46 @@ import functools
 from pathlib import Path
 from collections import defaultdict
 from mempalace.i18n import get_entity_patterns
 # ==================== SIGNAL PATTERNS ====================
-# Person signals — things people do
+# ==================== LANGUAGE-AWARE PATTERN LOADING ====================
 PERSON_VERB_PATTERNS = [
    r"\b{name}\s+said\b",
    r"\b{name}\s+asked\b",
    r"\b{name}\s+told\b",
    r"\b{name}\s+replied\b",
    r"\b{name}\s+laughed\b",
    r"\b{name}\s+smiled\b",
    r"\b{name}\s+cried\b",
    r"\b{name}\s+felt\b",
    r"\b{name}\s+thinks?\b",
    r"\b{name}\s+wants?\b",
    r"\b{name}\s+loves?\b",
    r"\b{name}\s+hates?\b",
    r"\b{name}\s+knows?\b",
    r"\b{name}\s+decided\b",
    r"\b{name}\s+pushed\b",
    r"\b{name}\s+wrote\b",
    r"\bhey\s+{name}\b",
    r"\bthanks?\s+{name}\b",
    r"\bhi\s+{name}\b",
    r"\bdear\s+{name}\b",
 ]
 # Person signals — pronouns resolving nearby
 PRONOUN_PATTERNS = [
    r"\bshe\b",
    r"\bher\b",
    r"\bhers\b",
    r"\bhe\b",
    r"\bhim\b",
    r"\bhis\b",
    r"\bthey\b",
    r"\bthem\b",
    r"\btheir\b",
 ]
-PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE)
def _normalize_langs(languages) -> tuple:
    """Coerce a language input into a canonical, non-empty, hashable tuple.

    The result is used as a cache key by the pattern helpers in this module,
    so equivalent inputs should map to the same tuple.

    Args:
        languages: ``None``, a single language code as a string, or an
            iterable of codes.

    Returns:
        A tuple of codes with surrounding whitespace stripped, blank entries
        dropped and repeats removed (first occurrence wins). Falls back to
        ``("en",)`` when nothing usable remains.
    """
    if not languages:
        return ("en",)
    if isinstance(languages, str):
        languages = (languages,)
    cleaned = []
    for code in languages:
        if isinstance(code, str):
            code = code.strip()
        if code:
            cleaned.append(code)
    # dict.fromkeys keeps first-seen order; a repeated locale would otherwise
    # contribute its candidate pattern twice and inflate match counts.
    return tuple(dict.fromkeys(cleaned)) or ("en",)
 # Person signals — dialogue markers
 DIALOGUE_PATTERNS = [
    r"^>\s*{name}[:\s]",  # > Speaker: ...
    r"^{name}:\s",  # Speaker: ...
    r"^\[{name}\]",  # [Speaker]
    r'"{name}\s+said',
 ]
-# Project signals — things projects have/do
+@functools.lru_cache(maxsize=32)
-PROJECT_VERB_PATTERNS = [
+def _get_stopwords(languages: tuple) -> frozenset:
-    r"\bbuilding\s+{name}\b",
+    """Return the union of stopwords across the given languages."""
-    r"\bbuilt\s+{name}\b",
+    patterns = get_entity_patterns(languages)
-    r"\bship(?:ping|ped)?\s+{name}\b",
+    return frozenset(patterns["stopwords"])
    r"\blaunch(?:ing|ed)?\s+{name}\b",
    r"\bdeploy(?:ing|ed)?\s+{name}\b",
    r"\binstall(?:ing|ed)?\s+{name}\b",
    r"\bthe\s+{name}\s+architecture\b",
    r"\bthe\s+{name}\s+pipeline\b",
    r"\bthe\s+{name}\s+system\b",
    r"\bthe\s+{name}\s+repo\b",
    r"\b{name}\s+v\d+\b",  # MemPal v2
    r"\b{name}\.py\b",  # mempalace.py
    r"\b{name}-core\b",  # mempal-core (hyphen only, not underscore)
    r"\b{name}-local\b",
    r"\bimport\s+{name}\b",
    r"\bpip\s+install\s+{name}\b",
 ]
-# Words that are almost certainly NOT entities
+
-STOPWORDS = {
+# ==================== BACKWARD-COMPAT MODULE CONSTANTS ====================
-    "the",
+#
-    "a",
+# These mirror the old module-level constants so existing imports keep working.
-    "an",
+# They reflect the English defaults and are populated at import time from
-    "and",
+# ``mempalace/i18n/en.json``. Callers that need multi-language behavior should
-    "or",
+# pass the ``languages`` parameter to the public functions below.
-    "but",
+
-    "in",
+_EN = get_entity_patterns(("en",))
-    "on",
+
-    "at",
+PERSON_VERB_PATTERNS = list(_EN["person_verb_patterns"])
-    "to",
+PRONOUN_PATTERNS = list(_EN["pronoun_patterns"])
-    "for",
+PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE) if PRONOUN_PATTERNS else None
-    "of",
+DIALOGUE_PATTERNS = list(_EN["dialogue_patterns"])
-    "with",
+PROJECT_VERB_PATTERNS = list(_EN["project_verb_patterns"])
-    "by",
+STOPWORDS = set(_EN["stopwords"])
-    "from",
+
-    "as",
+
-    "is",
+# ==================== EXTENSION POINTS (not language-scoped) ====================
    "was",
    "are",
    "were",
    "be",
    "been",
    "being",
    "have",
    "has",
    "had",
    "do",
    "does",
    "did",
    "will",
    "would",
    "could",
    "should",
    "may",
    "might",
    "must",
    "shall",
    "can",
    "this",
    "that",
    "these",
    "those",
    "it",
    "its",
    "they",
    "them",
    "their",
    "we",
    "our",
    "you",
    "your",
    "i",
    "my",
    "me",
    "he",
    "she",
    "his",
    "her",
    "who",
    "what",
    "when",
    "where",
    "why",
    "how",
    "which",
    "if",
    "then",
    "so",
    "not",
    "no",
    "yes",
    "ok",
    "okay",
    "just",
    "very",
    "really",
    "also",
    "already",
    "still",
    "even",
    "only",
    "here",
    "there",
    "now",
    "then",
    "too",
    "up",
    "out",
    "about",
    "like",
    "use",
    "get",
    "got",
    "make",
    "made",
    "take",
    "put",
    "come",
    "go",
    "see",
    "know",
    "think",
    "true",
    "false",
    "none",
    "null",
    "new",
    "old",
    "all",
    "any",
    "some",
    "true",
    "false",
    "return",
    "print",
    "def",
    "class",
    "import",
    "from",
    # Common capitalized words in prose that aren't entities
    "step",
    "usage",
    "run",
    "check",
    "find",
    "add",
    "get",
    "set",
    "list",
    "args",
    "dict",
    "str",
    "int",
    "bool",
    "path",
    "file",
    "type",
    "name",
    "note",
    "example",
    "option",
    "result",
    "error",
    "warning",
    "info",
    "every",
    "each",
    "more",
    "less",
    "next",
    "last",
    "first",
    "second",
    "stack",
    "layer",
    "mode",
    "test",
    "stop",
    "start",
    "copy",
    "move",
    "source",
    "target",
    "output",
    "input",
    "data",
    "item",
    "key",
    "value",
    "returns",
    "raises",
    "yields",
    "none",
    "self",
    "cls",
    "kwargs",
    # Common sentence-starting / abstract words that aren't entities
    "world",
    "well",
    "want",
    "topic",
    "choose",
    "social",
    "cars",
    "phones",
    "healthcare",
    "ex",
    "machina",
    "deus",
    "human",
    "humans",
    "people",
    "things",
    "something",
    "nothing",
    "everything",
    "anything",
    "someone",
    "everyone",
    "anyone",
    "way",
    "time",
    "day",
    "life",
    "place",
    "thing",
    "part",
    "kind",
    "sort",
    "case",
    "point",
    "idea",
    "fact",
    "sense",
    "question",
    "answer",
    "reason",
    "number",
    "version",
    "system",
    # Greetings and filler words at sentence starts
    "hey",
    "hi",
    "hello",
    "thanks",
    "thank",
    "right",
    "let",
    "ok",
    # UI/action words that appear in how-to content
    "click",
    "hit",
    "press",
    "tap",
    "drag",
    "drop",
    "open",
    "close",
    "save",
    "load",
    "launch",
    "install",
    "download",
    "upload",
    "scroll",
    "select",
    "enter",
    "submit",
    "cancel",
    "confirm",
    "delete",
    "copy",
    "paste",
    "type",
    "write",
    "read",
    "search",
    "find",
    "show",
    "hide",
    # Common filesystem/technical capitalized words
    "desktop",
    "documents",
    "downloads",
    "users",
    "home",
    "library",
    "applications",
    "system",
    "preferences",
    "settings",
    "terminal",
    # Abstract/topic words
    "actor",
    "vector",
    "remote",
    "control",
    "duration",
    "fetch",
    # Abstract concepts that appear as subjects but aren't entities
    "agents",
    "tools",
    "others",
    "guards",
    "ethics",
    "regulation",
    "learning",
    "thinking",
    "memory",
    "language",
    "intelligence",
    "technology",
    "society",
    "culture",
    "future",
    "history",
    "science",
    "model",
    "models",
    "network",
    "networks",
    "training",
    "inference",
 }
 # For entity detection — prose only, no code files
 # Code files have too many capitalized names (classes, functions) that aren't entities
@@ -443,56 +119,107 @@ SKIP_DIRS = {
 # ==================== CANDIDATE EXTRACTION ====================
-def extract_candidates(text: str) -> dict:
+def extract_candidates(text: str, languages=("en",)) -> dict:
    """
    Extract all capitalized proper noun candidates from text.
    Returns {name: frequency} for names appearing 3+ times.
    """
    # Find all capitalized words (not at sentence start — harder, so we use frequency as filter)
    raw = re.findall(r"\b([A-Z][a-z]{1,19})\b", text)
-    counts = defaultdict(int)
+    Each language contributes its own character-class pattern (e.g. ASCII
-    for word in raw:
+    for English, Latin+diacritics for pt-br, Cyrillic for Russian,
-        if word.lower() not in STOPWORDS and len(word) > 1:
+    Devanagari for Hindi). Matches from all languages are unioned.
    """
    langs = _normalize_langs(languages)
    patterns = get_entity_patterns(langs)
    stopwords = _get_stopwords(langs)
    counts: defaultdict = defaultdict(int)
    # Single-word candidates — one pattern per language
    for raw_pat in patterns["candidate_patterns"]:
        try:
            rx = re.compile(rf"\b({raw_pat})\b")
        except re.error:
            continue
        for word in rx.findall(text):
            if word.lower() in stopwords:
                continue
            if len(word) < 2:
                continue
            counts[word] += 1
-    # Also find multi-word proper nouns (e.g. "Memory Palace", "Claude Code")
+    # Multi-word candidates — one pattern per language
-    multi = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text)
+    for raw_pat in patterns["multi_word_patterns"]:
-    for phrase in multi:
+        try:
-        if not any(w.lower() in STOPWORDS for w in phrase.split()):
+            rx = re.compile(rf"\b({raw_pat})\b")
        except re.error:
            continue
        for phrase in rx.findall(text):
            if any(w.lower() in stopwords for w in phrase.split()):
                continue
            counts[phrase] += 1
    # Filter: must appear at least 3 times to be a candidate
    return {name: count for name, count in counts.items() if count >= 3}
 # ==================== SIGNAL SCORING ====================
-@functools.lru_cache(maxsize=128)
+@functools.lru_cache(maxsize=256)
-def _build_patterns(name: str) -> dict:
+def _build_patterns(name: str, languages: tuple = ("en",)) -> dict:
-    """Pre-compile all regex patterns for a single entity name."""
+    """Pre-compile all regex patterns for a single entity name, per language set."""
    n = re.escape(name)
    langs = _normalize_langs(languages)
    sources = get_entity_patterns(langs)
    def _compile_each(raw_patterns, flags=re.IGNORECASE):
        compiled = []
        for p in raw_patterns:
            try:
                compiled.append(re.compile(p.format(name=n), flags))
            except (re.error, KeyError, IndexError):
                continue
        return compiled
    direct_sources = sources.get("direct_address_patterns") or []
    direct_compiled = []
    for raw in direct_sources:
        try:
            direct_compiled.append(re.compile(raw.format(name=n), re.IGNORECASE))
        except (re.error, KeyError, IndexError):
            continue
    return {
-        "dialogue": [
+        "dialogue": _compile_each(sources["dialogue_patterns"], re.MULTILINE | re.IGNORECASE),
-            re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in DIALOGUE_PATTERNS
+        "person_verbs": _compile_each(sources["person_verb_patterns"]),
-        ],
+        "project_verbs": _compile_each(sources["project_verb_patterns"]),
-        "person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in PERSON_VERB_PATTERNS],
+        "direct": direct_compiled,
        "project_verbs": [
            re.compile(p.format(name=n), re.IGNORECASE) for p in PROJECT_VERB_PATTERNS
        ],
        "direct": re.compile(rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b", re.IGNORECASE),
        "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
        "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
    }
-def score_entity(name: str, text: str, lines: list) -> dict:
@functools.lru_cache(maxsize=32)
def _pronoun_re(languages: tuple):
    """Compile a combined pronoun regex for the given languages.

    Each pronoun pattern is validated on its own first, so one malformed
    entry in a locale file is skipped instead of disabling the pronoun
    signal for every requested language.

    Args:
        languages: Tuple of language codes (must be hashable for the cache).

    Returns:
        A case-insensitive compiled regex alternating over all usable
        pronoun patterns, or ``None`` when there are none (or the combined
        expression still fails to compile).
    """
    langs = _normalize_langs(languages)
    pronouns = get_entity_patterns(langs).get("pronoun_patterns") or []
    usable = []
    for raw in pronouns:
        if not isinstance(raw, str):
            continue
        try:
            re.compile(raw, re.IGNORECASE)
        except re.error:
            continue
        usable.append(raw)
    if not usable:
        return None
    try:
        return re.compile("|".join(usable), re.IGNORECASE)
    except re.error:
        # Individually valid patterns can still be invalid once joined.
        return None
 def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:
    """
    Score a candidate entity as person vs project.
    Returns scores and the signals that fired.
    """
-    patterns = _build_patterns(name)
+    langs = _normalize_langs(languages)
    patterns = _build_patterns(name, langs)
    pronoun_re = _pronoun_re(langs)
    person_score = 0
    project_score = 0
    person_signals = []
@@ -515,22 +242,25 @@ def score_entity(name: str, text: str, lines: list) -> dict:
            person_signals.append(f"'{name} ...' action ({matches}x)")
    # Pronoun proximity — pronouns within 3 lines of the name
-    name_lower = name.lower()
+    if pronoun_re is not None:
-    name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
+        name_lower = name.lower()
-    pronoun_hits = 0
+        name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
-    for idx in name_line_indices:
+        pronoun_hits = 0
-        window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
+        for idx in name_line_indices:
-        if PRONOUN_RE.search(window_text):
+            window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
-            pronoun_hits += 1
+            if pronoun_re.search(window_text):
-    if pronoun_hits > 0:
+                pronoun_hits += 1
-        person_score += pronoun_hits * 2
+        if pronoun_hits > 0:
-        person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
+            person_score += pronoun_hits * 2
            person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
    # Direct address
-    direct = len(patterns["direct"].findall(text))
+    direct_hits = 0
-    if direct > 0:
+    for rx in patterns["direct"]:
-        person_score += direct * 4
+        direct_hits += len(rx.findall(text))
-        person_signals.append(f"addressed directly ({direct}x)")
+    if direct_hits > 0:
        person_score += direct_hits * 4
        person_signals.append(f"addressed directly ({direct_hits}x)")
    # --- Project signals ---
@@ -631,13 +361,15 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
 # ==================== MAIN DETECT ====================
-def detect_entities(file_paths: list, max_files: int = 10) -> dict:
+def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> dict:
    """
    Scan files and detect entity candidates.
    Args:
        file_paths: List of Path objects to scan
        max_files: Max files to read (for speed)
        languages: Tuple of language codes whose entity patterns should be
            applied (union). Defaults to ``("en",)``.
    Returns:
        {
@@ -646,6 +378,8 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
            "uncertain":[...entity dicts...],
        }
    """
    langs = _normalize_langs(languages)
    # Collect text from files
    all_text = []
    all_lines = []
@@ -668,7 +402,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
    combined_text = "\n".join(all_text)
    # Extract candidates
-    candidates = extract_candidates(combined_text)
+    candidates = extract_candidates(combined_text, languages=langs)
    if not candidates:
        return {"people": [], "projects": [], "uncertain": []}
@@ -679,7 +413,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
    uncertain = []
    for name, frequency in sorted(candidates.items(), key=lambda x: x[1], reverse=True):
-        scores = score_entity(name, combined_text, all_lines)
+        scores = score_entity(name, combined_text, all_lines, languages=langs)
        entity = classify_entity(name, frequency, scores)
        if entity["type"] == "person":
@@ -843,13 +577,14 @@ if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
-        print("Usage: python entity_detector.py <directory>")
+        print("Usage: python entity_detector.py <directory> [lang1,lang2,...]")
        sys.exit(1)
    project_dir = sys.argv[1]
-    print(f"Scanning: {project_dir}")
+    langs = tuple(sys.argv[2].split(",")) if len(sys.argv) >= 3 else ("en",)
    print(f"Scanning: {project_dir} (languages: {', '.join(langs)})")
    files = scan_for_detection(project_dir)
    print(f"Reading {len(files)} files...")
-    detected = detect_entities(files)
+    detected = detect_entities(files, languages=langs)
    confirmed = confirm_entities(detected)
    print("Confirmed entities:", confirmed)
@@ -583,15 +583,19 @@ class EntityRegistry:
    # ── Learn from sessions ──────────────────────────────────────────────────
-    def learn_from_text(self, text: str, min_confidence: float = 0.75) -> list:
+    def learn_from_text(self, text: str, min_confidence: float = 0.75, languages=("en",)) -> list:
        """
        Scan session text for new entity candidates.
        Returns list of newly discovered candidates for review.
        ``languages`` is forwarded to entity detection — pass the user's
        configured ``MempalaceConfig().entity_languages`` to match the
        locales used at ``mempalace init`` time.
        """
        from mempalace.entity_detector import extract_candidates, score_entity, classify_entity
        lines = text.splitlines()
-        candidates = extract_candidates(text)
+        candidates = extract_candidates(text, languages=languages)
        new_candidates = []
        for name, frequency in candidates.items():
@@ -599,7 +603,7 @@ class EntityRegistry:
            if name in self.people or name in self.projects:
                continue
-            scores = score_entity(name, text, lines)
+            scores = score_entity(name, text, lines, languages=languages)
            entity = classify_entity(name, frequency, scores)
            if entity["type"] == "person" and entity["confidence"] >= min_confidence:
@@ -7,6 +7,10 @@ Usage:
    print(t("cli.mine_start", path="/docs"))  # "Extraction de /docs..."
    print(t("terms.wing"))    # "aile"
    print(t("aaak.instruction"))  # AAAK compression instruction in French
 Each locale JSON may include an ``entity`` section with patterns used by
 ``mempalace.entity_detector``. See ``get_entity_patterns`` for the merge rules
 and the README section "Adding a new language" for the schema.
 """
 import json
@@ -16,6 +20,9 @@ _LANG_DIR = Path(__file__).parent
 _strings: dict = {}
 _current_lang: str = "en"
 # Cache: tuple(langs) -> merged entity pattern dict
 _entity_cache: dict = {}
 def available_languages() -> list[str]:
    """Return list of available language codes."""
@@ -72,5 +79,112 @@ def get_regex() -> dict:
    return _strings.get("regex", {})
 def _load_entity_section(lang: str) -> dict:
    """Load the raw ``entity`` section for one language.

    Args:
        lang: Locale code; the file read is ``<lang>.json`` in ``_LANG_DIR``.

    Returns:
        The ``entity`` mapping from that locale file, or ``{}`` when the file
        is missing or unreadable, is not valid UTF-8 JSON, does not hold a
        JSON object at the top level, or has no object-valued ``entity`` key.
    """
    lang_file = _LANG_DIR / f"{lang}.json"
    try:
        data = json.loads(lang_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    if not isinstance(data, dict):
        return {}
    section = data.get("entity")
    # Callers use .get() on the result, so anything but a dict is unusable.
    return section if isinstance(section, dict) else {}
 def get_entity_patterns(languages=("en",)) -> dict:
    """Return merged entity detection patterns for the requested languages.

    Entity detection patterns live under each locale's ``entity`` section.
    This function merges them into a single dict for consumption by
    ``mempalace.entity_detector``.

    Args:
        languages: Iterable of locale codes, or a single code given as a bare
            string. ``None`` or an empty value means English.

    Returns:
        A dict with the keys below. The object is cached per language tuple
        and the same instance is handed to every caller, so treat it as
        read-only.

        - ``person_verb_patterns``, ``pronoun_patterns``,
          ``dialogue_patterns``, ``project_verb_patterns``: concatenated in
          the order of ``languages``, duplicates removed, first occurrence
          kept.
        - ``stopwords``: lowercased set union across all languages, returned
          as a sorted list.
        - ``candidate_patterns``, ``multi_word_patterns``,
          ``direct_address_patterns``: one entry per locale, taken from that
          locale's singular ``candidate_pattern`` / ``multi_word_pattern`` /
          ``direct_address_pattern`` field. They are kept separate rather
          than joined because callers run each pattern independently.
          Identical entries are collapsed so a repeated locale, or two
          locales sharing the same pattern, is not applied twice.

    If no requested language declares entity data, English is used as a
    fallback so callers always get a working config.
    """
    if isinstance(languages, str):
        # A bare code such as "pt-br" would otherwise be iterated per character.
        languages = (languages,)
    # Materialize once: the tuple is both the cache key and what we iterate,
    # so one-shot iterables are not exhausted before the merge loop.
    key = tuple(languages) if languages else ("en",)
    if key in _entity_cache:
        return _entity_cache[key]

    sections = [section for section in map(_load_entity_section, key) if section]
    if not sections:
        # None of the requested locales declares entity data: use English.
        sections = [_load_entity_section("en")]

    # Singular per-locale fields are gathered into plural result lists.
    singular_to_plural = {
        "candidate_pattern": "candidate_patterns",
        "multi_word_pattern": "multi_word_patterns",
        "direct_address_pattern": "direct_address_patterns",
    }
    list_fields = (
        "person_verb_patterns",
        "pronoun_patterns",
        "dialogue_patterns",
        "project_verb_patterns",
    )
    collected: dict = {name: [] for name in (*singular_to_plural.values(), *list_fields)}
    stopwords: set = set()

    for section in sections:
        for source, target in singular_to_plural.items():
            if section.get(source):
                collected[target].append(section[source])
        for name in list_fields:
            collected[name].extend(section.get(name, []))
        stopwords.update(w.lower() for w in section.get("stopwords", []))

    merged = {name: _dedupe(values) for name, values in collected.items()}
    merged["stopwords"] = sorted(stopwords)

    _entity_cache[key] = merged
    return merged
 def _dedupe(items: list) -> list:
    """Return a new list of ``items`` with repeats dropped, first position kept."""
    # Dict keys are unique and remember insertion order, which gives the
    # same result as tracking a "seen" set while appending.
    return list(dict.fromkeys(items))
 # Auto-load English on import
 load_lang("en")
@@ -40,5 +40,107 @@
    "stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false",
    "quote_pattern": "\"([^\"]{20,200})\"",
    "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
  },
  "entity": {
    "candidate_pattern": "[A-Z][a-z]{1,19}",
    "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
    "person_verb_patterns": [
      "\\b{name}\\s+said\\b",
      "\\b{name}\\s+asked\\b",
      "\\b{name}\\s+told\\b",
      "\\b{name}\\s+replied\\b",
      "\\b{name}\\s+laughed\\b",
      "\\b{name}\\s+smiled\\b",
      "\\b{name}\\s+cried\\b",
      "\\b{name}\\s+felt\\b",
      "\\b{name}\\s+thinks?\\b",
      "\\b{name}\\s+wants?\\b",
      "\\b{name}\\s+loves?\\b",
      "\\b{name}\\s+hates?\\b",
      "\\b{name}\\s+knows?\\b",
      "\\b{name}\\s+decided\\b",
      "\\b{name}\\s+pushed\\b",
      "\\b{name}\\s+wrote\\b",
      "\\bhey\\s+{name}\\b",
      "\\bthanks?\\s+{name}\\b",
      "\\bhi\\s+{name}\\b",
      "\\bdear\\s+{name}\\b"
    ],
    "pronoun_patterns": [
      "\\bshe\\b",
      "\\bher\\b",
      "\\bhers\\b",
      "\\bhe\\b",
      "\\bhim\\b",
      "\\bhis\\b",
      "\\bthey\\b",
      "\\bthem\\b",
      "\\btheir\\b"
    ],
    "dialogue_patterns": [
      "^>\\s*{name}[:\\s]",
      "^{name}:\\s",
      "^\\[{name}\\]",
      "\"{name}\\s+said"
    ],
    "direct_address_pattern": "\\bhey\\s+{name}\\b|\\bthanks?\\s+{name}\\b|\\bhi\\s+{name}\\b",
    "project_verb_patterns": [
      "\\bbuilding\\s+{name}\\b",
      "\\bbuilt\\s+{name}\\b",
      "\\bship(?:ping|ped)?\\s+{name}\\b",
      "\\blaunch(?:ing|ed)?\\s+{name}\\b",
      "\\bdeploy(?:ing|ed)?\\s+{name}\\b",
      "\\binstall(?:ing|ed)?\\s+{name}\\b",
      "\\bthe\\s+{name}\\s+architecture\\b",
      "\\bthe\\s+{name}\\s+pipeline\\b",
      "\\bthe\\s+{name}\\s+system\\b",
      "\\bthe\\s+{name}\\s+repo\\b",
      "\\b{name}\\s+v\\d+\\b",
      "\\b{name}\\.py\\b",
      "\\b{name}-core\\b",
      "\\b{name}-local\\b",
      "\\bimport\\s+{name}\\b",
      "\\bpip\\s+install\\s+{name}\\b"
    ],
    "stopwords": [
      "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
      "for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
      "be", "been", "being", "have", "has", "had", "do", "does", "did",
      "will", "would", "could", "should", "may", "might", "must", "shall", "can",
      "this", "that", "these", "those", "it", "its", "they", "them", "their",
      "we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her",
      "who", "what", "when", "where", "why", "how", "which",
      "if", "then", "so", "not", "no", "yes", "ok", "okay",
      "just", "very", "really", "also", "already", "still", "even", "only",
      "here", "there", "now", "too", "up", "out", "about", "like",
      "use", "get", "got", "make", "made", "take", "put", "come", "go", "see",
      "know", "think", "true", "false", "none", "null", "new", "old", "all", "any", "some",
      "return", "print", "def", "class", "import",
      "step", "usage", "run", "check", "find", "add", "set", "list",
      "args", "dict", "str", "int", "bool", "path", "file", "type", "name",
      "note", "example", "option", "result", "error", "warning", "info",
      "every", "each", "more", "less", "next", "last", "first", "second",
      "stack", "layer", "mode", "test", "stop", "start", "copy", "move",
      "source", "target", "output", "input", "data", "item", "key", "value",
      "returns", "raises", "yields", "self", "cls", "kwargs",
      "world", "well", "want", "topic", "choose", "social", "cars", "phones",
      "healthcare", "ex", "machina", "deus", "human", "humans", "people",
      "things", "something", "nothing", "everything", "anything", "someone",
      "everyone", "anyone", "way", "time", "day", "life", "place", "thing",
      "part", "kind", "sort", "case", "point", "idea", "fact", "sense",
      "question", "answer", "reason", "number", "version", "system",
      "hey", "hi", "hello", "thanks", "thank", "right", "let",
      "click", "hit", "press", "tap", "drag", "drop", "open", "close",
      "save", "load", "launch", "install", "download", "upload", "scroll",
      "select", "enter", "submit", "cancel", "confirm", "delete", "paste",
      "write", "read", "search", "show", "hide",
      "desktop", "documents", "downloads", "users", "home", "library",
      "applications", "preferences", "settings", "terminal",
      "actor", "vector", "remote", "control", "duration", "fetch",
      "agents", "tools", "others", "guards", "ethics", "regulation",
      "learning", "thinking", "memory", "language", "intelligence",
      "technology", "society", "culture", "future", "history", "science",
      "model", "models", "network", "networks", "training", "inference"
    ]
  }
 }
@@ -1,6 +1,9 @@
 """Tests for mempalace.entity_detector."""
 import contextlib
 import json
 import os
 from pathlib import Path
 from unittest.mock import patch
 from mempalace.entity_detector import (
@@ -378,3 +381,211 @@ def test_scan_for_detection_max_files(tmp_path):
        (tmp_path / f"note{i}.md").write_text(f"content {i}")
    files = scan_for_detection(str(tmp_path), max_files=5)
    assert len(files) <= 5
 # ── multi-language infra ───────────────────────────────────────────────
@contextlib.contextmanager
def _temp_locale(locale_code: str, entity_section: dict):
    """Temporarily install a locale JSON file in mempalace/i18n/ for the test body.

    Writes ``<locale_code>.json`` next to the ``mempalace.i18n`` module, storing
    ``entity_section`` under the ``"entity"`` key, and yields the path of that file.
    On exit the file is removed and every cache that depends on locale data is
    cleared, whether the test body passed or raised.

    The file write and the initial cache reset run inside the guarded region, so
    a failure during setup (not only inside the test body) still removes the file.

    Raises:
        RuntimeError: if a file for ``locale_code`` already exists; nothing is
            written or deleted in that case.

    Note: writes into the real mempalace/i18n/ directory. If a test process is
    SIGKILLed mid-test the orphan zz-test-*.json file is left behind and can break
    test_all_languages_load on the next run.
    Recover with `rm mempalace/i18n/zz-test-*.json`.
    """
    from mempalace import i18n
    from mempalace import entity_detector

    locale_path = Path(i18n.__file__).parent / f"{locale_code}.json"
    # Deliberately checked outside the try/finally: a pre-existing file is not
    # ours, so the cleanup below must never get a chance to delete it.
    if locale_path.exists():
        raise RuntimeError(f"Test locale {locale_code} collides with an existing file")

    # Serialize before touching the disk so an unserializable entity section
    # fails without leaving anything behind.
    serialized = json.dumps(
        {
            "lang": locale_code,
            "label": locale_code,
            "terms": {},
            "cli": {},
            "aaak": {"instruction": "test"},
            "entity": entity_section,
        }
    )

    def _clear_caches():
        i18n._entity_cache.clear()
        entity_detector._build_patterns.cache_clear()
        entity_detector._pronoun_re.cache_clear()
        entity_detector._get_stopwords.cache_clear()

    try:
        locale_path.write_text(serialized, encoding="utf-8")
        _clear_caches()
        yield locale_path
    finally:
        # Best-effort removal: the file may be missing if the write itself failed.
        with contextlib.suppress(OSError):
            locale_path.unlink()
        _clear_caches()
 def test_extract_candidates_default_languages_is_english_only():
    """Without a ``languages`` argument an accented name is not reported as a candidate."""
    sample = "João said hi. João laughed. João waved. João decided."
    # No languages passed on purpose: this exercises the default, i.e. ("en",).
    candidates = extract_candidates(sample)
    assert "João" not in candidates
 def test_extract_candidates_with_extra_locale_picks_up_new_charset():
    """Enabling a locale whose patterns allow Latin diacritics surfaces accented names."""
    latin_entity_section = {
        "candidate_pattern": "[A-ZÀ-Ú][a-zà-ÿ]{1,19}",
        "multi_word_pattern": "[A-ZÀ-Ú][a-zà-ÿ]+(?:\\s+[A-ZÀ-Ú][a-zà-ÿ]+)+",
        "person_verb_patterns": [],
        "pronoun_patterns": [],
        "dialogue_patterns": [],
        "project_verb_patterns": [],
        "stopwords": [],
    }
    sample = "João said hi. João laughed. João waved. João decided."
    enabled = ("en", "zz-test-latin")
    with _temp_locale("zz-test-latin", latin_entity_section):
        counts = extract_candidates(sample, languages=enabled)
        # The name must be present, and counted at least three times.
        assert "João" in counts
        assert counts["João"] >= 3
 def test_extract_candidates_with_cyrillic_locale():
    """Enabling a locale with Cyrillic candidate patterns surfaces a Russian name."""
    cyrillic_entity_section = {
        "candidate_pattern": "[А-ЯЁ][а-яё]{1,19}",
        "multi_word_pattern": "[А-ЯЁ][а-яё]+(?:\\s+[А-ЯЁ][а-яё]+)+",
        "person_verb_patterns": [],
        "pronoun_patterns": [],
        "dialogue_patterns": [],
        "project_verb_patterns": [],
        "stopwords": [],
    }
    sample = "Иван сказал привет. Иван засмеялся. Иван помахал. Иван решил."
    enabled = ("en", "zz-test-cyrillic")
    with _temp_locale("zz-test-cyrillic", cyrillic_entity_section):
        candidates = extract_candidates(sample, languages=enabled)
        assert "Иван" in candidates
 def test_score_entity_unions_person_verbs_across_languages():
    """Person-verb patterns from an extra locale raise the person score once it is enabled."""
    verbs_entity_section = {
        "candidate_pattern": "[A-Z][a-z]{1,19}",
        "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
        "person_verb_patterns": [
            "\\b{name}\\s+disse\\b",
            "\\b{name}\\s+falou\\b",
            "\\b{name}\\s+riu\\b",
        ],
        "pronoun_patterns": [],
        "dialogue_patterns": [],
        "project_verb_patterns": [],
        "stopwords": [],
    }
    sample = "Maria disse oi. Maria falou. Maria riu."
    sample_lines = sample.splitlines()
    with _temp_locale("zz-test-verbs", verbs_entity_section):
        # Score the same name twice: English only, then English plus the test locale.
        english_only = score_entity("Maria", sample, sample_lines, languages=("en",))
        combined = score_entity(
            "Maria", sample, sample_lines, languages=("en", "zz-test-verbs")
        )
        assert combined["person_score"] > english_only["person_score"]
        signals = combined["person_signals"]
        assert any("action" in signal for signal in signals)
 def test_get_entity_patterns_unknown_lang_falls_back_to_english():
    """Requesting only an unknown language still yields non-empty stopwords and candidate patterns."""
    from mempalace.i18n import get_entity_patterns
    fallback = get_entity_patterns(("zz-does-not-exist",))
    stopwords = fallback["stopwords"]
    assert len(stopwords) > 0
    # Expected to come from the English fallback.
    assert fallback["candidate_patterns"]
 def test_get_entity_patterns_dedupes_across_overlapping_languages():
    """Listing 'en' twice yields as many person-verb patterns and stopwords as listing it once."""
    from mempalace.i18n import get_entity_patterns
    once = get_entity_patterns(("en",))
    twice = get_entity_patterns(("en", "en"))
    for key in ("person_verb_patterns", "stopwords"):
        assert len(twice[key]) == len(once[key])
 def test_build_patterns_cache_is_keyed_by_language():
    """The same name built for a larger language tuple yields more person-verb patterns."""
    from mempalace.entity_detector import _build_patterns
    cache_entity_section = {
        "candidate_pattern": "[A-Z][a-z]+",
        "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
        "person_verb_patterns": ["\\b{name}\\s+ranxx\\b"],
        "pronoun_patterns": [],
        "dialogue_patterns": [],
        "project_verb_patterns": [],
        "stopwords": [],
    }
    with _temp_locale("zz-test-cache", cache_entity_section):
        # English is built first, then English plus the test locale, for the same name.
        english_only = _build_patterns("Sam", ("en",))
        combined = _build_patterns("Sam", ("en", "zz-test-cache"))
        english_count = len(english_only["person_verbs"])
        combined_count = len(combined["person_verbs"])
        assert combined_count > english_count
 def test_normalize_langs_handles_string_input():
    """A bare string, a list, None and an empty tuple all normalize to a language tuple."""
    from mempalace.entity_detector import _normalize_langs
    cases = [
        ("en", ("en",)),
        (["en", "pt-br"], ("en", "pt-br")),
        (None, ("en",)),
        ((), ("en",)),
    ]
    for raw, expected in cases:
        assert _normalize_langs(raw) == expected
 def test_config_entity_languages_defaults_to_english(tmp_path, monkeypatch):
    """With both env vars unset and a fresh config dir, entity_languages is ['en']."""
    from mempalace.config import MempalaceConfig
    for env_var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_var, raising=False)
    config = MempalaceConfig(config_dir=str(tmp_path))
    assert config.entity_languages == ["en"]
 def test_config_entity_languages_from_env(tmp_path, monkeypatch):
    """MEMPALACE_ENTITY_LANGUAGES is read as a comma-separated list of language codes."""
    from mempalace.config import MempalaceConfig
    # Clear the alternate MEMPAL_ENTITY_LANGUAGES variable, as the sibling config
    # tests do, so a value inherited from the developer's shell cannot influence
    # the outcome (presumably the config consults it as well; verify in config.py).
    monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False)
    monkeypatch.setenv("MEMPALACE_ENTITY_LANGUAGES", "en,pt-br,ru")
    cfg = MempalaceConfig(config_dir=str(tmp_path))
    assert cfg.entity_languages == ["en", "pt-br", "ru"]
 def test_config_set_entity_languages_persists(tmp_path, monkeypatch):
    """Languages stored via set_entity_languages are seen by a second config on the same dir."""
    from mempalace.config import MempalaceConfig
    for env_var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_var, raising=False)
    config_dir = str(tmp_path)
    writer = MempalaceConfig(config_dir=config_dir)
    writer.set_entity_languages(["en", "pt-br"])
    # A separate instance on the same directory must see what the first one stored.
    reader = MempalaceConfig(config_dir=config_dir)
    assert reader.entity_languages == ["en", "pt-br"]
 def test_config_set_entity_languages_empty_falls_back_to_english(tmp_path, monkeypatch):
    """Setting an empty list returns ['en'] and leaves entity_languages at ['en']."""
    from mempalace.config import MempalaceConfig
    for env_var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_var, raising=False)
    config = MempalaceConfig(config_dir=str(tmp_path))
    returned = config.set_entity_languages([])
    assert returned == ["en"]
    assert config.entity_languages == ["en"]