diff --git a/mempalace/cli.py b/mempalace/cli.py index b06a711..fb2f0ae 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -73,12 +73,25 @@ def cmd_init(args): from .entity_detector import scan_for_detection, detect_entities, confirm_entities from .room_detector_local import detect_rooms_local + cfg = MempalaceConfig() + + # Resolve entity-detection languages: --lang overrides config. + lang_arg = getattr(args, "lang", None) + if lang_arg: + languages = [s.strip() for s in lang_arg.split(",") if s.strip()] or ["en"] + cfg.set_entity_languages(languages) + else: + languages = cfg.entity_languages + languages_tuple = tuple(languages) + # Pass 1: auto-detect people and projects from file content print(f"\n Scanning for entities in: {args.dir}") + if languages_tuple != ("en",): + print(f" Languages: {', '.join(languages_tuple)}") files = scan_for_detection(args.dir) if files: print(f" Reading {len(files)} files...") - detected = detect_entities(files) + detected = detect_entities(files, languages=languages_tuple) total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) if total > 0: confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) @@ -93,7 +106,7 @@ def cmd_init(args): # Pass 2: detect rooms from folder structure detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False)) - MempalaceConfig().init() + cfg.init() # Pass 3: protect git repos from accidentally committing per-project files _ensure_mempalace_files_gitignored(args.dir) @@ -478,6 +491,16 @@ def main(): action="store_true", help="Auto-accept all detected entities (non-interactive)", ) + p_init.add_argument( + "--lang", + default=None, + help=( + "Comma-separated language codes for entity detection " + "(e.g. 'en' or 'en,pt-br'). Defaults to value from config " + "(MEMPALACE_ENTITY_LANGUAGES env var or config.json), or 'en'. " + "When given, the value is also persisted to config.json." 
+ ), + ) # mine p_mine = sub.add_parser("mine", help="Mine files into the palace") diff --git a/mempalace/config.py b/mempalace/config.py index a8cbee3..a9bcc7f 100644 --- a/mempalace/config.py +++ b/mempalace/config.py @@ -197,6 +197,42 @@ class MempalaceConfig: """Mapping of hall names to keyword lists.""" return self._file_config.get("hall_keywords", DEFAULT_HALL_KEYWORDS) + @property + def entity_languages(self): + """Languages whose entity-detection patterns should be applied. + + Reads from env var ``MEMPALACE_ENTITY_LANGUAGES`` (comma-separated) + first, then the ``entity_languages`` field in ``config.json``, + defaulting to ``["en"]``. + """ + env_val = os.environ.get("MEMPALACE_ENTITY_LANGUAGES") or os.environ.get( + "MEMPAL_ENTITY_LANGUAGES" + ) + if env_val: + return [s.strip() for s in env_val.split(",") if s.strip()] or ["en"] + cfg = self._file_config.get("entity_languages") + if isinstance(cfg, list) and cfg: + return [str(s) for s in cfg] + return ["en"] + + def set_entity_languages(self, languages): + """Persist the entity-detection language list to ``config.json``.""" + normalized = [s.strip() for s in languages if s and s.strip()] + if not normalized: + normalized = ["en"] + self._file_config["entity_languages"] = normalized + self._config_dir.mkdir(parents=True, exist_ok=True) + try: + with open(self._config_file, "w", encoding="utf-8") as f: + json.dump(self._file_config, f, indent=2, ensure_ascii=False) + except OSError: + pass + try: + self._config_file.chmod(0o600) + except (OSError, NotImplementedError): + pass + return normalized + @property def hook_silent_save(self): """Whether the stop hook saves directly (True) or blocks for MCP calls (False).""" diff --git a/mempalace/entity_detector.py b/mempalace/entity_detector.py index 203c0aa..a20e2af 100644 --- a/mempalace/entity_detector.py +++ b/mempalace/entity_detector.py @@ -9,9 +9,21 @@ Two-pass approach: Used by mempalace init before mining begins. 
The confirmed entity map feeds the miner as the taxonomy. +Multi-language support: + All lexical patterns (person verbs, pronouns, dialogue markers, project + verbs, stopwords, and the candidate-extraction character class) live in + the ``entity`` section of ``mempalace/i18n/<lang>.json``. Every public + function accepts a ``languages`` tuple and applies the union of the + requested locales' patterns. The default is ``("en",)`` — existing + English-only callers behave exactly as before. + + To add a new language: add an ``entity`` section to that locale's JSON. + No code changes required. + Usage: - from entity_detector import detect_entities, confirm_entities - candidates = detect_entities(file_paths) + from mempalace.entity_detector import detect_entities, confirm_entities + candidates = detect_entities(file_paths) # English only + candidates = detect_entities(paths, languages=("en", "pt-br")) confirmed = confirm_entities(candidates) # interactive review """ @@ -21,382 +33,46 @@ import functools from pathlib import Path from collections import defaultdict +from mempalace.i18n import get_entity_patterns -# ==================== SIGNAL PATTERNS ==================== -# Person signals — things people do -PERSON_VERB_PATTERNS = [ - r"\b{name}\s+said\b", - r"\b{name}\s+asked\b", - r"\b{name}\s+told\b", - r"\b{name}\s+replied\b", - r"\b{name}\s+laughed\b", - r"\b{name}\s+smiled\b", - r"\b{name}\s+cried\b", - r"\b{name}\s+felt\b", - r"\b{name}\s+thinks?\b", - r"\b{name}\s+wants?\b", - r"\b{name}\s+loves?\b", - r"\b{name}\s+hates?\b", - r"\b{name}\s+knows?\b", - r"\b{name}\s+decided\b", - r"\b{name}\s+pushed\b", - r"\b{name}\s+wrote\b", - r"\bhey\s+{name}\b", - r"\bthanks?\s+{name}\b", - r"\bhi\s+{name}\b", - r"\bdear\s+{name}\b", -] +# ==================== LANGUAGE-AWARE PATTERN LOADING ==================== -# Person signals — pronouns resolving nearby -PRONOUN_PATTERNS = [ - r"\bshe\b", - r"\bher\b", - r"\bhers\b", - r"\bhe\b", - r"\bhim\b", - r"\bhis\b", - r"\bthey\b", -
r"\bthem\b", - r"\btheir\b", -] -PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE) +def _normalize_langs(languages) -> tuple: + """Coerce a language input into a non-empty hashable tuple.""" + if not languages: + return ("en",) + if isinstance(languages, str): + return (languages,) + return tuple(languages) -# Person signals — dialogue markers -DIALOGUE_PATTERNS = [ - r"^>\s*{name}[:\s]", # > Speaker: ... - r"^{name}:\s", # Speaker: ... - r"^\[{name}\]", # [Speaker] - r'"{name}\s+said', -] -# Project signals — things projects have/do -PROJECT_VERB_PATTERNS = [ - r"\bbuilding\s+{name}\b", - r"\bbuilt\s+{name}\b", - r"\bship(?:ping|ped)?\s+{name}\b", - r"\blaunch(?:ing|ed)?\s+{name}\b", - r"\bdeploy(?:ing|ed)?\s+{name}\b", - r"\binstall(?:ing|ed)?\s+{name}\b", - r"\bthe\s+{name}\s+architecture\b", - r"\bthe\s+{name}\s+pipeline\b", - r"\bthe\s+{name}\s+system\b", - r"\bthe\s+{name}\s+repo\b", - r"\b{name}\s+v\d+\b", # MemPal v2 - r"\b{name}\.py\b", # mempalace.py - r"\b{name}-core\b", # mempal-core (hyphen only, not underscore) - r"\b{name}-local\b", - r"\bimport\s+{name}\b", - r"\bpip\s+install\s+{name}\b", -] +@functools.lru_cache(maxsize=32) +def _get_stopwords(languages: tuple) -> frozenset: + """Return the union of stopwords across the given languages.""" + patterns = get_entity_patterns(languages) + return frozenset(patterns["stopwords"]) -# Words that are almost certainly NOT entities -STOPWORDS = { - "the", - "a", - "an", - "and", - "or", - "but", - "in", - "on", - "at", - "to", - "for", - "of", - "with", - "by", - "from", - "as", - "is", - "was", - "are", - "were", - "be", - "been", - "being", - "have", - "has", - "had", - "do", - "does", - "did", - "will", - "would", - "could", - "should", - "may", - "might", - "must", - "shall", - "can", - "this", - "that", - "these", - "those", - "it", - "its", - "they", - "them", - "their", - "we", - "our", - "you", - "your", - "i", - "my", - "me", - "he", - "she", - "his", - "her", - "who", - "what", - 
"when", - "where", - "why", - "how", - "which", - "if", - "then", - "so", - "not", - "no", - "yes", - "ok", - "okay", - "just", - "very", - "really", - "also", - "already", - "still", - "even", - "only", - "here", - "there", - "now", - "then", - "too", - "up", - "out", - "about", - "like", - "use", - "get", - "got", - "make", - "made", - "take", - "put", - "come", - "go", - "see", - "know", - "think", - "true", - "false", - "none", - "null", - "new", - "old", - "all", - "any", - "some", - "true", - "false", - "return", - "print", - "def", - "class", - "import", - "from", - # Common capitalized words in prose that aren't entities - "step", - "usage", - "run", - "check", - "find", - "add", - "get", - "set", - "list", - "args", - "dict", - "str", - "int", - "bool", - "path", - "file", - "type", - "name", - "note", - "example", - "option", - "result", - "error", - "warning", - "info", - "every", - "each", - "more", - "less", - "next", - "last", - "first", - "second", - "stack", - "layer", - "mode", - "test", - "stop", - "start", - "copy", - "move", - "source", - "target", - "output", - "input", - "data", - "item", - "key", - "value", - "returns", - "raises", - "yields", - "none", - "self", - "cls", - "kwargs", - # Common sentence-starting / abstract words that aren't entities - "world", - "well", - "want", - "topic", - "choose", - "social", - "cars", - "phones", - "healthcare", - "ex", - "machina", - "deus", - "human", - "humans", - "people", - "things", - "something", - "nothing", - "everything", - "anything", - "someone", - "everyone", - "anyone", - "way", - "time", - "day", - "life", - "place", - "thing", - "part", - "kind", - "sort", - "case", - "point", - "idea", - "fact", - "sense", - "question", - "answer", - "reason", - "number", - "version", - "system", - # Greetings and filler words at sentence starts - "hey", - "hi", - "hello", - "thanks", - "thank", - "right", - "let", - "ok", - # UI/action words that appear in how-to content - "click", - "hit", - "press", 
- "tap", - "drag", - "drop", - "open", - "close", - "save", - "load", - "launch", - "install", - "download", - "upload", - "scroll", - "select", - "enter", - "submit", - "cancel", - "confirm", - "delete", - "copy", - "paste", - "type", - "write", - "read", - "search", - "find", - "show", - "hide", - # Common filesystem/technical capitalized words - "desktop", - "documents", - "downloads", - "users", - "home", - "library", - "applications", - "system", - "preferences", - "settings", - "terminal", - # Abstract/topic words - "actor", - "vector", - "remote", - "control", - "duration", - "fetch", - # Abstract concepts that appear as subjects but aren't entities - "agents", - "tools", - "others", - "guards", - "ethics", - "regulation", - "learning", - "thinking", - "memory", - "language", - "intelligence", - "technology", - "society", - "culture", - "future", - "history", - "science", - "model", - "models", - "network", - "networks", - "training", - "inference", -} + +# ==================== BACKWARD-COMPAT MODULE CONSTANTS ==================== +# +# These mirror the old module-level constants so existing imports keep working. +# They reflect the English defaults and are populated at import time from +# ``mempalace/i18n/en.json``. Callers that need multi-language behavior should +# pass the ``languages`` parameter to the public functions below. 
+ +_EN = get_entity_patterns(("en",)) + +PERSON_VERB_PATTERNS = list(_EN["person_verb_patterns"]) +PRONOUN_PATTERNS = list(_EN["pronoun_patterns"]) +PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE) if PRONOUN_PATTERNS else None +DIALOGUE_PATTERNS = list(_EN["dialogue_patterns"]) +PROJECT_VERB_PATTERNS = list(_EN["project_verb_patterns"]) +STOPWORDS = set(_EN["stopwords"]) + + +# ==================== EXTENSION POINTS (not language-scoped) ==================== # For entity detection — prose only, no code files # Code files have too many capitalized names (classes, functions) that aren't entities @@ -443,56 +119,107 @@ SKIP_DIRS = { # ==================== CANDIDATE EXTRACTION ==================== -def extract_candidates(text: str) -> dict: +def extract_candidates(text: str, languages=("en",)) -> dict: """ Extract all capitalized proper noun candidates from text. Returns {name: frequency} for names appearing 3+ times. - """ - # Find all capitalized words (not at sentence start — harder, so we use frequency as filter) - raw = re.findall(r"\b([A-Z][a-z]{1,19})\b", text) - counts = defaultdict(int) - for word in raw: - if word.lower() not in STOPWORDS and len(word) > 1: + Each language contributes its own character-class pattern (e.g. ASCII + for English, Latin+diacritics for pt-br, Cyrillic for Russian, + Devanagari for Hindi). Matches from all languages are unioned. + """ + langs = _normalize_langs(languages) + patterns = get_entity_patterns(langs) + stopwords = _get_stopwords(langs) + + counts: defaultdict = defaultdict(int) + + # Single-word candidates — one pattern per language + for raw_pat in patterns["candidate_patterns"]: + try: + rx = re.compile(rf"\b({raw_pat})\b") + except re.error: + continue + for word in rx.findall(text): + if word.lower() in stopwords: + continue + if len(word) < 2: + continue counts[word] += 1 - # Also find multi-word proper nouns (e.g. 
"Memory Palace", "Claude Code") - multi = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text) - for phrase in multi: - if not any(w.lower() in STOPWORDS for w in phrase.split()): + # Multi-word candidates — one pattern per language + for raw_pat in patterns["multi_word_patterns"]: + try: + rx = re.compile(rf"\b({raw_pat})\b") + except re.error: + continue + for phrase in rx.findall(text): + if any(w.lower() in stopwords for w in phrase.split()): + continue counts[phrase] += 1 - # Filter: must appear at least 3 times to be a candidate return {name: count for name, count in counts.items() if count >= 3} # ==================== SIGNAL SCORING ==================== -@functools.lru_cache(maxsize=128) -def _build_patterns(name: str) -> dict: - """Pre-compile all regex patterns for a single entity name.""" +@functools.lru_cache(maxsize=256) +def _build_patterns(name: str, languages: tuple = ("en",)) -> dict: + """Pre-compile all regex patterns for a single entity name, per language set.""" n = re.escape(name) + langs = _normalize_langs(languages) + sources = get_entity_patterns(langs) + + def _compile_each(raw_patterns, flags=re.IGNORECASE): + compiled = [] + for p in raw_patterns: + try: + compiled.append(re.compile(p.format(name=n), flags)) + except (re.error, KeyError, IndexError): + continue + return compiled + + direct_sources = sources.get("direct_address_patterns") or [] + direct_compiled = [] + for raw in direct_sources: + try: + direct_compiled.append(re.compile(raw.format(name=n), re.IGNORECASE)) + except (re.error, KeyError, IndexError): + continue + return { - "dialogue": [ - re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in DIALOGUE_PATTERNS - ], - "person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in PERSON_VERB_PATTERNS], - "project_verbs": [ - re.compile(p.format(name=n), re.IGNORECASE) for p in PROJECT_VERB_PATTERNS - ], - "direct": re.compile(rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b", re.IGNORECASE), + 
"dialogue": _compile_each(sources["dialogue_patterns"], re.MULTILINE | re.IGNORECASE), + "person_verbs": _compile_each(sources["person_verb_patterns"]), + "project_verbs": _compile_each(sources["project_verb_patterns"]), + "direct": direct_compiled, "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE), "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE), } -def score_entity(name: str, text: str, lines: list) -> dict: +@functools.lru_cache(maxsize=32) +def _pronoun_re(languages: tuple): + """Compile a combined pronoun regex for the given languages.""" + langs = _normalize_langs(languages) + patterns = get_entity_patterns(langs) + pronouns = patterns.get("pronoun_patterns") or [] + if not pronouns: + return None + try: + return re.compile("|".join(pronouns), re.IGNORECASE) + except re.error: + return None + + +def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict: """ Score a candidate entity as person vs project. Returns scores and the signals that fired. """ - patterns = _build_patterns(name) + langs = _normalize_langs(languages) + patterns = _build_patterns(name, langs) + pronoun_re = _pronoun_re(langs) person_score = 0 project_score = 0 person_signals = [] @@ -515,22 +242,25 @@ def score_entity(name: str, text: str, lines: list) -> dict: person_signals.append(f"'{name} ...' 
action ({matches}x)") # Pronoun proximity — pronouns within 3 lines of the name - name_lower = name.lower() - name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()] - pronoun_hits = 0 - for idx in name_line_indices: - window_text = " ".join(lines[max(0, idx - 2) : idx + 3]) - if PRONOUN_RE.search(window_text): - pronoun_hits += 1 - if pronoun_hits > 0: - person_score += pronoun_hits * 2 - person_signals.append(f"pronoun nearby ({pronoun_hits}x)") + if pronoun_re is not None: + name_lower = name.lower() + name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()] + pronoun_hits = 0 + for idx in name_line_indices: + window_text = " ".join(lines[max(0, idx - 2) : idx + 3]) + if pronoun_re.search(window_text): + pronoun_hits += 1 + if pronoun_hits > 0: + person_score += pronoun_hits * 2 + person_signals.append(f"pronoun nearby ({pronoun_hits}x)") # Direct address - direct = len(patterns["direct"].findall(text)) - if direct > 0: - person_score += direct * 4 - person_signals.append(f"addressed directly ({direct}x)") + direct_hits = 0 + for rx in patterns["direct"]: + direct_hits += len(rx.findall(text)) + if direct_hits > 0: + person_score += direct_hits * 4 + person_signals.append(f"addressed directly ({direct_hits}x)") # --- Project signals --- @@ -631,13 +361,15 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict: # ==================== MAIN DETECT ==================== -def detect_entities(file_paths: list, max_files: int = 10) -> dict: +def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> dict: """ Scan files and detect entity candidates. Args: file_paths: List of Path objects to scan max_files: Max files to read (for speed) + languages: Tuple of language codes whose entity patterns should be + applied (union). Defaults to ``("en",)``. 
Returns: { @@ -646,6 +378,8 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict: "uncertain":[...entity dicts...], } """ + langs = _normalize_langs(languages) + # Collect text from files all_text = [] all_lines = [] @@ -668,7 +402,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict: combined_text = "\n".join(all_text) # Extract candidates - candidates = extract_candidates(combined_text) + candidates = extract_candidates(combined_text, languages=langs) if not candidates: return {"people": [], "projects": [], "uncertain": []} @@ -679,7 +413,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict: uncertain = [] for name, frequency in sorted(candidates.items(), key=lambda x: x[1], reverse=True): - scores = score_entity(name, combined_text, all_lines) + scores = score_entity(name, combined_text, all_lines, languages=langs) entity = classify_entity(name, frequency, scores) if entity["type"] == "person": @@ -843,13 +577,14 @@ if __name__ == "__main__": import sys if len(sys.argv) < 2: - print("Usage: python entity_detector.py ") + print("Usage: python entity_detector.py [lang1,lang2,...]") sys.exit(1) project_dir = sys.argv[1] - print(f"Scanning: {project_dir}") + langs = tuple(sys.argv[2].split(",")) if len(sys.argv) >= 3 else ("en",) + print(f"Scanning: {project_dir} (languages: {', '.join(langs)})") files = scan_for_detection(project_dir) print(f"Reading {len(files)} files...") - detected = detect_entities(files) + detected = detect_entities(files, languages=langs) confirmed = confirm_entities(detected) print("Confirmed entities:", confirmed) diff --git a/mempalace/entity_registry.py b/mempalace/entity_registry.py index 2c6930d..6c37572 100644 --- a/mempalace/entity_registry.py +++ b/mempalace/entity_registry.py @@ -583,15 +583,19 @@ class EntityRegistry: # ── Learn from sessions ────────────────────────────────────────────────── - def learn_from_text(self, text: str, min_confidence: float = 0.75) -> list: + def 
learn_from_text(self, text: str, min_confidence: float = 0.75, languages=("en",)) -> list: """ Scan session text for new entity candidates. Returns list of newly discovered candidates for review. + + ``languages`` is forwarded to entity detection — pass the user's + configured ``MempalaceConfig().entity_languages`` to match the + locales used at ``mempalace init`` time. """ from mempalace.entity_detector import extract_candidates, score_entity, classify_entity lines = text.splitlines() - candidates = extract_candidates(text) + candidates = extract_candidates(text, languages=languages) new_candidates = [] for name, frequency in candidates.items(): @@ -599,7 +603,7 @@ class EntityRegistry: if name in self.people or name in self.projects: continue - scores = score_entity(name, text, lines) + scores = score_entity(name, text, lines, languages=languages) entity = classify_entity(name, frequency, scores) if entity["type"] == "person" and entity["confidence"] >= min_confidence: diff --git a/mempalace/i18n/__init__.py b/mempalace/i18n/__init__.py index 1b90b4d..671e0a1 100644 --- a/mempalace/i18n/__init__.py +++ b/mempalace/i18n/__init__.py @@ -7,6 +7,10 @@ Usage: print(t("cli.mine_start", path="/docs")) # "Extraction de /docs..." print(t("terms.wing")) # "aile" print(t("aaak.instruction")) # AAAK compression instruction in French + +Each locale JSON may include an ``entity`` section with patterns used by +``mempalace.entity_detector``. See ``get_entity_patterns`` for the merge rules +and the README section "Adding a new language" for the schema. 
""" import json @@ -16,6 +20,9 @@ _LANG_DIR = Path(__file__).parent _strings: dict = {} _current_lang: str = "en" +# Cache: tuple(langs) -> merged entity pattern dict +_entity_cache: dict = {} + def available_languages() -> list[str]: """Return list of available language codes.""" @@ -72,5 +79,112 @@ def get_regex() -> dict: return _strings.get("regex", {}) +def _load_entity_section(lang: str) -> dict: + """Load the raw entity section for one language. Returns {} if missing.""" + lang_file = _LANG_DIR / f"{lang}.json" + if not lang_file.exists(): + return {} + try: + data = json.loads(lang_file.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + return {} + return data.get("entity", {}) or {} + + +def get_entity_patterns(languages=("en",)) -> dict: + """Return merged entity detection patterns for the requested languages. + + Entity detection patterns live under each locale's ``entity`` section. + This function merges them into a single dict for consumption by + ``mempalace.entity_detector``. + + Merge rules: + - List fields (person_verb_patterns, pronoun_patterns, dialogue_patterns, + project_verb_patterns) are concatenated in the order of ``languages``, + with duplicates removed while preserving first occurrence. + - ``stopwords`` is the set union across all languages, returned as a + sorted list. + - ``candidate_patterns`` and ``multi_word_patterns`` are returned as + lists (one per language) since they use different character classes; + callers run each pattern independently and union the matches. + - ``direct_address_pattern`` is returned as a list of per-language + alternation patterns (not concatenated — each is applied separately). + + If ``languages`` is empty or no requested language declares entity data, + English is used as a fallback so callers always get a working config. 
+ """ + if not languages: + languages = ("en",) + key = tuple(languages) + if key in _entity_cache: + return _entity_cache[key] + + candidate_patterns: list[str] = [] + multi_word_patterns: list[str] = [] + person_verbs: list[str] = [] + pronouns: list[str] = [] + dialogue: list[str] = [] + direct_address: list[str] = [] + project_verbs: list[str] = [] + stopwords: set = set() + + found_any = False + for lang in languages: + section = _load_entity_section(lang) + if not section: + continue + found_any = True + if section.get("candidate_pattern"): + candidate_patterns.append(section["candidate_pattern"]) + if section.get("multi_word_pattern"): + multi_word_patterns.append(section["multi_word_pattern"]) + if section.get("direct_address_pattern"): + direct_address.append(section["direct_address_pattern"]) + person_verbs.extend(section.get("person_verb_patterns", [])) + pronouns.extend(section.get("pronoun_patterns", [])) + dialogue.extend(section.get("dialogue_patterns", [])) + project_verbs.extend(section.get("project_verb_patterns", [])) + stopwords.update(w.lower() for w in section.get("stopwords", [])) + + if not found_any: + # Fallback: load English directly + section = _load_entity_section("en") + if section.get("candidate_pattern"): + candidate_patterns.append(section["candidate_pattern"]) + if section.get("multi_word_pattern"): + multi_word_patterns.append(section["multi_word_pattern"]) + if section.get("direct_address_pattern"): + direct_address.append(section["direct_address_pattern"]) + person_verbs.extend(section.get("person_verb_patterns", [])) + pronouns.extend(section.get("pronoun_patterns", [])) + dialogue.extend(section.get("dialogue_patterns", [])) + project_verbs.extend(section.get("project_verb_patterns", [])) + stopwords.update(w.lower() for w in section.get("stopwords", [])) + + merged = { + "candidate_patterns": candidate_patterns, + "multi_word_patterns": multi_word_patterns, + "person_verb_patterns": _dedupe(person_verbs), + 
"pronoun_patterns": _dedupe(pronouns), + "dialogue_patterns": _dedupe(dialogue), + "direct_address_patterns": direct_address, + "project_verb_patterns": _dedupe(project_verbs), + "stopwords": sorted(stopwords), + } + _entity_cache[key] = merged + return merged + + +def _dedupe(items: list) -> list: + """Remove duplicates while preserving first-occurrence order.""" + seen = set() + out = [] + for item in items: + if item not in seen: + seen.add(item) + out.append(item) + return out + + # Auto-load English on import load_lang("en") diff --git a/mempalace/i18n/en.json b/mempalace/i18n/en.json index 88c97db..6a9dff9 100644 --- a/mempalace/i18n/en.json +++ b/mempalace/i18n/en.json @@ -40,5 +40,107 @@ "stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false", "quote_pattern": "\"([^\"]{20,200})\"", "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}" + }, + "entity": { + "candidate_pattern": "[A-Z][a-z]{1,19}", + "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+", + "person_verb_patterns": [ + "\\b{name}\\s+said\\b", + "\\b{name}\\s+asked\\b", + "\\b{name}\\s+told\\b", + "\\b{name}\\s+replied\\b", + "\\b{name}\\s+laughed\\b", + "\\b{name}\\s+smiled\\b", + "\\b{name}\\s+cried\\b", + "\\b{name}\\s+felt\\b", + "\\b{name}\\s+thinks?\\b", + "\\b{name}\\s+wants?\\b", + "\\b{name}\\s+loves?\\b", + "\\b{name}\\s+hates?\\b", + "\\b{name}\\s+knows?\\b", + "\\b{name}\\s+decided\\b", + "\\b{name}\\s+pushed\\b", + "\\b{name}\\s+wrote\\b", + "\\bhey\\s+{name}\\b", + "\\bthanks?\\s+{name}\\b", + "\\bhi\\s+{name}\\b", + "\\bdear\\s+{name}\\b" + ], + 
"pronoun_patterns": [ + "\\bshe\\b", + "\\bher\\b", + "\\bhers\\b", + "\\bhe\\b", + "\\bhim\\b", + "\\bhis\\b", + "\\bthey\\b", + "\\bthem\\b", + "\\btheir\\b" + ], + "dialogue_patterns": [ + "^>\\s*{name}[:\\s]", + "^{name}:\\s", + "^\\[{name}\\]", + "\"{name}\\s+said" + ], + "direct_address_pattern": "\\bhey\\s+{name}\\b|\\bthanks?\\s+{name}\\b|\\bhi\\s+{name}\\b", + "project_verb_patterns": [ + "\\bbuilding\\s+{name}\\b", + "\\bbuilt\\s+{name}\\b", + "\\bship(?:ping|ped)?\\s+{name}\\b", + "\\blaunch(?:ing|ed)?\\s+{name}\\b", + "\\bdeploy(?:ing|ed)?\\s+{name}\\b", + "\\binstall(?:ing|ed)?\\s+{name}\\b", + "\\bthe\\s+{name}\\s+architecture\\b", + "\\bthe\\s+{name}\\s+pipeline\\b", + "\\bthe\\s+{name}\\s+system\\b", + "\\bthe\\s+{name}\\s+repo\\b", + "\\b{name}\\s+v\\d+\\b", + "\\b{name}\\.py\\b", + "\\b{name}-core\\b", + "\\b{name}-local\\b", + "\\bimport\\s+{name}\\b", + "\\bpip\\s+install\\s+{name}\\b" + ], + "stopwords": [ + "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", + "for", "of", "with", "by", "from", "as", "is", "was", "are", "were", + "be", "been", "being", "have", "has", "had", "do", "does", "did", + "will", "would", "could", "should", "may", "might", "must", "shall", "can", + "this", "that", "these", "those", "it", "its", "they", "them", "their", + "we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her", + "who", "what", "when", "where", "why", "how", "which", + "if", "then", "so", "not", "no", "yes", "ok", "okay", + "just", "very", "really", "also", "already", "still", "even", "only", + "here", "there", "now", "too", "up", "out", "about", "like", + "use", "get", "got", "make", "made", "take", "put", "come", "go", "see", + "know", "think", "true", "false", "none", "null", "new", "old", "all", "any", "some", + "return", "print", "def", "class", "import", + "step", "usage", "run", "check", "find", "add", "set", "list", + "args", "dict", "str", "int", "bool", "path", "file", "type", "name", + "note", "example", "option", 
"result", "error", "warning", "info", + "every", "each", "more", "less", "next", "last", "first", "second", + "stack", "layer", "mode", "test", "stop", "start", "copy", "move", + "source", "target", "output", "input", "data", "item", "key", "value", + "returns", "raises", "yields", "self", "cls", "kwargs", + "world", "well", "want", "topic", "choose", "social", "cars", "phones", + "healthcare", "ex", "machina", "deus", "human", "humans", "people", + "things", "something", "nothing", "everything", "anything", "someone", + "everyone", "anyone", "way", "time", "day", "life", "place", "thing", + "part", "kind", "sort", "case", "point", "idea", "fact", "sense", + "question", "answer", "reason", "number", "version", "system", + "hey", "hi", "hello", "thanks", "thank", "right", "let", + "click", "hit", "press", "tap", "drag", "drop", "open", "close", + "save", "load", "launch", "install", "download", "upload", "scroll", + "select", "enter", "submit", "cancel", "confirm", "delete", "paste", + "write", "read", "search", "show", "hide", + "desktop", "documents", "downloads", "users", "home", "library", + "applications", "preferences", "settings", "terminal", + "actor", "vector", "remote", "control", "duration", "fetch", + "agents", "tools", "others", "guards", "ethics", "regulation", + "learning", "thinking", "memory", "language", "intelligence", + "technology", "society", "culture", "future", "history", "science", + "model", "models", "network", "networks", "training", "inference" + ] } } diff --git a/tests/test_entity_detector.py b/tests/test_entity_detector.py index 91f0e29..50cb7d1 100644 --- a/tests/test_entity_detector.py +++ b/tests/test_entity_detector.py @@ -1,6 +1,9 @@ """Tests for mempalace.entity_detector.""" +import contextlib +import json import os +from pathlib import Path from unittest.mock import patch from mempalace.entity_detector import ( @@ -378,3 +381,211 @@ def test_scan_for_detection_max_files(tmp_path): (tmp_path / f"note{i}.md").write_text(f"content 
{i}") files = scan_for_detection(str(tmp_path), max_files=5) assert len(files) <= 5 + + +# ── multi-language infra ─────────────────────────────────────────────── + + +@contextlib.contextmanager +def _temp_locale(locale_code: str, entity_section: dict): + """Context manager that drops a locale JSON into mempalace/i18n/ for the test body. + + Cleans up the file and clears every cache that depends on locale data on exit, + even if the test fails or the entity section is invalid. + + Note: writes into the real mempalace/i18n/ directory. If a test process is + SIGKILLed mid-test the orphan zz-test-*.json file will break test_all_languages_load + on the next run (the fixture lacks the required terms/cli/aaak sections). + Recover with `rm mempalace/i18n/zz-test-*.json`. + """ + from mempalace import i18n + from mempalace import entity_detector + + locale_path = Path(i18n.__file__).parent / f"{locale_code}.json" + if locale_path.exists(): + raise RuntimeError(f"Test locale {locale_code} collides with an existing file") + + payload = { + "lang": locale_code, + "label": locale_code, + "terms": {}, + "cli": {}, + "aaak": {"instruction": "test"}, + "entity": entity_section, + } + locale_path.write_text(json.dumps(payload), encoding="utf-8") + + def _clear_caches(): + i18n._entity_cache.clear() + entity_detector._build_patterns.cache_clear() + entity_detector._pronoun_re.cache_clear() + entity_detector._get_stopwords.cache_clear() + + _clear_caches() + try: + yield locale_path + finally: + try: + locale_path.unlink() + except OSError: + pass + _clear_caches() + + +def test_extract_candidates_default_languages_is_english_only(): + """Default languages tuple = ('en',) — accented names dropped (as today).""" + text = "João said hi. João laughed. João waved. João decided." 
+ result = extract_candidates(text) # default ("en",) + assert "João" not in result + + +def test_extract_candidates_with_extra_locale_picks_up_new_charset(): + """A locale with a Latin+diacritics candidate_pattern catches accented names.""" + locale = { + "candidate_pattern": "[A-ZÀ-Ú][a-zà-ÿ]{1,19}", + "multi_word_pattern": "[A-ZÀ-Ú][a-zà-ÿ]+(?:\\s+[A-ZÀ-Ú][a-zà-ÿ]+)+", + "person_verb_patterns": [], + "pronoun_patterns": [], + "dialogue_patterns": [], + "project_verb_patterns": [], + "stopwords": [], + } + with _temp_locale("zz-test-latin", locale): + text = "João said hi. João laughed. João waved. João decided." + result = extract_candidates(text, languages=("en", "zz-test-latin")) + assert "João" in result + assert result["João"] >= 3 + + +def test_extract_candidates_with_cyrillic_locale(): + """A locale with a Cyrillic candidate_pattern catches Russian names.""" + locale = { + "candidate_pattern": "[А-ЯЁ][а-яё]{1,19}", + "multi_word_pattern": "[А-ЯЁ][а-яё]+(?:\\s+[А-ЯЁ][а-яё]+)+", + "person_verb_patterns": [], + "pronoun_patterns": [], + "dialogue_patterns": [], + "project_verb_patterns": [], + "stopwords": [], + } + with _temp_locale("zz-test-cyrillic", locale): + text = "Иван сказал привет. Иван засмеялся. Иван помахал. Иван решил." + result = extract_candidates(text, languages=("en", "zz-test-cyrillic")) + assert "Иван" in result + + +def test_score_entity_unions_person_verbs_across_languages(): + """A non-English person-verb pattern fires when its locale is enabled.""" + locale = { + "candidate_pattern": "[A-Z][a-z]{1,19}", + "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+", + "person_verb_patterns": [ + "\\b{name}\\s+disse\\b", + "\\b{name}\\s+falou\\b", + "\\b{name}\\s+riu\\b", + ], + "pronoun_patterns": [], + "dialogue_patterns": [], + "project_verb_patterns": [], + "stopwords": [], + } + with _temp_locale("zz-test-verbs", locale): + text = "Maria disse oi. Maria falou. Maria riu." 
+ lines = text.splitlines() + + en_only = score_entity("Maria", text, lines, languages=("en",)) + multi = score_entity("Maria", text, lines, languages=("en", "zz-test-verbs")) + + assert multi["person_score"] > en_only["person_score"] + assert any("action" in s for s in multi["person_signals"]) + + +def test_get_entity_patterns_unknown_lang_falls_back_to_english(): + """Asking for a non-existent language returns English defaults.""" + from mempalace.i18n import get_entity_patterns + + patterns = get_entity_patterns(("zz-does-not-exist",)) + assert len(patterns["stopwords"]) > 0 + assert patterns["candidate_patterns"] # English fallback + + +def test_get_entity_patterns_dedupes_across_overlapping_languages(): + """Loading ('en', 'en') doesn't double-count patterns or stopwords.""" + from mempalace.i18n import get_entity_patterns + + single = get_entity_patterns(("en",)) + doubled = get_entity_patterns(("en", "en")) + assert len(doubled["person_verb_patterns"]) == len(single["person_verb_patterns"]) + assert len(doubled["stopwords"]) == len(single["stopwords"]) + + +def test_build_patterns_cache_is_keyed_by_language(): + """Same name with different language tuples yields different compiled sets.""" + from mempalace.entity_detector import _build_patterns + + locale = { + "candidate_pattern": "[A-Z][a-z]+", + "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+", + "person_verb_patterns": ["\\b{name}\\s+ranxx\\b"], + "pronoun_patterns": [], + "dialogue_patterns": [], + "project_verb_patterns": [], + "stopwords": [], + } + with _temp_locale("zz-test-cache", locale): + en_patterns = _build_patterns("Sam", ("en",)) + multi_patterns = _build_patterns("Sam", ("en", "zz-test-cache")) + assert len(multi_patterns["person_verbs"]) > len(en_patterns["person_verbs"]) + + +def test_normalize_langs_handles_string_input(): + """Passing a bare string instead of a tuple still works.""" + from mempalace.entity_detector import _normalize_langs + + assert _normalize_langs("en") == 
("en",) + assert _normalize_langs(["en", "pt-br"]) == ("en", "pt-br") + assert _normalize_langs(None) == ("en",) + assert _normalize_langs(()) == ("en",) + + +def test_config_entity_languages_defaults_to_english(tmp_path, monkeypatch): + """MempalaceConfig.entity_languages defaults to ['en'] with no config file.""" + from mempalace.config import MempalaceConfig + + monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False) + monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False) + cfg = MempalaceConfig(config_dir=str(tmp_path)) + assert cfg.entity_languages == ["en"] + + +def test_config_entity_languages_from_env(tmp_path, monkeypatch): + """Env var overrides config file.""" + from mempalace.config import MempalaceConfig + + monkeypatch.setenv("MEMPALACE_ENTITY_LANGUAGES", "en,pt-br,ru") + cfg = MempalaceConfig(config_dir=str(tmp_path)) + assert cfg.entity_languages == ["en", "pt-br", "ru"] + + +def test_config_set_entity_languages_persists(tmp_path, monkeypatch): + """set_entity_languages writes to disk and is read back.""" + from mempalace.config import MempalaceConfig + + monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False) + monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False) + cfg = MempalaceConfig(config_dir=str(tmp_path)) + cfg.set_entity_languages(["en", "pt-br"]) + cfg2 = MempalaceConfig(config_dir=str(tmp_path)) + assert cfg2.entity_languages == ["en", "pt-br"] + + +def test_config_set_entity_languages_empty_falls_back_to_english(tmp_path, monkeypatch): + """An empty list normalizes to ['en'].""" + from mempalace.config import MempalaceConfig + + monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False) + monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False) + cfg = MempalaceConfig(config_dir=str(tmp_path)) + result = cfg.set_entity_languages([]) + assert result == ["en"] + assert cfg.entity_languages == ["en"]