Merge pull request #911 from MemPalace/refactor/entity-detector-i18n

refactor(entity_detector): make multi-language extensible via i18n JSON
This commit is contained in:
Igor Lins e Silva
2026-04-15 09:40:36 -03:00
committed by GitHub
7 changed files with 646 additions and 421 deletions
+25 -2
View File
@@ -73,12 +73,25 @@ def cmd_init(args):
from .entity_detector import scan_for_detection, detect_entities, confirm_entities from .entity_detector import scan_for_detection, detect_entities, confirm_entities
from .room_detector_local import detect_rooms_local from .room_detector_local import detect_rooms_local
cfg = MempalaceConfig()
# Resolve entity-detection languages: --lang overrides config.
lang_arg = getattr(args, "lang", None)
if lang_arg:
languages = [s.strip() for s in lang_arg.split(",") if s.strip()] or ["en"]
cfg.set_entity_languages(languages)
else:
languages = cfg.entity_languages
languages_tuple = tuple(languages)
# Pass 1: auto-detect people and projects from file content # Pass 1: auto-detect people and projects from file content
print(f"\n Scanning for entities in: {args.dir}") print(f"\n Scanning for entities in: {args.dir}")
if languages_tuple != ("en",):
print(f" Languages: {', '.join(languages_tuple)}")
files = scan_for_detection(args.dir) files = scan_for_detection(args.dir)
if files: if files:
print(f" Reading {len(files)} files...") print(f" Reading {len(files)} files...")
detected = detect_entities(files) detected = detect_entities(files, languages=languages_tuple)
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"]) total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
if total > 0: if total > 0:
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False)) confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
@@ -93,7 +106,7 @@ def cmd_init(args):
# Pass 2: detect rooms from folder structure # Pass 2: detect rooms from folder structure
detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False)) detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False))
MempalaceConfig().init() cfg.init()
# Pass 3: protect git repos from accidentally committing per-project files # Pass 3: protect git repos from accidentally committing per-project files
_ensure_mempalace_files_gitignored(args.dir) _ensure_mempalace_files_gitignored(args.dir)
@@ -478,6 +491,16 @@ def main():
action="store_true", action="store_true",
help="Auto-accept all detected entities (non-interactive)", help="Auto-accept all detected entities (non-interactive)",
) )
p_init.add_argument(
"--lang",
default=None,
help=(
"Comma-separated language codes for entity detection "
"(e.g. 'en' or 'en,pt-br'). Defaults to value from config "
"(MEMPALACE_ENTITY_LANGUAGES env var or config.json), or 'en'. "
"When given, the value is also persisted to config.json."
),
)
# mine # mine
p_mine = sub.add_parser("mine", help="Mine files into the palace") p_mine = sub.add_parser("mine", help="Mine files into the palace")
+36
View File
@@ -197,6 +197,42 @@ class MempalaceConfig:
"""Mapping of hall names to keyword lists.""" """Mapping of hall names to keyword lists."""
return self._file_config.get("hall_keywords", DEFAULT_HALL_KEYWORDS) return self._file_config.get("hall_keywords", DEFAULT_HALL_KEYWORDS)
@property
def entity_languages(self):
    """Return the language codes whose entity-detection patterns apply.

    Resolution order:

    1. The ``MEMPALACE_ENTITY_LANGUAGES`` environment variable, or
       ``MEMPAL_ENTITY_LANGUAGES`` when the former is unset or empty.
       The value is a comma-separated list of codes.
    2. The ``entity_languages`` list in ``config.json``.
    3. ``["en"]``.

    Both sources are normalised the same way: every entry is stripped of
    surrounding whitespace and blank entries are dropped. If nothing
    usable remains, ``["en"]`` is returned.

    Returns:
        list[str]: a non-empty list of language codes.
    """
    env_val = os.environ.get("MEMPALACE_ENTITY_LANGUAGES") or os.environ.get(
        "MEMPAL_ENTITY_LANGUAGES"
    )
    if env_val:
        return [s.strip() for s in env_val.split(",") if s.strip()] or ["en"]

    cfg = self._file_config.get("entity_languages")
    if isinstance(cfg, list):
        # config.json may be hand-edited: coerce to str, trim padding and
        # drop blank/null entries so they cannot become bogus locale codes.
        cleaned = [str(s).strip() for s in cfg if s is not None]
        cleaned = [s for s in cleaned if s]
        if cleaned:
            return cleaned
    return ["en"]
def set_entity_languages(self, languages):
    """Store the entity-detection language list and persist it to ``config.json``.

    Args:
        languages: an iterable of language codes, or a single
            comma-separated string such as ``"en,pt-br"``. Entries are
            coerced to ``str`` and stripped; empty or falsy entries are
            dropped.

    Returns:
        list[str]: the normalised list that was stored under
        ``entity_languages`` in the in-memory config. It is ``["en"]``
        when no usable code was supplied.

    Writing to disk is best-effort. The in-memory config is always
    updated, and a failure to create the config directory or to write
    the file is ignored rather than raised.
    """
    if isinstance(languages, str):
        # A bare string would otherwise be iterated character by character.
        languages = languages.split(",")
    stripped = (str(s).strip() for s in (languages or ()) if s)
    normalized = [s for s in stripped if s]
    if not normalized:
        normalized = ["en"]
    self._file_config["entity_languages"] = normalized

    # Directory creation belongs to the same best-effort step as the write:
    # an unwritable location must not turn a preference update into a crash.
    try:
        self._config_dir.mkdir(parents=True, exist_ok=True)
        with open(self._config_file, "w", encoding="utf-8") as f:
            json.dump(self._file_config, f, indent=2, ensure_ascii=False)
    except OSError:
        # Nothing was written, so there is no file whose mode needs tightening.
        return normalized

    try:
        # Owner read/write only.
        self._config_file.chmod(0o600)
    except (OSError, NotImplementedError):
        pass
    return normalized
@property @property
def hook_silent_save(self): def hook_silent_save(self):
"""Whether the stop hook saves directly (True) or blocks for MCP calls (False).""" """Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
+151 -416
View File
@@ -9,9 +9,21 @@ Two-pass approach:
Used by mempalace init before mining begins. Used by mempalace init before mining begins.
The confirmed entity map feeds the miner as the taxonomy. The confirmed entity map feeds the miner as the taxonomy.
Multi-language support:
All lexical patterns (person verbs, pronouns, dialogue markers, project
verbs, stopwords, and the candidate-extraction character class) live in
the ``entity`` section of ``mempalace/i18n/<lang>.json``. Every public
function accepts a ``languages`` tuple and applies the union of the
requested locales' patterns. The default is ``("en",)`` — existing
English-only callers behave exactly as before.
To add a new language: add an ``entity`` section to that locale's JSON.
No code changes required.
Usage: Usage:
from entity_detector import detect_entities, confirm_entities from mempalace.entity_detector import detect_entities, confirm_entities
candidates = detect_entities(file_paths) candidates = detect_entities(file_paths) # English only
candidates = detect_entities(paths, languages=("en", "pt-br"))
confirmed = confirm_entities(candidates) # interactive review confirmed = confirm_entities(candidates) # interactive review
""" """
@@ -21,382 +33,46 @@ import functools
from pathlib import Path from pathlib import Path
from collections import defaultdict from collections import defaultdict
from mempalace.i18n import get_entity_patterns
# ==================== SIGNAL PATTERNS ====================
# Person signals — things people do # ==================== LANGUAGE-AWARE PATTERN LOADING ====================
PERSON_VERB_PATTERNS = [
r"\b{name}\s+said\b",
r"\b{name}\s+asked\b",
r"\b{name}\s+told\b",
r"\b{name}\s+replied\b",
r"\b{name}\s+laughed\b",
r"\b{name}\s+smiled\b",
r"\b{name}\s+cried\b",
r"\b{name}\s+felt\b",
r"\b{name}\s+thinks?\b",
r"\b{name}\s+wants?\b",
r"\b{name}\s+loves?\b",
r"\b{name}\s+hates?\b",
r"\b{name}\s+knows?\b",
r"\b{name}\s+decided\b",
r"\b{name}\s+pushed\b",
r"\b{name}\s+wrote\b",
r"\bhey\s+{name}\b",
r"\bthanks?\s+{name}\b",
r"\bhi\s+{name}\b",
r"\bdear\s+{name}\b",
]
# Person signals — pronouns resolving nearby
PRONOUN_PATTERNS = [
r"\bshe\b",
r"\bher\b",
r"\bhers\b",
r"\bhe\b",
r"\bhim\b",
r"\bhis\b",
r"\bthey\b",
r"\bthem\b",
r"\btheir\b",
]
PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE) def _normalize_langs(languages) -> tuple:
"""Coerce a language input into a non-empty hashable tuple."""
if not languages:
return ("en",)
if isinstance(languages, str):
return (languages,)
return tuple(languages)
# Person signals — dialogue markers
DIALOGUE_PATTERNS = [
r"^>\s*{name}[:\s]", # > Speaker: ...
r"^{name}:\s", # Speaker: ...
r"^\[{name}\]", # [Speaker]
r'"{name}\s+said',
]
# Project signals — things projects have/do @functools.lru_cache(maxsize=32)
PROJECT_VERB_PATTERNS = [ def _get_stopwords(languages: tuple) -> frozenset:
r"\bbuilding\s+{name}\b", """Return the union of stopwords across the given languages."""
r"\bbuilt\s+{name}\b", patterns = get_entity_patterns(languages)
r"\bship(?:ping|ped)?\s+{name}\b", return frozenset(patterns["stopwords"])
r"\blaunch(?:ing|ed)?\s+{name}\b",
r"\bdeploy(?:ing|ed)?\s+{name}\b",
r"\binstall(?:ing|ed)?\s+{name}\b",
r"\bthe\s+{name}\s+architecture\b",
r"\bthe\s+{name}\s+pipeline\b",
r"\bthe\s+{name}\s+system\b",
r"\bthe\s+{name}\s+repo\b",
r"\b{name}\s+v\d+\b", # MemPal v2
r"\b{name}\.py\b", # mempalace.py
r"\b{name}-core\b", # mempal-core (hyphen only, not underscore)
r"\b{name}-local\b",
r"\bimport\s+{name}\b",
r"\bpip\s+install\s+{name}\b",
]
# Words that are almost certainly NOT entities
STOPWORDS = { # ==================== BACKWARD-COMPAT MODULE CONSTANTS ====================
"the", #
"a", # These mirror the old module-level constants so existing imports keep working.
"an", # They reflect the English defaults and are populated at import time from
"and", # ``mempalace/i18n/en.json``. Callers that need multi-language behavior should
"or", # pass the ``languages`` parameter to the public functions below.
"but",
"in", _EN = get_entity_patterns(("en",))
"on",
"at", PERSON_VERB_PATTERNS = list(_EN["person_verb_patterns"])
"to", PRONOUN_PATTERNS = list(_EN["pronoun_patterns"])
"for", PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE) if PRONOUN_PATTERNS else None
"of", DIALOGUE_PATTERNS = list(_EN["dialogue_patterns"])
"with", PROJECT_VERB_PATTERNS = list(_EN["project_verb_patterns"])
"by", STOPWORDS = set(_EN["stopwords"])
"from",
"as",
"is", # ==================== EXTENSION POINTS (not language-scoped) ====================
"was",
"are",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"do",
"does",
"did",
"will",
"would",
"could",
"should",
"may",
"might",
"must",
"shall",
"can",
"this",
"that",
"these",
"those",
"it",
"its",
"they",
"them",
"their",
"we",
"our",
"you",
"your",
"i",
"my",
"me",
"he",
"she",
"his",
"her",
"who",
"what",
"when",
"where",
"why",
"how",
"which",
"if",
"then",
"so",
"not",
"no",
"yes",
"ok",
"okay",
"just",
"very",
"really",
"also",
"already",
"still",
"even",
"only",
"here",
"there",
"now",
"then",
"too",
"up",
"out",
"about",
"like",
"use",
"get",
"got",
"make",
"made",
"take",
"put",
"come",
"go",
"see",
"know",
"think",
"true",
"false",
"none",
"null",
"new",
"old",
"all",
"any",
"some",
"true",
"false",
"return",
"print",
"def",
"class",
"import",
"from",
# Common capitalized words in prose that aren't entities
"step",
"usage",
"run",
"check",
"find",
"add",
"get",
"set",
"list",
"args",
"dict",
"str",
"int",
"bool",
"path",
"file",
"type",
"name",
"note",
"example",
"option",
"result",
"error",
"warning",
"info",
"every",
"each",
"more",
"less",
"next",
"last",
"first",
"second",
"stack",
"layer",
"mode",
"test",
"stop",
"start",
"copy",
"move",
"source",
"target",
"output",
"input",
"data",
"item",
"key",
"value",
"returns",
"raises",
"yields",
"none",
"self",
"cls",
"kwargs",
# Common sentence-starting / abstract words that aren't entities
"world",
"well",
"want",
"topic",
"choose",
"social",
"cars",
"phones",
"healthcare",
"ex",
"machina",
"deus",
"human",
"humans",
"people",
"things",
"something",
"nothing",
"everything",
"anything",
"someone",
"everyone",
"anyone",
"way",
"time",
"day",
"life",
"place",
"thing",
"part",
"kind",
"sort",
"case",
"point",
"idea",
"fact",
"sense",
"question",
"answer",
"reason",
"number",
"version",
"system",
# Greetings and filler words at sentence starts
"hey",
"hi",
"hello",
"thanks",
"thank",
"right",
"let",
"ok",
# UI/action words that appear in how-to content
"click",
"hit",
"press",
"tap",
"drag",
"drop",
"open",
"close",
"save",
"load",
"launch",
"install",
"download",
"upload",
"scroll",
"select",
"enter",
"submit",
"cancel",
"confirm",
"delete",
"copy",
"paste",
"type",
"write",
"read",
"search",
"find",
"show",
"hide",
# Common filesystem/technical capitalized words
"desktop",
"documents",
"downloads",
"users",
"home",
"library",
"applications",
"system",
"preferences",
"settings",
"terminal",
# Abstract/topic words
"actor",
"vector",
"remote",
"control",
"duration",
"fetch",
# Abstract concepts that appear as subjects but aren't entities
"agents",
"tools",
"others",
"guards",
"ethics",
"regulation",
"learning",
"thinking",
"memory",
"language",
"intelligence",
"technology",
"society",
"culture",
"future",
"history",
"science",
"model",
"models",
"network",
"networks",
"training",
"inference",
}
# For entity detection — prose only, no code files # For entity detection — prose only, no code files
# Code files have too many capitalized names (classes, functions) that aren't entities # Code files have too many capitalized names (classes, functions) that aren't entities
@@ -443,56 +119,107 @@ SKIP_DIRS = {
# ==================== CANDIDATE EXTRACTION ==================== # ==================== CANDIDATE EXTRACTION ====================
def extract_candidates(text: str, languages=("en",)) -> dict:
    """Extract capitalized proper-noun candidates from ``text``.

    Each requested language contributes its own single-word and multi-word
    candidate regex (for English, an ASCII capital followed by lowercase
    letters). Every pattern is run over the text independently and the
    results are unioned.

    A name matched by several patterns is counted once per occurrence,
    not once per pattern: its frequency is the highest count any single
    pattern produced. Overlapping character classes, or a language listed
    twice, therefore cannot inflate a name past the threshold.

    Args:
        text: the text to scan.
        languages: language codes whose patterns should be applied.

    Returns:
        dict: ``{name: frequency}`` for names appearing 3+ times. Single
        words that are stopwords or shorter than two characters are
        ignored, as are phrases containing any stopword.
    """
    langs = _normalize_langs(languages)
    patterns = get_entity_patterns(langs)
    stopwords = _get_stopwords(langs)

    def _is_word(word):
        return len(word) >= 2 and word.lower() not in stopwords

    def _is_phrase(phrase):
        return not any(w.lower() in stopwords for w in phrase.split())

    def _tally(raw_pat, keep):
        """Count accepted matches of one locale pattern; {} if it won't compile."""
        try:
            rx = re.compile(rf"\b({raw_pat})\b")
        except re.error:
            return {}
        found = defaultdict(int)
        # group(1) is always the whole wrapped match, even if the locale
        # pattern declares capture groups of its own.
        for match in rx.finditer(text):
            token = match.group(1)
            if keep(token):
                found[token] += 1
        return found

    counts = {}
    for field, keep in (
        ("candidate_patterns", _is_word),
        ("multi_word_patterns", _is_phrase),
    ):
        # dict.fromkeys drops repeated patterns while keeping their order.
        for raw_pat in dict.fromkeys(patterns[field]):
            for name, seen in _tally(raw_pat, keep).items():
                if seen > counts.get(name, 0):
                    counts[name] = seen

    # Filter: must appear at least 3 times to be a candidate
    return {name: count for name, count in counts.items() if count >= 3}
# ==================== SIGNAL SCORING ==================== # ==================== SIGNAL SCORING ====================
@functools.lru_cache(maxsize=256)
def _build_patterns(name: str, languages: tuple = ("en",)) -> dict:
    """Pre-compile all regex patterns for a single entity name, per language set.

    Every template from the merged locale data has its ``{name}``
    placeholder filled with the regex-escaped ``name`` and is then
    compiled. A template that cannot be formatted or compiled is skipped,
    so one bad locale entry does not disable the others.

    Args:
        name: the candidate entity name.
        languages: language codes whose templates should be used. Must be
            hashable because results are memoised with ``lru_cache``.

    Returns:
        dict: lists of compiled patterns under ``dialogue``,
        ``person_verbs``, ``project_verbs`` and ``direct``, plus single
        compiled patterns under ``versioned`` and ``code_ref``.
    """
    n = re.escape(name)
    sources = get_entity_patterns(_normalize_langs(languages))

    def _compile_each(raw_patterns, flags=re.IGNORECASE):
        compiled = []
        for template in raw_patterns:
            try:
                compiled.append(re.compile(template.format(name=n), flags))
            except (re.error, KeyError, IndexError, ValueError):
                # KeyError/IndexError: a stray {...} field in the template;
                # ValueError: unbalanced braces; re.error: invalid regex.
                continue
        return compiled

    return {
        # MULTILINE so the ``^`` anchors in dialogue templates match per line.
        "dialogue": _compile_each(sources["dialogue_patterns"], re.MULTILINE | re.IGNORECASE),
        "person_verbs": _compile_each(sources["person_verb_patterns"]),
        "project_verbs": _compile_each(sources["project_verb_patterns"]),
        "direct": _compile_each(sources.get("direct_address_patterns") or []),
        "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
        "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
    }
def score_entity(name: str, text: str, lines: list) -> dict: @functools.lru_cache(maxsize=32)
def _pronoun_re(languages: tuple):
    """Build one case-insensitive regex matching any pronoun of ``languages``.

    Returns ``None`` when the merged locale data lists no pronoun patterns
    or when their alternation is not a valid regular expression.
    """
    merged = get_entity_patterns(_normalize_langs(languages))
    alternatives = merged.get("pronoun_patterns") or []
    if alternatives:
        try:
            return re.compile("|".join(alternatives), re.IGNORECASE)
        except re.error:
            # Fall through to the "no usable pronoun regex" result.
            pass
    return None
def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:
""" """
Score a candidate entity as person vs project. Score a candidate entity as person vs project.
Returns scores and the signals that fired. Returns scores and the signals that fired.
""" """
patterns = _build_patterns(name) langs = _normalize_langs(languages)
patterns = _build_patterns(name, langs)
pronoun_re = _pronoun_re(langs)
person_score = 0 person_score = 0
project_score = 0 project_score = 0
person_signals = [] person_signals = []
@@ -515,22 +242,25 @@ def score_entity(name: str, text: str, lines: list) -> dict:
person_signals.append(f"'{name} ...' action ({matches}x)") person_signals.append(f"'{name} ...' action ({matches}x)")
# Pronoun proximity — pronouns within 3 lines of the name # Pronoun proximity — pronouns within 3 lines of the name
name_lower = name.lower() if pronoun_re is not None:
name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()] name_lower = name.lower()
pronoun_hits = 0 name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
for idx in name_line_indices: pronoun_hits = 0
window_text = " ".join(lines[max(0, idx - 2) : idx + 3]) for idx in name_line_indices:
if PRONOUN_RE.search(window_text): window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
pronoun_hits += 1 if pronoun_re.search(window_text):
if pronoun_hits > 0: pronoun_hits += 1
person_score += pronoun_hits * 2 if pronoun_hits > 0:
person_signals.append(f"pronoun nearby ({pronoun_hits}x)") person_score += pronoun_hits * 2
person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
# Direct address # Direct address
direct = len(patterns["direct"].findall(text)) direct_hits = 0
if direct > 0: for rx in patterns["direct"]:
person_score += direct * 4 direct_hits += len(rx.findall(text))
person_signals.append(f"addressed directly ({direct}x)") if direct_hits > 0:
person_score += direct_hits * 4
person_signals.append(f"addressed directly ({direct_hits}x)")
# --- Project signals --- # --- Project signals ---
@@ -631,13 +361,15 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
# ==================== MAIN DETECT ==================== # ==================== MAIN DETECT ====================
def detect_entities(file_paths: list, max_files: int = 10) -> dict: def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> dict:
""" """
Scan files and detect entity candidates. Scan files and detect entity candidates.
Args: Args:
file_paths: List of Path objects to scan file_paths: List of Path objects to scan
max_files: Max files to read (for speed) max_files: Max files to read (for speed)
languages: Tuple of language codes whose entity patterns should be
applied (union). Defaults to ``("en",)``.
Returns: Returns:
{ {
@@ -646,6 +378,8 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
"uncertain":[...entity dicts...], "uncertain":[...entity dicts...],
} }
""" """
langs = _normalize_langs(languages)
# Collect text from files # Collect text from files
all_text = [] all_text = []
all_lines = [] all_lines = []
@@ -668,7 +402,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
combined_text = "\n".join(all_text) combined_text = "\n".join(all_text)
# Extract candidates # Extract candidates
candidates = extract_candidates(combined_text) candidates = extract_candidates(combined_text, languages=langs)
if not candidates: if not candidates:
return {"people": [], "projects": [], "uncertain": []} return {"people": [], "projects": [], "uncertain": []}
@@ -679,7 +413,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
uncertain = [] uncertain = []
for name, frequency in sorted(candidates.items(), key=lambda x: x[1], reverse=True): for name, frequency in sorted(candidates.items(), key=lambda x: x[1], reverse=True):
scores = score_entity(name, combined_text, all_lines) scores = score_entity(name, combined_text, all_lines, languages=langs)
entity = classify_entity(name, frequency, scores) entity = classify_entity(name, frequency, scores)
if entity["type"] == "person": if entity["type"] == "person":
@@ -843,13 +577,14 @@ if __name__ == "__main__":
import sys import sys
if len(sys.argv) < 2: if len(sys.argv) < 2:
print("Usage: python entity_detector.py <directory>") print("Usage: python entity_detector.py <directory> [lang1,lang2,...]")
sys.exit(1) sys.exit(1)
project_dir = sys.argv[1] project_dir = sys.argv[1]
print(f"Scanning: {project_dir}") langs = tuple(sys.argv[2].split(",")) if len(sys.argv) >= 3 else ("en",)
print(f"Scanning: {project_dir} (languages: {', '.join(langs)})")
files = scan_for_detection(project_dir) files = scan_for_detection(project_dir)
print(f"Reading {len(files)} files...") print(f"Reading {len(files)} files...")
detected = detect_entities(files) detected = detect_entities(files, languages=langs)
confirmed = confirm_entities(detected) confirmed = confirm_entities(detected)
print("Confirmed entities:", confirmed) print("Confirmed entities:", confirmed)
+7 -3
View File
@@ -583,15 +583,19 @@ class EntityRegistry:
# ── Learn from sessions ────────────────────────────────────────────────── # ── Learn from sessions ──────────────────────────────────────────────────
def learn_from_text(self, text: str, min_confidence: float = 0.75) -> list: def learn_from_text(self, text: str, min_confidence: float = 0.75, languages=("en",)) -> list:
""" """
Scan session text for new entity candidates. Scan session text for new entity candidates.
Returns list of newly discovered candidates for review. Returns list of newly discovered candidates for review.
``languages`` is forwarded to entity detection — pass the user's
configured ``MempalaceConfig().entity_languages`` to match the
locales used at ``mempalace init`` time.
""" """
from mempalace.entity_detector import extract_candidates, score_entity, classify_entity from mempalace.entity_detector import extract_candidates, score_entity, classify_entity
lines = text.splitlines() lines = text.splitlines()
candidates = extract_candidates(text) candidates = extract_candidates(text, languages=languages)
new_candidates = [] new_candidates = []
for name, frequency in candidates.items(): for name, frequency in candidates.items():
@@ -599,7 +603,7 @@ class EntityRegistry:
if name in self.people or name in self.projects: if name in self.people or name in self.projects:
continue continue
scores = score_entity(name, text, lines) scores = score_entity(name, text, lines, languages=languages)
entity = classify_entity(name, frequency, scores) entity = classify_entity(name, frequency, scores)
if entity["type"] == "person" and entity["confidence"] >= min_confidence: if entity["type"] == "person" and entity["confidence"] >= min_confidence:
+114
View File
@@ -7,6 +7,10 @@ Usage:
print(t("cli.mine_start", path="/docs")) # "Extraction de /docs..." print(t("cli.mine_start", path="/docs")) # "Extraction de /docs..."
print(t("terms.wing")) # "aile" print(t("terms.wing")) # "aile"
print(t("aaak.instruction")) # AAAK compression instruction in French print(t("aaak.instruction")) # AAAK compression instruction in French
Each locale JSON may include an ``entity`` section with patterns used by
``mempalace.entity_detector``. See ``get_entity_patterns`` for the merge rules
and the README section "Adding a new language" for the schema.
""" """
import json import json
@@ -16,6 +20,9 @@ _LANG_DIR = Path(__file__).parent
_strings: dict = {} _strings: dict = {}
_current_lang: str = "en" _current_lang: str = "en"
# Cache: tuple(langs) -> merged entity pattern dict
_entity_cache: dict = {}
def available_languages() -> list[str]: def available_languages() -> list[str]:
"""Return list of available language codes.""" """Return list of available language codes."""
@@ -72,5 +79,112 @@ def get_regex() -> dict:
return _strings.get("regex", {}) return _strings.get("regex", {})
def _load_entity_section(lang: str) -> dict:
    """Load the raw ``entity`` section of one locale file.

    Args:
        lang: language code; the file read is ``<lang>.json`` inside the
            i18n directory.

    Returns:
        dict: the locale's ``entity`` mapping. ``{}`` is returned when the
        file is missing or unreadable, is not valid UTF-8 or valid JSON,
        is not a JSON object, or has no ``entity`` object.
    """
    lang_file = _LANG_DIR / f"{lang}.json"
    try:
        data = json.loads(lang_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    if not isinstance(data, dict):
        return {}
    section = data.get("entity")
    # Callers use ``.get`` on the result, so anything but a dict is unusable.
    return section if isinstance(section, dict) else {}
def get_entity_patterns(languages=("en",)) -> dict:
    """Return merged entity detection patterns for the requested languages.

    Entity detection patterns live under each locale's ``entity`` section.
    This function merges them into a single dict for consumption by
    ``mempalace.entity_detector``.

    Args:
        languages: an iterable of language codes, or a single code given
            as a string. Empty input means ``("en",)``.

    Returns:
        dict with these keys, all de-duplicated with first occurrence kept:

        - ``person_verb_patterns``, ``pronoun_patterns``,
          ``dialogue_patterns``, ``project_verb_patterns``: each locale's
          list, concatenated in the order of ``languages``.
        - ``candidate_patterns``, ``multi_word_patterns``,
          ``direct_address_patterns``: built from each locale's singular
          ``candidate_pattern`` / ``multi_word_pattern`` /
          ``direct_address_pattern`` entry. Patterns are kept separate
          rather than joined, since they may use different character
          classes; callers run each one independently. Identical patterns
          appear once, so a repeated language or two locales sharing a
          pattern cannot make callers count the same match twice.
        - ``stopwords``: lower-cased union across all locales, as a sorted
          list.

    If no requested language declares entity data, English is used as a
    fallback. Results are cached per language tuple and the cached dict
    itself is returned, so callers should treat it as read-only.
    """
    if not languages:
        languages = ("en",)
    elif isinstance(languages, str):
        # tuple("pt-br") would split the code into single characters.
        languages = (languages,)
    key = tuple(languages)
    if key in _entity_cache:
        return _entity_cache[key]

    # Output key -> singular key in the locale JSON (one pattern per locale).
    single_fields = (
        ("candidate_patterns", "candidate_pattern"),
        ("multi_word_patterns", "multi_word_pattern"),
        ("direct_address_patterns", "direct_address_pattern"),
    )
    # Fields that are already lists in the locale JSON.
    list_fields = (
        "person_verb_patterns",
        "pronoun_patterns",
        "dialogue_patterns",
        "project_verb_patterns",
    )
    collected = {
        "candidate_patterns": [],
        "multi_word_patterns": [],
        "person_verb_patterns": [],
        "pronoun_patterns": [],
        "dialogue_patterns": [],
        "direct_address_patterns": [],
        "project_verb_patterns": [],
    }
    stopwords: set = set()

    def _absorb(section: dict) -> None:
        """Fold one locale's entity section into the accumulators."""
        for out_key, src_key in single_fields:
            if section.get(src_key):
                collected[out_key].append(section[src_key])
        for field in list_fields:
            collected[field].extend(section.get(field, []))
        stopwords.update(w.lower() for w in section.get("stopwords", []))

    # dict.fromkeys: load each distinct language once, in request order.
    sections = [s for s in map(_load_entity_section, dict.fromkeys(key)) if s]
    if not sections:
        # Fallback: load English directly
        sections = [_load_entity_section("en")]
    for section in sections:
        _absorb(section)

    merged = {field: _dedupe(values) for field, values in collected.items()}
    merged["stopwords"] = sorted(stopwords)
    _entity_cache[key] = merged
    return merged
def _dedupe(items: list) -> list:
    """Return ``items`` without repeats, keeping each value at its first position."""
    # Dict keys are unique and keep insertion order, so the key sequence is
    # exactly the first occurrence of every item.
    return list(dict.fromkeys(items))
# Auto-load English on import # Auto-load English on import
load_lang("en") load_lang("en")
+102
View File
@@ -40,5 +40,107 @@
"stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false", "stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false",
"quote_pattern": "\"([^\"]{20,200})\"", "quote_pattern": "\"([^\"]{20,200})\"",
"action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}" "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
},
"entity": {
"candidate_pattern": "[A-Z][a-z]{1,19}",
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
"person_verb_patterns": [
"\\b{name}\\s+said\\b",
"\\b{name}\\s+asked\\b",
"\\b{name}\\s+told\\b",
"\\b{name}\\s+replied\\b",
"\\b{name}\\s+laughed\\b",
"\\b{name}\\s+smiled\\b",
"\\b{name}\\s+cried\\b",
"\\b{name}\\s+felt\\b",
"\\b{name}\\s+thinks?\\b",
"\\b{name}\\s+wants?\\b",
"\\b{name}\\s+loves?\\b",
"\\b{name}\\s+hates?\\b",
"\\b{name}\\s+knows?\\b",
"\\b{name}\\s+decided\\b",
"\\b{name}\\s+pushed\\b",
"\\b{name}\\s+wrote\\b",
"\\bhey\\s+{name}\\b",
"\\bthanks?\\s+{name}\\b",
"\\bhi\\s+{name}\\b",
"\\bdear\\s+{name}\\b"
],
"pronoun_patterns": [
"\\bshe\\b",
"\\bher\\b",
"\\bhers\\b",
"\\bhe\\b",
"\\bhim\\b",
"\\bhis\\b",
"\\bthey\\b",
"\\bthem\\b",
"\\btheir\\b"
],
"dialogue_patterns": [
"^>\\s*{name}[:\\s]",
"^{name}:\\s",
"^\\[{name}\\]",
"\"{name}\\s+said"
],
"direct_address_pattern": "\\bhey\\s+{name}\\b|\\bthanks?\\s+{name}\\b|\\bhi\\s+{name}\\b",
"project_verb_patterns": [
"\\bbuilding\\s+{name}\\b",
"\\bbuilt\\s+{name}\\b",
"\\bship(?:ping|ped)?\\s+{name}\\b",
"\\blaunch(?:ing|ed)?\\s+{name}\\b",
"\\bdeploy(?:ing|ed)?\\s+{name}\\b",
"\\binstall(?:ing|ed)?\\s+{name}\\b",
"\\bthe\\s+{name}\\s+architecture\\b",
"\\bthe\\s+{name}\\s+pipeline\\b",
"\\bthe\\s+{name}\\s+system\\b",
"\\bthe\\s+{name}\\s+repo\\b",
"\\b{name}\\s+v\\d+\\b",
"\\b{name}\\.py\\b",
"\\b{name}-core\\b",
"\\b{name}-local\\b",
"\\bimport\\s+{name}\\b",
"\\bpip\\s+install\\s+{name}\\b"
],
"stopwords": [
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
"for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
"be", "been", "being", "have", "has", "had", "do", "does", "did",
"will", "would", "could", "should", "may", "might", "must", "shall", "can",
"this", "that", "these", "those", "it", "its", "they", "them", "their",
"we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her",
"who", "what", "when", "where", "why", "how", "which",
"if", "then", "so", "not", "no", "yes", "ok", "okay",
"just", "very", "really", "also", "already", "still", "even", "only",
"here", "there", "now", "too", "up", "out", "about", "like",
"use", "get", "got", "make", "made", "take", "put", "come", "go", "see",
"know", "think", "true", "false", "none", "null", "new", "old", "all", "any", "some",
"return", "print", "def", "class", "import",
"step", "usage", "run", "check", "find", "add", "set", "list",
"args", "dict", "str", "int", "bool", "path", "file", "type", "name",
"note", "example", "option", "result", "error", "warning", "info",
"every", "each", "more", "less", "next", "last", "first", "second",
"stack", "layer", "mode", "test", "stop", "start", "copy", "move",
"source", "target", "output", "input", "data", "item", "key", "value",
"returns", "raises", "yields", "self", "cls", "kwargs",
"world", "well", "want", "topic", "choose", "social", "cars", "phones",
"healthcare", "ex", "machina", "deus", "human", "humans", "people",
"things", "something", "nothing", "everything", "anything", "someone",
"everyone", "anyone", "way", "time", "day", "life", "place", "thing",
"part", "kind", "sort", "case", "point", "idea", "fact", "sense",
"question", "answer", "reason", "number", "version", "system",
"hey", "hi", "hello", "thanks", "thank", "right", "let",
"click", "hit", "press", "tap", "drag", "drop", "open", "close",
"save", "load", "launch", "install", "download", "upload", "scroll",
"select", "enter", "submit", "cancel", "confirm", "delete", "paste",
"write", "read", "search", "show", "hide",
"desktop", "documents", "downloads", "users", "home", "library",
"applications", "preferences", "settings", "terminal",
"actor", "vector", "remote", "control", "duration", "fetch",
"agents", "tools", "others", "guards", "ethics", "regulation",
"learning", "thinking", "memory", "language", "intelligence",
"technology", "society", "culture", "future", "history", "science",
"model", "models", "network", "networks", "training", "inference"
]
} }
} }
+211
View File
@@ -1,6 +1,9 @@
"""Tests for mempalace.entity_detector.""" """Tests for mempalace.entity_detector."""
import contextlib
import json
import os import os
from pathlib import Path
from unittest.mock import patch from unittest.mock import patch
from mempalace.entity_detector import ( from mempalace.entity_detector import (
@@ -378,3 +381,211 @@ def test_scan_for_detection_max_files(tmp_path):
(tmp_path / f"note{i}.md").write_text(f"content {i}") (tmp_path / f"note{i}.md").write_text(f"content {i}")
files = scan_for_detection(str(tmp_path), max_files=5) files = scan_for_detection(str(tmp_path), max_files=5)
assert len(files) <= 5 assert len(files) <= 5
# ── multi-language infra ───────────────────────────────────────────────
@contextlib.contextmanager
def _temp_locale(locale_code: str, entity_section: dict):
    """Temporarily install a locale JSON in mempalace/i18n/ for the test body.

    The file is written next to ``mempalace.i18n``'s own module file, and the
    locale-dependent caches (``i18n._entity_cache`` and the cached helpers
    ``_build_patterns``, ``_pronoun_re`` and ``_get_stopwords`` in
    ``entity_detector``) are cleared both after the file is written and again
    on exit.

    Args:
        locale_code: Name of the locale; becomes ``<locale_code>.json``.
        entity_section: Value stored under the ``"entity"`` key of the payload.

    Yields:
        The ``Path`` of the temporary locale file.

    Raises:
        RuntimeError: If a file for ``locale_code`` already exists, so a real
            locale is never overwritten or deleted.

    Note: this writes into the real mempalace/i18n/ directory. If a test
    process is SIGKILLed mid-test, the orphan zz-test-*.json file may break
    test_all_languages_load on the next run (the fixture's terms/cli/aaak
    sections are only stubs). Recover with `rm mempalace/i18n/zz-test-*.json`.
    """
    from mempalace import entity_detector
    from mempalace import i18n

    locale_path = Path(i18n.__file__).parent / f"{locale_code}.json"
    if locale_path.exists():
        raise RuntimeError(f"Test locale {locale_code} collides with an existing file")

    payload = {
        "lang": locale_code,
        "label": locale_code,
        "terms": {},
        "cli": {},
        "aaak": {"instruction": "test"},
        "entity": entity_section,
    }
    # Serialize up front: an unserializable entity section then fails before
    # anything has been written to disk.
    serialized = json.dumps(payload)

    def _clear_caches():
        i18n._entity_cache.clear()
        entity_detector._build_patterns.cache_clear()
        entity_detector._pronoun_re.cache_clear()
        entity_detector._get_stopwords.cache_clear()

    try:
        # Setup lives inside the try so that a failed write or a failed cache
        # clear still removes whatever was left on disk.
        locale_path.write_text(serialized, encoding="utf-8")
        _clear_caches()
        yield locale_path
    finally:
        try:
            locale_path.unlink()
        except OSError:
            # Best-effort removal: the file may never have been created.
            pass
        _clear_caches()
def test_extract_candidates_default_languages_is_english_only():
    """Without a ``languages`` argument, the accented name is not a candidate."""
    sample = "João said hi. João laughed. João waved. João decided."
    # No languages passed: exercises the default (expected to be English only).
    candidates = extract_candidates(sample)
    assert "João" not in candidates
def test_extract_candidates_with_extra_locale_picks_up_new_charset():
    """Enabling a locale whose patterns allow diacritics surfaces accented names."""
    sample = "João said hi. João laughed. João waved. João decided."
    entity_section = dict(
        candidate_pattern="[A-ZÀ-Ú][a-zà-ÿ]{1,19}",
        multi_word_pattern="[A-ZÀ-Ú][a-zà-ÿ]+(?:\\s+[A-ZÀ-Ú][a-zà-ÿ]+)+",
        person_verb_patterns=[],
        pronoun_patterns=[],
        dialogue_patterns=[],
        project_verb_patterns=[],
        stopwords=[],
    )
    with _temp_locale("zz-test-latin", entity_section):
        candidates = extract_candidates(sample, languages=("en", "zz-test-latin"))
    # The result is already computed, so it can be checked after cleanup.
    assert "João" in candidates
    assert candidates["João"] >= 3
def test_extract_candidates_with_cyrillic_locale():
    """Enabling a locale with Cyrillic patterns surfaces a Russian name."""
    sample = "Иван сказал привет. Иван засмеялся. Иван помахал. Иван решил."
    entity_section = dict(
        candidate_pattern="[А-ЯЁ][а-яё]{1,19}",
        multi_word_pattern="[А-ЯЁ][а-яё]+(?:\\s+[А-ЯЁ][а-яё]+)+",
        person_verb_patterns=[],
        pronoun_patterns=[],
        dialogue_patterns=[],
        project_verb_patterns=[],
        stopwords=[],
    )
    with _temp_locale("zz-test-cyrillic", entity_section):
        candidates = extract_candidates(sample, languages=("en", "zz-test-cyrillic"))
    assert "Иван" in candidates
def test_score_entity_unions_person_verbs_across_languages():
    """Person-verb patterns from an extra locale raise the person score."""
    sample = "Maria disse oi. Maria falou. Maria riu."
    sample_lines = sample.splitlines()
    portuguese_verbs = [
        "\\b{name}\\s+disse\\b",
        "\\b{name}\\s+falou\\b",
        "\\b{name}\\s+riu\\b",
    ]
    entity_section = dict(
        candidate_pattern="[A-Z][a-z]{1,19}",
        multi_word_pattern="[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
        person_verb_patterns=portuguese_verbs,
        pronoun_patterns=[],
        dialogue_patterns=[],
        project_verb_patterns=[],
        stopwords=[],
    )
    with _temp_locale("zz-test-verbs", entity_section):
        baseline = score_entity("Maria", sample, sample_lines, languages=("en",))
        combined = score_entity(
            "Maria", sample, sample_lines, languages=("en", "zz-test-verbs")
        )
    # Both scores were computed while the locale existed; compare afterwards.
    assert combined["person_score"] > baseline["person_score"]
    assert any("action" in signal for signal in combined["person_signals"])
def test_get_entity_patterns_unknown_lang_falls_back_to_english():
    """An unknown language code still yields non-empty stopwords and candidate patterns."""
    from mempalace.i18n import get_entity_patterns

    fallback = get_entity_patterns(("zz-does-not-exist",))
    stopwords = fallback["stopwords"]
    assert len(stopwords) > 0
    # Non-empty candidate patterns indicate the English defaults were used.
    assert fallback["candidate_patterns"]
def test_get_entity_patterns_dedupes_across_overlapping_languages():
    """Requesting ('en', 'en') yields as many verb patterns and stopwords as ('en',)."""
    from mempalace.i18n import get_entity_patterns

    once = get_entity_patterns(("en",))
    twice = get_entity_patterns(("en", "en"))
    for key in ("person_verb_patterns", "stopwords"):
        assert len(twice[key]) == len(once[key])
def test_build_patterns_cache_is_keyed_by_language():
    """The same name built with an extra locale has more person-verb patterns."""
    from mempalace.entity_detector import _build_patterns

    entity_section = dict(
        candidate_pattern="[A-Z][a-z]+",
        multi_word_pattern="[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
        person_verb_patterns=["\\b{name}\\s+ranxx\\b"],
        pronoun_patterns=[],
        dialogue_patterns=[],
        project_verb_patterns=[],
        stopwords=[],
    )
    with _temp_locale("zz-test-cache", entity_section):
        english_only = _build_patterns("Sam", ("en",))
        with_extra = _build_patterns("Sam", ("en", "zz-test-cache"))
        english_count = len(english_only["person_verbs"])
        extra_count = len(with_extra["person_verbs"])
        assert extra_count > english_count
def test_normalize_langs_handles_string_input():
    """A bare string, a list, None and an empty tuple all normalize to a tuple."""
    from mempalace.entity_detector import _normalize_langs

    expectations = [
        ("en", ("en",)),
        (["en", "pt-br"], ("en", "pt-br")),
        (None, ("en",)),
        ((), ("en",)),
    ]
    for raw, expected in expectations:
        assert _normalize_langs(raw) == expected
def test_config_entity_languages_defaults_to_english(tmp_path, monkeypatch):
    """With both env vars unset and an empty config dir, entity_languages is ['en']."""
    from mempalace.config import MempalaceConfig

    for env_var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_var, raising=False)

    config = MempalaceConfig(config_dir=str(tmp_path))
    assert config.entity_languages == ["en"]
def test_config_entity_languages_from_env(tmp_path, monkeypatch):
    """MEMPALACE_ENTITY_LANGUAGES is read as a comma-separated language list.

    No config file is written here; ``tmp_path`` is an empty config dir, so the
    value can only come from the environment.
    """
    from mempalace.config import MempalaceConfig

    # Clear the other variable first, as the sibling config tests do, so an
    # ambient value in the developer's shell cannot influence the result.
    monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False)
    monkeypatch.setenv("MEMPALACE_ENTITY_LANGUAGES", "en,pt-br,ru")
    cfg = MempalaceConfig(config_dir=str(tmp_path))
    assert cfg.entity_languages == ["en", "pt-br", "ru"]
def test_config_set_entity_languages_persists(tmp_path, monkeypatch):
    """Languages set on one config object are seen by a fresh one on the same dir."""
    from mempalace.config import MempalaceConfig

    for env_var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_var, raising=False)

    config_dir = str(tmp_path)
    writer = MempalaceConfig(config_dir=config_dir)
    writer.set_entity_languages(["en", "pt-br"])

    # A second instance on the same directory must read the stored value.
    reader = MempalaceConfig(config_dir=config_dir)
    assert reader.entity_languages == ["en", "pt-br"]
def test_config_set_entity_languages_empty_falls_back_to_english(tmp_path, monkeypatch):
    """Setting an empty list returns ['en'] and leaves entity_languages as ['en']."""
    from mempalace.config import MempalaceConfig

    for env_var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_var, raising=False)

    config = MempalaceConfig(config_dir=str(tmp_path))
    returned = config.set_entity_languages([])
    assert returned == ["en"]
    assert config.entity_languages == ["en"]