refactor(entity_detector): make multi-language extensible via i18n JSON
Move all entity-detection lexical patterns (person verbs, pronouns,
dialogue markers, project verbs, stopwords, candidate character class)
out of hardcoded module-level constants and into the entity section of
each locale's JSON in mempalace/i18n/. Adds a languages parameter to
every public function so callers union patterns across the desired
locales. The default stays ("en",), so all existing callers and tests
behave unchanged.
Also adds:
- get_entity_patterns(langs) helper in mempalace/i18n/ that merges
patterns across requested languages, dedupes lists, unions stopwords,
and falls back to English when none of the requested locales provides
entity data (an unknown locale listed next to a known one is skipped)
- MempalaceConfig.entity_languages property + setter, with env var
override (MEMPALACE_ENTITY_LANGUAGES, comma-separated)
- mempalace init --lang en,pt-br flag (persists to config.json)
- Per-language candidate_pattern so non-Latin scripts (Cyrillic,
Devanagari, CJK) can register their own character classes instead of
being silently dropped by the ASCII-only [A-Z][a-z]+ default
- _build_patterns LRU cache keyed by (name, languages) so multi-language
callers don't poison each other's cache slots
Why now: the open language PRs (#760 ru, #773 hi, #778 id, #907 it) only
add CLI strings via mempalace/i18n/. PR #156 (pt-br) is the first that
needed entity_detector changes and inlined a _PTBR variant of every
constant. That doesn't scale past 2-3 languages — every text gets
checked against every language's patterns regardless of relevance, and
candidate extraction still drops accented and non-Latin names.
This PR sets the standard so future locale contributors only edit one
JSON file (no Python changes), and entity detection scales linearly
with how many languages a user actually enabled, not how many ship.
This commit is contained in:
+25
-2
@@ -73,12 +73,25 @@ def cmd_init(args):
|
||||
from .entity_detector import scan_for_detection, detect_entities, confirm_entities
|
||||
from .room_detector_local import detect_rooms_local
|
||||
|
||||
cfg = MempalaceConfig()
|
||||
|
||||
# Resolve entity-detection languages: --lang overrides config.
|
||||
lang_arg = getattr(args, "lang", None)
|
||||
if lang_arg:
|
||||
languages = [s.strip() for s in lang_arg.split(",") if s.strip()] or ["en"]
|
||||
cfg.set_entity_languages(languages)
|
||||
else:
|
||||
languages = cfg.entity_languages
|
||||
languages_tuple = tuple(languages)
|
||||
|
||||
# Pass 1: auto-detect people and projects from file content
|
||||
print(f"\n Scanning for entities in: {args.dir}")
|
||||
if languages_tuple != ("en",):
|
||||
print(f" Languages: {', '.join(languages_tuple)}")
|
||||
files = scan_for_detection(args.dir)
|
||||
if files:
|
||||
print(f" Reading {len(files)} files...")
|
||||
detected = detect_entities(files)
|
||||
detected = detect_entities(files, languages=languages_tuple)
|
||||
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
|
||||
if total > 0:
|
||||
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
|
||||
@@ -93,7 +106,7 @@ def cmd_init(args):
|
||||
|
||||
# Pass 2: detect rooms from folder structure
|
||||
detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False))
|
||||
MempalaceConfig().init()
|
||||
cfg.init()
|
||||
|
||||
# Pass 3: protect git repos from accidentally committing per-project files
|
||||
_ensure_mempalace_files_gitignored(args.dir)
|
||||
@@ -478,6 +491,16 @@ def main():
|
||||
action="store_true",
|
||||
help="Auto-accept all detected entities (non-interactive)",
|
||||
)
|
||||
p_init.add_argument(
|
||||
"--lang",
|
||||
default=None,
|
||||
help=(
|
||||
"Comma-separated language codes for entity detection "
|
||||
"(e.g. 'en' or 'en,pt-br'). Defaults to value from config "
|
||||
"(MEMPALACE_ENTITY_LANGUAGES env var or config.json), or 'en'. "
|
||||
"When given, the value is also persisted to config.json."
|
||||
),
|
||||
)
|
||||
|
||||
# mine
|
||||
p_mine = sub.add_parser("mine", help="Mine files into the palace")
|
||||
|
||||
@@ -197,6 +197,42 @@ class MempalaceConfig:
|
||||
"""Mapping of hall names to keyword lists."""
|
||||
return self._file_config.get("hall_keywords", DEFAULT_HALL_KEYWORDS)
|
||||
|
||||
@property
def entity_languages(self):
    """Languages whose entity-detection patterns should be applied.

    Resolution order:

    1. Env var ``MEMPALACE_ENTITY_LANGUAGES`` (comma-separated), or the
       ``MEMPAL_ENTITY_LANGUAGES`` spelling when the former is unset/empty.
    2. The ``entity_languages`` list in ``config.json``.
    3. ``["en"]``.

    Returns:
        A non-empty list of language-code strings. Entries are stripped of
        surrounding whitespace and blank entries are dropped, for both the
        env-var and the config-file source.
    """
    env_val = os.environ.get("MEMPALACE_ENTITY_LANGUAGES") or os.environ.get(
        "MEMPAL_ENTITY_LANGUAGES"
    )
    if env_val:
        return [s.strip() for s in env_val.split(",") if s.strip()] or ["en"]

    cfg = self._file_config.get("entity_languages")
    if isinstance(cfg, list):
        # Normalize exactly like the env-var branch so a hand-edited config
        # such as ["en", " pt-br ", ""] does not yield unusable codes.
        cleaned = [str(s).strip() for s in cfg if s is not None]
        cleaned = [code for code in cleaned if code]
        if cleaned:
            return cleaned
    return ["en"]
|
||||
|
||||
def set_entity_languages(self, languages):
    """Persist the entity-detection language list to ``config.json``.

    Args:
        languages: Iterable of language codes, or a single comma-separated
            string (e.g. ``"en,pt-br"``). ``None``, blank entries and
            surrounding whitespace are discarded.

    Returns:
        The normalized list that was stored in the in-memory config
        (``["en"]`` when nothing usable was supplied). Writing the file and
        tightening its permissions are best-effort: an ``OSError`` there is
        swallowed and the normalized list is still returned.
    """
    if isinstance(languages, str):
        # A bare string would otherwise be iterated character by character,
        # turning "en" into ["e", "n"].
        languages = languages.split(",")

    normalized = []
    for item in languages or ():
        if item is None:
            continue
        code = str(item).strip()
        if code:
            normalized.append(code)
    if not normalized:
        normalized = ["en"]

    self._file_config["entity_languages"] = normalized
    self._config_dir.mkdir(parents=True, exist_ok=True)
    try:
        with open(self._config_file, "w", encoding="utf-8") as f:
            json.dump(self._file_config, f, indent=2, ensure_ascii=False)
    except OSError:
        # Best-effort persistence: the in-memory value above still applies.
        pass
    try:
        self._config_file.chmod(0o600)
    except (OSError, NotImplementedError):
        pass
    return normalized
|
||||
|
||||
@property
|
||||
def hook_silent_save(self):
|
||||
"""Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
|
||||
|
||||
+151
-416
@@ -9,9 +9,21 @@ Two-pass approach:
|
||||
Used by mempalace init before mining begins.
|
||||
The confirmed entity map feeds the miner as the taxonomy.
|
||||
|
||||
Multi-language support:
|
||||
All lexical patterns (person verbs, pronouns, dialogue markers, project
|
||||
verbs, stopwords, and the candidate-extraction character class) live in
|
||||
the ``entity`` section of ``mempalace/i18n/<lang>.json``. Every public
|
||||
function accepts a ``languages`` tuple and applies the union of the
|
||||
requested locales' patterns. The default is ``("en",)`` — existing
|
||||
English-only callers behave exactly as before.
|
||||
|
||||
To add a new language: add an ``entity`` section to that locale's JSON.
|
||||
No code changes required.
|
||||
|
||||
Usage:
|
||||
from entity_detector import detect_entities, confirm_entities
|
||||
candidates = detect_entities(file_paths)
|
||||
from mempalace.entity_detector import detect_entities, confirm_entities
|
||||
candidates = detect_entities(file_paths) # English only
|
||||
candidates = detect_entities(paths, languages=("en", "pt-br"))
|
||||
confirmed = confirm_entities(candidates) # interactive review
|
||||
"""
|
||||
|
||||
@@ -21,382 +33,46 @@ import functools
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
from mempalace.i18n import get_entity_patterns
|
||||
|
||||
# ==================== SIGNAL PATTERNS ====================
|
||||
|
||||
# Person signals — things people do
|
||||
PERSON_VERB_PATTERNS = [
|
||||
r"\b{name}\s+said\b",
|
||||
r"\b{name}\s+asked\b",
|
||||
r"\b{name}\s+told\b",
|
||||
r"\b{name}\s+replied\b",
|
||||
r"\b{name}\s+laughed\b",
|
||||
r"\b{name}\s+smiled\b",
|
||||
r"\b{name}\s+cried\b",
|
||||
r"\b{name}\s+felt\b",
|
||||
r"\b{name}\s+thinks?\b",
|
||||
r"\b{name}\s+wants?\b",
|
||||
r"\b{name}\s+loves?\b",
|
||||
r"\b{name}\s+hates?\b",
|
||||
r"\b{name}\s+knows?\b",
|
||||
r"\b{name}\s+decided\b",
|
||||
r"\b{name}\s+pushed\b",
|
||||
r"\b{name}\s+wrote\b",
|
||||
r"\bhey\s+{name}\b",
|
||||
r"\bthanks?\s+{name}\b",
|
||||
r"\bhi\s+{name}\b",
|
||||
r"\bdear\s+{name}\b",
|
||||
]
|
||||
# ==================== LANGUAGE-AWARE PATTERN LOADING ====================
|
||||
|
||||
# Person signals — pronouns resolving nearby
|
||||
PRONOUN_PATTERNS = [
|
||||
r"\bshe\b",
|
||||
r"\bher\b",
|
||||
r"\bhers\b",
|
||||
r"\bhe\b",
|
||||
r"\bhim\b",
|
||||
r"\bhis\b",
|
||||
r"\bthey\b",
|
||||
r"\bthem\b",
|
||||
r"\btheir\b",
|
||||
]
|
||||
|
||||
PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE)
|
||||
def _normalize_langs(languages) -> tuple:
|
||||
"""Coerce a language input into a non-empty hashable tuple."""
|
||||
if not languages:
|
||||
return ("en",)
|
||||
if isinstance(languages, str):
|
||||
return (languages,)
|
||||
return tuple(languages)
|
||||
|
||||
# Person signals — dialogue markers
|
||||
DIALOGUE_PATTERNS = [
|
||||
r"^>\s*{name}[:\s]", # > Speaker: ...
|
||||
r"^{name}:\s", # Speaker: ...
|
||||
r"^\[{name}\]", # [Speaker]
|
||||
r'"{name}\s+said',
|
||||
]
|
||||
|
||||
# Project signals — things projects have/do
|
||||
PROJECT_VERB_PATTERNS = [
|
||||
r"\bbuilding\s+{name}\b",
|
||||
r"\bbuilt\s+{name}\b",
|
||||
r"\bship(?:ping|ped)?\s+{name}\b",
|
||||
r"\blaunch(?:ing|ed)?\s+{name}\b",
|
||||
r"\bdeploy(?:ing|ed)?\s+{name}\b",
|
||||
r"\binstall(?:ing|ed)?\s+{name}\b",
|
||||
r"\bthe\s+{name}\s+architecture\b",
|
||||
r"\bthe\s+{name}\s+pipeline\b",
|
||||
r"\bthe\s+{name}\s+system\b",
|
||||
r"\bthe\s+{name}\s+repo\b",
|
||||
r"\b{name}\s+v\d+\b", # MemPal v2
|
||||
r"\b{name}\.py\b", # mempalace.py
|
||||
r"\b{name}-core\b", # mempal-core (hyphen only, not underscore)
|
||||
r"\b{name}-local\b",
|
||||
r"\bimport\s+{name}\b",
|
||||
r"\bpip\s+install\s+{name}\b",
|
||||
]
|
||||
@functools.lru_cache(maxsize=32)
def _get_stopwords(languages: tuple) -> frozenset:
    """Build the immutable stopword set for ``languages``.

    The merged pattern dict from ``get_entity_patterns`` already holds the
    stopwords of every requested language; this wraps that list in a
    ``frozenset`` and memoizes the result per ``languages`` tuple.
    """
    merged = get_entity_patterns(languages)
    words = merged["stopwords"]
    return frozenset(words)
|
||||
|
||||
# Words that are almost certainly NOT entities
|
||||
STOPWORDS = {
|
||||
"the",
|
||||
"a",
|
||||
"an",
|
||||
"and",
|
||||
"or",
|
||||
"but",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"to",
|
||||
"for",
|
||||
"of",
|
||||
"with",
|
||||
"by",
|
||||
"from",
|
||||
"as",
|
||||
"is",
|
||||
"was",
|
||||
"are",
|
||||
"were",
|
||||
"be",
|
||||
"been",
|
||||
"being",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"could",
|
||||
"should",
|
||||
"may",
|
||||
"might",
|
||||
"must",
|
||||
"shall",
|
||||
"can",
|
||||
"this",
|
||||
"that",
|
||||
"these",
|
||||
"those",
|
||||
"it",
|
||||
"its",
|
||||
"they",
|
||||
"them",
|
||||
"their",
|
||||
"we",
|
||||
"our",
|
||||
"you",
|
||||
"your",
|
||||
"i",
|
||||
"my",
|
||||
"me",
|
||||
"he",
|
||||
"she",
|
||||
"his",
|
||||
"her",
|
||||
"who",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"why",
|
||||
"how",
|
||||
"which",
|
||||
"if",
|
||||
"then",
|
||||
"so",
|
||||
"not",
|
||||
"no",
|
||||
"yes",
|
||||
"ok",
|
||||
"okay",
|
||||
"just",
|
||||
"very",
|
||||
"really",
|
||||
"also",
|
||||
"already",
|
||||
"still",
|
||||
"even",
|
||||
"only",
|
||||
"here",
|
||||
"there",
|
||||
"now",
|
||||
"then",
|
||||
"too",
|
||||
"up",
|
||||
"out",
|
||||
"about",
|
||||
"like",
|
||||
"use",
|
||||
"get",
|
||||
"got",
|
||||
"make",
|
||||
"made",
|
||||
"take",
|
||||
"put",
|
||||
"come",
|
||||
"go",
|
||||
"see",
|
||||
"know",
|
||||
"think",
|
||||
"true",
|
||||
"false",
|
||||
"none",
|
||||
"null",
|
||||
"new",
|
||||
"old",
|
||||
"all",
|
||||
"any",
|
||||
"some",
|
||||
"true",
|
||||
"false",
|
||||
"return",
|
||||
"print",
|
||||
"def",
|
||||
"class",
|
||||
"import",
|
||||
"from",
|
||||
# Common capitalized words in prose that aren't entities
|
||||
"step",
|
||||
"usage",
|
||||
"run",
|
||||
"check",
|
||||
"find",
|
||||
"add",
|
||||
"get",
|
||||
"set",
|
||||
"list",
|
||||
"args",
|
||||
"dict",
|
||||
"str",
|
||||
"int",
|
||||
"bool",
|
||||
"path",
|
||||
"file",
|
||||
"type",
|
||||
"name",
|
||||
"note",
|
||||
"example",
|
||||
"option",
|
||||
"result",
|
||||
"error",
|
||||
"warning",
|
||||
"info",
|
||||
"every",
|
||||
"each",
|
||||
"more",
|
||||
"less",
|
||||
"next",
|
||||
"last",
|
||||
"first",
|
||||
"second",
|
||||
"stack",
|
||||
"layer",
|
||||
"mode",
|
||||
"test",
|
||||
"stop",
|
||||
"start",
|
||||
"copy",
|
||||
"move",
|
||||
"source",
|
||||
"target",
|
||||
"output",
|
||||
"input",
|
||||
"data",
|
||||
"item",
|
||||
"key",
|
||||
"value",
|
||||
"returns",
|
||||
"raises",
|
||||
"yields",
|
||||
"none",
|
||||
"self",
|
||||
"cls",
|
||||
"kwargs",
|
||||
# Common sentence-starting / abstract words that aren't entities
|
||||
"world",
|
||||
"well",
|
||||
"want",
|
||||
"topic",
|
||||
"choose",
|
||||
"social",
|
||||
"cars",
|
||||
"phones",
|
||||
"healthcare",
|
||||
"ex",
|
||||
"machina",
|
||||
"deus",
|
||||
"human",
|
||||
"humans",
|
||||
"people",
|
||||
"things",
|
||||
"something",
|
||||
"nothing",
|
||||
"everything",
|
||||
"anything",
|
||||
"someone",
|
||||
"everyone",
|
||||
"anyone",
|
||||
"way",
|
||||
"time",
|
||||
"day",
|
||||
"life",
|
||||
"place",
|
||||
"thing",
|
||||
"part",
|
||||
"kind",
|
||||
"sort",
|
||||
"case",
|
||||
"point",
|
||||
"idea",
|
||||
"fact",
|
||||
"sense",
|
||||
"question",
|
||||
"answer",
|
||||
"reason",
|
||||
"number",
|
||||
"version",
|
||||
"system",
|
||||
# Greetings and filler words at sentence starts
|
||||
"hey",
|
||||
"hi",
|
||||
"hello",
|
||||
"thanks",
|
||||
"thank",
|
||||
"right",
|
||||
"let",
|
||||
"ok",
|
||||
# UI/action words that appear in how-to content
|
||||
"click",
|
||||
"hit",
|
||||
"press",
|
||||
"tap",
|
||||
"drag",
|
||||
"drop",
|
||||
"open",
|
||||
"close",
|
||||
"save",
|
||||
"load",
|
||||
"launch",
|
||||
"install",
|
||||
"download",
|
||||
"upload",
|
||||
"scroll",
|
||||
"select",
|
||||
"enter",
|
||||
"submit",
|
||||
"cancel",
|
||||
"confirm",
|
||||
"delete",
|
||||
"copy",
|
||||
"paste",
|
||||
"type",
|
||||
"write",
|
||||
"read",
|
||||
"search",
|
||||
"find",
|
||||
"show",
|
||||
"hide",
|
||||
# Common filesystem/technical capitalized words
|
||||
"desktop",
|
||||
"documents",
|
||||
"downloads",
|
||||
"users",
|
||||
"home",
|
||||
"library",
|
||||
"applications",
|
||||
"system",
|
||||
"preferences",
|
||||
"settings",
|
||||
"terminal",
|
||||
# Abstract/topic words
|
||||
"actor",
|
||||
"vector",
|
||||
"remote",
|
||||
"control",
|
||||
"duration",
|
||||
"fetch",
|
||||
# Abstract concepts that appear as subjects but aren't entities
|
||||
"agents",
|
||||
"tools",
|
||||
"others",
|
||||
"guards",
|
||||
"ethics",
|
||||
"regulation",
|
||||
"learning",
|
||||
"thinking",
|
||||
"memory",
|
||||
"language",
|
||||
"intelligence",
|
||||
"technology",
|
||||
"society",
|
||||
"culture",
|
||||
"future",
|
||||
"history",
|
||||
"science",
|
||||
"model",
|
||||
"models",
|
||||
"network",
|
||||
"networks",
|
||||
"training",
|
||||
"inference",
|
||||
}
|
||||
|
||||
# ==================== BACKWARD-COMPAT MODULE CONSTANTS ====================
|
||||
#
|
||||
# These mirror the old module-level constants so existing imports keep working.
|
||||
# They reflect the English defaults and are populated at import time from
|
||||
# ``mempalace/i18n/en.json``. Callers that need multi-language behavior should
|
||||
# pass the ``languages`` parameter to the public functions below.
|
||||
|
||||
_EN = get_entity_patterns(("en",))
|
||||
|
||||
PERSON_VERB_PATTERNS = list(_EN["person_verb_patterns"])
|
||||
PRONOUN_PATTERNS = list(_EN["pronoun_patterns"])
|
||||
PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE) if PRONOUN_PATTERNS else None
|
||||
DIALOGUE_PATTERNS = list(_EN["dialogue_patterns"])
|
||||
PROJECT_VERB_PATTERNS = list(_EN["project_verb_patterns"])
|
||||
STOPWORDS = set(_EN["stopwords"])
|
||||
|
||||
|
||||
# ==================== EXTENSION POINTS (not language-scoped) ====================
|
||||
|
||||
# For entity detection — prose only, no code files
|
||||
# Code files have too many capitalized names (classes, functions) that aren't entities
|
||||
@@ -443,56 +119,107 @@ SKIP_DIRS = {
|
||||
# ==================== CANDIDATE EXTRACTION ====================
|
||||
|
||||
|
||||
def extract_candidates(text: str, languages=("en",)) -> dict:
    """
    Extract all capitalized proper noun candidates from text.

    Each language contributes its own single-word and multi-word regex,
    taken from the ``candidate_patterns`` and ``multi_word_patterns``
    entries of ``get_entity_patterns``. Matches from all languages are
    unioned: one occurrence in ``text`` is counted once, even when the
    patterns of several requested languages match the same span.

    Args:
        text: The text to scan.
        languages: Language codes whose patterns should be applied.
            Defaults to ``("en",)``.

    Returns:
        {name: frequency} for names appearing 3+ times.
    """
    langs = _normalize_langs(languages)
    patterns = get_entity_patterns(langs)
    stopwords = _get_stopwords(langs)

    counts: defaultdict = defaultdict(int)

    def _unique_matches(raw_patterns):
        """Yield each matched string once per distinct (start, end) span."""
        seen_spans = set()
        for raw_pat in raw_patterns:
            try:
                rx = re.compile(rf"\b({raw_pat})\b")
            except re.error:
                # A malformed locale pattern must not break the other locales.
                continue
            # finditer + group(1) instead of findall: findall would return
            # tuples if a locale pattern contained its own capturing group.
            for match in rx.finditer(text):
                span = match.span(1)
                if span in seen_spans:
                    # Already counted via another language's pattern.
                    continue
                seen_spans.add(span)
                yield match.group(1)

    # Single-word candidates
    for word in _unique_matches(patterns["candidate_patterns"]):
        if len(word) < 2 or word.lower() in stopwords:
            continue
        counts[word] += 1

    # Multi-word proper nouns (e.g. "Memory Palace", "Claude Code")
    for phrase in _unique_matches(patterns["multi_word_patterns"]):
        if any(w.lower() in stopwords for w in phrase.split()):
            continue
        counts[phrase] += 1

    # Filter: must appear at least 3 times to be a candidate
    return {name: count for name, count in counts.items() if count >= 3}
|
||||
|
||||
|
||||
# ==================== SIGNAL SCORING ====================
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=256)
def _build_patterns(name: str, languages: tuple = ("en",)) -> dict:
    """Pre-compile all regex patterns for a single entity name, per language set.

    Args:
        name: Candidate entity name; it is regex-escaped before substitution.
        languages: Hashable tuple of language codes. It is part of the cache
            key, so different language sets get separate cache entries.

    Returns:
        Dict with keys ``dialogue``, ``person_verbs``, ``project_verbs`` and
        ``direct`` (each a list of compiled regexes) plus ``versioned`` and
        ``code_ref`` (a single compiled regex each).
    """
    n = re.escape(name)
    langs = _normalize_langs(languages)
    sources = get_entity_patterns(langs)

    def _compile_each(raw_patterns, flags=re.IGNORECASE):
        """Compile every template that is valid; skip the ones that are not."""
        compiled = []
        for p in raw_patterns:
            try:
                compiled.append(re.compile(p.format(name=n), flags))
            except (re.error, KeyError, IndexError, ValueError):
                # KeyError/IndexError: an unknown "{...}" field in the template.
                # ValueError: a stray "{" or "}" that str.format cannot parse.
                continue
        return compiled

    return {
        "dialogue": _compile_each(sources["dialogue_patterns"], re.MULTILINE | re.IGNORECASE),
        "person_verbs": _compile_each(sources["person_verb_patterns"]),
        "project_verbs": _compile_each(sources["project_verb_patterns"]),
        "direct": _compile_each(sources.get("direct_address_patterns") or []),
        "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
        "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
    }
|
||||
|
||||
|
||||
def score_entity(name: str, text: str, lines: list) -> dict:
|
||||
@functools.lru_cache(maxsize=32)
def _pronoun_re(languages: tuple):
    """Compile a combined pronoun regex for the given languages.

    Args:
        languages: Hashable tuple of language codes (also the cache key).

    Returns:
        A case-insensitive compiled regex that alternates over every valid
        pronoun pattern, or ``None`` when no usable pattern exists.
    """
    langs = _normalize_langs(languages)
    patterns = get_entity_patterns(langs)
    pronouns = patterns.get("pronoun_patterns") or []

    # Validate each pattern on its own so one malformed entry from a single
    # locale does not disable pronoun detection for every other locale.
    usable = []
    for raw in pronouns:
        try:
            re.compile(raw, re.IGNORECASE)
        except re.error:
            continue
        usable.append(raw)

    if not usable:
        return None
    try:
        return re.compile("|".join(usable), re.IGNORECASE)
    except re.error:
        # Individually valid patterns can still clash once joined.
        return None
|
||||
|
||||
|
||||
def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:
|
||||
"""
|
||||
Score a candidate entity as person vs project.
|
||||
Returns scores and the signals that fired.
|
||||
"""
|
||||
patterns = _build_patterns(name)
|
||||
langs = _normalize_langs(languages)
|
||||
patterns = _build_patterns(name, langs)
|
||||
pronoun_re = _pronoun_re(langs)
|
||||
person_score = 0
|
||||
project_score = 0
|
||||
person_signals = []
|
||||
@@ -515,22 +242,25 @@ def score_entity(name: str, text: str, lines: list) -> dict:
|
||||
person_signals.append(f"'{name} ...' action ({matches}x)")
|
||||
|
||||
# Pronoun proximity — pronouns within 3 lines of the name
|
||||
name_lower = name.lower()
|
||||
name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
|
||||
pronoun_hits = 0
|
||||
for idx in name_line_indices:
|
||||
window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
|
||||
if PRONOUN_RE.search(window_text):
|
||||
pronoun_hits += 1
|
||||
if pronoun_hits > 0:
|
||||
person_score += pronoun_hits * 2
|
||||
person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
|
||||
if pronoun_re is not None:
|
||||
name_lower = name.lower()
|
||||
name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
|
||||
pronoun_hits = 0
|
||||
for idx in name_line_indices:
|
||||
window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
|
||||
if pronoun_re.search(window_text):
|
||||
pronoun_hits += 1
|
||||
if pronoun_hits > 0:
|
||||
person_score += pronoun_hits * 2
|
||||
person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
|
||||
|
||||
# Direct address
|
||||
direct = len(patterns["direct"].findall(text))
|
||||
if direct > 0:
|
||||
person_score += direct * 4
|
||||
person_signals.append(f"addressed directly ({direct}x)")
|
||||
direct_hits = 0
|
||||
for rx in patterns["direct"]:
|
||||
direct_hits += len(rx.findall(text))
|
||||
if direct_hits > 0:
|
||||
person_score += direct_hits * 4
|
||||
person_signals.append(f"addressed directly ({direct_hits}x)")
|
||||
|
||||
# --- Project signals ---
|
||||
|
||||
@@ -631,13 +361,15 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
|
||||
# ==================== MAIN DETECT ====================
|
||||
|
||||
|
||||
def detect_entities(file_paths: list, max_files: int = 10) -> dict:
|
||||
def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> dict:
|
||||
"""
|
||||
Scan files and detect entity candidates.
|
||||
|
||||
Args:
|
||||
file_paths: List of Path objects to scan
|
||||
max_files: Max files to read (for speed)
|
||||
languages: Tuple of language codes whose entity patterns should be
|
||||
applied (union). Defaults to ``("en",)``.
|
||||
|
||||
Returns:
|
||||
{
|
||||
@@ -646,6 +378,8 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
|
||||
"uncertain":[...entity dicts...],
|
||||
}
|
||||
"""
|
||||
langs = _normalize_langs(languages)
|
||||
|
||||
# Collect text from files
|
||||
all_text = []
|
||||
all_lines = []
|
||||
@@ -668,7 +402,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
|
||||
combined_text = "\n".join(all_text)
|
||||
|
||||
# Extract candidates
|
||||
candidates = extract_candidates(combined_text)
|
||||
candidates = extract_candidates(combined_text, languages=langs)
|
||||
|
||||
if not candidates:
|
||||
return {"people": [], "projects": [], "uncertain": []}
|
||||
@@ -679,7 +413,7 @@ def detect_entities(file_paths: list, max_files: int = 10) -> dict:
|
||||
uncertain = []
|
||||
|
||||
for name, frequency in sorted(candidates.items(), key=lambda x: x[1], reverse=True):
|
||||
scores = score_entity(name, combined_text, all_lines)
|
||||
scores = score_entity(name, combined_text, all_lines, languages=langs)
|
||||
entity = classify_entity(name, frequency, scores)
|
||||
|
||||
if entity["type"] == "person":
|
||||
@@ -843,13 +577,14 @@ if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python entity_detector.py <directory>")
|
||||
print("Usage: python entity_detector.py <directory> [lang1,lang2,...]")
|
||||
sys.exit(1)
|
||||
|
||||
project_dir = sys.argv[1]
|
||||
print(f"Scanning: {project_dir}")
|
||||
langs = tuple(sys.argv[2].split(",")) if len(sys.argv) >= 3 else ("en",)
|
||||
print(f"Scanning: {project_dir} (languages: {', '.join(langs)})")
|
||||
files = scan_for_detection(project_dir)
|
||||
print(f"Reading {len(files)} files...")
|
||||
detected = detect_entities(files)
|
||||
detected = detect_entities(files, languages=langs)
|
||||
confirmed = confirm_entities(detected)
|
||||
print("Confirmed entities:", confirmed)
|
||||
|
||||
@@ -583,15 +583,19 @@ class EntityRegistry:
|
||||
|
||||
# ── Learn from sessions ──────────────────────────────────────────────────
|
||||
|
||||
def learn_from_text(self, text: str, min_confidence: float = 0.75) -> list:
|
||||
def learn_from_text(self, text: str, min_confidence: float = 0.75, languages=("en",)) -> list:
|
||||
"""
|
||||
Scan session text for new entity candidates.
|
||||
Returns list of newly discovered candidates for review.
|
||||
|
||||
``languages`` is forwarded to entity detection — pass the user's
|
||||
configured ``MempalaceConfig().entity_languages`` to match the
|
||||
locales used at ``mempalace init`` time.
|
||||
"""
|
||||
from mempalace.entity_detector import extract_candidates, score_entity, classify_entity
|
||||
|
||||
lines = text.splitlines()
|
||||
candidates = extract_candidates(text)
|
||||
candidates = extract_candidates(text, languages=languages)
|
||||
new_candidates = []
|
||||
|
||||
for name, frequency in candidates.items():
|
||||
@@ -599,7 +603,7 @@ class EntityRegistry:
|
||||
if name in self.people or name in self.projects:
|
||||
continue
|
||||
|
||||
scores = score_entity(name, text, lines)
|
||||
scores = score_entity(name, text, lines, languages=languages)
|
||||
entity = classify_entity(name, frequency, scores)
|
||||
|
||||
if entity["type"] == "person" and entity["confidence"] >= min_confidence:
|
||||
|
||||
@@ -7,6 +7,10 @@ Usage:
|
||||
print(t("cli.mine_start", path="/docs")) # "Extraction de /docs..."
|
||||
print(t("terms.wing")) # "aile"
|
||||
print(t("aaak.instruction")) # AAAK compression instruction in French
|
||||
|
||||
Each locale JSON may include an ``entity`` section with patterns used by
|
||||
``mempalace.entity_detector``. See ``get_entity_patterns`` for the merge rules
|
||||
and the README section "Adding a new language" for the schema.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -16,6 +20,9 @@ _LANG_DIR = Path(__file__).parent
|
||||
_strings: dict = {}
|
||||
_current_lang: str = "en"
|
||||
|
||||
# Cache: tuple(langs) -> merged entity pattern dict
|
||||
_entity_cache: dict = {}
|
||||
|
||||
|
||||
def available_languages() -> list[str]:
|
||||
"""Return list of available language codes."""
|
||||
@@ -72,5 +79,112 @@ def get_regex() -> dict:
|
||||
return _strings.get("regex", {})
|
||||
|
||||
|
||||
def _load_entity_section(lang: str) -> dict:
    """Load the raw ``entity`` section for one language.

    Args:
        lang: Language code; the file read is ``<lang>.json`` in ``_LANG_DIR``.

    Returns:
        The ``entity`` mapping from that locale file, or ``{}`` when the file
        is missing, unreadable, not valid UTF-8 JSON, or has no dict-valued
        ``entity`` key.
    """
    lang_file = _LANG_DIR / f"{lang}.json"
    try:
        data = json.loads(lang_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    if not isinstance(data, dict):
        return {}
    section = data.get("entity")
    # Callers use section.get(...), so anything but a dict is treated as absent.
    return section if isinstance(section, dict) else {}
|
||||
|
||||
|
||||
def get_entity_patterns(languages=("en",)) -> dict:
    """Return merged entity detection patterns for the requested languages.

    Entity detection patterns live under each locale's ``entity`` section.
    This function merges them into a single dict for consumption by
    ``mempalace.entity_detector``.

    Merge rules:
    - ``person_verb_patterns``, ``pronoun_patterns``, ``dialogue_patterns``
      and ``project_verb_patterns`` are concatenated in the order of
      ``languages``, with duplicates removed (first occurrence kept).
    - ``stopwords`` is the lower-cased set union across all languages,
      returned as a sorted list.
    - ``candidate_patterns``, ``multi_word_patterns`` and
      ``direct_address_patterns`` collect each locale's singular
      ``candidate_pattern`` / ``multi_word_pattern`` /
      ``direct_address_pattern`` value. They are kept as separate patterns
      (not joined) and identical patterns from different locales are listed
      once, so callers that run each pattern independently do not apply the
      same regex twice.

    If ``languages`` is empty or no requested language declares entity data,
    English is used as a fallback.

    Args:
        languages: A language code or an iterable of language codes.

    Returns:
        The merged dict. It is cached per ``tuple(languages)`` and the same
        object is returned on later calls, so callers must not mutate it.
    """
    if not languages:
        languages = ("en",)
    elif isinstance(languages, str):
        # Iterating a bare string would look up one "locale" per character.
        languages = (languages,)
    key = tuple(languages)
    cached = _entity_cache.get(key)
    if cached is not None:
        return cached

    sections = [sec for sec in (_load_entity_section(lang) for lang in key) if sec]
    if not sections:
        # Fallback: load English directly (may itself be empty).
        sections = [_load_entity_section("en")]

    # Singular per-locale fields -> collected into a list.
    singles = {
        "candidate_pattern": [],
        "multi_word_pattern": [],
        "direct_address_pattern": [],
    }
    # List-valued per-locale fields -> concatenated.
    lists = {
        "person_verb_patterns": [],
        "pronoun_patterns": [],
        "dialogue_patterns": [],
        "project_verb_patterns": [],
    }
    stopwords: set = set()

    for section in sections:
        for field, bucket in singles.items():
            if section.get(field):
                bucket.append(section[field])
        for field, bucket in lists.items():
            bucket.extend(section.get(field, []))
        stopwords.update(w.lower() for w in section.get("stopwords", []))

    merged = {
        "candidate_patterns": _dedupe(singles["candidate_pattern"]),
        "multi_word_patterns": _dedupe(singles["multi_word_pattern"]),
        "person_verb_patterns": _dedupe(lists["person_verb_patterns"]),
        "pronoun_patterns": _dedupe(lists["pronoun_patterns"]),
        "dialogue_patterns": _dedupe(lists["dialogue_patterns"]),
        "direct_address_patterns": _dedupe(singles["direct_address_pattern"]),
        "project_verb_patterns": _dedupe(lists["project_verb_patterns"]),
        "stopwords": sorted(stopwords),
    }
    _entity_cache[key] = merged
    return merged
|
||||
|
||||
|
||||
def _dedupe(items: list) -> list:
|
||||
"""Remove duplicates while preserving first-occurrence order."""
|
||||
seen = set()
|
||||
out = []
|
||||
for item in items:
|
||||
if item not in seen:
|
||||
seen.add(item)
|
||||
out.append(item)
|
||||
return out
|
||||
|
||||
|
||||
# Auto-load English on import: this call sits at module level, so it runs
# once, the first time this module is imported.
# NOTE(review): presumably this makes English the active/default locale until
# a caller loads a different one — confirm against load_lang's definition.
load_lang("en")
|
||||
|
||||
@@ -40,5 +40,107 @@
|
||||
"stop_words": "the this that these those some many most each every other only such very will would could should must shall yeah okay also even then now already still back done make take give know think want need going come find work added saved session summary conversation topics source about once just really actually here there where good great better thank please sorry right wrong true false",
|
||||
"quote_pattern": "\"([^\"]{20,200})\"",
|
||||
"action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
|
||||
},
|
||||
"entity": {
|
||||
"candidate_pattern": "[A-Z][a-z]{1,19}",
|
||||
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
|
||||
"person_verb_patterns": [
|
||||
"\\b{name}\\s+said\\b",
|
||||
"\\b{name}\\s+asked\\b",
|
||||
"\\b{name}\\s+told\\b",
|
||||
"\\b{name}\\s+replied\\b",
|
||||
"\\b{name}\\s+laughed\\b",
|
||||
"\\b{name}\\s+smiled\\b",
|
||||
"\\b{name}\\s+cried\\b",
|
||||
"\\b{name}\\s+felt\\b",
|
||||
"\\b{name}\\s+thinks?\\b",
|
||||
"\\b{name}\\s+wants?\\b",
|
||||
"\\b{name}\\s+loves?\\b",
|
||||
"\\b{name}\\s+hates?\\b",
|
||||
"\\b{name}\\s+knows?\\b",
|
||||
"\\b{name}\\s+decided\\b",
|
||||
"\\b{name}\\s+pushed\\b",
|
||||
"\\b{name}\\s+wrote\\b",
|
||||
"\\bhey\\s+{name}\\b",
|
||||
"\\bthanks?\\s+{name}\\b",
|
||||
"\\bhi\\s+{name}\\b",
|
||||
"\\bdear\\s+{name}\\b"
|
||||
],
|
||||
"pronoun_patterns": [
|
||||
"\\bshe\\b",
|
||||
"\\bher\\b",
|
||||
"\\bhers\\b",
|
||||
"\\bhe\\b",
|
||||
"\\bhim\\b",
|
||||
"\\bhis\\b",
|
||||
"\\bthey\\b",
|
||||
"\\bthem\\b",
|
||||
"\\btheir\\b"
|
||||
],
|
||||
"dialogue_patterns": [
|
||||
"^>\\s*{name}[:\\s]",
|
||||
"^{name}:\\s",
|
||||
"^\\[{name}\\]",
|
||||
"\"{name}\\s+said"
|
||||
],
|
||||
"direct_address_pattern": "\\bhey\\s+{name}\\b|\\bthanks?\\s+{name}\\b|\\bhi\\s+{name}\\b",
|
||||
"project_verb_patterns": [
|
||||
"\\bbuilding\\s+{name}\\b",
|
||||
"\\bbuilt\\s+{name}\\b",
|
||||
"\\bship(?:ping|ped)?\\s+{name}\\b",
|
||||
"\\blaunch(?:ing|ed)?\\s+{name}\\b",
|
||||
"\\bdeploy(?:ing|ed)?\\s+{name}\\b",
|
||||
"\\binstall(?:ing|ed)?\\s+{name}\\b",
|
||||
"\\bthe\\s+{name}\\s+architecture\\b",
|
||||
"\\bthe\\s+{name}\\s+pipeline\\b",
|
||||
"\\bthe\\s+{name}\\s+system\\b",
|
||||
"\\bthe\\s+{name}\\s+repo\\b",
|
||||
"\\b{name}\\s+v\\d+\\b",
|
||||
"\\b{name}\\.py\\b",
|
||||
"\\b{name}-core\\b",
|
||||
"\\b{name}-local\\b",
|
||||
"\\bimport\\s+{name}\\b",
|
||||
"\\bpip\\s+install\\s+{name}\\b"
|
||||
],
|
||||
"stopwords": [
|
||||
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
|
||||
"for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
|
||||
"be", "been", "being", "have", "has", "had", "do", "does", "did",
|
||||
"will", "would", "could", "should", "may", "might", "must", "shall", "can",
|
||||
"this", "that", "these", "those", "it", "its", "they", "them", "their",
|
||||
"we", "our", "you", "your", "i", "my", "me", "he", "she", "his", "her",
|
||||
"who", "what", "when", "where", "why", "how", "which",
|
||||
"if", "then", "so", "not", "no", "yes", "ok", "okay",
|
||||
"just", "very", "really", "also", "already", "still", "even", "only",
|
||||
"here", "there", "now", "too", "up", "out", "about", "like",
|
||||
"use", "get", "got", "make", "made", "take", "put", "come", "go", "see",
|
||||
"know", "think", "true", "false", "none", "null", "new", "old", "all", "any", "some",
|
||||
"return", "print", "def", "class", "import",
|
||||
"step", "usage", "run", "check", "find", "add", "set", "list",
|
||||
"args", "dict", "str", "int", "bool", "path", "file", "type", "name",
|
||||
"note", "example", "option", "result", "error", "warning", "info",
|
||||
"every", "each", "more", "less", "next", "last", "first", "second",
|
||||
"stack", "layer", "mode", "test", "stop", "start", "copy", "move",
|
||||
"source", "target", "output", "input", "data", "item", "key", "value",
|
||||
"returns", "raises", "yields", "self", "cls", "kwargs",
|
||||
"world", "well", "want", "topic", "choose", "social", "cars", "phones",
|
||||
"healthcare", "ex", "machina", "deus", "human", "humans", "people",
|
||||
"things", "something", "nothing", "everything", "anything", "someone",
|
||||
"everyone", "anyone", "way", "time", "day", "life", "place", "thing",
|
||||
"part", "kind", "sort", "case", "point", "idea", "fact", "sense",
|
||||
"question", "answer", "reason", "number", "version", "system",
|
||||
"hey", "hi", "hello", "thanks", "thank", "right", "let",
|
||||
"click", "hit", "press", "tap", "drag", "drop", "open", "close",
|
||||
"save", "load", "launch", "install", "download", "upload", "scroll",
|
||||
"select", "enter", "submit", "cancel", "confirm", "delete", "paste",
|
||||
"write", "read", "search", "show", "hide",
|
||||
"desktop", "documents", "downloads", "users", "home", "library",
|
||||
"applications", "preferences", "settings", "terminal",
|
||||
"actor", "vector", "remote", "control", "duration", "fetch",
|
||||
"agents", "tools", "others", "guards", "ethics", "regulation",
|
||||
"learning", "thinking", "memory", "language", "intelligence",
|
||||
"technology", "society", "culture", "future", "history", "science",
|
||||
"model", "models", "network", "networks", "training", "inference"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user