refactor(entity_detector): make multi-language extensible via i18n JSON

Move all entity-detection lexical patterns (person verbs, pronouns,
dialogue markers, project verbs, stopwords, candidate character class)
out of hardcoded module-level constants and into the entity section of
each locale's JSON in mempalace/i18n/. Adds a languages parameter to
every public function so callers union patterns across the desired
locales. The default stays ("en",), so all existing callers and tests
behave unchanged.

Also adds:
- get_entity_patterns(langs) helper in mempalace/i18n/ that merges
  patterns across requested languages, dedupes lists, unions stopwords,
  and falls back to English for unknown locales
- MempalaceConfig.entity_languages property + setter, with env var
  override (MEMPALACE_ENTITY_LANGUAGES, comma-separated)
- mempalace init --lang en,pt-br flag (persists to config.json)
- Per-language candidate_pattern so non-Latin scripts (Cyrillic,
  Devanagari, CJK) can register their own character classes instead of
  being silently dropped by the ASCII-only [A-Z][a-z]+ default
- _build_patterns LRU cache keyed by (name, languages) so multi-language
  callers don't poison each other's cache slots

Why now: the open language PRs (#760 ru, #773 hi, #778 id, #907 it) only
add CLI strings via mempalace/i18n/. PR #156 (pt-br) is the first that
needed entity_detector changes and inlined a _PTBR variant of every
constant. That doesn't scale past 2-3 languages — every text gets
checked against every language's patterns regardless of relevance, and
candidate extraction still drops accented and non-Latin names.

This PR sets the standard so future locale contributors only edit one
JSON file (no Python changes), and entity detection scales linearly
with how many languages a user actually enabled, not how many ship.
This commit is contained in:
Igor Lins e Silva
2026-04-15 08:52:42 -03:00
parent 56b6a6360f
commit b214aced90
7 changed files with 641 additions and 421 deletions
+206
View File
@@ -1,6 +1,9 @@
"""Tests for mempalace.entity_detector."""
import contextlib
import json
import os
from pathlib import Path
from unittest.mock import patch
from mempalace.entity_detector import (
@@ -378,3 +381,206 @@ def test_scan_for_detection_max_files(tmp_path):
(tmp_path / f"note{i}.md").write_text(f"content {i}")
files = scan_for_detection(str(tmp_path), max_files=5)
assert len(files) <= 5
# ── multi-language infra ───────────────────────────────────────────────
@contextlib.contextmanager
def _temp_locale(locale_code: str, entity_section: dict):
"""Context manager that drops a locale JSON into mempalace/i18n/ for the test body.
Cleans up the file and clears every cache that depends on locale data on exit,
even if the test fails or the entity section is invalid.
"""
from mempalace import i18n
from mempalace import entity_detector
locale_path = Path(i18n.__file__).parent / f"{locale_code}.json"
if locale_path.exists():
raise RuntimeError(f"Test locale {locale_code} collides with an existing file")
payload = {
"lang": locale_code,
"label": locale_code,
"terms": {},
"cli": {},
"aaak": {"instruction": "test"},
"entity": entity_section,
}
locale_path.write_text(json.dumps(payload), encoding="utf-8")
def _clear_caches():
i18n._entity_cache.clear()
entity_detector._build_patterns.cache_clear()
entity_detector._pronoun_re.cache_clear()
entity_detector._get_stopwords.cache_clear()
_clear_caches()
try:
yield locale_path
finally:
try:
locale_path.unlink()
except OSError:
pass
_clear_caches()
def test_extract_candidates_default_languages_is_english_only():
"""Default languages tuple = ('en',) — accented names dropped (as today)."""
text = "João said hi. João laughed. João waved. João decided."
result = extract_candidates(text) # default ("en",)
assert "João" not in result
def test_extract_candidates_with_extra_locale_picks_up_new_charset():
"""A locale with a Latin+diacritics candidate_pattern catches accented names."""
locale = {
"candidate_pattern": "[A-ZÀ-Ú][a-zà-ÿ]{1,19}",
"multi_word_pattern": "[A-ZÀ-Ú][a-zà-ÿ]+(?:\\s+[A-ZÀ-Ú][a-zà-ÿ]+)+",
"person_verb_patterns": [],
"pronoun_patterns": [],
"dialogue_patterns": [],
"project_verb_patterns": [],
"stopwords": [],
}
with _temp_locale("zz-test-latin", locale):
text = "João said hi. João laughed. João waved. João decided."
result = extract_candidates(text, languages=("en", "zz-test-latin"))
assert "João" in result
assert result["João"] >= 3
def test_extract_candidates_with_cyrillic_locale():
"""A locale with a Cyrillic candidate_pattern catches Russian names."""
locale = {
"candidate_pattern": "[А-ЯЁ][а-яё]{1,19}",
"multi_word_pattern": "[А-ЯЁ][а-яё]+(?:\\s+[А-ЯЁ][а-яё]+)+",
"person_verb_patterns": [],
"pronoun_patterns": [],
"dialogue_patterns": [],
"project_verb_patterns": [],
"stopwords": [],
}
with _temp_locale("zz-test-cyrillic", locale):
text = "Иван сказал привет. Иван засмеялся. Иван помахал. Иван решил."
result = extract_candidates(text, languages=("en", "zz-test-cyrillic"))
assert "Иван" in result
def test_score_entity_unions_person_verbs_across_languages():
"""A non-English person-verb pattern fires when its locale is enabled."""
locale = {
"candidate_pattern": "[A-Z][a-z]{1,19}",
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
"person_verb_patterns": [
"\\b{name}\\s+disse\\b",
"\\b{name}\\s+falou\\b",
"\\b{name}\\s+riu\\b",
],
"pronoun_patterns": [],
"dialogue_patterns": [],
"project_verb_patterns": [],
"stopwords": [],
}
with _temp_locale("zz-test-verbs", locale):
text = "Maria disse oi. Maria falou. Maria riu."
lines = text.splitlines()
en_only = score_entity("Maria", text, lines, languages=("en",))
multi = score_entity("Maria", text, lines, languages=("en", "zz-test-verbs"))
assert multi["person_score"] > en_only["person_score"]
assert any("action" in s for s in multi["person_signals"])
def test_get_entity_patterns_unknown_lang_falls_back_to_english():
"""Asking for a non-existent language returns English defaults."""
from mempalace.i18n import get_entity_patterns
patterns = get_entity_patterns(("zz-does-not-exist",))
assert len(patterns["stopwords"]) > 0
assert patterns["candidate_patterns"] # English fallback
def test_get_entity_patterns_dedupes_across_overlapping_languages():
"""Loading ('en', 'en') doesn't double-count patterns or stopwords."""
from mempalace.i18n import get_entity_patterns
single = get_entity_patterns(("en",))
doubled = get_entity_patterns(("en", "en"))
assert len(doubled["person_verb_patterns"]) == len(single["person_verb_patterns"])
assert len(doubled["stopwords"]) == len(single["stopwords"])
def test_build_patterns_cache_is_keyed_by_language():
"""Same name with different language tuples yields different compiled sets."""
from mempalace.entity_detector import _build_patterns
locale = {
"candidate_pattern": "[A-Z][a-z]+",
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
"person_verb_patterns": ["\\b{name}\\s+ranxx\\b"],
"pronoun_patterns": [],
"dialogue_patterns": [],
"project_verb_patterns": [],
"stopwords": [],
}
with _temp_locale("zz-test-cache", locale):
en_patterns = _build_patterns("Sam", ("en",))
multi_patterns = _build_patterns("Sam", ("en", "zz-test-cache"))
assert len(multi_patterns["person_verbs"]) > len(en_patterns["person_verbs"])
def test_normalize_langs_handles_string_input():
"""Passing a bare string instead of a tuple still works."""
from mempalace.entity_detector import _normalize_langs
assert _normalize_langs("en") == ("en",)
assert _normalize_langs(["en", "pt-br"]) == ("en", "pt-br")
assert _normalize_langs(None) == ("en",)
assert _normalize_langs(()) == ("en",)
def test_config_entity_languages_defaults_to_english(tmp_path, monkeypatch):
"""MempalaceConfig.entity_languages defaults to ['en'] with no config file."""
from mempalace.config import MempalaceConfig
monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False)
monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False)
cfg = MempalaceConfig(config_dir=str(tmp_path))
assert cfg.entity_languages == ["en"]
def test_config_entity_languages_from_env(tmp_path, monkeypatch):
"""Env var overrides config file."""
from mempalace.config import MempalaceConfig
monkeypatch.setenv("MEMPALACE_ENTITY_LANGUAGES", "en,pt-br,ru")
cfg = MempalaceConfig(config_dir=str(tmp_path))
assert cfg.entity_languages == ["en", "pt-br", "ru"]
def test_config_set_entity_languages_persists(tmp_path, monkeypatch):
"""set_entity_languages writes to disk and is read back."""
from mempalace.config import MempalaceConfig
monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False)
monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False)
cfg = MempalaceConfig(config_dir=str(tmp_path))
cfg.set_entity_languages(["en", "pt-br"])
cfg2 = MempalaceConfig(config_dir=str(tmp_path))
assert cfg2.entity_languages == ["en", "pt-br"]
def test_config_set_entity_languages_empty_falls_back_to_english(tmp_path, monkeypatch):
"""An empty list normalizes to ['en']."""
from mempalace.config import MempalaceConfig
monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False)
monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False)
cfg = MempalaceConfig(config_dir=str(tmp_path))
result = cfg.set_entity_languages([])
assert result == ["en"]
assert cfg.entity_languages == ["en"]