refactor(entity_detector): make multi-language extensible via i18n JSON
Move all entity-detection lexical patterns (person verbs, pronouns,
dialogue markers, project verbs, stopwords, candidate character class)
out of hardcoded module-level constants and into the entity section of
each locale's JSON in mempalace/i18n/. Adds a languages parameter to
every public function so callers union patterns across the desired
locales. The default stays ("en",), so all existing callers and tests
behave unchanged.
Also adds:
- get_entity_patterns(langs) helper in mempalace/i18n/ that merges
patterns across requested languages, dedupes lists, unions stopwords,
and falls back to English for unknown locales
- MempalaceConfig.entity_languages property + setter, with env var
override (MEMPALACE_ENTITY_LANGUAGES, comma-separated)
- mempalace init --lang en,pt-br flag (persists to config.json)
- Per-language candidate_pattern so non-Latin scripts (Cyrillic,
Devanagari, CJK) can register their own character classes instead of
being silently dropped by the ASCII-only [A-Z][a-z]+ default
- _build_patterns LRU cache keyed by (name, languages) so multi-language
callers don't poison each other's cache slots
Why now: the open language PRs (#760 ru, #773 hi, #778 id, #907 it) only
add CLI strings via mempalace/i18n/. PR #156 (pt-br) is the first that
needed entity_detector changes and inlined a _PTBR variant of every
constant. That doesn't scale past 2-3 languages — every text gets
checked against every language's patterns regardless of relevance, and
candidate extraction still drops accented and non-Latin names.
This PR sets the standard so future locale contributors only edit one
JSON file (no Python changes), and entity detection scales linearly
with how many languages a user actually enabled, not how many ship.
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
"""Tests for mempalace.entity_detector."""
|
||||
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from mempalace.entity_detector import (
|
||||
@@ -378,3 +381,206 @@ def test_scan_for_detection_max_files(tmp_path):
|
||||
(tmp_path / f"note{i}.md").write_text(f"content {i}")
|
||||
files = scan_for_detection(str(tmp_path), max_files=5)
|
||||
assert len(files) <= 5
|
||||
|
||||
|
||||
# ── multi-language infra ───────────────────────────────────────────────
|
||||
|
||||
|
||||
@contextlib.contextmanager
def _temp_locale(locale_code: str, entity_section: dict):
    """Temporarily install a locale JSON in mempalace/i18n/ for the test body.

    The file is written next to the ``mempalace.i18n`` package module, the
    locale-dependent caches are cleared, and the path is yielded. On exit the
    file is removed and the caches are cleared again, whether the body
    succeeded or raised.

    Args:
        locale_code: Locale identifier; also used as the JSON file stem.
        entity_section: Value stored under the ``"entity"`` key of the locale.

    Yields:
        Path of the temporary locale file.

    Raises:
        RuntimeError: If a locale file with this code already exists, so a
            real locale is never overwritten or deleted by a test.
    """
    from mempalace import i18n
    from mempalace import entity_detector

    locale_path = Path(i18n.__file__).parent / f"{locale_code}.json"
    if locale_path.exists():
        raise RuntimeError(f"Test locale {locale_code} collides with an existing file")

    payload = {
        "lang": locale_code,
        "label": locale_code,
        "terms": {},
        "cli": {},
        "aaak": {"instruction": "test"},
        "entity": entity_section,
    }
    # Serialize before touching the filesystem: an unserializable entity
    # section then fails without leaving anything on disk.
    serialized = json.dumps(payload)

    def _clear_caches():
        # Drop every cache that may hold data derived from locale files so the
        # temporary locale is neither hidden by stale entries nor leaked into
        # later tests.
        i18n._entity_cache.clear()
        entity_detector._build_patterns.cache_clear()
        entity_detector._pronoun_re.cache_clear()
        entity_detector._get_stopwords.cache_clear()

    try:
        # Setup lives inside the try so that a failure after the file has been
        # (even partially) written still reaches the cleanup below; otherwise
        # the leftover file would trip the collision check on the next run.
        locale_path.write_text(serialized, encoding="utf-8")
        _clear_caches()
        yield locale_path
    finally:
        # Best-effort removal: the file may never have been created, or the
        # test body may already have removed it.
        with contextlib.suppress(OSError):
            locale_path.unlink()
        _clear_caches()
|
||||
|
||||
|
||||
def test_extract_candidates_default_languages_is_english_only():
    """Without a ``languages`` argument an accented name is not extracted."""
    sample = "João said hi. João laughed. João waved. João decided."

    # No ``languages`` argument: exercises the default ("en",) tuple.
    candidates = extract_candidates(sample)

    assert "João" not in candidates
|
||||
|
||||
|
||||
def test_extract_candidates_with_extra_locale_picks_up_new_charset():
    """Enabling a locale whose patterns cover Latin diacritics extracts 'João'."""
    empty_sections = (
        "person_verb_patterns",
        "pronoun_patterns",
        "dialogue_patterns",
        "project_verb_patterns",
        "stopwords",
    )
    latin_locale = {
        "candidate_pattern": r"[A-ZÀ-Ú][a-zà-ÿ]{1,19}",
        "multi_word_pattern": r"[A-ZÀ-Ú][a-zà-ÿ]+(?:\s+[A-ZÀ-Ú][a-zà-ÿ]+)+",
        # Only the character classes matter here; every other section is empty.
        **{section: [] for section in empty_sections},
    }

    with _temp_locale("zz-test-latin", latin_locale):
        sample = "João said hi. João laughed. João waved. João decided."
        candidates = extract_candidates(sample, languages=("en", "zz-test-latin"))

        assert "João" in candidates
        assert candidates["João"] >= 3
|
||||
|
||||
|
||||
def test_extract_candidates_with_cyrillic_locale():
    """Enabling a locale with Cyrillic character classes extracts 'Иван'."""
    empty_sections = (
        "person_verb_patterns",
        "pronoun_patterns",
        "dialogue_patterns",
        "project_verb_patterns",
        "stopwords",
    )
    cyrillic_locale = {
        "candidate_pattern": r"[А-ЯЁ][а-яё]{1,19}",
        "multi_word_pattern": r"[А-ЯЁ][а-яё]+(?:\s+[А-ЯЁ][а-яё]+)+",
        # Only the character classes matter here; every other section is empty.
        **{section: [] for section in empty_sections},
    }

    with _temp_locale("zz-test-cyrillic", cyrillic_locale):
        sample = "Иван сказал привет. Иван засмеялся. Иван помахал. Иван решил."
        candidates = extract_candidates(sample, languages=("en", "zz-test-cyrillic"))

        assert "Иван" in candidates
|
||||
|
||||
|
||||
def test_score_entity_unions_person_verbs_across_languages():
    """Person-verb patterns from an extra locale raise the person score."""
    verb_patterns = [
        r"\b{name}\s+disse\b",
        r"\b{name}\s+falou\b",
        r"\b{name}\s+riu\b",
    ]
    verbs_locale = {
        "candidate_pattern": r"[A-Z][a-z]{1,19}",
        "multi_word_pattern": r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+",
        "person_verb_patterns": verb_patterns,
        "pronoun_patterns": [],
        "dialogue_patterns": [],
        "project_verb_patterns": [],
        "stopwords": [],
    }

    with _temp_locale("zz-test-verbs", verbs_locale):
        sample = "Maria disse oi. Maria falou. Maria riu."
        sample_lines = sample.splitlines()

        # Score the same text twice: English only, then English plus the
        # temporary locale that supplies the extra verb patterns.
        baseline = score_entity("Maria", sample, sample_lines, languages=("en",))
        combined = score_entity(
            "Maria", sample, sample_lines, languages=("en", "zz-test-verbs")
        )

        assert combined["person_score"] > baseline["person_score"]
        assert any("action" in signal for signal in combined["person_signals"])
|
||||
|
||||
|
||||
def test_get_entity_patterns_unknown_lang_falls_back_to_english():
    """An unknown locale code still yields non-empty stopwords and candidate patterns."""
    from mempalace.i18n import get_entity_patterns

    fallback = get_entity_patterns(("zz-does-not-exist",))

    assert len(fallback["stopwords"]) > 0
    # A truthy value here means the English fallback supplied patterns.
    assert fallback["candidate_patterns"]
|
||||
|
||||
|
||||
def test_get_entity_patterns_dedupes_across_overlapping_languages():
    """Requesting ('en', 'en') yields the same pattern and stopword counts as ('en',)."""
    from mempalace.i18n import get_entity_patterns

    once = get_entity_patterns(("en",))
    twice = get_entity_patterns(("en", "en"))

    for section in ("person_verb_patterns", "stopwords"):
        assert len(twice[section]) == len(once[section])
|
||||
|
||||
|
||||
def test_build_patterns_cache_is_keyed_by_language():
    """The same name built for different language tuples gives different pattern sets."""
    from mempalace.entity_detector import _build_patterns

    cache_locale = {
        "candidate_pattern": r"[A-Z][a-z]+",
        "multi_word_pattern": r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+",
        # Contributes one extra person-verb pattern on top of the English set.
        "person_verb_patterns": [r"\b{name}\s+ranxx\b"],
        "pronoun_patterns": [],
        "dialogue_patterns": [],
        "project_verb_patterns": [],
        "stopwords": [],
    }

    with _temp_locale("zz-test-cache", cache_locale):
        english_only = _build_patterns("Sam", ("en",))
        with_extra = _build_patterns("Sam", ("en", "zz-test-cache"))

        assert len(with_extra["person_verbs"]) > len(english_only["person_verbs"])
|
||||
|
||||
|
||||
def test_normalize_langs_handles_string_input():
    """_normalize_langs accepts a bare string, a list, None, or an empty tuple."""
    from mempalace.entity_detector import _normalize_langs

    # (input, expected tuple), checked in the order listed.
    cases = (
        ("en", ("en",)),
        (["en", "pt-br"], ("en", "pt-br")),
        (None, ("en",)),
        ((), ("en",)),
    )
    for raw, expected in cases:
        assert _normalize_langs(raw) == expected
|
||||
|
||||
|
||||
def test_config_entity_languages_defaults_to_english(tmp_path, monkeypatch):
    """With neither env var set and an empty config dir, entity_languages is ['en']."""
    from mempalace.config import MempalaceConfig

    # Remove both env var names so the surrounding environment cannot leak in.
    for env_name in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_name, raising=False)

    config = MempalaceConfig(config_dir=str(tmp_path))

    assert config.entity_languages == ["en"]
|
||||
|
||||
|
||||
def test_config_entity_languages_from_env(tmp_path, monkeypatch):
    """A comma-separated MEMPALACE_ENTITY_LANGUAGES value is returned as a list."""
    from mempalace.config import MempalaceConfig

    monkeypatch.setenv("MEMPALACE_ENTITY_LANGUAGES", "en,pt-br,ru")

    config = MempalaceConfig(config_dir=str(tmp_path))
    expected = ["en", "pt-br", "ru"]

    assert config.entity_languages == expected
|
||||
|
||||
|
||||
def test_config_set_entity_languages_persists(tmp_path, monkeypatch):
    """Languages set on one config instance are seen by a fresh instance on the same dir."""
    from mempalace.config import MempalaceConfig

    # Remove both env var names so only the persisted value is read back.
    for env_name in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_name, raising=False)

    config_dir = str(tmp_path)
    writer = MempalaceConfig(config_dir=config_dir)
    writer.set_entity_languages(["en", "pt-br"])

    reader = MempalaceConfig(config_dir=config_dir)

    assert reader.entity_languages == ["en", "pt-br"]
|
||||
|
||||
|
||||
def test_config_set_entity_languages_empty_falls_back_to_english(tmp_path, monkeypatch):
    """Setting an empty list returns ['en'] and leaves entity_languages as ['en']."""
    from mempalace.config import MempalaceConfig

    # Remove both env var names so they cannot affect the property read below.
    for env_name in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
        monkeypatch.delenv(env_name, raising=False)

    config = MempalaceConfig(config_dir=str(tmp_path))
    returned = config.set_entity_languages([])

    assert returned == ["en"]
    assert config.entity_languages == ["en"]
|
||||
|
||||
Reference in New Issue
Block a user