refactor(entity_detector): make multi-language extensible via i18n JSON

Move all entity-detection lexical patterns (person verbs, pronouns, dialogue markers, project verbs, stopwords, candidate character class) out of hardcoded module-level constants and into the `entity` section of each locale's JSON in mempalace/i18n/. Adds a `languages` parameter to every public function so callers union patterns across the desired locales. The default stays ("en",), so all existing callers and tests behave unchanged.

Also adds:
- get_entity_patterns(langs) helper in mempalace/i18n/ that merges patterns across requested languages, dedupes lists, unions stopwords, and falls back to English for unknown locales
- MempalaceConfig.entity_languages property + setter, with env var override (MEMPALACE_ENTITY_LANGUAGES, comma-separated)
- mempalace init --lang en,pt-br flag (persists to config.json)
- Per-language candidate_pattern so non-Latin scripts (Cyrillic, Devanagari, CJK) can register their own character classes instead of being silently dropped by the ASCII-only [A-Z][a-z]+ default
- _build_patterns LRU cache keyed by (name, languages) so multi-language callers don't poison each other's cache slots

Why now: the open language PRs (#760 ru, #773 hi, #778 id, #907 it) only add CLI strings via mempalace/i18n/. PR #156 (pt-br) is the first that needed entity_detector changes and inlined a _PTBR variant of every constant. That doesn't scale past 2-3 languages — every text gets checked against every language's patterns regardless of relevance, and candidate extraction still drops accented and non-Latin names. This PR sets the standard so future locale contributors only edit one JSON file (no Python changes), and entity detection scales linearly with how many languages a user actually enabled, not how many ship.
2026-04-15 08:52:42 -03:00
parent 56b6a6360f
commit b214aced90
7 changed files with 641 additions and 421 deletions
@@ -1,6 +1,9 @@
 """Tests for mempalace.entity_detector."""

+import contextlib
+import json
 import os
+from pathlib import Path
 from unittest.mock import patch

 from mempalace.entity_detector import (
@@ -378,3 +381,206 @@ def test_scan_for_detection_max_files(tmp_path):
        (tmp_path / f"note{i}.md").write_text(f"content {i}")
    files = scan_for_detection(str(tmp_path), max_files=5)
    assert len(files) <= 5
+
+
+# ── multi-language infra ───────────────────────────────────────────────
+
+
+@contextlib.contextmanager
+def _temp_locale(locale_code: str, entity_section: dict):
+    """Install a throwaway locale JSON in mempalace/i18n/ for the ``with`` body.
+
+    Yields the path of the written file. On exit the file is removed and the
+    locale-dependent caches are cleared, even if the write, a cache clear, or
+    the test body raises. Raises RuntimeError if the locale file already exists.
+    """
+    from mempalace import i18n
+    from mempalace import entity_detector
+
+    locale_path = Path(i18n.__file__).parent / f"{locale_code}.json"
+    if locale_path.exists():
+        raise RuntimeError(f"Test locale {locale_code} collides with an existing file")
+
+    payload = {
+        "lang": locale_code,
+        "label": locale_code,
+        "terms": {},
+        "cli": {},
+        "aaak": {"instruction": "test"},
+        "entity": entity_section,
+    }
+
+    def _clear_caches():
+        i18n._entity_cache.clear()
+        entity_detector._build_patterns.cache_clear()
+        entity_detector._pronoun_re.cache_clear()
+        entity_detector._get_stopwords.cache_clear()
+
+    # The try starts before the write so a failed setup cannot leak the file.
+    try:
+        locale_path.write_text(json.dumps(payload), encoding="utf-8")
+        _clear_caches()
+        yield locale_path
+    finally:
+        with contextlib.suppress(OSError):  # best-effort: file may not exist
+            locale_path.unlink()
+        _clear_caches()
+
+
+def test_extract_candidates_default_languages_is_english_only():
+    """With no ``languages`` argument only English applies, so "João" is dropped."""
+    sample = "João said hi. João laughed. João waved. João decided."
+    candidates = extract_candidates(sample)  # languages defaults to ("en",)
+    assert "João" not in candidates
+
+
+def test_extract_candidates_with_extra_locale_picks_up_new_charset():
+    """Enabling a locale whose patterns allow diacritics surfaces accented names."""
+    locale_spec = dict(
+        candidate_pattern="[A-ZÀ-Ú][a-zà-ÿ]{1,19}",
+        multi_word_pattern="[A-ZÀ-Ú][a-zà-ÿ]+(?:\\s+[A-ZÀ-Ú][a-zà-ÿ]+)+",
+        person_verb_patterns=[],
+        pronoun_patterns=[],
+        dialogue_patterns=[],
+        project_verb_patterns=[],
+        stopwords=[],
+    )
+    sample = "João said hi. João laughed. João waved. João decided."
+    with _temp_locale("zz-test-latin", locale_spec):
+        counts = extract_candidates(sample, languages=("en", "zz-test-latin"))
+        assert "João" in counts
+        assert counts["João"] >= 3
+
+
+def test_extract_candidates_with_cyrillic_locale():
+    """Enabling a locale with Cyrillic patterns surfaces a Russian name."""
+    locale_spec = dict(
+        candidate_pattern="[А-ЯЁ][а-яё]{1,19}",
+        multi_word_pattern="[А-ЯЁ][а-яё]+(?:\\s+[А-ЯЁ][а-яё]+)+",
+        person_verb_patterns=[],
+        pronoun_patterns=[],
+        dialogue_patterns=[],
+        project_verb_patterns=[],
+        stopwords=[],
+    )
+    sample = "Иван сказал привет. Иван засмеялся. Иван помахал. Иван решил."
+    with _temp_locale("zz-test-cyrillic", locale_spec):
+        counts = extract_candidates(sample, languages=("en", "zz-test-cyrillic"))
+        assert "Иван" in counts
+
+
+def test_score_entity_unions_person_verbs_across_languages():
+    """Person-verb patterns from an enabled non-English locale add to the score."""
+    locale_spec = dict(
+        candidate_pattern="[A-Z][a-z]{1,19}",
+        multi_word_pattern="[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
+        person_verb_patterns=[
+            "\\b{name}\\s+disse\\b",
+            "\\b{name}\\s+falou\\b",
+            "\\b{name}\\s+riu\\b",
+        ],
+        pronoun_patterns=[],
+        dialogue_patterns=[],
+        project_verb_patterns=[],
+        stopwords=[],
+    )
+    with _temp_locale("zz-test-verbs", locale_spec):
+        sample = "Maria disse oi. Maria falou. Maria riu."
+        sample_lines = sample.splitlines()
+
+        baseline = score_entity("Maria", sample, sample_lines, languages=("en",))
+        combined = score_entity("Maria", sample, sample_lines, languages=("en", "zz-test-verbs"))
+
+        assert combined["person_score"] > baseline["person_score"]
+        assert any("action" in signal for signal in combined["person_signals"])
+
+
+def test_get_entity_patterns_unknown_lang_falls_back_to_english():
+    """An unrecognised locale code still yields populated (English) pattern data."""
+    from mempalace.i18n import get_entity_patterns
+
+    fallback = get_entity_patterns(("zz-does-not-exist",))
+    assert len(fallback["stopwords"]) > 0
+    assert fallback["candidate_patterns"]  # non-empty thanks to the English fallback
+
+
+def test_get_entity_patterns_dedupes_across_overlapping_languages():
+    """Requesting ('en', 'en') yields the same pattern and stopword counts as ('en',)."""
+    from mempalace.i18n import get_entity_patterns
+
+    once = get_entity_patterns(("en",))
+    twice = get_entity_patterns(("en", "en"))
+    for key in ("person_verb_patterns", "stopwords"):
+        assert len(twice[key]) == len(once[key])
+
+
+def test_build_patterns_cache_is_keyed_by_language():
+    """One name compiled for two language tuples gives two distinct pattern sets."""
+    from mempalace.entity_detector import _build_patterns
+
+    locale_spec = dict(
+        candidate_pattern="[A-Z][a-z]+",
+        multi_word_pattern="[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
+        person_verb_patterns=["\\b{name}\\s+ranxx\\b"],
+        pronoun_patterns=[],
+        dialogue_patterns=[],
+        project_verb_patterns=[],
+        stopwords=[],
+    )
+    with _temp_locale("zz-test-cache", locale_spec):
+        english_only = _build_patterns("Sam", ("en",))
+        with_extra = _build_patterns("Sam", ("en", "zz-test-cache"))
+        assert len(with_extra["person_verbs"]) > len(english_only["person_verbs"])
+
+
+def test_normalize_langs_handles_string_input():
+    """A bare string, a list, None and an empty tuple all normalize to a tuple."""
+    from mempalace.entity_detector import _normalize_langs
+
+    english = ("en",)
+    cases = [("en", english), (["en", "pt-br"], ("en", "pt-br")), (None, english), ((), english)]
+    for raw, expected in cases:
+        assert _normalize_langs(raw) == expected
+
+
+def test_config_entity_languages_defaults_to_english(tmp_path, monkeypatch):
+    """With no env override and an empty config dir, entity_languages is ['en']."""
+    from mempalace.config import MempalaceConfig
+
+    for var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
+        monkeypatch.delenv(var, raising=False)
+    config = MempalaceConfig(config_dir=str(tmp_path))
+    assert config.entity_languages == ["en"]
+
+
+def test_config_entity_languages_from_env(tmp_path, monkeypatch):
+    """The env var wins over a conflicting value persisted in the config file."""
+    from mempalace.config import MempalaceConfig
+
+    MempalaceConfig(config_dir=str(tmp_path)).set_entity_languages(["en"])  # on-disk value
+    monkeypatch.setenv("MEMPALACE_ENTITY_LANGUAGES", "en,pt-br,ru")
+    assert MempalaceConfig(config_dir=str(tmp_path)).entity_languages == ["en", "pt-br", "ru"]
+
+
+def test_config_set_entity_languages_persists(tmp_path, monkeypatch):
+    """Languages saved through one config object are read back by a fresh one."""
+    from mempalace.config import MempalaceConfig
+
+    for var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
+        monkeypatch.delenv(var, raising=False)
+    writer = MempalaceConfig(config_dir=str(tmp_path))
+    writer.set_entity_languages(["en", "pt-br"])
+    reader = MempalaceConfig(config_dir=str(tmp_path))
+    assert reader.entity_languages == ["en", "pt-br"]
+
+
+def test_config_set_entity_languages_empty_falls_back_to_english(tmp_path, monkeypatch):
+    """Setting an empty list returns, and afterwards reports, ['en']."""
+    from mempalace.config import MempalaceConfig
+
+    for var in ("MEMPALACE_ENTITY_LANGUAGES", "MEMPAL_ENTITY_LANGUAGES"):
+        monkeypatch.delenv(var, raising=False)
+    config = MempalaceConfig(config_dir=str(tmp_path))
+    normalized = config.set_entity_languages([])
+    assert normalized == ["en"]
+    assert config.entity_languages == ["en"]