diff --git a/mempalace/i18n/__init__.py b/mempalace/i18n/__init__.py
index 671e0a1..564bf2c 100644
--- a/mempalace/i18n/__init__.py
+++ b/mempalace/i18n/__init__.py
@@ -24,6 +24,33 @@ _current_lang: str = "en"
 _entity_cache: dict = {}
 
 
+def _canonical_lang(lang: str) -> str | None:
+    """Resolve a language code to its on-disk canonical filename stem.
+
+    BCP 47 tags are case-insensitive (RFC 5646 §2.1.1), and the locale
+    files mix conventions (``pt-br.json`` vs ``zh-CN.json``). Match on
+    lowercase so callers can pass ``PT-BR``, ``zh-cn``, ``Pt-Br``, etc.
+    Surrounding whitespace is ignored. An exact-case match wins over a
+    case-insensitive one, and any remaining tie goes to the
+    lexicographically first stem, so the result never depends on
+    directory iteration order.
+
+    Returns ``None`` for empty, non-string, or unknown codes.
+    """
+    if not isinstance(lang, str):
+        return None
+    wanted = lang.strip()
+    if not wanted:
+        return None
+    target = wanted.lower()
+    matches = sorted(
+        path.stem for path in _LANG_DIR.glob("*.json") if path.stem.lower() == target
+    )
+    if wanted in matches:
+        return wanted
+    return matches[0] if matches else None
+
+
 def available_languages() -> list[str]:
     """Return list of available language codes."""
     return sorted(p.stem for p in _LANG_DIR.glob("*.json"))
@@ -32,12 +59,12 @@ def available_languages() -> list[str]:
 def load_lang(lang: str = "en") -> dict:
     """Load a language dictionary. Falls back to English if not found."""
     global _strings, _current_lang
-    lang_file = _LANG_DIR / f"{lang}.json"
-    if not lang_file.exists():
-        lang_file = _LANG_DIR / "en.json"
-        lang = "en"
+    canonical = _canonical_lang(lang)
+    if canonical is None:
+        canonical = "en"
+    lang_file = _LANG_DIR / f"{canonical}.json"
     _strings = json.loads(lang_file.read_text(encoding="utf-8"))
-    _current_lang = lang
+    _current_lang = canonical
     return _strings
 
 
@@ -81,9 +108,10 @@ def get_regex() -> dict:
 
 def _load_entity_section(lang: str) -> dict:
     """Load the raw entity section for one language. Returns {} if missing."""
-    lang_file = _LANG_DIR / f"{lang}.json"
-    if not lang_file.exists():
+    canonical = _canonical_lang(lang)
+    if canonical is None:
         return {}
+    lang_file = _LANG_DIR / f"{canonical}.json"
     try:
         data = json.loads(lang_file.read_text(encoding="utf-8"))
     except (json.JSONDecodeError, OSError):
@@ -115,7 +143,12 @@ def get_entity_patterns(languages=("en",)) -> dict:
     """
     if not languages:
         languages = ("en",)
-    key = tuple(languages)
+    # Normalize via canonical filename so callers using different casing
+    # (e.g. "PT-BR" vs "pt-br") share the same cache entry and load the
+    # same locale file. Unknown codes are kept as-is so the merge loop's
+    # "found_any" branch fires the English fallback exactly once.
+    languages = tuple(_canonical_lang(lang) or lang for lang in languages)
+    key = languages
     if key in _entity_cache:
         return _entity_cache[key]
 
diff --git a/tests/test_i18n_lang_case.py b/tests/test_i18n_lang_case.py
new file mode 100644
index 0000000..4c57ed1
--- /dev/null
+++ b/tests/test_i18n_lang_case.py
@@ -0,0 +1,113 @@
+"""Regression tests for issue #927 — language code lookup must be case-insensitive.
+
+The locale files use mixed case for the region subtag (``pt-br.json`` vs
+``zh-CN.json``). BCP 47 tags are case-insensitive (RFC 5646 §2.1.1), so
+``--lang PT-BR``, ``--lang zh-cn``, and ``--lang ZH-TW`` must all resolve
+to the canonical file rather than silently falling back to English.
+"""
+
+import pytest
+
+from mempalace import i18n
+from mempalace.i18n import (
+    _canonical_lang,
+    _load_entity_section,
+    available_languages,
+    get_entity_patterns,
+    load_lang,
+)
+
+
+@pytest.fixture(autouse=True)
+def _reset_state():
+    """Isolate module-level i18n state between tests.
+
+    ``load_lang`` rebinds ``_strings`` and ``_current_lang``; restore both
+    afterwards so a test that loads ``pt-br`` cannot leak that language
+    into whichever test runs next. The entity cache is cleared on both
+    sides so every test starts cold.
+    """
+    saved_strings, saved_lang = i18n._strings, i18n._current_lang
+    i18n._entity_cache.clear()
+    yield
+    i18n._strings, i18n._current_lang = saved_strings, saved_lang
+    i18n._entity_cache.clear()
+
+
+def test_canonical_lang_lowercase_passthrough():
+    assert _canonical_lang("en") == "en"
+    assert _canonical_lang("pt-br") == "pt-br"
+
+
+def test_canonical_lang_uppercase_resolves():
+    assert _canonical_lang("PT-BR") == "pt-br"
+    assert _canonical_lang("ZH-CN") == "zh-CN"
+    assert _canonical_lang("zh-cn") == "zh-CN"
+    assert _canonical_lang("Pt-Br") == "pt-br"
+
+
+def test_canonical_lang_unknown_returns_none():
+    assert _canonical_lang("xx") is None
+    assert _canonical_lang("") is None
+
+
+def test_canonical_lang_tolerates_whitespace_and_non_strings():
+    assert _canonical_lang("  pt-br\n") == "pt-br"
+    assert _canonical_lang("   ") is None
+    assert _canonical_lang(None) is None
+    assert _canonical_lang(5) is None
+
+
+def test_canonical_lang_prefers_exact_case(tmp_path, monkeypatch):
+    """When stems differ only by case, the exact spelling must win."""
+    for stem in ("zh-CN", "zh-cn"):
+        (tmp_path / f"{stem}.json").write_text("{}", encoding="utf-8")
+    if len(list(tmp_path.glob("*.json"))) < 2:
+        pytest.skip("case-insensitive filesystem cannot hold both files")
+    monkeypatch.setattr(i18n, "_LANG_DIR", tmp_path)
+    assert _canonical_lang("zh-cn") == "zh-cn"
+    assert _canonical_lang("zh-CN") == "zh-CN"
+    assert _canonical_lang("ZH-CN") == "zh-CN"
+
+
+def test_load_lang_case_insensitive():
+    """`load_lang('PT-BR')` must load the pt-br dictionary, not English."""
+    assert "pt-br" in available_languages()
+    pt_lower = load_lang("pt-br")
+    # Switch away first so the check below can only pass via "PT-BR".
+    load_lang("en")
+    pt_upper = load_lang("PT-BR")
+    assert i18n.current_lang() == "pt-br", "PT-BR silently fell back to English"
+    assert pt_lower == pt_upper, "case should not change the loaded dict"
+
+
+def test_entity_section_loads_for_uppercase_input():
+    """`_load_entity_section('PT-BR')` must read pt-br.json, not return {}."""
+    pt_lower = _load_entity_section("pt-br")
+    pt_upper = _load_entity_section("PT-BR")
+    assert pt_lower == pt_upper
+
+
+def test_get_entity_patterns_case_insensitive():
+    """Entity patterns must be identical regardless of input case."""
+    lower = get_entity_patterns(("pt-br",))
+    upper = get_entity_patterns(("PT-BR",))
+    assert lower == upper
+
+
+def test_get_entity_patterns_shares_cache_across_cases():
+    """Different casing must hit the same cache entry — not duplicate work."""
+    get_entity_patterns(("zh-CN",))
+    cache_keys = list(i18n._entity_cache.keys())
+    get_entity_patterns(("ZH-CN",))
+    get_entity_patterns(("zh-cn",))
+    assert len(i18n._entity_cache) == len(
+        cache_keys
+    ), "different casings of the same language must not create new cache entries"
+
+
+def test_unknown_language_still_falls_back_to_english():
+    """A code with no matching file must fall through to English (existing contract)."""
+    patterns = get_entity_patterns(("xx-yy",))
+    en = get_entity_patterns(("en",))
+    assert patterns["candidate_patterns"] == en["candidate_patterns"]