fix(i18n): resolve language codes case-insensitively (#927)
BCP 47 language tags are case-insensitive (RFC 5646 §2.1.1) but the
locale files mix conventions (pt-br.json vs zh-CN.json). On
case-sensitive filesystems, '--lang PT-BR' or '--lang zh-cn' silently
missed the file, _load_entity_section returned {}, and entity
detection ran in English with no warning.
The cache key in get_entity_patterns was built from raw input, so
('PT-BR',) and ('pt-br',) produced two distinct cache entries for one
language, with the mis-cased entry holding the English fallback.
Add _canonical_lang(lang) that resolves any casing to the on-disk
filename stem via lowercase comparison, and route load_lang,
_load_entity_section, and the cache key through it.
Closes #927
This commit is contained in:
@@ -24,6 +24,23 @@ _current_lang: str = "en"
|
|||||||
_entity_cache: dict = {}
|
_entity_cache: dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _canonical_lang(lang: str) -> str | None:
|
||||||
|
"""Resolve a language code to its on-disk canonical filename stem.
|
||||||
|
|
||||||
|
BCP 47 tags are case-insensitive (RFC 5646 §2.1.1), and the locale
|
||||||
|
files mix conventions (``pt-br.json`` vs ``zh-CN.json``). Match on
|
||||||
|
lowercase so callers can pass ``PT-BR``, ``zh-cn``, ``Pt-Br``, etc.
|
||||||
|
Returns ``None`` if no file matches.
|
||||||
|
"""
|
||||||
|
if not lang:
|
||||||
|
return None
|
||||||
|
target = lang.strip().lower()
|
||||||
|
for path in _LANG_DIR.glob("*.json"):
|
||||||
|
if path.stem.lower() == target:
|
||||||
|
return path.stem
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def available_languages() -> list[str]:
    """List the available language codes in sorted order.

    A code is the stem of a ``*.json`` file found directly in
    ``_LANG_DIR``.
    """
    codes = [locale_file.stem for locale_file in _LANG_DIR.glob("*.json")]
    codes.sort()
    return codes
|
||||||
@@ -32,12 +49,12 @@ def available_languages() -> list[str]:
|
|||||||
def load_lang(lang: str = "en") -> dict:
    """Load a language dictionary, using English when ``lang`` is unknown.

    ``lang`` is resolved through ``_canonical_lang``, so any casing of a
    known code loads the same file. The parsed dictionary is stored in the
    module-level ``_strings`` and the resolved code in ``_current_lang``.

    Returns:
        The dictionary parsed from the selected locale file.
    """
    global _strings, _current_lang
    # No matching locale file: use the English dictionary instead.
    resolved = _canonical_lang(lang) or "en"
    raw_text = (_LANG_DIR / f"{resolved}.json").read_text(encoding="utf-8")
    _strings = json.loads(raw_text)
    _current_lang = resolved
    return _strings
|
||||||
|
|
||||||
|
|
||||||
@@ -81,9 +98,10 @@ def get_regex() -> dict:
|
|||||||
|
|
||||||
def _load_entity_section(lang: str) -> dict:
|
def _load_entity_section(lang: str) -> dict:
|
||||||
"""Load the raw entity section for one language. Returns {} if missing."""
|
"""Load the raw entity section for one language. Returns {} if missing."""
|
||||||
lang_file = _LANG_DIR / f"{lang}.json"
|
canonical = _canonical_lang(lang)
|
||||||
if not lang_file.exists():
|
if canonical is None:
|
||||||
return {}
|
return {}
|
||||||
|
lang_file = _LANG_DIR / f"{canonical}.json"
|
||||||
try:
|
try:
|
||||||
data = json.loads(lang_file.read_text(encoding="utf-8"))
|
data = json.loads(lang_file.read_text(encoding="utf-8"))
|
||||||
except (json.JSONDecodeError, OSError):
|
except (json.JSONDecodeError, OSError):
|
||||||
@@ -115,7 +133,12 @@ def get_entity_patterns(languages=("en",)) -> dict:
|
|||||||
"""
|
"""
|
||||||
if not languages:
|
if not languages:
|
||||||
languages = ("en",)
|
languages = ("en",)
|
||||||
key = tuple(languages)
|
# Normalize via canonical filename so callers using different casing
|
||||||
|
# (e.g. "PT-BR" vs "pt-br") share the same cache entry and load the
|
||||||
|
# same locale file. Unknown codes are kept as-is so the merge loop's
|
||||||
|
# "found_any" branch fires the English fallback exactly once.
|
||||||
|
languages = tuple(_canonical_lang(lang) or lang for lang in languages)
|
||||||
|
key = languages
|
||||||
if key in _entity_cache:
|
if key in _entity_cache:
|
||||||
return _entity_cache[key]
|
return _entity_cache[key]
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,86 @@
|
|||||||
|
"""Regression tests for issue #927 — language code lookup must be case-insensitive.
|
||||||
|
|
||||||
|
The locale files use mixed case for the region subtag (``pt-br.json`` vs
|
||||||
|
``zh-CN.json``). BCP 47 tags are case-insensitive (RFC 5646 §2.1.1), so
|
||||||
|
``--lang PT-BR``, ``--lang zh-cn``, and ``--lang ZH-TW`` must all resolve
|
||||||
|
to the canonical file rather than silently falling back to English.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from mempalace import i18n
|
||||||
|
from mempalace.i18n import (
|
||||||
|
_canonical_lang,
|
||||||
|
_load_entity_section,
|
||||||
|
available_languages,
|
||||||
|
get_entity_patterns,
|
||||||
|
load_lang,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def _reset_state():
    """Isolate each test from module-level i18n state.

    Clears the entity cache before and after the test. Also restores the
    active dictionary and language code afterwards, because ``load_lang``
    rebinds ``i18n._strings`` and ``i18n._current_lang`` and would
    otherwise leak the last loaded language into later tests.
    """
    saved_strings = i18n._strings
    saved_lang = i18n._current_lang
    i18n._entity_cache.clear()
    yield
    i18n._entity_cache.clear()
    i18n._strings = saved_strings
    i18n._current_lang = saved_lang
|
||||||
|
|
||||||
|
|
||||||
|
def test_canonical_lang_lowercase_passthrough():
    """Codes already spelled like their locale file come back unchanged."""
    for code in ("en", "pt-br"):
        assert _canonical_lang(code) == code
|
||||||
|
|
||||||
|
|
||||||
|
def test_canonical_lang_uppercase_resolves():
    """Any casing of a known code maps to the stem as spelled on disk."""
    expected_by_input = {
        "PT-BR": "pt-br",
        "ZH-CN": "zh-CN",
        "zh-cn": "zh-CN",
        "Pt-Br": "pt-br",
    }
    for given, expected in expected_by_input.items():
        assert _canonical_lang(given) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_canonical_lang_unknown_returns_none():
    """A code with no locale file, and the empty string, resolve to None."""
    unresolved = [_canonical_lang(code) for code in ("xx", "")]
    assert unresolved == [None, None]
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_lang_case_insensitive():
    """`load_lang('PT-BR')` must load the pt-br dictionary, not English."""
    if "pt-br" not in available_languages():
        # Report the missing locale instead of passing without checking.
        pytest.skip("pt-br locale file is not available")
    pt_lower = load_lang("pt-br")
    pt_upper = load_lang("PT-BR")
    assert pt_lower == pt_upper, "case should not change the loaded dict"
    # A silent English fallback would record "en" as the active language,
    # so check the recorded code rather than comparing dictionaries.
    assert i18n.current_lang() == "pt-br"
|
||||||
|
|
||||||
|
|
||||||
|
def test_entity_section_loads_for_uppercase_input():
    """`_load_entity_section('PT-BR')` must read pt-br.json, not return {}."""
    pt_lower = _load_entity_section("pt-br")
    if not pt_lower:
        # With nothing to compare, `{} == {}` would pass even when the
        # upper-case lookup misses the file.
        pytest.skip("pt-br has no entity section to compare against")
    pt_upper = _load_entity_section("PT-BR")
    assert pt_upper == pt_lower
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_entity_patterns_case_insensitive():
    """Lower- and upper-case spellings of a code yield equal patterns."""
    by_case = [get_entity_patterns((code,)) for code in ("pt-br", "PT-BR")]
    assert by_case[0] == by_case[1]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_entity_patterns_shares_cache_across_cases():
    """Re-requesting a language in another casing must reuse its cache entry."""
    get_entity_patterns(("zh-CN",))
    entries_before = len(i18n._entity_cache)
    for variant in ("ZH-CN", "zh-cn"):
        get_entity_patterns((variant,))
    entries_after = len(i18n._entity_cache)
    assert (
        entries_after == entries_before
    ), "different casings of the same language must not create new cache entries"
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_language_still_falls_back_to_english():
    """An unmatched code yields the same candidate patterns as English."""
    fallback = get_entity_patterns(("xx-yy",))
    english = get_entity_patterns(("en",))
    field = "candidate_patterns"
    assert fallback[field] == english[field]
|
||||||
Reference in New Issue
Block a user