From 8bf940f861b696f3c11c8edc2489293229605112 Mon Sep 17 00:00:00 2001 From: mvalentsev Date: Thu, 16 Apr 2026 05:23:33 +0500 Subject: [PATCH] fix: use i18n candidate patterns for entity extraction in miner and palace entity_detector.py was refactored in #911 to load candidate patterns from i18n locale JSON files, supporting non-Latin scripts (Cyrillic, accented Latin, etc.). But three other code paths still hardcoded the ASCII-only regex [A-Z][a-z]{2,}, silently missing non-Latin entity names in metadata tagging, closet indexing, and registry lookups. Replace the hardcoded regex with a shared _candidate_entity_words() helper that reuses the same i18n candidate_patterns as entity_detector. --- mempalace/entity_registry.py | 4 +++- mempalace/miner.py | 4 +++- mempalace/palace.py | 36 +++++++++++++++++++++++++++++++++--- tests/test_miner.py | 17 +++++++++++++++++ 4 files changed, 56 insertions(+), 5 deletions(-) diff --git a/mempalace/entity_registry.py b/mempalace/entity_registry.py index 6c37572..72d9937 100644 --- a/mempalace/entity_registry.py +++ b/mempalace/entity_registry.py @@ -656,7 +656,9 @@ class EntityRegistry: Find capitalized words in query that aren't in registry or common words. These are candidates for Wikipedia research. """ - candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query) + from .palace import _candidate_entity_words + + candidates = _candidate_entity_words(query) unknown = [] for word in set(candidates): if word.lower() in COMMON_ENGLISH_WORDS: diff --git a/mempalace/miner.py b/mempalace/miner.py index 713c3b1..ed48cf1 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str: if re.search(r"(? list: + """Find entity candidate words using i18n-aware patterns. + + Uses the same candidate_patterns as entity_detector (loaded from locale + JSON files via get_entity_patterns), so non-Latin names (Cyrillic, + accented Latin, etc.) are detected alongside ASCII names. + """ + global _CANDIDATE_RX_CACHE + if _CANDIDATE_RX_CACHE is None: + from .config import MempalaceConfig + from .i18n import get_entity_patterns + + patterns = get_entity_patterns(MempalaceConfig().entity_languages) + rxs = [] + for raw_pat in patterns["candidate_patterns"]: + try: + rxs.append(re.compile(rf"\b({raw_pat})\b")) + except re.error: + continue + _CANDIDATE_RX_CACHE = rxs + words = [] + for rx in _CANDIDATE_RX_CACHE: + words.extend(rx.findall(text)) + return words + + def build_closet_lines(source_file, drawer_ids, content, wing, room): """Build compact closet pointer lines from drawer content. @@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room): drawer_ref = ",".join(drawer_ids[:3]) window = content[:CLOSET_EXTRACT_WINDOW] - # Extract proper nouns (capitalized words, 2+ occurrences). Filter out - # common sentence-starters that aren't real entities. - words = re.findall(r"\b[A-Z][a-z]{2,}\b", window) + # Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so + # non-Latin names (Cyrillic, accented Latin, etc.) are also detected. + words = _candidate_entity_words(window) word_freq = {} for w in words: if w in _ENTITY_STOPLIST: diff --git a/tests/test_miner.py b/tests/test_miner.py index e2a3a85..18f4e50 100644 --- a/tests/test_miner.py +++ b/tests/test_miner.py @@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override(): shutil.rmtree(tmpdir) +def test_entity_metadata_finds_cyrillic_names(monkeypatch): + """Entity extraction must find non-Latin names when entity_languages includes the locale.""" + import mempalace.palace as palace_mod + from mempalace.miner import _extract_entities_for_metadata + + # Reset cached patterns so they reload with the monkeypatched languages + monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None) + monkeypatch.setattr( + "mempalace.config.MempalaceConfig.entity_languages", + property(lambda self: ("en", "ru")), + ) + + content = "Михаил написал код. Михаил отправил PR. Михаил получил ревью." + result = _extract_entities_for_metadata(content) + assert "Михаил" in result, f"Cyrillic name not found in entity metadata: {result!r}" + + def test_file_already_mined_check_mtime(): tmpdir = tempfile.mkdtemp() try: