From 8bf940f861b696f3c11c8edc2489293229605112 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Thu, 16 Apr 2026 05:23:33 +0500
Subject: [PATCH] fix: use i18n candidate patterns for entity extraction in
 miner and palace

entity_detector.py was refactored in #911 to load candidate patterns
from i18n locale JSON files, supporting non-Latin scripts (Cyrillic,
accented Latin, etc.). But three other code paths still hardcoded the
ASCII-only regex [A-Z][a-z]{2,}, silently missing non-Latin entity
names in metadata tagging, closet indexing, and registry lookups.

Replace the hardcoded regex with a shared _candidate_entity_words()
helper that reuses the same i18n candidate_patterns as entity_detector.
---
 mempalace/entity_registry.py |  4 +++-
 mempalace/miner.py           |  4 +++-
 mempalace/palace.py          | 36 +++++++++++++++++++++++++++++++++---
 tests/test_miner.py          | 17 +++++++++++++++++
 4 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/mempalace/entity_registry.py b/mempalace/entity_registry.py
index 6c37572..72d9937 100644
--- a/mempalace/entity_registry.py
+++ b/mempalace/entity_registry.py
@@ -656,7 +656,9 @@ class EntityRegistry:
         Find capitalized words in query that aren't in registry or common words.
         These are candidates for Wikipedia research.
         """
-        candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query)
+        from .palace import _candidate_entity_words
+
+        candidates = _candidate_entity_words(query)
         unknown = []
         for word in set(candidates):
             if word.lower() in COMMON_ENGLISH_WORDS:
diff --git a/mempalace/miner.py b/mempalace/miner.py
index 713c3b1..ed48cf1 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str:
         if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
             matched.add(name)
 
+    from .palace import _candidate_entity_words
+
     window = content[:_ENTITY_EXTRACT_WINDOW]
-    words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
+    words = _candidate_entity_words(window)
     freq: dict = {}
     for w in words:
         if w in _ENTITY_STOPLIST:
diff --git a/mempalace/palace.py b/mempalace/palace.py
index ea1c234..2a9fdd4 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -7,6 +7,7 @@ Consolidates collection access patterns used by both miners and the MCP server.
 import contextlib
 import hashlib
 import os
+import re
 
 from .backends.chroma import ChromaBackend
 
@@ -130,6 +131,35 @@ _ENTITY_STOPLIST = frozenset(
 )
 
 
+_CANDIDATE_RX_CACHE = None
+
+
+def _candidate_entity_words(text: str) -> list:
+    """Find entity candidate words using i18n-aware patterns.
+
+    Reuses entity_detector's candidate_patterns so non-Latin names are found;
+    a span matched by several locale patterns is returned only once.
+    """
+    global _CANDIDATE_RX_CACHE
+    if _CANDIDATE_RX_CACHE is None:  # compile once, on first use
+        from .config import MempalaceConfig
+        from .i18n import get_entity_patterns
+
+        patterns = get_entity_patterns(MempalaceConfig().entity_languages)
+        rxs = []
+        for raw_pat in patterns["candidate_patterns"]:
+            try:
+                rxs.append(re.compile(rf"\b({raw_pat})\b"))
+            except re.error:  # skip a malformed locale pattern instead of failing
+                continue
+        _CANDIDATE_RX_CACHE = rxs
+    spans = {}  # span -> word; overlapping locale patterns must not double-count
+    for rx in _CANDIDATE_RX_CACHE:
+        for m in rx.finditer(text):
+            spans.setdefault(m.span(1), m.group(1))  # group(1): always a str
+    return list(spans.values())
+
+
 def build_closet_lines(source_file, drawer_ids, content, wing, room):
     """Build compact closet pointer lines from drawer content.
 
@@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room):
     drawer_ref = ",".join(drawer_ids[:3])
     window = content[:CLOSET_EXTRACT_WINDOW]
 
-    # Extract proper nouns (capitalized words, 2+ occurrences). Filter out
-    # common sentence-starters that aren't real entities.
-    words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
+    # Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so
+    # non-Latin names (Cyrillic, accented Latin, etc.) are also detected.
+    words = _candidate_entity_words(window)
     word_freq = {}
     for w in words:
         if w in _ENTITY_STOPLIST:
diff --git a/tests/test_miner.py b/tests/test_miner.py
index e2a3a85..18f4e50 100644
--- a/tests/test_miner.py
+++ b/tests/test_miner.py
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
         shutil.rmtree(tmpdir)
 
 
+def test_entity_metadata_finds_cyrillic_names(monkeypatch):
+    """A repeated Cyrillic name is extracted when entity_languages includes 'ru'."""
+    from mempalace import miner, palace
+
+    # Pretend the user configured English + Russian entity detection.
+    languages = property(lambda self: ("en", "ru"))
+    monkeypatch.setattr("mempalace.config.MempalaceConfig.entity_languages", languages)
+    # Drop the compiled-pattern cache so it is rebuilt from the patched languages.
+    monkeypatch.setattr(palace, "_CANDIDATE_RX_CACHE", None)
+
+    # The name appears three times in the sample text.
+    text = "Михаил написал код. Михаил отправил PR. Михаил получил ревью."
+    tagged = miner._extract_entities_for_metadata(text)
+
+    assert "Михаил" in tagged, f"Cyrillic name not found in entity metadata: {tagged!r}"
+
+
 def test_file_already_mined_check_mtime():
     tmpdir = tempfile.mkdtemp()
     try: