Merge pull request #931 from mvalentsev/fix/i18n-entity-metadata

fix: use i18n candidate patterns for entity extraction in miner and palace
2026-04-16 15:54:01 -03:00
parent c5e249bba8 cde0f5b9e7
commit 55a004fe1e
4 changed files with 56 additions and 5 deletions
@@ -656,7 +656,9 @@ class EntityRegistry:
        Find capitalized words in query that aren't in registry or common words.
        These are candidates for Wikipedia research.
        """
-        candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query)
+        from .palace import _candidate_entity_words
+
+        candidates = _candidate_entity_words(query)
        unknown = []
        for word in set(candidates):
            if word.lower() in COMMON_ENGLISH_WORDS:
@@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str:
        if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
            matched.add(name)

+    from .palace import _candidate_entity_words
+
    window = content[:_ENTITY_EXTRACT_WINDOW]
-    words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
+    words = _candidate_entity_words(window)
    freq: dict = {}
    for w in words:
        if w in _ENTITY_STOPLIST:
@@ -7,6 +7,7 @@ Consolidates collection access patterns used by both miners and the MCP server.
 import contextlib
 import hashlib
 import os
+import re

 from .backends.chroma import ChromaBackend

@@ -130,6 +131,35 @@ _ENTITY_STOPLIST = frozenset(
 )


+_CANDIDATE_RX_CACHE = None
+
+
+def _candidate_entity_words(text: str) -> list:
+    """Collect entity-candidate words from ``text`` using the i18n candidate patterns.
+
+    Patterns are read from ``get_entity_patterns(MempalaceConfig().entity_languages)``
+    (intended to cover non-Latin names such as Cyrillic or accented Latin) and are
+    compiled once into ``_CANDIDATE_RX_CACHE``; set that to ``None`` to force a reload.
+    """
+    global _CANDIDATE_RX_CACHE
+    if _CANDIDATE_RX_CACHE is None:  # first call (or cache reset): build the regex list
+        from .config import MempalaceConfig  # lazy import; presumably avoids a cycle - confirm
+        from .i18n import get_entity_patterns
+
+        patterns = get_entity_patterns(MempalaceConfig().entity_languages)
+        rxs = []
+        for pat in patterns["candidate_patterns"]:
+            try:
+                rxs.append(re.compile(pat))
+            except re.error:  # pattern does not compile: skip it silently, keep the rest
+                continue
+        _CANDIDATE_RX_CACHE = rxs
+    words = []
+    for rx in _CANDIDATE_RX_CACHE:
+        words.extend(rx.findall(text))  # one entry per match; no de-dup across patterns
+    return words
+
+
 def build_closet_lines(source_file, drawer_ids, content, wing, room):
    """Build compact closet pointer lines from drawer content.

@@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room):
    drawer_ref = ",".join(drawer_ids[:3])
    window = content[:CLOSET_EXTRACT_WINDOW]

-    # Extract proper nouns (capitalized words, 2+ occurrences). Filter out
-    # common sentence-starters that aren't real entities.
-    words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
+    # Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so
+    # non-Latin names (Cyrillic, accented Latin, etc.) are also detected.
+    words = _candidate_entity_words(window)
    word_freq = {}
    for w in words:
        if w in _ENTITY_STOPLIST:
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
        shutil.rmtree(tmpdir)


+def test_entity_metadata_finds_cyrillic_names(monkeypatch):
+    """Entity metadata must include a Cyrillic name when "ru" is among entity_languages."""
+    import mempalace.palace as palace_mod
+    from mempalace.miner import _extract_entities_for_metadata
+
+    # Clear the compiled-pattern cache so the next call rebuilds it from the patched config.
+    monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None)
+    monkeypatch.setattr(
+        "mempalace.config.MempalaceConfig.entity_languages",
+        property(lambda self: ("en", "ru")),  # every config instance reports English + Russian
+    )
+
+    content = "Михаил написал код. Михаил отправил PR. Михаил получил ревью."  # name appears 3x
+    result = _extract_entities_for_metadata(content)
+    assert "Михаил" in result, f"Cyrillic name not found in entity metadata: {result!r}"
+
+
 def test_file_already_mined_check_mtime():
    tmpdir = tempfile.mkdtemp()
    try: