Merge pull request #931 from mvalentsev/fix/i18n-entity-metadata

fix: use i18n candidate patterns for entity extraction in miner and palace
2026-04-16 15:54:01 -03:00
parent c5e249bba8 cde0f5b9e7
commit 55a004fe1e
4 changed files with 56 additions and 5 deletions
@@ -656,7 +656,9 @@ class EntityRegistry:
        Find capitalized words in query that aren't in registry or common words.
        These are candidates for Wikipedia research.
        """
-        candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query)
+        from .palace import _candidate_entity_words
        candidates = _candidate_entity_words(query)
        unknown = []
        for word in set(candidates):
            if word.lower() in COMMON_ENGLISH_WORDS:
@@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str:
        if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
            matched.add(name)
    from .palace import _candidate_entity_words
    window = content[:_ENTITY_EXTRACT_WINDOW]
-    words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
+    words = _candidate_entity_words(window)
    freq: dict = {}
    for w in words:
        if w in _ENTITY_STOPLIST:
@@ -7,6 +7,7 @@ Consolidates collection access patterns used by both miners and the MCP server.
 import contextlib
 import hashlib
 import os
 import re
 from .backends.chroma import ChromaBackend
@@ -130,6 +131,35 @@ _ENTITY_STOPLIST = frozenset(
 )
 _CANDIDATE_RX_CACHE = None
 def _candidate_entity_words(text: str) -> list:
    """Return every entity-candidate match found in *text*.
    The regexes are taken from the ``candidate_patterns`` entry that
    ``get_entity_patterns`` returns for the configured ``entity_languages``
    (the same i18n-aware patterns used for entity detection, so names
    outside plain ASCII can match as well). They are compiled on the first
    call and kept in the module-level ``_CANDIDATE_RX_CACHE``.
    Matches are returned grouped by pattern, in pattern order, and are not
    de-duplicated.
    """
    global _CANDIDATE_RX_CACHE
    if _CANDIDATE_RX_CACHE is None:
        # Imported lazily, on the first call only.
        from .config import MempalaceConfig
        from .i18n import get_entity_patterns
        languages = MempalaceConfig().entity_languages
        compiled = []
        for source in get_entity_patterns(languages)["candidate_patterns"]:
            try:
                candidate_rx = re.compile(source)
            except re.error:
                # Best effort: a malformed pattern is skipped, not fatal.
                continue
            compiled.append(candidate_rx)
        _CANDIDATE_RX_CACHE = compiled
    return [hit for rx in _CANDIDATE_RX_CACHE for hit in rx.findall(text)]
 def build_closet_lines(source_file, drawer_ids, content, wing, room):
    """Build compact closet pointer lines from drawer content.
@@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room):
    drawer_ref = ",".join(drawer_ids[:3])
    window = content[:CLOSET_EXTRACT_WINDOW]
-    # Extract proper nouns (capitalized words, 2+ occurrences). Filter out
+    # Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so
-    # common sentence-starters that aren't real entities.
+    # non-Latin names (Cyrillic, accented Latin, etc.) are also detected.
-    words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
+    words = _candidate_entity_words(window)
    word_freq = {}
    for w in words:
        if w in _ENTITY_STOPLIST:
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
        shutil.rmtree(tmpdir)
 def test_entity_metadata_finds_cyrillic_names(monkeypatch):
    """A Cyrillic name must show up in entity metadata when "ru" is a configured entity language."""
    import mempalace.palace as palace_mod
    from mempalace.miner import _extract_entities_for_metadata
    def _en_and_ru(self):
        return ("en", "ru")
    # Clear the compiled-pattern cache so the next extraction rebuilds it
    # under the patched language setting.
    monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None)
    monkeypatch.setattr(
        "mempalace.config.MempalaceConfig.entity_languages",
        property(_en_and_ru),
    )
    # The same Cyrillic name appears three times in the sample text.
    sample = "Михаил написал код. Михаил отправил PR. Михаил получил ревью."
    metadata = _extract_entities_for_metadata(sample)
    assert "Михаил" in metadata, f"Cyrillic name not found in entity metadata: {metadata!r}"
 def test_file_already_mined_check_mtime():
    tmpdir = tempfile.mkdtemp()
    try: