fix: use i18n candidate patterns for entity extraction in miner and palace

entity_detector.py was refactored in #911 to load candidate patterns
from i18n locale JSON files, supporting non-Latin scripts (Cyrillic,
accented Latin, etc.). But three other code paths still hardcoded the
ASCII-only regex [A-Z][a-z]{2,}, silently missing non-Latin entity
names in metadata tagging, closet indexing, and registry lookups.

Replace the hardcoded regex with a shared _candidate_entity_words()
helper that reuses the same i18n candidate_patterns as entity_detector.
This commit is contained in:
mvalentsev
2026-04-16 05:23:33 +05:00
parent d4c942417a
commit 8bf940f861
4 changed files with 56 additions and 5 deletions
+3 -1
View File
@@ -656,7 +656,9 @@ class EntityRegistry:
Find capitalized words in query that aren't in registry or common words. Find capitalized words in query that aren't in registry or common words.
These are candidates for Wikipedia research. These are candidates for Wikipedia research.
""" """
candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query) from .palace import _candidate_entity_words
candidates = _candidate_entity_words(query)
unknown = [] unknown = []
for word in set(candidates): for word in set(candidates):
if word.lower() in COMMON_ENGLISH_WORDS: if word.lower() in COMMON_ENGLISH_WORDS:
+3 -1
View File
@@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str:
if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content): if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
matched.add(name) matched.add(name)
from .palace import _candidate_entity_words
window = content[:_ENTITY_EXTRACT_WINDOW] window = content[:_ENTITY_EXTRACT_WINDOW]
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window) words = _candidate_entity_words(window)
freq: dict = {} freq: dict = {}
for w in words: for w in words:
if w in _ENTITY_STOPLIST: if w in _ENTITY_STOPLIST:
+33 -3
View File
@@ -7,6 +7,7 @@ Consolidates collection access patterns used by both miners and the MCP server.
import contextlib import contextlib
import hashlib import hashlib
import os import os
import re
from .backends.chroma import ChromaBackend from .backends.chroma import ChromaBackend
@@ -130,6 +131,35 @@ _ENTITY_STOPLIST = frozenset(
) )
# Compiled candidate regexes, built on the first call to
# _candidate_entity_words() and reused afterwards. Reset to None to force a
# reload (e.g. after the configured entity languages change).
_CANDIDATE_RX_CACHE = None


def _candidate_entity_words(text: str) -> list:
    """Find entity candidate words using i18n-aware patterns.

    Uses the same candidate_patterns as entity_detector (loaded from locale
    JSON files via get_entity_patterns), so non-Latin names (Cyrillic,
    accented Latin, etc.) are detected alongside ASCII names.

    Args:
        text: Text to scan for candidate entity names.

    Returns:
        A list of matched strings, grouped by pattern (in configured pattern
        order) and in text order within each pattern. Repeated mentions at
        different positions are all kept so callers can count frequency; a
        single mention that several patterns match at the same position is
        returned only once.
    """
    global _CANDIDATE_RX_CACHE
    if _CANDIDATE_RX_CACHE is None:
        # Resolved on first use only; the compiled list is then reused until
        # the cache is reset to None.
        from .config import MempalaceConfig
        from .i18n import get_entity_patterns

        patterns = get_entity_patterns(MempalaceConfig().entity_languages)
        rxs = []
        for raw_pat in patterns["candidate_patterns"]:
            try:
                rxs.append(re.compile(rf"\b({raw_pat})\b"))
            except re.error:
                # Best effort: a malformed locale pattern is skipped rather
                # than breaking extraction for every other language.
                continue
        _CANDIDATE_RX_CACHE = rxs

    words = []
    seen_spans = set()
    for rx in _CANDIDATE_RX_CACHE:
        # finditer + group(1) always yields the whole wrapped match as a
        # string; findall would return tuples if a locale pattern brought its
        # own capturing group.
        for match in rx.finditer(text):
            span = match.span(1)
            # Overlapping locale patterns (e.g. ASCII and accented Latin) can
            # match the very same mention; count it once so frequency
            # thresholds in callers are not inflated.
            if span in seen_spans:
                continue
            seen_spans.add(span)
            words.append(match.group(1))
    return words
def build_closet_lines(source_file, drawer_ids, content, wing, room): def build_closet_lines(source_file, drawer_ids, content, wing, room):
"""Build compact closet pointer lines from drawer content. """Build compact closet pointer lines from drawer content.
@@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room):
drawer_ref = ",".join(drawer_ids[:3]) drawer_ref = ",".join(drawer_ids[:3])
window = content[:CLOSET_EXTRACT_WINDOW] window = content[:CLOSET_EXTRACT_WINDOW]
# Extract proper nouns (capitalized words, 2+ occurrences). Filter out # Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so
# common sentence-starters that aren't real entities. # non-Latin names (Cyrillic, accented Latin, etc.) are also detected.
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window) words = _candidate_entity_words(window)
word_freq = {} word_freq = {}
for w in words: for w in words:
if w in _ENTITY_STOPLIST: if w in _ENTITY_STOPLIST:
+17
View File
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
shutil.rmtree(tmpdir) shutil.rmtree(tmpdir)
def test_entity_metadata_finds_cyrillic_names(monkeypatch):
    """Metadata entity extraction reports a Cyrillic name when "ru" is enabled.

    The configured entity languages are forced to ("en", "ru") and the
    compiled-pattern cache in mempalace.palace is cleared, so the extractor
    rebuilds its patterns from the patched configuration.
    """
    import mempalace.palace as palace
    from mempalace.miner import _extract_entities_for_metadata

    # Make every MempalaceConfig instance report English + Russian.
    monkeypatch.setattr(
        "mempalace.config.MempalaceConfig.entity_languages",
        property(lambda self: ("en", "ru")),
    )
    # Discard any previously compiled patterns so they are reloaded with the
    # patched languages; monkeypatch restores the old value after the test.
    monkeypatch.setattr(palace, "_CANDIDATE_RX_CACHE", None)

    # The same name appears three times in the sample text.
    result = _extract_entities_for_metadata(
        "Михаил написал код. Михаил отправил PR. Михаил получил ревью."
    )

    assert "Михаил" in result, f"Cyrillic name not found in entity metadata: {result!r}"
def test_file_already_mined_check_mtime(): def test_file_already_mined_check_mtime():
tmpdir = tempfile.mkdtemp() tmpdir = tempfile.mkdtemp()
try: try: