fix: use i18n candidate patterns for entity extraction in miner and palace

entity_detector.py was refactored in #911 to load candidate patterns
from i18n locale JSON files, supporting non-Latin scripts (Cyrillic,
accented Latin, etc.). But three other code paths still hardcoded the
ASCII-only regex [A-Z][a-z]{2,}, silently missing non-Latin entity
names in metadata tagging, closet indexing, and registry lookups.

Replace the hardcoded regex with a shared _candidate_entity_words()
helper that reuses the same i18n candidate_patterns as entity_detector.
This commit is contained in:
mvalentsev
2026-04-16 05:23:33 +05:00
parent d4c942417a
commit 8bf940f861
4 changed files with 56 additions and 5 deletions
+3 -1
View File
@@ -656,7 +656,9 @@ class EntityRegistry:
Find capitalized words in query that aren't in registry or common words.
These are candidates for Wikipedia research.
"""
candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query)
from .palace import _candidate_entity_words
candidates = _candidate_entity_words(query)
unknown = []
for word in set(candidates):
if word.lower() in COMMON_ENGLISH_WORDS:
+3 -1
View File
@@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str:
if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
matched.add(name)
from .palace import _candidate_entity_words
window = content[:_ENTITY_EXTRACT_WINDOW]
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
words = _candidate_entity_words(window)
freq: dict = {}
for w in words:
if w in _ENTITY_STOPLIST:
+33 -3
View File
@@ -7,6 +7,7 @@ Consolidates collection access patterns used by both miners and the MCP server.
import contextlib
import hashlib
import os
import re
from .backends.chroma import ChromaBackend
@@ -130,6 +131,35 @@ _ENTITY_STOPLIST = frozenset(
)
# Compiled candidate regexes, built on the first call to
# _candidate_entity_words(). Set back to None to force a reload (e.g. in
# tests that change the configured entity languages).
_CANDIDATE_RX_CACHE = None


def _candidate_entity_words(text: str) -> list:
    """Find entity candidate words using i18n-aware patterns.

    Uses the same candidate_patterns as entity_detector (loaded from locale
    JSON files via get_entity_patterns), so non-Latin names (Cyrillic,
    accented Latin, etc.) are detected alongside ASCII names.

    Args:
        text: Text to scan for candidate entity words.

    Returns:
        A list of matched words as plain strings, ordered pattern by pattern
        (every match of the first pattern, then the second, and so on).
        Repeated words are kept so callers can count occurrences.
    """
    global _CANDIDATE_RX_CACHE
    if _CANDIDATE_RX_CACHE is None:
        # These imports sit under the cache check, so they only run on the
        # first call; the configured languages are read once and then reused.
        from .config import MempalaceConfig
        from .i18n import get_entity_patterns

        patterns = get_entity_patterns(MempalaceConfig().entity_languages)
        rxs = []
        for raw_pat in patterns["candidate_patterns"]:
            try:
                rxs.append(re.compile(rf"\b({raw_pat})\b"))
            except re.error:
                # Best effort: one malformed locale pattern must not disable
                # extraction for the remaining patterns.
                continue
        _CANDIDATE_RX_CACHE = rxs

    # group(1) is the wrapping group added at compile time. re.findall would
    # return tuples rather than strings for any locale pattern that contains
    # its own capturing group, which would break stoplist checks and
    # frequency counting in the callers.
    return [
        match.group(1)
        for rx in _CANDIDATE_RX_CACHE
        for match in rx.finditer(text)
    ]
def build_closet_lines(source_file, drawer_ids, content, wing, room):
"""Build compact closet pointer lines from drawer content.
@@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room):
drawer_ref = ",".join(drawer_ids[:3])
window = content[:CLOSET_EXTRACT_WINDOW]
# Extract proper nouns (capitalized words, 2+ occurrences). Filter out
# common sentence-starters that aren't real entities.
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
# Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so
# non-Latin names (Cyrillic, accented Latin, etc.) are also detected.
words = _candidate_entity_words(window)
word_freq = {}
for w in words:
if w in _ENTITY_STOPLIST:
+17
View File
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
shutil.rmtree(tmpdir)
def test_entity_metadata_finds_cyrillic_names(monkeypatch):
    """A Cyrillic name repeated in the content must show up in entity metadata.

    The configured entity languages are patched to English plus Russian and
    the compiled-pattern cache is cleared before extraction runs.
    """
    import mempalace.palace as palace_mod
    from mempalace.miner import _extract_entities_for_metadata

    def _patched_languages(self):
        return ("en", "ru")

    # Make MempalaceConfig report both locales for the duration of the test.
    monkeypatch.setattr(
        "mempalace.config.MempalaceConfig.entity_languages",
        property(_patched_languages),
    )
    # Clear the cached regexes so they are rebuilt from the patched languages.
    monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None)

    result = _extract_entities_for_metadata(
        "Михаил написал код. Михаил отправил PR. Михаил получил ревью."
    )

    assert "Михаил" in result, f"Cyrillic name not found in entity metadata: {result!r}"
def test_file_already_mined_check_mtime():
tmpdir = tempfile.mkdtemp()
try: