Merge pull request #931 from mvalentsev/fix/i18n-entity-metadata

fix: use i18n candidate patterns for entity extraction in miner and palace
This commit is contained in:
Igor Lins e Silva
2026-04-16 15:54:01 -03:00
committed by GitHub
4 changed files with 56 additions and 5 deletions
+3 -1
View File
@@ -656,7 +656,9 @@ class EntityRegistry:
Find capitalized words in query that aren't in registry or common words. Find capitalized words in query that aren't in registry or common words.
These are candidates for Wikipedia research. These are candidates for Wikipedia research.
""" """
candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query) from .palace import _candidate_entity_words
candidates = _candidate_entity_words(query)
unknown = [] unknown = []
for word in set(candidates): for word in set(candidates):
if word.lower() in COMMON_ENGLISH_WORDS: if word.lower() in COMMON_ENGLISH_WORDS:
+3 -1
View File
@@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str:
if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content): if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
matched.add(name) matched.add(name)
from .palace import _candidate_entity_words
window = content[:_ENTITY_EXTRACT_WINDOW] window = content[:_ENTITY_EXTRACT_WINDOW]
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window) words = _candidate_entity_words(window)
freq: dict = {} freq: dict = {}
for w in words: for w in words:
if w in _ENTITY_STOPLIST: if w in _ENTITY_STOPLIST:
+33 -3
View File
@@ -7,6 +7,7 @@ Consolidates collection access patterns used by both miners and the MCP server.
import contextlib import contextlib
import hashlib import hashlib
import os import os
import re
from .backends.chroma import ChromaBackend from .backends.chroma import ChromaBackend
@@ -130,6 +131,35 @@ _ENTITY_STOPLIST = frozenset(
) )
# Compiled candidate regexes, built on the first call to
# _candidate_entity_words and reused afterwards. Set back to None to force
# the patterns to be reloaded on the next call.
_CANDIDATE_RX_CACHE = None


def _compile_candidate_patterns(patterns) -> list:
    """Compile candidate pattern strings, skipping duplicates and invalid ones."""
    compiled = []
    # dict.fromkeys drops repeated pattern strings while keeping first-seen
    # order. Running the same pattern twice would report every match twice
    # and inflate the occurrence counts callers derive from the word list.
    for pat in dict.fromkeys(patterns):
        try:
            compiled.append(re.compile(pat))
        except re.error as exc:
            # Best-effort: one malformed pattern must not disable the others,
            # but the skip is logged so it does not go unnoticed.
            import logging

            logging.getLogger(__name__).warning(
                "Skipping invalid entity candidate pattern %r: %s", pat, exc
            )
    return compiled


def _candidate_entity_words(text: str) -> list:
    """Find entity candidate words using i18n-aware patterns.

    Uses the same candidate_patterns as entity_detector (loaded from locale
    JSON files via get_entity_patterns), so non-Latin names (Cyrillic,
    accented Latin, etc.) are detected alongside ASCII names.

    The patterns are compiled once, on the first call, from
    ``MempalaceConfig().entity_languages`` and cached in
    ``_CANDIDATE_RX_CACHE``. Identical pattern strings are compiled only
    once, and patterns that fail to compile are skipped with a warning.

    Args:
        text: Text to scan for candidate words.

    Returns:
        The ``findall`` results of every cached regex, concatenated in
        pattern order. Repeated occurrences of a word are kept, so callers
        can count frequencies.
    """
    global _CANDIDATE_RX_CACHE
    if _CANDIDATE_RX_CACHE is None:
        # Imported on first use, inside the function, as in the original.
        from .config import MempalaceConfig
        from .i18n import get_entity_patterns

        patterns = get_entity_patterns(MempalaceConfig().entity_languages)
        _CANDIDATE_RX_CACHE = _compile_candidate_patterns(
            patterns["candidate_patterns"]
        )

    words = []
    for rx in _CANDIDATE_RX_CACHE:
        words.extend(rx.findall(text))
    return words
def build_closet_lines(source_file, drawer_ids, content, wing, room): def build_closet_lines(source_file, drawer_ids, content, wing, room):
"""Build compact closet pointer lines from drawer content. """Build compact closet pointer lines from drawer content.
@@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room):
drawer_ref = ",".join(drawer_ids[:3]) drawer_ref = ",".join(drawer_ids[:3])
window = content[:CLOSET_EXTRACT_WINDOW] window = content[:CLOSET_EXTRACT_WINDOW]
# Extract proper nouns (capitalized words, 2+ occurrences). Filter out # Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so
# common sentence-starters that aren't real entities. # non-Latin names (Cyrillic, accented Latin, etc.) are also detected.
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window) words = _candidate_entity_words(window)
word_freq = {} word_freq = {}
for w in words: for w in words:
if w in _ENTITY_STOPLIST: if w in _ENTITY_STOPLIST:
+17
View File
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
shutil.rmtree(tmpdir) shutil.rmtree(tmpdir)
def test_entity_metadata_finds_cyrillic_names(monkeypatch):
    """A repeated Cyrillic name must show up in the extracted entity metadata.

    The test runs with ``entity_languages`` patched to ``("en", "ru")`` and
    with the compiled-pattern cache cleared, so the candidate patterns are
    rebuilt for those languages.
    """
    import mempalace.palace as palace_mod
    from mempalace.miner import _extract_entities_for_metadata

    def _patched_languages(self):
        return ("en", "ru")

    # Clear the cached regexes so the next extraction reloads them using the
    # patched language list below.
    monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None)
    monkeypatch.setattr(
        "mempalace.config.MempalaceConfig.entity_languages",
        property(_patched_languages),
    )

    # The name occurs three times in the sample text.
    content = "Михаил написал код. Михаил отправил PR. Михаил получил ревью."
    result = _extract_entities_for_metadata(content)

    assert "Михаил" in result, f"Cyrillic name not found in entity metadata: {result!r}"
def test_file_already_mined_check_mtime(): def test_file_already_mined_check_mtime():
tmpdir = tempfile.mkdtemp() tmpdir = tempfile.mkdtemp()
try: try: