Merge pull request #931 from mvalentsev/fix/i18n-entity-metadata
fix: use i18n candidate patterns for entity extraction in miner and palace
This commit is contained in:
@@ -656,7 +656,9 @@ class EntityRegistry:
|
||||
Find capitalized words in query that aren't in registry or common words.
|
||||
These are candidates for Wikipedia research.
|
||||
"""
|
||||
candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query)
|
||||
from .palace import _candidate_entity_words
|
||||
|
||||
candidates = _candidate_entity_words(query)
|
||||
unknown = []
|
||||
for word in set(candidates):
|
||||
if word.lower() in COMMON_ENGLISH_WORDS:
|
||||
|
||||
+3
-1
@@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str:
|
||||
if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
|
||||
matched.add(name)
|
||||
|
||||
from .palace import _candidate_entity_words
|
||||
|
||||
window = content[:_ENTITY_EXTRACT_WINDOW]
|
||||
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
|
||||
words = _candidate_entity_words(window)
|
||||
freq: dict = {}
|
||||
for w in words:
|
||||
if w in _ENTITY_STOPLIST:
|
||||
|
||||
+33
-3
@@ -7,6 +7,7 @@ Consolidates collection access patterns used by both miners and the MCP server.
|
||||
import contextlib
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
|
||||
from .backends.chroma import ChromaBackend
|
||||
|
||||
@@ -130,6 +131,35 @@ _ENTITY_STOPLIST = frozenset(
|
||||
)
|
||||
|
||||
|
||||
# Compiled candidate regexes. Built lazily on the first call to
# _candidate_entity_words() and reused afterwards; set back to None to force
# the patterns to be reloaded (e.g. after the configured languages change).
_CANDIDATE_RX_CACHE = None


def _candidate_entity_words(text: str) -> list:
    """Find entity candidate words using i18n-aware patterns.

    Uses the same candidate_patterns as entity_detector (loaded from locale
    JSON files via get_entity_patterns), so non-Latin names (Cyrillic,
    accented Latin, etc.) are detected alongside ASCII names.

    Args:
        text: Text to scan for candidate words.

    Returns:
        The ``re.findall`` results of every compiled candidate pattern,
        concatenated in pattern order (all matches of the first pattern,
        then all matches of the second, and so on). Duplicates are kept so
        callers can count occurrences.
    """
    global _CANDIDATE_RX_CACHE
    if _CANDIDATE_RX_CACHE is None:
        # Imported lazily, on first use only, rather than at module load.
        from .config import MempalaceConfig
        from .i18n import get_entity_patterns

        patterns = get_entity_patterns(MempalaceConfig().entity_languages)
        compiled = []
        for pat in patterns["candidate_patterns"]:
            try:
                compiled.append(re.compile(pat))
            except re.error as exc:
                # Best effort: one malformed locale pattern must not disable
                # extraction for the others, but it should not vanish
                # silently either, so leave a trace for whoever maintains
                # the locale data.
                import logging

                logging.getLogger(__name__).warning(
                    "Skipping invalid entity candidate pattern %r: %s", pat, exc
                )
        _CANDIDATE_RX_CACHE = compiled

    words = []
    for rx in _CANDIDATE_RX_CACHE:
        words.extend(rx.findall(text))
    return words
|
||||
|
||||
|
||||
def build_closet_lines(source_file, drawer_ids, content, wing, room):
|
||||
"""Build compact closet pointer lines from drawer content.
|
||||
|
||||
@@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room):
|
||||
drawer_ref = ",".join(drawer_ids[:3])
|
||||
window = content[:CLOSET_EXTRACT_WINDOW]
|
||||
|
||||
# Extract proper nouns (capitalized words, 2+ occurrences). Filter out
|
||||
# common sentence-starters that aren't real entities.
|
||||
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
|
||||
# Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so
|
||||
# non-Latin names (Cyrillic, accented Latin, etc.) are also detected.
|
||||
words = _candidate_entity_words(window)
|
||||
word_freq = {}
|
||||
for w in words:
|
||||
if w in _ENTITY_STOPLIST:
|
||||
|
||||
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
|
||||
shutil.rmtree(tmpdir)
|
||||
|
||||
|
||||
def test_entity_metadata_finds_cyrillic_names(monkeypatch):
    """Cyrillic names must show up in entity metadata once "ru" is an enabled language."""
    import mempalace.palace as palace_mod
    from mempalace.miner import _extract_entities_for_metadata

    # Pretend the user configured English + Russian entity detection.
    enabled_languages = property(lambda self: ("en", "ru"))
    monkeypatch.setattr(
        "mempalace.config.MempalaceConfig.entity_languages",
        enabled_languages,
    )
    # Drop any previously compiled patterns so the next extraction call
    # rebuilds them from the patched language list.
    monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None)

    # The same Cyrillic name appears three times in the sample text.
    sentences = (
        "Михаил написал код.",
        "Михаил отправил PR.",
        "Михаил получил ревью.",
    )
    result = _extract_entities_for_metadata(" ".join(sentences))

    assert "Михаил" in result, f"Cyrillic name not found in entity metadata: {result!r}"
|
||||
|
||||
|
||||
def test_file_already_mined_check_mtime():
|
||||
tmpdir = tempfile.mkdtemp()
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user