Merge pull request #931 from mvalentsev/fix/i18n-entity-metadata
fix: use i18n candidate patterns for entity extraction in miner and palace
This commit is contained in:
@@ -656,7 +656,9 @@ class EntityRegistry:
|
|||||||
Find capitalized words in query that aren't in registry or common words.
|
Find capitalized words in query that aren't in registry or common words.
|
||||||
These are candidates for Wikipedia research.
|
These are candidates for Wikipedia research.
|
||||||
"""
|
"""
|
||||||
candidates = re.findall(r"\b[A-Z][a-z]{2,15}\b", query)
|
from .palace import _candidate_entity_words
|
||||||
|
|
||||||
|
candidates = _candidate_entity_words(query)
|
||||||
unknown = []
|
unknown = []
|
||||||
for word in set(candidates):
|
for word in set(candidates):
|
||||||
if word.lower() in COMMON_ENGLISH_WORDS:
|
if word.lower() in COMMON_ENGLISH_WORDS:
|
||||||
|
|||||||
+3
-1
@@ -513,8 +513,10 @@ def _extract_entities_for_metadata(content: str) -> str:
|
|||||||
if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
|
if re.search(r"(?<!\w)" + re.escape(name) + r"(?!\w)", content):
|
||||||
matched.add(name)
|
matched.add(name)
|
||||||
|
|
||||||
|
from .palace import _candidate_entity_words
|
||||||
|
|
||||||
window = content[:_ENTITY_EXTRACT_WINDOW]
|
window = content[:_ENTITY_EXTRACT_WINDOW]
|
||||||
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
|
words = _candidate_entity_words(window)
|
||||||
freq: dict = {}
|
freq: dict = {}
|
||||||
for w in words:
|
for w in words:
|
||||||
if w in _ENTITY_STOPLIST:
|
if w in _ENTITY_STOPLIST:
|
||||||
|
|||||||
+33
-3
@@ -7,6 +7,7 @@ Consolidates collection access patterns used by both miners and the MCP server.
|
|||||||
import contextlib
|
import contextlib
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
from .backends.chroma import ChromaBackend
|
from .backends.chroma import ChromaBackend
|
||||||
|
|
||||||
@@ -130,6 +131,35 @@ _ENTITY_STOPLIST = frozenset(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Compiled candidate regexes, built lazily on the first call to
# _candidate_entity_words() and reused for every later call. Set this back to
# None to force the patterns to be reloaded.
_CANDIDATE_RX_CACHE = None


def _candidate_entity_words(text: str) -> list:
    """Find entity candidate words using i18n-aware patterns.

    Uses the same candidate_patterns as entity_detector (loaded from locale
    JSON files via get_entity_patterns), so non-Latin names (Cyrillic,
    accented Latin, etc.) are detected alongside ASCII names.

    Args:
        text: The text to scan for candidate words.

    Returns:
        A list with the ``findall`` results of every compiled candidate
        pattern, concatenated in pattern order. Repeated occurrences are
        kept (callers count frequencies), and a word matched by more than
        one pattern appears once per matching pattern.
    """
    global _CANDIDATE_RX_CACHE
    if _CANDIDATE_RX_CACHE is None:
        import logging

        from .config import MempalaceConfig
        from .i18n import get_entity_patterns

        patterns = get_entity_patterns(MempalaceConfig().entity_languages)
        compiled = []
        for pat in patterns["candidate_patterns"]:
            try:
                compiled.append(re.compile(pat))
            except re.error as exc:
                # Best effort: one malformed pattern must not disable
                # extraction for the remaining ones, but leave a trace so
                # the bad pattern can be found and fixed.
                logging.getLogger(__name__).warning(
                    "Skipping invalid entity candidate pattern %r: %s", pat, exc
                )
        _CANDIDATE_RX_CACHE = compiled

    # NOTE(review): re.findall yields tuples for patterns with more than one
    # capture group; presumably every candidate pattern has at most one.
    words = []
    for rx in _CANDIDATE_RX_CACHE:
        words.extend(rx.findall(text))
    return words
|
||||||
|
|
||||||
|
|
||||||
def build_closet_lines(source_file, drawer_ids, content, wing, room):
|
def build_closet_lines(source_file, drawer_ids, content, wing, room):
|
||||||
"""Build compact closet pointer lines from drawer content.
|
"""Build compact closet pointer lines from drawer content.
|
||||||
|
|
||||||
@@ -144,9 +174,9 @@ def build_closet_lines(source_file, drawer_ids, content, wing, room):
|
|||||||
drawer_ref = ",".join(drawer_ids[:3])
|
drawer_ref = ",".join(drawer_ids[:3])
|
||||||
window = content[:CLOSET_EXTRACT_WINDOW]
|
window = content[:CLOSET_EXTRACT_WINDOW]
|
||||||
|
|
||||||
# Extract proper nouns (capitalized words, 2+ occurrences). Filter out
|
# Extract proper nouns (2+ occurrences). Uses i18n-aware patterns so
|
||||||
# common sentence-starters that aren't real entities.
|
# non-Latin names (Cyrillic, accented Latin, etc.) are also detected.
|
||||||
words = re.findall(r"\b[A-Z][a-z]{2,}\b", window)
|
words = _candidate_entity_words(window)
|
||||||
word_freq = {}
|
word_freq = {}
|
||||||
for w in words:
|
for w in words:
|
||||||
if w in _ENTITY_STOPLIST:
|
if w in _ENTITY_STOPLIST:
|
||||||
|
|||||||
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
|
|||||||
shutil.rmtree(tmpdir)
|
shutil.rmtree(tmpdir)
|
||||||
|
|
||||||
|
|
||||||
|
def test_entity_metadata_finds_cyrillic_names(monkeypatch):
    """Entity extraction must find non-Latin names when entity_languages includes the locale."""
    import mempalace.palace as palace_mod
    from mempalace.miner import _extract_entities_for_metadata

    # Reset cached patterns so they reload with the monkeypatched languages.
    # monkeypatch restores the previous cache value when the test finishes.
    monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None)
    # Replace the config property so both "en" and "ru" are reported as the
    # configured entity languages for the duration of this test.
    monkeypatch.setattr(
        "mempalace.config.MempalaceConfig.entity_languages",
        property(lambda self: ("en", "ru")),
    )

    # The Cyrillic name appears three times in the sample text.
    content = "Михаил написал код. Михаил отправил PR. Михаил получил ревью."
    result = _extract_entities_for_metadata(content)
    assert "Михаил" in result, f"Cyrillic name not found in entity metadata: {result!r}"
|
||||||
|
|
||||||
|
|
||||||
def test_file_already_mined_check_mtime():
|
def test_file_already_mined_check_mtime():
|
||||||
tmpdir = tempfile.mkdtemp()
|
tmpdir = tempfile.mkdtemp()
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user