fix: use i18n candidate patterns for entity extraction in miner and palace
entity_detector.py was refactored in #911 to load candidate patterns from i18n locale JSON files, supporting non-ASCII names (Cyrillic, accented Latin, etc.). But three other code paths still hardcoded the ASCII-only regex [A-Z][a-z]{2,}, silently missing non-ASCII entity names in metadata tagging, closet indexing, and registry lookups. Replace the hardcoded regex with a shared _candidate_entity_words() helper that reuses the same i18n candidate_patterns as entity_detector.
This commit is contained in:
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
|
||||
shutil.rmtree(tmpdir)
|
||||
|
||||
|
||||
def test_entity_metadata_finds_cyrillic_names(monkeypatch):
    """Check that metadata entity extraction returns a Cyrillic name.

    The configured entity languages are patched to ("en", "ru") before
    a short Russian text that repeats the name three times is extracted.
    """
    import mempalace.palace as palace_mod
    from mempalace.miner import _extract_entities_for_metadata

    def _patched_languages(self):
        # Stand-in getter for MempalaceConfig.entity_languages.
        return ("en", "ru")

    # Drop any cached candidate patterns so they are rebuilt under the
    # patched language list rather than reused from an earlier test.
    monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None)
    monkeypatch.setattr(
        "mempalace.config.MempalaceConfig.entity_languages",
        property(_patched_languages),
    )

    text = "Михаил написал код. Михаил отправил PR. Михаил получил ревью."
    found = _extract_entities_for_metadata(text)

    assert "Михаил" in found, f"Cyrillic name not found in entity metadata: {found!r}"
|
||||
|
||||
|
||||
def test_file_already_mined_check_mtime():
|
||||
tmpdir = tempfile.mkdtemp()
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user