fix: use i18n candidate patterns for entity extraction in miner and palace

entity_detector.py was refactored in #911 to load candidate patterns from i18n locale JSON files, which lets it recognise non-ASCII names (Cyrillic, accented Latin, etc.). However, three other code paths still hardcoded the ASCII-only regex [A-Z][a-z]{2,} and therefore silently missed non-ASCII entity names in metadata tagging, closet indexing, and registry lookups. This commit replaces the hardcoded regex with a shared _candidate_entity_words() helper that reuses the same i18n candidate_patterns as entity_detector.
2026-04-16 05:23:33 +05:00
parent d4c942417a
commit 8bf940f861
4 changed files with 56 additions and 5 deletions
@@ -224,6 +224,23 @@ def test_scan_project_skip_dirs_still_apply_without_override():
        shutil.rmtree(tmpdir)


+def test_entity_metadata_finds_cyrillic_names(monkeypatch):
+    """Check that _extract_entities_for_metadata returns a Cyrillic name when entity_languages is ("en", "ru")."""
+    import mempalace.palace as palace_mod
+    from mempalace.miner import _extract_entities_for_metadata
+
+    # Clear the pattern cache on palace_mod so the patterns are reloaded under the patched entity_languages below
+    monkeypatch.setattr(palace_mod, "_CANDIDATE_RX_CACHE", None)
+    monkeypatch.setattr(
+        "mempalace.config.MempalaceConfig.entity_languages",
+        property(lambda self: ("en", "ru")),  # stand-in property: every config instance reports these two languages
+    )
+
+    content = "Михаил написал код. Михаил отправил PR. Михаил получил ревью."  # name occurs three times; presumably repetition is needed to qualify — TODO confirm
+    result = _extract_entities_for_metadata(content)
+    assert "Михаил" in result, f"Cyrillic name not found in entity metadata: {result!r}"
+
+
 def test_file_already_mined_check_mtime():
    tmpdir = tempfile.mkdtemp()
    try: