fix(entity_detector): script-aware word boundaries for combining-mark scripts
Python's \b is a transition between \w and non-\w. Devanagari vowel signs (matras) like ा ी ु are Unicode combining marks (category Mc, Spacing Combining, or Mn, Nonspacing) — not \w. This means \b can split mid-word at a matra: names like अनीता (Anita) truncate to अनीत, and person-verb patterns like \bराज\s+ने\s+कहा\b never match because \b fails after the final matra of कहा. The same issue affects Arabic, Hebrew, Thai, Tamil, and every other script whose words contain combining marks. Fix: locales with combining-mark scripts declare a boundary_chars field in their entity section (e.g. "\\w\\u0900-\\u097F" for Hindi). The i18n loader replaces every \b in that locale's patterns with a script-aware lookaround that treats the declared characters as "inside-word", and pre-wraps candidate/multi_word patterns with the same boundary. Default behavior (no boundary_chars) keeps the standard \b — en, pt-br, ru, it are unchanged. Changes: - mempalace/i18n/__init__.py: add _script_boundary, _expand_b, _wrap_candidate, _collect_entity_section; candidate_patterns are now returned fully wrapped (boundary + capture group applied) - mempalace/entity_detector.py: extract_candidates compiles the pre-wrapped candidate patterns directly instead of re-wrapping them with \b - tests/test_entity_detector.py: 5 new tests for Devanagari boundaries (name extraction with/without boundary_chars, person-verb firing with/without boundary_chars, English regression)
This commit is contained in:
@@ -134,10 +134,10 @@ def extract_candidates(text: str, languages=("en",)) -> dict:
|
|||||||
|
|
||||||
counts: defaultdict = defaultdict(int)
|
counts: defaultdict = defaultdict(int)
|
||||||
|
|
||||||
# Single-word candidates — one pattern per language
|
# Single-word candidates — one pre-wrapped pattern per language
|
||||||
for raw_pat in patterns["candidate_patterns"]:
|
for wrapped_pat in patterns["candidate_patterns"]:
|
||||||
try:
|
try:
|
||||||
rx = re.compile(rf"\b({raw_pat})\b")
|
rx = re.compile(wrapped_pat)
|
||||||
except re.error:
|
except re.error:
|
||||||
continue
|
continue
|
||||||
for word in rx.findall(text):
|
for word in rx.findall(text):
|
||||||
@@ -147,10 +147,10 @@ def extract_candidates(text: str, languages=("en",)) -> dict:
|
|||||||
continue
|
continue
|
||||||
counts[word] += 1
|
counts[word] += 1
|
||||||
|
|
||||||
# Multi-word candidates — one pattern per language
|
# Multi-word candidates — one pre-wrapped pattern per language
|
||||||
for raw_pat in patterns["multi_word_patterns"]:
|
for wrapped_pat in patterns["multi_word_patterns"]:
|
||||||
try:
|
try:
|
||||||
rx = re.compile(rf"\b({raw_pat})\b")
|
rx = re.compile(wrapped_pat)
|
||||||
except re.error:
|
except re.error:
|
||||||
continue
|
continue
|
||||||
for phrase in rx.findall(text):
|
for phrase in rx.findall(text):
|
||||||
|
|||||||
+113
-42
@@ -91,6 +91,90 @@ def _load_entity_section(lang: str) -> dict:
|
|||||||
return data.get("entity", {}) or {}
|
return data.get("entity", {}) or {}
|
||||||
|
|
||||||
|
|
||||||
|
def _script_boundary(chars: str) -> str:
    """Return a regex fragment that acts like ``\\b`` for a custom word class.

    The stock ``\\b`` fires between ``\\w`` and non-``\\w``. ``\\w`` does not
    include combining marks (categories Mc/Mn), so in scripts whose words
    carry vowel signs — Devanagari, Arabic, Hebrew, Thai, Tamil, Burmese,
    Khmer — it reports a boundary in front of a trailing mark: ``अनीता`` is
    cut down to ``अनीत`` and ``\\bकहा\\b`` cannot match at all.

    ``chars`` is the body of a character class (no surrounding brackets)
    naming everything that counts as "inside a word", e.g.
    ``"\\\\w\\\\u0900-\\\\u097F"`` as a Hindi locale would declare under
    ``boundary_chars``. It is interpolated verbatim, so it must already be
    valid between ``[`` and ``]``.

    The result is a single non-capturing group of four zero-width
    alternatives, so it can be dropped in wherever ``\\b`` was written.
    """
    inside = f"[{chars}]"
    outside = f"[^{chars}]"
    alternatives = (
        f"(?<={inside})(?={outside})",  # word char behind, non-word char ahead
        f"(?<={outside})(?={inside})",  # non-word char behind, word char ahead
        f"^(?={inside})",  # nothing behind (``^``), word char ahead
        f"(?<={inside})$",  # word char behind, nothing ahead (``$``)
    )
    return "(?:" + "|".join(alternatives) + ")"
|
||||||
|
|
||||||
|
|
||||||
|
def _expand_b(pattern: str, boundary_chars: str) -> str:
    """Replace every ``\\b`` escape in ``pattern`` with a script-aware boundary.

    ``boundary_chars`` is the inside-word character class (without brackets).
    If it's falsy, the pattern is returned unchanged so ``\\b`` keeps its
    default Python ``re`` semantics.

    The pattern is scanned escape by escape rather than with a plain
    substring replace, so an escaped backslash followed by a literal ``b``
    (``\\\\b`` in the regex source) is left alone instead of being split in
    half. All other escapes (``\\s``, ``\\B``, ``\\w`` ...) pass through
    untouched.

    Limitation: a ``\\b`` written inside a character class (where it means
    backspace) is still rewritten, exactly as a plain replace would do.
    """
    if not boundary_chars:
        return pattern

    # Local import so this helper does not depend on the module's import list.
    import re

    boundary = _script_boundary(boundary_chars)

    def _swap(match):
        # Only the ``\b`` escape is substituted; every other backslash pair is
        # consumed as a unit and emitted verbatim. Using a callable also keeps
        # ``re.sub`` from interpreting backslashes in the replacement text.
        return boundary if match.group(1) == "b" else match.group(0)

    return re.sub(r"\\(.)", _swap, pattern, flags=re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
|
def _wrap_candidate(raw_pat: str, boundary_chars: str) -> str:
    """Surround a candidate/multi-word pattern with a capture group and
    the word boundaries that suit its locale.

    Without ``boundary_chars`` the result is the plain ``\\b(raw)\\b``.
    With it, both ``\\b`` are swapped for the script-aware boundary so that
    names ending in a combining mark are captured in full.
    """
    if not boundary_chars:
        return rf"\b({raw_pat})\b"

    edge = _script_boundary(boundary_chars)
    return "{0}({1}){0}".format(edge, raw_pat)
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_entity_section(section: dict, acc: dict) -> None:
    """Fold one language's entity section into the accumulator ``acc``.

    ``acc`` is mutated in place; nothing is returned. Boundary handling is
    done here, so everything appended is already in final form:

    - ``candidate_pattern`` / ``multi_word_pattern`` are appended to
      ``candidate_patterns`` / ``multi_word_patterns`` wrapped by
      ``_wrap_candidate`` (capture group plus the locale's boundary);
    - ``direct_address_pattern`` and each entry of the person-verb,
      pronoun, dialogue and project-verb lists go through ``_expand_b``
      with the section's ``boundary_chars``;
    - stopwords are lower-cased and added to the ``stopwords`` set.

    Missing or empty keys contribute nothing.
    """
    chars = section.get("boundary_chars")

    # Single-string extraction patterns -> wrapped and appended.
    for src_key, dst_key in (
        ("candidate_pattern", "candidate_patterns"),
        ("multi_word_pattern", "multi_word_patterns"),
    ):
        raw = section.get(src_key)
        if raw:
            acc[dst_key].append(_wrap_candidate(raw, chars))

    direct = section.get("direct_address_pattern")
    if direct:
        acc["direct_address"].append(_expand_b(direct, chars))

    # List-valued pattern groups -> each entry boundary-expanded.
    for src_key, dst_key in (
        ("person_verb_patterns", "person_verbs"),
        ("pronoun_patterns", "pronouns"),
        ("dialogue_patterns", "dialogue"),
        ("project_verb_patterns", "project_verbs"),
    ):
        acc[dst_key].extend(_expand_b(p, chars) for p in section.get(src_key, []))

    acc["stopwords"].update(word.lower() for word in section.get("stopwords", []))
|
||||||
|
|
||||||
|
|
||||||
def get_entity_patterns(languages=("en",)) -> dict:
|
def get_entity_patterns(languages=("en",)) -> dict:
|
||||||
"""Return merged entity detection patterns for the requested languages.
|
"""Return merged entity detection patterns for the requested languages.
|
||||||
|
|
||||||
@@ -105,11 +189,17 @@ def get_entity_patterns(languages=("en",)) -> dict:
|
|||||||
- ``stopwords`` is the set union across all languages, returned as a
|
- ``stopwords`` is the set union across all languages, returned as a
|
||||||
sorted list.
|
sorted list.
|
||||||
- ``candidate_patterns`` and ``multi_word_patterns`` are returned as
|
- ``candidate_patterns`` and ``multi_word_patterns`` are returned as
|
||||||
lists (one per language) since they use different character classes;
|
**fully-wrapped regex strings** (boundary + capture group applied);
|
||||||
callers run each pattern independently and union the matches.
|
the consumer compiles them directly with no further wrapping.
|
||||||
- ``direct_address_pattern`` is returned as a list of per-language
|
- ``direct_address_pattern`` is returned as a list of per-language
|
||||||
alternation patterns (not concatenated — each is applied separately).
|
alternation patterns (not concatenated — each is applied separately).
|
||||||
|
|
||||||
|
Locales with combining-mark scripts can declare ``boundary_chars`` in
|
||||||
|
their entity section (e.g. ``"\\\\w\\\\u0900-\\\\u097F"`` for Hindi);
|
||||||
|
every ``\\b`` inside that locale's patterns — plus the candidate/multi-
|
||||||
|
word wrapping — is expanded to a script-aware lookaround boundary that
|
||||||
|
treats the declared characters as "inside-word".
|
||||||
|
|
||||||
If ``languages`` is empty or no requested language declares entity data,
|
If ``languages`` is empty or no requested language declares entity data,
|
||||||
English is used as a fallback so callers always get a working config.
|
English is used as a fallback so callers always get a working config.
|
||||||
"""
|
"""
|
||||||
@@ -119,14 +209,16 @@ def get_entity_patterns(languages=("en",)) -> dict:
|
|||||||
if key in _entity_cache:
|
if key in _entity_cache:
|
||||||
return _entity_cache[key]
|
return _entity_cache[key]
|
||||||
|
|
||||||
candidate_patterns: list[str] = []
|
acc = {
|
||||||
multi_word_patterns: list[str] = []
|
"candidate_patterns": [],
|
||||||
person_verbs: list[str] = []
|
"multi_word_patterns": [],
|
||||||
pronouns: list[str] = []
|
"person_verbs": [],
|
||||||
dialogue: list[str] = []
|
"pronouns": [],
|
||||||
direct_address: list[str] = []
|
"dialogue": [],
|
||||||
project_verbs: list[str] = []
|
"direct_address": [],
|
||||||
stopwords: set = set()
|
"project_verbs": [],
|
||||||
|
"stopwords": set(),
|
||||||
|
}
|
||||||
|
|
||||||
found_any = False
|
found_any = False
|
||||||
for lang in languages:
|
for lang in languages:
|
||||||
@@ -134,42 +226,21 @@ def get_entity_patterns(languages=("en",)) -> dict:
|
|||||||
if not section:
|
if not section:
|
||||||
continue
|
continue
|
||||||
found_any = True
|
found_any = True
|
||||||
if section.get("candidate_pattern"):
|
_collect_entity_section(section, acc)
|
||||||
candidate_patterns.append(section["candidate_pattern"])
|
|
||||||
if section.get("multi_word_pattern"):
|
|
||||||
multi_word_patterns.append(section["multi_word_pattern"])
|
|
||||||
if section.get("direct_address_pattern"):
|
|
||||||
direct_address.append(section["direct_address_pattern"])
|
|
||||||
person_verbs.extend(section.get("person_verb_patterns", []))
|
|
||||||
pronouns.extend(section.get("pronoun_patterns", []))
|
|
||||||
dialogue.extend(section.get("dialogue_patterns", []))
|
|
||||||
project_verbs.extend(section.get("project_verb_patterns", []))
|
|
||||||
stopwords.update(w.lower() for w in section.get("stopwords", []))
|
|
||||||
|
|
||||||
if not found_any:
|
if not found_any:
|
||||||
# Fallback: load English directly
|
# Fallback: load English directly so callers always get a working config.
|
||||||
section = _load_entity_section("en")
|
_collect_entity_section(_load_entity_section("en"), acc)
|
||||||
if section.get("candidate_pattern"):
|
|
||||||
candidate_patterns.append(section["candidate_pattern"])
|
|
||||||
if section.get("multi_word_pattern"):
|
|
||||||
multi_word_patterns.append(section["multi_word_pattern"])
|
|
||||||
if section.get("direct_address_pattern"):
|
|
||||||
direct_address.append(section["direct_address_pattern"])
|
|
||||||
person_verbs.extend(section.get("person_verb_patterns", []))
|
|
||||||
pronouns.extend(section.get("pronoun_patterns", []))
|
|
||||||
dialogue.extend(section.get("dialogue_patterns", []))
|
|
||||||
project_verbs.extend(section.get("project_verb_patterns", []))
|
|
||||||
stopwords.update(w.lower() for w in section.get("stopwords", []))
|
|
||||||
|
|
||||||
merged = {
|
merged = {
|
||||||
"candidate_patterns": candidate_patterns,
|
"candidate_patterns": acc["candidate_patterns"],
|
||||||
"multi_word_patterns": multi_word_patterns,
|
"multi_word_patterns": acc["multi_word_patterns"],
|
||||||
"person_verb_patterns": _dedupe(person_verbs),
|
"person_verb_patterns": _dedupe(acc["person_verbs"]),
|
||||||
"pronoun_patterns": _dedupe(pronouns),
|
"pronoun_patterns": _dedupe(acc["pronouns"]),
|
||||||
"dialogue_patterns": _dedupe(dialogue),
|
"dialogue_patterns": _dedupe(acc["dialogue"]),
|
||||||
"direct_address_patterns": direct_address,
|
"direct_address_patterns": acc["direct_address"],
|
||||||
"project_verb_patterns": _dedupe(project_verbs),
|
"project_verb_patterns": _dedupe(acc["project_verbs"]),
|
||||||
"stopwords": sorted(stopwords),
|
"stopwords": sorted(acc["stopwords"]),
|
||||||
}
|
}
|
||||||
_entity_cache[key] = merged
|
_entity_cache[key] = merged
|
||||||
return merged
|
return merged
|
||||||
|
|||||||
@@ -589,3 +589,75 @@ def test_config_set_entity_languages_empty_falls_back_to_english(tmp_path, monke
|
|||||||
result = cfg.set_entity_languages([])
|
result = cfg.set_entity_languages([])
|
||||||
assert result == ["en"]
|
assert result == ["en"]
|
||||||
assert cfg.entity_languages == ["en"]
|
assert cfg.entity_languages == ["en"]
|
||||||
|
|
||||||
|
|
||||||
|
# ── boundary_chars for combining-mark scripts ─────────────────────────
|
||||||
|
|
||||||
|
# Devanagari vowel signs (matras) are Unicode combining marks (Mc/Mn) — not matched by \w.
|
||||||
|
# Without boundary_chars, \b truncates names like अनीता → अनीत and
|
||||||
|
# person_verb patterns never fire. With boundary_chars, the i18n loader
|
||||||
|
# replaces \b with a script-aware lookaround, fixing both.
|
||||||
|
|
||||||
|
# Entity section for a throwaway Hindi-like test locale. Patterns are written
# as raw strings; U+0900–U+097F is the Devanagari block.
_DEVANAGARI_ENTITY = {
    # Inside-word class: \w plus the whole Devanagari block.
    "boundary_chars": r"\w\u0900-\u097F",
    "candidate_pattern": r"[\u0900-\u097F]{2,20}",
    "multi_word_pattern": r"[\u0900-\u097F]+(?:\s+[\u0900-\u097F]+)+",
    "person_verb_patterns": [
        r"\b{name}\s+ने\s+कहा\b",
        r"\b{name}\s+हँसा\b",
    ],
    "pronoun_patterns": [r"\bवह\b", r"\bउसने\b"],
    "dialogue_patterns": [r"^{name}:\s"],
    "direct_address_pattern": r"\bनमस्ते\s+{name}\b",
    "project_verb_patterns": [],
    "stopwords": ["यह", "वह", "और", "का", "के", "की"],
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_devanagari_candidate_extraction_with_boundary_chars():
    """A matra-final name is captured whole when the locale sets boundary_chars."""
    name = "अनीता"
    text = "अनीता ने कहा। अनीता हँसा। अनीता सोचा। अनीता बोला।"
    with _temp_locale("zz-test-hindi", _DEVANAGARI_ENTITY):
        result = extract_candidates(text, languages=("en", "zz-test-hindi"))
        assert name in result, f"expected अनीता in {result}"
        # The name occurs four times in the text; at least three must be counted.
        assert result[name] >= 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_devanagari_candidate_without_boundary_chars_truncates():
    """With boundary_chars removed, the full matra-final name is not extracted."""
    locale_no_boundary = {
        key: value for key, value in _DEVANAGARI_ENTITY.items() if key != "boundary_chars"
    }
    text = "अनीता ने कहा। अनीता हँसा। अनीता सोचा।"
    with _temp_locale("zz-test-hindi-no-b", locale_no_boundary):
        result = extract_candidates(text, languages=("en", "zz-test-hindi-no-b"))
        # Plain \b sees a boundary before the final matra, so the complete
        # name never shows up as a candidate.
        assert "अनीता" not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_devanagari_person_verb_fires_with_boundary_chars():
    """Hindi person-verb patterns match once boundary_chars widens \\b."""
    text = "राज ने कहा कुछ। राज हँसा।"
    with _temp_locale("zz-test-hindi", _DEVANAGARI_ENTITY):
        scores = score_entity(
            "राज",
            text,
            text.splitlines(),
            languages=("en", "zz-test-hindi"),
        )
        assert scores["person_score"] > 0, f"expected person_score > 0, got {scores}"
        assert any("action" in signal for signal in scores["person_signals"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_devanagari_person_verb_silent_without_boundary_chars():
    """With boundary_chars removed, the Hindi person-verb patterns stay silent."""
    locale_no_boundary = {
        key: value for key, value in _DEVANAGARI_ENTITY.items() if key != "boundary_chars"
    }
    text = "राज ने कहा कुछ। राज हँसा।"
    with _temp_locale("zz-test-hindi-no-b", locale_no_boundary):
        scores = score_entity(
            "राज",
            text,
            text.splitlines(),
            languages=("en", "zz-test-hindi-no-b"),
        )
        assert scores["person_score"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_boundary_chars_english_regression():
    """English (which declares no boundary_chars) still extracts names as before."""
    name = "Riley"
    text = "Riley said hello. Riley laughed. Riley smiled. Riley waved."
    result = extract_candidates(text, languages=("en",))
    assert name in result
    assert result[name] >= 3
|
||||||
|
|||||||
Reference in New Issue
Block a user