Merge pull request #880 from MemPalace/perf-optimize-regex-compilation-15578943484596502942

 Optimize regex compilation in entity extraction
This commit is contained in:
Igor Lins e Silva
2026-04-14 15:10:34 -03:00
committed by GitHub
2 changed files with 17 additions and 1 deletions
+3 -1
View File
@@ -158,6 +158,8 @@ _FLAG_SIGNALS = {
}
# Matches every character that is not an ASCII letter; used as
# _ALPHA_RE.sub("", word) to strip digits and punctuation from a token.
# Compiled once at import time so per-word loops do not go through re.sub's
# pattern lookup on every call.
_ALPHA_RE = re.compile(r"[^a-zA-Z]")

# Common filler/stop words to strip from topic extraction
_STOP_WORDS = {
"the",
"a",
@@ -541,7 +543,7 @@ class Dialect:
# Fallback: find capitalized words that look like names (2+ chars, not sentence-start)
words = text.split()
for i, w in enumerate(words):
clean = re.sub(r"[^a-zA-Z]", "", w)
clean = _ALPHA_RE.sub("", w)
if (
len(clean) >= 2
and clean[0].isupper()
+14
View File
@@ -0,0 +1,14 @@
import pytest
import timeit
import re
from mempalace.dialect import Dialect
def test_detect_entities_benchmark():
    """Time repeated calls to ``Dialect._detect_entities_in_text`` and print the total.

    This is a timing report only: nothing is asserted about the result, so the
    test passes as long as the call does not raise.
    """
    detector = Dialect()
    sample = (
        "Alice went to the market and met Bob who is a nice guy. "
        "They both discussed about Dr. Chen and how he solved the big issue. "
        "Another sentence with Name and Name2 and SomeName"
    )
    # Repeat the call many times so the measured total is large enough to read.
    iterations = 10000

    def run_once():
        return detector._detect_entities_in_text(sample)

    elapsed = timeit.Timer(run_once).timeit(number=iterations)
    print(f"\nDialect._detect_entities_in_text benchmark: {elapsed:.4f} seconds for {iterations} iterations")