diff --git a/mempalace/dialect.py b/mempalace/dialect.py
index 5a68583..3c51c52 100644
--- a/mempalace/dialect.py
+++ b/mempalace/dialect.py
@@ -158,6 +158,9 @@ _FLAG_SIGNALS = {
 }
 
+# Precompiled pattern matching every character that is not an ASCII letter
+_ALPHA_RE = re.compile(r"[^a-zA-Z]")
+
 # Common filler/stop words to strip from topic extraction
 _STOP_WORDS = {
     "the",
     "a",
@@ -541,7 +544,7 @@ class Dialect:
         # Fallback: find capitalized words that look like names (2+ chars, not sentence-start)
         words = text.split()
         for i, w in enumerate(words):
-            clean = re.sub(r"[^a-zA-Z]", "", w)
+            clean = _ALPHA_RE.sub("", w)
             if (
                 len(clean) >= 2
                 and clean[0].isupper()
diff --git a/tests/benchmarks/benchmark_dialect.py b/tests/benchmarks/benchmark_dialect.py
new file mode 100644
index 0000000..06beca2
--- /dev/null
+++ b/tests/benchmarks/benchmark_dialect.py
@@ -0,0 +1,16 @@
+"""Micro-benchmark for ``Dialect._detect_entities_in_text``."""
+
+import timeit
+
+from mempalace.dialect import Dialect
+
+
+def test_detect_entities_benchmark():
+    """Time repeated entity detection on a short sample text and print the total."""
+    dialect = Dialect()
+    text = "Alice went to the market and met Bob who is a nice guy. They both discussed about Dr. Chen and how he solved the big issue. Another sentence with Name and Name2 and SomeName"
+
+    # Run the function multiple times to measure the performance
+    number = 10000
+    elapsed = timeit.timeit(lambda: dialect._detect_entities_in_text(text), number=number)
+    print(f"\nDialect._detect_entities_in_text benchmark: {elapsed:.4f} seconds for {number} iterations")