perf: optimize regex compilation in entity extraction
Move regular expression compilation to the module level in `dialect.py` so the pattern is compiled once at import time instead of being re-processed on every iteration of the entity-extraction loop.

Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
This commit is contained in:
@@ -158,6 +158,8 @@ _FLAG_SIGNALS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Common filler/stop words to strip from topic extraction
|
# Common filler/stop words to strip from topic extraction
|
||||||
|
_ALPHA_RE = re.compile(r"[^a-zA-Z]")
|
||||||
|
|
||||||
_STOP_WORDS = {
|
_STOP_WORDS = {
|
||||||
"the",
|
"the",
|
||||||
"a",
|
"a",
|
||||||
@@ -541,7 +543,7 @@ class Dialect:
|
|||||||
# Fallback: find capitalized words that look like names (2+ chars, not sentence-start)
|
# Fallback: find capitalized words that look like names (2+ chars, not sentence-start)
|
||||||
words = text.split()
|
words = text.split()
|
||||||
for i, w in enumerate(words):
|
for i, w in enumerate(words):
|
||||||
clean = re.sub(r"[^a-zA-Z]", "", w)
|
clean = _ALPHA_RE.sub("", w)
|
||||||
if (
|
if (
|
||||||
len(clean) >= 2
|
len(clean) >= 2
|
||||||
and clean[0].isupper()
|
and clean[0].isupper()
|
||||||
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
import pytest
|
||||||
|
import timeit
|
||||||
|
import re
|
||||||
|
|
||||||
|
from mempalace.dialect import Dialect
|
||||||
|
|
||||||
|
def test_detect_entities_benchmark():
    """Time repeated calls to ``Dialect._detect_entities_in_text`` and print the total.

    This is a smoke benchmark: it makes no assertions about the returned
    value and fails only if the call under test raises.
    """
    sample = (
        "Alice went to the market and met Bob who is a nice guy. "
        "They both discussed about Dr. Chen and how he solved the big issue. "
        "Another sentence with Name and Name2 and SomeName"
    )
    iterations = 10000
    extractor = Dialect()

    def run_once():
        # The return value is discarded by timeit; only the elapsed time matters.
        return extractor._detect_entities_in_text(sample)

    # timeit returns the total wall time for all iterations, in seconds.
    elapsed = timeit.timeit(run_once, number=iterations)
    print(
        f"\nDialect._detect_entities_in_text benchmark: "
        f"{elapsed:.4f} seconds for {iterations} iterations"
    )
|
||||||
Reference in New Issue
Block a user