Skip to content

Commit 60b9a66

Browse files
authored
Merge pull request nltk#3300 from antoniomika/am/punkt-lru-cache
Use an LRU cache when instantiating PunktTokenizer
2 parents 0e03877 + e34ee9f commit 60b9a66

File tree

1 file changed

+14
-1
lines changed

1 file changed

+14
-1
lines changed

nltk/tokenize/__init__.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
For further information, please see Chapter 3 of the NLTK book.
6060
"""
6161

62+
import functools
6263
import re
6364

6465
from nltk.data import load
@@ -92,6 +93,18 @@
9293
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
9394

9495

96+
@functools.lru_cache
97+
def _get_punkt_tokenizer(language="english"):
98+
"""
99+
A constructor for the PunktTokenizer that utilizes
100+
a lru cache for performance.
101+
102+
:param language: the model name in the Punkt corpus
103+
:type language: str
104+
"""
105+
return PunktTokenizer(language)
106+
107+
95108
# Standard sentence tokenizer.
96109
def sent_tokenize(text, language="english"):
97110
"""
@@ -103,7 +116,7 @@ def sent_tokenize(text, language="english"):
103116
:param text: text to split into sentences
104117
:param language: the model name in the Punkt corpus
105118
"""
106-
tokenizer = PunktTokenizer(language)
119+
tokenizer = _get_punkt_tokenizer(language)
107120
return tokenizer.tokenize(text)
108121

109122

0 commit comments

Comments
 (0)