Skip to content

Commit 511005c

Browse files
committed
Use efficient ngrams implementation from python docs
1 parent 582e6e3 commit 511005c

File tree

1 file changed

+9
-9
lines changed

1 file changed

+9
-9
lines changed

nltk/util.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
# Eric Kafe <[email protected]> (acyclic closures)
66
# URL: <https://www.nltk.org/>
77
# For license information, see LICENSE.TXT
8-
98
import inspect
109
import locale
1110
import os
@@ -858,14 +857,15 @@ def ngrams(sequence, n, **kwargs):
858857
"""
859858
sequence = pad_sequence(sequence, n, **kwargs)
860859

861-
# Creates the sliding window, of n no. of items.
862-
# `iterables` is a tuple of iterables where each iterable is a window of n items.
863-
iterables = tee(sequence, n)
864-
865-
for i, sub_iterable in enumerate(iterables): # For each window,
866-
for _ in range(i): # iterate through every order of ngrams
867-
next(sub_iterable, None) # generate the ngrams within the window.
868-
return zip(*iterables) # Unpack and flattens the iterables.
860+
# sliding_window('ABCDEFG', 4) --> ABCD BCDE CDEF DEFG
861+
# https://docs.python.org/3/library/itertools.html#itertools-recipes
862+
it = iter(sequence)
863+
window = deque(islice(it, n), maxlen=n)
864+
if len(window) == n:
865+
yield tuple(window)
866+
for x in it:
867+
window.append(x)
868+
yield tuple(window)
869869

870870

871871
def bigrams(sequence, **kwargs):

0 commit comments

Comments
 (0)