Skip to content

Commit 30fefc9

Browse files
authored
Merge pull request nltk#3162 from rmalouf/ngrams
Use the efficient ngrams implementation from the Python docs
2 parents 5f69622 + 511005c commit 30fefc9

File tree

1 file changed

+9
-9
lines changed

1 file changed

+9
-9
lines changed

nltk/util.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
# Eric Kafe <[email protected]> (acyclic closures)
66
# URL: <https://www.nltk.org/>
77
# For license information, see LICENSE.TXT
8-
98
import inspect
109
import locale
1110
import os
@@ -894,14 +893,15 @@ def ngrams(sequence, n, **kwargs):
894893
"""
895894
sequence = pad_sequence(sequence, n, **kwargs)
896895

897-
# Creates the sliding window, of n no. of items.
898-
# `iterables` is a tuple of iterables where each iterable is a window of n items.
899-
iterables = tee(sequence, n)
900-
901-
for i, sub_iterable in enumerate(iterables): # For each window,
902-
for _ in range(i): # iterate through every order of ngrams
903-
next(sub_iterable, None) # generate the ngrams within the window.
904-
return zip(*iterables) # Unpack and flattens the iterables.
896+
# Adapted from the itertools "sliding_window" recipe: sliding_window('ABCDEFG', 4) --> ABCD BCDE CDEF DEFG
897+
# https://docs.python.org/3/library/itertools.html?highlight=sliding_window#itertools-recipes
898+
it = iter(sequence)
899+
window = deque(islice(it, n), maxlen=n)
900+
if len(window) == n:
901+
yield tuple(window)
902+
for x in it:
903+
window.append(x)
904+
yield tuple(window)
905905

906906

907907
def bigrams(sequence, **kwargs):

0 commit comments

Comments
 (0)