Skip to content

Commit 64c0afc

Browse files
committed
Don't break old pickle requests
1 parent 27e49f7 commit 64c0afc

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

nltk/data.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,20 @@ def restricted_pickle_load(string):
667667
return RestrictedUnpickler(BytesIO(string)).load()
668668

669669

670+
def switch_punkt(lang="english"):
    """
    Build a Punkt sentence tokenizer for ``lang`` without loading a pickle.

    ``load`` calls this helper when it is asked for a resource under
    ``tokenizers/punkt`` whose name ends in ``.pickle``, so requests
    written against the old pickle files keep working:

    >>> import nltk
    >>> tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    >>> print(tokenizer.tokenize("Hello! How are you?"))
    ['Hello!', 'How are you?']

    :param lang: language name handed to ``PunktTokenizer``
        (defaults to ``"english"``).
    :return: the ``PunktTokenizer`` instance constructed for ``lang``.
    """
    # Function-scope import; presumably this avoids a circular import
    # between this module and nltk.tokenize -- TODO confirm.
    from nltk.tokenize import PunktTokenizer

    punkt_tokenizer = PunktTokenizer(lang)
    return punkt_tokenizer
682+
683+
670684
def load(
671685
resource_url,
672686
format="auto",
@@ -750,6 +764,14 @@ def load(
750764
print(f"<<Using cached copy of {resource_url}>>")
751765
return resource_val
752766

767+
resource_url = normalize_resource_url(resource_url)
768+
protocol, path_ = split_resource_url(resource_url)
769+
770+
if path_[-7:] == ".pickle":
771+
fil = os.path.split(path_[:-7])[-1]
772+
if path_.startswith("tokenizers/punkt"):
773+
return switch_punkt(fil)
774+
753775
# Let the user know what's going on.
754776
if verbose:
755777
print(f"<<Loading {resource_url}>>")

0 commit comments

Comments
 (0)