
Commit 0e03877

Merge pull request nltk#3302 from ekaf/switch_pickle
Don't break old pickle requests
2 parents 37f0c01 + 2bd4e86 commit 0e03877

File tree: 1 file changed (+64, −0)

nltk/data.py

Lines changed: 64 additions & 0 deletions
@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2001-2024 NLTK Project
 # Author: Edward Loper <[email protected]>
+# Author: ekaf (Restricting and switching pickles)
 # URL: <https://www.nltk.org/>
 # For license information, see LICENSE.TXT
 

@@ -667,6 +668,55 @@ def restricted_pickle_load(string):
     return RestrictedUnpickler(BytesIO(string)).load()
 
 
+def switch_punkt(lang="english"):
+    """
+    Return a pickle-free Punkt tokenizer instead of loading a pickle.
+
+    >>> import nltk
+    >>> tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
+    >>> print(tokenizer.tokenize("Hello! How are you?"))
+    ['Hello!', 'How are you?']
+    """
+    from nltk.tokenize import PunktTokenizer as tok
+
+    return tok(lang)
+
+
+def switch_chunker(fmt="multiclass"):
+    """
+    Return a pickle-free Named Entity Chunker instead of loading a pickle.
+
+    """
+    from nltk.chunk import ne_chunker
+
+    return ne_chunker(fmt)
+
+
+def switch_t_tagger():
+    """
+    Return a pickle-free Treebank Pos Tagger instead of loading a pickle.
+
+    """
+    from nltk.classify.maxent import maxent_pos_tagger
+
+    return maxent_pos_tagger()
+
+
+def switch_p_tagger(lang):
+    """
+    Return a pickle-free Averaged Perceptron Tagger instead of loading a pickle.
+
+    """
+    from nltk.tag import _get_tagger
+
+    if lang == "ru":
+        lang = "rus"
+    else:
+        lang = None
+    return _get_tagger(lang)
+
+
 def load(
     resource_url,
     format="auto",
@@ -750,6 +800,20 @@ def load(
                 print(f"<<Using cached copy of {resource_url}>>")
             return resource_val
 
+    resource_url = normalize_resource_url(resource_url)
+    protocol, path_ = split_resource_url(resource_url)
+
+    if path_[-7:] == ".pickle":
+        fil = os.path.split(path_[:-7])[-1]
+        if path_.startswith("tokenizers/punkt"):
+            return switch_punkt(fil)
+        elif path_.startswith("chunkers/maxent_ne_chunker"):
+            return switch_chunker(fil.split("_")[-1])
+        elif path_.startswith("taggers/maxent_treebank_pos_tagger"):
+            return switch_t_tagger()
+        elif path_.startswith("taggers/averaged_perceptron_tagger"):
+            return switch_p_tagger(fil.split("_")[-1])
+
     # Let the user know what's going on.
     if verbose:
         print(f"<<Loading {resource_url}>>")
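In effect, nltk.data.load() keeps accepting the old .pickle resource URLs for these four resources but now hands back pickle-free objects built by the switch_* helpers above. A minimal usage sketch, assuming the replacement data packages (e.g. punkt_tab and the averaged-perceptron model) have already been downloaded; the tagger path below is only illustrative and does not appear in this commit:

    import nltk

    # Old Punkt pickle URL: load() dispatches to switch_punkt() and returns a
    # pickle-free PunktTokenizer (this mirrors the doctest in switch_punkt).
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    print(tokenizer.tokenize("Hello! How are you?"))
    # ['Hello!', 'How are you?']

    # Old averaged-perceptron pickle URL (assumed path, shown for illustration):
    # load() dispatches to switch_p_tagger(), which rebuilds the tagger from the
    # non-pickle model data via nltk.tag._get_tagger().
    tagger = nltk.data.load(
        "taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle"
    )
    print(type(tagger).__name__)  # expected: a PerceptronTagger instance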
