
Commit 0e03877

Merge pull request nltk#3302 from ekaf/switch_pickle
Don't break old pickle requests
2 parents 37f0c01 + 2bd4e86 commit 0e03877

File tree: 1 file changed (+64, −0)

nltk/data.py

Lines changed: 64 additions & 0 deletions
@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2001-2024 NLTK Project
 # Author: Edward Loper <[email protected]>
+# Author: ekaf (Restricting and switching pickles)
 # URL: <https://www.nltk.org/>
 # For license information, see LICENSE.TXT
 

@@ -667,6 +668,55 @@ def restricted_pickle_load(string):
     return RestrictedUnpickler(BytesIO(string)).load()
 
 
+def switch_punkt(lang="english"):
+    """
+    Return a pickle-free Punkt tokenizer instead of loading a pickle.
+
+    >>> import nltk
+    >>> tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
+    >>> print(tokenizer.tokenize("Hello! How are you?"))
+    ['Hello!', 'How are you?']
+    """
+    from nltk.tokenize import PunktTokenizer as tok
+
+    return tok(lang)
+
+
+def switch_chunker(fmt="multiclass"):
+    """
+    Return a pickle-free Named Entity Chunker instead of loading a pickle.
+
+    """
+    from nltk.chunk import ne_chunker
+
+    return ne_chunker(fmt)
+
+
+def switch_t_tagger():
+    """
+    Return a pickle-free Treebank Pos Tagger instead of loading a pickle.
+
+    """
+    from nltk.classify.maxent import maxent_pos_tagger
+
+    return maxent_pos_tagger()
+
+
+def switch_p_tagger(lang):
+    """
+    Return a pickle-free Averaged Perceptron Tagger instead of loading a pickle.
+
+    """
+    from nltk.tag import _get_tagger
+
+    if lang == "ru":
+        lang = "rus"
+    else:
+        lang = None
+    return _get_tagger(lang)
+
+
 def load(
     resource_url,
     format="auto",
@@ -750,6 +800,20 @@ def load(
                 print(f"<<Using cached copy of {resource_url}>>")
             return resource_val
 
+    resource_url = normalize_resource_url(resource_url)
+    protocol, path_ = split_resource_url(resource_url)
+
+    if path_[-7:] == ".pickle":
+        fil = os.path.split(path_[:-7])[-1]
+        if path_.startswith("tokenizers/punkt"):
+            return switch_punkt(fil)
+        elif path_.startswith("chunkers/maxent_ne_chunker"):
+            return switch_chunker(fil.split("_")[-1])
+        elif path_.startswith("taggers/maxent_treebank_pos_tagger"):
+            return switch_t_tagger()
+        elif path_.startswith("taggers/averaged_perceptron_tagger"):
+            return switch_p_tagger(fil.split("_")[-1])
+
     # Let the user know what's going on.
     if verbose:
         print(f"<<Loading {resource_url}>>")
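In effect, nltk.data.load() keeps accepting the old .pickle resource URLs for these four resources but now hands back pickle-free objects built by the switch_* helpers above. A minimal usage sketch, assuming the replacement data packages (e.g. punkt_tab and the averaged-perceptron model) have already been downloaded; the tagger path below is only illustrative and does not appear in this commit:

    import nltk

    # Old Punkt pickle URL: load() dispatches to switch_punkt() and returns a
    # pickle-free PunktTokenizer (this mirrors the doctest in switch_punkt).
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    print(tokenizer.tokenize("Hello! How are you?"))
    # ['Hello!', 'How are you?']

    # Old averaged-perceptron pickle URL (assumed path, shown for illustration):
    # load() dispatches to switch_p_tagger(), which rebuilds the tagger from the
    # non-pickle model data via nltk.tag._get_tagger().
    tagger = nltk.data.load(
        "taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle"
    )
    print(type(tagger).__name__)  # expected: a PerceptronTagger instance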
