|
 #
 # Copyright (C) 2001-2024 NLTK Project
 # Author: Edward Loper <[email protected]>
+# Author: ekaf (Restricting and switching pickles)
 # URL: <https://www.nltk.org/>
 # For license information, see LICENSE.TXT

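The first hunk below lands in restricted_pickle_load, which deserializes through a RestrictedUnpickler. That is the allow-list pattern from the Python pickle documentation: subclass pickle.Unpickler and override find_class so that only vetted globals resolve during unpickling. A minimal sketch of the pattern (the allow-list here is illustrative, not NLTK's actual one):

    import builtins
    import pickle
    from io import BytesIO

    SAFE = {"list", "dict", "set", "tuple"}  # illustrative allow-list

    class RestrictedUnpickler(pickle.Unpickler):
        def find_class(self, module, name):
            # Resolve only explicitly allowed builtins; refusing everything
            # else blocks the usual os.system-style __reduce__ payloads.
            if module == "builtins" and name in SAFE:
                return getattr(builtins, name)
            raise pickle.UnpicklingError(f"global '{module}.{name}' is forbidden")

    def restricted_pickle_load(string):
        return RestrictedUnpickler(BytesIO(string)).load()
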
@@ -667,6 +668,55 @@ def restricted_pickle_load(string):
     return RestrictedUnpickler(BytesIO(string)).load()


+def switch_punkt(lang="english"):
+    """
+    Return a pickle-free Punkt tokenizer instead of loading a pickle.
+
+    >>> import nltk
+    >>> tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
+    >>> print(tokenizer.tokenize("Hello! How are you?"))
+    ['Hello!', 'How are you?']
+    """
+    from nltk.tokenize import PunktTokenizer as tok
+
+    return tok(lang)
+
+
+def switch_chunker(fmt="multiclass"):
+    """
+    Return a pickle-free Named Entity Chunker instead of loading a pickle.
+    """
+    from nltk.chunk import ne_chunker
+
+    return ne_chunker(fmt)
+
+
+def switch_t_tagger():
+    """
+    Return a pickle-free Treebank POS tagger instead of loading a pickle.
+    """
+    from nltk.classify.maxent import maxent_pos_tagger
+
+    return maxent_pos_tagger()
+
+
+def switch_p_tagger(lang):
+    """
+    Return a pickle-free Averaged Perceptron Tagger instead of loading a pickle.
+    """
+    from nltk.tag import _get_tagger
+
+    if lang == "ru":
+        lang = "rus"
+    else:
+        lang = None
+    return _get_tagger(lang)
+
+
 def load(
     resource_url,
     format="auto",
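
The new switch_* constructors above can also be called directly. A minimal sketch, assuming NLTK 3.9+ with the corresponding *_tab data packages installed (e.g. punkt_tab via nltk.download); the "eng" argument is an arbitrary placeholder:

    from nltk.data import switch_punkt, switch_p_tagger

    # Pickle-free Punkt sentence tokenizer, built from the punkt_tab data.
    tokenizer = switch_punkt("english")
    print(tokenizer.tokenize("Hello! How are you?"))
    # ['Hello!', 'How are you?']

    # Averaged Perceptron tagger: "ru" selects the Russian model ("rus");
    # any other value falls through to the default English model.
    tagger = switch_p_tagger("eng")
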
@@ -750,6 +800,20 @@ def load(
                 print(f"<<Using cached copy of {resource_url}>>")
             return resource_val

+    resource_url = normalize_resource_url(resource_url)
+    protocol, path_ = split_resource_url(resource_url)
+
+    if path_[-7:] == ".pickle":
+        fil = os.path.split(path_[:-7])[-1]
+        if path_.startswith("tokenizers/punkt"):
+            return switch_punkt(fil)
+        elif path_.startswith("chunkers/maxent_ne_chunker"):
+            return switch_chunker(fil.split("_")[-1])
+        elif path_.startswith("taggers/maxent_treebank_pos_tagger"):
+            return switch_t_tagger()
+        elif path_.startswith("taggers/averaged_perceptron_tagger"):
+            return switch_p_tagger(fil.split("_")[-1])
+
     # Let the user know what's going on.
     if verbose:
         print(f"<<Loading {resource_url}>>")
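
With the interception above, legacy pickle resource URLs keep working, but load() now returns a pickle-free object before any unpickling path is reached. A short sketch (assumes the punkt_tab data package is installed):

    import nltk

    tok = nltk.data.load("tokenizers/punkt/english.pickle")
    print(type(tok).__name__)   # PunktTokenizer, not an unpickled object
    print(tok.tokenize("Hello! How are you?"))
    # ['Hello!', 'How are you?']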