File tree Expand file tree Collapse file tree 5 files changed +58
-3
lines changed
Expand file tree Collapse file tree 5 files changed +58
-3
lines changed Original file line number Diff line number Diff line change 4444# get vocab
4545thwiki = THWIKI_LSTM
4646
47+ # Validate that corpus files are available
48+ if thwiki ["itos_fname" ] is None or thwiki ["wgts_fname" ] is None :
49+ raise RuntimeError (
50+ "Thai2fit model files not found. "
51+ "Please download the corpus first:\n "
52+ " pythainlp.corpus.download('wiki_lm_lstm')\n "
53+ " pythainlp.corpus.download('wiki_itos_lstm')"
54+ )
55+
4756# Security Note: This loads a pickle file from PyThaiNLP's trusted corpus.
4857# The file is downloaded from PyThaiNLP's official repository with MD5 verification.
4958# Users should only use corpus files from trusted sources.
Original file line number Diff line number Diff line change @@ -102,10 +102,24 @@ def __init__(self, version: str = "1.4") -> None:
102102 self .crf = CRFTagger ()
103103
104104 if version == "1.4" :
105- self .crf .open (get_corpus_path ("thainer-1.4" , version = "1.4" ))
105+ model_path = get_corpus_path ("thainer-1.4" , version = "1.4" )
106+ if model_path is None :
107+ raise RuntimeError (
108+ "ThaiNER 1.4 model not found. "
109+ "Please download the corpus first:\n "
110+ " pythainlp.corpus.download('thainer-1.4')"
111+ )
112+ self .crf .open (model_path )
106113 self .pos_tag_name = "orchid_ud"
107114 elif version == "1.5" :
108- self .crf .open (get_corpus_path ("thainer" , version = "1.5" ))
115+ model_path = get_corpus_path ("thainer" , version = "1.5" )
116+ if model_path is None :
117+ raise RuntimeError (
118+ "ThaiNER 1.5 model not found. "
119+ "Please download the corpus first:\n "
120+ " pythainlp.corpus.download('thainer')"
121+ )
122+ self .crf .open (model_path )
109123 self .pos_tag_name = "blackboard"
110124
111125 def get_ner (
Original file line number Diff line number Diff line change 1717 "THWIKI_LSTM" ,
1818 "ThaiTokenizer" ,
1919 "document_vector" ,
20+ "get_thwiki_lstm" ,
2021 "merge_wgts" ,
2122 "post_rules_th" ,
2223 "post_rules_th_sparse" ,
4142from pythainlp .ulmfit .core import (
4243 THWIKI_LSTM ,
4344 document_vector ,
45+ get_thwiki_lstm ,
4446 merge_wgts ,
4547 post_rules_th ,
4648 post_rules_th_sparse ,
Original file line number Diff line number Diff line change 3939
4040
4141# Pretrained model paths
42+ # Note: These paths are resolved once, when this module is imported, so a
43+ # value may be None if the corpus had not been downloaded at that time.
44+ # Use get_thwiki_lstm() for validated access, or read the dict directly only if you've already verified the corpus is downloaded.
4245THWIKI_LSTM = {
4346 "wgts_fname" : get_corpus_path (_MODEL_NAME_LSTM ),
4447 "itos_fname" : get_corpus_path (_ITOS_NAME_LSTM ),
4548}
4649
50+
def get_thwiki_lstm() -> dict[str, str]:
    """
    Get THWIKI LSTM model paths with validation.

    The paths stored in ``THWIKI_LSTM`` are resolved once, when this module
    is imported. If either stored path is ``None``, it is looked up again
    with ``get_corpus_path`` so that a corpus downloaded after import (for
    example, by following the instructions in the error message below) is
    picked up without re-importing the module. ``THWIKI_LSTM`` itself is
    not modified.

    :return: Dictionary with 'wgts_fname' and 'itos_fname' keys containing
        validated file paths as strings
    :rtype: dict[str, str]
    :raises RuntimeError: If corpus files are not found
    """
    wgts_fname = THWIKI_LSTM["wgts_fname"]
    if wgts_fname is None:
        # Import-time lookup found nothing; retry in case the corpus has
        # been downloaded since then.
        wgts_fname = get_corpus_path(_MODEL_NAME_LSTM)

    itos_fname = THWIKI_LSTM["itos_fname"]
    if itos_fname is None:
        itos_fname = get_corpus_path(_ITOS_NAME_LSTM)

    if wgts_fname is None or itos_fname is None:
        raise RuntimeError(
            "ULMFiT model files not found. "
            "Please download the corpus first:\n "
            " pythainlp.corpus.download('wiki_lm_lstm')\n "
            " pythainlp.corpus.download('wiki_itos_lstm')"
        )

    return {"wgts_fname": wgts_fname, "itos_fname": itos_fname}
73+
4774# Preprocessing rules for Thai text
4875# dense features
4976pre_rules_th = [
Original file line number Diff line number Diff line change @@ -223,7 +223,10 @@ def test_document_vector(self):
223223 imdb = untar_data (URLs .IMDB_SAMPLE )
224224 dummy_df = pd .read_csv (imdb / "texts.csv" )
225225 thwiki = THWIKI_LSTM
226- thwiki_itos = pickle .load (open (thwiki ["itos_fname" ], "rb" ))
226+ # Security note: pickle.load() can execute arbitrary code if the file is malicious.
227+ # These corpus files come from a trusted source with MD5 verification.
228+ with open (thwiki ["itos_fname" ], "rb" ) as f :
229+ thwiki_itos = pickle .load (f )
227230 thwiki_vocab = fastai .text .transform .Vocab (thwiki_itos )
228231 tt = Tokenizer (
229232 tok_func = ThaiTokenizer ,
You can’t perform that action at this time.
0 commit comments