Skip to content

Commit bb14ad5

Browse files
authored
Merge pull request #1234 from PyThaiNLP/copilot/review-pr-changes
Add None validation for corpus paths to prevent confusing runtime errors
2 parents 7cc69e9 + 31ffa48 commit bb14ad5

File tree

5 files changed

+58
-3
lines changed

5 files changed

+58
-3
lines changed

pythainlp/generate/thai2fit.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@
4444
# get vocab
4545
thwiki = THWIKI_LSTM
4646

47+
# Validate that corpus files are available
48+
if thwiki["itos_fname"] is None or thwiki["wgts_fname"] is None:
49+
raise RuntimeError(
50+
"Thai2fit model files not found. "
51+
"Please download the corpus first:\n"
52+
" pythainlp.corpus.download('wiki_lm_lstm')\n"
53+
" pythainlp.corpus.download('wiki_itos_lstm')"
54+
)
55+
4756
# Security Note: This loads a pickle file from PyThaiNLP's trusted corpus.
4857
# The file is downloaded from PyThaiNLP's official repository with MD5 verification.
4958
# Users should only use corpus files from trusted sources.

pythainlp/tag/thainer.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,24 @@ def __init__(self, version: str = "1.4") -> None:
102102
self.crf = CRFTagger()
103103

104104
if version == "1.4":
105-
self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
105+
model_path = get_corpus_path("thainer-1.4", version="1.4")
106+
if model_path is None:
107+
raise RuntimeError(
108+
"ThaiNER 1.4 model not found. "
109+
"Please download the corpus first:\n"
110+
" pythainlp.corpus.download('thainer-1.4')"
111+
)
112+
self.crf.open(model_path)
106113
self.pos_tag_name = "orchid_ud"
107114
elif version == "1.5":
108-
self.crf.open(get_corpus_path("thainer", version="1.5"))
115+
model_path = get_corpus_path("thainer", version="1.5")
116+
if model_path is None:
117+
raise RuntimeError(
118+
"ThaiNER 1.5 model not found. "
119+
"Please download the corpus first:\n"
120+
" pythainlp.corpus.download('thainer')"
121+
)
122+
self.crf.open(model_path)
109123
self.pos_tag_name = "blackboard"
110124

111125
def get_ner(

pythainlp/ulmfit/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
"THWIKI_LSTM",
1818
"ThaiTokenizer",
1919
"document_vector",
20+
"get_thwiki_lstm",
2021
"merge_wgts",
2122
"post_rules_th",
2223
"post_rules_th_sparse",
@@ -41,6 +42,7 @@
4142
from pythainlp.ulmfit.core import (
4243
THWIKI_LSTM,
4344
document_vector,
45+
get_thwiki_lstm,
4446
merge_wgts,
4547
post_rules_th,
4648
post_rules_th_sparse,

pythainlp/ulmfit/core.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,38 @@
3939

4040

4141
# Pretrained model paths
42+
# Note: These may be None if corpus is not downloaded.
43+
# Access via get_thwiki_lstm() for proper validation or use directly
44+
# if you've already verified the corpus is downloaded.
4245
THWIKI_LSTM = {
4346
"wgts_fname": get_corpus_path(_MODEL_NAME_LSTM),
4447
"itos_fname": get_corpus_path(_ITOS_NAME_LSTM),
4548
}
4649

50+
51+
def get_thwiki_lstm() -> dict[str, str]:
    """
    Return the THWIKI LSTM model paths, validating that both files exist.

    Reads the module-level ``THWIKI_LSTM`` mapping and verifies that both
    the weights path (``wgts_fname``) and the itos path (``itos_fname``)
    were resolved (i.e. are not ``None``) before handing them back.

    :return: Dictionary with ``'wgts_fname'`` and ``'itos_fname'`` keys
        mapping to validated file path strings
    :raises RuntimeError: If either corpus file has not been downloaded
    """
    # Pull both entries up front so the validation below covers them together.
    paths = {
        "wgts_fname": THWIKI_LSTM["wgts_fname"],
        "itos_fname": THWIKI_LSTM["itos_fname"],
    }

    # get_corpus_path() yields None for a missing corpus; fail loudly with
    # download instructions instead of letting a None path crash downstream.
    if any(value is None for value in paths.values()):
        raise RuntimeError(
            "ULMFiT model files not found. "
            "Please download the corpus first:\n"
            " pythainlp.corpus.download('wiki_lm_lstm')\n"
            " pythainlp.corpus.download('wiki_itos_lstm')"
        )

    return paths
73+
4774
# Preprocessing rules for Thai text
4875
# dense features
4976
pre_rules_th = [

tests/extra/testx_ulmfit.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,10 @@ def test_document_vector(self):
223223
imdb = untar_data(URLs.IMDB_SAMPLE)
224224
dummy_df = pd.read_csv(imdb / "texts.csv")
225225
thwiki = THWIKI_LSTM
226-
thwiki_itos = pickle.load(open(thwiki["itos_fname"], "rb"))
226+
# Security note: pickle.load() executes arbitrary code if file is malicious.
227+
# These corpus files come from a trusted source with MD5 verification.
228+
with open(thwiki["itos_fname"], "rb") as f:
229+
thwiki_itos = pickle.load(f)
227230
thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
228231
tt = Tokenizer(
229232
tok_func=ThaiTokenizer,

0 commit comments

Comments
 (0)