Merged
9 changes: 9 additions & 0 deletions pythainlp/generate/thai2fit.py
@@ -44,6 +44,15 @@
 # get vocab
 thwiki = THWIKI_LSTM
 
+# Validate that corpus files are available
+if thwiki["itos_fname"] is None or thwiki["wgts_fname"] is None:
+    raise RuntimeError(
+        "Thai2fit model files not found. "
+        "Please download the corpus first:\n"
+        "    pythainlp.corpus.download('wiki_lm_lstm')\n"
+        "    pythainlp.corpus.download('wiki_itos_lstm')"
+    )
+
 # Security Note: This loads a pickle file from PyThaiNLP's trusted corpus.
 # The file is downloaded from PyThaiNLP's official repository with MD5 verification.
 # Users should only use corpus files from trusted sources.
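The hunk above makes a missing corpus fail fast with download instructions, instead of surfacing as an obscure error when the pickle is opened. A minimal recovery sketch for callers, assuming the check runs on module import as the hunk suggests (pythainlp.corpus.download is PyThaiNLP's standard downloader):

from pythainlp.corpus import download

try:
    from pythainlp.generate import thai2fit
except RuntimeError:
    # Fetch the corpus files named in the error, then retry the import.
    download("wiki_lm_lstm")
    download("wiki_itos_lstm")
    from pythainlp.generate import thai2fit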
18 changes: 16 additions & 2 deletions pythainlp/tag/thainer.py
@@ -102,10 +102,24 @@ def __init__(self, version: str = "1.4") -> None:
         self.crf = CRFTagger()
 
         if version == "1.4":
-            self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
+            model_path = get_corpus_path("thainer-1.4", version="1.4")
+            if model_path is None:
+                raise RuntimeError(
+                    "ThaiNER 1.4 model not found. "
+                    "Please download the corpus first:\n"
+                    "    pythainlp.corpus.download('thainer-1.4')"
+                )
+            self.crf.open(model_path)
             self.pos_tag_name = "orchid_ud"
         elif version == "1.5":
-            self.crf.open(get_corpus_path("thainer", version="1.5"))
+            model_path = get_corpus_path("thainer", version="1.5")
+            if model_path is None:
+                raise RuntimeError(
+                    "ThaiNER 1.5 model not found. "
+                    "Please download the corpus first:\n"
+                    "    pythainlp.corpus.download('thainer')"
+                )
+            self.crf.open(model_path)
             self.pos_tag_name = "blackboard"
 
     def get_ner(
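With this change, a missing model now raises in the constructor with a clear message instead of passing None into CRFTagger.open(). A minimal usage sketch, assuming the enclosing class is ThaiNameTagger from pythainlp.tag.thainer and the get_ner() method declared just below this hunk:

from pythainlp.corpus import download
from pythainlp.tag.thainer import ThaiNameTagger

download("thainer-1.4")              # fetch the model up front
ner = ThaiNameTagger(version="1.4")  # raises RuntimeError if the model is missing
print(ner.get_ner("ทดสอบระบบ"))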
2 changes: 2 additions & 0 deletions pythainlp/ulmfit/__init__.py
@@ -17,6 +17,7 @@
"THWIKI_LSTM",
"ThaiTokenizer",
"document_vector",
"get_thwiki_lstm",
"merge_wgts",
"post_rules_th",
"post_rules_th_sparse",
@@ -41,6 +42,7 @@
 from pythainlp.ulmfit.core import (
     THWIKI_LSTM,
     document_vector,
+    get_thwiki_lstm,
     merge_wgts,
     post_rules_th,
     post_rules_th_sparse,
27 changes: 27 additions & 0 deletions pythainlp/ulmfit/core.py
@@ -39,11 +39,38 @@


 # Pretrained model paths
+# Note: These may be None if corpus is not downloaded.
+# Access via get_thwiki_lstm() for proper validation or use directly
+# if you've already verified the corpus is downloaded.
 THWIKI_LSTM = {
     "wgts_fname": get_corpus_path(_MODEL_NAME_LSTM),
     "itos_fname": get_corpus_path(_ITOS_NAME_LSTM),
 }
 
 
+def get_thwiki_lstm() -> dict[str, str]:
+    """
+    Get THWIKI LSTM model paths with validation.
+
+    Returns dictionary with 'wgts_fname' and 'itos_fname' keys containing
+    validated file paths as strings.
+
+    :return: Dictionary with model file paths
+    :raises RuntimeError: If corpus files are not found
+    """
+    wgts_fname = THWIKI_LSTM["wgts_fname"]
+    itos_fname = THWIKI_LSTM["itos_fname"]
+
+    if wgts_fname is None or itos_fname is None:
+        raise RuntimeError(
+            "ULMFiT model files not found. "
+            "Please download the corpus first:\n"
+            "    pythainlp.corpus.download('wiki_lm_lstm')\n"
+            "    pythainlp.corpus.download('wiki_itos_lstm')"
+        )
+
+    return {"wgts_fname": wgts_fname, "itos_fname": itos_fname}
+
+
 # Preprocessing rules for Thai text
 # dense features
 pre_rules_th = [
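The new helper gives callers a validated alternative to reading THWIKI_LSTM directly, whose values can silently be None. A minimal sketch of the two access patterns the new comment describes:

from pythainlp.ulmfit import THWIKI_LSTM, get_thwiki_lstm

# Validated access: raises RuntimeError with download instructions if files are missing.
paths = get_thwiki_lstm()
wgts_fname, itos_fname = paths["wgts_fname"], paths["itos_fname"]

# Direct access: the caller owns the None check.
if THWIKI_LSTM["wgts_fname"] is None:
    print("Run pythainlp.corpus.download('wiki_lm_lstm') first")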
5 changes: 4 additions & 1 deletion tests/extra/testx_ulmfit.py
@@ -223,7 +223,10 @@ def test_document_vector(self):
         imdb = untar_data(URLs.IMDB_SAMPLE)
         dummy_df = pd.read_csv(imdb / "texts.csv")
         thwiki = THWIKI_LSTM
-        thwiki_itos = pickle.load(open(thwiki["itos_fname"], "rb"))
+        # Security note: pickle.load() executes arbitrary code if file is malicious.
+        # These corpus files come from a trusted source with MD5 verification.
+        with open(thwiki["itos_fname"], "rb") as f:
+            thwiki_itos = pickle.load(f)
         thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
         tt = Tokenizer(
             tok_func=ThaiTokenizer,
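The with-statement also closes the file handle that the old one-liner leaked. The security caveat in the comment still applies, since pickle.load() can execute arbitrary code. An illustrative hardening sketch, not part of this PR: the restricted-unpickler pattern from the standard pickle documentation, allowing only the builtins that an itos list of strings needs (the file path below is hypothetical):

import pickle

class RestrictedUnpickler(pickle.Unpickler):
    """Refuse to resolve any global outside an explicit allow-list."""
    ALLOWED = {("builtins", "list"), ("builtins", "str")}

    def find_class(self, module, name):
        if (module, name) in self.ALLOWED:
            return super().find_class(module, name)
        raise pickle.UnpicklingError(f"forbidden global: {module}.{name}")

with open("thwiki_itos.pkl", "rb") as f:  # hypothetical path
    thwiki_itos = RestrictedUnpickler(f).load()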