Skip to content

Commit bb14ad5

Browse files
authored
Merge pull request #1234 from PyThaiNLP/copilot/review-pr-changes
Add None validation for corpus paths to prevent confusing runtime errors
2 parents 7cc69e9 + 31ffa48 commit bb14ad5

File tree

5 files changed

+58
-3
lines changed

5 files changed

+58
-3
lines changed

pythainlp/generate/thai2fit.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@
4444
# get vocab
4545
thwiki = THWIKI_LSTM
4646

47+
# Validate that corpus files are available
48+
if thwiki["itos_fname"] is None or thwiki["wgts_fname"] is None:
49+
raise RuntimeError(
50+
"Thai2fit model files not found. "
51+
"Please download the corpus first:\n"
52+
" pythainlp.corpus.download('wiki_lm_lstm')\n"
53+
" pythainlp.corpus.download('wiki_itos_lstm')"
54+
)
55+
4756
# Security Note: This loads a pickle file from PyThaiNLP's trusted corpus.
4857
# The file is downloaded from PyThaiNLP's official repository with MD5 verification.
4958
# Users should only use corpus files from trusted sources.

pythainlp/tag/thainer.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,24 @@ def __init__(self, version: str = "1.4") -> None:
102102
self.crf = CRFTagger()
103103

104104
if version == "1.4":
105-
self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
105+
model_path = get_corpus_path("thainer-1.4", version="1.4")
106+
if model_path is None:
107+
raise RuntimeError(
108+
"ThaiNER 1.4 model not found. "
109+
"Please download the corpus first:\n"
110+
" pythainlp.corpus.download('thainer-1.4')"
111+
)
112+
self.crf.open(model_path)
106113
self.pos_tag_name = "orchid_ud"
107114
elif version == "1.5":
108-
self.crf.open(get_corpus_path("thainer", version="1.5"))
115+
model_path = get_corpus_path("thainer", version="1.5")
116+
if model_path is None:
117+
raise RuntimeError(
118+
"ThaiNER 1.5 model not found. "
119+
"Please download the corpus first:\n"
120+
" pythainlp.corpus.download('thainer')"
121+
)
122+
self.crf.open(model_path)
109123
self.pos_tag_name = "blackboard"
110124

111125
def get_ner(

pythainlp/ulmfit/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
"THWIKI_LSTM",
1818
"ThaiTokenizer",
1919
"document_vector",
20+
"get_thwiki_lstm",
2021
"merge_wgts",
2122
"post_rules_th",
2223
"post_rules_th_sparse",
@@ -41,6 +42,7 @@
4142
from pythainlp.ulmfit.core import (
4243
THWIKI_LSTM,
4344
document_vector,
45+
get_thwiki_lstm,
4446
merge_wgts,
4547
post_rules_th,
4648
post_rules_th_sparse,

pythainlp/ulmfit/core.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,38 @@
3939

4040

4141
# Pretrained model paths
42+
# Note: These may be None if corpus is not downloaded.
43+
# Access via get_thwiki_lstm() for proper validation or use directly
44+
# if you've already verified the corpus is downloaded.
4245
THWIKI_LSTM = {
4346
"wgts_fname": get_corpus_path(_MODEL_NAME_LSTM),
4447
"itos_fname": get_corpus_path(_ITOS_NAME_LSTM),
4548
}
4649

50+
51+
def get_thwiki_lstm() -> dict[str, str]:
    """
    Return the THWIKI LSTM model paths, validating that both files exist.

    Reads the module-level ``THWIKI_LSTM`` mapping and verifies that both
    the weights path (``wgts_fname``) and the itos path (``itos_fname``)
    were resolved (i.e. are not ``None``) before handing them back.

    :return: Dictionary with ``'wgts_fname'`` and ``'itos_fname'`` keys
        mapping to validated file path strings
    :raises RuntimeError: If either corpus file has not been downloaded
    """
    # Pull both entries up front so the validation below covers them together.
    paths = {
        "wgts_fname": THWIKI_LSTM["wgts_fname"],
        "itos_fname": THWIKI_LSTM["itos_fname"],
    }

    # get_corpus_path() yields None for a missing corpus; fail loudly with
    # download instructions instead of letting a None path crash downstream.
    if any(value is None for value in paths.values()):
        raise RuntimeError(
            "ULMFiT model files not found. "
            "Please download the corpus first:\n"
            " pythainlp.corpus.download('wiki_lm_lstm')\n"
            " pythainlp.corpus.download('wiki_itos_lstm')"
        )

    return paths
73+
4774
# Preprocessing rules for Thai text
4875
# dense features
4976
pre_rules_th = [

tests/extra/testx_ulmfit.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,10 @@ def test_document_vector(self):
223223
imdb = untar_data(URLs.IMDB_SAMPLE)
224224
dummy_df = pd.read_csv(imdb / "texts.csv")
225225
thwiki = THWIKI_LSTM
226-
thwiki_itos = pickle.load(open(thwiki["itos_fname"], "rb"))
226+
# Security note: pickle.load() executes arbitrary code if file is malicious.
227+
# These corpus files come from a trusted source with MD5 verification.
228+
with open(thwiki["itos_fname"], "rb") as f:
229+
thwiki_itos = pickle.load(f)
227230
thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)
228231
tt = Tokenizer(
229232
tok_func=ThaiTokenizer,

0 commit comments

Comments
 (0)