
Commit 7cc69e9

Merge pull request #1232 from PyThaiNLP/copilot/fix-inconsistent-type-hints
Fix type hint inconsistencies and improve type precision
2 parents 7f4a62f + ba221c3 commit 7cc69e9

42 files changed (+259 −212 lines)

pyproject.toml

Lines changed: 4 additions & 1 deletion
@@ -61,7 +61,10 @@ classifiers = [
 ]
 
 # Core dependencies
-dependencies = ["tzdata; sys_platform == 'win32'"]
+dependencies = [
+    "importlib_resources; python_version < '3.11'",
+    "tzdata; sys_platform == 'win32'",
+]
 
 [project.optional-dependencies]
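The new importlib_resources entry is a conditional dependency: its environment marker restricts it to interpreters older than Python 3.11. Below is a minimal sketch of the import pattern such a backport typically supports; the branch point simply mirrors the marker above, and the resource lookup in the comment is illustrative, not taken from this commit.

import sys

if sys.version_info >= (3, 11):
    from importlib import resources  # standard library is sufficient here
else:
    import importlib_resources as resources  # backport declared in pyproject.toml

# e.g. resources.files("pythainlp") would then yield a Traversable for packaged data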

pythainlp/augment/lm/wangchanberta.py

Lines changed: 6 additions & 7 deletions
@@ -32,7 +32,7 @@ def __init__(self):
         self.MASK_TOKEN = self.tokenizer.mask_token
 
     def generate(self, sentence: str, num_replace_tokens: int = 3):
-        self.sent2 = []
+        sent2: list[str] = []
         self.input_text = sentence
         sent = [
             i for i in self.tokenizer.tokenize(self.input_text) if i != "▁"
@@ -42,13 +42,13 @@ def generate(self, sentence: str, num_replace_tokens: int = 3):
         masked_text = self.input_text
         for i in range(num_replace_tokens):
            masked_text = masked_text + self.MASK_TOKEN
-            self.sent2 += [
+            sent2 += [
                str(j["sequence"]).replace("<s> ", "").replace("</s>", "")
                for j in self.fill_mask(masked_text)
-                if j["sequence"] not in self.sent2
+                if j["sequence"] not in sent2
            ]
            masked_text = self.input_text
-        return self.sent2
+        return sent2
 
     def augment(self, sentence: str, num_replace_tokens: int = 3) -> list[str]:
         """Text augmentation from WangchanBERTa
@@ -73,6 +73,5 @@ def augment(self, sentence: str, num_replace_tokens: int = 3) -> list[str]:
         'ช้างมีทั้งหมด 50 ตัว บนนั้น',
         'ช้างมีทั้งหมด 50 ตัว บนหัว']
         """
-        self.sent2 = []
-        self.sent2 = self.generate(sentence, num_replace_tokens)
-        return self.sent2
+        sent2 = self.generate(sentence, num_replace_tokens)
+        return sent2
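This hunk replaces the mutable instance attribute self.sent2 with a per-call local, which is what makes the precise list[str] annotation possible. A self-contained sketch of why that matters follows; the class names are illustrative stand-ins, not the PyThaiNLP augmenter.

class SharedState:
    def __init__(self) -> None:
        self.results = []                 # survives across calls

    def generate(self, text: str) -> list:
        self.results += [text.upper()]
        return self.results


class LocalState:
    def generate(self, text: str) -> list[str]:
        results: list[str] = []           # fresh, precisely typed list every call
        results += [text.upper()]
        return results


shared, local = SharedState(), LocalState()
assert shared.generate("a") == ["A"]
assert shared.generate("b") == ["A", "B"]   # earlier output leaks into later calls
assert local.generate("a") == ["A"]
assert local.generate("b") == ["B"]         # each call is independent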

pythainlp/augment/word2vec/bpemb_wv.py

Lines changed: 2 additions & 2 deletions
@@ -35,15 +35,15 @@ def load_w2v(self):
 
     def augment(
         self, sentence: str, n_sent: int = 1, p: float = 0.7
-    ) -> list[tuple[str]]:
+    ) -> list[str]:
         """Text Augment using word2vec from BPEmb
 
         :param str sentence: Thai sentence
         :param int n_sent: number of sentence
         :param float p: probability of word
 
         :return: list of synonyms
-        :rtype: List[str]
+        :rtype: list[str]
         :Example:
         ::
 
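The return annotation changes from list[tuple[str]] to list[str], matching what the method actually produces. A toy illustration of the mismatch a checker such as mypy would report (these functions are invented examples, not the real BPEmbAug method):

def augment_before() -> list[tuple[str]]:
    return ["ผม", "ฉัน"]   # mypy: list[str] is incompatible with list[tuple[str]]

def augment_after() -> list[str]:
    return ["ผม", "ฉัน"]   # annotation now matches the runtime value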

pythainlp/augment/word2vec/core.py

Lines changed: 11 additions & 10 deletions
@@ -4,14 +4,15 @@
 from __future__ import annotations
 
 import itertools
+from typing import Callable
 
 
 class Word2VecAug:
     def __init__(
-        self, model: str, tokenize: object, type: str = "file"
+        self, model: str, tokenize: Callable[[str], list[str]], type: str = "file"
     ) -> None:
         """:param str model: path of model
-        :param object tokenize: tokenize function
+        :param Callable[[str], list[str]] tokenize: tokenize function
         :param str type: model type (file, binary)
         """
         import gensim.models.keyedvectors as word2vec
@@ -27,10 +28,10 @@ def __init__(
         self.model = model
         self.dict_wv = list(self.model.key_to_index.keys())
 
-    def modify_sent(self, sent: str, p: float = 0.7) -> list[list[str]]:
-        """:param str sent: text of sentence
+    def modify_sent(self, sent: list[str], p: float = 0.7) -> list[list[str]]:
+        """:param list[str] sent: list of tokens
         :param float p: probability
-        :rtype: List[List[str]]
+        :rtype: list[list[str]]
         """
         list_sent_new = []
         for i in sent:
@@ -46,17 +47,17 @@ def modify_sent(self, sent: str, p: float = 0.7) -> list[list[str]]:
 
     def augment(
         self, sentence: str, n_sent: int = 1, p: float = 0.7
-    ) -> list[tuple[str]]:
+    ) -> list[tuple[str, ...]]:
         """:param str sentence: text of sentence
         :param int n_sent: maximum number of synonymous sentences
         :param int p: probability
 
         :return: list of synonyms
-        :rtype: List[Tuple[str]]
+        :rtype: list[tuple[str, ...]]
         """
-        self.sentence = self.tokenizer(sentence)
-        self.list_synonym = self.modify_sent(self.sentence, p=p)
+        _sentence = self.tokenizer(sentence)
+        _list_synonym = self.modify_sent(_sentence, p=p)
         new_sentences = []
-        for x in list(itertools.product(*self.list_synonym))[0:n_sent]:
+        for x in list(itertools.product(*_list_synonym))[0:n_sent]:
            new_sentences.append(x)
        return new_sentences
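Two of the sharpened hints here recur throughout the commit: Callable[[str], list[str]] instead of object for tokenizer arguments, and tuple[str, ...] for variable-length tuples (tuple[str] means exactly one element). A short illustration with a stand-in tokenizer, not the gensim-backed class above:

from typing import Callable

def whitespace_tokenize(text: str) -> list[str]:
    return text.split()

tokenize: Callable[[str], list[str]] = whitespace_tokenize

one: tuple[str] = ("ผม",)                      # fixed length: exactly one string
many: tuple[str, ...] = ("ผม", "กิน", "ข้าว")   # any number of strings

print(tokenize("ผม กิน ข้าว"), one, many)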

pythainlp/benchmarks/word_tokenization.py

Lines changed: 17 additions & 17 deletions
@@ -148,35 +148,35 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
     :return: metrics at character- and word-level and indicators of correctly tokenized words
     :rtype: dict[str, Union[float, str]]
     """
-    ref_sample = _binary_representation(ref_sample)
-    sample = _binary_representation(raw_sample)
+    ref_sample_arr = _binary_representation(ref_sample)
+    sample_arr = _binary_representation(raw_sample)
 
     # Compute character-level statistics
-    c_pos_pred, c_neg_pred = np.argwhere(sample == 1), np.argwhere(sample == 0)
+    c_pos_pred, c_neg_pred = np.argwhere(sample_arr == 1), np.argwhere(sample_arr == 0)
 
-    c_pos_pred = c_pos_pred[c_pos_pred < ref_sample.shape[0]]
-    c_neg_pred = c_neg_pred[c_neg_pred < ref_sample.shape[0]]
+    c_pos_pred = c_pos_pred[c_pos_pred < ref_sample_arr.shape[0]]
+    c_neg_pred = c_neg_pred[c_neg_pred < ref_sample_arr.shape[0]]
 
-    c_tp = np.sum(ref_sample[c_pos_pred] == 1)
-    c_fp = np.sum(ref_sample[c_pos_pred] == 0)
+    c_tp = np.sum(ref_sample_arr[c_pos_pred] == 1)
+    c_fp = np.sum(ref_sample_arr[c_pos_pred] == 0)
 
-    c_tn = np.sum(ref_sample[c_neg_pred] == 0)
-    c_fn = np.sum(ref_sample[c_neg_pred] == 1)
+    c_tn = np.sum(ref_sample_arr[c_neg_pred] == 0)
+    c_fn = np.sum(ref_sample_arr[c_neg_pred] == 1)
 
     # Compute word-level statistics
 
     # Find correctly tokenized words in the reference sample
-    word_boundaries = _find_word_boundaries(ref_sample)
+    word_boundaries = _find_word_boundaries(ref_sample_arr)
 
     # Find correctly tokenized words in the sample
-    ss_boundaries = _find_word_boundaries(sample)
+    ss_boundaries = _find_word_boundaries(sample_arr)
     tokenization_indicators = _find_words_correctly_tokenised(
         word_boundaries, ss_boundaries
     )
 
     correctly_tokenised_words = np.sum(tokenization_indicators)
 
-    tokenization_indicators = list(map(str, tokenization_indicators))
+    tokenization_indicators_str = list(map(str, tokenization_indicators))
 
     return {
         "char_level": {
@@ -187,11 +187,11 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
         },
         "word_level": {
             "correctly_tokenised_words": correctly_tokenised_words,
-            "total_words_in_sample": np.sum(sample),
-            "total_words_in_ref_sample": np.sum(ref_sample),
+            "total_words_in_sample": np.sum(sample_arr),
+            "total_words_in_ref_sample": np.sum(ref_sample_arr),
         },
         "global": {
-            "tokenisation_indicators": "".join(tokenization_indicators)
+            "tokenisation_indicators": "".join(tokenization_indicators_str)
         },
     }
 
@@ -246,14 +246,14 @@ def _find_word_boundaries(bin_reps) -> list:
 def _find_words_correctly_tokenised(
     ref_boundaries: list[tuple[int, int]],
     predicted_boundaries: list[tuple[int, int]],
-) -> tuple[int]:
+) -> tuple[int, ...]:
     """Find whether each word is correctly tokenized.
 
     :param list[tuple(int, int)] ref_boundaries: word boundaries of reference tokenization
     :param list[tuple(int, int)] predicted_boundaries: word boundareies of predicted tokenization
 
     :return: binary sequence where 1 indicates the corresponding word is tokenized correctly
-    :rtype: tuple[int]
+    :rtype: tuple[int, ...]
     """
     ref_b = dict(zip(ref_boundaries, [1] * len(ref_boundaries)))
 
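Here the fix is essentially a rename: the str parameters ref_sample and raw_sample were being re-bound to NumPy arrays, so one variable carried two unrelated types. A stripped-down sketch of the pattern; the _to_binary stand-in below is simplified and is not the benchmark's real _binary_representation logic.

import numpy as np

def _to_binary(sample: str) -> np.ndarray:
    # 1 marks a word-boundary character, 0 anything else (simplified stand-in)
    return np.array([1 if ch == "|" else 0 for ch in sample])

def count_boundaries(ref_sample: str) -> int:
    ref_sample_arr = _to_binary(ref_sample)   # new name: the str and the ndarray stay distinct
    return int(np.sum(ref_sample_arr))

print(count_boundaries("กา|กิน|ข้าว"))  # 2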

pythainlp/corpus/common.py

Lines changed: 2 additions & 2 deletions
@@ -63,8 +63,8 @@
 _THAI_ORST_WORDS: frozenset[str] = frozenset()
 
 _THAI_DICT: dict[str, list[str]] = {}
-_THAI_WSD_DICT: dict[str, list[str]] = {}
-_THAI_SYNONYMS: dict[str, list[str]] = {}
+_THAI_WSD_DICT: dict[str, Union[list[str], list[list[str]]]] = {}
+_THAI_SYNONYMS: dict[str, Union[list[str], list[list[str]]]] = {}
 
 
 def countries() -> frozenset[str]:
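The widened Union value type allows a key to map either to a flat list of strings or to a list of string lists. A hedged illustration of a value that satisfies the new annotation (the entries are invented, not actual corpus data):

from typing import Union

_EXAMPLE_WSD_DICT: dict[str, Union[list[str], list[list[str]]]] = {
    "word": ["เงิน", "ทอง"],                          # flat list of strings
    "meaning": [["สื่อกลางแลกเปลี่ยน"], ["โลหะมีค่า"]],   # list of lists of strings
}
print(_EXAMPLE_WSD_DICT)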

pythainlp/corpus/core.py

Lines changed: 22 additions & 7 deletions
@@ -40,7 +40,7 @@ def json(self) -> dict:
         try:
             return json.loads(self._content.decode("utf-8"))
         except (json.JSONDecodeError, UnicodeDecodeError) as err:
-            raise ValueError(f"Failed to parse JSON response: {err}")
+            raise ValueError(f"Failed to parse JSON response: {err}") from err
 
 
 def get_corpus_db(url: str) -> Optional[_ResponseWrapper]:
@@ -298,9 +298,17 @@ def get_corpus_path(name: str, version: str = "", force: bool = False) -> Option
     if corpus_db_detail and corpus_db_detail.get("filename"):
         # corpus is in the local catalog, get full path to the file
         if corpus_db_detail.get("is_folder"):
-            path = get_full_data_path(corpus_db_detail.get("foldername"))
+            foldername = corpus_db_detail.get("foldername")
+            if foldername:
+                path = get_full_data_path(foldername)
+            else:
+                return None
         else:
-            path = get_full_data_path(corpus_db_detail.get("filename"))
+            filename = corpus_db_detail.get("filename")
+            if filename:
+                path = get_full_data_path(filename)
+            else:
+                return None
         # check if the corpus file actually exists, download it if not
         if not os.path.exists(path):
             download(name, version=version, force=force)
@@ -736,10 +744,14 @@ def remove(name: str) -> bool:
         if data[0].get("is_folder"):
             import shutil
 
-            os.remove(get_full_data_path(data[0].get("filename")))
-            shutil.rmtree(path, ignore_errors=True)
+            filename = data[0].get("filename")
+            if filename:
+                os.remove(get_full_data_path(filename))
+            if path:
+                shutil.rmtree(path, ignore_errors=True)
         else:
-            os.remove(path)
+            if path:
+                os.remove(path)
         for i, corpus in db["_default"].copy().items():
             if corpus["name"] == name:
                 del db["_default"][i]
@@ -751,7 +763,10 @@ def remove(name: str) -> bool:
 
 
 def get_path_folder_corpus(name: str, version: str, *path: str) -> str:
-    return os.path.join(get_corpus_path(name, version), *path)
+    corpus_path = get_corpus_path(name, version)
+    if corpus_path is None:
+        raise ValueError(f"Corpus path not found for {name} version {version}")
+    return os.path.join(corpus_path, *path)
 
 
 def make_safe_directory_name(name: str) -> str:
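Several of these hunks apply the same Optional-narrowing pattern: check a value that may be None before handing it to an API that requires str. A minimal, self-contained sketch follows; get_corpus_path here is a stub, not the real downloader-backed function.

import os
from typing import Optional

def get_corpus_path(name: str) -> Optional[str]:
    return None if name == "missing" else os.path.join("/data", name)

def get_path_folder_corpus(name: str, *path: str) -> str:
    corpus_path = get_corpus_path(name)
    if corpus_path is None:                     # narrows Optional[str] to str
        raise ValueError(f"Corpus path not found for {name}")
    return os.path.join(corpus_path, *path)

print(get_path_folder_corpus("words_th", "words_th.txt"))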

pythainlp/corpus/util.py

Lines changed: 5 additions & 4 deletions
@@ -43,12 +43,13 @@ def find_badwords(
     :return: words that are considered to make `tokenize` perform badly
     :rtype: Set[str]
     """
-    right = Counter()
-    wrong = Counter()
+    right: Counter[str] = Counter()
+    wrong: Counter[str] = Counter()
 
     for train_words in training_data:
-        train_set = set(index_pairs(train_words))
-        test_words = tokenize("".join(train_words))
+        train_words_list = list(train_words)
+        train_set = set(index_pairs(train_words_list))
+        test_words = tokenize("".join(train_words_list))
         test_pairs = index_pairs(test_words)
         for w, p in zip(test_words, test_pairs):
             if p in train_set:
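Two changes meet here: Counter gets an explicit element type, and the incoming iterable is materialized into a list so it can be traversed more than once. A hedged sketch with toy data (find_badwords itself does more work than this):

from collections import Counter

right: Counter[str] = Counter()                # element type is now explicit

train_words = (w for w in ["กิน", "ข้าว"])      # generators are single-pass
train_words_list = list(train_words)           # materialize before reusing

right.update(train_words_list)                 # first use
joined = "".join(train_words_list)             # second use; a second pass over the raw generator would see nothing
print(right, joined)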

pythainlp/lm/text_util.py

Lines changed: 3 additions & 3 deletions
@@ -7,7 +7,7 @@
 
 def calculate_ngram_counts(
     list_words: list[str], n_min: int = 2, n_max: int = 4
-) -> dict[tuple[str], int]:
+) -> dict[tuple[str, ...], int]:
     """Calculates the counts of n-grams in the list words for the specified range.
 
     :param List[str] list_words: List of string
@@ -20,7 +20,7 @@ def calculate_ngram_counts(
     if not list_words:
         return {}
 
-    ngram_counts = {}
+    ngram_counts: dict[tuple[str, ...], int] = {}
 
     for n in range(n_min, n_max + 1):
         for i in range(len(list_words) - n + 1):
@@ -51,7 +51,7 @@ def remove_repeated_ngrams(string_list: list[str], n: int = 2) -> list[str]:
 
     unique_ngrams = set()
 
-    output_list = []
+    output_list: list[str] = []
 
     for i in range(len(string_list)):
         if i + n <= len(string_list):
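dict[tuple[str], int] would only admit one-word keys, while dict[tuple[str, ...], int] admits keys of any n-gram length. A toy reconstruction of that counting shape, simplified relative to the real calculate_ngram_counts:

ngram_counts: dict[tuple[str, ...], int] = {}
tokens = ["ผม", "กิน", "ข้าว"]

for n in (2, 3):                                # bigrams and trigrams
    for i in range(len(tokens) - n + 1):
        key = tuple(tokens[i:i + n])            # key length varies with n
        ngram_counts[key] = ngram_counts.get(key, 0) + 1

print(ngram_counts)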

pythainlp/parse/core.py

Lines changed: 9 additions & 9 deletions
@@ -3,9 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-from typing import List, Optional, Union
+from typing import Any, List, Optional, Union
 
-_tagger = None
+_tagger: Optional[Any] = None
 _tagger_name = ""
 
 
@@ -100,22 +100,22 @@ def dependency_parsing(
     if engine == "esupar":
         from pythainlp.parse.esupar_engine import Parse
 
-        _tagger = Parse(model=model)
+        _tagger = Parse(model=model if model else "th")
     elif engine == "transformers_ud":
-        from pythainlp.parse.transformers_ud import Parse
+        from pythainlp.parse.transformers_ud import Parse  # type: ignore[assignment] # noqa: I001
 
-        _tagger = Parse(model=model)
+        _tagger = Parse(model=model if model else "KoichiYasuoka/deberta-base-thai-ud-head")
     elif engine == "spacy_thai":
-        from pythainlp.parse.spacy_thai_engine import Parse
+        from pythainlp.parse.spacy_thai_engine import Parse  # type: ignore[assignment] # noqa: I001
 
         _tagger = Parse()
     elif engine == "ud_goeswith":
-        from pythainlp.parse.ud_goeswith import Parse
+        from pythainlp.parse.ud_goeswith import Parse  # type: ignore[assignment] # noqa: I001
 
-        _tagger = Parse(model=model)
+        _tagger = Parse(model=model if model else "KoichiYasuoka/deberta-base-thai-ud-goeswith")
     else:
         raise NotImplementedError("The engine doesn't support.")
 
     _tagger_name = engine
 
-    return _tagger(text, tag=tag)
+    return _tagger(text, tag=tag)  # type: ignore[misc]
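The parser cache is now typed Optional[Any], and each engine branch substitutes its default model when model is empty instead of passing None through. A compact sketch of that fallback pattern with a stub Parse class; the real engines load esupar or transformers models and are not reproduced here.

from typing import Any, Optional

_tagger: Optional[Any] = None      # may hold any engine's Parse instance, or nothing yet

class Parse:                       # stub standing in for an engine wrapper
    def __init__(self, model: str = "th") -> None:
        self.model = model
    def __call__(self, text: str, tag: str = "str") -> str:
        return f"[{self.model}] {text}"

def dependency_parsing(text: str, model: Optional[str] = None) -> str:
    global _tagger
    _tagger = Parse(model=model if model else "th")   # explicit default, never None
    return _tagger(text)

print(dependency_parsing("ผมกินข้าว"))
print(dependency_parsing("ผมกินข้าว", model="my-model"))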
