Merge pull request #1235 from PyThaiNLP/copilot/fix-mypy-issues

bact · web-flow · commit 03b92ab064b3 · 2026-01-30T01:11:21.000Z
Refactor `# type: ignore` suppressions into proper type fixes
diff --git a/pythainlp/soundex/prayut_and_somchaip.py b/pythainlp/soundex/prayut_and_somchaip.py
@@ -87,7 +87,7 @@ def prayut_and_somchaip(text: str, length: int = 4) -> str:
         elif chars[i] in _C9 and i != 0:
             chars[i] = "9"
         else:
-            chars[i] = None
+            chars[i] = None  # type: ignore[call-overload]
         i += 1
     chars = list("".join([i for i in chars if i is not None]))
     return "".join(chars[-length:])
diff --git a/pythainlp/summarize/freq.py b/pythainlp/summarize/freq.py
@@ -29,7 +29,7 @@ def __rank(ranking, n: int):
     def __compute_frequencies(
         self, word_tokenized_sents: list[list[str]]
     ) -> defaultdict:
-        word_freqs = defaultdict(int)
+        word_freqs: defaultdict[str, float] = defaultdict(int)
         for sent in word_tokenized_sents:
             for word in sent:
                 if word not in self.__stopwords:
@@ -54,7 +54,7 @@ def summarize(
             word_tokenize(sent, engine=tokenizer) for sent in sents
         ]
         self.__freq = self.__compute_frequencies(word_tokenized_sents)
-        ranking = defaultdict(int)
+        ranking: defaultdict[int, float] = defaultdict(int)
 
         for i, sent in enumerate(word_tokenized_sents):
             for w in sent:
diff --git a/pythainlp/tokenize/han_solo.py b/pythainlp/tokenize/han_solo.py
@@ -10,7 +10,7 @@
 
 import threading
 from importlib.resources import as_file, files
-from typing import Optional, Union
+from typing import Optional, Union, cast
 
 try:
     import pycrfsuite
@@ -101,9 +101,9 @@ def featurize(
                 if indiv_char:
                     left_key = "|".join([str(relative_index_left), char_left])
                     if return_type == "dict":
-                        features[left_key] = 1
+                        cast(dict[str, int], features)[left_key] = 1
                     else:
-                        features.append(left_key)
+                        cast(list[str], features).append(left_key)
 
                 abs_index_right += (
                     1  # สมมุติคือตำแหน่งที่ 0 จะได้ 0, 1, 2, 3, 4 (radius = 5)
@@ -119,9 +119,9 @@ def featurize(
                         [str(relative_index_right), char_right]
                     )
                     if return_type == "dict":
-                        features[right_key] = 1
+                        cast(dict[str, int], features)[right_key] = 1
                     else:
-                        features.append(right_key)
+                        cast(list[str], features).append(right_key)
 
                 counter += 1
 
@@ -130,13 +130,14 @@ def featurize(
                 ngram = chars[i : i + self.N]
                 ngram_key = "|".join([str(i - self.radius), ngram])
                 if return_type == "dict":
-                    features[ngram_key] = 1
+                    cast(dict[str, int], features)[ngram_key] = 1
                 else:
-                    features.append(ngram_key)
+                    cast(list[str], features).append(ngram_key)
             all_features.append(features)
             if return_type == "list":
-                cut = str(cut)
-            all_labels.append(cut)
+                all_labels.append(str(cut))
+            else:
+                all_labels.append(cut)
 
         return {"X": all_features, "Y": all_labels}
 
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
@@ -109,8 +109,8 @@ def __longest_matching(self, text: str, begin_pos: int) -> str:
     def __segment(self, text: str):
         begin_pos = 0
         len_text = len(text)
-        tokens = []
-        token_statuses = []
+        tokens: list[str] = []
+        token_statuses: list[int] = []
         while begin_pos < len_text:
             match = self.__longest_matching(text, begin_pos)
             if not match:
@@ -139,7 +139,7 @@ def __segment(self, text: str):
                 begin_pos += len(match)
 
         # Group consecutive spaces into one token
-        grouped_tokens = []
+        grouped_tokens: list[str] = []
         for token in tokens:
             if (
                 token.isspace()
diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
@@ -55,7 +55,7 @@ def _multicut(
     if not custom_dict:
         custom_dict = word_dict_trie()
     len_text = len(text)
-    words_at = defaultdict(list)  # main data structure
+    words_at: defaultdict[int, list[str]] = defaultdict(list)  # main data structure
 
     def serialize(p, p2):  # helper function
         for w in words_at[p]:
diff --git a/pythainlp/transliterate/wunsen.py b/pythainlp/transliterate/wunsen.py
@@ -28,7 +28,7 @@ class WunsenTransliterate:
     """
 
     def __init__(self) -> None:
-        self.thap_value: Optional[object] = None
+        self.thap_value: Optional[ThapSap] = None
         self.lang: Optional[str] = None
         self.jp_input: Optional[str] = None
         self.zh_sandhi: Optional[bool] = None
diff --git a/pythainlp/util/date.py b/pythainlp/util/date.py
@@ -202,12 +202,12 @@ def convert_years(year: str, src="be", target="ad") -> str:
     return output_year
 
 
-def _find_month(text: str) -> Optional[int]:
+def _find_month(text: str) -> int:
     for i, m in enumerate(thai_full_month_lists):
         for j in m:
             if j in text:
                 return i + 1
-    return None
+    return 0  # Not found in list
 
 
 def thai_strptime(
@@ -254,9 +254,6 @@ def thai_strptime(
         #   tzinfo=zoneinfo.ZoneInfo(key='Asia/Bangkok')
         # )
     """
-    d = ""
-    m = ""
-    y = ""
     fmt = fmt.replace("%-m", "%m")
     fmt = fmt.replace("%-d", "%d")
     fmt = fmt.replace("%b", "%B")
@@ -290,7 +287,7 @@ def thai_strptime(
     second: Union[int, str] = 0
     f: Union[int, str] = 0
     d = data["d"]
-    m = _find_month(data["B"])
+    m: int = _find_month(data["B"])
     y = data["Y"]
     if "H" in keys:
         hour = data["H"]
@@ -314,7 +311,7 @@ def thai_strptime(
         y = convert_years(y, src="be", target="ad")
     return datetime(
         year=int(y),
-        month=int(m),
+        month=m,
         day=int(d),
         hour=int(hour),
         minute=int(minute),
diff --git a/pythainlp/util/morse.py b/pythainlp/util/morse.py
@@ -154,11 +154,11 @@ def morse_encode(text: str, lang: str = "th") -> str:
     """
     if lang == "th":  # Thai
         return " ".join(
-            map(lambda x, g=THAI_MORSE_CODE.get: g(x, " "), text.upper())
+            THAI_MORSE_CODE.get(char, " ") for char in text.upper()
         )
     elif lang == "en":  # English
         return " ".join(
-            map(lambda x, g=ENGLISH_MORSE_CODE.get: g(x, " "), text.upper())
+            ENGLISH_MORSE_CODE.get(char, " ") for char in text.upper()
         )
     else:
         raise NotImplementedError(f"This function doesn't support {lang}.")
@@ -187,12 +187,12 @@ def morse_decode(morse_text: str, lang: str = "th") -> str:
     """
     if lang == "th":
         ans = "".join(
-            map(lambda x, g=decodingthai.get: g(x, ""), morse_text.split(" "))
+            decodingthai.get(code, "") for code in morse_text.split(" ")
         )
         return "".join(ans.split())
     elif lang == "en":
         ans = "".join(
-            map(lambda x, g=decodingeng.get: g(x, " "), morse_text.split(" "))
+            decodingeng.get(code, " ") for code in morse_text.split(" ")
         )
         return " ".join(ans.split())
     else: