Merged
2 changes: 1 addition & 1 deletion pythainlp/soundex/prayut_and_somchaip.py
@@ -87,7 +87,7 @@ def prayut_and_somchaip(text: str, length: int = 4) -> str:
elif chars[i] in _C9 and i != 0:
chars[i] = "9"
else:
chars[i] = None
chars[i] = None # type: ignore[call-overload]
i += 1
chars = list("".join([i for i in chars if i is not None]))
return "".join(chars[-length:])
4 changes: 2 additions & 2 deletions pythainlp/summarize/freq.py
@@ -29,7 +29,7 @@ def __rank(ranking, n: int):
def __compute_frequencies(
self, word_tokenized_sents: list[list[str]]
) -> defaultdict:
word_freqs = defaultdict(int)
word_freqs: defaultdict[str, float] = defaultdict(int)
for sent in word_tokenized_sents:
for word in sent:
if word not in self.__stopwords:
@@ -54,7 +54,7 @@ def summarize(
word_tokenize(sent, engine=tokenizer) for sent in sents
]
self.__freq = self.__compute_frequencies(word_tokenized_sents)
ranking = defaultdict(int)
ranking: defaultdict[int, float] = defaultdict(int)

for i, sent in enumerate(word_tokenized_sents):
for w in sent:
19 changes: 10 additions & 9 deletions pythainlp/tokenize/han_solo.py
@@ -10,7 +10,7 @@

import threading
from importlib.resources import as_file, files
from typing import Optional, Union
from typing import Optional, Union, cast

try:
import pycrfsuite
@@ -101,9 +101,9 @@ def featurize(
if indiv_char:
left_key = "|".join([str(relative_index_left), char_left])
if return_type == "dict":
features[left_key] = 1
cast(dict[str, int], features)[left_key] = 1
else:
features.append(left_key)
cast(list[str], features).append(left_key)
Comment on lines 103 to +106
Copilot AI Jan 30, 2026

Using cast() to work around Union type narrowing defeats the purpose of static type checking and can lead to runtime errors. The cast() function tells the type checker to trust that the value is of a certain type without actually performing any runtime validation.

A more type-safe approach would be to properly narrow the Union type through control flow. Instead of using cast(), you could check the type after initialization:

if return_type == "dict":
    assert isinstance(features, dict)
    features[left_key] = 1
else:
    assert isinstance(features, list)
    features.append(left_key)

Or refactor the code to avoid the Union type altogether by using separate variables or a more structured approach. The repeated casting throughout the method (lines 104, 106, 122, 124, 133, 135) suggests the control flow could be simplified.
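
As a rough sketch of that refactor (collect_features is an illustrative name, not code from this PR): give each return type its own precisely typed container, so there is no Union-typed variable left to cast:

from typing import Union


def collect_features(keys: list[str], return_type: str) -> Union[dict[str, int], list[str]]:
    # Each branch owns a container with an exact type, so no cast() is needed.
    if return_type == "dict":
        dict_features: dict[str, int] = {}
        for key in keys:
            dict_features[key] = 1  # presence feature, as in the dict branch of featurize()
        return dict_features
    list_features: list[str] = []
    for key in keys:
        list_features.append(key)
    return list_features

With that shape the type checker narrows each branch on its own, and the repeated cast() calls disappear.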

abs_index_right += (
1 # suppose position 0: this gives 0, 1, 2, 3, 4 (radius = 5)
@@ -119,9 +119,9 @@
[str(relative_index_right), char_right]
)
if return_type == "dict":
features[right_key] = 1
cast(dict[str, int], features)[right_key] = 1
else:
features.append(right_key)
cast(list[str], features).append(right_key)
Comment on lines 121 to +124
Copilot AI Jan 30, 2026

Using cast() to work around Union type narrowing defeats the purpose of static type checking and can lead to runtime errors. The cast() function tells the type checker to trust that the value is of a certain type without actually performing any runtime validation.

A more type-safe approach would be to properly narrow the Union type through control flow. Instead of using cast(), you could check the type after initialization or refactor the code to avoid the Union type altogether by using separate variables or a more structured approach.
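
A minimal sketch of the "more structured approach" (build_features is a hypothetical helper, not part of the featurize() API): collect the keys uniformly and materialize the requested container exactly once:

def build_features(keys: list[str], return_type: str = "dict"):
    # Convert the uniformly gathered keys in a single place, at the end.
    if return_type == "dict":
        return {key: 1 for key in keys}  # presence features
    return list(keys)

# e.g. with the "relative_index|char" style keys built in featurize():
build_features(["-1|ก", "0|า"], return_type="list")  # -> ["-1|ก", "0|า"]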

counter += 1

@@ -130,13 +130,14 @@ def featurize(
ngram = chars[i : i + self.N]
ngram_key = "|".join([str(i - self.radius), ngram])
if return_type == "dict":
features[ngram_key] = 1
cast(dict[str, int], features)[ngram_key] = 1
else:
features.append(ngram_key)
cast(list[str], features).append(ngram_key)
Comment on lines 132 to +135
Copilot AI Jan 30, 2026

Using cast() to work around Union type narrowing defeats the purpose of static type checking and can lead to runtime errors. The cast() function tells the type checker to trust that the value is of a certain type without actually performing any runtime validation.

A more type-safe approach would be to properly narrow the Union type through control flow. Instead of using cast(), you could check the type after initialization or refactor the code to avoid the Union type altogether by using separate variables or a more structured approach.

all_features.append(features)
if return_type == "list":
cut = str(cut)
all_labels.append(cut)
all_labels.append(str(cut))
else:
all_labels.append(cut)

return {"X": all_features, "Y": all_labels}

6 changes: 3 additions & 3 deletions pythainlp/tokenize/longest.py
@@ -109,8 +109,8 @@ def __longest_matching(self, text: str, begin_pos: int) -> str:
def __segment(self, text: str):
begin_pos = 0
len_text = len(text)
tokens = []
token_statuses = []
tokens: list[str] = []
token_statuses: list[int] = []
while begin_pos < len_text:
match = self.__longest_matching(text, begin_pos)
if not match:
@@ -139,7 +139,7 @@ def __segment(self, text: str):
begin_pos += len(match)

# Group consecutive spaces into one token
grouped_tokens = []
grouped_tokens: list[str] = []
for token in tokens:
if (
token.isspace()
2 changes: 1 addition & 1 deletion pythainlp/tokenize/multi_cut.py
@@ -55,7 +55,7 @@ def _multicut(
if not custom_dict:
custom_dict = word_dict_trie()
len_text = len(text)
words_at = defaultdict(list) # main data structure
words_at: defaultdict[int, list[str]] = defaultdict(list) # main data structure

def serialize(p, p2): # helper function
for w in words_at[p]:
2 changes: 1 addition & 1 deletion pythainlp/transliterate/wunsen.py
@@ -28,7 +28,7 @@ class WunsenTransliterate:
"""

def __init__(self) -> None:
self.thap_value: Optional[object] = None
self.thap_value: Optional[ThapSap] = None
self.lang: Optional[str] = None
self.jp_input: Optional[str] = None
self.zh_sandhi: Optional[bool] = None
11 changes: 4 additions & 7 deletions pythainlp/util/date.py
@@ -202,12 +202,12 @@ def convert_years(year: str, src="be", target="ad") -> str:
return output_year


def _find_month(text: str) -> Optional[int]:
def _find_month(text: str) -> int:
for i, m in enumerate(thai_full_month_lists):
for j in m:
if j in text:
return i + 1
return None
return 0 # Not found in list
Copilot AI Jan 30, 2026

Changing the return type from Optional[int] to int and returning 0 for "not found" is problematic. The value 0 is a sentinel that could be confused with a valid month index (though months are 1-indexed, which makes this slightly safer). However, this breaks the API contract: the function previously could distinguish between "month found" and "month not found" via None, and that distinction is now lost.

This change could cause silent failures: if a month is not found in the input text, the function will now return 0, which is then passed to the datetime() constructor at line 314. The datetime constructor expects month values 1-12, so passing 0 will raise a ValueError at runtime rather than handling the case gracefully.

A better approach would be to either:

  1. Keep the Optional[int] return type and handle None appropriately in the caller
  2. Raise an exception when month is not found rather than returning a sentinel value
Suggested change
return 0 # Not found in list
raise ValueError(f"Month not found in text: {text!r}")
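
For option 1, a minimal sketch (it reuses Optional, thai_full_month_lists, and the caller's data dict that already exist in date.py; the error message is illustrative):

def _find_month(text: str) -> Optional[int]:
    for i, m in enumerate(thai_full_month_lists):
        for j in m:
            if j in text:
                return i + 1
    return None  # let the caller decide how to handle an unknown month

# In thai_strptime(), fail loudly before the value reaches datetime():
month = _find_month(data["B"])
if month is None:
    raise ValueError(f"Month not found in text: {data['B']!r}")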


def thai_strptime(
@@ -254,9 +254,6 @@ def thai_strptime(
# tzinfo=zoneinfo.ZoneInfo(key='Asia/Bangkok')
# )
"""
d = ""
m = ""
y = ""
fmt = fmt.replace("%-m", "%m")
fmt = fmt.replace("%-d", "%d")
fmt = fmt.replace("%b", "%B")
Expand Down Expand Up @@ -290,7 +287,7 @@ def thai_strptime(
second: Union[int, str] = 0
f: Union[int, str] = 0
d = data["d"]
m = _find_month(data["B"])
m: int = _find_month(data["B"])
y = data["Y"]
if "H" in keys:
hour = data["H"]
@@ -314,7 +311,7 @@
y = convert_years(y, src="be", target="ad")
return datetime(
year=int(y),
month=int(m),
month=m,
day=int(d),
hour=int(hour),
minute=int(minute),
8 changes: 4 additions & 4 deletions pythainlp/util/morse.py
@@ -154,11 +154,11 @@ def morse_encode(text: str, lang: str = "th") -> str:
"""
if lang == "th": # Thai
return " ".join(
map(lambda x, g=THAI_MORSE_CODE.get: g(x, " "), text.upper())
THAI_MORSE_CODE.get(char, " ") for char in text.upper()
)
elif lang == "en": # English
return " ".join(
map(lambda x, g=ENGLISH_MORSE_CODE.get: g(x, " "), text.upper())
ENGLISH_MORSE_CODE.get(char, " ") for char in text.upper()
)
else:
raise NotImplementedError(f"This function doesn't support {lang}.")
@@ -187,12 +187,12 @@ def morse_decode(morse_text: str, lang: str = "th") -> str:
"""
if lang == "th":
ans = "".join(
map(lambda x, g=decodingthai.get: g(x, ""), morse_text.split(" "))
decodingthai.get(code, "") for code in morse_text.split(" ")
)
return "".join(ans.split())
elif lang == "en":
ans = "".join(
map(lambda x, g=decodingeng.get: g(x, " "), morse_text.split(" "))
decodingeng.get(code, " ") for code in morse_text.split(" ")
)
return " ".join(ans.split())
else: