Merged
32 commits
178aa7e  Initial plan (Copilot, Jan 29, 2026)
d2baeca  Fix type hints in util module (Copilot, Jan 29, 2026)
2bda73f  Fix type hints in tokenize module (Copilot, Jan 29, 2026)
6e8b335  Fix type hints in transliterate module (Copilot, Jan 29, 2026)
2e7fecf  Fix type hints in ulmfit, lm, and tools modules (Copilot, Jan 29, 2026)
cb8f3aa  Fix type hints in word_tokenization.py (Copilot, Jan 29, 2026)
2578f99  Fix type hints in augment/word2vec/core.py (Copilot, Jan 29, 2026)
b0aec58  Fix return type in augment/word2vec/bpemb_wv.py (Copilot, Jan 29, 2026)
c413e65  Add type annotation for sent2 in wangchanberta.py (Copilot, Jan 29, 2026)
55787f2  Fix type hints in parse/core.py (Copilot, Jan 29, 2026)
48c3b46  Add type annotations in tag/_tag_perceptron.py (Copilot, Jan 29, 2026)
ff20049  Fix type hints in tag/named_entity.py (Copilot, Jan 29, 2026)
6a645fb  Fix type hints in translate/tokenization_small100.py (Copilot, Jan 29, 2026)
f6bf7e9  Replace | union syntax with Union for Python 3.9 compatibility (Copilot, Jan 29, 2026; see the sketch after this list)
04406ec  Fix incompatible return value and argument type issues (Copilot, Jan 29, 2026)
e94f53d  Fix type compatibility in corpus/common.py: use Any for mixed-type di… (Copilot, Jan 29, 2026)
eb8d1b1  Fix None argument issues in tag/unigram.py and tag/perceptron.py (Copilot, Jan 29, 2026)
a888187  Fix return type and type annotation issues in tag/thainer.py and tag/… (Copilot, Jan 29, 2026)
3352bbb  Fix return type and argument type issues in transliterate/w2p.py and … (Copilot, Jan 29, 2026)
cc3ed7b  Fix None argument to join in translate/en_th.py (Copilot, Jan 29, 2026)
48f57ce  Fix argument type compatibility in summarize/keybert.py (Copilot, Jan 29, 2026)
20ed024  Fix Iterable vs list argument type in corpus/util.py (Copilot, Jan 29, 2026)
6903495  Fix test and remove unnecessary type: ignore comments (Copilot, Jan 29, 2026)
defd6be  Fix import issues in tools/path.py (Copilot, Jan 29, 2026)
bb496fc  Add clarifying comment for importlib_resources type: ignore (Copilot, Jan 29, 2026)
48f1f7f  Fix imports (bact, Jan 29, 2026)
9d44ca7  Revert "Fix imports" (bact, Jan 29, 2026)
e626bc0  Fix imports (bact, Jan 29, 2026)
9c25660  Add importlib_resources import (bact, Jan 29, 2026)
3b548ed  Fix type annotations and ruff errors based on review feedback (Copilot, Jan 29, 2026)
07f16b1  Improve type precision per bot review feedback (Copilot, Jan 29, 2026)
ba221c3  Fix few lint issues (bact, Jan 29, 2026)
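
Commit f6bf7e9 above swaps PEP 604 `X | Y` annotations for `typing.Union`/`Optional`. A minimal sketch of why, assuming the package must still import on Python 3.9, where the `|` form raises at runtime whenever the annotation is actually evaluated (type aliases, `isinstance()` checks, `typing.get_type_hints()`):

```python
# Illustrative sketch only, not code from this PR.
from typing import Optional, Union

# Works on Python 3.9: Union/Optional are ordinary typing objects.
IntOrStr = Union[int, str]


def lookup(key: str, default: Optional[str] = None) -> Union[str, None]:
    return default if not key else key


# The PEP 604 spelling is only valid at runtime on Python 3.10+.
# On 3.9 the next line would raise:
#   TypeError: unsupported operand type(s) for |: 'type' and 'type'
# IntOrStr604 = int | str
```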
pyproject.toml (5 changes: 4 additions & 1 deletion)

@@ -61,7 +61,10 @@ classifiers = [
 ]

 # Core dependencies
-dependencies = ["tzdata; sys_platform == 'win32'"]
+dependencies = [
+    "importlib_resources; python_version < '3.11'",
+    "tzdata; sys_platform == 'win32'",
+]

 [project.optional-dependencies]

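Since `importlib_resources` above is only pulled in on Python < 3.11, the code that uses it presumably guards the import. A hedged sketch of the usual stdlib/backport fallback pattern (the reader function below is illustrative, not taken from the PR):

```python
# Sketch of the usual stdlib/backport fallback; not the PR's exact code.
import sys

if sys.version_info >= (3, 11):
    from importlib import resources
else:
    import importlib_resources as resources  # backport from PyPI


def read_packaged_text(package: str, filename: str) -> str:
    # files()/joinpath()/read_text() have the same API in the stdlib
    # (3.11+) and in the importlib_resources backport.
    return resources.files(package).joinpath(filename).read_text(encoding="utf-8")
```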
pythainlp/augment/lm/wangchanberta.py (13 changes: 6 additions & 7 deletions)

@@ -32,7 +32,7 @@ def __init__(self):
         self.MASK_TOKEN = self.tokenizer.mask_token

     def generate(self, sentence: str, num_replace_tokens: int = 3):
-        self.sent2 = []
+        sent2: list[str] = []
         self.input_text = sentence
         sent = [
             i for i in self.tokenizer.tokenize(self.input_text) if i != "▁"
@@ -42,13 +42,13 @@ def generate(self, sentence: str, num_replace_tokens: int = 3):
         masked_text = self.input_text
         for i in range(num_replace_tokens):
             masked_text = masked_text + self.MASK_TOKEN
-            self.sent2 += [
+            sent2 += [
                 str(j["sequence"]).replace("<s> ", "").replace("</s>", "")
                 for j in self.fill_mask(masked_text)
-                if j["sequence"] not in self.sent2
+                if j["sequence"] not in sent2
             ]
             masked_text = self.input_text
-        return self.sent2
+        return sent2

     def augment(self, sentence: str, num_replace_tokens: int = 3) -> list[str]:
         """Text augmentation from WangchanBERTa
@@ -73,6 +73,5 @@ def augment(self, sentence: str, num_replace_tokens: int = 3) -> list[str]:
             'ช้างมีทั้งหมด 50 ตัว บนนั้น',
             'ช้างมีทั้งหมด 50 ตัว บนหัว']
         """
-        self.sent2 = []
-        self.sent2 = self.generate(sentence, num_replace_tokens)
-        return self.sent2
+        sent2 = self.generate(sentence, num_replace_tokens)
+        return sent2
pythainlp/augment/word2vec/bpemb_wv.py (4 changes: 2 additions & 2 deletions)

@@ -35,15 +35,15 @@ def load_w2v(self):

     def augment(
         self, sentence: str, n_sent: int = 1, p: float = 0.7
-    ) -> list[tuple[str]]:
+    ) -> list[str]:
         """Text Augment using word2vec from BPEmb

         :param str sentence: Thai sentence
         :param int n_sent: number of sentence
         :param float p: probability of word

         :return: list of synonyms
-        :rtype: List[str]
+        :rtype: list[str]
         :Example:
         ::

pythainlp/augment/word2vec/core.py (21 changes: 11 additions & 10 deletions)

@@ -4,14 +4,15 @@
 from __future__ import annotations

 import itertools
+from typing import Callable


 class Word2VecAug:
     def __init__(
-        self, model: str, tokenize: object, type: str = "file"
+        self, model: str, tokenize: Callable[[str], list[str]], type: str = "file"
     ) -> None:
         """:param str model: path of model
-        :param object tokenize: tokenize function
+        :param Callable[[str], list[str]] tokenize: tokenize function
         :param str type: model type (file, binary)
         """
         import gensim.models.keyedvectors as word2vec
@@ -27,10 +28,10 @@ def __init__(
             self.model = model
         self.dict_wv = list(self.model.key_to_index.keys())

-    def modify_sent(self, sent: str, p: float = 0.7) -> list[list[str]]:
-        """:param str sent: text of sentence
+    def modify_sent(self, sent: list[str], p: float = 0.7) -> list[list[str]]:
+        """:param list[str] sent: list of tokens
         :param float p: probability
-        :rtype: List[List[str]]
+        :rtype: list[list[str]]
         """
         list_sent_new = []
         for i in sent:
@@ -46,17 +47,17 @@ def modify_sent(self, sent: str, p: float = 0.7) -> list[list[str]]:

     def augment(
         self, sentence: str, n_sent: int = 1, p: float = 0.7
-    ) -> list[tuple[str]]:
+    ) -> list[tuple[str, ...]]:
         """:param str sentence: text of sentence
         :param int n_sent: maximum number of synonymous sentences
         :param int p: probability

         :return: list of synonyms
-        :rtype: List[Tuple[str]]
+        :rtype: list[tuple[str, ...]]
         """
-        self.sentence = self.tokenizer(sentence)
-        self.list_synonym = self.modify_sent(self.sentence, p=p)
+        _sentence = self.tokenizer(sentence)
+        _list_synonym = self.modify_sent(_sentence, p=p)
         new_sentences = []
-        for x in list(itertools.product(*self.list_synonym))[0:n_sent]:
+        for x in list(itertools.product(*_list_synonym))[0:n_sent]:
             new_sentences.append(x)
         return new_sentences
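
A quick note on the tightened `tokenize` parameter above: any plain function from `str` to `list[str]` now satisfies `Callable[[str], list[str]]`. A small sketch (the model path in the comment is a placeholder, not a real file):

```python
# Sketch: what Callable[[str], list[str]] admits for the tokenize argument.
from typing import Callable


def whitespace_tokenize(text: str) -> list[str]:
    return text.split()


tokenize_fn: Callable[[str], list[str]] = whitespace_tokenize
print(tokenize_fn("Thai NLP text"))  # ['Thai', 'NLP', 'text']

# Word2VecAug("path/to/model.bin", tokenize=tokenize_fn, type="binary")
# would now type-check; the path above is only a placeholder.
```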
pythainlp/benchmarks/word_tokenization.py (34 changes: 17 additions & 17 deletions)

@@ -148,35 +148,35 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
     :return: metrics at character- and word-level and indicators of correctly tokenized words
     :rtype: dict[str, Union[float, str]]
     """
-    ref_sample = _binary_representation(ref_sample)
-    sample = _binary_representation(raw_sample)
+    ref_sample_arr = _binary_representation(ref_sample)
+    sample_arr = _binary_representation(raw_sample)

     # Compute character-level statistics
-    c_pos_pred, c_neg_pred = np.argwhere(sample == 1), np.argwhere(sample == 0)
+    c_pos_pred, c_neg_pred = np.argwhere(sample_arr == 1), np.argwhere(sample_arr == 0)

-    c_pos_pred = c_pos_pred[c_pos_pred < ref_sample.shape[0]]
-    c_neg_pred = c_neg_pred[c_neg_pred < ref_sample.shape[0]]
+    c_pos_pred = c_pos_pred[c_pos_pred < ref_sample_arr.shape[0]]
+    c_neg_pred = c_neg_pred[c_neg_pred < ref_sample_arr.shape[0]]

-    c_tp = np.sum(ref_sample[c_pos_pred] == 1)
-    c_fp = np.sum(ref_sample[c_pos_pred] == 0)
+    c_tp = np.sum(ref_sample_arr[c_pos_pred] == 1)
+    c_fp = np.sum(ref_sample_arr[c_pos_pred] == 0)

-    c_tn = np.sum(ref_sample[c_neg_pred] == 0)
-    c_fn = np.sum(ref_sample[c_neg_pred] == 1)
+    c_tn = np.sum(ref_sample_arr[c_neg_pred] == 0)
+    c_fn = np.sum(ref_sample_arr[c_neg_pred] == 1)

     # Compute word-level statistics

     # Find correctly tokenized words in the reference sample
-    word_boundaries = _find_word_boundaries(ref_sample)
+    word_boundaries = _find_word_boundaries(ref_sample_arr)

     # Find correctly tokenized words in the sample
-    ss_boundaries = _find_word_boundaries(sample)
+    ss_boundaries = _find_word_boundaries(sample_arr)
     tokenization_indicators = _find_words_correctly_tokenised(
         word_boundaries, ss_boundaries
     )

     correctly_tokenised_words = np.sum(tokenization_indicators)

-    tokenization_indicators = list(map(str, tokenization_indicators))
+    tokenization_indicators_str = list(map(str, tokenization_indicators))

     return {
         "char_level": {
@@ -187,11 +187,11 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
         },
         "word_level": {
             "correctly_tokenised_words": correctly_tokenised_words,
-            "total_words_in_sample": np.sum(sample),
-            "total_words_in_ref_sample": np.sum(ref_sample),
+            "total_words_in_sample": np.sum(sample_arr),
+            "total_words_in_ref_sample": np.sum(ref_sample_arr),
         },
         "global": {
-            "tokenisation_indicators": "".join(tokenization_indicators)
+            "tokenisation_indicators": "".join(tokenization_indicators_str)
         },
     }

@@ -246,14 +246,14 @@ def _find_word_boundaries(bin_reps) -> list:
 def _find_words_correctly_tokenised(
     ref_boundaries: list[tuple[int, int]],
     predicted_boundaries: list[tuple[int, int]],
-) -> tuple[int]:
+) -> tuple[int, ...]:
     """Find whether each word is correctly tokenized.

     :param list[tuple(int, int)] ref_boundaries: word boundaries of reference tokenization
     :param list[tuple(int, int)] predicted_boundaries: word boundareies of predicted tokenization

     :return: binary sequence where 1 indicates the corresponding word is tokenized correctly
-    :rtype: tuple[int]
+    :rtype: tuple[int, ...]
     """
     ref_b = dict(zip(ref_boundaries, [1] * len(ref_boundaries)))

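The renames above (`ref_sample` to `ref_sample_arr`, `sample` to `sample_arr`) follow the standard fix for a parameter that was being reassigned to a value of a different type, which mypy reports as an incompatible assignment. A reduced sketch of the pattern, with illustrative names rather than the benchmark's own helpers:

```python
# Reduced sketch of the rename pattern; not the benchmark code itself.
import numpy as np


def _to_binary(sample: str) -> np.ndarray:
    # 1 marks a token boundary character ("|"), 0 marks anything else.
    return np.array([1 if ch == "|" else 0 for ch in sample])


def count_boundaries(ref_sample: str) -> int:
    # Reassigning ref_sample here would silently change its type from
    # str to np.ndarray; binding a new name keeps both types accurate.
    ref_sample_arr = _to_binary(ref_sample)
    return int(ref_sample_arr.sum())


print(count_boundaries("|กิน|ข้าว|"))  # 3
```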
pythainlp/corpus/common.py (4 changes: 2 additions & 2 deletions)

@@ -63,8 +63,8 @@
 _THAI_ORST_WORDS: frozenset[str] = frozenset()

 _THAI_DICT: dict[str, list[str]] = {}
-_THAI_WSD_DICT: dict[str, list[str]] = {}
-_THAI_SYNONYMS: dict[str, list[str]] = {}
+_THAI_WSD_DICT: dict[str, Union[list[str], list[list[str]]]] = {}
+_THAI_SYNONYMS: dict[str, Union[list[str], list[list[str]]]] = {}


 def countries() -> frozenset[str]:
pythainlp/corpus/core.py (29 changes: 22 additions & 7 deletions)

@@ -40,7 +40,7 @@ def json(self) -> dict:
         try:
             return json.loads(self._content.decode("utf-8"))
         except (json.JSONDecodeError, UnicodeDecodeError) as err:
-            raise ValueError(f"Failed to parse JSON response: {err}")
+            raise ValueError(f"Failed to parse JSON response: {err}") from err


 def get_corpus_db(url: str) -> Optional[_ResponseWrapper]:
@@ -298,9 +298,17 @@ def get_corpus_path(name: str, version: str = "", force: bool = False) -> Option
     if corpus_db_detail and corpus_db_detail.get("filename"):
         # corpus is in the local catalog, get full path to the file
         if corpus_db_detail.get("is_folder"):
-            path = get_full_data_path(corpus_db_detail.get("foldername"))
+            foldername = corpus_db_detail.get("foldername")
+            if foldername:
+                path = get_full_data_path(foldername)
+            else:
+                return None
         else:
-            path = get_full_data_path(corpus_db_detail.get("filename"))
+            filename = corpus_db_detail.get("filename")
+            if filename:
+                path = get_full_data_path(filename)
+            else:
+                return None
         # check if the corpus file actually exists, download it if not
         if not os.path.exists(path):
             download(name, version=version, force=force)
@@ -736,10 +744,14 @@ def remove(name: str) -> bool:
         if data[0].get("is_folder"):
             import shutil

-            os.remove(get_full_data_path(data[0].get("filename")))
-            shutil.rmtree(path, ignore_errors=True)
+            filename = data[0].get("filename")
+            if filename:
+                os.remove(get_full_data_path(filename))
+            if path:
+                shutil.rmtree(path, ignore_errors=True)
         else:
-            os.remove(path)
+            if path:
+                os.remove(path)
         for i, corpus in db["_default"].copy().items():
             if corpus["name"] == name:
                 del db["_default"][i]
@@ -751,7 +763,10 @@ def remove(name: str) -> bool:


 def get_path_folder_corpus(name: str, version: str, *path: str) -> str:
-    return os.path.join(get_corpus_path(name, version), *path)
+    corpus_path = get_corpus_path(name, version)
+    if corpus_path is None:
+        raise ValueError(f"Corpus path not found for {name} version {version}")
+    return os.path.join(corpus_path, *path)


 def make_safe_directory_name(name: str) -> str:
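The guard added to `get_path_folder_corpus()` above is the usual way to narrow an `Optional[str]` before `os.path.join()`. A reduced, self-contained sketch of the same pattern (the stub below stands in for `get_corpus_path()`, which may return None):

```python
# Sketch of Optional narrowing; the stub is illustrative, not pythainlp's code.
import os
from typing import Optional


def get_corpus_path_stub(name: str) -> Optional[str]:
    return None if name == "missing" else f"/tmp/pythainlp-data/{name}"


def corpus_file(name: str, *parts: str) -> str:
    corpus_path = get_corpus_path_stub(name)
    if corpus_path is None:
        # Raising (or returning early) narrows corpus_path to str below,
        # so os.path.join() never receives Optional[str].
        raise ValueError(f"Corpus path not found for {name}")
    return os.path.join(corpus_path, *parts)


print(corpus_file("thainer", "model.crf"))
```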
pythainlp/corpus/util.py (9 changes: 5 additions & 4 deletions)

@@ -43,12 +43,13 @@ def find_badwords(
     :return: words that are considered to make `tokenize` perform badly
     :rtype: Set[str]
     """
-    right = Counter()
-    wrong = Counter()
+    right: Counter[str] = Counter()
+    wrong: Counter[str] = Counter()

     for train_words in training_data:
-        train_set = set(index_pairs(train_words))
-        test_words = tokenize("".join(train_words))
+        train_words_list = list(train_words)
+        train_set = set(index_pairs(train_words_list))
+        test_words = tokenize("".join(train_words_list))
         test_pairs = index_pairs(test_words)
         for w, p in zip(test_words, test_pairs):
             if p in train_set:
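Two things happen above: an empty `Counter()` gets an explicit `Counter[str]` annotation, and the incoming tokens are copied with `list()` so a one-shot iterable is not consumed twice. A hedged sketch of both points, with a simplified stand-in for `index_pairs`:

```python
# Sketch only; index_pairs here is a simplified stand-in, not pythainlp's.
from collections import Counter
from collections.abc import Iterable


def index_pairs(words: list[str]) -> list[tuple[int, int]]:
    # (start, end) character offsets of each word, laid back to back.
    pairs, start = [], 0
    for w in words:
        pairs.append((start, start + len(w)))
        start += len(w)
    return pairs


def count_tokens(training_data: Iterable[Iterable[str]]) -> Counter[str]:
    right: Counter[str] = Counter()  # empty Counter needs an explicit annotation
    for train_words in training_data:
        train_words_list = list(train_words)  # may be a generator: copy it once
        _ = index_pairs(train_words_list)     # first pass over the tokens
        _ = "".join(train_words_list)         # second pass over the same tokens
        right.update(train_words_list)
    return right


print(count_tokens([iter(["กิน", "ข้าว"])]))  # Counter({'กิน': 1, 'ข้าว': 1})
```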
pythainlp/lm/text_util.py (6 changes: 3 additions & 3 deletions)

@@ -7,7 +7,7 @@

 def calculate_ngram_counts(
     list_words: list[str], n_min: int = 2, n_max: int = 4
-) -> dict[tuple[str], int]:
+) -> dict[tuple[str, ...], int]:
     """Calculates the counts of n-grams in the list words for the specified range.

     :param List[str] list_words: List of string
@@ -20,7 +20,7 @@ def calculate_ngram_counts(
     if not list_words:
         return {}

-    ngram_counts = {}
+    ngram_counts: dict[tuple[str, ...], int] = {}

     for n in range(n_min, n_max + 1):
         for i in range(len(list_words) - n + 1):
@@ -51,7 +51,7 @@ def remove_repeated_ngrams(string_list: list[str], n: int = 2) -> list[str]:

     unique_ngrams = set()

-    output_list = []
+    output_list: list[str] = []

     for i in range(len(string_list)):
         if i + n <= len(string_list):
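`dict[tuple[str, ...], int]` above is the right key type because each n-gram key is a tuple whose length varies with n (`tuple[str]` would mean exactly one element). A small illustrative counter, not pythainlp's `calculate_ngram_counts`:

```python
# Simplified n-gram counter showing the dict[tuple[str, ...], int] key type.
def ngram_counts(
    tokens: list[str], n_min: int = 2, n_max: int = 3
) -> dict[tuple[str, ...], int]:
    counts: dict[tuple[str, ...], int] = {}
    for n in range(n_min, n_max + 1):
        for i in range(len(tokens) - n + 1):
            gram = tuple(tokens[i:i + n])  # length n, hence tuple[str, ...]
            counts[gram] = counts.get(gram, 0) + 1
    return counts


print(ngram_counts(["ฉัน", "กิน", "ข้าว", "กิน", "ข้าว"]))
```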
pythainlp/parse/core.py (18 changes: 9 additions & 9 deletions)

@@ -3,9 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations

-from typing import List, Optional, Union
+from typing import Any, List, Optional, Union

-_tagger = None
+_tagger: Optional[Any] = None
 _tagger_name = ""


@@ -100,22 +100,22 @@ def dependency_parsing(
     if engine == "esupar":
         from pythainlp.parse.esupar_engine import Parse

-        _tagger = Parse(model=model)
+        _tagger = Parse(model=model if model else "th")
     elif engine == "transformers_ud":
-        from pythainlp.parse.transformers_ud import Parse
+        from pythainlp.parse.transformers_ud import Parse  # type: ignore[assignment] # noqa: I001

-        _tagger = Parse(model=model)
+        _tagger = Parse(model=model if model else "KoichiYasuoka/deberta-base-thai-ud-head")
     elif engine == "spacy_thai":
-        from pythainlp.parse.spacy_thai_engine import Parse
+        from pythainlp.parse.spacy_thai_engine import Parse  # type: ignore[assignment] # noqa: I001

         _tagger = Parse()
     elif engine == "ud_goeswith":
-        from pythainlp.parse.ud_goeswith import Parse
+        from pythainlp.parse.ud_goeswith import Parse  # type: ignore[assignment] # noqa: I001

-        _tagger = Parse(model=model)
+        _tagger = Parse(model=model if model else "KoichiYasuoka/deberta-base-thai-ud-goeswith")
     else:
         raise NotImplementedError("The engine doesn't support.")

     _tagger_name = engine

-    return _tagger(text, tag=tag)
+    return _tagger(text, tag=tag)  # type: ignore[misc]
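
The `_tagger: Optional[Any] = None` annotation above types the usual lazily initialised module-level cache, and the `model if model else ...` change supplies a default model name when none is given. A minimal sketch of that caching pattern with a dummy engine, not the real parsers:

```python
# Sketch of a lazily initialised module-level cache; DummyParse is a stand-in.
from typing import Any, Optional

_tagger: Optional[Any] = None
_tagger_name = ""


class DummyParse:
    def __init__(self, model: str) -> None:
        self.model = model

    def __call__(self, text: str) -> list[str]:
        return text.split()


def dependency_parsing_stub(text: str, engine: str = "dummy", model: str = "") -> list[str]:
    global _tagger, _tagger_name
    if _tagger is None or _tagger_name != engine:
        # Fall back to a default model name when none was given,
        # mirroring the "model if model else ..." pattern above.
        _tagger = DummyParse(model=model if model else "default-model")
        _tagger_name = engine
    return _tagger(text)


print(dependency_parsing_stub("Thai NLP parses text"))
```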
pythainlp/summarize/keybert.py (4 changes: 2 additions & 2 deletions)

@@ -169,7 +169,7 @@ def _generate_ngrams(
                 f"current value={keyphrase_ngram_range}."
             )

-        def _join_ngram(ngrams: list[tuple[str, str]]) -> list[str]:
+        def _join_ngram(ngrams: list[tuple[str, ...]]) -> list[str]:  # type: ignore[type-arg]
             ngrams_joined = []
             for ng in ngrams:
                 joined = "".join(ng)
@@ -187,7 +187,7 @@ def _join_ngram(ngrams: list[tuple[str, str]]) -> list[str]:
             ngrams = [word for word in words if word.strip()]
         else:
             ngrams_tuple = zip(*[words[i:] for i in range(n)])
-            ngrams = _join_ngram(ngrams_tuple)
+            ngrams = _join_ngram(list(ngrams_tuple))  # type: ignore[arg-type]

         ngrams_cnt = Counter(ngrams)
         ngrams = [