Commit 59695bd

Merge pull request #1230 from PyThaiNLP/copilot/fix-type-hints-inconsistencies
Revert union type syntax from `|` to `Union[]` for Python 3.9 runtime compatibility
2 parents a075cb2 + b373acc commit 59695bd

18 files changed (+86 −68 lines)

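Background on the revert: the PEP 604 `X | Y` spelling is only valid at runtime on Python 3.10+, while `typing.Union[...]` and the PEP 585 built-in generics (`list[str]`, `dict[str, str]`) already work on 3.9. A minimal sketch (not taken from the repository) contrasting the two spellings:

```python
from typing import Union


def ok_on_39(details: bool = False) -> Union[frozenset[str], list[dict[str, str]]]:
    """Union[] plus built-in generics evaluate fine on Python 3.9."""
    return [] if details else frozenset()


# The PEP 604 spelling below raises TypeError when evaluated on Python 3.9
# (the | operator on types needs 3.10+), unless the module defers evaluation
# with `from __future__ import annotations`:
#
# def breaks_on_39(details: bool = False) -> frozenset[str] | list[dict[str, str]]:
#     ...
```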

pythainlp/corpus/common.py

Lines changed: 4 additions & 4 deletions
@@ -62,9 +62,9 @@
 
 _THAI_ORST_WORDS: frozenset[str] = frozenset()
 
-_THAI_DICT: dict[str, list] = {}
-_THAI_WSD_DICT: dict[str, list] = {}
-_THAI_SYNONYMS: dict[str, list] = {}
+_THAI_DICT: dict[str, list[str]] = {}
+_THAI_WSD_DICT: dict[str, list[str]] = {}
+_THAI_SYNONYMS: dict[str, list[str]] = {}
 
 
 def countries() -> frozenset[str]:
@@ -83,7 +83,7 @@ def countries() -> frozenset[str]:
     return _THAI_COUNTRIES
 
 
-def provinces(details: bool = False) -> Union[frozenset[str], list[dict]]:
+def provinces(details: bool = False) -> Union[frozenset[str], list[dict[str, str]]]:
     """Return a frozenset of Thailand province names in Thai such as "กระบี่",
     "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
     \n(See: `dev/pythainlp/corpus/thailand_provinces_th.txt\
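Note on the `provinces` change: a caller still has to narrow the `Union` return itself. A small, self-contained sketch of that narrowing (the stub below and its "name_th" key are hypothetical, standing in for the real corpus-backed function):

```python
from typing import Union


def provinces_stub(details: bool = False) -> Union[frozenset[str], list[dict[str, str]]]:
    # Hypothetical stand-in for pythainlp.corpus.provinces(); real data comes from the corpus file.
    if details:
        return [{"name_th": "กระบี่"}]  # "name_th" is an illustrative key, not the actual schema
    return frozenset({"กระบี่"})


result = provinces_stub(details=True)
if isinstance(result, frozenset):
    names = sorted(result)
else:
    names = [record["name_th"] for record in result]
print(names)
```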
pythainlp/corpus/core.py

Lines changed: 7 additions & 6 deletions
@@ -11,6 +11,7 @@
 import sys
 import tarfile
 import zipfile
+from http.client import HTTPResponse
 from importlib.resources import files
 from typing import Optional
 
@@ -29,20 +30,20 @@
 class _ResponseWrapper:
     """Wrapper to provide requests.Response-like interface for urllib response."""
 
-    def __init__(self, response):
+    def __init__(self, response: HTTPResponse) -> None:
         self.status_code = response.status
         self.headers = response.headers
         self._content = response.read()
 
-    def json(self):
+    def json(self) -> dict:
         """Parse JSON content from response."""
         try:
             return json.loads(self._content.decode("utf-8"))
         except (json.JSONDecodeError, UnicodeDecodeError) as err:
             raise ValueError(f"Failed to parse JSON response: {err}")
 
 
-def get_corpus_db(url: str):
+def get_corpus_db(url: str) -> Optional[_ResponseWrapper]:
     """Get corpus catalog from server.
 
     :param str url: URL corpus catalog
@@ -69,7 +70,7 @@ def get_corpus_db(url: str):
     return corpus_db
 
 
-def get_corpus_db_detail(name: str, version: str = "") -> dict:
+def get_corpus_db_detail(name: str, version: str = "") -> dict[str, str]:
     """Get details about a corpus, using information from local catalog.
 
     :param str name: name of corpus
@@ -172,7 +173,7 @@ def get_corpus(filename: str, comments: bool = True) -> frozenset:
     return frozenset(filter(None, lines))
 
 
-def get_corpus_as_is(filename: str) -> list:
+def get_corpus_as_is(filename: str) -> list[str]:
     """Read corpus data from file, as it is, and return a list.
 
     Each line in the file will be a member of the list.
@@ -749,7 +750,7 @@ def remove(name: str) -> bool:
     return False
 
 
-def get_path_folder_corpus(name, version, *path):
+def get_path_folder_corpus(name: str, version: str, *path: str) -> str:
     return os.path.join(get_corpus_path(name, version), *path)
 
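The `_ResponseWrapper` hunk shows the whole pattern: wrap the `HTTPResponse` that `urllib.request.urlopen()` returns so callers written against a `requests.Response`-style object (`status_code`, `headers`, `.json()`) keep working without the `requests` dependency. A condensed, standalone sketch of the same idea (the URL in the comment is a placeholder):

```python
import json
from http.client import HTTPResponse


class ResponseWrapper:
    """Requests-like view over a urllib HTTPResponse, mirroring the diff above."""

    def __init__(self, response: HTTPResponse) -> None:
        self.status_code = response.status
        self.headers = response.headers
        self._content = response.read()

    def json(self) -> dict:
        try:
            return json.loads(self._content.decode("utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            raise ValueError(f"Failed to parse JSON response: {err}") from err


# Hypothetical usage; pythainlp's real catalog URL is configured elsewhere.
# wrapped = ResponseWrapper(urllib.request.urlopen("https://example.com/catalog.json"))
# print(wrapped.status_code, wrapped.json())
```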
pythainlp/corpus/wordnet.py

Lines changed: 14 additions & 13 deletions
@@ -12,6 +12,7 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterable
 from typing import Optional
 
 import nltk
@@ -29,7 +30,7 @@
 from nltk.corpus import wordnet
 
 
-def synsets(word: str, pos: Optional[str] = None, lang: str = "tha"):
+def synsets(word: str, pos: Optional[str] = None, lang: str = "tha") -> list[wordnet.Synset]:
     """This function returns the synonym set for all lemmas of the given word
     with an optional argument to constrain the part of speech of the word.
 
@@ -76,7 +77,7 @@ def synsets(word: str, pos: Optional[str] = None, lang: str = "tha"):
     return wordnet.synsets(lemma=word, pos=pos, lang=lang)
 
 
-def synset(name_synsets):
+def synset(name_synsets: str) -> wordnet.Synset:
     """This function returns the synonym set (synset) given the name of the synset
     (i.e. 'dog.n.01', 'chase.v.01').
 
@@ -100,7 +101,7 @@ def synset(name_synsets):
     return wordnet.synset(name_synsets)
 
 
-def all_lemma_names(pos: Optional[str] = None, lang: str = "tha"):
+def all_lemma_names(pos: Optional[str] = None, lang: str = "tha") -> list[str]:
     """This function returns all lemma names for all synsets of the given
     part of speech tag and language. If part of speech tag is not
     specified, all synsets of all parts of speech will be used.
@@ -142,7 +143,7 @@ def all_lemma_names(pos: Optional[str] = None, lang: str = "tha"):
     return wordnet.all_lemma_names(pos=pos, lang=lang)
 
 
-def all_synsets(pos: Optional[str] = None):
+def all_synsets(pos: Optional[str] = None) -> Iterable[wordnet.Synset]:
     """This function iterates over all synsets constrained by the given
     part of speech tag.
 
@@ -172,7 +173,7 @@ def all_synsets(pos: Optional[str] = None):
     return wordnet.all_synsets(pos=pos)
 
 
-def langs():
+def langs() -> list[str]:
     """This function returns a set of ISO-639 language codes.
 
     :return: ISO-639 language codes
@@ -190,7 +191,7 @@ def langs():
     return wordnet.langs()
 
 
-def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha"):
+def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha") -> list[wordnet.Lemma]:
     """This function returns all lemmas given the word with an optional
     argument to constrain the part of speech of the word.
 
@@ -233,7 +234,7 @@ def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha"):
     return wordnet.lemmas(word, pos=pos, lang=lang)
 
 
-def lemma(name_synsets):
+def lemma(name_synsets: str) -> wordnet.Lemma:
     """This function returns lemma object given the name.
 
     .. note::
@@ -260,7 +261,7 @@ def lemma(name_synsets):
     return wordnet.lemma(name_synsets)
 
 
-def lemma_from_key(key):
+def lemma_from_key(key: str) -> wordnet.Lemma:
     """This function returns lemma object given the lemma key.
     This is similar to :func:`lemma` but it needs to be given the key
     of lemma instead of the name of lemma.
@@ -286,7 +287,7 @@ def lemma_from_key(key):
     return wordnet.lemma_from_key(key)
 
 
-def path_similarity(synsets1, synsets2):
+def path_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
     """This function returns similarity between two synsets based on the
     shortest path distance calculated using the equation below.
 
@@ -325,7 +326,7 @@ def path_similarity(synsets1, synsets2):
     return wordnet.path_similarity(synsets1, synsets2)
 
 
-def lch_similarity(synsets1, synsets2):
+def lch_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
     """This function returns Leacock Chodorow similarity (LCH)
     between two synsets, based on the shortest path distance
     and the maximum depth of the taxonomy. The equation to
@@ -362,7 +363,7 @@ def lch_similarity(synsets1, synsets2):
     return wordnet.lch_similarity(synsets1, synsets2)
 
 
-def wup_similarity(synsets1, synsets2):
+def wup_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
     """This function returns Wu-Palmer similarity (WUP) between two synsets,
     based on the depth of the two senses in the taxonomy and their
     Least Common Subsumer (most specific ancestor node).
@@ -393,7 +394,7 @@ def wup_similarity(synsets1, synsets2):
     return wordnet.wup_similarity(synsets1, synsets2)
 
 
-def morphy(form, pos: Optional[str] = None):
+def morphy(form: str, pos: Optional[str] = None) -> str:
     """This function finds a possible base form for the given form,
     with the given part of speech.
 
@@ -423,7 +424,7 @@ def morphy(form, pos: Optional[str] = None):
     return wordnet.morphy(form, pos=None)
 
 
-def custom_lemmas(tab_file, lang: str):
+def custom_lemmas(tab_file, lang: str) -> None:
     """This function reads a custom tab file
     (see: http://compling.hss.ntu.edu.sg/omw/)
     containing mappings of lemmas in the given language.
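These wrappers delegate to `nltk.corpus.wordnet`, so the new return annotations simply mirror what NLTK hands back. A usage sketch, assuming the NLTK `wordnet` and `omw-1.4` data packages are already downloaded:

```python
# Assumes: import nltk; nltk.download("wordnet"); nltk.download("omw-1.4")
from pythainlp.corpus.wordnet import path_similarity, synsets

dog = synsets("หมา", lang="tha")   # list[wordnet.Synset], per the new annotation
cat = synsets("แมว", lang="tha")

if dog and cat:
    # path_similarity now advertises a float return, matching the diff above.
    print(path_similarity(dog[0], cat[0]))
```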
pythainlp/tag/_tag_perceptron.py

Lines changed: 4 additions & 4 deletions
@@ -47,7 +47,7 @@ def __init__(self) -> None:
         # Number of instances seen
         self.i = 0
 
-    def predict(self, features: dict):
+    def predict(self, features: dict[str, float]) -> str:
         """Dot-product the features and current weights and return the best
         label.
         """
@@ -61,10 +61,10 @@ def predict(self, features: dict):
         # Do a secondary alphabetic sort, for stability
         return max(self.classes, key=lambda label: (scores[label], label))
 
-    def update(self, truth, guess, features: dict) -> None:
+    def update(self, truth: str, guess: str, features: dict[str, float]) -> None:
         """Update the feature weights."""
 
-        def upd_feat(c, f, w, v):
+        def upd_feat(c: str, f: str, w: float, v: float) -> None:
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
@@ -236,7 +236,7 @@ def _get_features(
        trained.
        """
 
-        def add(name: str, *args):
+        def add(name: str, *args: str) -> None:
            features[" ".join((name,) + tuple(args))] += 1
 
        i += len(self.START)
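The newly typed `predict` is the standard averaged-perceptron scoring step: accumulate each active feature's per-class weight, then pick the top class with an alphabetical tie-break for stability. A self-contained sketch of that step with toy weights (not the trained model):

```python
from collections import defaultdict


def predict(features: dict[str, float], weights: dict[str, dict[str, float]], classes: set[str]) -> str:
    """Score each class as the dot product of feature values and weights."""
    scores: defaultdict[str, float] = defaultdict(float)
    for feat, value in features.items():
        if feat not in weights or value == 0:
            continue
        for label, weight in weights[feat].items():
            scores[label] += value * weight
    # Secondary alphabetic sort for stable tie-breaking, as in the diff above.
    return max(classes, key=lambda label: (scores[label], label))


print(predict({"word=กิน": 1.0}, {"word=กิน": {"VERB": 2.0, "NOUN": 0.5}}, {"VERB", "NOUN"}))  # VERB
```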
pythainlp/tag/crfchunk.py

Lines changed: 13 additions & 6 deletions
@@ -3,7 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import types
 from importlib.resources import as_file, files
+from typing import Optional, Union
 
 from pycrfsuite import Tagger as CRFTagger
 
@@ -14,7 +16,7 @@ def _is_stopword(word: str) -> bool: # check Thai stopword
     return word in thai_stopwords()
 
 
-def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict:
+def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict[str, Union[str, bool]]:
     """`tokens` = a POS-tagged sentence [(w1, t1), ...]
     `index` = the index of the token we want to extract features for
     """
@@ -52,7 +54,7 @@ def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict:
     return f
 
 
-def extract_features(doc):
+def extract_features(doc: list[tuple[str, str]]) -> list[dict[str, Union[str, bool]]]:
     return [_doc2features(doc, i) for i in range(0, len(doc))]
 
 
@@ -74,7 +76,7 @@ def __init__(self, corpus: str = "orchidpp"):
         self._model_file_ctx = None
         self.load_model(self.corpus)
 
-    def load_model(self, corpus: str):
+    def load_model(self, corpus: str) -> None:
         self.tagger = CRFTagger()
         if corpus == "orchidpp":
             corpus_files = files("pythainlp.corpus")
@@ -87,11 +89,16 @@ def parse(self, token_pos: list[tuple[str, str]]) -> list[str]:
         self.xseq = extract_features(token_pos)
         return self.tagger.tag(self.xseq)
 
-    def __enter__(self):
+    def __enter__(self) -> CRFchunk:
         """Context manager entry."""
         return self
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
+    def __exit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType]
+    ) -> bool:
         """Context manager exit - clean up resources."""
         if self._model_file_ctx is not None:
             try:
@@ -101,7 +108,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
                 pass
         return False
 
-    def __del__(self):
+    def __del__(self) -> None:
         """Clean up the context manager when object is destroyed.
 
         Note: __del__ is not guaranteed to be called and should not be
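The expanded `__exit__` signature is just the standard typing for the context-manager protocol. A minimal stand-alone illustration of the same pattern (unrelated to the CRF model itself):

```python
from __future__ import annotations

import types
from typing import Optional


class ManagedResource:
    def __enter__(self) -> ManagedResource:
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> bool:
        # Returning False propagates any exception, as CRFchunk.__exit__ does.
        return False


with ManagedResource():
    pass
```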
pythainlp/tag/thainer.py

Lines changed: 1 addition & 1 deletion
@@ -201,5 +201,5 @@ def get_ner(
         return sent_ner
 
     @staticmethod
-    def __extract_features(doc):
+    def __extract_features(doc: list[str]) -> list[dict[str, str | bool]]:
         return [_doc2features(doc, i) for i in range(len(doc))]
pythainlp/tag/wangchanberta_onnx.py

Lines changed: 7 additions & 6 deletions
@@ -4,6 +4,7 @@
 from __future__ import annotations
 
 import json
+from typing import Union
 
 import numpy as np
 
@@ -54,7 +55,7 @@ def __init__(
             self._json = json.load(fh)
         self.id2tag = self._json["id2label"]
 
-    def build_tokenizer(self, sent):
+    def build_tokenizer(self, sent: str) -> dict[str, np.ndarray]:
         _t = [5] + [i + 4 for i in self.sp.encode(sent)] + [6]
         model_inputs = {}
         model_inputs["input_ids"] = np.array([_t], dtype=np.int64)
@@ -63,17 +64,17 @@ def build_tokenizer(self, sent):
         )
         return model_inputs
 
-    def postprocess(self, logits_data):
+    def postprocess(self, logits_data: np.ndarray) -> np.ndarray:
         logits_t = logits_data[0]
         maxes = np.max(logits_t, axis=-1, keepdims=True)
         shifted_exp = np.exp(logits_t - maxes)
         scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
         return scores
 
-    def clean_output(self, list_text):
+    def clean_output(self, list_text: list[tuple[str, str]]) -> list[tuple[str, str]]:
         return list_text
 
-    def totag(self, post, sent):
+    def totag(self, post: np.ndarray, sent: str) -> list[tuple[str, str]]:
         tag = []
         _s = self.sp.EncodeAsPieces(sent)
         for i in range(len(_s)):
@@ -87,10 +88,10 @@ def totag(self, post, sent):
             )
         return tag
 
-    def _config(self, list_ner):
+    def _config(self, list_ner: list[tuple[str, str]]) -> list[tuple[str, str]]:
         return list_ner
 
-    def get_ner(self, text: str, tag: bool = False):
+    def get_ner(self, text: str, tag: bool = False) -> Union[str, list[tuple[str, str]]]:
         self._s = self.build_tokenizer(text)
         logits = self.session.run(
             output_names=[self.outputs_name], input_feed=self._s
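`postprocess` above is a numerically stable softmax over the logits returned by the ONNX session: subtracting the per-row maximum before exponentiating avoids overflow for large logits. The same computation in isolation:

```python
import numpy as np


def softmax_stable(logits: np.ndarray) -> np.ndarray:
    """Row-wise softmax, shifted by the row max for numerical stability."""
    maxes = np.max(logits, axis=-1, keepdims=True)
    shifted_exp = np.exp(logits - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)


scores = softmax_stable(np.array([[2.0, 1.0, 0.1]]))
print(scores, scores.sum())  # each row sums to 1.0
```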
pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -29,13 +29,13 @@
 
 
 @lru_cache
-def word_dict_trie():
+def word_dict_trie() -> Trie:
     """Lazy load default word dict trie with cache"""
     return Trie(thai_words())
 
 
 @lru_cache
-def syllable_dict_trie():
+def syllable_dict_trie() -> Trie:
     """Lazy load default syllable dict trie with cache"""
     return Trie(thai_syllables())
 
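`@lru_cache` on a zero-argument function is a convenient lazy singleton: the `Trie` is built on the first call and served from the cache afterwards. A generic sketch of the pattern (a plain frozenset stands in for the trie):

```python
from functools import lru_cache


@lru_cache
def word_set() -> frozenset[str]:
    """Built once on the first call, then reused from the cache."""
    print("loading dictionary ...")  # printed only once
    return frozenset({"ไทย", "ภาษา"})


word_set()  # builds and caches
word_set()  # cache hit; no second "loading" message
```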
pythainlp/tokenize/core.py

Lines changed: 2 additions & 2 deletions
@@ -338,7 +338,7 @@ def word_tokenize(
     return segments
 
 
-def indices_words(words):
+def indices_words(words: list[str]) -> list[tuple[int, int]]:
     """Convert a list of words to a list of character index pairs.
 
     This function takes a list of words and returns the start and end
@@ -369,7 +369,7 @@ def indices_words(words):
     return indices
 
 
-def map_indices_to_words(index_list, sentences):
+def map_indices_to_words(index_list: list[tuple[int, int]], sentences: list[str]) -> list[list[str]]:
     """Map character index pairs to actual words from sentences.
 
     This function takes a list of character index pairs and a list of
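A rough illustration of what the new `indices_words` annotation describes, assuming half-open `(start, end)` pairs as in Python slicing; the real implementation may use a different convention:

```python
def indices_words_sketch(words: list[str]) -> list[tuple[int, int]]:
    # Hypothetical re-implementation for illustration only.
    indices = []
    start = 0
    for word in words:
        end = start + len(word)
        indices.append((start, end))
        start = end
    return indices


print(indices_words_sketch(["ผม", "รัก", "คุณ"]))  # [(0, 2), (2, 5), (5, 8)]
```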