Commit 59695bd

Merge pull request #1230 from PyThaiNLP/copilot/fix-type-hints-inconsistencies
Revert union type syntax from `|` to `Union[]` for Python 3.9 runtime compatibility
2 parents a075cb2 + b373acc commit 59695bd

18 files changed (+86 −68 lines)

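Background on the revert: the PEP 604 `X | Y` spelling is only valid at runtime on Python 3.10+, while `typing.Union[...]` and the PEP 585 built-in generics (`list[str]`, `dict[str, str]`) already work on 3.9. A minimal sketch (not taken from the repository) contrasting the two spellings:

```python
from typing import Union


def ok_on_39(details: bool = False) -> Union[frozenset[str], list[dict[str, str]]]:
    """Union[] plus built-in generics evaluate fine on Python 3.9."""
    return [] if details else frozenset()


# The PEP 604 spelling below raises TypeError when evaluated on Python 3.9
# (the | operator on types needs 3.10+), unless the module defers evaluation
# with `from __future__ import annotations`:
#
# def breaks_on_39(details: bool = False) -> frozenset[str] | list[dict[str, str]]:
#     ...
```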

pythainlp/corpus/common.py

Lines changed: 4 additions & 4 deletions
@@ -62,9 +62,9 @@
 
 _THAI_ORST_WORDS: frozenset[str] = frozenset()
 
-_THAI_DICT: dict[str, list] = {}
-_THAI_WSD_DICT: dict[str, list] = {}
-_THAI_SYNONYMS: dict[str, list] = {}
+_THAI_DICT: dict[str, list[str]] = {}
+_THAI_WSD_DICT: dict[str, list[str]] = {}
+_THAI_SYNONYMS: dict[str, list[str]] = {}
 
 
 def countries() -> frozenset[str]:
@@ -83,7 +83,7 @@ def countries() -> frozenset[str]:
     return _THAI_COUNTRIES
 
 
-def provinces(details: bool = False) -> Union[frozenset[str], list[dict]]:
+def provinces(details: bool = False) -> Union[frozenset[str], list[dict[str, str]]]:
     """Return a frozenset of Thailand province names in Thai such as "กระบี่",
     "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
     \n(See: `dev/pythainlp/corpus/thailand_provinces_th.txt\
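Note on the `provinces` change: a caller still has to narrow the `Union` return itself. A small, self-contained sketch of that narrowing (the stub below and its "name_th" key are hypothetical, standing in for the real corpus-backed function):

```python
from typing import Union


def provinces_stub(details: bool = False) -> Union[frozenset[str], list[dict[str, str]]]:
    # Hypothetical stand-in for pythainlp.corpus.provinces(); real data comes from the corpus file.
    if details:
        return [{"name_th": "กระบี่"}]  # "name_th" is an illustrative key, not the actual schema
    return frozenset({"กระบี่"})


result = provinces_stub(details=True)
if isinstance(result, frozenset):
    names = sorted(result)
else:
    names = [record["name_th"] for record in result]
print(names)
```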
pythainlp/corpus/core.py

Lines changed: 7 additions & 6 deletions
@@ -11,6 +11,7 @@
 import sys
 import tarfile
 import zipfile
+from http.client import HTTPResponse
 from importlib.resources import files
 from typing import Optional
 
@@ -29,20 +30,20 @@
 class _ResponseWrapper:
     """Wrapper to provide requests.Response-like interface for urllib response."""
 
-    def __init__(self, response):
+    def __init__(self, response: HTTPResponse) -> None:
         self.status_code = response.status
         self.headers = response.headers
         self._content = response.read()
 
-    def json(self):
+    def json(self) -> dict:
         """Parse JSON content from response."""
         try:
             return json.loads(self._content.decode("utf-8"))
         except (json.JSONDecodeError, UnicodeDecodeError) as err:
             raise ValueError(f"Failed to parse JSON response: {err}")
 
 
-def get_corpus_db(url: str):
+def get_corpus_db(url: str) -> Optional[_ResponseWrapper]:
     """Get corpus catalog from server.
 
     :param str url: URL corpus catalog
@@ -69,7 +70,7 @@ def get_corpus_db(url: str):
     return corpus_db
 
 
-def get_corpus_db_detail(name: str, version: str = "") -> dict:
+def get_corpus_db_detail(name: str, version: str = "") -> dict[str, str]:
     """Get details about a corpus, using information from local catalog.
 
     :param str name: name of corpus
@@ -172,7 +173,7 @@ def get_corpus(filename: str, comments: bool = True) -> frozenset:
     return frozenset(filter(None, lines))
 
 
-def get_corpus_as_is(filename: str) -> list:
+def get_corpus_as_is(filename: str) -> list[str]:
     """Read corpus data from file, as it is, and return a list.
 
     Each line in the file will be a member of the list.
@@ -749,7 +750,7 @@ def remove(name: str) -> bool:
     return False
 
 
-def get_path_folder_corpus(name, version, *path):
+def get_path_folder_corpus(name: str, version: str, *path: str) -> str:
     return os.path.join(get_corpus_path(name, version), *path)
 
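The `_ResponseWrapper` hunk shows the whole pattern: wrap the `HTTPResponse` that `urllib.request.urlopen()` returns so callers written against a `requests.Response`-style object (`status_code`, `headers`, `.json()`) keep working without the `requests` dependency. A condensed, standalone sketch of the same idea (the URL in the comment is a placeholder):

```python
import json
from http.client import HTTPResponse


class ResponseWrapper:
    """Requests-like view over a urllib HTTPResponse, mirroring the diff above."""

    def __init__(self, response: HTTPResponse) -> None:
        self.status_code = response.status
        self.headers = response.headers
        self._content = response.read()

    def json(self) -> dict:
        try:
            return json.loads(self._content.decode("utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            raise ValueError(f"Failed to parse JSON response: {err}") from err


# Hypothetical usage; pythainlp's real catalog URL is configured elsewhere.
# wrapped = ResponseWrapper(urllib.request.urlopen("https://example.com/catalog.json"))
# print(wrapped.status_code, wrapped.json())
```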
pythainlp/corpus/wordnet.py

Lines changed: 14 additions & 13 deletions
@@ -12,6 +12,7 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterable
 from typing import Optional
 
 import nltk
@@ -29,7 +30,7 @@
 from nltk.corpus import wordnet
 
 
-def synsets(word: str, pos: Optional[str] = None, lang: str = "tha"):
+def synsets(word: str, pos: Optional[str] = None, lang: str = "tha") -> list[wordnet.Synset]:
     """This function returns the synonym set for all lemmas of the given word
     with an optional argument to constrain the part of speech of the word.
 
@@ -76,7 +77,7 @@ def synsets(word: str, pos: Optional[str] = None, lang: str = "tha"):
     return wordnet.synsets(lemma=word, pos=pos, lang=lang)
 
 
-def synset(name_synsets):
+def synset(name_synsets: str) -> wordnet.Synset:
     """This function returns the synonym set (synset) given the name of the synset
     (i.e. 'dog.n.01', 'chase.v.01').
 
@@ -100,7 +101,7 @@ def synset(name_synsets):
     return wordnet.synset(name_synsets)
 
 
-def all_lemma_names(pos: Optional[str] = None, lang: str = "tha"):
+def all_lemma_names(pos: Optional[str] = None, lang: str = "tha") -> list[str]:
     """This function returns all lemma names for all synsets of the given
     part of speech tag and language. If part of speech tag is not
     specified, all synsets of all parts of speech will be used.
@@ -142,7 +143,7 @@ def all_lemma_names(pos: Optional[str] = None, lang: str = "tha"):
     return wordnet.all_lemma_names(pos=pos, lang=lang)
 
 
-def all_synsets(pos: Optional[str] = None):
+def all_synsets(pos: Optional[str] = None) -> Iterable[wordnet.Synset]:
     """This function iterates over all synsets constrained by the given
     part of speech tag.
 
@@ -172,7 +173,7 @@ def all_synsets(pos: Optional[str] = None):
     return wordnet.all_synsets(pos=pos)
 
 
-def langs():
+def langs() -> list[str]:
     """This function returns a set of ISO-639 language codes.
 
     :return: ISO-639 language codes
@@ -190,7 +191,7 @@ def langs():
     return wordnet.langs()
 
 
-def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha"):
+def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha") -> list[wordnet.Lemma]:
     """This function returns all lemmas given the word with an optional
     argument to constrain the part of speech of the word.
 
@@ -233,7 +234,7 @@ def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha"):
     return wordnet.lemmas(word, pos=pos, lang=lang)
 
 
-def lemma(name_synsets):
+def lemma(name_synsets: str) -> wordnet.Lemma:
     """This function returns lemma object given the name.
 
     .. note::
@@ -260,7 +261,7 @@ def lemma(name_synsets):
     return wordnet.lemma(name_synsets)
 
 
-def lemma_from_key(key):
+def lemma_from_key(key: str) -> wordnet.Lemma:
     """This function returns lemma object given the lemma key.
     This is similar to :func:`lemma` but it needs to be given the key
     of lemma instead of the name of lemma.
@@ -286,7 +287,7 @@ def lemma_from_key(key):
     return wordnet.lemma_from_key(key)
 
 
-def path_similarity(synsets1, synsets2):
+def path_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
     """This function returns similarity between two synsets based on the
     shortest path distance calculated using the equation below.
 
@@ -325,7 +326,7 @@ def path_similarity(synsets1, synsets2):
     return wordnet.path_similarity(synsets1, synsets2)
 
 
-def lch_similarity(synsets1, synsets2):
+def lch_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
     """This function returns Leacock Chodorow similarity (LCH)
     between two synsets, based on the shortest path distance
     and the maximum depth of the taxonomy. The equation to
@@ -362,7 +363,7 @@ def lch_similarity(synsets1, synsets2):
     return wordnet.lch_similarity(synsets1, synsets2)
 
 
-def wup_similarity(synsets1, synsets2):
+def wup_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
     """This function returns Wu-Palmer similarity (WUP) between two synsets,
     based on the depth of the two senses in the taxonomy and their
     Least Common Subsumer (most specific ancestor node).
@@ -393,7 +394,7 @@ def wup_similarity(synsets1, synsets2):
     return wordnet.wup_similarity(synsets1, synsets2)
 
 
-def morphy(form, pos: Optional[str] = None):
+def morphy(form: str, pos: Optional[str] = None) -> str:
     """This function finds a possible base form for the given form,
     with the given part of speech.
 
@@ -423,7 +424,7 @@ def morphy(form, pos: Optional[str] = None):
     return wordnet.morphy(form, pos=None)
 
 
-def custom_lemmas(tab_file, lang: str):
+def custom_lemmas(tab_file, lang: str) -> None:
     """This function reads a custom tab file
     (see: http://compling.hss.ntu.edu.sg/omw/)
     containing mappings of lemmas in the given language.
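These wrappers delegate to `nltk.corpus.wordnet`, so the new return annotations simply mirror what NLTK hands back. A usage sketch, assuming the NLTK `wordnet` and `omw-1.4` data packages are already downloaded:

```python
# Assumes: import nltk; nltk.download("wordnet"); nltk.download("omw-1.4")
from pythainlp.corpus.wordnet import path_similarity, synsets

dog = synsets("หมา", lang="tha")   # list[wordnet.Synset], per the new annotation
cat = synsets("แมว", lang="tha")

if dog and cat:
    # path_similarity now advertises a float return, matching the diff above.
    print(path_similarity(dog[0], cat[0]))
```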
pythainlp/tag/_tag_perceptron.py

Lines changed: 4 additions & 4 deletions
@@ -47,7 +47,7 @@ def __init__(self) -> None:
         # Number of instances seen
         self.i = 0
 
-    def predict(self, features: dict):
+    def predict(self, features: dict[str, float]) -> str:
         """Dot-product the features and current weights and return the best
         label.
         """
@@ -61,10 +61,10 @@ def predict(self, features: dict):
         # Do a secondary alphabetic sort, for stability
         return max(self.classes, key=lambda label: (scores[label], label))
 
-    def update(self, truth, guess, features: dict) -> None:
+    def update(self, truth: str, guess: str, features: dict[str, float]) -> None:
         """Update the feature weights."""
 
-        def upd_feat(c, f, w, v):
+        def upd_feat(c: str, f: str, w: float, v: float) -> None:
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
@@ -236,7 +236,7 @@ def _get_features(
        trained.
        """
 
-        def add(name: str, *args):
+        def add(name: str, *args: str) -> None:
            features[" ".join((name,) + tuple(args))] += 1
 
        i += len(self.START)
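The newly typed `predict` is the standard averaged-perceptron scoring step: accumulate each active feature's per-class weight, then pick the top class with an alphabetical tie-break for stability. A self-contained sketch of that step with toy weights (not the trained model):

```python
from collections import defaultdict


def predict(features: dict[str, float], weights: dict[str, dict[str, float]], classes: set[str]) -> str:
    """Score each class as the dot product of feature values and weights."""
    scores: defaultdict[str, float] = defaultdict(float)
    for feat, value in features.items():
        if feat not in weights or value == 0:
            continue
        for label, weight in weights[feat].items():
            scores[label] += value * weight
    # Secondary alphabetic sort for stable tie-breaking, as in the diff above.
    return max(classes, key=lambda label: (scores[label], label))


print(predict({"word=กิน": 1.0}, {"word=กิน": {"VERB": 2.0, "NOUN": 0.5}}, {"VERB", "NOUN"}))  # VERB
```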
pythainlp/tag/crfchunk.py

Lines changed: 13 additions & 6 deletions
@@ -3,7 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import types
 from importlib.resources import as_file, files
+from typing import Optional, Union
 
 from pycrfsuite import Tagger as CRFTagger
 
@@ -14,7 +16,7 @@ def _is_stopword(word: str) -> bool: # check Thai stopword
     return word in thai_stopwords()
 
 
-def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict:
+def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict[str, Union[str, bool]]:
     """`tokens` = a POS-tagged sentence [(w1, t1), ...]
     `index` = the index of the token we want to extract features for
     """
@@ -52,7 +54,7 @@ def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict:
     return f
 
 
-def extract_features(doc):
+def extract_features(doc: list[tuple[str, str]]) -> list[dict[str, Union[str, bool]]]:
     return [_doc2features(doc, i) for i in range(0, len(doc))]
 
 
@@ -74,7 +76,7 @@ def __init__(self, corpus: str = "orchidpp"):
         self._model_file_ctx = None
         self.load_model(self.corpus)
 
-    def load_model(self, corpus: str):
+    def load_model(self, corpus: str) -> None:
         self.tagger = CRFTagger()
         if corpus == "orchidpp":
             corpus_files = files("pythainlp.corpus")
@@ -87,11 +89,16 @@ def parse(self, token_pos: list[tuple[str, str]]) -> list[str]:
         self.xseq = extract_features(token_pos)
         return self.tagger.tag(self.xseq)
 
-    def __enter__(self):
+    def __enter__(self) -> CRFchunk:
         """Context manager entry."""
         return self
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
+    def __exit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType]
+    ) -> bool:
         """Context manager exit - clean up resources."""
         if self._model_file_ctx is not None:
             try:
@@ -101,7 +108,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
                 pass
         return False
 
-    def __del__(self):
+    def __del__(self) -> None:
         """Clean up the context manager when object is destroyed.
 
         Note: __del__ is not guaranteed to be called and should not be
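The expanded `__exit__` signature is just the standard typing for the context-manager protocol. A minimal stand-alone illustration of the same pattern (unrelated to the CRF model itself):

```python
from __future__ import annotations

import types
from typing import Optional


class ManagedResource:
    def __enter__(self) -> ManagedResource:
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> bool:
        # Returning False propagates any exception, as CRFchunk.__exit__ does.
        return False


with ManagedResource():
    pass
```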
pythainlp/tag/thainer.py

Lines changed: 1 addition & 1 deletion
@@ -201,5 +201,5 @@ def get_ner(
         return sent_ner
 
     @staticmethod
-    def __extract_features(doc):
+    def __extract_features(doc: list[str]) -> list[dict[str, str | bool]]:
         return [_doc2features(doc, i) for i in range(len(doc))]
pythainlp/tag/wangchanberta_onnx.py

Lines changed: 7 additions & 6 deletions
@@ -4,6 +4,7 @@
 from __future__ import annotations
 
 import json
+from typing import Union
 
 import numpy as np
 
@@ -54,7 +55,7 @@ def __init__(
             self._json = json.load(fh)
         self.id2tag = self._json["id2label"]
 
-    def build_tokenizer(self, sent):
+    def build_tokenizer(self, sent: str) -> dict[str, np.ndarray]:
         _t = [5] + [i + 4 for i in self.sp.encode(sent)] + [6]
         model_inputs = {}
         model_inputs["input_ids"] = np.array([_t], dtype=np.int64)
@@ -63,17 +64,17 @@ def build_tokenizer(self, sent):
         )
         return model_inputs
 
-    def postprocess(self, logits_data):
+    def postprocess(self, logits_data: np.ndarray) -> np.ndarray:
         logits_t = logits_data[0]
         maxes = np.max(logits_t, axis=-1, keepdims=True)
         shifted_exp = np.exp(logits_t - maxes)
         scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
         return scores
 
-    def clean_output(self, list_text):
+    def clean_output(self, list_text: list[tuple[str, str]]) -> list[tuple[str, str]]:
         return list_text
 
-    def totag(self, post, sent):
+    def totag(self, post: np.ndarray, sent: str) -> list[tuple[str, str]]:
         tag = []
         _s = self.sp.EncodeAsPieces(sent)
         for i in range(len(_s)):
@@ -87,10 +88,10 @@ def totag(self, post, sent):
             )
         return tag
 
-    def _config(self, list_ner):
+    def _config(self, list_ner: list[tuple[str, str]]) -> list[tuple[str, str]]:
         return list_ner
 
-    def get_ner(self, text: str, tag: bool = False):
+    def get_ner(self, text: str, tag: bool = False) -> Union[str, list[tuple[str, str]]]:
         self._s = self.build_tokenizer(text)
         logits = self.session.run(
             output_names=[self.outputs_name], input_feed=self._s
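`postprocess` above is a numerically stable softmax over the logits returned by the ONNX session: subtracting the per-row maximum before exponentiating avoids overflow for large logits. The same computation in isolation:

```python
import numpy as np


def softmax_stable(logits: np.ndarray) -> np.ndarray:
    """Row-wise softmax, shifted by the row max for numerical stability."""
    maxes = np.max(logits, axis=-1, keepdims=True)
    shifted_exp = np.exp(logits - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)


scores = softmax_stable(np.array([[2.0, 1.0, 0.1]]))
print(scores, scores.sum())  # each row sums to 1.0
```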
pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -29,13 +29,13 @@
 
 
 @lru_cache
-def word_dict_trie():
+def word_dict_trie() -> Trie:
     """Lazy load default word dict trie with cache"""
     return Trie(thai_words())
 
 
 @lru_cache
-def syllable_dict_trie():
+def syllable_dict_trie() -> Trie:
     """Lazy load default syllable dict trie with cache"""
     return Trie(thai_syllables())
 
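`@lru_cache` on a zero-argument function is a convenient lazy singleton: the `Trie` is built on the first call and served from the cache afterwards. A generic sketch of the pattern (a plain frozenset stands in for the trie):

```python
from functools import lru_cache


@lru_cache
def word_set() -> frozenset[str]:
    """Built once on the first call, then reused from the cache."""
    print("loading dictionary ...")  # printed only once
    return frozenset({"ไทย", "ภาษา"})


word_set()  # builds and caches
word_set()  # cache hit; no second "loading" message
```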
pythainlp/tokenize/core.py

Lines changed: 2 additions & 2 deletions
@@ -338,7 +338,7 @@ def word_tokenize(
     return segments
 
 
-def indices_words(words):
+def indices_words(words: list[str]) -> list[tuple[int, int]]:
     """Convert a list of words to a list of character index pairs.
 
     This function takes a list of words and returns the start and end
@@ -369,7 +369,7 @@ def indices_words(words):
     return indices
 
 
-def map_indices_to_words(index_list, sentences):
+def map_indices_to_words(index_list: list[tuple[int, int]], sentences: list[str]) -> list[list[str]]:
     """Map character index pairs to actual words from sentences.
 
     This function takes a list of character index pairs and a list of
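A rough illustration of what the new `indices_words` annotation describes, assuming half-open `(start, end)` pairs as in Python slicing; the real implementation may use a different convention:

```python
def indices_words_sketch(words: list[str]) -> list[tuple[int, int]]:
    # Hypothetical re-implementation for illustration only.
    indices = []
    start = 0
    for word in words:
        end = start + len(word)
        indices.append((start, end))
        start = end
    return indices


print(indices_words_sketch(["ผม", "รัก", "คุณ"]))  # [(0, 2), (2, 5), (5, 8)]
```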