8 changes: 4 additions & 4 deletions pythainlp/corpus/common.py
@@ -62,9 +62,9 @@

_THAI_ORST_WORDS: frozenset[str] = frozenset()

_THAI_DICT: dict[str, list] = {}
_THAI_WSD_DICT: dict[str, list] = {}
_THAI_SYNONYMS: dict[str, list] = {}
_THAI_DICT: dict[str, list[str]] = {}
_THAI_WSD_DICT: dict[str, list[str]] = {}
_THAI_SYNONYMS: dict[str, list[str]] = {}


def countries() -> frozenset[str]:
@@ -83,7 +83,7 @@ def countries() -> frozenset[str]:
return _THAI_COUNTRIES


def provinces(details: bool = False) -> Union[frozenset[str], list[dict]]:
def provinces(details: bool = False) -> Union[frozenset[str], list[dict[str, str]]]:
"""Return a frozenset of Thailand province names in Thai such as "กระบี่",
"กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
\n(See: `dev/pythainlp/corpus/thailand_provinces_th.txt\
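Below is a minimal usage sketch of the two return shapes the tightened `provinces` annotation describes; the specific keys inside the `details=True` dicts depend on the corpus file and are not shown in this diff, so the example only lists them.

```python
# Minimal sketch of the two return shapes annotated above; the dict keys
# printed at the end come from the corpus file and are not assumed here.
from pythainlp.corpus import provinces

names = provinces()                # frozenset[str] of Thai province names
details = provinces(details=True)  # list[dict[str, str]], one dict per province

print("กรุงเทพมหานคร" in names)    # True
if details:
    print(sorted(details[0].keys()))
```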
13 changes: 7 additions & 6 deletions pythainlp/corpus/core.py
@@ -11,6 +11,7 @@
import sys
import tarfile
import zipfile
from http.client import HTTPResponse
from importlib.resources import files
from typing import Optional

@@ -29,20 +30,20 @@
class _ResponseWrapper:
"""Wrapper to provide requests.Response-like interface for urllib response."""

def __init__(self, response):
def __init__(self, response: HTTPResponse) -> None:
self.status_code = response.status
self.headers = response.headers
self._content = response.read()

def json(self):
def json(self) -> dict:
"""Parse JSON content from response."""
try:
return json.loads(self._content.decode("utf-8"))
except (json.JSONDecodeError, UnicodeDecodeError) as err:
raise ValueError(f"Failed to parse JSON response: {err}")


def get_corpus_db(url: str):
def get_corpus_db(url: str) -> Optional[_ResponseWrapper]:
"""Get corpus catalog from server.

:param str url: URL corpus catalog
@@ -69,7 +70,7 @@ def get_corpus_db(url: str):
return corpus_db


def get_corpus_db_detail(name: str, version: str = "") -> dict:
def get_corpus_db_detail(name: str, version: str = "") -> dict[str, str]:
"""Get details about a corpus, using information from local catalog.

:param str name: name of corpus
@@ -172,7 +173,7 @@ def get_corpus(filename: str, comments: bool = True) -> frozenset:
return frozenset(filter(None, lines))


def get_corpus_as_is(filename: str) -> list:
def get_corpus_as_is(filename: str) -> list[str]:
"""Read corpus data from file, as it is, and return a list.

Each line in the file will be a member of the list.
@@ -749,7 +750,7 @@ def remove(name: str) -> bool:
return False


def get_path_folder_corpus(name, version, *path):
def get_path_folder_corpus(name: str, version: str, *path: str) -> str:
return os.path.join(get_corpus_path(name, version), *path)


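The wrapper above exposes only the slice of the `requests.Response` interface that the corpus code relies on: `status_code`, `headers`, and `json()`. A self-contained sketch of the same idea, with a hypothetical `fetch_json` standing in for the fuller `get_corpus_db`:

```python
# Standalone sketch of the requests.Response-like wrapper typed in this diff.
# fetch_json is a stand-in for get_corpus_db and returns None on failure.
import json
from http.client import HTTPResponse
from typing import Any, Optional
from urllib.request import urlopen


class ResponseWrapper:
    def __init__(self, response: HTTPResponse) -> None:
        self.status_code = response.status  # HTTP status as an int
        self.headers = response.headers     # mapping-like header object
        self._content = response.read()     # body read eagerly, as bytes

    def json(self) -> dict[str, Any]:
        """Parse the buffered body as UTF-8 JSON."""
        try:
            return json.loads(self._content.decode("utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            raise ValueError(f"Failed to parse JSON response: {err}") from err


def fetch_json(url: str) -> Optional[ResponseWrapper]:
    try:
        with urlopen(url) as resp:
            return ResponseWrapper(resp)
    except OSError:
        return None
```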
27 changes: 14 additions & 13 deletions pythainlp/corpus/wordnet.py
@@ -12,6 +12,7 @@

from __future__ import annotations

from collections.abc import Iterable
from typing import Optional

import nltk
@@ -29,7 +30,7 @@
from nltk.corpus import wordnet


def synsets(word: str, pos: Optional[str] = None, lang: str = "tha"):
def synsets(word: str, pos: Optional[str] = None, lang: str = "tha") -> list[wordnet.Synset]:
"""This function returns the synonym set for all lemmas of the given word
with an optional argument to constrain the part of speech of the word.

@@ -76,7 +77,7 @@ def synsets(word: str, pos: Optional[str] = None, lang: str = "tha"):
return wordnet.synsets(lemma=word, pos=pos, lang=lang)


def synset(name_synsets):
def synset(name_synsets: str) -> wordnet.Synset:
"""This function returns the synonym set (synset) given the name of the synset
(i.e. 'dog.n.01', 'chase.v.01').

@@ -100,7 +101,7 @@ def synset(name_synsets):
return wordnet.synset(name_synsets)


def all_lemma_names(pos: Optional[str] = None, lang: str = "tha"):
def all_lemma_names(pos: Optional[str] = None, lang: str = "tha") -> list[str]:
"""This function returns all lemma names for all synsets of the given
part of speech tag and language. If part of speech tag is not
specified, all synsets of all parts of speech will be used.
@@ -142,7 +143,7 @@ def all_lemma_names(pos: Optional[str] = None, lang: str = "tha"):
return wordnet.all_lemma_names(pos=pos, lang=lang)


def all_synsets(pos: Optional[str] = None):
def all_synsets(pos: Optional[str] = None) -> Iterable[wordnet.Synset]:
"""This function iterates over all synsets constrained by the given
part of speech tag.

@@ -172,7 +173,7 @@ def all_synsets(pos: Optional[str] = None):
return wordnet.all_synsets(pos=pos)


def langs():
def langs() -> list[str]:
"""This function returns a set of ISO-639 language codes.

:return: ISO-639 language codes
@@ -190,7 +191,7 @@ def langs():
return wordnet.langs()


def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha"):
def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha") -> list[wordnet.Lemma]:
"""This function returns all lemmas given the word with an optional
argument to constrain the part of speech of the word.

@@ -233,7 +234,7 @@ def lemmas(word: str, pos: Optional[str] = None, lang: str = "tha"):
return wordnet.lemmas(word, pos=pos, lang=lang)


def lemma(name_synsets):
def lemma(name_synsets: str) -> wordnet.Lemma:
"""This function returns lemma object given the name.

.. note::
Expand All @@ -260,7 +261,7 @@ def lemma(name_synsets):
return wordnet.lemma(name_synsets)


def lemma_from_key(key):
def lemma_from_key(key: str) -> wordnet.Lemma:
"""This function returns lemma object given the lemma key.
This is similar to :func:`lemma` but it needs to be given the key
of lemma instead of the name of lemma.
@@ -286,7 +287,7 @@ def lemma_from_key(key):
return wordnet.lemma_from_key(key)


def path_similarity(synsets1, synsets2):
def path_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
"""This function returns similarity between two synsets based on the
shortest path distance calculated using the equation below.

@@ -325,7 +326,7 @@ def path_similarity(synsets1, synsets2):
return wordnet.path_similarity(synsets1, synsets2)


def lch_similarity(synsets1, synsets2):
def lch_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
"""This function returns Leacock Chodorow similarity (LCH)
between two synsets, based on the shortest path distance
and the maximum depth of the taxonomy. The equation to
@@ -362,7 +363,7 @@ def lch_similarity(synsets1, synsets2):
return wordnet.lch_similarity(synsets1, synsets2)


def wup_similarity(synsets1, synsets2):
def wup_similarity(synsets1: wordnet.Synset, synsets2: wordnet.Synset) -> float:
"""This function returns Wu-Palmer similarity (WUP) between two synsets,
based on the depth of the two senses in the taxonomy and their
Least Common Subsumer (most specific ancestor node).
@@ -393,7 +394,7 @@ def wup_similarity(synsets1, synsets2):
return wordnet.wup_similarity(synsets1, synsets2)


def morphy(form, pos: Optional[str] = None):
def morphy(form: str, pos: Optional[str] = None) -> str:
"""This function finds a possible base form for the given form,
with the given part of speech.

@@ -423,7 +424,7 @@ def morphy(form, pos: Optional[str] = None):
return wordnet.morphy(form, pos=None)


def custom_lemmas(tab_file, lang: str):
def custom_lemmas(tab_file, lang: str) -> None:
"""This function reads a custom tab file
(see: http://compling.hss.ntu.edu.sg/omw/)
containing mappings of lemmas in the given language.
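A hedged usage sketch of the wrappers annotated above. It assumes the NLTK WordNet data and the Open Multilingual Wordnet (Thai) are already available; if not, they need to be fetched through NLTK first.

```python
# Usage sketch for the annotated wordnet wrappers; assumes the NLTK
# "wordnet" and "omw-1.4" corpora are already installed.
from pythainlp.corpus import wordnet

syns = wordnet.synsets("บ้าน")        # list of Synset objects ("house")
if syns:
    first = syns[0]
    print(first.name())                # e.g. 'house.n.01'
    print(wordnet.lemmas("บ้าน")[:3])  # matches the list[wordnet.Lemma] annotation
    print(wordnet.path_similarity(first, wordnet.synset("home.n.01")))
```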
8 changes: 4 additions & 4 deletions pythainlp/tag/_tag_perceptron.py
@@ -47,7 +47,7 @@ def __init__(self) -> None:
# Number of instances seen
self.i = 0

def predict(self, features: dict):
def predict(self, features: dict[str, float]) -> str:
"""Dot-product the features and current weights and return the best
label.
"""
@@ -61,10 +61,10 @@ def predict(self, features: dict):
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda label: (scores[label], label))

def update(self, truth, guess, features: dict) -> None:
def update(self, truth: str, guess: str, features: dict[str, float]) -> None:
"""Update the feature weights."""

def upd_feat(c, f, w, v):
def upd_feat(c: str, f: str, w: float, v: float) -> None:
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
@@ -236,7 +236,7 @@ def _get_features(
trained.
"""

def add(name: str, *args):
def add(name: str, *args: str) -> None:
features[" ".join((name,) + tuple(args))] += 1

i += len(self.START)
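The `predict`/`update` pair annotated above is the core of an averaged-perceptron tagger: score each class as a dot product of feature values and per-class weights, then shift weight toward the true label and away from a wrong guess. A stripped-down, self-contained sketch of that idea (the real class additionally keeps `_totals` and `_tstamps` so it can average weights over training):

```python
# Minimal sketch of the perceptron scoring/update logic typed in this diff.
from collections import defaultdict


class TinyPerceptron:
    def __init__(self, classes: set[str]) -> None:
        self.classes = classes
        # weights[feature][label] -> float
        self.weights: dict[str, dict[str, float]] = defaultdict(dict)

    def predict(self, features: dict[str, float]) -> str:
        scores: dict[str, float] = defaultdict(float)
        for feat, value in features.items():
            if value == 0 or feat not in self.weights:
                continue
            for label, weight in self.weights[feat].items():
                scores[label] += value * weight
        # Secondary sort on the label itself keeps ties deterministic.
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth: str, guess: str, features: dict[str, float]) -> None:
        if truth == guess:
            return
        for feat in features:
            w = self.weights[feat]
            w[truth] = w.get(truth, 0.0) + 1.0   # reward the correct label
            w[guess] = w.get(guess, 0.0) - 1.0   # penalise the wrong guess
```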
19 changes: 13 additions & 6 deletions pythainlp/tag/crfchunk.py
@@ -3,7 +3,9 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import types
from importlib.resources import as_file, files
from typing import Optional, Union

from pycrfsuite import Tagger as CRFTagger

@@ -14,7 +16,7 @@ def _is_stopword(word: str) -> bool: # check Thai stopword
return word in thai_stopwords()


def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict:
def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict[str, Union[str, bool]]:
"""`tokens` = a POS-tagged sentence [(w1, t1), ...]
`index` = the index of the token we want to extract features for
"""
@@ -52,7 +54,7 @@ def _doc2features(tokens: list[tuple[str, str]], index: int) -> dict:
return f


def extract_features(doc):
def extract_features(doc: list[tuple[str, str]]) -> list[dict[str, Union[str, bool]]]:
return [_doc2features(doc, i) for i in range(0, len(doc))]


@@ -74,7 +76,7 @@ def __init__(self, corpus: str = "orchidpp"):
self._model_file_ctx = None
self.load_model(self.corpus)

def load_model(self, corpus: str):
def load_model(self, corpus: str) -> None:
self.tagger = CRFTagger()
if corpus == "orchidpp":
corpus_files = files("pythainlp.corpus")
@@ -87,11 +89,16 @@ def parse(self, token_pos: list[tuple[str, str]]) -> list[str]:
self.xseq = extract_features(token_pos)
return self.tagger.tag(self.xseq)

def __enter__(self):
def __enter__(self) -> CRFchunk:
"""Context manager entry."""
return self

def __exit__(self, exc_type, exc_val, exc_tb):
def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_val: Optional[BaseException],
exc_tb: Optional[types.TracebackType]
) -> bool:
"""Context manager exit - clean up resources."""
if self._model_file_ctx is not None:
try:
@@ -101,7 +108,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
pass
return False

def __del__(self):
def __del__(self) -> None:
"""Clean up the context manager when object is destroyed.

Note: __del__ is not guaranteed to be called and should not be
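The `__exit__` annotation above follows the standard context-manager protocol. A minimal sketch of the same pattern, independent of the CRF model files, showing why all three arguments are `Optional` (they are `None` when the block exits cleanly) and why returning `False` lets any in-flight exception propagate:

```python
# Minimal sketch of the typed context-manager protocol used by CRFchunk.
from __future__ import annotations

import types
from typing import Optional


class ManagedResource:
    def __enter__(self) -> "ManagedResource":
        print("acquire")          # stands in for loading a model file
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> bool:
        print("release")          # stands in for closing the as_file() context
        return False              # False: do not swallow any in-flight exception


with ManagedResource() as res:
    pass                          # resource is released on exit, even on error
```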
2 changes: 1 addition & 1 deletion pythainlp/tag/thainer.py
@@ -201,5 +201,5 @@ def get_ner(
return sent_ner

@staticmethod
def __extract_features(doc):
def __extract_features(doc: list[str]) -> list[dict[str, str | bool]]:
return [_doc2features(doc, i) for i in range(len(doc))]
13 changes: 7 additions & 6 deletions pythainlp/tag/wangchanberta_onnx.py
@@ -4,6 +4,7 @@
from __future__ import annotations

import json
from typing import Union

import numpy as np

@@ -54,7 +55,7 @@ def __init__(
self._json = json.load(fh)
self.id2tag = self._json["id2label"]

def build_tokenizer(self, sent):
def build_tokenizer(self, sent: str) -> dict[str, np.ndarray]:
_t = [5] + [i + 4 for i in self.sp.encode(sent)] + [6]
model_inputs = {}
model_inputs["input_ids"] = np.array([_t], dtype=np.int64)
@@ -63,17 +64,17 @@
)
return model_inputs

def postprocess(self, logits_data):
def postprocess(self, logits_data: np.ndarray) -> np.ndarray:
logits_t = logits_data[0]
maxes = np.max(logits_t, axis=-1, keepdims=True)
shifted_exp = np.exp(logits_t - maxes)
scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
return scores

def clean_output(self, list_text):
def clean_output(self, list_text: list[tuple[str, str]]) -> list[tuple[str, str]]:
return list_text

def totag(self, post, sent):
def totag(self, post: np.ndarray, sent: str) -> list[tuple[str, str]]:
tag = []
_s = self.sp.EncodeAsPieces(sent)
for i in range(len(_s)):
@@ -87,10 +88,10 @@ def totag(self, post, sent):
)
return tag

def _config(self, list_ner):
def _config(self, list_ner: list[tuple[str, str]]) -> list[tuple[str, str]]:
return list_ner

def get_ner(self, text: str, tag: bool = False):
def get_ner(self, text: str, tag: bool = False) -> Union[str, list[tuple[str, str]]]:
self._s = self.build_tokenizer(text)
logits = self.session.run(
output_names=[self.outputs_name], input_feed=self._s
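`postprocess` above is a numerically stable softmax over the logits: subtracting the per-row maximum before exponentiating avoids overflow without changing the result. A tiny standalone check of that identity:

```python
# Standalone check that the max-shifted softmax in postprocess() stays
# finite for large logits and still produces rows that sum to 1.
import numpy as np


def softmax_stable(logits: np.ndarray) -> np.ndarray:
    maxes = np.max(logits, axis=-1, keepdims=True)
    shifted_exp = np.exp(logits - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)


logits = np.array([[1000.0, 1001.0, 1002.0]])  # naive exp() would overflow
scores = softmax_stable(logits)
print(scores)                                  # ~[[0.090, 0.245, 0.665]]
print(np.isclose(scores.sum(), 1.0))           # True
```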
4 changes: 2 additions & 2 deletions pythainlp/tokenize/__init__.py
@@ -29,13 +29,13 @@


@lru_cache
def word_dict_trie():
def word_dict_trie() -> Trie:
"""Lazy load default word dict trie with cache"""
return Trie(thai_words())


@lru_cache
def syllable_dict_trie():
def syllable_dict_trie() -> Trie:
"""Lazy load default syllable dict trie with cache"""
return Trie(thai_syllables())

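Both helpers above use `functools.lru_cache` to defer building the default tries until first use and then reuse the same instance on every later call. A tiny sketch of that pattern with a stand-in loader (`build_index` here plays the role of `Trie(thai_words())`):

```python
# Sketch of the lazy, cached loader pattern used for the default tries.
from functools import lru_cache


@lru_cache
def build_index() -> frozenset[str]:
    print("building index once")  # runs only on the first call
    return frozenset({"กิน", "ข้าว", "น้ำ"})


build_index()   # prints, builds
build_index()   # cache hit, no rebuild
```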
4 changes: 2 additions & 2 deletions pythainlp/tokenize/core.py
@@ -338,7 +338,7 @@ def word_tokenize(
return segments


def indices_words(words):
def indices_words(words: list[str]) -> list[tuple[int, int]]:
"""Convert a list of words to a list of character index pairs.

This function takes a list of words and returns the start and end
@@ -369,7 +369,7 @@ def map_indices_to_words(index_list, sentences):
return indices


def map_indices_to_words(index_list, sentences):
def map_indices_to_words(index_list: list[tuple[int, int]], sentences: list[str]) -> list[list[str]]:
"""Map character index pairs to actual words from sentences.

This function takes a list of character index pairs and a list of
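The pairing of `indices_words` and `map_indices_to_words` is a round trip between token lists and character spans. A minimal sketch of the indexing half, under the assumption that the pairs are inclusive on both ends (the real implementation's exact convention should be checked against its docstring):

```python
# Minimal sketch of turning a word list into character spans; spans here are
# inclusive (start, end) pairs, which is an assumption about the convention.
def indices_words(words: list[str]) -> list[tuple[int, int]]:
    indices = []
    start = 0
    for word in words:
        end = start + len(word) - 1
        indices.append((start, end))
        start = end + 1
    return indices


print(indices_words(["ผม", "รัก", "คุณ"]))   # [(0, 1), (2, 4), (5, 7)]
```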