1 change: 0 additions & 1 deletion pyproject.toml
@@ -246,7 +246,6 @@ issues = "https://github.com/PyThaiNLP/pythainlp/issues"
thainlp = "pythainlp.__main__:main"

[tool.setuptools]
zip-safe = false
include-package-data = true

[tool.setuptools.packages.find]
24 changes: 13 additions & 11 deletions pythainlp/corpus/core.py
@@ -10,6 +10,7 @@
import os
import re
import sys
from importlib.resources import files

from pythainlp import __version__
from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
@@ -153,10 +154,10 @@ def get_corpus(filename: str, comments: bool = True) -> frozenset:
# ...})

"""
path = path_pythainlp_corpus(filename)
lines = []
with open(path, encoding="utf-8-sig") as fh:
lines = fh.read().splitlines()
corpus_files = files("pythainlp.corpus")
corpus_file = corpus_files.joinpath(filename)
text = corpus_file.read_text(encoding="utf-8-sig")
lines = text.splitlines()

if not comments:
# if the line has a '#' character, take only text before the first '#'
@@ -192,10 +193,10 @@ def get_corpus_as_is(filename: str) -> list:
# output:
# ['แต่', 'ไม่']
"""
path = path_pythainlp_corpus(filename)
lines = []
with open(path, encoding="utf-8-sig") as fh:
lines = fh.read().splitlines()
corpus_files = files("pythainlp.corpus")
corpus_file = corpus_files.joinpath(filename)
text = corpus_file.read_text(encoding="utf-8-sig")
lines = text.splitlines()

return lines

@@ -211,9 +212,10 @@ def get_corpus_default_db(name: str, version: str = "") -> str | None:
If you want to edit default_db.json, \
you can edit pythainlp/corpus/default_db.json
"""
default_db_path = path_pythainlp_corpus("default_db.json")
with open(default_db_path, encoding="utf-8-sig") as fh:
corpus_db = json.load(fh)
corpus_files = files("pythainlp.corpus")
default_db_file = corpus_files.joinpath("default_db.json")
text = default_db_file.read_text(encoding="utf-8-sig")
corpus_db = json.loads(text)

if name in corpus_db:
if version in corpus_db[name]["versions"]:
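For reference, a minimal standalone sketch of the resource-reading idiom these hunks adopt; it assumes only the stdlib importlib.resources API and one of the bundled files named above (default_db.json):

    from importlib.resources import files

    # Read a text file that ships inside an installed package, without
    # assuming the package is unpacked as plain files on disk.
    db_text = (
        files("pythainlp.corpus")
        .joinpath("default_db.json")
        .read_text(encoding="utf-8-sig")
    )
    print(len(db_text.splitlines()))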
46 changes: 24 additions & 22 deletions pythainlp/corpus/th_en_translit.py
@@ -17,8 +17,7 @@
]

from collections import defaultdict

from pythainlp.corpus import path_pythainlp_corpus
from importlib.resources import files

_FILE_NAME = "th_en_transliteration_v1.4.tsv"
TRANSLITERATE_EN = "en"
@@ -30,8 +29,10 @@ def get_transliteration_dict() -> defaultdict:

The returned dict is in dict[str, dict[List[str], List[Optional[bool]]]] format.
"""
path = path_pythainlp_corpus(_FILE_NAME)
if not path:
corpus_files = files("pythainlp.corpus")
corpus_file = corpus_files.joinpath(_FILE_NAME)

if not corpus_file.is_file():
raise FileNotFoundError(
f"Unable to load transliteration dictionary. "
f"{_FILE_NAME} is not found under pythainlp/corpus."
@@ -42,24 +43,25 @@ def get_transliteration_dict() -> defaultdict:
lambda: {TRANSLITERATE_EN: [], TRANSLITERATE_FOLLOW_RTSG: []}
)
try:
with open(path, encoding="utf-8") as f:
# assume that the first row contains column names, so skip it.
for line in f.readlines()[1:]:
stripped = line.strip()
if stripped:
th, *en_checked = stripped.split("\t")
# replace in-between whitespace to prevent mismatched results from different tokenizers.
# e.g. "บอยแบนด์"
# route 1: "บอยแบนด์" -> ["บอย", "แบนด์"] -> ["boy", "band"] -> "boyband"
# route 2: "บอยแบนด์" -> ["บอยแบนด์"] -> ["boy band"] -> "boy band"
en_translit = en_checked[0].replace(" ", "")
trans_dict[th][TRANSLITERATE_EN].append(en_translit)
en_follow_rtgs = (
bool(en_checked[1]) if len(en_checked) == 2 else None
)
trans_dict[th][TRANSLITERATE_FOLLOW_RTSG].append(
en_follow_rtgs
)
text = corpus_file.read_text(encoding="utf-8")
lines = text.splitlines()
# assume that the first row contains column names, so skip it.
for line in lines[1:]:
stripped = line.strip()
if stripped:
th, *en_checked = stripped.split("\t")
# replace in-between whitespace to prevent mismatched results from different tokenizers.
# e.g. "บอยแบนด์"
# route 1: "บอยแบนด์" -> ["บอย", "แบนด์"] -> ["boy", "band"] -> "boyband"
# route 2: "บอยแบนด์" -> ["บอยแบนด์"] -> ["boy band"] -> "boy band"
en_translit = en_checked[0].replace(" ", "")
trans_dict[th][TRANSLITERATE_EN].append(en_translit)
en_follow_rtgs = (
bool(en_checked[1]) if len(en_checked) == 2 else None
)
trans_dict[th][TRANSLITERATE_FOLLOW_RTSG].append(
en_follow_rtgs
)

except ValueError as exc:
raise ValueError(
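The is_file() check above replaces testing path_pythainlp_corpus() for a falsy return; a small hedged illustration of the same pattern in isolation (the file name here is hypothetical):

    from importlib.resources import files

    resource = files("pythainlp.corpus").joinpath("example.tsv")  # hypothetical file
    if not resource.is_file():
        raise FileNotFoundError("example.tsv is not bundled with pythainlp.corpus")
    rows = resource.read_text(encoding="utf-8").splitlines()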
61 changes: 45 additions & 16 deletions pythainlp/spell/symspellpy.py
@@ -13,36 +13,64 @@

from __future__ import annotations

import threading
from importlib.resources import as_file, files

try:
from symspellpy import SymSpell, Verbosity
except ImportError:
raise ImportError(
"Import Error; Install symspellpy by pip install symspellpy"
)

from pythainlp.corpus import get_corpus_path, path_pythainlp_corpus
from pythainlp.corpus import get_corpus_path

_UNIGRAM_FILENAME = "tnc_freq.txt"
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"

sym_spell = SymSpell()
sym_spell.load_dictionary(
path_pythainlp_corpus(_UNIGRAM_FILENAME),
0,
1,
separator="\t",
encoding="utf-8-sig",
)
sym_spell.load_bigram_dictionary(
get_corpus_path(_BIGRAM_CORPUS_NAME),
0,
2,
separator="\t",
encoding="utf-8-sig",
)
_sym_spell = None
_unigram_file_ctx = None # File context manager kept alive for program lifetime
_load_lock = threading.Lock() # Thread safety for lazy loading


def _get_sym_spell():
"""Lazy load the symspell instance.

This function uses a lock to ensure thread-safe initialization.
The context manager is kept alive for the lifetime of the program
to prevent cleanup of temporary files while SymSpell is in use.
"""
global _sym_spell, _unigram_file_ctx
if _sym_spell is None:
with _load_lock:
# Double-check pattern to avoid race conditions
if _sym_spell is None:
_sym_spell = SymSpell()
# Load unigram dictionary from bundled corpus
corpus_files = files("pythainlp.corpus")
unigram_file = corpus_files.joinpath(_UNIGRAM_FILENAME)
_unigram_file_ctx = as_file(unigram_file)
unigram_path = _unigram_file_ctx.__enter__()
_sym_spell.load_dictionary(
str(unigram_path),
0,
1,
separator="\t",
encoding="utf-8-sig",
)
# Load bigram dictionary from downloaded corpus
_sym_spell.load_bigram_dictionary(
get_corpus_path(_BIGRAM_CORPUS_NAME),
0,
2,
separator="\t",
encoding="utf-8-sig",
)
return _sym_spell


def spell(text: str, max_edit_distance: int = 2) -> list[str]:
sym_spell = _get_sym_spell()
return [
str(i).split(",", maxsplit=1)[0]
for i in list(
@@ -60,6 +88,7 @@ def correct(text: str, max_edit_distance: int = 1) -> str:
def spell_sent(
list_words: list[str], max_edit_distance: int = 2
) -> list[list[str]]:
sym_spell = _get_sym_spell()
temp = [
str(i).split(",", maxsplit=1)[0].split(" ")
for i in list(
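Unlike the corpus readers above, SymSpell needs a real filesystem path rather than file contents, hence the as_file() wrapper. A minimal sketch of that idiom with an ordinary with-block (consume_path is a stand-in, not part of the PR):

    from importlib.resources import as_file, files

    def consume_path(path: str) -> None:  # stand-in for SymSpell.load_dictionary
        with open(path, encoding="utf-8-sig") as fh:
            fh.readline()

    resource = files("pythainlp.corpus").joinpath("tnc_freq.txt")
    with as_file(resource) as real_path:
        # real_path is a pathlib.Path that exists on disk for the duration of
        # this block, even if the package is installed inside a zip archive.
        consume_path(str(real_path))

The module above instead enters the context once and never exits it, so any temporary file stays valid for as long as the cached SymSpell instance is in use.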
52 changes: 49 additions & 3 deletions pythainlp/tag/crfchunk.py
@@ -3,9 +3,11 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

from importlib.resources import as_file, files

from pycrfsuite import Tagger as CRFTagger

from pythainlp.corpus import path_pythainlp_corpus, thai_stopwords
from pythainlp.corpus import thai_stopwords


def _is_stopword(word: str) -> bool: # check Thai stopword
@@ -55,16 +57,60 @@ def extract_features(doc):


class CRFchunk:
"""CRF-based chunker for Thai text.

This class can be used as a context manager to ensure proper cleanup
of resources. Example:

with CRFchunk() as chunker:
result = chunker.parse(tokens)

Alternatively, the object will attempt to clean up resources when
garbage collected, though this is not guaranteed.
"""

def __init__(self, corpus: str = "orchidpp"):
self.corpus = corpus
self._model_file_ctx = None
self.load_model(self.corpus)

def load_model(self, corpus: str):
self.tagger = CRFTagger()
if corpus == "orchidpp":
self.path = path_pythainlp_corpus("crfchunk_orchidpp.model")
self.tagger.open(self.path)
corpus_files = files("pythainlp.corpus")
model_file = corpus_files.joinpath("crfchunk_orchidpp.model")
self._model_file_ctx = as_file(model_file)
model_path = self._model_file_ctx.__enter__()
self.tagger.open(str(model_path))

def parse(self, token_pos: list[tuple[str, str]]) -> list[str]:
self.xseq = extract_features(token_pos)
return self.tagger.tag(self.xseq)

def __enter__(self):
"""Context manager entry."""
return self

def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit - clean up resources."""
if self._model_file_ctx is not None:
try:
self._model_file_ctx.__exit__(exc_type, exc_val, exc_tb)
self._model_file_ctx = None
except Exception: # noqa: S110
pass
return False

def __del__(self):
"""Clean up the context manager when object is destroyed.

Note: __del__ is not guaranteed to be called and should not be
relied upon for critical cleanup. Use the context manager protocol
(with statement) for reliable resource management.
"""
if self._model_file_ctx is not None:
try:
self._model_file_ctx.__exit__(None, None, None)
except Exception: # noqa: S110
# Silently ignore cleanup errors during garbage collection
pass
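A hedged usage sketch of the new context-manager protocol; the token/POS pairs are invented and the exact chunk labels depend on the bundled orchidpp model:

    from pythainlp.tag.crfchunk import CRFchunk

    token_pos = [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")]  # made-up input
    with CRFchunk() as chunker:
        print(chunker.parse(token_pos))  # one chunk label per token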
30 changes: 27 additions & 3 deletions pythainlp/tokenize/han_solo.py
@@ -8,7 +8,8 @@

from __future__ import annotations

from pythainlp.corpus import path_pythainlp_corpus
import threading
from importlib.resources import as_file, files

try:
import pycrfsuite
@@ -17,8 +18,30 @@
"ImportError; Install pycrfsuite by pip install python-crfsuite"
)

tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus("han_solo.crfsuite"))
_tagger = None
_model_file_ctx = None # File context manager kept alive for program lifetime
_load_lock = threading.Lock() # Thread safety for lazy loading


def _get_tagger():
"""Lazy load the tagger model.

This function uses a lock to ensure thread-safe initialization.
The context manager is kept alive for the lifetime of the program
to prevent cleanup of temporary files while the tagger is in use.
"""
global _tagger, _model_file_ctx
if _tagger is None:
with _load_lock:
# Double-check pattern to avoid race conditions
if _tagger is None:
_tagger = pycrfsuite.Tagger()
corpus_files = files("pythainlp.corpus")
model_file = corpus_files.joinpath("han_solo.crfsuite")
_model_file_ctx = as_file(model_file)
model_path = _model_file_ctx.__enter__()
_tagger.open(str(model_path))
return _tagger


class Featurizer:
@@ -119,6 +142,7 @@ def featurize(


def segment(text: str) -> list[str]:
tagger = _get_tagger()
x = _to_feature.featurize(text)["X"]
y_pred = tagger.tag(x)
list_cut = []
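From the caller's side nothing changes; a short usage sketch (the input string is arbitrary):

    from pythainlp.tokenize.han_solo import segment

    # The first call triggers _get_tagger() and opens han_solo.crfsuite;
    # subsequent calls reuse the cached pycrfsuite tagger.
    print(segment("สวัสดีครับ"))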
35 changes: 31 additions & 4 deletions pythainlp/tokenize/nlpo3.py
@@ -3,18 +3,41 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import threading
from importlib.resources import as_file, files
from sys import stderr

from nlpo3 import load_dict as nlpo3_load_dict
from nlpo3 import segment as nlpo3_segment

from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.corpus.common import _THAI_WORDS_FILENAME

_NLPO3_DEFAULT_DICT_NAME = "_73bcj049dzbu9t49b4va170k" # supposed to be unique
_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
path_pythainlp_corpus(_THAI_WORDS_FILENAME), _NLPO3_DEFAULT_DICT_NAME
) # preload default dict, so it can be accessible by _NLPO3_DEFAULT_DICT_NAME
_NLPO3_DEFAULT_DICT = None # Will be lazily loaded
_dict_file_ctx = None # File context manager kept alive for program lifetime
_load_lock = threading.Lock() # Thread safety for lazy loading


def _ensure_default_dict_loaded():
"""Ensure the default dictionary is loaded.

This function uses a lock to ensure thread-safe initialization.
The context manager is kept alive for the lifetime of the program
to prevent cleanup of temporary files while the dictionary is in use.
"""
global _NLPO3_DEFAULT_DICT, _dict_file_ctx
if _NLPO3_DEFAULT_DICT is None:
with _load_lock:
# Double-check pattern to avoid race conditions
if _NLPO3_DEFAULT_DICT is None:
corpus_files = files("pythainlp.corpus")
dict_file = corpus_files.joinpath(_THAI_WORDS_FILENAME)
_dict_file_ctx = as_file(dict_file)
dict_path = _dict_file_ctx.__enter__()
_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
str(dict_path), _NLPO3_DEFAULT_DICT_NAME
)
return _NLPO3_DEFAULT_DICT


def load_dict(file_path: str, dict_name: str) -> bool:
@@ -64,6 +87,10 @@ def segment(
* \
https://github.com/PyThaiNLP/nlpo3
"""
# Ensure default dict is loaded if it's being used
if custom_dict == _NLPO3_DEFAULT_DICT_NAME:
_ensure_default_dict_loaded()

return nlpo3_segment(
text=text,
dict_name=custom_dict,
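As in the other modules, callers see no API change; a hedged sketch assuming nlpo3 is installed and the default custom_dict argument is used:

    from pythainlp.tokenize.nlpo3 import segment

    # Using the default dictionary name triggers _ensure_default_dict_loaded()
    # on the first call; a dictionary registered earlier via load_dict()
    # bypasses the lazy load entirely.
    print(segment("ทดสอบการตัดคำภาษาไทย"))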