1 change: 0 additions & 1 deletion pyproject.toml
@@ -246,7 +246,6 @@ issues = "https://github.com/PyThaiNLP/pythainlp/issues"
thainlp = "pythainlp.__main__:main"

[tool.setuptools]
zip-safe = false
include-package-data = true

[tool.setuptools.packages.find]
24 changes: 13 additions & 11 deletions pythainlp/corpus/core.py
@@ -10,6 +10,7 @@
import os
import re
import sys
from importlib.resources import files

from pythainlp import __version__
from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
@@ -153,10 +154,10 @@ def get_corpus(filename: str, comments: bool = True) -> frozenset:
# ...})

"""
path = path_pythainlp_corpus(filename)
lines = []
with open(path, encoding="utf-8-sig") as fh:
lines = fh.read().splitlines()
corpus_files = files("pythainlp.corpus")
corpus_file = corpus_files.joinpath(filename)
text = corpus_file.read_text(encoding="utf-8-sig")
lines = text.splitlines()

if not comments:
# if the line has a '#' character, take only text before the first '#'
@@ -192,10 +193,10 @@ def get_corpus_as_is(filename: str) -> list:
# output:
# ['แต่', 'ไม่']
"""
path = path_pythainlp_corpus(filename)
lines = []
with open(path, encoding="utf-8-sig") as fh:
lines = fh.read().splitlines()
corpus_files = files("pythainlp.corpus")
corpus_file = corpus_files.joinpath(filename)
text = corpus_file.read_text(encoding="utf-8-sig")
lines = text.splitlines()

return lines

@@ -211,9 +212,10 @@ def get_corpus_default_db(name: str, version: str = "") -> str | None:
If you want to edit default_db.json, \
you can edit pythainlp/corpus/default_db.json
"""
default_db_path = path_pythainlp_corpus("default_db.json")
with open(default_db_path, encoding="utf-8-sig") as fh:
corpus_db = json.load(fh)
corpus_files = files("pythainlp.corpus")
default_db_file = corpus_files.joinpath("default_db.json")
text = default_db_file.read_text(encoding="utf-8-sig")
corpus_db = json.loads(text)

if name in corpus_db:
if version in corpus_db[name]["versions"]:
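For reference, a minimal standalone sketch of the resource-reading idiom these hunks adopt; it assumes only the stdlib importlib.resources API and one of the bundled files named above (default_db.json):

    from importlib.resources import files

    # Read a text file that ships inside an installed package, without
    # assuming the package is unpacked as plain files on disk.
    db_text = (
        files("pythainlp.corpus")
        .joinpath("default_db.json")
        .read_text(encoding="utf-8-sig")
    )
    print(len(db_text.splitlines()))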
46 changes: 24 additions & 22 deletions pythainlp/corpus/th_en_translit.py
@@ -17,8 +17,7 @@
]

from collections import defaultdict

from pythainlp.corpus import path_pythainlp_corpus
from importlib.resources import files

_FILE_NAME = "th_en_transliteration_v1.4.tsv"
TRANSLITERATE_EN = "en"
@@ -30,8 +29,10 @@ def get_transliteration_dict() -> defaultdict:

The returned dict is in dict[str, dict[List[str], List[Optional[bool]]]] format.
"""
path = path_pythainlp_corpus(_FILE_NAME)
if not path:
corpus_files = files("pythainlp.corpus")
corpus_file = corpus_files.joinpath(_FILE_NAME)

if not corpus_file.is_file():
raise FileNotFoundError(
f"Unable to load transliteration dictionary. "
f"{_FILE_NAME} is not found under pythainlp/corpus."
@@ -42,24 +43,25 @@ def get_transliteration_dict() -> defaultdict:
lambda: {TRANSLITERATE_EN: [], TRANSLITERATE_FOLLOW_RTSG: []}
)
try:
with open(path, encoding="utf-8") as f:
# assume that the first row contains column names, so skip it.
for line in f.readlines()[1:]:
stripped = line.strip()
if stripped:
th, *en_checked = stripped.split("\t")
# replace in-between whitespace to prevent mismatched results from different tokenizers.
# e.g. "บอยแบนด์"
# route 1: "บอยแบนด์" -> ["บอย", "แบนด์"] -> ["boy", "band"] -> "boyband"
# route 2: "บอยแบนด์" -> ["บอยแบนด์"] -> ["boy band"] -> "boy band"
en_translit = en_checked[0].replace(" ", "")
trans_dict[th][TRANSLITERATE_EN].append(en_translit)
en_follow_rtgs = (
bool(en_checked[1]) if len(en_checked) == 2 else None
)
trans_dict[th][TRANSLITERATE_FOLLOW_RTSG].append(
en_follow_rtgs
)
text = corpus_file.read_text(encoding="utf-8")
lines = text.splitlines()
# assume that the first row contains column names, so skip it.
for line in lines[1:]:
stripped = line.strip()
if stripped:
th, *en_checked = stripped.split("\t")
# replace in-between whitespace to prevent mismatched results from different tokenizers.
# e.g. "บอยแบนด์"
# route 1: "บอยแบนด์" -> ["บอย", "แบนด์"] -> ["boy", "band"] -> "boyband"
# route 2: "บอยแบนด์" -> ["บอยแบนด์"] -> ["boy band"] -> "boy band"
en_translit = en_checked[0].replace(" ", "")
trans_dict[th][TRANSLITERATE_EN].append(en_translit)
en_follow_rtgs = (
bool(en_checked[1]) if len(en_checked) == 2 else None
)
trans_dict[th][TRANSLITERATE_FOLLOW_RTSG].append(
en_follow_rtgs
)

except ValueError as exc:
raise ValueError(
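The is_file() check above replaces testing path_pythainlp_corpus() for a falsy return; a small hedged illustration of the same pattern in isolation (the file name here is hypothetical):

    from importlib.resources import files

    resource = files("pythainlp.corpus").joinpath("example.tsv")  # hypothetical file
    if not resource.is_file():
        raise FileNotFoundError("example.tsv is not bundled with pythainlp.corpus")
    rows = resource.read_text(encoding="utf-8").splitlines()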
61 changes: 45 additions & 16 deletions pythainlp/spell/symspellpy.py
@@ -13,36 +13,64 @@

from __future__ import annotations

import threading
from importlib.resources import as_file, files

try:
from symspellpy import SymSpell, Verbosity
except ImportError:
raise ImportError(
"Import Error; Install symspellpy by pip install symspellpy"
)

from pythainlp.corpus import get_corpus_path, path_pythainlp_corpus
from pythainlp.corpus import get_corpus_path

_UNIGRAM_FILENAME = "tnc_freq.txt"
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"

sym_spell = SymSpell()
sym_spell.load_dictionary(
path_pythainlp_corpus(_UNIGRAM_FILENAME),
0,
1,
separator="\t",
encoding="utf-8-sig",
)
sym_spell.load_bigram_dictionary(
get_corpus_path(_BIGRAM_CORPUS_NAME),
0,
2,
separator="\t",
encoding="utf-8-sig",
)
_sym_spell = None
_unigram_file_ctx = None # File context manager kept alive for program lifetime
_load_lock = threading.Lock() # Thread safety for lazy loading


def _get_sym_spell():
"""Lazy load the symspell instance.

This function uses a lock to ensure thread-safe initialization.
The context manager is kept alive for the lifetime of the program
to prevent cleanup of temporary files while SymSpell is in use.
"""
global _sym_spell, _unigram_file_ctx
if _sym_spell is None:
with _load_lock:
# Double-check pattern to avoid race conditions
if _sym_spell is None:
_sym_spell = SymSpell()
# Load unigram dictionary from bundled corpus
corpus_files = files("pythainlp.corpus")
unigram_file = corpus_files.joinpath(_UNIGRAM_FILENAME)
_unigram_file_ctx = as_file(unigram_file)
unigram_path = _unigram_file_ctx.__enter__()
_sym_spell.load_dictionary(
str(unigram_path),
0,
1,
separator="\t",
encoding="utf-8-sig",
)
# Load bigram dictionary from downloaded corpus
_sym_spell.load_bigram_dictionary(
get_corpus_path(_BIGRAM_CORPUS_NAME),
0,
2,
separator="\t",
encoding="utf-8-sig",
)
return _sym_spell


def spell(text: str, max_edit_distance: int = 2) -> list[str]:
sym_spell = _get_sym_spell()
return [
str(i).split(",", maxsplit=1)[0]
for i in list(
@@ -60,6 +88,7 @@ def correct(text: str, max_edit_distance: int = 1) -> str:
def spell_sent(
list_words: list[str], max_edit_distance: int = 2
) -> list[list[str]]:
sym_spell = _get_sym_spell()
temp = [
str(i).split(",", maxsplit=1)[0].split(" ")
for i in list(
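Unlike the corpus readers above, SymSpell needs a real filesystem path rather than file contents, hence the as_file() wrapper. A minimal sketch of that idiom with an ordinary with-block (consume_path is a stand-in, not part of the PR):

    from importlib.resources import as_file, files

    def consume_path(path: str) -> None:  # stand-in for SymSpell.load_dictionary
        with open(path, encoding="utf-8-sig") as fh:
            fh.readline()

    resource = files("pythainlp.corpus").joinpath("tnc_freq.txt")
    with as_file(resource) as real_path:
        # real_path is a pathlib.Path that exists on disk for the duration of
        # this block, even if the package is installed inside a zip archive.
        consume_path(str(real_path))

The module above instead enters the context once and never exits it, so any temporary file stays valid for as long as the cached SymSpell instance is in use.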
52 changes: 49 additions & 3 deletions pythainlp/tag/crfchunk.py
@@ -3,9 +3,11 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

from importlib.resources import as_file, files

from pycrfsuite import Tagger as CRFTagger

from pythainlp.corpus import path_pythainlp_corpus, thai_stopwords
from pythainlp.corpus import thai_stopwords


def _is_stopword(word: str) -> bool: # check Thai stopword
@@ -55,16 +57,60 @@ def extract_features(doc):


class CRFchunk:
"""CRF-based chunker for Thai text.

This class can be used as a context manager to ensure proper cleanup
of resources. Example:

with CRFchunk() as chunker:
result = chunker.parse(tokens)

Alternatively, the object will attempt to clean up resources when
garbage collected, though this is not guaranteed.
"""

def __init__(self, corpus: str = "orchidpp"):
self.corpus = corpus
self._model_file_ctx = None
self.load_model(self.corpus)

def load_model(self, corpus: str):
self.tagger = CRFTagger()
if corpus == "orchidpp":
self.path = path_pythainlp_corpus("crfchunk_orchidpp.model")
self.tagger.open(self.path)
corpus_files = files("pythainlp.corpus")
model_file = corpus_files.joinpath("crfchunk_orchidpp.model")
self._model_file_ctx = as_file(model_file)
model_path = self._model_file_ctx.__enter__()
self.tagger.open(str(model_path))

def parse(self, token_pos: list[tuple[str, str]]) -> list[str]:
self.xseq = extract_features(token_pos)
return self.tagger.tag(self.xseq)

def __enter__(self):
"""Context manager entry."""
return self

def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit - clean up resources."""
if self._model_file_ctx is not None:
try:
self._model_file_ctx.__exit__(exc_type, exc_val, exc_tb)
self._model_file_ctx = None
except Exception: # noqa: S110
pass
return False

def __del__(self):
"""Clean up the context manager when object is destroyed.

Note: __del__ is not guaranteed to be called and should not be
relied upon for critical cleanup. Use the context manager protocol
(with statement) for reliable resource management.
"""
if self._model_file_ctx is not None:
try:
self._model_file_ctx.__exit__(None, None, None)
except Exception: # noqa: S110
# Silently ignore cleanup errors during garbage collection
pass
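A hedged usage sketch of the new context-manager protocol; the token/POS pairs are invented and the exact chunk labels depend on the bundled orchidpp model:

    from pythainlp.tag.crfchunk import CRFchunk

    token_pos = [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")]  # made-up input
    with CRFchunk() as chunker:
        print(chunker.parse(token_pos))  # one chunk label per token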
30 changes: 27 additions & 3 deletions pythainlp/tokenize/han_solo.py
@@ -8,7 +8,8 @@

from __future__ import annotations

from pythainlp.corpus import path_pythainlp_corpus
import threading
from importlib.resources import as_file, files

try:
import pycrfsuite
@@ -17,8 +18,30 @@
"ImportError; Install pycrfsuite by pip install python-crfsuite"
)

tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus("han_solo.crfsuite"))
_tagger = None
_model_file_ctx = None # File context manager kept alive for program lifetime
_load_lock = threading.Lock() # Thread safety for lazy loading


def _get_tagger():
"""Lazy load the tagger model.

This function uses a lock to ensure thread-safe initialization.
The context manager is kept alive for the lifetime of the program
to prevent cleanup of temporary files while the tagger is in use.
"""
global _tagger, _model_file_ctx
if _tagger is None:
with _load_lock:
# Double-check pattern to avoid race conditions
if _tagger is None:
_tagger = pycrfsuite.Tagger()
corpus_files = files("pythainlp.corpus")
model_file = corpus_files.joinpath("han_solo.crfsuite")
_model_file_ctx = as_file(model_file)
model_path = _model_file_ctx.__enter__()
_tagger.open(str(model_path))
return _tagger


class Featurizer:
@@ -119,6 +142,7 @@ def featurize(


def segment(text: str) -> list[str]:
tagger = _get_tagger()
x = _to_feature.featurize(text)["X"]
y_pred = tagger.tag(x)
list_cut = []
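From the caller's side nothing changes; a short usage sketch (the input string is arbitrary):

    from pythainlp.tokenize.han_solo import segment

    # The first call triggers _get_tagger() and opens han_solo.crfsuite;
    # subsequent calls reuse the cached pycrfsuite tagger.
    print(segment("สวัสดีครับ"))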
35 changes: 31 additions & 4 deletions pythainlp/tokenize/nlpo3.py
@@ -3,18 +3,41 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import threading
from importlib.resources import as_file, files
from sys import stderr

from nlpo3 import load_dict as nlpo3_load_dict
from nlpo3 import segment as nlpo3_segment

from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.corpus.common import _THAI_WORDS_FILENAME

_NLPO3_DEFAULT_DICT_NAME = "_73bcj049dzbu9t49b4va170k" # supposed to be unique
_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
path_pythainlp_corpus(_THAI_WORDS_FILENAME), _NLPO3_DEFAULT_DICT_NAME
) # preload default dict, so it can be accessible by _NLPO3_DEFAULT_DICT_NAME
_NLPO3_DEFAULT_DICT = None # Will be lazily loaded
_dict_file_ctx = None # File context manager kept alive for program lifetime
_load_lock = threading.Lock() # Thread safety for lazy loading


def _ensure_default_dict_loaded():
"""Ensure the default dictionary is loaded.

This function uses a lock to ensure thread-safe initialization.
The context manager is kept alive for the lifetime of the program
to prevent cleanup of temporary files while the dictionary is in use.
"""
global _NLPO3_DEFAULT_DICT, _dict_file_ctx
if _NLPO3_DEFAULT_DICT is None:
with _load_lock:
# Double-check pattern to avoid race conditions
if _NLPO3_DEFAULT_DICT is None:
corpus_files = files("pythainlp.corpus")
dict_file = corpus_files.joinpath(_THAI_WORDS_FILENAME)
_dict_file_ctx = as_file(dict_file)
dict_path = _dict_file_ctx.__enter__()
_NLPO3_DEFAULT_DICT = nlpo3_load_dict(
str(dict_path), _NLPO3_DEFAULT_DICT_NAME
)
return _NLPO3_DEFAULT_DICT


def load_dict(file_path: str, dict_name: str) -> bool:
@@ -64,6 +87,10 @@ def segment(
* \
https://github.com/PyThaiNLP/nlpo3
"""
# Ensure default dict is loaded if it's being used
if custom_dict == _NLPO3_DEFAULT_DICT_NAME:
_ensure_default_dict_loaded()

return nlpo3_segment(
text=text,
dict_name=custom_dict,
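As in the other modules, callers see no API change; a hedged sketch assuming nlpo3 is installed and the default custom_dict argument is used:

    from pythainlp.tokenize.nlpo3 import segment

    # Using the default dictionary name triggers _ensure_default_dict_loaded()
    # on the first call; a dictionary registered earlier via load_dict()
    # bypasses the lazy load entirely.
    print(segment("ทดสอบการตัดคำภาษาไทย"))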