
Commit 7ff7488

Merge pull request #1213 from PyThaiNLP/copilot/check-thread-safeness-word-tokenize
Ensure thread-safety for word_tokenize() wrapper functions
2 parents 11c8a9e + 71230c6

File tree

10 files changed (+629 additions, -35 deletions)
docs/threadsafe.rst

Lines changed: 185 additions & 0 deletions

Thread safety in PyThaiNLP word tokenization
==============================================

Summary
-------

PyThaiNLP's core word tokenization engines are designed with thread-safety
in mind. Internal implementations (``mm``, ``newmm``, ``newmm-safe``,
``longest``, ``icu``) are thread-safe.

For engines that wrap external libraries (``attacut``, ``budoux``, ``deepcut``,
``nercut``, ``nlpo3``, ``oskut``, ``sefr_cut``, ``tltk``, ``wtsplit``), the
wrapper code is thread-safe, but we cannot guarantee thread-safety of the
underlying external libraries themselves.

Thread safety implementation
-----------------------------

**Internal implementations (fully thread-safe):**

- ``mm``, ``newmm``, ``newmm-safe``: stateless implementations; all data
  is local
- ``longest``: uses a lock-protected check-then-act pattern to manage the
  global cache shared across threads
- ``icu``: each thread gets its own ``BreakIterator`` instance

**External library wrappers (wrapper code is thread-safe):**

- ``attacut``: uses a lock-protected check-then-act pattern to manage the
  global cache; thread-safety of the underlying library is not guaranteed
- ``budoux``: uses lock-protected lazy initialization of the parser;
  thread-safety of the underlying library is not guaranteed
- ``deepcut``, ``nercut``, ``nlpo3``, ``tltk``: stateless wrappers;
  thread-safety of the underlying libraries is not guaranteed
- ``oskut``, ``sefr_cut``, ``wtsplit``: use lock-protected model loading
  when switching models/engines; thread-safety of the underlying libraries
  is not guaranteed
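In isolation, the lock-protected check-then-act pattern used by ``longest``
and ``attacut`` looks roughly like this (a minimal sketch, not the actual
PyThaiNLP code; ``make_tokenizer`` is a hypothetical engine factory):

.. code-block:: python

    import threading

    _cache: dict[str, object] = {}
    _cache_lock = threading.Lock()

    def make_tokenizer(model: str) -> object:
        """Hypothetical stand-in for an engine factory such as a tokenizer class."""
        return object()

    def get_tokenizer(model: str) -> object:
        # The check and the insert must both happen while the lock is held;
        # otherwise two threads could miss the cache simultaneously and
        # create duplicate (possibly half-initialized) entries.
        with _cache_lock:
            if model not in _cache:
                _cache[model] = make_tokenizer(model)
            return _cache[model]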
Usage in multi-threaded applications
-------------------------------------

Using a tokenization engine safely in multi-threaded contexts:

.. code-block:: python

    import threading

    from pythainlp.tokenize import word_tokenize

    def tokenize_worker(text, results, index):
        # Thread-safe for all engines
        results[index] = word_tokenize(text, engine="longest")

    texts = ["ผมรักประเทศไทย", "วันนี้อากาศดี", "เขาไปโรงเรียน"]
    results = [None] * len(texts)
    threads = []

    for i, text in enumerate(texts):
        thread = threading.Thread(target=tokenize_worker, args=(text, results, i))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    # All results are correctly populated
    print(results)
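The same workload can also be run through a thread pool; a short sketch
using the standard library's ``concurrent.futures``:

.. code-block:: python

    from concurrent.futures import ThreadPoolExecutor

    from pythainlp.tokenize import word_tokenize

    texts = ["ผมรักประเทศไทย", "วันนี้อากาศดี", "เขาไปโรงเรียน"]

    with ThreadPoolExecutor(max_workers=4) as pool:
        # pool.map() preserves input order, so results align with texts
        results = list(pool.map(lambda t: word_tokenize(t, engine="longest"), texts))

    print(results)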
Performance considerations
--------------------------

1. **Lock-based synchronization** (longest, attacut):

   - Minimal overhead for cache access
   - Cache lookups are very fast
   - Lock contention is minimal in typical usage

2. **Thread-local storage** (icu; see the sketch after this list):

   - Each thread maintains its own instance
   - No synchronization overhead after initialization
   - Slightly higher memory usage (one instance per thread)

3. **Stateless engines** (newmm, mm):

   - Zero synchronization overhead
   - Best performance in multi-threaded scenarios
   - Recommended for high-throughput applications
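The thread-local pattern in item 2 can be sketched as follows, assuming the
PyICU package and its ``BreakIterator`` API (a minimal illustration; the
actual ``icu`` engine may differ in detail):

.. code-block:: python

    import threading

    from icu import BreakIterator, Locale

    _thread_local = threading.local()

    def get_break_iterator() -> BreakIterator:
        # Each thread lazily creates, then reuses, its own instance,
        # so no lock is needed after a thread's first call.
        if not hasattr(_thread_local, "break_iterator"):
            _thread_local.break_iterator = BreakIterator.createWordInstance(
                Locale("th")
            )
        return _thread_local.break_iterator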
Best practices
--------------

1. **For high-throughput applications**: Consider using stateless engines like
   ``newmm`` or ``mm`` for optimal performance.

2. **For custom dictionaries**: The ``longest`` engine with custom dictionaries
   maintains a cache per dictionary object. Reuse dictionary objects across
   threads to maximize cache efficiency.

3. **For process pools**: All engines work correctly with multiprocessing, as
   each process has its own memory space (see the sketch after this list).

4. **IMPORTANT: Do not modify custom dictionaries during tokenization**:

   - Create your custom Trie/dictionary before starting threads
   - Never call ``trie.add()`` or ``trie.remove()`` while tokenization is
     in progress
   - If you need to update the dictionary, create a new Trie instance and
     pass it to subsequent tokenization calls
   - The Trie data structure itself is NOT thread-safe for concurrent
     modifications
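A brief sketch of item 3, using the standard library's ``multiprocessing``
(illustrative; any engine works the same way):

.. code-block:: python

    from multiprocessing import Pool

    from pythainlp.tokenize import word_tokenize

    def tokenize(text: str) -> list[str]:
        return word_tokenize(text, engine="newmm")

    if __name__ == "__main__":
        texts = ["ผมรักประเทศไทย", "วันนี้อากาศดี", "เขาไปโรงเรียน"]
        # Each worker process has its own copy of the engine state
        with Pool(processes=2) as pool:
            print(pool.map(tokenize, texts))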
Example of safe custom dictionary usage
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import threading

    from pythainlp.corpus.common import thai_words
    from pythainlp.tokenize import word_tokenize
    from pythainlp.util import dict_trie

    # SAFE: Create dictionary once before threading
    custom_words = set(thai_words())
    custom_words.add("คำใหม่")
    custom_dict = dict_trie(custom_words)

    texts = ["ผมรักประเทศไทย", "วันนี้อากาศดี", "เขาไปโรงเรียน"]

    def worker(text, custom_dict):
        # SAFE: Only reading from the dictionary
        return word_tokenize(text, engine="newmm", custom_dict=custom_dict)

    # All threads share the same dictionary (read-only)
    threads = []
    for text in texts:
        t = threading.Thread(target=worker, args=(text, custom_dict))
        threads.append(t)
        t.start()

    # Wait for all threads to finish
    for t in threads:
        t.join()
Example of UNSAFE usage (DO NOT DO THIS)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    # UNSAFE: Modifying dictionary while threads are using it
    custom_dict = dict_trie(thai_words())

    def unsafe_worker(text, custom_dict):
        result = word_tokenize(text, engine="newmm", custom_dict=custom_dict)
        # DANGER: Modifying the shared dictionary
        custom_dict.add("คำใหม่")  # This is NOT thread-safe!
        return result
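If the vocabulary must change at runtime, build a fresh Trie and pass it to
subsequent calls instead of mutating the shared one, as item 4 of the best
practices recommends (a minimal sketch):

.. code-block:: python

    from pythainlp.corpus.common import thai_words
    from pythainlp.tokenize import word_tokenize
    from pythainlp.util import dict_trie

    # SAFE alternative: a new Trie leaves the one already in use untouched
    updated_words = set(thai_words())
    updated_words.add("คำใหม่")
    updated_dict = dict_trie(updated_words)

    tokens = word_tokenize("ผมรักประเทศไทย", engine="newmm", custom_dict=updated_dict)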
Testing
-------

Comprehensive thread safety tests are available in:

- ``tests/core/test_tokenize_thread_safety.py``

The test suite includes:

- Concurrent tokenization with multiple threads
- Race condition testing with multiple dictionaries
- Verification of result consistency across threads
- Stress testing with up to 200 concurrent operations (20 threads × 10 iterations)
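A reduced, hypothetical version of such a consistency check (not the actual
test file) might look like:

.. code-block:: python

    from concurrent.futures import ThreadPoolExecutor

    from pythainlp.tokenize import word_tokenize

    def test_concurrent_tokenization_is_consistent():
        text = "ผมรักประเทศไทย"
        expected = word_tokenize(text, engine="newmm")
        # 20 threads x 10 iterations = 200 concurrent operations
        with ThreadPoolExecutor(max_workers=20) as pool:
            results = list(
                pool.map(lambda _: word_tokenize(text, engine="newmm"), range(200))
            )
        assert all(result == expected for result in results)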
Maintenance notes
-----------------

When adding new tokenization engines to PyThaiNLP:

1. **Avoid global mutable state** whenever possible
2. If caching is necessary, use thread-safe locks
3. If per-thread state is needed, use ``threading.local()``
4. Always add thread safety tests for new engines
5. Document thread safety guarantees in docstrings
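A minimal skeleton following guidelines 1, 2, and 5 for a hypothetical new
engine wrapper (``_StubModel`` stands in for an external library):

.. code-block:: python

    import threading

    class _StubModel:
        """Hypothetical stand-in for an external tokenizer model."""

        def tokenize(self, text: str) -> list[str]:
            return text.split()

    _model = None
    _model_lock = threading.Lock()

    def segment(text: str) -> list[str]:
        """Tokenize text.

        Thread-safe: lazy model initialization is protected by a lock
        (guideline 2), and there is no other global mutable state
        (guideline 1).
        """
        global _model
        if not text or not isinstance(text, str):
            return []
        with _model_lock:
            if _model is None:
                _model = _StubModel()
        return _model.tokenize(text)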
Related files
-------------

- Core implementation: ``pythainlp/tokenize/core.py``
- Engine implementations: ``pythainlp/tokenize/*.py``
- Tests: ``tests/core/test_tokenize_thread_safety.py``

pythainlp/tokenize/attacut.py

Lines changed: 15 additions & 4 deletions
@@ -9,6 +9,8 @@

 from __future__ import annotations

+import threading
+
 from attacut import Tokenizer


@@ -26,10 +28,17 @@ def tokenize(self, text: str) -> list[str]:


 _tokenizers: dict[str, AttacutTokenizer] = {}
+_tokenizers_lock = threading.Lock()


 def segment(text: str, model: str = "attacut-sc") -> list[str]:
     """Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
+
+    The wrapper uses a lock to protect access to the internal tokenizer cache.
+    However, thread-safety of the underlying AttaCut library itself is not
+    guaranteed. Please refer to the AttaCut library documentation for its
+    thread-safety guarantees.
+
     :param str text: text to be tokenized to words
     :param str model: model of word tokenizer model
     :return: list of words, tokenized from the text
@@ -41,8 +50,10 @@ def segment(text: str, model: str = "attacut-sc") -> list[str]:
     if not text or not isinstance(text, str):
         return []

-    global _tokenizers
-    if model not in _tokenizers:
-        _tokenizers[model] = AttacutTokenizer(model)
+    # Thread-safe access to the tokenizers cache
+    with _tokenizers_lock:
+        if model not in _tokenizers:
+            _tokenizers[model] = AttacutTokenizer(model)
+        tokenizer = _tokenizers[model]

-    return _tokenizers[model].tokenize(text)
+    return tokenizer.tokenize(text)

pythainlp/tokenize/budoux.py

Lines changed: 14 additions & 5 deletions
@@ -12,7 +12,10 @@

 from __future__ import annotations

+import threading
+
 _parser = None
+_parser_lock = threading.Lock()


 def _init_parser():
@@ -34,17 +37,23 @@ def _init_parser():
 def segment(text: str) -> list[str]:
     """Segment `text` into tokens using budoux.

+    The wrapper uses a lock to protect lazy initialization of the parser.
+    However, thread-safety of the underlying budoux library itself is not
+    guaranteed. Please refer to the budoux library documentation for its
+    thread-safety guarantees.
+
     The function returns a list of strings. If `budoux` is not available
     the function raises ImportError with an installation hint.
     """
     if not text or not isinstance(text, str):
         return []

-    global _parser
-    if _parser is None:
-        _parser = _init_parser()
-
-    parser = _parser
+    # Thread-safe lazy initialization; the global declaration must come
+    # before any use of _parser in this scope
+    global _parser
+    with _parser_lock:
+        if _parser is None:
+            _parser = _init_parser()
+        parser = _parser

     result = parser.parse(text)
pythainlp/tokenize/core.py

Lines changed: 9 additions & 0 deletions
@@ -159,6 +159,15 @@ def word_tokenize(
     :Note:
         - The **custom_dict** parameter only works for \
           *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
+        - Built-in tokenizers (*longest*, *mm*, *newmm*, and *newmm-safe*) \
+          are thread-safe.
+        - Wrappers of external tokenizers are designed to be thread-safe \
+          but depend on the thread-safety of the external tokenizer itself.
+        - **WARNING**: When using custom_dict in multi-threaded environments, \
+          do NOT modify the Trie object (via add/remove methods) while \
+          tokenization is in progress. The Trie data structure is not \
+          thread-safe for concurrent modifications. Create your dictionary \
+          before starting threads and only read from it during tokenization.
     :Example:

     Tokenize text with different tokenizers::

pythainlp/tokenize/longest.py

Lines changed: 12 additions & 4 deletions
@@ -13,6 +13,7 @@
 from __future__ import annotations

 import re
+import threading

 from pythainlp import thai_tonemarks
 from pythainlp.tokenize import word_dict_trie
@@ -154,11 +155,15 @@ def tokenize(self, text: str) -> list[str]:


 _tokenizers: dict[int, LongestMatchTokenizer] = {}
+_tokenizers_lock = threading.Lock()


 def segment(text: str, custom_dict: Trie | None = None) -> list[str]:
     """Dictionary-based longest matching word segmentation.

+    This function is thread-safe. It uses a lock to protect access to the
+    internal tokenizer cache.
+
     :param str text: text to be tokenized into words
     :param pythainlp.util.Trie custom_dict: dictionary for tokenization
     :return: list of words, tokenized from the text
@@ -169,9 +174,12 @@ def segment(text: str, custom_dict: Trie | None = None) -> list[str]:
     if not custom_dict:
         custom_dict = word_dict_trie()

-    global _tokenizers
     custom_dict_ref_id = id(custom_dict)
-    if custom_dict_ref_id not in _tokenizers:
-        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)

-    return _tokenizers[custom_dict_ref_id].tokenize(text)
+    # Thread-safe access to the tokenizers cache
+    with _tokenizers_lock:
+        if custom_dict_ref_id not in _tokenizers:
+            _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+        tokenizer = _tokenizers[custom_dict_ref_id]
+
+    return tokenizer.tokenize(text)

pythainlp/tokenize/oskut.py

Lines changed: 27 additions & 6 deletions
@@ -11,17 +11,38 @@

 from __future__ import annotations

+import threading
+
 import oskut

-DEFAULT_ENGINE = "ws"
-oskut.load_model(engine=DEFAULT_ENGINE)
+_DEFAULT_ENGINE = "ws"
+_engine_lock = threading.Lock()
+
+# Load default model at module initialization
+oskut.load_model(engine=_DEFAULT_ENGINE)


 def segment(text: str, engine: str = "ws") -> list[str]:
-    global DEFAULT_ENGINE
+    """Segment text using OSKut.
+
+    The wrapper uses a lock to protect model loading when switching engines.
+    However, thread-safety of the underlying OSKut library itself is not
+    guaranteed. Please refer to the OSKut library documentation for its
+    thread-safety guarantees.
+
+    :param str text: text to be tokenized
+    :param str engine: model engine to use
+    :return: list of tokens
+    """
     if not text or not isinstance(text, str):
         return []
-    if engine != DEFAULT_ENGINE:
-        DEFAULT_ENGINE = engine
-        oskut.load_model(engine=DEFAULT_ENGINE)
+
+    # Thread-safe model loading; the global declaration must come before
+    # any use of _DEFAULT_ENGINE in this scope
+    global _DEFAULT_ENGINE
+    with _engine_lock:
+        if engine != _DEFAULT_ENGINE:
+            # Need to update global state and reload model
+            _DEFAULT_ENGINE = engine
+            oskut.load_model(engine=_DEFAULT_ENGINE)

     return oskut.OSKut(text)
