Merge pull request #5 from wannaphong/copilot/fix-output-mismatch-python

wannaphong · web-flow · commit 938c616f6bc5 · 2026-01-11T16:22:57.000+07:00
Fix tokenization by loading full dictionary and removing TCC boundary constraint
diff --git a/cthainlp/data b/cthainlp/data
@@ -0,0 +1 @@
+../data
diff --git a/cthainlp/tokenize.py b/cthainlp/tokenize.py
@@ -11,6 +11,33 @@
     _cthainlp = None
 
 
+def _get_default_dict_path() -> Optional[str]:
+    """
+    Get the default dictionary file path.
+    
+    Returns:
+        Optional[str]: Absolute path to the default dictionary file, or None if it is not found
+    """
+    # Get the directory where this module is located
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    
+    # Try to find dictionary in package data directory
+    # When installed: cthainlp/data/thai_words.txt
+    dict_path = os.path.join(module_dir, "data", "thai_words.txt")
+    if os.path.exists(dict_path):
+        return dict_path
+    
+    # Try parent directory (development mode)
+    # When in source: CThaiNLP/data/thai_words.txt
+    parent_dir = os.path.dirname(module_dir)
+    dict_path = os.path.join(parent_dir, "data", "thai_words.txt")
+    if os.path.exists(dict_path):
+        return dict_path
+    
+    # Fallback: return None to use hardcoded dictionary
+    return None
+
+
 def word_tokenize(
     text: str,
     engine: str = "newmm",
@@ -67,12 +94,15 @@ def word_tokenize(
     if not text:
         return []
     
-    # If custom_dict is provided and exists, use it; otherwise use None for default
-    dict_path = None
+    # Determine which dictionary to use
     if custom_dict is not None:
+        # User provided a custom dictionary
         if not os.path.exists(custom_dict):
             raise FileNotFoundError(f"Dictionary file not found: {custom_dict}")
         dict_path = custom_dict
+    else:
+        # Use default dictionary
+        dict_path = _get_default_dict_path()
     
     # Call the C extension
     tokens = _cthainlp.segment(text, dict_path)
diff --git a/setup.py b/setup.py
@@ -40,6 +40,9 @@
     url="https://github.com/wannaphong/CThaiNLP",
     packages=["cthainlp"],
     ext_modules=[cthainlp_extension],
+    package_data={
+        "cthainlp": ["data/*.txt"],
+    },
     classifiers=[
         "Development Status :: 3 - Alpha",
         "Intended Audience :: Developers",
diff --git a/src/newmm.c b/src/newmm.c
@@ -122,15 +122,72 @@ static int segment_text(const char* text, Trie* trie, char*** tokens) {
         int best_len = 0;
         int best_end_pos = pos;
         
-        /* Find longest valid prefix */
+        /* Simple greedy: find longest match */
+        /* But prefer a shorter match if the longest one leaves us at an unknown Thai character */
         for (int i = 0; i < num_prefixes; i++) {
             int end_pos = pos + lengths[i];
-            if (is_valid_pos(end_pos, valid_pos, num_valid) && lengths[i] > best_len) {
+            
+            if (lengths[i] > best_len) {
                 best_len = lengths[i];
                 best_end_pos = end_pos;
             }
         }
         
+        /* Now check if a shorter match would be better */
+        /* Only if the best match leads to an unknown Thai character */
+        /* and a shorter match leads to a known word */
+        if (best_len > 0 && best_end_pos < text_len) {
+            char** best_next_prefixes;
+            int* best_next_lengths;
+            int num_best_next = trie_prefixes(trie, text + best_end_pos, &best_next_prefixes, &best_next_lengths);
+            
+            if (num_best_next == 0) {
+                /* Best match doesn't lead to a dictionary word */
+                /* Check if it's a Thai character (not Latin/digit) */
+                int byte_len;
+                int next_cp = get_utf8_codepoint(text + best_end_pos, &byte_len);
+                
+                if (!is_non_thai_char(next_cp)) {
+                    /* It's a Thai character that's not in dictionary */
+                    /* Try shorter matches to see if they lead to dictionary words */
+                    for (int i = 0; i < num_prefixes; i++) {
+                        int end_pos = pos + lengths[i];
+                        if (lengths[i] < best_len && end_pos < text_len) {
+                            char** next_prefixes;
+                            int* next_lengths;
+                            int num_next = trie_prefixes(trie, text + end_pos, &next_prefixes, &next_lengths);
+                            
+                            if (num_next > 0) {
+                                /* This shorter match leads to a dictionary word */
+                                /* Prefer it */
+                                best_len = lengths[i];
+                                best_end_pos = end_pos;
+                            }
+                            
+                            /* Free lookahead results */
+                            for (int j = 0; j < num_next; j++) {
+                                free(next_prefixes[j]);
+                            }
+                            free(next_prefixes);
+                            free(next_lengths);
+                            
+                            if (num_next > 0) {
+                                /* We found a better match, stop looking */
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+            
+            /* Free lookahead results */
+            for (int j = 0; j < num_best_next; j++) {
+                free(best_next_prefixes[j]);
+            }
+            free(best_next_prefixes);
+            free(best_next_lengths);
+        }
+        
         /* Free prefix results */
         for (int i = 0; i < num_prefixes; i++) {
             free(prefixes[i]);
diff --git a/tests/test_newmm.c b/tests/test_newmm.c
@@ -105,7 +105,7 @@ int main() {
     
     /* Test 2: Thai sentence with common words */
     run_test("วันนี้อากาศดีมาก", dict,
-             "['วันนี้', 'อา', 'กา', 'ศดี', 'มาก']",
+             "['วันนี้', 'อากาศ', 'ดีมาก']",
              "Thai sentence with partial dictionary match");
     
     /* Test 3: English text */