Skip to content

Commit 258cbe2

Browse files
authored
Merge pull request #7 from wannaphong/copilot/improve-cthainlp-speed
Cache dictionary to fix ~1200x performance regression
2 parents 938c616 + 6b55cbd commit 258cbe2

File tree

3 files changed

+177
-21
lines changed

3 files changed

+177
-21
lines changed

include/newmm.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,37 @@
1919
extern "C" {
2020
#endif
2121

22+
/* Opaque handle for a loaded dictionary. */
typedef void* newmm_dict_t;

/**
 * @brief Load a dictionary once for reuse across many segmentation calls
 *
 * @param dict_path Path to a dictionary file (one word per line, UTF-8).
 *                  Pass NULL to fall back to the built-in minimal dictionary.
 * @return Dictionary handle for newmm_segment_with_dict(), or NULL on error.
 *         Release the handle with newmm_free_dict().
 */
newmm_dict_t newmm_load_dict(const char* dict_path);

/**
 * @brief Release a dictionary obtained from newmm_load_dict()
 *
 * @param dict Dictionary handle; a NULL handle is accepted and ignored.
 */
void newmm_free_dict(newmm_dict_t dict);

/**
 * @brief Segment Thai text using a pre-loaded dictionary
 *
 * @param text        Input Thai text to segment (UTF-8 encoded)
 * @param dict        Pre-loaded handle from newmm_load_dict()
 * @param token_count Output parameter receiving the number of tokens
 * @return Array of token strings; caller frees with newmm_free_result().
 *         Returns NULL on error.
 */
char** newmm_segment_with_dict(const char* text, newmm_dict_t dict, int* token_count);
2253
/**
2354
* @brief Segment Thai text into words using newmm algorithm
2455
*

python/cthainlp_wrapper.c

Lines changed: 106 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,66 @@
55

66
#define PY_SSIZE_T_CLEAN
77
#include <Python.h>
8+
#include <string.h>
9+
#include <stdlib.h>
810
#include "newmm.h"
911

12+
/* Module-level dictionary cache */
13+
static struct {
14+
newmm_dict_t dict;
15+
char* dict_path;
16+
} dict_cache = {NULL, NULL};
17+
18+
/**
19+
* Load or retrieve cached dictionary
20+
*/
21+
static newmm_dict_t get_or_load_dict(const char* dict_path) {
22+
/* Check if we need to reload the dictionary */
23+
int need_reload = 0;
24+
25+
if (dict_cache.dict == NULL) {
26+
/* No cached dict */
27+
need_reload = 1;
28+
} else if (dict_path == NULL && dict_cache.dict_path != NULL) {
29+
/* Switching from custom to default */
30+
need_reload = 1;
31+
} else if (dict_path != NULL && dict_cache.dict_path == NULL) {
32+
/* Switching from default to custom */
33+
need_reload = 1;
34+
} else if (dict_path != NULL && dict_cache.dict_path != NULL) {
35+
/* Both custom, check if path changed */
36+
if (strcmp(dict_path, dict_cache.dict_path) != 0) {
37+
need_reload = 1;
38+
}
39+
}
40+
41+
if (need_reload) {
42+
/* Free old dictionary */
43+
if (dict_cache.dict) {
44+
newmm_free_dict(dict_cache.dict);
45+
dict_cache.dict = NULL;
46+
}
47+
if (dict_cache.dict_path) {
48+
free(dict_cache.dict_path);
49+
dict_cache.dict_path = NULL;
50+
}
51+
52+
/* Load new dictionary */
53+
dict_cache.dict = newmm_load_dict(dict_path);
54+
if (dict_cache.dict && dict_path) {
55+
dict_cache.dict_path = strdup(dict_path);
56+
if (!dict_cache.dict_path) {
57+
/* strdup failed, clean up and return NULL */
58+
newmm_free_dict(dict_cache.dict);
59+
dict_cache.dict = NULL;
60+
return NULL;
61+
}
62+
}
63+
}
64+
65+
return dict_cache.dict;
66+
}
67+
1068
/**
1169
* Python wrapper for newmm_segment function
1270
*/
@@ -21,8 +79,15 @@ static PyObject* py_newmm_segment(PyObject* Py_UNUSED(self), PyObject* args, PyO
2179
return NULL;
2280
}
2381

24-
/* Call C function */
25-
char** tokens = newmm_segment(text, dict_path, &token_count);
82+
/* Get or load dictionary */
83+
newmm_dict_t dict = get_or_load_dict(dict_path);
84+
if (!dict) {
85+
PyErr_SetString(PyExc_MemoryError, "Failed to load dictionary (out of memory)");
86+
return NULL;
87+
}
88+
89+
/* Call C function with cached dictionary */
90+
char** tokens = newmm_segment_with_dict(text, dict, &token_count);
2691

2792
if (!tokens) {
2893
PyErr_SetString(PyExc_RuntimeError, "Failed to segment text");
@@ -52,6 +117,21 @@ static PyObject* py_newmm_segment(PyObject* Py_UNUSED(self), PyObject* args, PyO
52117
return result;
53118
}
54119

120+
/**
121+
* Clear cached dictionary
122+
*/
123+
static PyObject* py_clear_cache(PyObject* Py_UNUSED(self), PyObject* Py_UNUSED(args)) {
124+
if (dict_cache.dict) {
125+
newmm_free_dict(dict_cache.dict);
126+
dict_cache.dict = NULL;
127+
}
128+
if (dict_cache.dict_path) {
129+
free(dict_cache.dict_path);
130+
dict_cache.dict_path = NULL;
131+
}
132+
Py_RETURN_NONE;
133+
}
134+
55135
/**
56136
* Module method definitions
57137
*/
@@ -72,6 +152,13 @@ static PyMethodDef CThaiNLPMethods[] = {
72152
" >>> print(tokens)\n"
73153
" ['ฉัน', 'ไป', 'โรงเรียน']\n"
74154
},
155+
{
156+
"clear_cache",
157+
py_clear_cache,
158+
METH_NOARGS,
159+
"Clear the cached dictionary.\n\n"
160+
"This forces the next tokenization to reload the dictionary.\n"
161+
},
75162
{NULL, NULL, 0, NULL} /* Sentinel */
76163
};
77164

@@ -90,9 +177,26 @@ static struct PyModuleDef cthainlp_module = {
90177
NULL /* m_free */
91178
};
92179

180+
/**
181+
* Module cleanup function
182+
*/
183+
static void module_free(void* Py_UNUSED(self)) {
184+
/* Clean up cached dictionary on module unload */
185+
if (dict_cache.dict) {
186+
newmm_free_dict(dict_cache.dict);
187+
dict_cache.dict = NULL;
188+
}
189+
if (dict_cache.dict_path) {
190+
free(dict_cache.dict_path);
191+
dict_cache.dict_path = NULL;
192+
}
193+
}
194+
93195
/**
94196
* Module initialization function
95197
*/
96198
PyMODINIT_FUNC PyInit__cthainlp(void) {
199+
/* Update module definition with cleanup function */
200+
cthainlp_module.m_free = module_free;
97201
return PyModule_Create(&cthainlp_module);
98202
}

src/newmm.c

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,6 @@ typedef struct {
2323
int size;
2424
} Graph;
2525

26-
/* Helper: Check if position is in the valid positions set */
27-
static bool is_valid_pos(int pos, int* valid_pos, int num_valid) {
28-
for (int i = 0; i < num_valid; i++) {
29-
if (valid_pos[i] == pos) return true;
30-
if (valid_pos[i] > pos) return false;
31-
}
32-
return false;
33-
}
34-
3526
/* Helper: Check if character is non-Thai */
3627
static bool is_non_thai_char(int codepoint) {
3728
/* Latin letters, digits, spaces */
@@ -259,14 +250,7 @@ static const char* default_words[] = {
259250
NULL
260251
};
261252

262-
char** newmm_segment(const char* text, const char* dict_path, int* token_count) {
263-
if (!text || !token_count) return NULL;
264-
265-
*token_count = 0;
266-
267-
/* Empty text */
268-
if (!text[0]) return NULL;
269-
253+
newmm_dict_t newmm_load_dict(const char* dict_path) {
270254
/* Create trie */
271255
Trie* trie = trie_create();
272256
if (!trie) return NULL;
@@ -286,14 +270,51 @@ char** newmm_segment(const char* text, const char* dict_path, int* token_count)
286270
}
287271
}
288272

273+
return (newmm_dict_t)trie;
274+
}
275+
276+
/* Release a dictionary handle created by newmm_load_dict(); NULL is ignored. */
void newmm_free_dict(newmm_dict_t dict) {
    Trie* trie = (Trie*)dict;
    if (trie != NULL) {
        trie_free(trie);
    }
}
281+
282+
/**
 * Segment UTF-8 Thai text against a pre-loaded dictionary.
 *
 * @param text        Input text (UTF-8 encoded).
 * @param dict        Handle obtained from newmm_load_dict().
 * @param token_count Receives the number of tokens produced.
 * @return Token array (caller frees via newmm_free_result()),
 *         or NULL on error or empty input.
 */
char** newmm_segment_with_dict(const char* text, newmm_dict_t dict, int* token_count) {
    /* Reject missing arguments up front. */
    if (text == NULL || token_count == NULL || dict == NULL) {
        return NULL;
    }

    *token_count = 0;

    /* Nothing to segment. */
    if (*text == '\0') {
        return NULL;
    }

    char** tokens = NULL;
    *token_count = segment_text(text, (Trie*)dict, &tokens);
    return tokens;
}
299+
300+
char** newmm_segment(const char* text, const char* dict_path, int* token_count) {
301+
if (!text || !token_count) return NULL;
302+
303+
*token_count = 0;
304+
305+
/* Empty text */
306+
if (!text[0]) return NULL;
307+
308+
/* Create and load dictionary */
309+
newmm_dict_t dict = newmm_load_dict(dict_path);
310+
if (!dict) return NULL;
311+
312+
/* Segment text */
313+
char** tokens = newmm_segment_with_dict(text, dict, token_count);
314+
293315
/* Cleanup */
294-
trie_free(trie);
316+
newmm_free_dict(dict);
295317

296-
*token_count = count;
297318
return tokens;
298319
}
299320

0 commit comments

Comments
 (0)