Skip to content

Commit 258cbe2

Browse files
authored
Merge pull request #7 from wannaphong/copilot/improve-cthainlp-speed
Cache dictionary to fix ~1200x performance regression
2 parents 938c616 + 6b55cbd commit 258cbe2

File tree

3 files changed

+177
-21
lines changed

3 files changed

+177
-21
lines changed

include/newmm.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,37 @@
1919
extern "C" {
2020
#endif
2121

22+
/* Opaque handle for a loaded dictionary. */
typedef void* newmm_dict_t;

/**
 * @brief Load a dictionary once for reuse across many segmentation calls
 *
 * @param dict_path Path to a dictionary file (one word per line, UTF-8).
 *                  Pass NULL to fall back to the built-in minimal dictionary.
 * @return Dictionary handle for newmm_segment_with_dict(), or NULL on error.
 *         Release the handle with newmm_free_dict().
 */
newmm_dict_t newmm_load_dict(const char* dict_path);

/**
 * @brief Release a dictionary obtained from newmm_load_dict()
 *
 * @param dict Dictionary handle; a NULL handle is accepted and ignored.
 */
void newmm_free_dict(newmm_dict_t dict);

/**
 * @brief Segment Thai text using a pre-loaded dictionary
 *
 * @param text        Input Thai text to segment (UTF-8 encoded)
 * @param dict        Pre-loaded handle from newmm_load_dict()
 * @param token_count Output parameter receiving the number of tokens
 * @return Array of token strings; caller frees with newmm_free_result().
 *         Returns NULL on error.
 */
char** newmm_segment_with_dict(const char* text, newmm_dict_t dict, int* token_count);
2253
/**
2354
* @brief Segment Thai text into words using newmm algorithm
2455
*

python/cthainlp_wrapper.c

Lines changed: 106 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,66 @@
55

66
#define PY_SSIZE_T_CLEAN
77
#include <Python.h>
8+
#include <string.h>
9+
#include <stdlib.h>
810
#include "newmm.h"
911

12+
/* Module-level dictionary cache */
13+
static struct {
14+
newmm_dict_t dict;
15+
char* dict_path;
16+
} dict_cache = {NULL, NULL};
17+
18+
/**
19+
* Load or retrieve cached dictionary
20+
*/
21+
static newmm_dict_t get_or_load_dict(const char* dict_path) {
22+
/* Check if we need to reload the dictionary */
23+
int need_reload = 0;
24+
25+
if (dict_cache.dict == NULL) {
26+
/* No cached dict */
27+
need_reload = 1;
28+
} else if (dict_path == NULL && dict_cache.dict_path != NULL) {
29+
/* Switching from custom to default */
30+
need_reload = 1;
31+
} else if (dict_path != NULL && dict_cache.dict_path == NULL) {
32+
/* Switching from default to custom */
33+
need_reload = 1;
34+
} else if (dict_path != NULL && dict_cache.dict_path != NULL) {
35+
/* Both custom, check if path changed */
36+
if (strcmp(dict_path, dict_cache.dict_path) != 0) {
37+
need_reload = 1;
38+
}
39+
}
40+
41+
if (need_reload) {
42+
/* Free old dictionary */
43+
if (dict_cache.dict) {
44+
newmm_free_dict(dict_cache.dict);
45+
dict_cache.dict = NULL;
46+
}
47+
if (dict_cache.dict_path) {
48+
free(dict_cache.dict_path);
49+
dict_cache.dict_path = NULL;
50+
}
51+
52+
/* Load new dictionary */
53+
dict_cache.dict = newmm_load_dict(dict_path);
54+
if (dict_cache.dict && dict_path) {
55+
dict_cache.dict_path = strdup(dict_path);
56+
if (!dict_cache.dict_path) {
57+
/* strdup failed, clean up and return NULL */
58+
newmm_free_dict(dict_cache.dict);
59+
dict_cache.dict = NULL;
60+
return NULL;
61+
}
62+
}
63+
}
64+
65+
return dict_cache.dict;
66+
}
67+
1068
/**
1169
* Python wrapper for newmm_segment function
1270
*/
@@ -21,8 +79,15 @@ static PyObject* py_newmm_segment(PyObject* Py_UNUSED(self), PyObject* args, PyO
2179
return NULL;
2280
}
2381

24-
/* Call C function */
25-
char** tokens = newmm_segment(text, dict_path, &token_count);
82+
/* Get or load dictionary */
83+
newmm_dict_t dict = get_or_load_dict(dict_path);
84+
if (!dict) {
85+
PyErr_SetString(PyExc_MemoryError, "Failed to load dictionary (out of memory)");
86+
return NULL;
87+
}
88+
89+
/* Call C function with cached dictionary */
90+
char** tokens = newmm_segment_with_dict(text, dict, &token_count);
2691

2792
if (!tokens) {
2893
PyErr_SetString(PyExc_RuntimeError, "Failed to segment text");
@@ -52,6 +117,21 @@ static PyObject* py_newmm_segment(PyObject* Py_UNUSED(self), PyObject* args, PyO
52117
return result;
53118
}
54119

120+
/**
121+
* Clear cached dictionary
122+
*/
123+
static PyObject* py_clear_cache(PyObject* Py_UNUSED(self), PyObject* Py_UNUSED(args)) {
124+
if (dict_cache.dict) {
125+
newmm_free_dict(dict_cache.dict);
126+
dict_cache.dict = NULL;
127+
}
128+
if (dict_cache.dict_path) {
129+
free(dict_cache.dict_path);
130+
dict_cache.dict_path = NULL;
131+
}
132+
Py_RETURN_NONE;
133+
}
134+
55135
/**
56136
* Module method definitions
57137
*/
@@ -72,6 +152,13 @@ static PyMethodDef CThaiNLPMethods[] = {
72152
" >>> print(tokens)\n"
73153
" ['ฉัน', 'ไป', 'โรงเรียน']\n"
74154
},
155+
{
156+
"clear_cache",
157+
py_clear_cache,
158+
METH_NOARGS,
159+
"Clear the cached dictionary.\n\n"
160+
"This forces the next tokenization to reload the dictionary.\n"
161+
},
75162
{NULL, NULL, 0, NULL} /* Sentinel */
76163
};
77164

@@ -90,9 +177,26 @@ static struct PyModuleDef cthainlp_module = {
90177
NULL /* m_free */
91178
};
92179

180+
/**
181+
* Module cleanup function
182+
*/
183+
static void module_free(void* Py_UNUSED(self)) {
184+
/* Clean up cached dictionary on module unload */
185+
if (dict_cache.dict) {
186+
newmm_free_dict(dict_cache.dict);
187+
dict_cache.dict = NULL;
188+
}
189+
if (dict_cache.dict_path) {
190+
free(dict_cache.dict_path);
191+
dict_cache.dict_path = NULL;
192+
}
193+
}
194+
93195
/**
94196
* Module initialization function
95197
*/
96198
PyMODINIT_FUNC PyInit__cthainlp(void) {
199+
/* Update module definition with cleanup function */
200+
cthainlp_module.m_free = module_free;
97201
return PyModule_Create(&cthainlp_module);
98202
}

src/newmm.c

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,6 @@ typedef struct {
2323
int size;
2424
} Graph;
2525

26-
/* Helper: Check if position is in the valid positions set */
27-
static bool is_valid_pos(int pos, int* valid_pos, int num_valid) {
28-
for (int i = 0; i < num_valid; i++) {
29-
if (valid_pos[i] == pos) return true;
30-
if (valid_pos[i] > pos) return false;
31-
}
32-
return false;
33-
}
34-
3526
/* Helper: Check if character is non-Thai */
3627
static bool is_non_thai_char(int codepoint) {
3728
/* Latin letters, digits, spaces */
@@ -259,14 +250,7 @@ static const char* default_words[] = {
259250
NULL
260251
};
261252

262-
char** newmm_segment(const char* text, const char* dict_path, int* token_count) {
263-
if (!text || !token_count) return NULL;
264-
265-
*token_count = 0;
266-
267-
/* Empty text */
268-
if (!text[0]) return NULL;
269-
253+
newmm_dict_t newmm_load_dict(const char* dict_path) {
270254
/* Create trie */
271255
Trie* trie = trie_create();
272256
if (!trie) return NULL;
@@ -286,14 +270,51 @@ char** newmm_segment(const char* text, const char* dict_path, int* token_count)
286270
}
287271
}
288272

273+
return (newmm_dict_t)trie;
274+
}
275+
276+
/* Release a dictionary handle created by newmm_load_dict(); NULL is ignored. */
void newmm_free_dict(newmm_dict_t dict) {
    Trie* trie = (Trie*)dict;
    if (trie != NULL) {
        trie_free(trie);
    }
}
281+
282+
/**
 * Segment UTF-8 Thai text against a pre-loaded dictionary.
 *
 * @param text        Input text (UTF-8 encoded).
 * @param dict        Handle obtained from newmm_load_dict().
 * @param token_count Receives the number of tokens produced.
 * @return Token array (caller frees via newmm_free_result()),
 *         or NULL on error or empty input.
 */
char** newmm_segment_with_dict(const char* text, newmm_dict_t dict, int* token_count) {
    /* Reject missing arguments up front. */
    if (text == NULL || token_count == NULL || dict == NULL) {
        return NULL;
    }

    *token_count = 0;

    /* Nothing to segment. */
    if (*text == '\0') {
        return NULL;
    }

    char** tokens = NULL;
    *token_count = segment_text(text, (Trie*)dict, &tokens);
    return tokens;
}
299+
300+
char** newmm_segment(const char* text, const char* dict_path, int* token_count) {
301+
if (!text || !token_count) return NULL;
302+
303+
*token_count = 0;
304+
305+
/* Empty text */
306+
if (!text[0]) return NULL;
307+
308+
/* Create and load dictionary */
309+
newmm_dict_t dict = newmm_load_dict(dict_path);
310+
if (!dict) return NULL;
311+
312+
/* Segment text */
313+
char** tokens = newmm_segment_with_dict(text, dict, token_count);
314+
293315
/* Cleanup */
294-
trie_free(trie);
316+
newmm_free_dict(dict);
295317

296-
*token_count = count;
297318
return tokens;
298319
}
299320

0 commit comments

Comments
 (0)