Skip to content

Commit 938c616

Browse files
authored
Merge pull request #5 from wannaphong/copilot/fix-output-mismatch-python
Fix tokenization by loading full dictionary and removing TCC boundary constraint
2 parents 8874425 + 3d17603 commit 938c616

File tree

5 files changed

+96
-5
lines changed

5 files changed

+96
-5
lines changed

cthainlp/data

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../data

cthainlp/tokenize.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,33 @@
1111
_cthainlp = None
1212

1313

14+
def _get_default_dict_path() -> Optional[str]:
    """
    Locate the default dictionary file (``thai_words.txt``).

    Two locations are probed, in this order:

    1. ``<module dir>/data/thai_words.txt`` (installed package layout)
    2. ``<parent of module dir>/data/thai_words.txt`` (source checkout)

    Returns:
        Optional[str]: Absolute path to the first candidate that is an
        existing regular file, or ``None`` when neither candidate exists
        (signals that the hardcoded dictionary should be used).
    """
    # Directory where this module is located
    module_dir = os.path.dirname(os.path.abspath(__file__))

    # Package data directory first, then the parent directory (development mode)
    for base_dir in (module_dir, os.path.dirname(module_dir)):
        dict_path = os.path.join(base_dir, "data", "thai_words.txt")
        # isfile rather than exists: a directory that happens to carry the
        # dictionary's name is not something that can be loaded as a word list.
        if os.path.isfile(dict_path):
            return dict_path

    # Fallback: return None to use hardcoded dictionary
    return None
39+
40+
1441
def word_tokenize(
1542
text: str,
1643
engine: str = "newmm",
@@ -67,12 +94,15 @@ def word_tokenize(
6794
if not text:
6895
return []
6996

70-
# If custom_dict is provided and exists, use it; otherwise use None for default
71-
dict_path = None
97+
# Determine which dictionary to use
7298
if custom_dict is not None:
99+
# User provided a custom dictionary
73100
if not os.path.exists(custom_dict):
74101
raise FileNotFoundError(f"Dictionary file not found: {custom_dict}")
75102
dict_path = custom_dict
103+
else:
104+
# Use default dictionary
105+
dict_path = _get_default_dict_path()
76106

77107
# Call the C extension
78108
tokens = _cthainlp.segment(text, dict_path)

setup.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@
4040
url="https://github.com/wannaphong/CThaiNLP",
4141
packages=["cthainlp"],
4242
ext_modules=[cthainlp_extension],
43+
package_data={
44+
"cthainlp": ["data/*.txt"],
45+
},
4346
classifiers=[
4447
"Development Status :: 3 - Alpha",
4548
"Intended Audience :: Developers",

src/newmm.c

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,15 +122,72 @@ static int segment_text(const char* text, Trie* trie, char*** tokens) {
122122
int best_len = 0;
123123
int best_end_pos = pos;
124124

125-
/* Find longest valid prefix */
125+
/* Simple greedy: find longest match */
126+
/* But prefer shorter match if longer one leaves us with unknown Thai character */
126127
for (int i = 0; i < num_prefixes; i++) {
127128
int end_pos = pos + lengths[i];
128-
if (is_valid_pos(end_pos, valid_pos, num_valid) && lengths[i] > best_len) {
129+
130+
if (lengths[i] > best_len) {
129131
best_len = lengths[i];
130132
best_end_pos = end_pos;
131133
}
132134
}
133135

136+
/* Now check if a shorter match would be better */
137+
/* Only if the best match leads to an unknown Thai character */
138+
/* and a shorter match leads to a known word */
139+
if (best_len > 0 && best_end_pos < text_len) {
140+
char** best_next_prefixes;
141+
int* best_next_lengths;
142+
int num_best_next = trie_prefixes(trie, text + best_end_pos, &best_next_prefixes, &best_next_lengths);
143+
144+
if (num_best_next == 0) {
145+
/* Best match doesn't lead to a dictionary word */
146+
/* Check if it's a Thai character (not Latin/digit) */
147+
int byte_len;
148+
int next_cp = get_utf8_codepoint(text + best_end_pos, &byte_len);
149+
150+
if (!is_non_thai_char(next_cp)) {
151+
/* It's a Thai character that's not in dictionary */
152+
/* Try shorter matches to see if they lead to dictionary words */
153+
for (int i = 0; i < num_prefixes; i++) {
154+
int end_pos = pos + lengths[i];
155+
if (lengths[i] < best_len && end_pos < text_len) {
156+
char** next_prefixes;
157+
int* next_lengths;
158+
int num_next = trie_prefixes(trie, text + end_pos, &next_prefixes, &next_lengths);
159+
160+
if (num_next > 0) {
161+
/* This shorter match leads to a dictionary word */
162+
/* Prefer it */
163+
best_len = lengths[i];
164+
best_end_pos = end_pos;
165+
}
166+
167+
/* Free lookahead results */
168+
for (int j = 0; j < num_next; j++) {
169+
free(next_prefixes[j]);
170+
}
171+
free(next_prefixes);
172+
free(next_lengths);
173+
174+
if (num_next > 0) {
175+
/* We found a better match, stop looking */
176+
break;
177+
}
178+
}
179+
}
180+
}
181+
}
182+
183+
/* Free lookahead results */
184+
for (int j = 0; j < num_best_next; j++) {
185+
free(best_next_prefixes[j]);
186+
}
187+
free(best_next_prefixes);
188+
free(best_next_lengths);
189+
}
190+
134191
/* Free prefix results */
135192
for (int i = 0; i < num_prefixes; i++) {
136193
free(prefixes[i]);

tests/test_newmm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ int main() {
105105

106106
/* Test 2: Thai sentence with common words */
107107
run_test("วันนี้อากาศดีมาก", dict,
108-
"['วันนี้', 'อา', 'กา', 'ศดี', 'มาก']",
108+
"['วันนี้', 'อากาศ', 'ดีมาก']",
109109
"Thai sentence with partial dictionary match");
110110

111111
/* Test 3: English text */

0 commit comments

Comments
 (0)