Skip to content

Commit a7652a5

Browse files
Copilot and wannaphong committed
Add lookahead heuristic to improve word segmentation
- Implemented 1-step lookahead to prefer matches that avoid unknown Thai characters
- When longest match leads to unknown Thai char, try shorter matches that lead to dict words
- All existing tests pass (12/12)
- 4 out of 5 benchmark cases now match PyThaiNLP exactly

Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
1 parent 72bace0 commit a7652a5

File tree

1 file changed

+57
-2
lines changed

1 file changed

+57
-2
lines changed

src/newmm.c

Lines changed: 57 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -122,16 +122,71 @@ static int segment_text(const char* text, Trie* trie, char*** tokens) {
122122
int best_len = 0;
123123
int best_end_pos = pos;
124124

125-
/* Find longest valid prefix */
125+
/* Simple greedy: find longest match */
126+
/* But prefer shorter match if longer one leaves us with unknown Thai character */
126127
for (int i = 0; i < num_prefixes; i++) {
127128
int end_pos = pos + lengths[i];
128-
/* Accept the longest dictionary word, TCC boundary not required */
129+
129130
if (lengths[i] > best_len) {
130131
best_len = lengths[i];
131132
best_end_pos = end_pos;
132133
}
133134
}
134135

136+
/* Now check if a shorter match would be better */
137+
/* Only if the best match leads to an unknown Thai character */
138+
/* and a shorter match leads to a known word */
139+
if (best_len > 0 && best_end_pos < text_len) {
140+
char** best_next_prefixes;
141+
int* best_next_lengths;
142+
int num_best_next = trie_prefixes(trie, text + best_end_pos, &best_next_prefixes, &best_next_lengths);
143+
144+
if (num_best_next == 0) {
145+
/* Best match doesn't lead to a dictionary word */
146+
/* Check if it's a Thai character (not Latin/digit) */
147+
int byte_len;
148+
int next_cp = get_utf8_codepoint(text + best_end_pos, &byte_len);
149+
150+
if (!is_non_thai_char(next_cp)) {
151+
/* It's a Thai character that's not in dictionary */
152+
/* Try shorter matches to see if they lead to dictionary words */
153+
for (int i = 0; i < num_prefixes; i++) {
154+
int end_pos = pos + lengths[i];
155+
if (lengths[i] < best_len && end_pos < text_len) {
156+
char** next_prefixes;
157+
int* next_lengths;
158+
int num_next = trie_prefixes(trie, text + end_pos, &next_prefixes, &next_lengths);
159+
160+
if (num_next > 0) {
161+
/* This shorter match leads to a dictionary word */
162+
/* Prefer it */
163+
best_len = lengths[i];
164+
best_end_pos = end_pos;
165+
166+
/* Free and break */
167+
for (int j = 0; j < num_next; j++) {
168+
free(next_prefixes[j]);
169+
}
170+
free(next_prefixes);
171+
free(next_lengths);
172+
break;
173+
}
174+
175+
free(next_prefixes);
176+
free(next_lengths);
177+
}
178+
}
179+
}
180+
}
181+
182+
/* Free lookahead results */
183+
for (int j = 0; j < num_best_next; j++) {
184+
free(best_next_prefixes[j]);
185+
}
186+
free(best_next_prefixes);
187+
free(best_next_lengths);
188+
}
189+
135190
/* Free prefix results */
136191
for (int i = 0; i < num_prefixes; i++) {
137192
free(prefixes[i]);

0 commit comments

Comments
 (0)