Skip to content

Commit 6e856b6

Browse files
Copilot and wannaphong committed
Add comprehensive newmm tests and fix tokenizer edge cases
Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com>
1 parent b2fe1a3 commit 6e856b6

File tree

3 files changed

+127
-0
lines changed

3 files changed

+127
-0
lines changed

ThaiNLPTest/NewMMTest.cs

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,5 +118,117 @@ public void TestNewMMInvalidEngine()
118118
string text = "ประเทศไทย";
119119
WordTokenizer.WordTokenize(text, engine: "invalid");
120120
}
121+
122+
/// <summary>
/// End-to-end checks of the newmm tokenizer using cases taken from PyThaiNLP:
/// null/empty input, plain Thai sentences, numeric patterns, whitespace
/// handling, and punctuation (parentheses, slash).
/// </summary>
[TestMethod]
public void TestNewMMComprehensive()
{
    // --- Degenerate input: null and empty string both give an empty token list ---
    var tokensForNull = NewMM.Segment(null);
    CollectionAssert.AreEqual(new List<string>(), tokensForNull);

    var tokensForEmpty = NewMM.Segment("");
    CollectionAssert.AreEqual(new List<string>(), tokensForEmpty);

    // --- Plain Thai sentence ---
    var actualSentence = WordTokenizer.WordTokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine: "newmm");
    var expectedSentence = new List<string> { "ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย" };
    CollectionAssert.AreEqual(expectedSentence, actualSentence);

    // --- Numeric patterns ---
    // Trailing ellipsis is split off from the number.
    var actualEllipsis = WordTokenizer.WordTokenize("19...", engine: "newmm");
    var expectedEllipsis = new List<string> { "19", "..." };
    CollectionAssert.AreEqual(expectedEllipsis, actualEllipsis);

    // Trailing single dot is split off from the number.
    var actualTrailingDot = WordTokenizer.WordTokenize("19.", engine: "newmm");
    var expectedTrailingDot = new List<string> { "19", "." };
    CollectionAssert.AreEqual(expectedTrailingDot, actualTrailingDot);

    // A decimal number stays in one token.
    var actualDecimal = WordTokenizer.WordTokenize("19.84", engine: "newmm");
    var expectedDecimal = new List<string> { "19.84" };
    CollectionAssert.AreEqual(expectedDecimal, actualDecimal);

    // A dotted IP address stays in one token.
    var actualIpAddress = WordTokenizer.WordTokenize("127.0.0.1", engine: "newmm");
    var expectedIpAddress = new List<string> { "127.0.0.1" };
    CollectionAssert.AreEqual(expectedIpAddress, actualIpAddress);

    // Currency code is separated from the amount; the amount keeps its separators.
    var actualCurrency = WordTokenizer.WordTokenize("USD1,984.42", engine: "newmm");
    var expectedCurrency = new List<string> { "USD", "1,984.42" };
    CollectionAssert.AreEqual(expectedCurrency, actualCurrency);

    // --- Whitespace: keepWhitespace=true yields the space as its own token ---
    var actualWithSpace = WordTokenizer.WordTokenize(
        "สวัสดีครับ สบายดีไหมครับ",
        engine: "newmm",
        keepWhitespace: true);
    var expectedWithSpace = new List<string> { "สวัสดี", "ครับ", " ", "สบายดี", "ไหม", "ครับ" };
    CollectionAssert.AreEqual(expectedWithSpace, actualWithSpace);

    // --- Thai text containing a less common word ---
    var actualUncommon = WordTokenizer.WordTokenize("จุ๋มง่วงนอนยัง", engine: "newmm");
    var expectedUncommon = new List<string> { "จุ๋ม", "ง่วงนอน", "ยัง" };
    CollectionAssert.AreEqual(expectedUncommon, actualUncommon);

    var actualShort = WordTokenizer.WordTokenize("จุ๋มง่วง", engine: "newmm");
    var expectedShort = new List<string> { "จุ๋ม", "ง่วง" };
    CollectionAssert.AreEqual(expectedShort, actualShort);

    // --- Whitespace: keepWhitespace=false drops the space between the words ---
    var actualSpaceDropped = WordTokenizer.WordTokenize("จุ๋ม ง่วง", engine: "newmm", keepWhitespace: false);
    var expectedSpaceDropped = new List<string> { "จุ๋ม", "ง่วง" };
    CollectionAssert.AreEqual(expectedSpaceDropped, actualSpaceDropped);

    // Default engine (no engine argument) with keepWhitespace=false: the input
    // itself has no space, so this only checks that no " " token is produced.
    var actualNoSpaceInput = WordTokenizer.WordTokenize("จุ๋มง่วง", keepWhitespace: false);
    CollectionAssert.DoesNotContain(actualNoSpaceInput, " ");

    // --- Punctuation ---
    // Parentheses become separate tokens around the Thai words.
    var actualParens = WordTokenizer.WordTokenize("(คนไม่เอา)", engine: "newmm");
    var expectedParens = new List<string> { "(", "คน", "ไม่", "เอา", ")" };
    CollectionAssert.AreEqual(expectedParens, actualParens);

    // Slash becomes a separate token between the two abbreviations.
    var actualSlash = WordTokenizer.WordTokenize("กม/ชม", engine: "newmm");
    var expectedSlash = new List<string> { "กม", "/", "ชม" };
    CollectionAssert.AreEqual(expectedSlash, actualSlash);

    // Thai word immediately followed by a parenthesised word.
    var actualWordThenParens = WordTokenizer.WordTokenize("สีหน้า(รถ)", engine: "newmm");
    var expectedWordThenParens = new List<string> { "สีหน้า", "(", "รถ", ")" };
    CollectionAssert.AreEqual(expectedWordThenParens, actualWordThenParens);
}
121233
}
122234
}

thainlp/NewMM.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,20 @@ private static IEnumerable<string> OneCut(string text, Trie customDict)
175175
posList.Add(endPos);
176176
}
177177
}
178+
179+
// Output remaining tokens from endPos to textLength
180+
if (endPos < textLength)
181+
{
182+
var paths = BfsPathsGraph(graph, endPos, textLength).FirstOrDefault();
183+
if (paths != null)
184+
{
185+
for (int i = 1; i < paths.Count; i++)
186+
{
187+
yield return text.Substring(endPos, paths[i] - endPos);
188+
endPos = paths[i];
189+
}
190+
}
191+
}
178192
}
179193

180194
/// <summary>

thainlp/words_th.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13623,6 +13623,7 @@
1362313623
จุบ
1362413624
จุ๊บแจง
1362513625
จุปาก
13626+
จุ๋ม
1362613627
จุ๋มจิ๋ม
1362713628
จุ่ม
1362813629
จุ้ม

0 commit comments

Comments
 (0)