@@ -118,5 +118,117 @@ public void TestNewMMInvalidEngine()
118118 string text = "ประเทศไทย" ;
119119 WordTokenizer . WordTokenize ( text , engine : "invalid" ) ;
120120 }
121+
[TestMethod]
public void TestNewMMComprehensive()
{
    // Comprehensive newmm word-tokenizer test ported from PyThaiNLP's test suite.
    // Covers: null/empty input, plain Thai sentences, numeric patterns (dots,
    // decimals, IP addresses, currency), whitespace handling, punctuation,
    // and mixed Thai/punctuation inputs.

    // Null input yields an empty token list rather than throwing.
    var result1 = NewMM.Segment(null);
    CollectionAssert.AreEqual(new List<string>(), result1);

    // Empty string likewise yields an empty token list.
    var result2 = NewMM.Segment("");
    CollectionAssert.AreEqual(new List<string>(), result2);

    // Basic Thai sentence segmentation.
    var result3 = WordTokenizer.WordTokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย" },
        result3
    );

    // Numeric patterns - trailing ellipsis is split off from the number.
    var result4 = WordTokenizer.WordTokenize("19...", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "19", "..." },
        result4
    );

    // Numeric patterns - a lone trailing dot is split off.
    var result5 = WordTokenizer.WordTokenize("19.", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "19", "." },
        result5
    );

    // Numeric patterns - a decimal number stays a single token.
    var result6 = WordTokenizer.WordTokenize("19.84", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "19.84" },
        result6
    );

    // Numeric patterns - an IP address stays a single token.
    var result7 = WordTokenizer.WordTokenize("127.0.0.1", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "127.0.0.1" },
        result7
    );

    // Numeric patterns - currency amount with thousands separator stays intact,
    // while the leading currency code is split off.
    var result8 = WordTokenizer.WordTokenize("USD1,984.42", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "USD", "1,984.42" },
        result8
    );

    // keepWhitespace: true keeps the space as its own token.
    var result9 = WordTokenizer.WordTokenize(
        "สวัสดีครับ สบายดีไหมครับ",
        engine: "newmm",
        keepWhitespace: true
    );
    CollectionAssert.AreEqual(
        new List<string> { "สวัสดี", "ครับ", " ", "สบายดี", "ไหม", "ครับ" },
        result9
    );

    // Thai text containing an uncommon word ("จุ๋ม").
    var result10 = WordTokenizer.WordTokenize("จุ๋มง่วงนอนยัง", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "จุ๋ม", "ง่วงนอน", "ยัง" },
        result10
    );

    // Shorter variant of the same phrase.
    var result11 = WordTokenizer.WordTokenize("จุ๋มง่วง", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "จุ๋ม", "ง่วง" },
        result11
    );

    // keepWhitespace: false drops the space between the two words.
    var result12 = WordTokenizer.WordTokenize("จุ๋ม ง่วง", engine: "newmm", keepWhitespace: false);
    CollectionAssert.AreEqual(
        new List<string> { "จุ๋ม", "ง่วง" },
        result12
    );

    // Whitespace tokens must be absent when keepWhitespace is false.
    // The input must actually contain a space for this assertion to be
    // meaningful (the previous version passed "จุ๋มง่วง", which has no
    // whitespace, so DoesNotContain could never fail). The engine is also
    // pinned to "newmm" for consistency with every other call in this test.
    var result13 = WordTokenizer.WordTokenize("จุ๋ม ง่วง", engine: "newmm", keepWhitespace: false);
    CollectionAssert.DoesNotContain(result13, " ");

    // Parentheses are split into their own tokens.
    var result14 = WordTokenizer.WordTokenize("(คนไม่เอา)", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "(", "คน", "ไม่", "เอา", ")" },
        result14
    );

    // A slash separating two words becomes its own token.
    var result15 = WordTokenizer.WordTokenize("กม/ชม", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "กม", "/", "ชม" },
        result15
    );

    // Mixed Thai word followed by a parenthesized word.
    var result16 = WordTokenizer.WordTokenize("สีหน้า(รถ)", engine: "newmm");
    CollectionAssert.AreEqual(
        new List<string> { "สีหน้า", "(", "รถ", ")" },
        result16
    );
}
121233 }
122234}
0 commit comments