@@ -93,14 +93,23 @@ int _findNextBoundaryNormal() {
93
93
if ( this . _current == this . _size ) {
94
94
return - 1 ;
95
95
}
96
-
96
+
97
97
WordSeparate . characterType preType = WordSeparate . classifyChar ( this . _text , this . _current + this . _offset ) ;
98
+ bool preBoundaryChar = isBoundaryChar ( this . _text [ this . _current + this . _offset ] ) ;
98
99
this . _current ++ ;
100
+ if ( preBoundaryChar ) {
101
+ return this . _current ;
102
+ }
103
+
99
104
for ( ; this . _current < this . _size ; ++ this . _current ) {
100
105
this . nextUntilCodePoint ( ) ;
101
106
if ( this . _current >= this . _size ) {
102
107
break ;
103
108
}
109
+
110
+ if ( isBoundaryChar ( this . _text [ this . _current + this . _offset ] ) ) {
111
+ break ;
112
+ }
104
113
var currentType = WordSeparate . classifyChar ( this . _text , this . _current + this . _offset ) ;
105
114
if ( ( currentType == WordSeparate . characterType . WhiteSpace )
106
115
!= ( preType == WordSeparate . characterType . WhiteSpace ) ) {
@@ -152,6 +161,22 @@ public static uint getSupplementary(uint lead, uint trail) {
152
161
return ( char ) ( ( ( uint ) ( lead ) << 10 ) + ( uint ) ( trail - U16_SURROGATE_OFFSET ) ) ;
153
162
}
154
163
164
/// <summary>
/// Reports whether a single UTF-16 code unit is treated as a boundary
/// character, i.e. one that is split off on its own instead of being
/// merged with its neighbours.
/// </summary>
/// <param name="code">The UTF-16 code unit to classify.</param>
/// <returns>
/// <c>true</c> when <paramref name="code"/> is punctuation according to
/// <c>char.IsPunctuation</c>, lies in 0x3040-0x30FF (Hiragana or Katakana),
/// or lies in 0x4E00-0x9FFF (CJK Unified Ideographs); otherwise <c>false</c>.
/// </returns>
public static bool isBoundaryChar(char code) {
    // Hiragana (0x3040-0x309F) and Katakana (0x30A0-0x30FF) are adjacent,
    // so one contiguous range covers both.
    bool isKana = code >= 0x3040 && code <= 0x30FF;

    // CJK Unified Ideographs: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
    bool isCjkIdeograph = code >= 0x4E00 && code <= 0x9FFF;

    return isKana || isCjkIdeograph || char.IsPunctuation(code);
}
179
+
155
180
void nextUntilCodePoint ( ) {
156
181
while ( this . _current < this . _size
157
182
&& ( char . IsLowSurrogate ( this . _text [ this . _current + this . _offset ] )
0 commit comments