@@ -20,8 +20,10 @@ use icu::segmenter::options::WordType;
20
20
use icu:: segmenter:: provider:: * ;
21
21
use icu_codepointtrie_builder:: { CodePointTrieBuilder , CodePointTrieBuilderData } ;
22
22
use icu_provider:: prelude:: * ;
23
+ use std:: cmp;
23
24
use std:: collections:: HashSet ;
24
25
use std:: fmt:: Debug ;
26
+ use std:: ops:: RangeInclusive ;
25
27
use std:: sync:: OnceLock ;
26
28
use zerovec:: ZeroVec ;
27
29
@@ -85,6 +87,16 @@ struct SegmenterRuleTable {
85
87
rules : Vec < SegmenterState > ,
86
88
}
87
89
90
/// Fill `dst` over the inclusive index range `r` with `value`, clamping the
/// range to the bounds of `dst`.
///
/// Nothing is written (and nothing panics) when:
/// - `r` is empty, i.e. its start is greater than its end or it has already
///   been iterated to exhaustion,
/// - `dst` is empty, or
/// - `r` starts at or beyond `dst.len()`.
///
/// A range that starts in bounds but ends past the last element of `dst` is
/// truncated so that it ends at the last element.
fn fill_bounded(dst: &mut [u8], r: RangeInclusive<u32>, value: u8) {
    if r.is_empty() {
        return;
    }
    // `checked_sub` guards the empty-slice case, where `dst.len() - 1` would
    // underflow before any bounds check could run.
    let Some(last) = dst.len().checked_sub(1) else {
        return;
    };
    let start = *r.start() as usize;
    if start > last {
        return;
    }
    // `r` is non-empty and `start <= last`, so `start <= end` holds here and
    // the slice index below cannot panic.
    let end = cmp::min(*r.end() as usize, last);
    dst[start..=end].fill(value);
}
99
+
88
100
#[ cfg( any( feature = "use_wasm" , feature = "use_icu4c" ) ) ]
89
101
fn generate_rule_break_data (
90
102
provider : & SourceDataProvider ,
@@ -214,33 +226,23 @@ fn generate_rule_break_data(
214
226
match & * segmenter. segmenter_type {
215
227
"word" => {
216
228
// Extended_Pictographic isn't a part of word break property
217
- // Extended pictographic property is within 0..U+0x20000
218
229
if p. name == "Extended_Pictographic" {
219
- for i in 0 ..0x20000 {
220
- if let Some ( c) = char:: from_u32 ( i) {
221
- if extended_pictographic. contains ( c) {
222
- properties_map[ c as usize ] = property_index
223
- }
224
- }
230
+ for range in extended_pictographic. iter_ranges ( ) {
231
+ fill_bounded ( & mut properties_map, range, property_index) ;
225
232
}
226
233
continue ;
227
234
}
228
235
229
236
if p. name == "SA" {
230
237
// Word break property doesn't define SA, but we will use non-UAX29 rules.
231
- // SA/CJ property is within 0..U+0x40000
232
- for c in 0 ..0x40000 {
233
- if lb. get32 ( c) == LineBreak :: ComplexContext {
234
- properties_map[ c as usize ] = property_index
235
- } else if let Some ( c) = char:: from_u32 ( c) {
236
- match script. get ( c) {
237
- Script :: Han | Script :: Hiragana => {
238
- properties_map[ c as usize ] = property_index;
239
- }
240
-
241
- _ => { }
242
- }
243
- }
238
+ for range in script. iter_ranges_for_value ( Script :: Han ) {
239
+ fill_bounded ( & mut properties_map, range, property_index) ;
240
+ }
241
+ for range in script. iter_ranges_for_value ( Script :: Hiragana ) {
242
+ fill_bounded ( & mut properties_map, range, property_index) ;
243
+ }
244
+ for range in lb. iter_ranges_for_value ( LineBreak :: ComplexContext ) {
245
+ fill_bounded ( & mut properties_map, range, property_index) ;
244
246
}
245
247
continue ;
246
248
}
@@ -252,82 +254,67 @@ fn generate_rule_break_data(
252
254
let prop = wb_name_to_enum
253
255
. get_loose ( & p. name )
254
256
. expect ( "property name should be valid!" ) ;
255
- for c in 0 ..( CODEPOINT_TABLE_LEN as u32 ) {
256
- if wb. get32 ( c) == prop {
257
+ for range in wb. iter_ranges_for_value ( prop) {
258
+ if prop == WordBreak :: MidLetter
259
+ && ( range. contains ( & 0x003a )
260
+ || range. contains ( & 0xfe55 )
261
+ || range. contains ( & 0xff1a ) )
262
+ {
257
263
// UAX29 defines the colon as MidLetter, but ICU4C's
258
264
// English data doesn't.
259
265
// See https://unicode-org.atlassian.net/browse/ICU-22112
260
266
//
261
267
// TODO: We have to consider this definition from CLDR instead.
262
- if ( c == 0x003a || c == 0xfe55 || c == 0xff1a ) && p. name == "MidLetter"
268
+ for ch in
269
+ range. filter ( |ch| * ch != 0x003a && * ch != 0xfe55 && * ch != 0xff1a )
263
270
{
264
- // Default (en etc) is undefined class.
265
- continue ;
271
+ properties_map[ ch as usize ] = property_index;
266
272
}
267
-
268
- properties_map[ c as usize ] = property_index;
273
+ } else {
274
+ fill_bounded ( & mut properties_map, range , property_index) ;
269
275
}
270
276
}
277
+
271
278
continue ;
272
279
}
273
280
274
281
"grapheme" => {
275
282
// Extended_Pictographic isn't a part of grapheme break property
276
- // Extended pictographic property is within 0..U+0x20000
277
283
if p. name == "Extended_Pictographic" {
278
- for i in 0 ..0x20000 {
279
- if let Some ( c) = char:: from_u32 ( i) {
280
- if extended_pictographic. contains ( c) {
281
- properties_map[ c as usize ] = property_index
282
- }
283
- }
284
- }
285
- continue ;
286
- }
287
-
288
- if p. name == "InCBConsonant" {
289
- for i in 0 ..( CODEPOINT_TABLE_LEN as u32 ) {
290
- if let Some ( c) = char:: from_u32 ( i) {
291
- if incb. get ( c) == IndicConjunctBreak :: Consonant {
292
- properties_map[ c as usize ] = property_index;
293
- }
294
- }
284
+ for range in extended_pictographic. iter_ranges ( ) {
285
+ fill_bounded ( & mut properties_map, range, property_index) ;
295
286
}
296
287
continue ;
297
288
}
298
289
299
- if p. name == "InCBLinker" {
300
- for i in 0 ..( CODEPOINT_TABLE_LEN as u32 ) {
301
- if let Some ( c) = char:: from_u32 ( i) {
302
- if incb. get ( c) == IndicConjunctBreak :: Linker {
303
- properties_map[ c as usize ] = property_index;
290
+ let relevant_incb = match & * p. name {
291
+ "InCBConsonant" => Some ( IndicConjunctBreak :: Consonant ) ,
292
+ "InCBLinker" => Some ( IndicConjunctBreak :: Linker ) ,
293
+ "InCBExtend" => Some ( IndicConjunctBreak :: Extend ) ,
294
+ _ => None ,
295
+ } ;
296
+
297
+ if let Some ( relevant_incb) = relevant_incb {
298
+ for range in incb. iter_ranges_for_value ( relevant_incb) {
299
+ if range. contains ( & 0x200D ) {
300
+ // ZWJ is handled as a separate rule
301
+ for ch in range. filter ( |ch| * ch != 0x200D ) {
302
+ properties_map[ ch as usize ] = property_index;
304
303
}
304
+ } else {
305
+ fill_bounded ( & mut properties_map, range, property_index) ;
305
306
}
306
307
}
307
- continue ;
308
- }
309
308
310
- if p. name == "InCBExtend" {
311
- for i in 0 ..( CODEPOINT_TABLE_LEN as u32 ) {
312
- if let Some ( c) = char:: from_u32 ( i) {
313
- // ZWJ is handled as another rules.
314
- if incb. get ( c) == IndicConjunctBreak :: Extend
315
- && gb. get32 ( i) != GraphemeClusterBreak :: ZWJ
316
- {
317
- properties_map[ c as usize ] = property_index;
318
- }
319
- }
320
- }
321
309
continue ;
322
310
}
323
311
324
312
let prop = gcb_name_to_enum
325
313
. get_loose ( & p. name )
326
314
. expect ( "property name should be valid!" ) ;
327
- for c in 0 ..( CODEPOINT_TABLE_LEN as u32 ) {
328
- if gb. get32 ( c) == prop {
329
- properties_map[ c as usize ] = property_index;
330
- }
315
+
316
+ for range in gb. iter_ranges_for_value ( prop) {
317
+ fill_bounded ( & mut properties_map, range, property_index) ;
331
318
}
332
319
continue ;
333
320
}
@@ -336,10 +323,8 @@ fn generate_rule_break_data(
336
323
let prop = sb_name_to_enum
337
324
. get_loose ( & p. name )
338
325
. expect ( "property name should be valid!" ) ;
339
- for c in 0 ..( CODEPOINT_TABLE_LEN as u32 ) {
340
- if sb. get32 ( c) == prop {
341
- properties_map[ c as usize ] = property_index;
342
- }
326
+ for range in sb. iter_ranges_for_value ( prop) {
327
+ fill_bounded ( & mut properties_map, range, property_index) ;
343
328
}
344
329
continue ;
345
330
}
@@ -435,21 +420,17 @@ fn generate_rule_break_data(
435
420
let prop = lb_name_to_enum
436
421
. get_loose ( & p. name )
437
422
. expect ( "property name should be valid!" ) ;
438
- for c in 0 ..( CODEPOINT_TABLE_LEN as u32 ) {
439
- if lb. get32 ( c) == prop {
440
- properties_map[ c as usize ] = property_index;
441
- }
423
+ for range in lb. iter_ranges_for_value ( prop) {
424
+ fill_bounded ( & mut properties_map, range, property_index) ;
442
425
}
443
426
444
427
if p. name == "AL" {
445
428
// LB1: SG has no special rules.
446
429
let prop = lb_name_to_enum
447
430
. get_loose ( "SG" )
448
431
. expect ( "property name should be valid!" ) ;
449
- for c in 0 ..( CODEPOINT_TABLE_LEN as u32 ) {
450
- if lb. get32 ( c) == prop {
451
- properties_map[ c as usize ] = property_index;
452
- }
432
+ for range in lb. iter_ranges_for_value ( prop) {
433
+ fill_bounded ( & mut properties_map, range, property_index) ;
453
434
}
454
435
}
455
436
continue ;
0 commit comments