Skip to content

Commit cd9a3e3

Browse files
authored
Use range iteration in segmenter datagen (#6430)
Followup from #6367
1 parent 0dc7027 commit cd9a3e3

File tree

1 file changed

+60
-79
lines changed
  • provider/source/src/segmenter

1 file changed

+60
-79
lines changed

provider/source/src/segmenter/mod.rs

Lines changed: 60 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ use icu::segmenter::options::WordType;
2020
use icu::segmenter::provider::*;
2121
use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData};
2222
use icu_provider::prelude::*;
23+
use std::cmp;
2324
use std::collections::HashSet;
2425
use std::fmt::Debug;
26+
use std::ops::RangeInclusive;
2527
use std::sync::OnceLock;
2628
use zerovec::ZeroVec;
2729

@@ -85,6 +87,16 @@ struct SegmenterRuleTable {
8587
rules: Vec<SegmenterState>,
8688
}
8789

90+
/// Fill `dst` at range `r` with `value`, ignoring any out of bounds ranges.
///
/// The range is interpreted as inclusive indices into `dst`:
/// - the part of `r` that lies past the end of `dst` is silently dropped
///   (the end is clamped to the last valid index);
/// - if `r` starts past the end of `dst`, or `dst` is empty, nothing is written;
/// - if `r` is empty (for example an inverted range such as `5..=3`), nothing
///   is written.
///
/// This function never panics.
fn fill_bounded(dst: &mut [u8], r: RangeInclusive<u32>, value: u8) {
    let start = *r.start() as usize;
    // Bail out before any arithmetic or slicing that could panic: an empty
    // range would produce `start > end` below, and an empty `dst` would make
    // `dst.len() - 1` underflow.
    if r.is_empty() || start >= dst.len() {
        return;
    }
    // `start < dst.len()` guarantees `dst` is non-empty, so the subtraction
    // is safe, and `r` being non-empty guarantees `start <= end` after clamping.
    let end = cmp::min(*r.end() as usize, dst.len() - 1);
    dst[start..=end].fill(value);
}
99+
88100
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
89101
fn generate_rule_break_data(
90102
provider: &SourceDataProvider,
@@ -214,33 +226,23 @@ fn generate_rule_break_data(
214226
match &*segmenter.segmenter_type {
215227
"word" => {
216228
// Extended_Pictographic isn't a part of word break property
217-
// Extended pictographic property is within 0..U+0x20000
218229
if p.name == "Extended_Pictographic" {
219-
for i in 0..0x20000 {
220-
if let Some(c) = char::from_u32(i) {
221-
if extended_pictographic.contains(c) {
222-
properties_map[c as usize] = property_index
223-
}
224-
}
230+
for range in extended_pictographic.iter_ranges() {
231+
fill_bounded(&mut properties_map, range, property_index);
225232
}
226233
continue;
227234
}
228235

229236
if p.name == "SA" {
230237
// Word break property doesn't define SA, but we will use non-UAX29 rules.
231-
// SA/CJ property is within 0..U+0x40000
232-
for c in 0..0x40000 {
233-
if lb.get32(c) == LineBreak::ComplexContext {
234-
properties_map[c as usize] = property_index
235-
} else if let Some(c) = char::from_u32(c) {
236-
match script.get(c) {
237-
Script::Han | Script::Hiragana => {
238-
properties_map[c as usize] = property_index;
239-
}
240-
241-
_ => {}
242-
}
243-
}
238+
for range in script.iter_ranges_for_value(Script::Han) {
239+
fill_bounded(&mut properties_map, range, property_index);
240+
}
241+
for range in script.iter_ranges_for_value(Script::Hiragana) {
242+
fill_bounded(&mut properties_map, range, property_index);
243+
}
244+
for range in lb.iter_ranges_for_value(LineBreak::ComplexContext) {
245+
fill_bounded(&mut properties_map, range, property_index);
244246
}
245247
continue;
246248
}
@@ -252,82 +254,67 @@ fn generate_rule_break_data(
252254
let prop = wb_name_to_enum
253255
.get_loose(&p.name)
254256
.expect("property name should be valid!");
255-
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
256-
if wb.get32(c) == prop {
257+
for range in wb.iter_ranges_for_value(prop) {
258+
if prop == WordBreak::MidLetter
259+
&& (range.contains(&0x003a)
260+
|| range.contains(&0xfe55)
261+
|| range.contains(&0xff1a))
262+
{
257263
// UAX29 defines the colon as MidLetter, but ICU4C's
258264
// English data doesn't.
259265
// See https://unicode-org.atlassian.net/browse/ICU-22112
260266
//
261267
// TODO: We have to consider this definition from CLDR instead.
262-
if (c == 0x003a || c == 0xfe55 || c == 0xff1a) && p.name == "MidLetter"
268+
for ch in
269+
range.filter(|ch| *ch != 0x003a && *ch != 0xfe55 && *ch != 0xff1a)
263270
{
264-
// Default (en etc) is undefined class.
265-
continue;
271+
properties_map[ch as usize] = property_index;
266272
}
267-
268-
properties_map[c as usize] = property_index;
273+
} else {
274+
fill_bounded(&mut properties_map, range, property_index);
269275
}
270276
}
277+
271278
continue;
272279
}
273280

274281
"grapheme" => {
275282
// Extended_Pictographic isn't a part of grapheme break property
276-
// Extended pictographic property is within 0..U+0x20000
277283
if p.name == "Extended_Pictographic" {
278-
for i in 0..0x20000 {
279-
if let Some(c) = char::from_u32(i) {
280-
if extended_pictographic.contains(c) {
281-
properties_map[c as usize] = property_index
282-
}
283-
}
284-
}
285-
continue;
286-
}
287-
288-
if p.name == "InCBConsonant" {
289-
for i in 0..(CODEPOINT_TABLE_LEN as u32) {
290-
if let Some(c) = char::from_u32(i) {
291-
if incb.get(c) == IndicConjunctBreak::Consonant {
292-
properties_map[c as usize] = property_index;
293-
}
294-
}
284+
for range in extended_pictographic.iter_ranges() {
285+
fill_bounded(&mut properties_map, range, property_index);
295286
}
296287
continue;
297288
}
298289

299-
if p.name == "InCBLinker" {
300-
for i in 0..(CODEPOINT_TABLE_LEN as u32) {
301-
if let Some(c) = char::from_u32(i) {
302-
if incb.get(c) == IndicConjunctBreak::Linker {
303-
properties_map[c as usize] = property_index;
290+
let relevant_incb = match &*p.name {
291+
"InCBConsonant" => Some(IndicConjunctBreak::Consonant),
292+
"InCBLinker" => Some(IndicConjunctBreak::Linker),
293+
"InCBExtend" => Some(IndicConjunctBreak::Extend),
294+
_ => None,
295+
};
296+
297+
if let Some(relevant_incb) = relevant_incb {
298+
for range in incb.iter_ranges_for_value(relevant_incb) {
299+
if range.contains(&0x200D) {
300+
// ZWJ is handled as a separate rule
301+
for ch in range.filter(|ch| *ch != 0x200D) {
302+
properties_map[ch as usize] = property_index;
304303
}
304+
} else {
305+
fill_bounded(&mut properties_map, range, property_index);
305306
}
306307
}
307-
continue;
308-
}
309308

310-
if p.name == "InCBExtend" {
311-
for i in 0..(CODEPOINT_TABLE_LEN as u32) {
312-
if let Some(c) = char::from_u32(i) {
313-
// ZWJ is handled as another rules.
314-
if incb.get(c) == IndicConjunctBreak::Extend
315-
&& gb.get32(i) != GraphemeClusterBreak::ZWJ
316-
{
317-
properties_map[c as usize] = property_index;
318-
}
319-
}
320-
}
321309
continue;
322310
}
323311

324312
let prop = gcb_name_to_enum
325313
.get_loose(&p.name)
326314
.expect("property name should be valid!");
327-
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
328-
if gb.get32(c) == prop {
329-
properties_map[c as usize] = property_index;
330-
}
315+
316+
for range in gb.iter_ranges_for_value(prop) {
317+
fill_bounded(&mut properties_map, range, property_index);
331318
}
332319
continue;
333320
}
@@ -336,10 +323,8 @@ fn generate_rule_break_data(
336323
let prop = sb_name_to_enum
337324
.get_loose(&p.name)
338325
.expect("property name should be valid!");
339-
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
340-
if sb.get32(c) == prop {
341-
properties_map[c as usize] = property_index;
342-
}
326+
for range in sb.iter_ranges_for_value(prop) {
327+
fill_bounded(&mut properties_map, range, property_index);
343328
}
344329
continue;
345330
}
@@ -435,21 +420,17 @@ fn generate_rule_break_data(
435420
let prop = lb_name_to_enum
436421
.get_loose(&p.name)
437422
.expect("property name should be valid!");
438-
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
439-
if lb.get32(c) == prop {
440-
properties_map[c as usize] = property_index;
441-
}
423+
for range in lb.iter_ranges_for_value(prop) {
424+
fill_bounded(&mut properties_map, range, property_index);
442425
}
443426

444427
if p.name == "AL" {
445428
// LB1: SG has no special rules.
446429
let prop = lb_name_to_enum
447430
.get_loose("SG")
448431
.expect("property name should be valid!");
449-
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
450-
if lb.get32(c) == prop {
451-
properties_map[c as usize] = property_index;
452-
}
432+
for range in lb.iter_ranges_for_value(prop) {
433+
fill_bounded(&mut properties_map, range, property_index);
453434
}
454435
}
455436
continue;

0 commit comments

Comments
 (0)