Skip to content

Commit 1f33101

Browse files
committed
ICU-22789 Add Segmenter API to conveniently wrap BreakIterator in ICU4J
See #3237
1 parent 810b94c commit 1f33101

15 files changed

+1329
-3
lines changed
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// © 2025 and later: Unicode, Inc. and others.
2+
// License & terms of use: https://www.unicode.org/copyright.html
3+
4+
package com.ibm.icu.segmenter;
5+
6+
import com.ibm.icu.text.BreakIterator;
7+
import com.ibm.icu.segmenter.Segments.IterationDirection;
8+
9+
/**
10+
* An iterator of segmentation boundaries that can operate in either the forwards or reverse
11+
* direction.
12+
*
13+
* <p>When constructed to operate in the forwards direction, the iterator will return all boundaries
14+
* that are strictly after the input index value provided to the constructor. However, when
15+
* constructed to operate in the backwards direction, if the input index is already a segmentation
16+
* boundary, then it will be included as the first value that the iterator returns as it iterates
17+
* backwards.
18+
*/
19+
class BoundaryIteratorOfInts {
20+
private BreakIterator breakIter;
21+
private IterationDirection direction;
22+
private int currIdx;
23+
24+
BoundaryIteratorOfInts(BreakIterator breakIter, CharSequence sourceSequence, IterationDirection direction, int startIdx) {
25+
this.breakIter = breakIter;
26+
this.direction = direction;
27+
28+
if (direction == IterationDirection.FORWARDS) {
29+
currIdx = breakIter.following(startIdx);
30+
} else {
31+
assert direction == IterationDirection.BACKWARDS;
32+
33+
// When iterating backwards over boundaries, adjust the initial index to be the boundary
34+
// that is either startIdx or else the one right before startIdx.
35+
//
36+
// Note: we have to set the initial index indirectly because there is no way to statelessly
37+
// query whether an index is on a boundary. Instead, BreakIterator.isBoundary() will mutate
38+
// state when the input is not on a boundary, before it returns the value indicating a
39+
// boundary.
40+
int sourceLength = sourceSequence.length();
41+
if (startIdx == 0) {
42+
currIdx = breakIter.first();
43+
} else if (startIdx == sourceLength) {
44+
currIdx = breakIter.last();
45+
} else {
46+
boolean isOnBoundary =
47+
0 <= startIdx
48+
&& startIdx <= sourceLength
49+
&& breakIter.isBoundary(startIdx);
50+
51+
// The previous call to BreakIterator.isBoundary(startIdx) will have advanced breakIter's
52+
// current position forwards to the next boundary if the argument, startIdx, is not a
53+
// boundary. Therefore, in that case, we have to move back to the previous boundary.
54+
//
55+
// BreakIterator.isBoundary(startIdx) should have cached the surrounding 2 boundaries in the
56+
// BreakIterator, which means that BreakIterator.preceding(startIdx) shouldn't cost
57+
// significant extra time.
58+
//
59+
// BreakIterator.preceding(startIdx) is used in initialization instead of a simple call to
60+
// BreakIterator.previous() since BreakIterator.preceding() can accept arguments larger than
61+
// the last boundary and return the last boundary, whereas .previous() would return DONE.
62+
// Thus, .preceding() provides symmetrical behavior to .following(), which we use in the
63+
// forwards direction.
64+
currIdx = isOnBoundary ? startIdx : breakIter.preceding(startIdx);
65+
}
66+
}
67+
}
68+
69+
public boolean hasNext() {
70+
return currIdx != BreakIterator.DONE;
71+
}
72+
73+
public Integer next() {
74+
int result = currIdx;
75+
76+
if (direction == IterationDirection.FORWARDS) {
77+
currIdx = breakIter.next();
78+
} else {
79+
assert direction == IterationDirection.BACKWARDS;
80+
currIdx = breakIter.previous();
81+
}
82+
83+
return result;
84+
}
85+
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// © 2025 and later: Unicode, Inc. and others.
2+
// License & terms of use: https://www.unicode.org/copyright.html
3+
4+
package com.ibm.icu.segmenter;
5+
6+
import com.ibm.icu.text.BreakIterator;
7+
import com.ibm.icu.segmenter.Segments.IterationDirection;
8+
import java.util.Spliterator;
9+
import java.util.function.IntConsumer;
10+
11+
class BoundarySpliterator implements Spliterator.OfInt {
12+
13+
private final BoundaryIteratorOfInts iter;
14+
15+
BoundarySpliterator(BreakIterator breakIter, CharSequence sourceSequence, IterationDirection direction, int startIdx) {
16+
iter = new BoundaryIteratorOfInts(breakIter, sourceSequence, direction, startIdx);
17+
}
18+
19+
@Override
20+
public OfInt trySplit() {
21+
// The elements of the Stream represent an iteration through a string, and is thus inherently
22+
// stateful. Therefore, splitting this Stream does not make sense. Ex: splitting the Stream
23+
// is tantamount to discarding the segment subtended by the end value (index into the input
24+
// string) of one substream and the beginning value of the next substream.
25+
return null;
26+
}
27+
28+
@Override
29+
public long estimateSize() {
30+
// The number of segments per input size depends on language, script, and
31+
// the content of the input string, and thus is hard to estimate without
32+
// sacrificing performance. Thus, returning `Long.MAX_VALUE`, according
33+
// to the API, to mean "unknown, or too expensive to compute".
34+
return Long.MAX_VALUE;
35+
}
36+
37+
@Override
38+
public int characteristics() {
39+
return
40+
// BreakIterator always advances
41+
Spliterator.DISTINCT
42+
// The design of the Segmenter API is to provide an immutable view of
43+
// segmentation by preventing the input string from mutating
44+
// in the underlying BreakIterator.
45+
| Spliterator.IMMUTABLE
46+
// primitive int is non-null
47+
| Spliterator.NONNULL
48+
// BreakIterator always advances, and in a single direction.
49+
| Spliterator.ORDERED;
50+
}
51+
52+
@Override
53+
public boolean tryAdvance(IntConsumer action) {
54+
if (action == null) {
55+
throw new NullPointerException();
56+
}
57+
if (iter.hasNext()) {
58+
action.accept(iter.next());
59+
return true;
60+
} else {
61+
return false;
62+
}
63+
}
64+
}
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
// © 2025 and later: Unicode, Inc. and others.
2+
// License & terms of use: https://www.unicode.org/copyright.html
3+
4+
package com.ibm.icu.segmenter;
5+
6+
import com.ibm.icu.text.BreakIterator;
7+
import com.ibm.icu.util.ULocale;
8+
import java.util.Locale;
9+
import java.util.stream.IntStream;
10+
import java.util.stream.Stream;
11+
12+
/**
13+
* Performs segmentation according to the rules defined for the locale.
14+
*/
15+
public class LocalizedSegmenter implements Segmenter {
16+
17+
private BreakIterator breakIterPrototype;
18+
19+
/**
20+
* Returns a {@link Segments} object that encapsulates the segmentation of the input
21+
* {@code CharSequence}. The {@code Segments} object, in turn, provides the main APIs to support
22+
* traversal over the resulting segments and boundaries via the Java {@code Stream} abstraction.
23+
* @param s input {@code CharSequence} on which segmentation is performed. The input must not be
24+
* modified while using the resulting {@code Segments} object.
25+
* @return A {@code Segments} object with APIs to access the results of segmentation, including
26+
* APIs that return {@code Stream}s of the segments and boundaries.
27+
* @draft ICU 78
28+
*/
29+
@Override
30+
public Segments segment(CharSequence s) {
31+
return new SegmentsImpl(breakIterPrototype, s);
32+
}
33+
34+
/**
35+
* @return a builder for constructing {@code LocalizedSegmenter}
36+
* @draft ICU 78
37+
*/
38+
public static Builder builder() {
39+
return new Builder();
40+
}
41+
42+
private LocalizedSegmenter(ULocale locale, SegmentationType segmentationType) {
43+
switch (segmentationType) {
44+
case LINE:
45+
breakIterPrototype = BreakIterator.getLineInstance(locale);
46+
break;
47+
case SENTENCE:
48+
breakIterPrototype = BreakIterator.getSentenceInstance(locale);
49+
break;
50+
case WORD:
51+
breakIterPrototype = BreakIterator.getWordInstance(locale);
52+
break;
53+
case GRAPHEME_CLUSTER:
54+
breakIterPrototype = BreakIterator.getCharacterInstance(locale);
55+
break;
56+
}
57+
}
58+
59+
/**
60+
* The type of segmentation to be performed. See the ICU User Guide page
61+
* <a
62+
* href="https://unicode-org.github.io/icu/userguide/boundaryanalysis/#four-types-of-breakiterator">Boundary Analysis</a>
63+
* for further details.
64+
* @draft ICU 78
65+
*/
66+
public enum SegmentationType {
67+
GRAPHEME_CLUSTER,
68+
WORD,
69+
LINE,
70+
SENTENCE,
71+
}
72+
73+
/**
74+
* Builder for {@link LocalizedSegmenter}
75+
* @draft ICU 78
76+
*/
77+
public static class Builder {
78+
79+
private ULocale locale = ULocale.ROOT;
80+
81+
private SegmentationType segmentationType = null;
82+
83+
private Builder() { }
84+
85+
/**
86+
* Set the locale for which segmentation rules will be loaded
87+
* @param locale an ICU locale object
88+
* @draft ICU 78
89+
*/
90+
public Builder setLocale(ULocale locale) {
91+
if (locale == null) {
92+
throw new IllegalArgumentException("locale cannot be set to null.");
93+
}
94+
this.locale = locale;
95+
return this;
96+
}
97+
98+
/**
99+
* Set the locale for which segmentation rules will be loaded
100+
* @param locale a Java locale object
101+
* @draft ICU 78
102+
*/
103+
public Builder setLocale(Locale locale) {
104+
if (locale == null) {
105+
throw new IllegalArgumentException("locale cannot be set to null.");
106+
}
107+
this.locale = ULocale.forLocale(locale);
108+
return this;
109+
}
110+
111+
/**
112+
* Set the segmentation type to be performed.
113+
* @param segmentationType
114+
* @draft ICU 78
115+
*/
116+
public Builder setSegmentationType(SegmentationType segmentationType) {
117+
if (segmentationType == null) {
118+
throw new IllegalArgumentException("segmentationType cannot be set to null.");
119+
}
120+
this.segmentationType = segmentationType;
121+
return this;
122+
}
123+
124+
/**
125+
* Builds the {@code Segmenter}
126+
* @return the constructed {@code Segmenter} instance
127+
* @draft ICU 78
128+
*/
129+
public Segmenter build() {
130+
if (segmentationType == null) {
131+
throw new IllegalArgumentException("segmentationType is null and must be set to a specific value.");
132+
}
133+
return new LocalizedSegmenter(locale, segmentationType);
134+
}
135+
136+
}
137+
}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
// © 2025 and later: Unicode, Inc. and others.
2+
// License & terms of use: https://www.unicode.org/copyright.html
3+
4+
package com.ibm.icu.segmenter;
5+
6+
import com.ibm.icu.text.BreakIterator;
7+
import com.ibm.icu.text.RuleBasedBreakIterator;
8+
import java.io.InputStream;
9+
import java.util.stream.IntStream;
10+
import java.util.stream.Stream;
11+
12+
/**
13+
* Performs segmentation according to the provided rule string. The rule string must follow the
14+
* same guidelines as for {@link RuleBasedBreakIterator#RuleBasedBreakIterator(String)}.
15+
* @draft ICU 78
16+
*/
17+
public class RuleBasedSegmenter implements Segmenter {
18+
19+
private final BreakIterator breakIterPrototype;
20+
21+
/**
22+
* Returns a {@link Segments} object that encapsulates the segmentation of the input
23+
* {@code CharSequence}. The {@code Segments} object, in turn, provides the main APIs to support
24+
* traversal over the resulting segments and boundaries via the Java {@code Stream} abstraction.
25+
* @param s input {@code CharSequence} on which segmentation is performed. The input must not be
26+
* modified while using the resulting {@code Segments} object.
27+
* @return A {@code Segments} object with APIs to access the results of segmentation, including
28+
* APIs that return {@code Stream}s of the segments and boundaries.
29+
* @draft ICU 78
30+
*/
31+
@Override
32+
public Segments segment(CharSequence s) {
33+
return new SegmentsImpl(breakIterPrototype, s);
34+
}
35+
36+
/**
37+
* @return a builder for constructing {@code RuleBasedSegmenter}
38+
* @draft ICU 78
39+
*/
40+
public static Builder builder() {
41+
return new Builder();
42+
}
43+
44+
private RuleBasedSegmenter(BreakIterator breakIter) {
45+
breakIterPrototype = breakIter;
46+
}
47+
48+
/**
49+
* Builder for {@link RuleBasedSegmenter}
50+
* @draft ICU 78
51+
*/
52+
public static class Builder {
53+
54+
private BreakIterator breakIter = null;
55+
56+
private Builder() { }
57+
58+
/**
59+
* Sets the rule string for segmentation.
60+
* @param rules rule string. The rule string must follow the same guidelines as for
61+
* {@link RuleBasedBreakIterator#getInstanceFromCompiledRules(InputStream)}.
62+
* @draft ICU 78
63+
*/
64+
public Builder setRules(String rules) {
65+
if (rules == null) {
66+
throw new IllegalArgumentException("rules cannot be set to null.");
67+
}
68+
try {
69+
breakIter = new RuleBasedBreakIterator(rules);
70+
return this;
71+
} catch (RuntimeException rte) {
72+
throw new IllegalArgumentException("The provided rule string is invalid"
73+
+ " or there was an error in creating the RuleBasedSegmenter.", rte);
74+
}
75+
}
76+
77+
/**
78+
* Builds the {@code Segmenter}
79+
* @return the constructed {@code Segmenter} instance
80+
* @draft ICU 78
81+
*/
82+
public Segmenter build() {
83+
if (breakIter == null) {
84+
throw new IllegalArgumentException("A rule string must be set.");
85+
} else {
86+
return new RuleBasedSegmenter(breakIter);
87+
}
88+
}
89+
}
90+
}

0 commit comments

Comments
 (0)