Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions docs/userguide/boundaryanalysis/break-rules.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,39 @@ See, for example, this snippet from the [line break rules](https://github.com/un
$dictionary = [$SA];
```

The status value of dictionary breaks is determined as follows:
* the status value of the final break of the rule-based segment refined by the
dictionary breaks, if the largest status value defined in the rules is greater
than 100 (in that case, the rules are called *word-like*).
* 0 otherwise.

> **Note:** In practice, only word segmentation is word-like.
> The need for a distinct behaviour was realized long after status values were
> introduced. Using the largest status value allows rules that are customized
> versions of the word breaking rules to behave like word segmentation should,
> without needing to introduce a new syntax to select the status of dictionary
> breaks.

> **Example:**
> With the rules
> ```
> $dictionary = [A-Z];
> $ {100};
> [A-Z] [A-Z];
> ```
> The string `ARMAVIRUMQUECANO` has a final break with status 100.
> These rules are not word-like, so if dictionary breaking finds breaks between
> `ARMA`, `VIRUMQUE`, and `CANO`,
> they will have a status value of 0.
> If, however, the following rule is added,
> ```
> . [?] {200};
> ```
> the rules become word-like, and breaks within `ARMAVIRUMQUECANO` all get a
> status value of 100. Any dictionary breaks within
> `QUOUSQUETANDEMABUTERECATILINAPATIENTIANOSTRA?` would get a status value of
> 200.

## Rule Options

| Option | Description |
Expand Down
17 changes: 13 additions & 4 deletions icu4c/source/common/rbbi_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ void RuleBasedBreakIterator::DictionaryCache::reset() {
fFirstRuleStatusIndex = 0;
fOtherRuleStatusIndex = 0;
fBreaks.removeAllElements();
int32_t maxStatus = 0;
for (int32_t i = 0; i < fBI->fData->fStatusMaxIdx; ++i) {
maxStatus = std::max(maxStatus, fBI->fData->fRuleStatusTable[i]);
}
isWordLike = maxStatus > 100;
}

UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
Expand All @@ -60,7 +65,7 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
r = fBreaks.elementAti(fPositionInCache);
U_ASSERT(r > fromPos);
*result = r;
*statusIndex = fOtherRuleStatusIndex;
*statusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0;
return true;
}

Expand All @@ -70,7 +75,7 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
r= fBreaks.elementAti(fPositionInCache);
if (r > fromPos) {
*result = r;
*statusIndex = fOtherRuleStatusIndex;
*statusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0;
return true;
}
}
Expand All @@ -97,7 +102,9 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
r = fBreaks.elementAti(fPositionInCache);
U_ASSERT(r < fromPos);
*result = r;
*statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
*statusIndex = (r == fStart) ? fFirstRuleStatusIndex
: isWordLike ? fOtherRuleStatusIndex
: 0;
return true;
}

Expand All @@ -110,7 +117,9 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
r = fBreaks.elementAti(fPositionInCache);
if (r < fromPos) {
*result = r;
*statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
*statusIndex = (r == fStart) ? fFirstRuleStatusIndex
: isWordLike ? fOtherRuleStatusIndex
: 0;
return true;
}
}
Expand Down
7 changes: 7 additions & 0 deletions icu4c/source/common/rbbi_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ class RuleBasedBreakIterator::DictionaryCache: public UMemory {
// text segment being handled by the dictionary.
int32_t fFirstRuleStatusIndex; // Rule status info for first boundary.
int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.

// If `this->isWordLike`, the status of dictionary breaks is equal to the status of the final
// break of the rule-based segment they refine (fOtherRuleStatusIndex); otherwise, dictionary
// breaks have status 0.
// For compatibility, this property is determined by the largest status value used by `*fBI`:
// rules that have a largest status greater than 100 are considered word-like.
bool isWordLike;
};


Expand Down
3 changes: 3 additions & 0 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1268,6 +1268,9 @@ between •Mae •Hong •Son •and •the •Salween •River, •the •Thano
the •Khun •Tan •Range •(ดอย•ขุน•ตาน), •the •Phi •Pan •Nam •Range •(ทิว•เขา•ผี•ปัน•น้ำ), •as •well •as •the •western •\
part •of •the •Luang •Prabang •Range •(ทิว•เขา•หลวง•พระ•บาง).•</data>

<data>•บทความ•แนะนำ
<100></data>

# Breaking around numbers that begin with a decimal point.
# Bug ICU-12017

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1067,6 +1067,11 @@ void reset() {
fFirstRuleStatusIndex = 0;
fOtherRuleStatusIndex = 0;
fBreaks.removeAllElements();
int maxStatus = 0;
for (int status : fRData.fStatusTable) {
maxStatus = Math.max(status, maxStatus);
}
isWordLike = maxStatus > 100;
}
;

Expand All @@ -1090,7 +1095,7 @@ boolean following(int fromPos) {
r = fBreaks.elementAt(fPositionInCache);
assert (r > fromPos);
fBoundary = r;
fStatusIndex = fOtherRuleStatusIndex;
fStatusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0;
return true;
}

Expand All @@ -1100,7 +1105,7 @@ boolean following(int fromPos) {
r = fBreaks.elementAt(fPositionInCache);
if (r > fromPos) {
fBoundary = r;
fStatusIndex = fOtherRuleStatusIndex;
fStatusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0;
return true;
}
}
Expand Down Expand Up @@ -1133,7 +1138,10 @@ boolean preceding(int fromPos) {
r = fBreaks.elementAt(fPositionInCache);
assert (r < fromPos);
fBoundary = r;
fStatusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
fStatusIndex =
(r == fStart)
? fFirstRuleStatusIndex
: isWordLike ? fOtherRuleStatusIndex : 0;
return true;
}

Expand All @@ -1146,7 +1154,10 @@ boolean preceding(int fromPos) {
r = fBreaks.elementAt(fPositionInCache);
if (r < fromPos) {
fBoundary = r;
fStatusIndex = (r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
fStatusIndex =
(r == fStart)
? fFirstRuleStatusIndex
: isWordLike ? fOtherRuleStatusIndex : 0;
return true;
}
}
Expand Down Expand Up @@ -1287,6 +1298,15 @@ void populateDictionary(
int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.
int fBoundary; // Current boundary. Set by preceding(), following().
int fStatusIndex; // Current rule status index. Set by preceding, following().

/**
* If `this.isWordLike`, the status of dictionary breaks is equal to the status of the final
* break of the rule-based segment they refine (fOtherRuleStatusIndex); otherwise,
* dictionary breaks have status 0. For compatibility, this property is determined by the
* largest status value used by the rules: rules that have a largest status greater than 100
* are considered word-like.
*/
private boolean isWordLike;
}
;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1268,6 +1268,9 @@ between •Mae •Hong •Son •and •the •Salween •River, •the •Thano
the •Khun •Tan •Range •(ดอย•ขุน•ตาน), •the •Phi •Pan •Nam •Range •(ทิว•เขา•ผี•ปัน•น้ำ), •as •well •as •the •western •\
part •of •the •Luang •Prabang •Range •(ทิว•เขา•หลวง•พระ•บาง).•</data>

<data>•บทความ•แนะนำ
<100></data>

# Breaking around numbers that begin with a decimal point.
# Bug ICU-12017

Expand Down
Loading