Skip to content

Commit e59065c

Browse files
committed
ICU-22984 Clean up old monkeys
1 parent 757f27c commit e59065c

File tree

2 files changed

+80
-17
lines changed

2 files changed

+80
-17
lines changed

docs/userguide/dev/rules_update.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,16 @@ The rule updates are done first for ICU4C, and then ported (code changes) or mov
212212

213213
Updating the test with new or revised rules requires changing the test source code in `icu4c/source/test/intltest/rbbitst.cpp`. Look for the classes RBBICharMonkey, RBBIWordMonkey, RBBISentMonkey and RBBILineMonkey. The body of each class tracks the corresponding UAX #14 or UAX #29 specification in defining the character classes and break rules.
214214

215+
The rules, as well as the partition of the code space used to generate the random sample strings,
216+
are defined by regular expressions and Unicode sets generated by GenerateBreakTest in the
217+
Unicode tools, which runs as part of MakeUnicodeFiles.
218+
Copy the relevant lines from `Generated/UCD/17.0.0/extra/*BreakTest.cpp.txt` into `rbbitst.cpp`.
219+
When developing changes to the line breaking algorithms that require changes to property assignments,
220+
the generated rules and partition may need to be adjusted for testing.
221+
However, the updated rules should only be merged into ICU once the property changes have actually been
222+
made in the UCD and imported into ICU, at which point the unmodified generated partition and rules can
223+
be used in `rbbitst.cpp`.
224+
215225
After making changes, as a final check, let the test run for an extended period of time, on the order of several hours.
216226
Run it from a terminal, and interrupt it (Ctrl-C) once it has run long enough.
217227

icu4c/source/test/intltest/rbbitst.cpp

Lines changed: 70 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1602,8 +1602,8 @@ class SegmentationRule {
16021602
NO_BREAK = u'×',
16031603
};
16041604
struct BreakContext {
1605-
BreakContext(std::size_t index) : indexInRemapped(index) {}
1606-
std::optional<std::size_t> indexInRemapped;
1605+
BreakContext(int32_t index) : indexInRemapped(index) {}
1606+
std::optional<int32_t> indexInRemapped;
16071607
const SegmentationRule *appliedRule = nullptr;
16081608
};
16091609

@@ -1639,15 +1639,37 @@ class RemapRule : public SegmentationRule {
16391639
auto const start = std::chrono::steady_clock::now();
16401640
UErrorCode status = U_ZERO_ERROR;
16411641
UnicodeString result;
1642-
std::size_t i = 0;
1643-
std::ptrdiff_t offset = 0;
1642+
int32_t i = 0;
1643+
int32_t offset = 0;
1644+
// We find all matches of the `pattern_` and replace them according to
1645+
// the `replacement_`, producing the new remapped string `result`.
1646+
// For every position i in the original string,
1647+
// `resolved[i].indexInRemapped` is nullopt if i lies within a replaced
1648+
// match, and is set to the new index in `result` otherwise, by adding
1649+
// the accumulated difference `offset` between match lengths and
1650+
// replacement lengths.
1651+
// Consider a 4-codepoint, 6 code unit string s = ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩, where
1652+
// ␠ stands for U+0020 and U+12000 𒀀 and U+1D172 ◌𝅲 each require two code
1653+
// units, and apply the following two rules:
1654+
// 1. (?<X>\P{lb=SP}) \p{lb=CM}* → ${X}
1655+
// 2. \p{lb=CM} → A
1656+
// The string remapped and the indexInRemapped values change as follows:
1657+
// indexInRemapped remapped string rule final
1658+
// (aligned on the initial string) applied offset
1659+
// 𒀀 ◌́ ␠ ◌𝅲
1660+
// 0 1 2 3 4 5 6 ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩ (none)
1661+
// 0 - - 2 3 4 5 ⟨ 𒀀, ␠, ◌𝅲 ⟩ 1 -1
1662+
// 0 - - 2 3 - 4 ⟨ 𒀀, ␠, A ⟩ 2 -1
1663+
//
1664+
// Note that the last indexInRemapped is always equal to the length of
1665+
// the remapped string.
16441666
std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
16451667
while (matcher->find()) {
16461668
for (;; ++i) {
16471669
if (!resolved[i].indexInRemapped.has_value()) {
16481670
continue;
16491671
}
1650-
if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
1672+
if (*resolved[i].indexInRemapped > matcher->start(status)) {
16511673
break;
16521674
}
16531675
*resolved[i].indexInRemapped += offset;
@@ -1656,22 +1678,37 @@ class RemapRule : public SegmentationRule {
16561678
if (!resolved[i].indexInRemapped.has_value()) {
16571679
continue;
16581680
}
1659-
if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
1681+
// Note that
1682+
// `*resolved[i].indexInRemapped > matcher->end(status)` should
1683+
// never happen with ordinary rules, but could in principle
1684+
// happen with rules that remap to code point sequences, e.g.,
1685+
// 1. BC → TYZ
1686+
// 2. AT → X
1687+
// applied to ⟨ A, B, C ⟩:
1688+
// indexInRemapped remapped rule
1689+
// A B C
1690+
// 0 1 2 3 ⟨ A, B, C ⟩ (none)
1691+
// 0 1 - 4 ⟨ A, T, Y, Z ⟩ 1
1692+
// 0 - - 3 ⟨ X, Y, Z ⟩ 2
1693+
// Where for the application of rule 2, the match ends at
1694+
// position 2 in remapped, which does not correspond to a
1695+
// position in the original string.
1696+
if (*resolved[i].indexInRemapped >= matcher->end(status)) {
16601697
break;
16611698
}
16621699
if (resolved[i].appliedRule != nullptr &&
1663-
resolved[i].appliedRule->resolution() == BREAK) {
1700+
resolved[i].appliedRule->resolution() == BREAK) {
16641701
printf("Replacement rule at remapped indices %d sqq. spans a break",
16651702
matcher->start(status));
16661703
std::terminate();
16671704
}
16681705
resolved[i].appliedRule = this;
1669-
resolved[i].indexInRemapped = std::nullopt;
1706+
resolved[i].indexInRemapped.reset();
16701707
}
16711708
matcher->appendReplacement(result, replacement_, status);
16721709
offset = result.length() - *resolved[i].indexInRemapped;
16731710
}
1674-
for (; i < resolved.size(); ++i) {
1711+
for (; i < static_cast<int32_t>(resolved.size()); ++i) {
16751712
if (!resolved[i].indexInRemapped.has_value()) {
16761713
continue;
16771714
}
@@ -1691,7 +1728,10 @@ class RemapRule : public SegmentationRule {
16911728
std::terminate();
16921729
}
16931730
remapped = result;
1694-
U_ASSERT(U_SUCCESS(status));
1731+
if (U_FAILURE(status)) {
1732+
puts(("Failed to apply rule " + name()).c_str());
1733+
std::terminate();
1734+
}
16951735
timeSpent_ += std::chrono::steady_clock::now() - start;
16961736
}
16971737

@@ -1715,7 +1755,10 @@ class RegexRule : public SegmentationRule {
17151755
endsWithBefore_.reset(RegexPattern::compile(
17161756
".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
17171757
after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
1718-
U_ASSERT(U_SUCCESS(status));
1758+
if (U_FAILURE(status)) {
1759+
puts(("Failed to compile regular expressions for rule " + this->name()).c_str());
1760+
std::terminate();
1761+
}
17191762
}
17201763

17211764
virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
@@ -1764,7 +1807,13 @@ class RegexRule : public SegmentationRule {
17641807
auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
17651808
return r.indexInRemapped == afterSearch->start(status);
17661809
});
1767-
U_ASSERT(it != resolved.end());
1810+
if (it == resolved.end()) {
1811+
puts(("Rule " + name() +
1812+
" found a break at a position which does not correspond to an index in "
1813+
"the original string")
1814+
.c_str());
1815+
std::terminate();
1816+
}
17681817
U_ASSERT(U_SUCCESS(status));
17691818
if (it->appliedRule == nullptr &&
17701819
std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
@@ -1785,6 +1834,10 @@ class RegexRule : public SegmentationRule {
17851834
U_ASSERT(U_SUCCESS(status));
17861835
}
17871836
}
1837+
if (U_FAILURE(status)) {
1838+
puts(("Failed to apply rule " + name()).c_str());
1839+
std::terminate();
1840+
}
17881841
timeSpent_ += std::chrono::steady_clock::now() - start;
17891842
}
17901843

@@ -3035,9 +3088,9 @@ RBBILineMonkey::RBBILineMonkey() :
30353088
const UnicodeSet lbSA(uR"(\p{lb=SA})", status);
30363089
for (auto it = partition.begin(); it != partition.end();) {
30373090
if (lbSA.containsAll(it->second)) {
3038-
it = partition.erase(it);
3091+
it = partition.erase(it);
30393092
} else {
3040-
++it;
3093+
++it;
30413094
}
30423095
}
30433096

@@ -3079,9 +3132,9 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
30793132

30803133
int32_t RBBILineMonkey::next(int32_t startPos) {
30813134
for (std::size_t i = startPos + 1; i < resolved.size(); ++i) {
3082-
if (resolved[i].appliedRule != nullptr && resolved[i].appliedRule->resolution() ==
3083-
SegmentationRule::BREAK) {
3084-
return i;
3135+
if (resolved[i].appliedRule != nullptr &&
3136+
resolved[i].appliedRule->resolution() == SegmentationRule::BREAK) {
3137+
return i;
30853138
}
30863139
}
30873140
return -1;

0 commit comments

Comments
 (0)