Skip to content

Commit 3f95935

Browse files
committed
ICU-22984 Optimize old monkeys
1 parent 5519b85 commit 3f95935

File tree

1 file changed

+85
-29
lines changed

1 file changed

+85
-29
lines changed

icu4c/source/test/intltest/rbbitst.cpp

Lines changed: 85 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2619,8 +2619,7 @@ class SegmentationRule {
26192619
const SegmentationRule *appliedRule = nullptr;
26202620
};
26212621

2622-
SegmentationRule(std::u16string_view name)
2623-
: name_(UnicodeString(name).toUTF8String(std::string{})) {}
2622+
SegmentationRule(std::u16string_view name) { UnicodeString(name).toUTF8String(name_); }
26242623
virtual ~SegmentationRule() = default;
26252624

26262625
virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const = 0;
@@ -2630,7 +2629,7 @@ class SegmentationRule {
26302629
std::chrono::steady_clock::duration timeSpent() const { return time_spent_; }
26312630

26322631
private:
2633-
const std::string name_;
2632+
std::string name_;
26342633
protected:
26352634
mutable std::chrono::steady_clock::duration time_spent_{};
26362635
};
@@ -2651,15 +2650,15 @@ class RemapRule : public SegmentationRule {
26512650
auto const start = std::chrono::steady_clock::now();
26522651
UErrorCode status = U_ZERO_ERROR;
26532652
UnicodeString result;
2654-
int i = 0;
2655-
int offset = 0;
2653+
std::size_t i = 0;
2654+
std::ptrdiff_t offset = 0;
26562655
std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
26572656
while (matcher->find()) {
26582657
for (;; ++i) {
26592658
if (!resolved[i].indexInRemapped.has_value()) {
26602659
continue;
26612660
}
2662-
if (*resolved[i].indexInRemapped > matcher->start(status)) {
2661+
if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
26632662
break;
26642663
}
26652664
*resolved[i].indexInRemapped += offset;
@@ -2668,13 +2667,13 @@ class RemapRule : public SegmentationRule {
26682667
if (!resolved[i].indexInRemapped.has_value()) {
26692668
continue;
26702669
}
2671-
if (*resolved[i].indexInRemapped == matcher->end(status)) {
2670+
if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
26722671
break;
26732672
}
26742673
if (resolved[i].appliedRule != nullptr &&
26752674
resolved[i].appliedRule->resolution() == BREAK) {
2676-
printf("Replacement rule at remapped indices %d sqq. spans a break: %s",
2677-
matcher->start(status), remapped.toUTF8String(std::string{}).c_str());
2675+
printf("Replacement rule at remapped indices %d sqq. spans a break",
2676+
matcher->start(status));
26782677
std::terminate();
26792678
}
26802679
resolved[i].appliedRule = this;
@@ -2696,9 +2695,9 @@ class RemapRule : public SegmentationRule {
26962695
indices += r.indexInRemapped.has_value() ? std::to_string(*r.indexInRemapped) : "null";
26972696
indices += ",";
26982697
}
2699-
printf(("Inconsistent indexInRemapped " + indices + " for new remapped string " +
2700-
result.toUTF8String(std::string{}).c_str())
2701-
.c_str());
2698+
std::string s;
2699+
puts(("Inconsistent indexInRemapped " + indices + " for new remapped string " +
2700+
result.toUTF8String(s)).c_str());
27022701
std::terminate();
27032702
}
27042703
remapped = result;
@@ -2721,36 +2720,89 @@ class RegexRule : public SegmentationRule {
27212720
: SegmentationRule(name), resolution_(static_cast<Resolution>(resolution)) {
27222721
UParseError parseError;
27232722
UErrorCode status = U_ZERO_ERROR;
2724-
before_.reset(RegexPattern::compile(u".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL,
2725-
parseError, status));
2723+
before_.reset(
2724+
RegexPattern::compile(before, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
2725+
endsWithBefore_.reset(RegexPattern::compile(
2726+
".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
27262727
after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
27272728
U_ASSERT(U_SUCCESS(status));
27282729
}
27292730

27302731
virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
27312732
auto const start = std::chrono::steady_clock::now();
27322733
UErrorCode status = U_ZERO_ERROR;
2733-
for (auto &[indexInRemapped, appliedRule] : resolved) {
2734-
if (appliedRule != nullptr) {
2735-
continue;
2736-
}
2737-
if (std::unique_ptr<RegexMatcher>(before_->matcher(remapped, status))
2738-
->region(0, *indexInRemapped, status)
2739-
.matches(status) &&
2740-
std::unique_ptr<RegexMatcher>(after_->matcher(remapped, status))
2741-
->region(*indexInRemapped, remapped.length(), status)
2742-
.lookingAt(status)) {
2743-
appliedRule = this;
2734+
// The unicodetools implementation simply tries, for each index, to
2735+
// match the string up to the index against /.*(before)/ (with
2736+
// `matches`) and the beginning of the string after the index against
2737+
// /after/ (with `lookingAt`), but that is very slow, especially for
2738+
// nonempty /before/. While the old monkeys are not a production
2739+
// implementation, we still do not want them to be too slow, since we
2740+
// need to test millions of sample strings. Instead we search for
2741+
// /before/ and /after/, and check resulting candidates. This speeds
2742+
// things up by a factor of ~40.
2743+
// We need to be careful about greedy matching: The first position where
2744+
// the rule matches may be before the end of the first /before/ match.
2745+
// However, it is both:
2746+
// 1. within a /before/ match or at its bounds,
2747+
// 2. at the beginning of an /after/ match.
2748+
// Further, the /before/ context of the rule matches within the
2749+
// aforementioned /before/ match. Note that we need to look for
2750+
// overlapping matches, thus calls to `find` are always preceded by a
2751+
// reset via `region`.
2752+
std::unique_ptr<RegexMatcher> beforeSearch(before_->matcher(remapped, status));
2753+
std::unique_ptr<RegexMatcher> afterSearch(after_->matcher(remapped, status));
2754+
beforeSearch->useAnchoringBounds(false);
2755+
afterSearch->useAnchoringBounds(false);
2756+
U_ASSERT(U_SUCCESS(status));
2757+
if (beforeSearch->find() && afterSearch->find()) {
2758+
for (;;) {
2759+
if (afterSearch->start(status) < beforeSearch->start(status)) {
2760+
afterSearch->region(beforeSearch->start(status), remapped.length(), status);
2761+
if (!afterSearch->find()) {
2762+
break;
2763+
}
2764+
} else if (afterSearch->start(status) > beforeSearch->end(status)) {
2765+
if (beforeSearch->start(status) == remapped.length()) {
2766+
break;
2767+
}
2768+
beforeSearch->region(remapped.moveIndex32(beforeSearch->start(status), 1),
2769+
remapped.length(), status);
2770+
if (!beforeSearch->find()) {
2771+
break;
2772+
}
2773+
} else {
2774+
auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
2775+
return r.indexInRemapped == afterSearch->start(status);
2776+
});
2777+
U_ASSERT(it != resolved.end());
2778+
U_ASSERT(U_SUCCESS(status));
2779+
if (it->appliedRule == nullptr &&
2780+
std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
2781+
->useAnchoringBounds(false)
2782+
.region(beforeSearch->start(status), afterSearch->start(status), status)
2783+
.matches(status)) {
2784+
it->appliedRule = this;
2785+
}
2786+
if (afterSearch->start(status) == remapped.length()) {
2787+
break;
2788+
}
2789+
afterSearch->region(remapped.moveIndex32(afterSearch->start(status), 1),
2790+
remapped.length(), status);
2791+
if (!afterSearch->find()) {
2792+
break;
2793+
}
2794+
}
2795+
U_ASSERT(U_SUCCESS(status));
27442796
}
27452797
}
2746-
U_ASSERT(U_SUCCESS(status));
27472798
time_spent_ += std::chrono::steady_clock::now() - start;
27482799
}
27492800

27502801
virtual Resolution resolution() const override { return resolution_; }
27512802

27522803
private:
27532804
std::unique_ptr<RegexPattern> before_;
2805+
std::unique_ptr<RegexPattern> endsWithBefore_;
27542806
std::unique_ptr<RegexPattern> after_;
27552807
const Resolution resolution_;
27562808
};
@@ -3005,7 +3057,7 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
30053057
UnicodeString remapped = s;
30063058
resolved.clear();
30073059
resolved.reserve(s.length() + 1);
3008-
for (std::size_t i = 0; i < s.length() + 1; ++i) {
3060+
for (int i = 0; i < s.length() + 1; ++i) {
30093061
resolved.emplace_back(i);
30103062
}
30113063
for (const auto& rule : rules) {
@@ -3038,8 +3090,12 @@ const std::vector<UnicodeSet>& RBBILineMonkey::charClasses() {
30383090

30393091

30403092
RBBILineMonkey::~RBBILineMonkey() {
3041-
for (auto const& rule : rules) {
3042-
printf("%s : %lld ms\n", rule->name().c_str(), rule->timeSpent() / std::chrono::milliseconds(1));
3093+
constexpr bool debuggingOldMonkeyPerformance = false;
3094+
if (debuggingOldMonkeyPerformance) {
3095+
for (auto const &rule : rules) {
3096+
puts((rule->name() + " : " + std::to_string(rule->timeSpent() / std::chrono::milliseconds(1)) +
3097+
" ms").c_str());
3098+
}
30433099
}
30443100

30453101
delete fCharBI;

0 commit comments

Comments
 (0)