@@ -2619,8 +2619,7 @@ class SegmentationRule {
26192619 const SegmentationRule *appliedRule = nullptr ;
26202620 };
26212621
2622- SegmentationRule (std::u16string_view name)
2623- : name_(UnicodeString(name).toUTF8String(std::string{})) {}
2622+ SegmentationRule (std::u16string_view name) { UnicodeString (name).toUTF8String (name_); }
26242623 virtual ~SegmentationRule () = default ;
26252624
26262625 virtual void apply (UnicodeString &remapped, std::vector<BreakContext> &resolved) const = 0;
@@ -2630,7 +2629,7 @@ class SegmentationRule {
26302629 std::chrono::steady_clock::duration timeSpent () const { return time_spent_; }
26312630
26322631 private:
2633- const std::string name_;
2632+ std::string name_;
26342633 protected:
26352634 mutable std::chrono::steady_clock::duration time_spent_{};
26362635};
@@ -2651,15 +2650,15 @@ class RemapRule : public SegmentationRule {
26512650 auto const start = std::chrono::steady_clock::now ();
26522651 UErrorCode status = U_ZERO_ERROR;
26532652 UnicodeString result;
2654- int i = 0 ;
2655- int offset = 0 ;
2653+ std:: size_t i = 0 ;
2654+ std:: ptrdiff_t offset = 0 ;
26562655 std::unique_ptr<RegexMatcher> matcher (pattern_->matcher (remapped, status));
26572656 while (matcher->find ()) {
26582657 for (;; ++i) {
26592658 if (!resolved[i].indexInRemapped .has_value ()) {
26602659 continue ;
26612660 }
2662- if (*resolved[i].indexInRemapped > matcher->start (status)) {
2661+ if (*resolved[i].indexInRemapped > static_cast <std:: size_t >( matcher->start64 (status) )) {
26632662 break ;
26642663 }
26652664 *resolved[i].indexInRemapped += offset;
@@ -2668,13 +2667,13 @@ class RemapRule : public SegmentationRule {
26682667 if (!resolved[i].indexInRemapped .has_value ()) {
26692668 continue ;
26702669 }
2671- if (*resolved[i].indexInRemapped == matcher->end (status)) {
2670+ if (*resolved[i].indexInRemapped == static_cast <std:: size_t >( matcher->end64 (status) )) {
26722671 break ;
26732672 }
26742673 if (resolved[i].appliedRule != nullptr &&
26752674 resolved[i].appliedRule ->resolution () == BREAK) {
2676- printf (" Replacement rule at remapped indices %d sqq. spans a break: %s " ,
2677- matcher->start (status), remapped. toUTF8String (std::string{}). c_str () );
2675+ printf (" Replacement rule at remapped indices %d sqq. spans a break" ,
2676+ matcher->start (status));
26782677 std::terminate ();
26792678 }
26802679 resolved[i].appliedRule = this ;
@@ -2696,9 +2695,9 @@ class RemapRule : public SegmentationRule {
26962695 indices += r.indexInRemapped .has_value () ? std::to_string (*r.indexInRemapped ) : " null" ;
26972696 indices += " ," ;
26982697 }
2699- printf (( " Inconsistent indexInRemapped " + indices + " for new remapped string " +
2700- result. toUTF8String (std:: string{}). c_str ())
2701- .c_str ());
2698+ std:: string s;
2699+ puts (( " Inconsistent indexInRemapped " + indices + " for new remapped string " +
2700+ result. toUTF8String (s)) .c_str ());
27022701 std::terminate ();
27032702 }
27042703 remapped = result;
@@ -2721,36 +2720,89 @@ class RegexRule : public SegmentationRule {
27212720 : SegmentationRule(name), resolution_(static_cast <Resolution>(resolution)) {
27222721 UParseError parseError;
27232722 UErrorCode status = U_ZERO_ERROR;
2724- before_.reset (RegexPattern::compile (u" .*(" + before + " )" , UREGEX_COMMENTS | UREGEX_DOTALL,
2725- parseError, status));
2723+ before_.reset (
2724+ RegexPattern::compile (before, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
2725+ endsWithBefore_.reset (RegexPattern::compile (
2726+ " .*(" + before + " )" , UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
27262727 after_.reset (RegexPattern::compile (after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
27272728 U_ASSERT (U_SUCCESS (status));
27282729 }
27292730
27302731 virtual void apply (UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
27312732 auto const start = std::chrono::steady_clock::now ();
27322733 UErrorCode status = U_ZERO_ERROR;
2733- for (auto &[indexInRemapped, appliedRule] : resolved) {
2734- if (appliedRule != nullptr ) {
2735- continue ;
2736- }
2737- if (std::unique_ptr<RegexMatcher>(before_->matcher (remapped, status))
2738- ->region (0 , *indexInRemapped, status)
2739- .matches (status) &&
2740- std::unique_ptr<RegexMatcher>(after_->matcher (remapped, status))
2741- ->region (*indexInRemapped, remapped.length (), status)
2742- .lookingAt (status)) {
2743- appliedRule = this ;
2734+ // The unicodetools implementation simply tries, for each index, to
2735+ // match the string up to the index against /.*(before)/ (with
2736+ // `matches`) and the beginning of the string after the index against
2737+ // /after/ (with `lookingAt`), but that is very slow, especially for
2738+ // nonempty /before/. While the old monkeys are not a production
2739+ // implementation, we still do not want them to be too slow, since we
2740+ // need to test millions of sample strings. Instead we search for
2741+ // /before/ and /after/, and check resulting candidates. This speeds
2742+ // things up by a factor of ~40.
2743+ // We need to be careful about greedy matching: The first position where
2744+ // the rule matches may be before the end of the first /before/ match.
2745+ // However, it is both:
2746+ // 1. within a /before/ match or at its bounds,
2747+ // 2. at the beginning of an /after/ match.
2748+ // Further, the /before/ context of the rule matches within the
2749+ // aforementioned /before/ match. Note that we need to look for
2750+ // overlapping matches, thus calls to `find` are always preceded by a
2751+ // reset via `region`.
2752+ std::unique_ptr<RegexMatcher> beforeSearch (before_->matcher (remapped, status));
2753+ std::unique_ptr<RegexMatcher> afterSearch (after_->matcher (remapped, status));
2754+ beforeSearch->useAnchoringBounds (false );
2755+ afterSearch->useAnchoringBounds (false );
2756+ U_ASSERT (U_SUCCESS (status));
2757+ if (beforeSearch->find () && afterSearch->find ()) {
2758+ for (;;) {
2759+ if (afterSearch->start (status) < beforeSearch->start (status)) {
2760+ afterSearch->region (beforeSearch->start (status), remapped.length (), status);
2761+ if (!afterSearch->find ()) {
2762+ break ;
2763+ }
2764+ } else if (afterSearch->start (status) > beforeSearch->end (status)) {
2765+ if (beforeSearch->start (status) == remapped.length ()) {
2766+ break ;
2767+ }
2768+ beforeSearch->region (remapped.moveIndex32 (beforeSearch->start (status), 1 ),
2769+ remapped.length (), status);
2770+ if (!beforeSearch->find ()) {
2771+ break ;
2772+ }
2773+ } else {
2774+ auto const it = std::find_if (resolved.begin (), resolved.end (), [&](auto r) {
2775+ return r.indexInRemapped == afterSearch->start (status);
2776+ });
2777+ U_ASSERT (it != resolved.end ());
2778+ U_ASSERT (U_SUCCESS (status));
2779+ if (it->appliedRule == nullptr &&
2780+ std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher (remapped, status))
2781+ ->useAnchoringBounds (false )
2782+ .region (beforeSearch->start (status), afterSearch->start (status), status)
2783+ .matches (status)) {
2784+ it->appliedRule = this ;
2785+ }
2786+ if (afterSearch->start (status) == remapped.length ()) {
2787+ break ;
2788+ }
2789+ afterSearch->region (remapped.moveIndex32 (afterSearch->start (status), 1 ),
2790+ remapped.length (), status);
2791+ if (!afterSearch->find ()) {
2792+ break ;
2793+ }
2794+ }
2795+ U_ASSERT (U_SUCCESS (status));
27442796 }
27452797 }
2746- U_ASSERT (U_SUCCESS (status));
27472798 time_spent_ += std::chrono::steady_clock::now () - start;
27482799 }
27492800
27502801 virtual Resolution resolution () const override { return resolution_; }
27512802
27522803 private:
27532804 std::unique_ptr<RegexPattern> before_;
2805+ std::unique_ptr<RegexPattern> endsWithBefore_;
27542806 std::unique_ptr<RegexPattern> after_;
27552807 const Resolution resolution_;
27562808};
@@ -3005,7 +3057,7 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
30053057 UnicodeString remapped = s;
30063058 resolved.clear ();
30073059 resolved.reserve (s.length () + 1 );
3008- for (std:: size_t i = 0 ; i < s.length () + 1 ; ++i) {
3060+ for (int i = 0 ; i < s.length () + 1 ; ++i) {
30093061 resolved.emplace_back (i);
30103062 }
30113063 for (const auto & rule : rules) {
@@ -3038,8 +3090,12 @@ const std::vector<UnicodeSet>& RBBILineMonkey::charClasses() {
30383090
30393091
30403092RBBILineMonkey::~RBBILineMonkey () {
3041- for (auto const & rule : rules) {
3042- printf (" %s : %lld ms\n " , rule->name ().c_str (), rule->timeSpent () / std::chrono::milliseconds (1 ));
3093+ constexpr bool debuggingOldMonkeyPerformance = false ;
3094+ if (debuggingOldMonkeyPerformance) {
3095+ for (auto const &rule : rules) {
3096+ puts ((rule->name () + " : " + std::to_string (rule->timeSpent () / std::chrono::milliseconds (1 )) +
3097+ " ms" ).c_str ());
3098+ }
30433099 }
30443100
30453101 delete fCharBI ;
0 commit comments