Skip to content

Commit 7a31252

Browse files
committed
feat(core): regex cleanup 🙀
- restructure to not allocate the Matcher twice, save a little processing For: #9121
1 parent ad03a5c commit 7a31252

File tree

2 files changed

+58
-75
lines changed

2 files changed

+58
-75
lines changed

core/src/ldml/ldml_transforms.cpp

Lines changed: 48 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -415,9 +415,10 @@ transform_entry::transform_entry(const std::u32string &from, const std::u32strin
415415
}
416416

417417
size_t
418-
transform_entry::match(const std::u32string &input) const {
418+
transform_entry::apply(const std::u32string &input, std::u32string &output) const {
419419
assert(fFromPattern);
420420
// TODO-LDML: Really? can't go from u32 to UnicodeString?
421+
// TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher.
421422
UErrorCode status = U_ZERO_ERROR;
422423
const std::u16string matchstr = km::kbp::kmx::u32string_to_u16string(input);
423424
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
@@ -426,67 +427,56 @@ transform_entry::match(const std::u32string &input) const {
426427
assert(U_SUCCESS(status));
427428

428429
if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str
429-
return 0; // and tear everything down
430+
return 0; // no match
430431
}
431432

432433
// TODO-LDML: this is UTF-16 len, not UTF-32 len!!
433-
// auto matchLen = matcher->end64(status) - matcher->start64(status);
434434
// TODO-LDML: if we had an underlying UText this would be simpler.
435-
auto matchStart = matcher->start64(status);
436-
auto matchEnd = matcher->end64(status);
435+
int32_t matchStart = matcher->start(status);
436+
int32_t matchEnd = matcher->end(status);
437+
assert(U_SUCCESS(status));
437438
// extract..
438-
const icu::UnicodeString substr = matchustr.tempSubStringBetween((int32_t)matchStart, (int32_t)matchEnd);
439+
const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd);
439440
// preflight to UTF-32 to get length
440-
auto matchLen = substr.toUTF32(nullptr, 0, status);
441-
442-
return matchLen;
443-
}
444-
445-
std::u32string
446-
transform_entry::apply(const std::u32string &input, size_t matchLen) const {
447-
// TODO-LDML: and if you thought the previous function was suboptimal,
448-
// TODO-LDML: We should cache the RegexMatcher from the previous call.
449-
// TODO-LDML: Really? can't go from u32 to UnicodeString?
450-
451-
assert(fFromPattern);
452-
// TODO-LDML: simple approach, create a new Matcher every time. These could be cached and reset.
453-
// TODO-LDML: Really? can't go from u32 to UnicodeString?
454-
UErrorCode status = U_ZERO_ERROR;
441+
UErrorCode substrStatus = U_ZERO_ERROR;
442+
auto matchLen = substr.toUTF32(nullptr, 0, substrStatus);
443+
assert(matchLen > 0);
444+
if (matchLen == 0) {
445+
return 0;
446+
}
447+
// Now, we have a matchLen.
455448

456-
// we know the matchLen so we can slice the string…
457-
// Note: the matchstr here (unlike in transform_entry::match) is sliced so that
458-
// it only includes the matched portion. This way, when changed it's suitable as the
459-
// output string.
460-
const std::u16string matchstr = km::kbp::kmx::u32string_to_u16string(input.substr(input.length() - matchLen, matchLen));
461-
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
462-
std::unique_ptr<icu::RegexMatcher> matcher(fFromPattern->matcher(matchustr, status));
463-
assert(U_SUCCESS(status));
464-
// now, do the replace
449+
// now, do the replace.
450+
// Convert the fTo into u16 TODO-LDML (we could cache this?)
465451
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(fTo);
466452
icu::UnicodeString rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
467453
// This replace will apply $1, $2 etc. TODO-LDML it will NOT handle mapFrom or mapTo.
468-
icu::UnicodeString output = matcher->replaceFirst(rustr, status);
454+
icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status);
469455
assert(U_SUCCESS(status));
456+
// entireOutput includes all of 'input', but modified. Need to substring it.
457+
icu::UnicodeString outu = entireOutput.tempSubString(matchStart);
470458

471-
if (output.length() == 0) {
472-
return std::u32string(); // special case of a zero length output (such as delete)
459+
// Special case if there's no output
460+
if (outu.length() == 0) {
461+
output.clear();
462+
} else {
463+
// TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps.
464+
UErrorCode preflightStatus = U_ZERO_ERROR;
465+
// calculate how big the buffer is
466+
auto out32len = outu.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes
467+
// allocate
468+
char32_t *s = new char32_t[out32len + 1];
469+
assert(s != nullptr);
470+
// convert
471+
outu.toUTF32((UChar32 *)s, out32len + 1, status);
472+
assert(U_SUCCESS(status));
473+
output.assign(s, out32len);
474+
// now, build a u32string
475+
std::u32string out32(s, out32len);
476+
// clean up buffer
477+
delete [] s;
473478
}
474-
475-
// TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps.
476-
UErrorCode preflightStatus = U_ZERO_ERROR;
477-
// calculate how big the buffer is
478-
auto out32len = output.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes
479-
// allocate
480-
char32_t *s = new char32_t[out32len + 1];
481-
assert(s != nullptr);
482-
// convert
483-
output.toUTF32((UChar32 *)s, out32len + 1, status);
484-
assert(U_SUCCESS(status));
485-
// now, build a u32string
486-
std::u32string out32(s, out32len);
487-
// clean up buffer
488-
delete [] s;
489-
return out32;
479+
return matchLen;
490480
}
491481

492482
any_group::any_group(const transform_group &g) : type(any_group_type::transform), transform(g), reorder() {
@@ -514,17 +504,18 @@ transform_group::transform_group() {
514504
/**
515505
* return the first transform match in this group
516506
*/
517-
const transform_entry *
518-
transform_group::match(const std::u32string &input, size_t &subMatched) const {
507+
size_t
508+
transform_group::apply(const std::u32string &input, std::u32string &output) const {
509+
size_t subMatched = 0;
519510
for (auto transform = begin(); (subMatched == 0) && (transform < end()); transform++) {
520511
// TODO-LDML: non regex implementation
521512
// is the match area too short?
522-
subMatched = transform->match(input);
513+
subMatched = transform->apply(input, output);
523514
if (subMatched != 0) {
524-
return &(*transform); // return alias to transform
515+
return subMatched; // matched. break out.
525516
}
526517
}
527-
return nullptr;
518+
return 0; // no match
528519
}
529520

530521
/**
@@ -577,20 +568,14 @@ transforms::apply(const std::u32string &input, std::u32string &output) {
577568
// TODO-LDML: reorders
578569
// Assume it's a non reorder group
579570
/** Length of match within this group*/
580-
size_t subMatched = 0;
581571

582572
// find the first match in this group (if present)
583573
// TODO-LDML: check if reorder
584574
if (group->type == any_group_type::transform) {
585-
auto entry = group->transform.match(updatedInput, subMatched);
586-
587-
if (entry != nullptr) {
588-
// now apply the found transform
589-
590-
// update subOutput (string) and subMatched
591-
// the returned string must replace the last "subMatched" chars of the string.
592-
std::u32string subOutput = entry->apply(updatedInput, subMatched);
575+
std::u32string subOutput;
576+
size_t subMatched = group->transform.apply(updatedInput, subOutput);
593577

578+
if (subMatched != 0) {
594579
// remove the matched part of the updatedInput
595580
updatedInput.resize(updatedInput.length() - subMatched); // chop of the subMatched part at end
596581
updatedInput.append(subOutput); // subOutput could be empty such as in backspace transform

core/src/ldml/ldml_transforms.hpp

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,12 @@ class transform_entry {
8686
);
8787

8888
/**
89-
* @returns length if it's a match
90-
*/
91-
size_t match(const std::u32string &input) const;
92-
93-
/**
94-
* @returns output string
89+
* If matching, apply the match to the output string
90+
* @param input input string to match
91+
* @param output output string
92+
* @returns length of 'input' which was matched
9593
*/
96-
std::u32string apply(const std::u32string &input, size_t matchLen) const;
94+
size_t apply(const std::u32string &input, std::u32string &output) const;
9795

9896
private:
9997
const std::u32string fFrom;
@@ -114,12 +112,12 @@ class transform_group : public std::deque<transform_entry> {
114112
transform_group();
115113

116114
/**
117-
* Find the first match in the group
115+
* Find the first match in the group and apply it.
118116
* @param input input string to match
119-
* @param subMatched on output, the matched length
120-
* @returns alias to transform_entry or nullptr
117+
* @param output output string
118+
* @returns length of 'input' which was matched
121119
*/
122-
const transform_entry *match(const std::u32string &input, size_t &subMatched) const;
120+
size_t apply(const std::u32string &input, std::u32string &output) const;
123121
};
124122

125123
/** a single char, categorized according to reorder rules*/
@@ -241,7 +239,7 @@ class transforms {
241239
size_t apply(const std::u32string &input, std::u32string &output);
242240

243241
/**
244-
* For tests - TODO-LDML only supports reorder
242+
* For tests
245243
* @return true if str was altered
246244
*/
247245
bool apply(std::u32string &str);

0 commit comments

Comments
 (0)