Skip to content

Commit 233c0d7

Browse files
authored
Merge pull request #9440 from keymanapp/feat/core/9121-regex-epic-ldml
2 parents 593efb3 + 7f05335 commit 233c0d7

24 files changed

+319
-140
lines changed

core/src/kmx/kmx_plus.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1072,16 +1072,16 @@ COMP_KMXPLUS_USET_Helper::setUset(const COMP_KMXPLUS_USET *newUset) {
10721072
return is_valid;
10731073
}
10741074

1075-
USet::USet(const COMP_KMXPLUS_USET_RANGE *newRange, size_t newCount) {
1075+
SimpleUSet::SimpleUSet(const COMP_KMXPLUS_USET_RANGE *newRange, size_t newCount) {
  // Copy each [start, end] pair out of the caller's array into `ranges`;
  // newRange itself is only read inside this loop.
  size_t index = 0;
  while (index < newCount) {
    const auto &source = newRange[index];
    ranges.emplace_back(source.start, source.end);
    ++index;
  }
}
10801080

1081-
USet::USet() {
1081+
// Builds a set that holds no ranges.
SimpleUSet::SimpleUSet() : ranges() {
}
10831083

1084-
bool USet::contains(km_kbp_usv ch) const {
1084+
bool SimpleUSet::contains(km_kbp_usv ch) const {
10851085
for (const auto &range : ranges) {
10861086
if (range.start <= ch && range.end >= ch) {
10871087
return true;
@@ -1091,7 +1091,7 @@ bool USet::contains(km_kbp_usv ch) const {
10911091
}
10921092

10931093
bool
1094-
USet::valid() const {
1094+
SimpleUSet::valid() const {
10951095
// double check
10961096
for (const auto &range : ranges) {
10971097
if (!Uni_IsValid(range.start, range.end)) {
@@ -1103,7 +1103,7 @@ USet::valid() const {
11031103
}
11041104

11051105
void
1106-
USet::dump() const {
1106+
SimpleUSet::dump() const {
11071107
DebugLog(" - USet size=%d", ranges.size());
11081108
for (const auto &range : ranges) {
11091109
if (range.start == range.end) {
@@ -1114,14 +1114,14 @@ USet::dump() const {
11141114
}
11151115
}
11161116

1117-
USet
1117+
SimpleUSet
COMP_KMXPLUS_USET_Helper::getUset(KMXPLUS_USET i) const {
  // Only touch `uset` once valid() has passed; && short-circuits just like the
  // original `!valid() || ...` test did.
  const bool usable = valid() && i < uset->usetCount;
  if (usable) {
    auto &entry = usets[i];
    // The returned set copies the ranges, so it does not alias our table.
    return SimpleUSet(getRange(entry.range), entry.count);
  }
  // Helper not valid, or index past usetCount.
  assert(false);
  return SimpleUSet(nullptr, 0);  // empty set
}
11261126

11271127
const COMP_KMXPLUS_USET_RANGE *

core/src/kmx/kmx_plus.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -699,14 +699,16 @@ struct COMP_KMXPLUS_USET_RANGE {
699699
};
700700

701701
/**
702-
* represents one of the uset elements
702+
* represents one of the uset elements.
703+
* TODO-LDML: replace this with a real icu::UnicodeSet? or at least
704+
* a function producing the same?
703705
*/
704-
class USet {
706+
class SimpleUSet {
705707
public:
706708
/** construct a set over the specified range. Data is copied. */
707-
USet(const COMP_KMXPLUS_USET_RANGE* newStart, size_t newCount);
709+
SimpleUSet(const COMP_KMXPLUS_USET_RANGE* newStart, size_t newCount);
708710
/** empty set */
709-
USet();
711+
SimpleUSet();
710712
/** true if the uset contains this char */
711713
bool contains(km_kbp_usv ch) const;
712714
/** debugging */
@@ -726,7 +728,7 @@ class COMP_KMXPLUS_USET_Helper {
726728
bool setUset(const COMP_KMXPLUS_USET *newUset);
727729
inline bool valid() const { return is_valid; }
728730

729-
USet getUset(KMXPLUS_USET list) const;
731+
SimpleUSet getUset(KMXPLUS_USET list) const;
730732
const COMP_KMXPLUS_USET_RANGE *getRange(KMX_DWORD index) const;
731733

732734
private:

core/src/kmx/kmx_xstring.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,19 @@ u16string_to_u32string(const std::u16string &source) {
172172
return out;
173173
}
174174

175+
inline std::u16string
176+
u32string_to_u16string(const std::u32string &source) {
177+
std::u16string out;
178+
char16_single ch;
179+
for (auto c : source) {
180+
const auto len = Utf32CharToUtf16(c, ch);
181+
for (auto i = 0; i < len; i++) {
182+
out.push_back(ch.ch[i]);
183+
}
184+
}
185+
return out;
186+
}
187+
175188
inline bool Uni_IsEndOfPlaneNonCharacter(km_kbp_usv ch) {
  // Keep only the bits selected by Uni_FFFE_NONCHARACTER, then require that
  // every one of them was set.
  const auto masked = ch & Uni_FFFE_NONCHARACTER;
  return masked == Uni_FFFE_NONCHARACTER;  // matches FFFF or FFFE
}

core/src/ldml/ldml_transforms.cpp

Lines changed: 90 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
#include <string>
1212
#include "kmx/kmx_xstring.h"
1313

14-
1514
#ifndef assert
1615
#define assert(x) // TODO-LDML
1716
#endif
@@ -36,7 +35,7 @@ namespace ldml {
3635
#define DebugTran(msg, ...)
3736
#endif
3837

39-
element::element(const USet &new_u, KMX_DWORD new_flags)
38+
element::element(const SimpleUSet &new_u, KMX_DWORD new_flags)
    : chr(),
      uset(new_u),
      // discard whatever type bits the caller supplied, then tag this element as a uset
      flags((new_flags & ~LDML_ELEM_FLAGS_TYPE) | LDML_ELEM_FLAGS_TYPE_USET) {
}
4241

@@ -191,7 +190,7 @@ element_list::load(const kmx::kmx_plus &kplus, kmx::KMXPLUS_ELEM id) {
191190
km_kbp_usv ch = e.element;
192191
emplace_back(ch, flags); // char
193192
} else if (type == LDML_ELEM_FLAGS_TYPE_USET) {
194-
// need to load a USet
193+
// need to load a SimpleUSet
195194
auto u = kplus.usetHelper.getUset(e.element);
196195
if (!u.valid()) {
197196
DebugLog("Error, invalid UnicodeSet at element %d", (int)i);
@@ -393,35 +392,96 @@ reorder_group::apply(std::u32string &str) const {
393392
return applied;
394393
}
395394

395+
transform_entry::transform_entry(const transform_entry &other)
    : fFrom(other.fFrom),
      fTo(other.fTo),
      // a present pattern is duplicated via clone(); an absent one stays null
      fFromPattern(other.fFromPattern ? other.fFromPattern->clone() : nullptr) {
}
402+
396403
transform_entry::transform_entry(const std::u32string &from, const std::u32string &to) : fFrom(from), fTo(to) {
  assert(!fFrom.empty()); // TODO-LDML: should not happen?

  if (fFrom.empty()) {
    return;  // no 'from' text: leave fFromPattern unset
  }

  // Build the UTF-16 UnicodeString that RegexPattern::compile takes.
  const std::u16string from16 = km::kbp::kmx::u32string_to_u16string(fFrom);
  /* const */ icu::UnicodeString pattern(from16.data(), (int32_t)from16.length());
  // add '$' to match to end
  pattern.append(u'$');

  UErrorCode status = U_ZERO_ERROR;
  fFromPattern.reset(icu::RegexPattern::compile(pattern, 0, status));
  assert(U_SUCCESS(status)); // TODO-LDML: may be best to propagate status up ^^
}
398416

399417
size_t
400-
transform_entry::match(const std::u32string &input) const {
401-
if (input.length() < fFrom.length()) {
402-
// TODO-LDML: regex
403-
// Too small, can't match.
404-
return 0;
418+
transform_entry::apply(const std::u32string &input, std::u32string &output) const {
419+
assert(fFromPattern);
420+
// TODO-LDML: Really? can't go from u32 to UnicodeString?
421+
// TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher.
422+
UErrorCode status = U_ZERO_ERROR;
423+
const std::u16string matchstr = km::kbp::kmx::u32string_to_u16string(input);
424+
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
425+
// TODO-LDML: create a new Matcher every time. These could be cached and reset.
426+
std::unique_ptr<icu::RegexMatcher> matcher(fFromPattern->matcher(matchustr, status));
427+
assert(U_SUCCESS(status));
428+
429+
if (!matcher->find(status)) { // i.e. matches somewhere, in this case at end of str
430+
return 0; // no match
405431
}
406-
// string at end
407-
auto substr = input.substr(input.length() - fFrom.length(), fFrom.length());
408-
if (substr != fFrom) {
409-
// end of string doesn't match
432+
433+
// TODO-LDML: this is UTF-16 len, not UTF-32 len!!
434+
// TODO-LDML: if we had an underlying UText this would be simpler.
435+
int32_t matchStart = matcher->start(status);
436+
int32_t matchEnd = matcher->end(status);
437+
assert(U_SUCCESS(status));
438+
// extract..
439+
const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd);
440+
// preflight to UTF-32 to get length
441+
UErrorCode substrStatus = U_ZERO_ERROR;
442+
auto matchLen = substr.toUTF32(nullptr, 0, substrStatus);
443+
assert(matchLen > 0);
444+
if (matchLen == 0) {
410445
return 0;
411446
}
412-
// match length == fFrom.length
413-
return substr.length();
414-
}
415-
416-
std::u32string
417-
transform_entry::apply(const std::u32string & /*input*/, size_t /*matchLen*/) const {
418-
// TODO-LDML: regex
419-
// For now, we just return the 'to' string literally.
420-
return fTo;
447+
// Now, we have a matchLen.
448+
449+
// now, do the replace.
450+
// Convert the fTo into u16 TODO-LDML (we could cache this?)
451+
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(fTo);
452+
icu::UnicodeString rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
453+
// This replace will apply $1, $2 etc. TODO-LDML it will NOT handle mapFrom or mapTo.
454+
icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status);
455+
assert(U_SUCCESS(status));
456+
// entireOutput includes all of 'input', but modified. Need to substring it.
457+
icu::UnicodeString outu = entireOutput.tempSubString(matchStart);
458+
459+
// Special case if there's no output
460+
if (outu.length() == 0) {
461+
output.clear();
462+
} else {
463+
// TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps.
464+
UErrorCode preflightStatus = U_ZERO_ERROR;
465+
// calculate how big the buffer is
466+
auto out32len = outu.toUTF32(nullptr, 0, preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes
467+
// allocate
468+
char32_t *s = new char32_t[out32len + 1];
469+
assert(s != nullptr);
470+
// convert
471+
outu.toUTF32((UChar32 *)s, out32len + 1, status);
472+
assert(U_SUCCESS(status));
473+
output.assign(s, out32len);
474+
// now, build a u32string
475+
std::u32string out32(s, out32len);
476+
// clean up buffer
477+
delete [] s;
478+
}
479+
return matchLen;
421480
}
422481

423482
any_group::any_group(const transform_group &g)
    : type(any_group_type::transform),  // tag: `transform` holds the copied group
      transform(g),
      reorder() {
}
484+
425485
any_group::any_group(const reorder_group &g)
    : type(any_group_type::reorder),  // tag: `reorder` holds the copied group
      transform(),
      reorder(g) {
}
427487

@@ -444,17 +504,18 @@ transform_group::transform_group() {
444504
/**
445505
* return the first transform match in this group
446506
*/
447-
const transform_entry *
448-
transform_group::match(const std::u32string &input, size_t &subMatched) const {
507+
size_t
508+
transform_group::apply(const std::u32string &input, std::u32string &output) const {
509+
size_t subMatched = 0;
449510
for (auto transform = begin(); (subMatched == 0) && (transform < end()); transform++) {
450511
// TODO-LDML: non regex implementation
451512
// is the match area too short?
452-
subMatched = transform->match(input);
513+
subMatched = transform->apply(input, output);
453514
if (subMatched != 0) {
454-
return &(*transform); // return alias to transform
515+
return subMatched; // matched. break out.
455516
}
456517
}
457-
return nullptr;
518+
return 0; // no match
458519
}
459520

460521
/**
@@ -507,20 +568,14 @@ transforms::apply(const std::u32string &input, std::u32string &output) {
507568
// TODO-LDML: reorders
508569
// Assume it's a non reorder group
509570
/** Length of match within this group*/
510-
size_t subMatched = 0;
511571

512572
// find the first match in this group (if present)
513573
// TODO-LDML: check if reorder
514574
if (group->type == any_group_type::transform) {
515-
auto entry = group->transform.match(updatedInput, subMatched);
516-
517-
if (entry != nullptr) {
518-
// now apply the found transform
519-
520-
// update subOutput (string) and subMatched
521-
// the returned string must replace the last "subMatched" chars of the string.
522-
std::u32string subOutput = entry->apply(updatedInput, subMatched);
575+
std::u32string subOutput;
576+
size_t subMatched = group->transform.apply(updatedInput, subOutput);
523577

578+
if (subMatched != 0) {
524579
// remove the matched part of the updatedInput
525580
updatedInput.resize(updatedInput.length() - subMatched); // chop of the subMatched part at end
526581
updatedInput.append(subOutput); // subOutput could be empty such as in backspace transform

0 commit comments

Comments
 (0)