1111#include < string>
1212#include " kmx/kmx_xstring.h"
1313
14-
1514#ifndef assert
1615#define assert (x ) // TODO-LDML
1716#endif
@@ -36,7 +35,7 @@ namespace ldml {
3635#define DebugTran (msg, ...)
3736#endif
3837
39- element::element (const USet &new_u, KMX_DWORD new_flags)
38+ element::element (const SimpleUSet &new_u, KMX_DWORD new_flags)
4039 : chr(), uset(new_u), flags((new_flags & ~LDML_ELEM_FLAGS_TYPE) | LDML_ELEM_FLAGS_TYPE_USET) {
4140}
4241
@@ -191,7 +190,7 @@ element_list::load(const kmx::kmx_plus &kplus, kmx::KMXPLUS_ELEM id) {
191190 km_kbp_usv ch = e.element ;
192191 emplace_back (ch, flags); // char
193192 } else if (type == LDML_ELEM_FLAGS_TYPE_USET) {
194- // need to load a USet
193+ // need to load a SimpleUSet
195194 auto u = kplus.usetHelper .getUset (e.element );
196195 if (!u.valid ()) {
197196 DebugLog (" Error, invalid UnicodeSet at element %d" , (int )i);
@@ -393,35 +392,96 @@ reorder_group::apply(std::u32string &str) const {
393392 return applied;
394393}
395394
395+ transform_entry::transform_entry (const transform_entry &other) :
396+ fFrom (other.fFrom ), fTo (other.fTo ), fFromPattern (nullptr ) {
397+ if (other.fFromPattern ) {
398+ // clone pattern
399+ fFromPattern .reset (other.fFromPattern ->clone ());
400+ }
401+ }
402+
396403transform_entry::transform_entry (const std::u32string &from, const std::u32string &to) : fFrom (from), fTo (to) {
404+ assert (!fFrom .empty ()); // TODO-LDML: should not happen?
405+
406+ if (!fFrom .empty ()) {
407+ const std::u16string patstr = km::kbp::kmx::u32string_to_u16string (fFrom );
408+ UErrorCode status = U_ZERO_ERROR;
409+ /* const */ icu::UnicodeString patustr = icu::UnicodeString (patstr.data (), (int32_t )patstr.length ());
410+ // add '$' to match to end
411+ patustr.append (u' $' );
412+ fFromPattern .reset (icu::RegexPattern::compile (patustr, 0 , status));
413+ assert (U_SUCCESS (status)); // TODO-LDML: may be best to propagate status up ^^
414+ }
397415}
398416
399417size_t
400- transform_entry::match (const std::u32string &input) const {
401- if (input.length () < fFrom .length ()) {
402- // TODO-LDML: regex
403- // Too small, can't match.
404- return 0 ;
418+ transform_entry::apply (const std::u32string &input, std::u32string &output) const {
419+ assert (fFromPattern );
420+ // TODO-LDML: Really? can't go from u32 to UnicodeString?
421+ // TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher.
422+ UErrorCode status = U_ZERO_ERROR;
423+ const std::u16string matchstr = km::kbp::kmx::u32string_to_u16string (input);
424+ icu::UnicodeString matchustr = icu::UnicodeString (matchstr.data (), (int32_t )matchstr.length ());
425+ // TODO-LDML: create a new Matcher every time. These could be cached and reset.
426+ std::unique_ptr<icu::RegexMatcher> matcher (fFromPattern ->matcher (matchustr, status));
427+ assert (U_SUCCESS (status));
428+
429+ if (!matcher->find (status)) { // i.e. matches somewhere, in this case at end of str
430+ return 0 ; // no match
405431 }
406- // string at end
407- auto substr = input.substr (input.length () - fFrom .length (), fFrom .length ());
408- if (substr != fFrom ) {
409- // end of string doesn't match
432+
433+ // TODO-LDML: this is UTF-16 len, not UTF-32 len!!
434+ // TODO-LDML: if we had an underlying UText this would be simpler.
435+ int32_t matchStart = matcher->start (status);
436+ int32_t matchEnd = matcher->end (status);
437+ assert (U_SUCCESS (status));
438+ // extract..
439+ const icu::UnicodeString substr = matchustr.tempSubStringBetween (matchStart, matchEnd);
440+ // preflight to UTF-32 to get length
441+ UErrorCode substrStatus = U_ZERO_ERROR;
442+ auto matchLen = substr.toUTF32 (nullptr , 0 , substrStatus);
443+ assert (matchLen > 0 );
444+ if (matchLen == 0 ) {
410445 return 0 ;
411446 }
412- // match length == fFrom.length
413- return substr.length ();
414- }
415-
416- std::u32string
417- transform_entry::apply (const std::u32string & /* input*/ , size_t /* matchLen*/ ) const {
418- // TODO-LDML: regex
419- // For now, we just return the 'to' string literally.
420- return fTo ;
447+ // Now, we have a matchLen.
448+
449+ // now, do the replace.
450+ // Convert the fTo into u16 TODO-LDML (we could cache this?)
451+ const std::u16string rstr = km::kbp::kmx::u32string_to_u16string (fTo );
452+ icu::UnicodeString rustr = icu::UnicodeString (rstr.data (), (int32_t )rstr.length ());
453+ // This replace will apply $1, $2 etc. TODO-LDML it will NOT handle mapFrom or mapTo.
454+ icu::UnicodeString entireOutput = matcher->replaceFirst (rustr, status);
455+ assert (U_SUCCESS (status));
456+ // entireOutput includes all of 'input', but modified. Need to substring it.
457+ icu::UnicodeString outu = entireOutput.tempSubString (matchStart);
458+
459+ // Special case if there's no output
460+ if (outu.length () == 0 ) {
461+ output.clear ();
462+ } else {
463+ // TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps.
464+ UErrorCode preflightStatus = U_ZERO_ERROR;
465+ // calculate how big the buffer is
466+ auto out32len = outu.toUTF32 (nullptr , 0 , preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes
467+ // allocate
468+ char32_t *s = new char32_t [out32len + 1 ];
469+ assert (s != nullptr );
470+ // convert
471+ outu.toUTF32 ((UChar32 *)s, out32len + 1 , status);
472+ assert (U_SUCCESS (status));
473+ output.assign (s, out32len);
474+ // now, build a u32string
475+ std::u32string out32 (s, out32len);
476+ // clean up buffer
477+ delete [] s;
478+ }
479+ return matchLen;
421480}
422481
423482any_group::any_group (const transform_group &g) : type(any_group_type::transform), transform(g), reorder() {
424483}
484+
425485any_group::any_group (const reorder_group &g) : type(any_group_type::reorder), transform(), reorder(g) {
426486}
427487
@@ -444,17 +504,18 @@ transform_group::transform_group() {
444504/* *
445505 * return the first transform match in this group
446506 */
447- const transform_entry *
448- transform_group::match (const std::u32string &input, size_t &subMatched) const {
507+ size_t
508+ transform_group::apply (const std::u32string &input, std::u32string &output) const {
509+ size_t subMatched = 0 ;
449510 for (auto transform = begin (); (subMatched == 0 ) && (transform < end ()); transform++) {
450511 // TODO-LDML: non regex implementation
451512 // is the match area too short?
452- subMatched = transform->match (input);
513+ subMatched = transform->apply (input, output );
453514 if (subMatched != 0 ) {
454- return &(*transform); // return alias to transform
515+ return subMatched; // matched. break out.
455516 }
456517 }
457- return nullptr ;
518+ return 0 ; // no match
458519}
459520
460521/* *
@@ -507,20 +568,14 @@ transforms::apply(const std::u32string &input, std::u32string &output) {
507568 // TODO-LDML: reorders
508569 // Assume it's a non reorder group
509570 /* * Length of match within this group*/
510- size_t subMatched = 0 ;
511571
512572 // find the first match in this group (if present)
513573 // TODO-LDML: check if reorder
514574 if (group->type == any_group_type::transform) {
515- auto entry = group->transform .match (updatedInput, subMatched);
516-
517- if (entry != nullptr ) {
518- // now apply the found transform
519-
520- // update subOutput (string) and subMatched
521- // the returned string must replace the last "subMatched" chars of the string.
522- std::u32string subOutput = entry->apply (updatedInput, subMatched);
575+ std::u32string subOutput;
576+ size_t subMatched = group->transform .apply (updatedInput, subOutput);
523577
578+ if (subMatched != 0 ) {
524579 // remove the matched part of the updatedInput
525580 updatedInput.resize (updatedInput.length () - subMatched); // chop of the subMatched part at end
526581 updatedInput.append (subOutput); // subOutput could be empty such as in backspace transform
0 commit comments