@@ -415,9 +415,10 @@ transform_entry::transform_entry(const std::u32string &from, const std::u32strin
415415}
416416
417417size_t
418- transform_entry::match (const std::u32string &input) const {
418+ transform_entry::apply (const std::u32string &input, std::u32string &output ) const {
419419 assert (fFromPattern );
420420 // TODO-LDML: Really? can't go from u32 to UnicodeString?
421+ // TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher.
421422 UErrorCode status = U_ZERO_ERROR;
422423 const std::u16string matchstr = km::kbp::kmx::u32string_to_u16string (input);
423424 icu::UnicodeString matchustr = icu::UnicodeString (matchstr.data (), (int32_t )matchstr.length ());
@@ -426,67 +427,56 @@ transform_entry::match(const std::u32string &input) const {
426427 assert (U_SUCCESS (status));
427428
428429 if (!matcher->find (status)) { // i.e. matches somewhere, in this case at end of str
429- return 0 ; // and tear everything down
430+ return 0 ; // no match
430431 }
431432
432433 // TODO-LDML: this is UTF-16 len, not UTF-32 len!!
433- // auto matchLen = matcher->end64(status) - matcher->start64(status);
434434 // TODO-LDML: if we had an underlying UText this would be simpler.
435- auto matchStart = matcher->start64 (status);
436- auto matchEnd = matcher->end64 (status);
435+ int32_t matchStart = matcher->start (status);
436+ int32_t matchEnd = matcher->end (status);
437+ assert (U_SUCCESS (status));
437438 // extract..
438- const icu::UnicodeString substr = matchustr.tempSubStringBetween (( int32_t ) matchStart, ( int32_t ) matchEnd);
439+ const icu::UnicodeString substr = matchustr.tempSubStringBetween (matchStart, matchEnd);
439440 // preflight to UTF-32 to get length
440- auto matchLen = substr.toUTF32 (nullptr , 0 , status);
441-
442- return matchLen;
443- }
444-
445- std::u32string
446- transform_entry::apply (const std::u32string &input, size_t matchLen) const {
447- // TODO-LDML: and if you thought the previous function was suboptimal,
448- // TODO-LDML: We should cache the RegexMatcher from the previous call.
449- // TODO-LDML: Really? can't go from u32 to UnicodeString?
450-
451- assert (fFromPattern );
452- // TODO-LDML: simple approach, create a new Matcher every time. These could be cached and reset.
453- // TODO-LDML: Really? can't go from u32 to UnicodeString?
454- UErrorCode status = U_ZERO_ERROR;
441+ UErrorCode substrStatus = U_ZERO_ERROR;
442+ auto matchLen = substr.toUTF32 (nullptr , 0 , substrStatus);
443+ assert (matchLen > 0 );
444+ if (matchLen == 0 ) {
445+ return 0 ;
446+ }
447+ // Now, we have a matchLen.
455448
456- // we know the matchLen so we can slice the string…
457- // Note: the matchstr here (unlike in transform_entry::match) is sliced so that
458- // it only includes the matched portion. This way, when changed it's suitable as the
459- // output string.
460- const std::u16string matchstr = km::kbp::kmx::u32string_to_u16string (input.substr (input.length () - matchLen, matchLen));
461- icu::UnicodeString matchustr = icu::UnicodeString (matchstr.data (), (int32_t )matchstr.length ());
462- std::unique_ptr<icu::RegexMatcher> matcher (fFromPattern ->matcher (matchustr, status));
463- assert (U_SUCCESS (status));
464- // now, do the replace
449+ // now, do the replace.
450+ // Convert the fTo into u16 TODO-LDML (we could cache this?)
465451 const std::u16string rstr = km::kbp::kmx::u32string_to_u16string (fTo );
466452 icu::UnicodeString rustr = icu::UnicodeString (rstr.data (), (int32_t )rstr.length ());
467453 // This replace will apply $1, $2 etc. TODO-LDML it will NOT handle mapFrom or mapTo.
468- icu::UnicodeString output = matcher->replaceFirst (rustr, status);
454+ icu::UnicodeString entireOutput = matcher->replaceFirst (rustr, status);
469455 assert (U_SUCCESS (status));
456+ // entireOutput includes all of 'input', but modified. Need to substring it.
457+ icu::UnicodeString outu = entireOutput.tempSubString (matchStart);
470458
471- if (output.length () == 0 ) {
472- return std::u32string (); // special case of a zero length output (such as delete)
459+ // Special case if there's no output
460+ if (outu.length () == 0 ) {
461+ output.clear ();
462+ } else {
463+ // TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps.
464+ UErrorCode preflightStatus = U_ZERO_ERROR;
465+ // calculate how big the buffer is
466+ auto out32len = outu.toUTF32 (nullptr , 0 , preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes
467+ // allocate
468+ char32_t *s = new char32_t [out32len + 1 ];
469+ assert (s != nullptr );
470+ // convert
471+ outu.toUTF32 ((UChar32 *)s, out32len + 1 , status);
472+ assert (U_SUCCESS (status));
473+ output.assign (s, out32len);
474+ // now, build a u32string
475+ std::u32string out32 (s, out32len);
476+ // clean up buffer
477+ delete [] s;
473478 }
474-
475- // TODO-LDML: All we are trying to do is to extract the output string. Probably too many steps.
476- UErrorCode preflightStatus = U_ZERO_ERROR;
477- // calculate how big the buffer is
478- auto out32len = output.toUTF32 (nullptr , 0 , preflightStatus); // preflightStatus will be an err, because we know the buffer overruns zero bytes
479- // allocate
480- char32_t *s = new char32_t [out32len + 1 ];
481- assert (s != nullptr );
482- // convert
483- output.toUTF32 ((UChar32 *)s, out32len + 1 , status);
484- assert (U_SUCCESS (status));
485- // now, build a u32string
486- std::u32string out32 (s, out32len);
487- // clean up buffer
488- delete [] s;
489- return out32;
479+ return matchLen;
490480}
491481
492482any_group::any_group (const transform_group &g) : type(any_group_type::transform), transform(g), reorder() {
@@ -514,17 +504,18 @@ transform_group::transform_group() {
514504/* *
515505 * return the first transform match in this group
516506 */
517- const transform_entry *
518- transform_group::match (const std::u32string &input, size_t &subMatched) const {
507+ size_t
508+ transform_group::apply (const std::u32string &input, std::u32string &output) const {
509+ size_t subMatched = 0 ;
519510 for (auto transform = begin (); (subMatched == 0 ) && (transform < end ()); transform++) {
520511 // TODO-LDML: non regex implementation
521512 // is the match area too short?
522- subMatched = transform->match (input);
513+ subMatched = transform->apply (input, output );
523514 if (subMatched != 0 ) {
524- return &(*transform); // return alias to transform
515+ return subMatched; // matched. break out.
525516 }
526517 }
527- return nullptr ;
518+ return 0 ; // no match
528519}
529520
530521/* *
@@ -577,20 +568,14 @@ transforms::apply(const std::u32string &input, std::u32string &output) {
577568 // TODO-LDML: reorders
578569 // Assume it's a non reorder group
579570 /* * Length of match within this group*/
580- size_t subMatched = 0 ;
581571
582572 // find the first match in this group (if present)
583573 // TODO-LDML: check if reorder
584574 if (group->type == any_group_type::transform) {
585- auto entry = group->transform .match (updatedInput, subMatched);
586-
587- if (entry != nullptr ) {
588- // now apply the found transform
589-
590- // update subOutput (string) and subMatched
591- // the returned string must replace the last "subMatched" chars of the string.
592- std::u32string subOutput = entry->apply (updatedInput, subMatched);
575+ std::u32string subOutput;
576+ size_t subMatched = group->transform .apply (updatedInput, subOutput);
593577
578+ if (subMatched != 0 ) {
594579 // remove the matched part of the updatedInput
595580 updatedInput.resize (updatedInput.length () - subMatched); // chop of the subMatched part at end
596581 updatedInput.append (subOutput); // subOutput could be empty such as in backspace transform
0 commit comments