Skip to content

Commit 462802a

Browse files
authored
Merge pull request #9504 from keymanapp/feat/core/9121-regex-map-epic-ldml
feat(core): implement mapped set mapping 🙀
2 parents 4105667 + 07ded45 commit 462802a

File tree

7 files changed

+277
-41
lines changed

7 files changed

+277
-41
lines changed

core/src/kmx/kmx_plus.cpp

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "ldml/keyboardprocessor_ldml.h"
1414

1515
#include <assert.h>
16+
#include "kmx_plus.h"
1617

1718
namespace km {
1819
namespace kbp {
@@ -386,11 +387,35 @@ COMP_KMXPLUS_ELEM::getElementList(KMX_DWORD elementNumber, KMX_DWORD &length) co
386387

387388
// Converts this element's `element` value to a UTF-16 string via
// COMP_KMXPLUS_STRS::str_from_char(). Intended for CHAR-type elements only;
// the assert below checks the element type before converting.
std::u16string
388389
COMP_KMXPLUS_ELEM_ELEMENT::get_element_string() const {
389-
assert((flags & LDML_ELEM_FLAGS_TYPE) == LDML_ELEM_FLAGS_TYPE_CHAR); // should only be called on char
390+
assert(type() == LDML_ELEM_FLAGS_TYPE_CHAR); // should only be called on char
390391
return COMP_KMXPLUS_STRS::str_from_char(element);
391392
}
392393

393-
// Note: shared with subclass COMP_KMXPLUS_BKSP
394+
std::deque<std::u32string>
395+
COMP_KMXPLUS_ELEM_ELEMENT::loadAsStringList(KMX_DWORD length, const COMP_KMXPLUS_STRS &strs) const {
396+
std::deque<std::u32string> list;
397+
for (KMX_DWORD i = 0; i<length; i++) {
398+
const auto &o = this[i];
399+
std::u32string str;
400+
if (o.type() == LDML_ELEM_FLAGS_TYPE_STR) {
401+
// fetch the string
402+
const auto str16 = strs.get(o.element);
403+
str = km::kbp::kmx::u16string_to_u32string(str16);
404+
} else {
405+
// single char
406+
str = std::u32string(1, (km_kbp_usv)o.element);
407+
}
408+
list.emplace_back(str);
409+
}
410+
return list;
411+
}
412+
413+
/**
 * @return the type portion of `flags`, i.e. `flags` masked with
 *         LDML_ELEM_FLAGS_TYPE
 */
KMX_DWORD
COMP_KMXPLUS_ELEM_ELEMENT::type() const {
  const KMX_DWORD typeBits = (flags & LDML_ELEM_FLAGS_TYPE);
  return typeBits;
}
417+
418+
// Note: shared with subclass COMP_KMXPLUS_BKSP
394419
bool
395420
COMP_KMXPLUS_TRAN::valid(KMX_DWORD _kmn_unused(length)) const {
396421
if (header.size < sizeof(*this) + (sizeof(COMP_KMXPLUS_TRAN_GROUP) * groupCount) +
@@ -503,6 +528,18 @@ COMP_KMXPLUS_TRAN_Helper::setTran(const COMP_KMXPLUS_TRAN *newTran) {
503528
is_valid = false;
504529
assert(is_valid);
505530
}
531+
for(KMX_DWORD t = 0; is_valid && t < group.count; t++) {
532+
const auto &transform = transforms[group.index + t];
533+
if (transform.from == 0) {
534+
DebugLog("COMP_KMXPLUS_TRAN_Helper: transform [%d].[%d] has empty 'from' string", i, t);
535+
is_valid = false;
536+
assert(is_valid);
537+
} else if ((transform.mapFrom == 0) != (transform.mapTo == 0)) {
538+
DebugLog("COMP_KMXPLUS_TRAN_Helper: transform [%d].[%d] should have neither or both mapFrom=%d/mapTo=%d", i, t, transform.mapFrom, transform.mapTo);
539+
is_valid = false;
540+
assert(is_valid);
541+
}
542+
}
506543
} else if (group.type == LDML_TRAN_GROUP_TYPE_REORDER) {
507544
DebugLog(" .. type=reorder");
508545
if ((group.index >= tran->reorderCount) || (group.index + group.count > tran->reorderCount)) {
@@ -511,6 +548,15 @@ COMP_KMXPLUS_TRAN_Helper::setTran(const COMP_KMXPLUS_TRAN *newTran) {
511548
is_valid = false;
512549
assert(is_valid);
513550
}
551+
for(KMX_DWORD t = 0; is_valid && t < group.count; t++) {
552+
const auto &reorder = reorders[group.index + t];
553+
if (reorder.elements == 0) {
554+
DebugLog("COMP_KMXPLUS_TRAN_Helper: reorder [%d].[%d] has elements=0", i, t);
555+
// TODO-LDML: is this an error?
556+
// is_valid = false;
557+
// assert(is_valid);
558+
}
559+
}
514560
} else {
515561
DebugLog(" .. type=illegal 0x%X", group.type);
516562
is_valid = false;
@@ -1248,6 +1294,19 @@ COMP_KMXPLUS_VARS::valid(KMX_DWORD _kmn_unused(length)) const {
12481294
return true;
12491295
}
12501296

1297+
/**
 * Look up a variable entry by the string id stored in its `id` field.
 *
 * @param strId string id to search for; 0 always yields nullptr
 * @return pointer to the first entry in varEntries whose `id` equals
 *         `strId`, or nullptr when there is none
 */
const COMP_KMXPLUS_VARS_ITEM *COMP_KMXPLUS_VARS::findByStringId(KMX_DWORD strId) const {
  if (strId != 0) {
    // linear scan over the varCount entries
    KMX_DWORD remaining = varCount;
    for (const COMP_KMXPLUS_VARS_ITEM *entry = varEntries; remaining > 0; ++entry, --remaining) {
      if (entry->id == strId) {
        return entry;
      }
    }
  }
  return nullptr;
}
1308+
1309+
12511310

12521311
} // namespace kmx
12531312
} // namespace kbp

core/src/kmx/kmx_plus.h

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <kmx_file.h>
1313
#include <ldml/keyboardprocessor_ldml.h>
1414
#include <list>
15+
#include <deque>
1516

1617
namespace km {
1718
namespace kbp {
@@ -47,6 +48,7 @@ struct COMP_KMXPLUS_TRAN;
4748
struct COMP_KMXPLUS_TRAN_GROUP;
4849
struct COMP_KMXPLUS_TRAN_TRANSFORM;
4950
struct COMP_KMXPLUS_TRAN_REORDER;
51+
struct COMP_KMXPLUS_STRS;
5052

5153
struct COMP_KMXPLUS_HEADER {
5254
KMXPLUS_IDENT ident; // 0000 Section name
@@ -103,6 +105,16 @@ struct COMP_KMXPLUS_ELEM_ELEMENT {
103105
* @return std::u16string
104106
*/
105107
std::u16string get_element_string() const;
108+
109+
/**
110+
* @brief load this[0]…this[length-1] as a string list
111+
* @param length number of elements, including this one
112+
* @return the string elements as a string array
113+
*/
114+
std::deque<std::u32string> loadAsStringList(KMX_DWORD length, const km::kbp::kmx::COMP_KMXPLUS_STRS &strs) const;
115+
116+
/** @return element type */
117+
KMX_DWORD type() const;
106118
};
107119

108120
struct COMP_KMXPLUS_ELEM_ENTRY {
@@ -260,8 +272,8 @@ struct COMP_KMXPLUS_TRAN_GROUP {
260272
// One transform rule. `from` and `to` are string ids resolved through the
// string table. `mapFrom` and `mapTo` are string ids holding variable names;
// they are looked up with COMP_KMXPLUS_VARS::findByStringId() and are 0 when
// the rule has no set mapping (both set or both 0).
struct COMP_KMXPLUS_TRAN_TRANSFORM {
261273
KMXPLUS_STR from;
262274
KMXPLUS_STR to;
263-
KMXPLUS_ELEM mapFrom;
264-
KMXPLUS_ELEM mapTo;
275+
KMXPLUS_STR mapFrom; // variable name
276+
KMXPLUS_STR mapTo; // variable name
265277
};
266278

267279
struct COMP_KMXPLUS_TRAN_REORDER {
@@ -341,21 +353,23 @@ struct COMP_KMXPLUS_BKSP : public COMP_KMXPLUS_TRAN {
341353

342354
// One variable entry of the vars section.
//  - type:  variable kind; compared against LDML_VARS_ENTRY_TYPE_SET by callers
//  - id:    string id of the variable name (matched by findByStringId)
//  - value: string id (presumably the variable's value; confirm against the spec)
//  - elem:  element-list number, passed to COMP_KMXPLUS_ELEM::getElementList
struct COMP_KMXPLUS_VARS_ITEM {
343355
KMX_DWORD_unaligned type;
344-
KMX_DWORD_unaligned id;
345-
KMX_DWORD_unaligned value;
346-
KMX_DWORD_unaligned elem;
356+
KMXPLUS_STR id;
357+
KMXPLUS_STR value;
358+
KMXPLUS_ELEM elem;
347359
};
348360

349361
// The vars section: a header, a `markers` list reference, and `varCount`
// entries stored in the trailing `varEntries` array.
struct COMP_KMXPLUS_VARS {
350362
static const KMXPLUS_IDENT IDENT = LDML_SECTIONID_VARS;
351363
COMP_KMXPLUS_HEADER header;
352-
KMX_DWORD_unaligned markers;
364+
KMXPLUS_LIST markers;
353365
KMX_DWORD_unaligned varCount;
354366
COMP_KMXPLUS_VARS_ITEM varEntries[];
355367
/**
356368
* @brief True if section is valid.
357369
*/
358370
bool valid(KMX_DWORD length) const;
371+
372+
// Returns the entry whose `id` equals strId, or nullptr if strId is 0 or no entry matches.
const COMP_KMXPLUS_VARS_ITEM *findByStringId(KMX_DWORD strId) const;
359373
};
360374

361375
static_assert(sizeof(struct COMP_KMXPLUS_VARS) % 0x4 == 0, "Structs prior to variable part should align to 32-bit boundary");

core/src/ldml/ldml_transforms.cpp

Lines changed: 128 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -392,18 +392,74 @@ reorder_group::apply(std::u32string &str) const {
392392
return applied;
393393
}
394394

395-
transform_entry::transform_entry(const transform_entry &other) :
396-
fFrom(other.fFrom), fTo(other.fTo), fFromPattern(nullptr) {
395+
/**
 * Copy constructor. Strings, map string ids and map lists are copied
 * member-wise; the compiled pattern is duplicated with clone() so the
 * new entry owns its own pattern object.
 */
transform_entry::transform_entry(const transform_entry &other)
    : fFrom(other.fFrom),
      fTo(other.fTo),
      fFromPattern(nullptr),
      fMapFromStrId(other.fMapFromStrId),
      fMapToStrId(other.fMapToStrId),
      fMapFromList(other.fMapFromList),
      fMapToList(other.fMapToList) {
  if (!other.fFromPattern) {
    // source has no pattern: leave ours empty
    return;
  }
  fFromPattern.reset(other.fFromPattern->clone());
}
402403

403-
transform_entry::transform_entry(const std::u32string &from, const std::u32string &to) : fFrom(from), fTo(to) {
404+
/**
 * Construct a plain transform with no set mapping: the map string ids are
 * value-initialized and the map lists start empty.
 *
 * @param from source pattern text; asserted to be non-empty
 * @param to   replacement text
 */
transform_entry::transform_entry(const std::u32string &from, const std::u32string &to)
    : fFrom(from),
      fTo(to),
      fFromPattern(nullptr),
      fMapFromStrId(),
      fMapToStrId(),
      fMapFromList(),
      fMapToList() {
  assert(!fFrom.empty()); // TODO-LDML: should not happen?

  // shared setup based on fFrom
  init();
}
410+
411+
// TODO-LDML: How do we return errors from here?
412+
/**
 * Construct a transform that may carry a mapFrom/mapTo set mapping.
 *
 * When both map string ids are 0 this behaves like the plain constructor.
 * Otherwise the two named variables are looked up in kplus.vars, their
 * element lists are fetched from kplus.elem, and both lists are loaded
 * into fMapFromList / fMapToList as UTF-32 strings.
 *
 * All consistency checks here are asserts only.
 * TODO-LDML: How do we return errors from here?
 *
 * @param from    source pattern text; asserted to be non-empty
 * @param to      replacement text
 * @param mapFrom string id of the source set variable name, or 0
 * @param mapTo   string id of the target set variable name, or 0
 * @param kplus   loaded data; its strs, vars and elem are asserted non-null
 */
transform_entry::transform_entry(
    const std::u32string &from,
    const std::u32string &to,
    KMX_DWORD mapFrom,
    KMX_DWORD mapTo,
    const kmx::kmx_plus &kplus)
    : fFrom(from), fTo(to), fFromPattern(nullptr), fMapFromStrId(mapFrom), fMapToStrId(mapTo) {
  assert(!fFrom.empty()); // TODO-LDML: should not happen?
  assert((fMapFromStrId == 0) == (fMapToStrId == 0)); // we have both or we have neither.
  assert(kplus.strs != nullptr);
  assert(kplus.vars != nullptr);
  assert(kplus.elem != nullptr);
  init();

  if (fMapFromStrId == 0) {
    // no set mapping requested: nothing more to set up
    return;
  }

  // The variable names themselves are not needed for processing. If they
  // are ever wanted (e.g. for debug messages) they can be read with
  // kplus.strs->get(fMapFromStrId) and kplus.strs->get(fMapToStrId).

  // locate both variables by the string id of their name
  const auto *sourceVar = kplus.vars->findByStringId(fMapFromStrId);
  const auto *targetVar = kplus.vars->findByStringId(fMapToStrId);
  assert(sourceVar != nullptr);
  assert(targetVar != nullptr);

  // both variables are expected to be sets
  assert(sourceVar->type == LDML_VARS_ENTRY_TYPE_SET);
  assert(targetVar->type == LDML_VARS_ENTRY_TYPE_SET);

  // fetch the element list behind each set
  KMX_DWORD sourceCount, targetCount;
  const auto *sourceElems = kplus.elem->getElementList(sourceVar->elem, sourceCount);
  const auto *targetElems = kplus.elem->getElementList(targetVar->elem, targetCount);
  assert(sourceCount == targetCount);
  assert(sourceElems != nullptr);
  assert(targetElems != nullptr);

  // materialize both element lists as string deques
  fMapFromList = sourceElems->loadAsStringList(sourceCount, *(kplus.strs));
  fMapToList   = targetElems->loadAsStringList(targetCount, *(kplus.strs));

  // did we get the expected items?
  assert(fMapFromList.size() == sourceCount);
  assert(fMapToList.size() == targetCount);
}
458+
459+
void
460+
transform_entry::init() {
406461
if (!fFrom.empty()) {
462+
// TODO-LDML: if we have mapFrom, may need to do other processing.
407463
const std::u16string patstr = km::kbp::kmx::u32string_to_u16string(fFrom);
408464
UErrorCode status = U_ZERO_ERROR;
409465
/* const */ icu::UnicodeString patustr = icu::UnicodeString(patstr.data(), (int32_t)patstr.length());
@@ -421,7 +477,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
421477
// TODO-LDML: Also, we could cache the u16 string at the transformGroup level or higher.
422478
UErrorCode status = U_ZERO_ERROR;
423479
const std::u16string matchstr = km::kbp::kmx::u32string_to_u16string(input);
424-
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
480+
icu::UnicodeString matchustr = icu::UnicodeString(matchstr.data(), (int32_t)matchstr.length());
425481
// TODO-LDML: create a new Matcher every time. These could be cached and reset.
426482
std::unique_ptr<icu::RegexMatcher> matcher(fFromPattern->matcher(matchustr, status));
427483
assert(U_SUCCESS(status));
@@ -438,25 +494,65 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
438494
// extract..
439495
const icu::UnicodeString substr = matchustr.tempSubStringBetween(matchStart, matchEnd);
440496
// preflight to UTF-32 to get length
441-
UErrorCode substrStatus = U_ZERO_ERROR;
497+
UErrorCode substrStatus = U_ZERO_ERROR; // throwaway status
498+
// we need the UTF-32 matchLen for our return.
442499
auto matchLen = substr.toUTF32(nullptr, 0, substrStatus);
500+
501+
// should have matched something.
443502
assert(matchLen > 0);
444-
if (matchLen == 0) {
445-
return 0;
446-
}
447-
// Now, we have a matchLen.
448503

449504
// now, do the replace.
450-
// Convert the fTo into u16 TODO-LDML (we could cache this?)
451-
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(fTo);
452-
icu::UnicodeString rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
453-
// This replace will apply $1, $2 etc. TODO-LDML it will NOT handle mapFrom or mapTo.
505+
506+
/** this is the 'to' or other replacement string.*/
507+
icu::UnicodeString rustr;
508+
if (fMapFromStrId == 0) {
509+
// Normal case: not a map.
510+
// This replace will apply $1, $2 etc.
511+
// Convert the fTo into u16 TODO-LDML (we could cache this?)
512+
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(fTo);
513+
rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
514+
} else {
515+
// Set map case: mapping from/to
516+
517+
// we actually need the group(1) string here.
518+
// this is only the content in parenthesis ()
519+
icu::UnicodeString group1 = matcher->group(1, status);
520+
assert(U_SUCCESS(status)); // TODO-LDML: could be a malformed from pattern
521+
// now, how long is group1 in UTF-32, hmm?
522+
UErrorCode preflightStatus = U_ZERO_ERROR; // throwaway status
523+
auto group1Len = group1.toUTF32(nullptr, 0, preflightStatus);
524+
char32_t *s = new char32_t[group1Len + 1];
525+
assert(s != nullptr); // TODO-LDML: OOM
526+
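// convert to UTF-32. NOTE(review): the buffer is sized from group1, but the call below converts 'substr' (the whole match), not 'group1' — looks like it should be group1.toUTF32(...); confirm.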
527+
substr.toUTF32((UChar32 *)s, group1Len + 1, status);
528+
assert(U_SUCCESS(status));
529+
std::u32string match32(s, group1Len); // taken from just group1
530+
// clean up buffer
531+
delete [] s;
532+
533+
// Now we're ready to do the actual mapping.
534+
535+
// 1., we need to find the index in the source set.
536+
auto matchIndex = findIndexFrom(match32);
537+
assert(matchIndex != -1L); // TODO-LDML: not matching shouldn't happen, the regex wouldn't have matched.
538+
// we already asserted on load that the from and to sets have the same cardinality.
539+
540+
// 2. get the target string, convert to utf-16
541+
// we use the same matchIndex that was just found
542+
const std::u16string rstr = km::kbp::kmx::u32string_to_u16string(fMapToList.at(matchIndex));
543+
544+
// 3. update the UnicodeString for replacement
545+
rustr = icu::UnicodeString(rstr.data(), (int32_t)rstr.length());
546+
// and we return to the regular code flow.
547+
}
548+
// here we replace the match output.
454549
icu::UnicodeString entireOutput = matcher->replaceFirst(rustr, status);
455-
assert(U_SUCCESS(status));
550+
assert(U_SUCCESS(status)); // TODO-LDML: could fail here due to bad input (syntax err)
551+
456552
// entireOutput includes all of 'input', but modified. Need to substring it.
457553
icu::UnicodeString outu = entireOutput.tempSubString(matchStart);
458554

459-
// Special case if there's no output
555+
// Special case if there's no output, save some allocs
460556
if (outu.length() == 0) {
461557
output.clear();
462558
} else {
@@ -479,6 +575,20 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
479575
return matchLen;
480576
}
481577

578+
int32_t transform_entry::findIndexFrom(const std::u32string &match) const {
579+
return findIndex(match, fMapFromList);
580+
}
581+
582+
/**
 * Linear search for `match` in `list`.
 *
 * NOTE(review): `list` is taken by value, so every call copies the deque.
 * Changing it to a const reference would need the matching change in the
 * class declaration, which is not visible here.
 *
 * @param match string to look for
 * @param list  candidates, compared for exact equality
 * @return zero-based position of the first equal item, or -1 if not found
 */
int32_t transform_entry::findIndex(const std::u32string &match, const std::deque<std::u32string> list) {
  int32_t position = 0;
  for (const auto &candidate : list) {
    if (candidate == match) {
      return position;
    }
    ++position;
  }
  return -1; // not found
}
591+
482592
/** Wrap a transform group: tags the entry as a transform, copies `g`, and leaves `reorder` value-initialized. */
any_group::any_group(const transform_group &g)
    : type(any_group_type::transform),
      transform(g),
      reorder() {
}
484594

@@ -686,16 +796,9 @@ transforms::load(
686796
const kmx::COMP_KMXPLUS_TRAN_TRANSFORM *element = tranHelper.getTransform(group->index + itemNumber);
687797
const std::u32string fromStr = kmx::u16string_to_u32string(kplus.strs->get(element->from));
688798
const std::u32string toStr = kmx::u16string_to_u32string(kplus.strs->get(element->to));
689-
std::u16string mapFrom, mapTo;
690-
691-
if (element->mapFrom && element->mapTo) {
692-
// strings: variable name of from/to
693-
// TODO-LDML: not implemented
694-
mapFrom = kplus.strs->get(element->mapFrom);
695-
mapTo = kplus.strs->get(element->mapTo);
696-
}
697-
698-
newGroup.emplace_back(fromStr, toStr /* ,mapFrom, mapTo */); // creating a transform_entry
799+
KMX_DWORD mapFrom = element->mapFrom; // copy, because of alignment
800+
KMX_DWORD mapTo = element->mapTo; // copy, because of alignment
801+
newGroup.emplace_back(fromStr, toStr, mapFrom, mapTo, kplus); // creating a transform_entry
699802
}
700803
transforms->addGroup(newGroup);
701804
} else if (group->type == LDML_TRAN_GROUP_TYPE_REORDER) {

0 commit comments

Comments
 (0)