Skip to content

Commit bbb8dad

Browse files
authored
Merge pull request #9448 from keymanapp/feat/core/9119-marker-avoid-surrogate-cliff-epic-ldml
feat(core): limit max marker to 0xD7FF 🙀
2 parents 0171ef9 + 9f03327 commit bbb8dad

File tree

5 files changed

+20
-13
lines changed

5 files changed

+20
-13
lines changed

core/include/ldml/keyboardprocessor_ldml.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,10 @@
9393
#define LDML_LENGTH_VARS_ITEM 0x10
9494
#define LDML_LENGTH_VKEY 0xC
9595
#define LDML_LENGTH_VKEY_ITEM 0x8
96-
#define LDML_MARKER_ANY_INDEX 0xFFFE
96+
#define LDML_MARKER_ANY_INDEX 0xD7FF
9797
#define LDML_MARKER_CODE 0x8
98-
#define LDML_MARKER_MAX_COUNT 0xFFFC
99-
#define LDML_MARKER_MAX_INDEX 0xFFFD
98+
#define LDML_MARKER_MAX_COUNT 0xD7FE
99+
#define LDML_MARKER_MAX_INDEX 0xD7FE
100100
#define LDML_MARKER_MIN_INDEX 0x1
101101
#define LDML_META_SETTINGS_FALLBACK_OMIT 0x1
102102
#define LDML_META_SETTINGS_TRANSFORMFAILURE_OMIT 0x2

core/include/ldml/keyboardprocessor_ldml.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,11 +622,11 @@ class Constants {
622622
/** minimum usable marker index */
623623
readonly marker_min_index = 0x0001;
624624
/** index value referring to the 'any' marker match */
625-
readonly marker_any_index = 0xFFFE;
625+
readonly marker_any_index = 0xD7FF;
626626
/** maximum marker index prior to the 'any' value */
627627
readonly marker_max_index = this.marker_any_index - 1;
628628
/** maximum count of markers (not including 'any') */
629-
readonly marker_max_count = this.marker_max_index - this.marker_min_index;
629+
readonly marker_max_count = this.marker_max_index - this.marker_min_index + 1;
630630

631631
};
632632

core/src/kmx/kmx_plus.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,11 @@ static_assert(LDML_UC_SENTINEL == UC_SENTINEL, "mismatch: LDML_UC_SENTINEL");
319319
static_assert(LDML_MARKER_CODE == CODE_DEADKEY, "mismatch: LDML_MARKER_CODE");
320320
static_assert(LDML_MARKER_ANY_INDEX < UC_SENTINEL, "expected LDML_MARKER_ANY_INDEX < UC_SENTINEL");
321321

322+
/**
 * Checks whether a marker number is acceptable per the LDML marker spec.
 *
 * A marker number is accepted when it is either the special 'any marker'
 * value (LDML_MARKER_ANY_INDEX) or an ordinary marker index within the
 * inclusive range LDML_MARKER_MIN_INDEX..LDML_MARKER_MAX_INDEX.
 *
 * @param marker_no  marker number to check
 * @returns true if a valid marker per spec
 */
323+
static inline bool is_valid_marker(KMX_DWORD marker_no) {
324+
return ((marker_no == LDML_MARKER_ANY_INDEX) || (marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_MAX_INDEX));
325+
}
326+
322327
/* ------------------------------------------------------------------
323328
* bksp section
324329
------------------------------------------------------------------ */

core/src/ldml/C9134_ldml_markers.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@ Markers can appear in both 'emitting' and 'matching-only' areas:
3434
- Keyman already uses `UC_SENTINEL` `U+FFFF` (noncharacter), with `CODE_DEADKEY` (0x0008)
3535
- The general proposal here is to use the sequence `U+FFFF U+0008 U+XXXX` to represent marker #XXXX (starting with `U+0001`)
3636
- `U+FFFF` cannot otherwise occur in text, so it is unique
37-
- `U+FFFF U+0008 U+FFFE` to indicate 'any marker' corresponds to `\m{.}`
38-
- This scheme allows for 65,533 (0xFFFD) unique markers, from `U+FFFF U+0008 U+0001` through `U+FFFF U+0008 U+FFFD`
37+
- `U+FFFF U+0008 U+D7FF` indicates 'any marker', which corresponds to `\m{.}`
38+
- The max marker identifier will be `0xD7FE`, with `0xD7FF` reserved to represent 'any marker' if that is needed in the text stream.
39+
- This scheme allows for 55,294 unique markers, from `U+FFFF U+0008 U+0001` through `U+FFFF U+0008 U+D7FE` inclusive.
40+
- This scheme avoids the Unicode surrogate range beginning at `U+D800`, as well as noncharacters such as `U+FFFE`.
3941

4042
## Terminology
4143
- A marker's "number" is its position in the `markers` list, starting at index 1 (U+0001) being the first element in that list.

core/src/ldml/ldml_processor.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -373,8 +373,6 @@ ldml_processor::emit_text(km_kbp_state *state, const std::u32string &str) {
373373
it++; // consume LDML_MARKER_CODE
374374
assert(it < str.end());
375375
const auto marker_no = *it;
376-
assert(marker_no >= LDML_MARKER_MIN_INDEX);
377-
assert(marker_no <= LDML_MARKER_ANY_INDEX);
378376
emit_marker(state, marker_no);
379377
} else {
380378
emit_text(state, ch);
@@ -389,10 +387,11 @@ ldml_processor::emit_text(km_kbp_state *state, km_kbp_usv ch) {
389387
state->actions().push_character(ch);
390388
}
391389

392-
void ldml_processor::emit_marker(km_kbp_state *state, KMX_DWORD marker_no) {
393-
// OK, push the marker
394-
state->actions().push_marker(marker_no);
395-
state->context().push_marker(marker_no);
390+
/**
 * Emits a marker by pushing it onto both the state's action list and the
 * state's context.
 *
 * The marker number is checked with is_valid_marker() via assert only, so
 * the check is absent when assertions are compiled out.
 *
 * @param state      state whose actions and context receive the marker
 * @param marker_no  marker number to emit
 */
void
391+
ldml_processor::emit_marker(km_kbp_state *state, KMX_DWORD marker_no) {
392+
assert(km::kbp::kmx::is_valid_marker(marker_no));
393+
state->actions().push_marker(marker_no);
394+
state->context().push_marker(marker_no);
396395
}
397396

398397
size_t
@@ -406,6 +405,7 @@ ldml_processor::context_to_string(km_kbp_state *state, std::u32string &str) {
406405
if (last_type == KM_KBP_BT_CHAR) {
407406
str.insert(0, 1, c->character);
408407
} else if (last_type == KM_KBP_BT_MARKER) {
408+
assert(km::kbp::kmx::is_valid_marker(c->marker));
409409
prepend_marker(str, c->marker);
410410
} else {
411411
break;

0 commit comments

Comments
 (0)