Skip to content

Commit d88c04d

Browse files
authored
Merge pull request #9374 from keymanapp/feat/developer/9119-marker-update-spec-epic-ldml
chore(developer): update markers per review 🙀
2 parents db8e829 + 6632aaf commit d88c04d

File tree

8 files changed

+52
-32
lines changed

8 files changed

+52
-32
lines changed

common/web/types/src/ldml-keyboard/pattern-parser.ts

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,11 @@ export class MarkerParser {
5050
/**
5151
* Marker sentinel as a string - U+FFFF
5252
*/
53-
public static readonly SENTINEL = String.fromCodePoint(constants.marker_sentinel);
54-
53+
public static readonly SENTINEL = String.fromCodePoint(constants.uc_sentinel);
5554
/**
56-
* Matches all markers.
55+
* Marker code as a string - U+0008
5756
*/
58-
public static readonly SENTINEL_ALL_MARKERS = this.SENTINEL + this.SENTINEL;
57+
public static readonly MARKER_CODE = String.fromCodePoint(constants.marker_code);
5958

6059
/** Minimum ID (trailing code unit) */
6160
public static readonly MIN_MARKER_INDEX = constants.marker_min_index;
@@ -88,15 +87,15 @@ export class MarkerParser {
8887
if (n < MarkerParser.MIN_MARKER_INDEX || n > MarkerParser.ANY_MARKER_INDEX) {
8988
throw RangeError(`Internal Error: marker index out of range ${n}`);
9089
}
91-
return this.SENTINEL + String.fromCharCode(n);
90+
return this.SENTINEL + this.MARKER_CODE + String.fromCharCode(n);
9291
}
9392

9493
/** @returns all marker strings as sentinel values */
9594
public static toSentinelString(s: string, markers?: OrderedStringList) : string {
9695
if (!s) return s;
9796
return s.replaceAll(this.REFERENCE, (sub, arg) => {
9897
if (arg === MarkerParser.ANY_MARKER_ID) {
99-
return MarkerParser.SENTINEL_ALL_MARKERS;
98+
return MarkerParser.markerOutput(MarkerParser.ANY_MARKER_INDEX);
10099
}
101100
if (!markers) {
102101
throw RangeError(`Internal Error: Could not find marker \\m{${arg}} (no markers defined)`);

common/web/types/test/ldml-keyboard/test-pattern-parser.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import 'mocha';
22
import { assert } from 'chai';
33
import { ElementParser, ElementSegment, ElementType, MarkerParser, OrderedStringList, VariableParser } from '../../src/ldml-keyboard/pattern-parser.js';
4+
import { constants } from '@keymanapp/ldml-keyboard-constants';
5+
import { KMXFile } from '../../src/kmx/kmx.js';
46

57
describe('Test of Pattern Parsers', () => {
68
describe('should test MarkerParser', () => {
@@ -52,8 +54,8 @@ describe('Test of Pattern Parsers', () => {
5254
}
5355
});
5456
it('should be able to emit sentinel values', () => {
55-
assert.equal(MarkerParser.markerOutput(295), '\uFFFF\u0127', 'Wrong sentinel value emitted');
56-
assert.equal(MarkerParser.markerOutput(MarkerParser.ANY_MARKER_INDEX), MarkerParser.SENTINEL_ALL_MARKERS, 'Wrong sentinel value emitted for ffff');
57+
assert.equal(MarkerParser.markerOutput(295), '\uFFFF\u0008\u0127', 'Wrong sentinel value emitted');
58+
assert.equal(MarkerParser.markerOutput(MarkerParser.ANY_MARKER_INDEX), '\uFFFF\u0008\uFFFE', 'Wrong sentinel value emitted for ffff');
5759
assert.throws(() => MarkerParser.markerOutput(0)); // below MIN
5860
assert.throws(() => MarkerParser.markerOutput(0x10000)); // above MAX
5961
});
@@ -87,7 +89,7 @@ describe('Test of Pattern Parsers', () => {
8789
);
8890
assert.equal(MarkerParser.toSentinelString(
8991
`Give me \\m{a} and \\m{c}, or \\m{.}.`, markers),
90-
`Give me \uFFFF\u0001 and \uFFFF\u0003, or \uFFFF\uFFFF.`
92+
`Give me \uFFFF\u0008\u0001 and \uFFFF\u0008\u0003, or \uFFFF\u0008\uFFFE.`
9193
);
9294
assert.throws(() =>
9395
MarkerParser.toSentinelString(
@@ -102,6 +104,10 @@ describe('Test of Pattern Parsers', () => {
102104
)
103105
);
104106
});
107+
it('should match some marker constants', () => {
108+
assert.equal(constants.uc_sentinel, KMXFile.UC_SENTINEL);
109+
assert.equal(constants.marker_code, KMXFile.CODE_DEADKEY);
110+
});
105111
});
106112
describe('should test VariableParser', () => {
107113
// same test as for markers

core/include/ldml/keyboardprocessor_ldml.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,11 @@
9393
#define LDML_LENGTH_VARS_ITEM 0x10
9494
#define LDML_LENGTH_VKEY 0xC
9595
#define LDML_LENGTH_VKEY_ITEM 0x8
96-
#define LDML_MARKER_ANY_INDEX 0xFFFF
97-
#define LDML_MARKER_MAX_COUNT 0xFFFD
98-
#define LDML_MARKER_MAX_INDEX 0xFFFE
96+
#define LDML_MARKER_ANY_INDEX 0xFFFE
97+
#define LDML_MARKER_CODE 0x8
98+
#define LDML_MARKER_MAX_COUNT 0xFFFC
99+
#define LDML_MARKER_MAX_INDEX 0xFFFD
99100
#define LDML_MARKER_MIN_INDEX 0x1
100-
#define LDML_MARKER_SENTINEL 0xFFFF
101101
#define LDML_META_SETTINGS_FALLBACK_OMIT 0x1
102102
#define LDML_META_SETTINGS_TRANSFORMFAILURE_OMIT 0x2
103103
#define LDML_META_SETTINGS_TRANSFORMPARTIAL_HIDE 0x4
@@ -134,6 +134,7 @@
134134
#define LDML_TRAN_FLAGS_ERROR 0x1
135135
#define LDML_TRAN_GROUP_TYPE_REORDER 0x1
136136
#define LDML_TRAN_GROUP_TYPE_TRANSFORM 0x0
137+
#define LDML_UC_SENTINEL 0xFFFF
137138
#define LDML_VARS_ENTRY_TYPE_SET 0x1
138139
#define LDML_VARS_ENTRY_TYPE_STRING 0x0
139140
#define LDML_VARS_ENTRY_TYPE_UNICODESET 0x2

core/include/ldml/keyboardprocessor_ldml.ts

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -615,16 +615,18 @@ class Constants {
615615
}
616616

617617
// ---- marker stuff ----
618-
/** sentinel value indicating a marker follows */
619-
readonly marker_sentinel = 0xFFFF;
618+
/** == kmx_file.UC_SENTINEL, always followed by `marker_code`, marker index 0x0001-0xfffe */
619+
readonly uc_sentinel = 0xFFFF;
620+
/** == kmx_file.CODE_DEADKEY */
621+
readonly marker_code = 0x0008;
620622
/** minimum usable marker index */
621-
readonly marker_min_index = 0x0001;
623+
readonly marker_min_index = 0x0001;
622624
/** index value referring to the 'any' marker match */
623-
readonly marker_any_index = 0xFFFF;
625+
readonly marker_any_index = 0xFFFE;
624626
/** maximum marker index prior to the 'any' value */
625-
readonly marker_max_index = this.marker_any_index - 1;
627+
readonly marker_max_index = this.marker_any_index - 1;
626628
/** maximum count of markers (not including 'any') */
627-
readonly marker_max_count = this.marker_max_index - this.marker_min_index;
629+
readonly marker_max_count = this.marker_max_index - this.marker_min_index;
628630

629631
};
630632

core/src/kmx/kmx_plus.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,10 @@ static_assert(sizeof(struct COMP_KMXPLUS_TRAN_GROUP) == LDML_LENGTH_TRAN_GROUP,
310310
static_assert(sizeof(struct COMP_KMXPLUS_TRAN_TRANSFORM) == LDML_LENGTH_TRAN_TRANSFORM, "mismatched size of tran transform");
311311
static_assert(sizeof(struct COMP_KMXPLUS_TRAN_REORDER) == LDML_LENGTH_TRAN_REORDER, "mismatched size of tran reorder");
312312

313+
// assert some parallel constants
314+
static_assert(LDML_UC_SENTINEL == UC_SENTINEL, "mismatch: LDML_UC_SENTINEL");
315+
static_assert(LDML_MARKER_CODE == CODE_DEADKEY, "mismatch: LDML_MARKER_CODE");
316+
static_assert(LDML_MARKER_ANY_INDEX < UC_SENTINEL, "expected LDML_MARKER_ANY_INDEX < UC_SENTINEL");
313317

314318
/* ------------------------------------------------------------------
315319
* bksp section

core/src/ldml/C9134_ldml_markers.md

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ Markers can appear in both 'emitting' and 'matching-only' areas:
3131

3232
## Theory / Encoding
3333

34-
- Keyman already uses U_SENTINEL `U+FFFF` (noncharacter)
35-
- The general proposal here is to use the sequence `U+FFFF U+XXXX` to represent marker #XXXX (starting with `U+0001`)
34+
- Keyman already uses `UC_SENTINEL` `U+FFFF` (noncharacter), with `CODE_DEADKEY` (0x0008)
35+
- The general proposal here is to use the sequence `U+FFFF U+0008 U+XXXX` to represent marker #XXXX (starting with `U+0001`)
3636
- `U+FFFF` cannot otherwise occur in text, so it is unique
37-
- `U+FFFF U+FFFF` to indicate 'any marker' corresponds to `\m{.}`
38-
- This scheme allows for 65,534 (0xFFFE) unique markers, from `U+FFFF U+0001` through `U+FFFF U+FFFE`
37+
- `U+FFFF U+0008 U+FFFE` to indicate 'any marker' corresponds to `\m{.}`
38+
- This scheme allows for 65,533 (0xFFFD) unique markers, from `U+FFFF U+0008 U+0001` through `U+FFFF U+0008 U+FFFD`
3939

4040
## Terminology
4141
- A marker's "number" is its position in the `markers` list, starting at index 1 (U+0001) being the first element in that list.
@@ -55,20 +55,20 @@ Note that this is different from other 0-based indices in KMX+. If there are thr
5555
### Other sections
5656

5757
- `string value='\m{…}'` will simply store `\m{…}`, for application when expanded as with other variables.
58-
- Other emitters, such as `key`, `transform` will include the string `U+FFFF U+XXXX` where XXXX corresponds to the marker's 1-based number.
59-
- Transforms will need to match against the marker or markers desired, so may need to emit sequences such as `(?:\uFFFF\u0123)` meaning a match to marker #0x0123
60-
- matching `\m{.}` may then expand to `(?:\uFFFF.)`
58+
- Other emitters, such as `key`, `transform` will include the string `U+FFFF U+0008 U+XXXX` where XXXX corresponds to the marker's 1-based number.
59+
- Transforms will need to match against the marker or markers desired, so may need to emit sequences such as `(?:\uFFFF\u0008\u0123)` meaning a match to marker #0x0123
60+
- matching `\m{.}` may then expand to `(?:\uFFFF\u0008.)` (match a single codepoint after `UC_SENTINEL CODE_DEADKEY`)
6161

6262
## Binary (.kmx plus)
6363

6464
- The `vars.markers` is a pointer into the `list` section with a list (binary order) of the marker names
65-
- Other strings will be in `U+FFFF U+0123` form etc. as if it was in the original text stream as such.
65+
- Other strings will be in `U+FFFF U+0008 U+0123` form etc. as if it was in the original text stream as such.
6666

6767
## Implementation (core)
6868

69-
- Core needs to recognize `U+FFFF …` sequences and convert them to markers in the context stream, with `state->context().push_marker(marker_number)`
70-
- For normal processing, Core does _not_ need to correlate the marker _number_ with a marker _id_, although this would be helpful for a debugging or tracing facility. I.e. `U+FFFF U+0123` corresponding to entry 0x0123 in the `vars.markers` -> `list` table.
71-
- Core needs to remove `U+FFFF …` sequences before they are passed to the OS.
69+
- Core needs to recognize `U+FFFF U+0008 ` sequences and convert them to markers in the context stream, with `state->context().push_marker(marker_number)`
70+
- For normal processing, Core does _not_ need to correlate the marker _number_ with a marker _id_, although this would be helpful for a debugging or tracing facility. I.e. `U+FFFF U+0008 U+0123` corresponding to entry 0x0123 in the `vars.markers` -> `list` table.
71+
- Core needs to remove `U+FFFF U+0008 ` sequences before they are passed to the OS.
7272

7373
- Transform processing needs to recognize these markers in the context stream and pass them to the transforms appropriately.
7474
- User-defined backspace processing `<transform type="backspace"/> may specifically operate on backspaces in the context stream, just as with other transform processing.

developer/src/kmc-ldml/test/fixtures/basic.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ block(strs) # struct COMP_KMXPLUS_STRS {
465465
block(strKeys) 90 17 b6 17 block(x) 00 00 # 'ថា'
466466
# <reorder before="ᩫ" from="᩠᩵ᩅ" order="10 55 10" />
467467
block(strIndicator) 3d d8 40 de block(x) 00 00 # '🙀'
468-
block(strSentinel0001) FF FF 01 00 block(x) 00 00 # U+FFFF U+0001
468+
block(strSentinel0001) FF FF 08 00 01 00 block(x) 00 00 # UC_SENTINEL CODE_DEADKEY U+0001
469469

470470

471471

developer/src/kmc-ldml/test/test-vars.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ import { VarsCompiler } from '../src/compiler/vars.js';
44
import { CompilerMessages } from '../src/compiler/messages.js';
55
import { CompilerMessages as KmnCompilerMessages } from '@keymanapp/kmc-kmn';
66
import { testCompilationCases } from './helpers/index.js';
7-
import { KMXPlus } from '@keymanapp/common-types';
7+
import { KMXPlus, KMX } from '@keymanapp/common-types';
88
import { BASIC_DEPENDENCIES } from '../src/compiler/empty-compiler.js';
9+
import { constants } from '@keymanapp/ldml-keyboard-constants';
10+
911
// now that 'everything' depends on vars, we need an explicit dependency here
1012
const varsDependencies = BASIC_DEPENDENCIES.filter(c => c !== VarsCompiler);
1113

@@ -186,6 +188,12 @@ describe('vars', function () {
186188
],
187189
},
188190
], varsDependencies);
191+
describe('should match some marker constants', () => {
192+
// neither of these live here, but, common/web/types does not import ldml-keyboard-constants otherwise.
193+
194+
assert.equal(constants.uc_sentinel, KMX.KMXFile.UC_SENTINEL);
195+
assert.equal(constants.marker_code, KMX.KMXFile.CODE_DEADKEY);
196+
});
189197
describe('markers', function () {
190198
this.slow(500); // 0.5 sec -- json schema validation takes a while
191199

0 commit comments

Comments
 (0)