Skip to content

Commit 75afafd

Browse files
feat(mf2): Allow bidi controls (ALM/LRM/RLM/LRI/RLI/FSI/PDI) in whitespace & around names (unicode-org/message-format-wg#884)
1 parent ccc0331 commit 75afafd

File tree

8 files changed

+110
-80
lines changed

8 files changed

+110
-80
lines changed

packages/mf2-messageformat/src/cst/declarations.ts

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import { parseNameValue } from './names.js';
21
import { parseExpression } from './expression.js';
32
import type { ParseContext } from './parse-cst.js';
43
import type * as CST from './types.js';
@@ -16,24 +15,22 @@ export function parseDeclarations(
1615
let pos = start;
1716
const declarations: CST.Declaration[] = [];
1817
loop: while (source[pos] === '.') {
19-
const keyword = parseNameValue(source, pos + 1);
18+
const keyword = source.substr(pos, 6);
2019
let decl;
2120
switch (keyword) {
22-
case '':
23-
case 'match':
21+
case '.match':
2422
break loop;
25-
case 'input':
23+
case '.input':
2624
decl = parseInputDeclaration(ctx, pos);
2725
break;
28-
case 'local':
26+
case '.local':
2927
decl = parseLocalDeclaration(ctx, pos);
3028
break;
3129
default:
3230
decl = parseDeclarationJunk(ctx, pos);
3331
}
3432
declarations.push(decl);
35-
pos = decl.end;
36-
pos += whitespaces(source, pos);
33+
pos = whitespaces(source, decl.end).end;
3734
}
3835
return { declarations, end: pos };
3936
}
@@ -45,7 +42,7 @@ function parseInputDeclaration(
4542
//
4643
let pos = start + 6; // '.input'
4744
const keyword: CST.Syntax<'.input'> = { start, end: pos, value: '.input' };
48-
pos += whitespaces(ctx.source, pos);
45+
pos = whitespaces(ctx.source, pos).end;
4946

5047
const value = parseDeclarationValue(ctx, pos);
5148
if (value.type === 'expression') {
@@ -66,9 +63,9 @@ function parseLocalDeclaration(
6663
let pos = start + 6; // '.local'
6764
const keyword: CST.Syntax<'.local'> = { start, end: pos, value: '.local' };
6865
const ws = whitespaces(source, pos);
69-
pos += ws;
66+
pos = ws.end;
7067

71-
if (ws === 0) ctx.onError('missing-syntax', pos, ' ');
68+
if (!ws.hasWS) ctx.onError('missing-syntax', pos, ' ');
7269

7370
let target: CST.VariableRef | CST.Junk;
7471
if (source[pos] === '$') {
@@ -87,7 +84,7 @@ function parseLocalDeclaration(
8784
ctx.onError('missing-syntax', junkStart, '$');
8885
}
8986

90-
pos += whitespaces(source, pos);
87+
pos = whitespaces(source, pos).end;
9188
let equals: CST.Syntax<'='> | undefined;
9289
if (source[pos] === '=') {
9390
equals = { start: pos, end: pos + 1, value: '=' };
@@ -96,7 +93,7 @@ function parseLocalDeclaration(
9693
ctx.onError('missing-syntax', pos, '=');
9794
}
9895

99-
pos += whitespaces(source, pos);
96+
pos = whitespaces(source, pos).end;
10097
const value = parseDeclarationValue(ctx, pos);
10198

10299
return {

packages/mf2-messageformat/src/cst/expression.ts

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ export function parseExpression(
1111
): CST.Expression {
1212
const { source } = ctx;
1313
let pos = start + 1; // '{'
14-
pos += whitespaces(source, pos);
14+
pos = whitespaces(source, pos).end;
1515

1616
const arg =
1717
source[pos] === '$'
@@ -20,10 +20,10 @@ export function parseExpression(
2020
if (arg) {
2121
pos = arg.end;
2222
const ws = whitespaces(source, pos);
23-
if (ws === 0 && source[pos] !== '}') {
23+
if (!ws.hasWS && source[pos] !== '}') {
2424
ctx.onError('missing-syntax', pos, ' ');
2525
}
26-
pos += ws;
26+
pos = ws.end;
2727
}
2828

2929
let functionRef: CST.FunctionRef | CST.Junk | undefined;
@@ -56,16 +56,16 @@ export function parseExpression(
5656
const attributes: CST.Attribute[] = [];
5757
let reqWS = Boolean(functionRef || markup);
5858
let ws = whitespaces(source, pos);
59-
while (source[pos + ws] === '@') {
60-
if (reqWS && ws === 0) ctx.onError('missing-syntax', pos, ' ');
61-
pos += ws;
59+
while (source[ws.end] === '@') {
60+
if (reqWS && !ws.hasWS) ctx.onError('missing-syntax', pos, ' ');
61+
pos = ws.end;
6262
const attr = parseAttribute(ctx, pos);
6363
attributes.push(attr);
6464
pos = attr.end;
6565
reqWS = true;
6666
ws = whitespaces(source, pos);
6767
}
68-
pos += ws;
68+
pos = ws.end;
6969

7070
const open: CST.Syntax<'{'> = { start, end: start + 1, value: '{' };
7171
let close: CST.Syntax<'}'> | undefined;
@@ -125,17 +125,17 @@ function parseFunctionRefOrMarkup(
125125
let close: CST.Syntax<'/'> | undefined;
126126
while (pos < source.length) {
127127
let ws = whitespaces(source, pos);
128-
const next = source[pos + ws];
128+
const next = source[ws.end];
129129
if (next === '@' || next === '}') break;
130130
if (next === '/' && source[start] === '#') {
131-
pos += ws + 1;
131+
pos = ws.end + 1;
132132
close = { start: pos - 1, end: pos, value: '/' };
133133
ws = whitespaces(source, pos);
134-
if (ws > 0) ctx.onError('extra-content', pos, pos + ws);
134+
if (ws.hasWS) ctx.onError('extra-content', pos, ws.end);
135135
break;
136136
}
137-
if (ws === 0) ctx.onError('missing-syntax', pos, ' ');
138-
pos += ws;
137+
if (!ws.hasWS) ctx.onError('missing-syntax', pos, ' ');
138+
pos = ws.end;
139139
const opt = parseOption(ctx, pos);
140140
if (opt.end === pos) break; // error
141141
options.push(opt);
@@ -152,16 +152,15 @@ function parseFunctionRefOrMarkup(
152152

153153
function parseOption(ctx: ParseContext, start: number): CST.Option {
154154
const id = parseIdentifier(ctx, start);
155-
let pos = id.end;
156-
pos += whitespaces(ctx.source, pos);
155+
let pos = whitespaces(ctx.source, id.end).end;
157156
let equals: CST.Syntax<'='> | undefined;
158157
if (ctx.source[pos] === '=') {
159158
equals = { start: pos, end: pos + 1, value: '=' };
160159
pos += 1;
161160
} else {
162161
ctx.onError('missing-syntax', pos, '=');
163162
}
164-
pos += whitespaces(ctx.source, pos);
163+
pos = whitespaces(ctx.source, pos).end;
165164
const value =
166165
ctx.source[pos] === '$'
167166
? parseVariable(ctx, pos)
@@ -174,23 +173,22 @@ function parseIdentifier(
174173
start: number
175174
): { parts: CST.Identifier; end: number } {
176175
const { source } = ctx;
177-
const str0 = parseNameValue(source, start);
178-
if (!str0) {
176+
const name0 = parseNameValue(source, start);
177+
if (!name0) {
179178
ctx.onError('empty-token', start, start + 1);
180179
return { parts: [{ start, end: start, value: '' }], end: start };
181180
}
182-
let pos = start + str0.length;
183-
const id0 = { start, end: pos, value: str0 };
181+
let pos = name0.end;
182+
const id0 = { start, end: pos, value: name0.value };
184183
if (source[pos] !== ':') return { parts: [id0], end: pos };
185184

186185
const sep = { start: pos, end: pos + 1, value: ':' as const };
187186
pos += 1;
188187

189-
const str1 = parseNameValue(source, pos);
190-
if (str1) {
191-
const end = pos + str1.length;
192-
const id1 = { start: pos, end, value: str1 };
193-
return { parts: [id0, sep, id1], end };
188+
const name1 = parseNameValue(source, pos);
189+
if (name1) {
190+
const id1 = { start: pos, end: name1.end, value: name1.value };
191+
return { parts: [id0, sep, id1], end: name1.end };
194192
} else {
195193
ctx.onError('empty-token', pos, pos + 1);
196194
return { parts: [id0, sep], end: pos };
@@ -204,10 +202,10 @@ function parseAttribute(ctx: ParseContext, start: number): CST.Attribute {
204202
const ws = whitespaces(source, pos);
205203
let equals: CST.Syntax<'='> | undefined;
206204
let value: CST.Literal | undefined;
207-
if (source[pos + ws] === '=') {
208-
pos += ws + 1;
205+
if (source[ws.end] === '=') {
206+
pos = ws.end + 1;
209207
equals = { start: pos - 1, end: pos, value: '=' };
210-
pos += whitespaces(source, pos);
208+
pos = whitespaces(source, pos).end;
211209
value = parseLiteral(ctx, pos, true);
212210
pos = value.end;
213211
}

packages/mf2-messageformat/src/cst/names.ts

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
const bidiChars = new Set([
2+
0x061c, // ALM
3+
0x200e, // LRM
4+
0x200f, // RLM
5+
0x2066, // LRI
6+
0x2067, // RLI
7+
0x2068, // FSI
8+
0x2069 // PDI
9+
]);
10+
111
const isNameStartCode = (cc: number) =>
212
(cc >= 0x41 && cc <= 0x5a) || // A-Z
313
cc === 0x5f || // _
@@ -28,11 +38,24 @@ const isNameCharCode = (cc: number) =>
2838
// This is sticky so that parsing doesn't need to substring the source
2939
const numberLiteral = /-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][-+]?\d+)?/y;
3040

31-
export function parseNameValue(src: string, start: number): string {
32-
if (!isNameStartCode(src.charCodeAt(start))) return '';
33-
let pos = start + 1;
34-
while (isNameCharCode(src.charCodeAt(pos))) pos += 1;
35-
return src.substring(start, pos);
41+
export function parseNameValue(
42+
src: string,
43+
start: number
44+
): { value: string; end: number } | null {
45+
let pos = start;
46+
let nameStart = start;
47+
let cc = src.charCodeAt(start);
48+
if (bidiChars.has(cc)) {
49+
pos += 1;
50+
nameStart += 1;
51+
cc = src.charCodeAt(pos);
52+
}
53+
if (!isNameStartCode(cc)) return null;
54+
cc = src.charCodeAt(++pos);
55+
while (isNameCharCode(cc)) cc = src.charCodeAt(++pos);
56+
const name = src.substring(nameStart, pos);
57+
if (bidiChars.has(cc)) pos += 1;
58+
return { value: name, end: pos };
3659
}
3760

3861
export function isValidUnquotedLiteral(str: string): boolean {

packages/mf2-messageformat/src/cst/parse-cst.ts

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ export function parseCST(
5252
): CST.Message {
5353
const ctx = new ParseContext(source, opt);
5454

55-
const pos = whitespaces(source, 0);
55+
const pos = whitespaces(source, 0).end;
5656
if (source.startsWith('.', pos)) {
5757
const { declarations, end } = parseDeclarations(ctx, pos);
5858
return source.startsWith('.match', end)
@@ -72,8 +72,7 @@ function parsePatternMessage(
7272
complex: boolean
7373
): CST.SimpleMessage | CST.ComplexMessage {
7474
const pattern = parsePattern(ctx, start, complex);
75-
let pos = pattern.end;
76-
pos += whitespaces(ctx.source, pos);
75+
const pos = whitespaces(ctx.source, pattern.end).end;
7776

7877
if (pos < ctx.source.length) {
7978
ctx.onError('extra-content', pos, ctx.source.length);
@@ -92,17 +91,17 @@ function parseSelectMessage(
9291
let pos = start + 6; // '.match'
9392
const match: CST.Syntax<'.match'> = { start, end: pos, value: '.match' };
9493
let ws = whitespaces(ctx.source, pos);
95-
if (ws === 0) ctx.onError('missing-syntax', pos, "' '");
96-
pos += ws;
94+
if (!ws.hasWS) ctx.onError('missing-syntax', pos, "' '");
95+
pos = ws.end;
9796

9897
const selectors: CST.VariableRef[] = [];
9998
while (ctx.source[pos] === '$') {
10099
const sel = parseVariable(ctx, pos);
101100
selectors.push(sel);
102101
pos = sel.end;
103102
ws = whitespaces(ctx.source, pos);
104-
if (ws === 0) ctx.onError('missing-syntax', pos, "' '");
105-
pos += ws;
103+
if (!ws.hasWS) ctx.onError('missing-syntax', pos, "' '");
104+
pos = ws.end;
106105
}
107106
if (selectors.length === 0) ctx.onError('empty-token', pos, pos + 1);
108107

@@ -115,7 +114,7 @@ function parseSelectMessage(
115114
} else {
116115
pos += 1;
117116
}
118-
pos += whitespaces(ctx.source, pos);
117+
pos = whitespaces(ctx.source, pos).end;
119118
}
120119

121120
if (pos < ctx.source.length) {
@@ -137,11 +136,11 @@ function parseVariant(ctx: ParseContext, start: number): CST.Variant {
137136
const keys: Array<CST.Literal | CST.CatchallKey> = [];
138137
while (pos < ctx.source.length) {
139138
const ws = whitespaces(ctx.source, pos);
140-
pos += ws;
139+
pos = ws.end;
141140
const ch = ctx.source[pos];
142141
if (ch === '{') break;
143142

144-
if (pos > start && ws === 0) ctx.onError('missing-syntax', pos, "' '");
143+
if (pos > start && !ws.hasWS) ctx.onError('missing-syntax', pos, "' '");
145144

146145
const key =
147146
ch === '*'
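
(file path header not captured — a separate file's diff begins here; presumably packages/mf2-messageformat/src/cst/util.ts, to be confirmed against the commit)

Lines changed: 14 additions & 7 deletions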
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
1-
const whitespaceChars = new Set(['\t', '\n', '\r', ' ', '\u3000']);
1+
const bidiChars = new Set('\u061C\u200E\u200F\u2066\u2067\u2068\u2069');
2+
const whitespaceChars = new Set('\t\n\r \u3000');
23

3-
export function whitespaces(src: string, start: number): number {
4-
let length = 0;
5-
let ch = src[start];
4+
export function whitespaces(
5+
src: string,
6+
start: number
7+
): { hasWS: boolean; end: number } {
8+
let hasWS = false;
9+
let pos = start;
10+
let ch = src[pos];
11+
while (bidiChars.has(ch)) ch = src[++pos];
612
while (whitespaceChars.has(ch)) {
7-
length += 1;
8-
ch = src[start + length];
13+
hasWS = true;
14+
ch = src[++pos];
915
}
10-
return length;
16+
while (bidiChars.has(ch) || whitespaceChars.has(ch)) ch = src[++pos];
17+
return { hasWS, end: pos };
1118
}

packages/mf2-messageformat/src/cst/values.ts

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,11 @@ export function parseVariable(
132132
const pos = start + 1;
133133
const open = { start, end: pos, value: '$' as const };
134134
const name = parseNameValue(ctx.source, pos);
135-
const end = pos + name.length;
136-
if (!name) ctx.onError('empty-token', pos, pos + 1);
137-
return { type: 'variable', start, end, open, name };
135+
if (!name) {
136+
ctx.onError('empty-token', pos, pos + 1);
137+
return { type: 'variable', start, end: pos, open, name: '' };
138+
}
139+
return { type: 'variable', start, end: name.end, open, name: name.value };
138140
}
139141

140142
function parseEscape(

0 commit comments

Comments
 (0)