Skip to content

Commit 448cc86

Browse files
committed
feat(ohm-js): Add support for XID_Start, XID_Continue, White_Space
Fixes #180, 9 years later
1 parent 1ab05d5 commit 448cc86

File tree

10 files changed

+80
-61
lines changed

10 files changed

+80
-61
lines changed

doc/syntax-reference.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,8 @@ as well as multiline (`/* */`) comments like:
176176

177177
<code id="applySyntactic">applySyntactic&lt;<i>ruleName</i>&gt;</code>: Allows the syntactic rule _ruleName_ to be applied in a lexical context, which is otherwise not allowed. Spaces are skipped _before_ and _after_ the rule application. _New in Ohm v16.1.0._
178178

179+
<code>unicodeChar&lt;<i>categoryOrProp</i>&gt;</code>: Matches a single Unicode code point that belongs to a given general category or has a given binary property. _categoryOrProp_ is a terminal that is either (a) a valid Unicode General_Category value (e.g. `"Zl"`), or (b) one of the following binary property names: `"XID_Start"`, `"XID_Continue"`, `"White_Space"`.
180+
179181
## Grammar Syntax
180182

181183
### Grammar Inheritance

packages/ohm-js/src/UnicodeCategories.js

Lines changed: 0 additions & 45 deletions
This file was deleted.

packages/ohm-js/src/pexprs-main.js

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import {UnicodeCategories} from './UnicodeCategories.js';
1+
import {UnicodeBinaryProperties, UnicodeCategories} from './unicode.js';
22
import * as common from './common.js';
33

44
// --------------------------------------------------------------------
@@ -180,9 +180,17 @@ export class Apply extends PExpr {
180180
// Unicode character
181181

182182
export class UnicodeChar extends PExpr {
183-
constructor(category) {
183+
constructor(categoryOrProp) {
184184
super();
185-
this.category = category;
186-
this.pattern = UnicodeCategories[category];
185+
this.categoryOrProp = categoryOrProp;
186+
if (categoryOrProp in UnicodeCategories) {
187+
this.pattern = UnicodeCategories[categoryOrProp];
188+
} else if (categoryOrProp in UnicodeBinaryProperties) {
189+
this.pattern = UnicodeBinaryProperties[categoryOrProp];
190+
} else {
191+
throw new Error(
192+
`Invalid Unicode category or property name: ${JSON.stringify(categoryOrProp)}`
193+
);
194+
}
187195
}
188196
}

packages/ohm-js/src/pexprs-outputRecipe.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,5 +92,5 @@ pexprs.Apply.prototype.outputRecipe = function(formals, grammarInterval) {
9292
};
9393

9494
pexprs.UnicodeChar.prototype.outputRecipe = function(formals, grammarInterval) {
95-
return ['unicodeChar', getMetaInfo(this, grammarInterval), this.category];
95+
return ['unicodeChar', getMetaInfo(this, grammarInterval), this.categoryOrProp];
9696
};

packages/ohm-js/src/pexprs-toDisplayString.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,5 @@ pexprs.Apply.prototype.toDisplayString = function() {
3838
};
3939

4040
pexprs.UnicodeChar.prototype.toDisplayString = function() {
41-
return 'Unicode [' + this.category + '] character';
41+
return 'Unicode [' + this.categoryOrProp + '] character';
4242
};

packages/ohm-js/src/pexprs-toFailure.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ pexprs.Apply.prototype.toFailure = function(grammar) {
4545
};
4646

4747
pexprs.UnicodeChar.prototype.toFailure = function(grammar) {
48-
return new Failure(this, 'a Unicode [' + this.category + '] character', 'description');
48+
return new Failure(this, 'a Unicode [' + this.categoryOrProp + '] character', 'description');
4949
};
5050

5151
pexprs.Alt.prototype.toFailure = function(grammar) {

packages/ohm-js/src/pexprs-toString.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,5 +72,5 @@ pexprs.Apply.prototype.toString = function() {
7272
};
7373

7474
pexprs.UnicodeChar.prototype.toString = function() {
75-
return '\\p{' + this.category + '}';
75+
return '\\p{' + this.categoryOrProp + '}';
7676
};

packages/ohm-js/src/unicode.js

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// The full list of categories from:
2+
// https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
3+
4+
const toRegExp = val => new RegExp(String.raw`\p{${val}}`, 'u');
5+
6+
/*
7+
grep -v '^#' DerivedGeneralCategory.txt \
8+
| cut -d';' -f2 \
9+
| awk 'NF{print $1}' \
10+
| sort -u \
11+
| awk '{printf "\x27%s\x27,\n",$1}'
12+
*/
13+
14+
export const UnicodeCategories = Object.fromEntries(
15+
[
16+
'Cc',
17+
'Cf',
18+
'Cn',
19+
'Co',
20+
'Cs',
21+
'Ll',
22+
'Lm',
23+
'Lo',
24+
'Lt',
25+
'Lu',
26+
'Mc',
27+
'Me',
28+
'Mn',
29+
'Nd',
30+
'Nl',
31+
'No',
32+
'Pc',
33+
'Pd',
34+
'Pe',
35+
'Pf',
36+
'Pi',
37+
'Po',
38+
'Ps',
39+
'Sc',
40+
'Sk',
41+
'Sm',
42+
'So',
43+
'Zl',
44+
'Zp',
45+
'Zs'
46+
].map(cat => [cat, toRegExp(cat)])
47+
);
48+
UnicodeCategories['Ltmo'] = /\p{Lt}|\p{Lm}|\p{Lo}/u;
49+
50+
// We only support a few Unicode binary properties for now, but could add more later.
51+
// See https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt
52+
export const UnicodeBinaryProperties = Object.fromEntries(
53+
['XID_Start', 'XID_Continue', 'White_Space'].map(prop => [prop, toRegExp(prop)])
54+
);

packages/wasm/src/index.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -915,7 +915,7 @@ export class Compiler {
915915
case pexprs.Terminal:
916916
return ir.terminal(exp.obj);
917917
case pexprs.UnicodeChar:
918-
return ir.unicodeChar(exp.category);
918+
return ir.unicodeChar(exp.categoryOrProp);
919919
default:
920920
throw new Error(`not handled: ${exp.constructor.name}`);
921921
}
@@ -1691,13 +1691,13 @@ export class Compiler {
16911691
const {asm} = this;
16921692

16931693
// TODO: Add support for more categories, by calling out to the host.
1694-
assert(['Ll', 'Lu', 'Ltmo'].includes(exp.category));
1694+
assert(['Ll', 'Lu', 'Ltmo'].includes(exp.categoryOrProp));
16951695

16961696
const makeLabels = () =>
16971697
asciiChars.map(c => {
16981698
const isLowercase = 'a' <= c && c <= 'z';
16991699
const isUppercase = 'A' <= c && c <= 'Z';
1700-
if ((exp.category === 'Lu' && isUppercase) || (exp.category === 'Ll' && isLowercase)) {
1700+
if ((exp.categoryOrProp === 'Lu' && isUppercase) || (exp.categoryOrProp === 'Ll' && isLowercase)) {
17011701
return w.labelidx(asm.depthOf('fastSuccess'));
17021702
}
17031703
return w.labelidx(asm.depthOf('failure'));
@@ -1723,7 +1723,7 @@ export class Compiler {
17231723

17241724
// Push the arg: a bitmap indicating the categories.
17251725
// prettier-ignore
1726-
switch (exp.category) {
1726+
switch (exp.categoryOrProp) {
17271727
case 'Lu': asm.i32Const(1 << 1); break;
17281728
case 'Ll': asm.i32Const(1 << 2); break;
17291729
case 'Ltmo': asm.i32Const((1 << 3) | (1 << 4) | (1 << 5)); break;

packages/wasm/src/ir.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,12 +165,12 @@ export const terminal = (value: string, caseInsensitive = false): Terminal => ({
165165

166166
export interface UnicodeChar {
167167
type: 'UnicodeChar';
168-
category: string;
168+
categoryOrProp: string;
169169
}
170170

171-
export const unicodeChar = (category: string): UnicodeChar => ({
171+
export const unicodeChar = (categoryOrProp: string): UnicodeChar => ({
172172
type: 'UnicodeChar',
173-
category
173+
categoryOrProp
174174
});
175175

176176
// Types that are specific to the IR
@@ -351,7 +351,7 @@ export function toString(exp: Expr): string {
351351
case 'Terminal':
352352
return JSON.stringify(exp.value);
353353
case 'UnicodeChar':
354-
return `$unicodeChar<${JSON.stringify(exp.category)}>`;
354+
return `$unicodeChar<${JSON.stringify(exp.categoryOrProp)}>`;
355355
case 'Dispatch':
356356
return `$dispatch`; // TODO: Improve this.
357357
case 'Lex':

0 commit comments

Comments
 (0)