Skip to content

Commit fea157c

Browse files
authored
Improve case insensitive Regex generation (#1878)
1 parent 68c0fa3 commit fea157c

File tree

5 files changed

+16
-26
lines changed

5 files changed

+16
-26
lines changed

examples/arithmetics/syntaxes/arithmetics.tmLanguage.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
},
1111
{
1212
"name": "keyword.control.arithmetics",
13-
"match": "\\b([dD][eE][fF]|[mM][oO][dD][uU][lL][eE])\\b"
13+
"match": "(?i)\\b(def|module)\\b"
1414
}
1515
],
1616
"repository": {

packages/langium-cli/src/generator/highlighting/textmate-generator.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,14 +128,14 @@ function getRepository(grammar: Grammar, config: LangiumLanguageConfig): Reposit
128128
function getControlKeywords(grammar: Grammar, pack: LangiumLanguageConfig): Pattern {
129129
const regex = /[A-Za-z]/;
130130
const controlKeywords = collectKeywords(grammar).filter(kw => regex.test(kw));
131-
const groups = groupKeywords(controlKeywords, pack.caseInsensitive);
131+
const groups = groupKeywords(controlKeywords);
132132
return {
133133
'name': `keyword.control.${pack.id}`,
134-
'match': groups.join('|')
134+
'match': `${pack.caseInsensitive ? '(?i)' : ''}${groups.join('|')}`
135135
};
136136
}
137137

138-
function groupKeywords(keywords: string[], caseInsensitive: boolean | undefined): string[] {
138+
function groupKeywords(keywords: string[]): string[] {
139139
const groups: {
140140
letter: string[],
141141
leftSpecial: string[],
@@ -144,7 +144,7 @@ function groupKeywords(keywords: string[], caseInsensitive: boolean | undefined)
144144
} = { letter: [], leftSpecial: [], rightSpecial: [], special: [] };
145145

146146
keywords.forEach(keyword => {
147-
const keywordPattern = caseInsensitive ? RegExpUtils.getCaseInsensitivePattern(keyword) : RegExpUtils.escapeRegExp(keyword);
147+
const keywordPattern = RegExpUtils.escapeRegExp(keyword);
148148
if (/\w/.test(keyword[0])) {
149149
if (/\w/.test(keyword[keyword.length - 1])) {
150150
groups.letter.push(keywordPattern);

packages/langium/src/parser/token-builder.ts

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import { Lexer } from 'chevrotain';
1111
import { isKeyword, isParserRule, isTerminalRule } from '../languages/generated/ast.js';
1212
import { streamAllContents } from '../utils/ast-utils.js';
1313
import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js';
14-
import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js';
14+
import { escapeRegExp, isWhitespace, partialMatches } from '../utils/regexp-utils.js';
1515
import { stream } from '../utils/stream.js';
1616

1717
export interface TokenBuilderOptions {
@@ -53,14 +53,10 @@ export class DefaultTokenBuilder implements TokenBuilder {
5353
const terminalTokens: TokenType[] = this.buildTerminalTokens(reachableRules);
5454
const tokens: TokenType[] = this.buildKeywordTokens(reachableRules, terminalTokens, options);
5555

56-
terminalTokens.forEach(terminalToken => {
57-
const pattern = terminalToken.PATTERN;
58-
if (typeof pattern === 'object' && pattern && 'test' in pattern && isWhitespace(pattern)) {
59-
tokens.unshift(terminalToken);
60-
} else {
61-
tokens.push(terminalToken);
62-
}
63-
});
56+
// Add all terminal tokens to the end in the order they were defined
57+
// Chevrotain documentation recommends adding whitespace-like tokens at the start
58+
// However, assuming the lexer is able to optimize the tokens, it should not matter
59+
tokens.push(...terminalTokens);
6460
// We don't need to add the EOF token explicitly.
6561
// It is automatically available at the end of the token stream.
6662
return tokens;
@@ -148,7 +144,7 @@ export class DefaultTokenBuilder implements TokenBuilder {
148144

149145
protected buildKeywordPattern(keyword: Keyword, caseInsensitive: boolean): TokenPattern {
150146
return caseInsensitive ?
151-
new RegExp(getCaseInsensitivePattern(keyword.value)) :
147+
new RegExp(escapeRegExp(keyword.value), 'i') :
152148
keyword.value;
153149
}
154150

packages/langium/src/utils/regexp-utils.ts

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,12 +155,6 @@ export function escapeRegExp(value: string): string {
155155
return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
156156
}
157157

158-
export function getCaseInsensitivePattern(keyword: string): string {
159-
return Array.prototype.map.call(keyword, letter =>
160-
/\w/.test(letter) ? `[${letter.toLowerCase()}${letter.toUpperCase()}]` : escapeRegExp(letter)
161-
).join('');
162-
}
163-
164158
/**
165159
* Determines whether the given input has a partial match with the specified regex.
166160
* @param regex The regex to partially match against

packages/langium/test/parser/token-builder.test.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -140,23 +140,23 @@ describe('tokenBuilder#caseInsensitivePattern', () => {
140140
});
141141

142142
test('should create from keyword with special symbols', () => {
143-
expect(implementPattern).toEqual(/@[iI][mM][pP][lL][eE][mM][eE][nN][tT]/);
143+
expect(implementPattern).toEqual(/@implement/i);
144144
});
145145

146146
test('should create from keyword with special escape symbols', () => {
147-
expect(strangePattern).toEqual(/\\[sS][tT][rR][aA][nN][gG][eE]\\/);
147+
expect(strangePattern).toEqual(/\\strange\\/i);
148148
});
149149

150150
test('should create from mixed-case word', () => {
151-
expect(abcPattern).toEqual(/[aA][bB][cC]/);
151+
expect(abcPattern).toEqual(/AbC/i);
152152
});
153153

154154
test('should create from lower-case word', () => {
155-
expect(abPattern).toEqual(/[aA][bB]/);
155+
expect(abPattern).toEqual(/ab/i);
156156
});
157157

158158
test('should create from upper-case word', () => {
159-
expect(aPattern).toEqual(/[aA]/);
159+
expect(aPattern).toEqual(/A/i);
160160
});
161161

162162
test('should ignore terminals', () => {

0 commit comments

Comments
 (0)