Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions packages/langium/src/parser/lexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain';
import type { LangiumCoreServices } from '../services.js';
import { IndentationAwareTokenBuilder } from './token-builder.js';
import { Lexer as ChevrotainLexer } from 'chevrotain';

export interface LexerResult {
Expand Down Expand Up @@ -66,6 +67,45 @@ export class DefaultLexer implements Lexer {
}
}

/**
 * Lexer companion for the {@link IndentationAwareTokenBuilder}.
 *
 * The token builder keeps an indentation stack while matching tokens. This lexer
 * drains that stack after every call to {@link tokenize}: the dedent tokens that are
 * still pending at the end of the input are appended to the result and the builder
 * starts the next input with a fresh stack.
 *
 * To use it, register both services in your language module:
 * ```ts
 * parser: {
 *     TokenBuilder: () => new IndentationAwareTokenBuilder(),
 *     Lexer: (services) => new IndentationAwareLexer(services),
 * }
 * ```
 */
export class IndentationAwareLexer extends DefaultLexer {
    /** The registered token builder, or `undefined` if it is not indentation-aware. */
    private indentationTokenBuilder?: IndentationAwareTokenBuilder;

    /**
     * @param services Core services; the registered token builder is only retained
     * when it is an {@link IndentationAwareTokenBuilder}.
     */
    constructor(services: LangiumCoreServices) {
        super(services);
        const tokenBuilder = services.parser.TokenBuilder;
        if (tokenBuilder instanceof IndentationAwareTokenBuilder) {
            this.indentationTokenBuilder = tokenBuilder;
        }
    }

    /**
     * Tokenizes `text` with the default lexer and appends the dedent tokens that are
     * still open at the end of the input.
     *
     * @param text The full text to tokenize.
     * @returns The lexer result; unchanged from the default lexer when no
     * indentation-aware token builder is registered.
     */
    override tokenize(text: string): LexerResult {
        const lexed = super.tokenize(text);
        const builder = this.indentationTokenBuilder;

        if (builder) {
            // Draining the builder also resets its indentation stack for the next input.
            for (const dedent of builder.popRemainingDedents(text)) {
                lexed.tokens.push(dedent);
            }
        }
        // Otherwise a different token builder is in use and there is nothing to flush.

        return lexed;
    }
}

/**
* Returns a check whether the given TokenVocabulary is TokenType array
*/
Expand Down
263 changes: 261 additions & 2 deletions packages/langium/src/parser/token-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
* terms of the MIT License, which is available in the project root.
******************************************************************************/

import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain';
import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary, IToken } from 'chevrotain';
import type { AbstractRule, Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js';
import type { Stream } from '../utils/stream.js';
import { Lexer } from 'chevrotain';
import { createToken, createTokenInstance, Lexer } from 'chevrotain';
import { isKeyword, isParserRule, isTerminalRule } from '../languages/generated/ast.js';
import { streamAllContents } from '../utils/ast-utils.js';
import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js';
import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js';
import { stream } from '../utils/stream.js';
import { isTokenTypeArray } from './lexer.js';

export interface TokenBuilderOptions {
caseInsensitive?: boolean
Expand Down Expand Up @@ -118,3 +119,261 @@ export class DefaultTokenBuilder implements TokenBuilder {
}, []);
}
}

export interface IndentationTokenBuilderOptions {
/**
* The name of the terminal rule used to denote an increase of indentation in the grammar.
* The grammar must declare a terminal with this name; its pattern is only a placeholder,
* because the token builder substitutes its own custom matcher for it.
* A possible definition in the grammar could look like this:
* ```langium
* terminal INDENT: ':synthetic-indent:';
* ```
*
* @default 'INDENT'
*/
indentTokenName: string;
/**
* The name of the terminal rule used to denote a decrease of indentation (dedentation) in the grammar.
* The grammar must declare a terminal with this name; its pattern is only a placeholder,
* because the token builder substitutes its own custom matcher for it.
* A possible definition in the grammar could look like this:
* ```langium
* terminal DEDENT: ':synthetic-dedent:';
* ```
*
* @default 'DEDENT'
*/
dedentTokenName: string;
/**
* The name of the terminal rule used to denote whitespace other than indentation and newlines in the grammar.
* The grammar must declare a terminal with this name; the token builder rebuilds it as a skipped
* token that matches the builder's own whitespace pattern.
* A possible definition in the grammar could look like this:
* ```langium
* hidden terminal WS: /[ \t]+/;
* ```
*
* @default 'WS'
*/
whitespaceTokenName: string;
}

/**
* Default terminal names used by the indentation-aware token builder.
* Options passed to the builder's constructor are merged over these values.
*
* NOTE(review): the identifier is misspelled ("indetation"); it is kept as is because
* the token builder class references it by this name.
*/
const indetationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
indentTokenName: 'INDENT',
dedentTokenName: 'DEDENT',
whitespaceTokenName: 'WS',
};

/**
 * A token builder that is sensitive to indentation in the input text.
 * It generates synthetic tokens for indentation and dedentation whenever the amount of
 * leading whitespace of a line differs from the current indentation level.
 *
 * The builder is stateful: it keeps a stack of indentation widths while tokens are matched.
 * Call {@link popRemainingDedents} after each tokenized input to close the indentation
 * levels that are still open and to reset that state.
 *
 * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
 */
export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
    /**
     * Widths (in characters) of the indentation levels that are currently open.
     * The bottom entry is always `0`, i.e. "no indentation".
     */
    private indentationStack: number[] = [0];

    /** The effective options: the defaults merged with the constructor argument. */
    private options: IndentationTokenBuilderOptions;

    /**
     * The token type to be used for indentation tokens
     */
    protected indentTokenType: TokenType;

    /**
     * The token type to be used for dedentation tokens
     */
    protected dedentTokenType: TokenType;

    /**
     * A regular expression to match a series of tabs and/or spaces.
     * Override this to customize what the indentation is allowed to consist of.
     */
    protected whitespaceRegExp = /[ \t]+/y;

    /**
     * @param options Names of the indent, dedent and whitespace terminals in the grammar.
     * Omitted names fall back to `'INDENT'`, `'DEDENT'` and `'WS'`.
     */
    constructor(options: Partial<IndentationTokenBuilderOptions> = indetationBuilderDefaultOptions) {
        super();
        this.options = {
            ...indetationBuilderDefaultOptions,
            ...options,
        };

        this.indentTokenType = createToken({
            name: this.options.indentTokenName,
            pattern: this.indentMatcher,
            line_breaks: false,
        });

        this.dedentTokenType = createToken({
            name: this.options.dedentTokenName,
            pattern: this.dedentMatcher,
            line_breaks: false,
        });
    }

    /**
     * Builds the token types of the grammar and moves the dedent, indent and whitespace
     * tokens (in that order) to the front of the list.
     *
     * @param grammar The grammar to build the tokens for.
     * @param options Options forwarded to the default token builder.
     * @returns The reordered token types.
     * @throws Error if the default builder does not return a plain token type array, or if
     * the grammar lacks one of the configured indent, dedent or whitespace terminals.
     */
    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined): TokenType[] {
        const tokenTypes = super.buildTokens(grammar, options);
        if (!isTokenTypeArray(tokenTypes)) {
            throw new Error('Invalid tokens built by default builder');
        }

        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;

        // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
        // Order should be: dedent, indent, spaces
        const dedent = tokenTypes.find(tok => tok.name === dedentTokenName);
        const indent = tokenTypes.find(tok => tok.name === indentTokenName);
        const ws = tokenTypes.find(tok => tok.name === whitespaceTokenName);
        if (!dedent || !indent || !ws) {
            throw new Error('Some indentation/whitespace tokens not found!');
        }

        const spaceTokens = [dedent, indent, ws];
        const otherTokens = tokenTypes.filter(tok => !spaceTokens.includes(tok));
        return [...spaceTokens, ...otherTokens];
    }

    /**
     * Checks whether `offset` is the very beginning of the text or directly follows
     * a carriage return or line feed character.
     */
    private isStartOfLine(text: string, offset: number): boolean {
        return offset === 0 || '\r\n'.includes(text[offset - 1]);
    }

    /**
     * Matches the whitespace that starts exactly at `offset`.
     *
     * @returns The width of the matched whitespace (`0` if there is none), the width of the
     * innermost open indentation level, and the raw match (`null` if nothing matched).
     */
    private matchWhitespace(text: string, offset: number) {
        // The expression is sticky, so it only matches directly at `lastIndex`.
        this.whitespaceRegExp.lastIndex = offset;
        const match = this.whitespaceRegExp.exec(text);
        return {
            currIndentLevel: match?.[0].length ?? 0,
            // The stack always holds at least the base level 0.
            prevIndentLevel: this.indentationStack.at(-1) ?? 0,
            match,
        };
    }

    /**
     * Creates a synthetic indent/dedent token that starts at the beginning of a line.
     *
     * @param tokenType The indent or dedent token type.
     * @param text The full input string, used to compute the line number.
     * @param image The text covered by the token; may be empty for zero-width tokens.
     * @param offset The offset at which the token starts.
     */
    private createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
        const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
        return createTokenInstance(
            tokenType,
            image,
            offset, offset + image.length,
            lineNumber, lineNumber,
            // Chevrotain columns are 1-based and the end column is inclusive,
            // so a token at the start of a line spans columns 1..image.length.
            1, image.length,
        );
    }

    /**
     * A custom pattern for matching indents
     *
     * The matcher never consumes input itself: on a deeper indentation it records the new
     * level, appends an indent token to `tokens` and returns `null`, so that the whitespace
     * is subsequently consumed by the (skipped) whitespace token.
     *
     * @param text The full input string.
     * @param offset The offset at which to attempt a match
     * @param tokens Previously scanned Tokens
     * @param groups Token Groups
     */
    protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
        if (!this.isStartOfLine(text, offset)) {
            return null;
        }

        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);

        if (currIndentLevel <= prevIndentLevel) {
            // shallower indentation (should be matched by dedent)
            // or same indentation level (should be matched by whitespace and ignored)
            return null;
        }

        this.indentationStack.push(currIndentLevel);

        const indentToken = this.createIndentationTokenInstance(
            this.indentTokenType,
            text,
            // A deeper level implies that whitespace was matched; the fallback is purely defensive.
            match?.[0] ?? '',
            offset,
        );
        tokens.push(indentToken);

        // Token already added, let the indentation now be consumed as whitespace and ignored
        return null;
    };

    /**
     * A custom pattern for matching dedents
     *
     * The matcher never consumes input itself: on a shallower indentation it appends one
     * dedent token per closed indentation level to `tokens` and returns `null`.
     * A dedent to a width that was never opened is logged and otherwise ignored.
     *
     * @param text The full input string.
     * @param offset The offset at which to attempt a match
     * @param tokens Previously scanned Tokens
     * @param groups Token Groups
     */
    protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
        if (!this.isStartOfLine(text, offset)) {
            return null;
        }

        const {currIndentLevel, prevIndentLevel, match} = this.matchWhitespace(text, offset);

        if (currIndentLevel >= prevIndentLevel) {
            // bigger indentation (should be matched by indent)
            // or same indentation level (should be matched by whitespace and ignored)
            return null;
        }

        const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel);

        // Any dedent must match some previous indentation level.
        if (matchIndentIndex === -1) {
            console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indentation stack: ${this.indentationStack}`);
            // throwing an error would crash the language server
            // TODO: find a way to report error diagnostics message
            return null;
        }

        const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1;

        // When dedenting back to column 0 no whitespace is matched. The token is then
        // zero-width instead of borrowing the token name as its image, which would make
        // it claim a range of unrelated text.
        const image = match?.[0] ?? '';

        for (let i = 0; i < numberOfDedents; i++) {
            const token = this.createIndentationTokenInstance(
                this.dedentTokenType,
                text,
                image,
                offset,
            );
            tokens.push(token);
            this.indentationStack.pop();
        }

        // Token already added, let the dedentation now be consumed as whitespace and ignored
        return null;
    };

    /**
     * Substitutes the token types of the configured indent, dedent and whitespace terminals:
     * indent and dedent use the custom matchers of this builder, whitespace becomes a skipped
     * token based on {@link whitespaceRegExp}. All other terminals are built by the default builder.
     */
    protected override buildTerminalToken(terminal: TerminalRule): TokenType {
        const tokenType = super.buildTerminalToken(terminal);
        const {indentTokenName, dedentTokenName, whitespaceTokenName} = this.options;

        if (tokenType.name === indentTokenName) {
            return this.indentTokenType;
        } else if (tokenType.name === dedentTokenName) {
            return this.dedentTokenType;
        } else if (tokenType.name === whitespaceTokenName) {
            return createToken({
                name: whitespaceTokenName,
                pattern: this.whitespaceRegExp,
                group: Lexer.SKIPPED,
            });
        }

        return tokenType;
    }

    /**
     * Resets the indentation stack between different runs of the lexer
     *
     * @param text Full text that was tokenized
     * @returns Remaining dedent tokens to match all previous indents at the end of the file.
     * The tokens are zero-width and positioned at the end of `text`.
     */
    public popRemainingDedents(text: string): IToken[] {
        const remainingDedents: IToken[] = [];
        while (this.indentationStack.length > 1) {
            remainingDedents.push(
                // Zero-width image: there is no text left that the token could cover.
                this.createIndentationTokenInstance(this.dedentTokenType, text, '', text.length)
            );
            this.indentationStack.pop();
        }

        this.indentationStack = [0];
        return remainingDedents;
    }
}