Commit c1cd07f

remove whitespace tokens before parsing
1 parent e655ab9 commit c1cd07f

File tree

3 files changed: +64 −77 lines changed

src/parser.ts
src/whitespace.ts
test/flatten.test.ts


src/parser.ts

Lines changed: 2 additions & 40 deletions

@@ -1,6 +1,7 @@
 // Copyright (c) 2025 Marco Nikander
 
 import { Token, TokenBoolean, TokenNumber, TokenString, TokenIdentifier, is_token_boolean, is_token_number, is_token_string, is_token_identifier, is_token_open, is_token_close, is_token_whitespace } from "./lexer";
+import { remove_whitespace } from "./whitespace";
 
 export type Nested_Expression = Nested_Atom | Nested_Call | Nested_Lambda | Nested_Let | Nested_If;
 export type Nested_Atom = Nested_Identifier | Nested_Binding | Nested_Boolean | Nested_Number | Nested_String;
@@ -35,7 +36,7 @@ class Parser {
     constructor(tokens: readonly Token[]) {
         this.index = 0;
         this.node_count = 0;
-        this.tokens = tokens;
+        this.tokens = remove_whitespace(tokens);
     }
 
     peek(): Token {
@@ -76,8 +77,6 @@ class Parser {
             throw Error(`Parser::expr() is out-of-bounds (token ${this.index} of ${this.tokens.length})`);
         }
 
-        this.skip_whitespace();
-
        if (is_token_boolean(this.peek())) {
            this.consume();
            const id = this.emit();
@@ -100,7 +99,6 @@ class Parser {
        }
        else if (is_token_open(this.peek())) {
            this.consume();
-           this.skip_whitespace();
 
            const potential_keyword: Token = this.peek();
            if (is_token_identifier(potential_keyword) && potential_keyword.value === "lambda") {
@@ -130,17 +128,13 @@ class Parser {
        const id = this.emit();
        const potential_keyword: Token = this.peek();
        this.consume();
-       this.expect_whitespace();
 
        if (!is_token_identifier(this.peek())) {
            throw new Error(`Expected an 'lambda' to be followed by an identifier but got a ${this.peek().lexeme} instead (token ${this.index} of ${this.tokens.length})`);
        }
        else {
            const variable: Nested_Binding = this.binding();
-           this.expect_whitespace();
-
            const body: Nested_Expression = this.expr();
-           this.skip_whitespace();
            this.expect_closing();
            return { id: id, token: potential_keyword.id-1, tag: "Nested_Lambda", binding: variable, body: body };
        }
@@ -152,18 +146,14 @@ class Parser {
        const id = this.emit();
        const potential_keyword: Token = this.peek();
        this.consume();
-       this.expect_whitespace();
 
        if (!is_token_identifier(this.peek())) {
            throw new Error(`Expected an 'let' to be followed by an identifier but got a ${this.peek().lexeme} instead (token ${this.index} of ${this.tokens.length})`);
        }
        else {
            const variable: Nested_Binding = this.binding();
-           this.expect_whitespace();
            const value: Nested_Expression = this.expr();
-           this.expect_whitespace();
            const body: Nested_Expression = this.expr();
-           this.skip_whitespace();
            this.expect_closing();
            return { id: id, token: potential_keyword.id-1, tag: "Nested_Let", binding: variable, value: value, body: body };
        }
@@ -176,13 +166,9 @@ class Parser {
        const potential_keyword: Token = this.peek();
        this.consume();
 
-       this.expect_whitespace();
        const condition: Nested_Expression = this.expr();
-       this.expect_whitespace();
        const then_branch: Nested_Expression = this.expr();
-       this.expect_whitespace();
        const else_branch: Nested_Expression = this.expr();
-       this.skip_whitespace();
        this.expect_closing();
        return { id: id, token: potential_keyword.id-1, tag: "Nested_If", condition: condition, then_branch: then_branch, else_branch: else_branch };
    }
@@ -193,9 +179,7 @@ class Parser {
        const id = this.emit();
 
        const fn: Nested_Expression = this.expr();
-       this.expect_whitespace();
        const arg: Nested_Expression = this.expr();
-       this.skip_whitespace();
        this.expect_closing();
        return { id: id, token: fn.token-1, tag: "Nested_Call", fn: fn, arg: arg };
    }
@@ -207,26 +191,6 @@ class Parser {
        return { id: id, token: token.id, tag: "Nested_Binding", name: token.value };
    }
 
-   skip_whitespace() {
-       while (!this.is_at_end() && is_token_whitespace(this.peek())) {
-           this.consume();
-       }
-   }
-
-   expect_whitespace() {
-       if (this.is_at_end()) {
-           throw Error(`Parser::expect_whitespace() is out-of-bounds (token ${this.index} of ${this.tokens.length})`);
-       }
-       else {
-           if (is_token_whitespace(this.peek())) {
-               this.consume();
-           }
-           else {
-               throw new Error(`Expected a whitespace and another expression, but got a '${this.peek().lexeme}' instead (token ${this.index} of ${this.tokens.length})`);
-           }
-       }
-   }
-
    expect_closing() {
        if (this.is_at_end()) {
            throw Error(`Parser::expect_closing() is out-of-bounds (token ${this.index} of ${this.tokens.length})`);
@@ -244,9 +208,7 @@ class Parser {
 
 export function parse(tokens: readonly Token[]) : { ast: Nested_Expression, node_count: number } {
     let parser = new Parser(tokens);
-    parser.skip_whitespace();
     const expression: Nested_Expression = parser.expr();
-    parser.skip_whitespace();
 
     if (parser.is_at_end()) {
         return { ast: expression, node_count: parser.node_count };
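
A minimal usage sketch of the new behaviour (hypothetical caller code, not part of this commit; it assumes the lex and parse signatures shown in this diff and in the tests below):

    import { lex } from "./lexer";
    import { parse } from "./parser";

    // The Parser now strips whitespace tokens in its constructor, so callers
    // pass the raw token stream straight through; the skip_whitespace() and
    // expect_whitespace() calls around expr() are no longer needed.
    const { ast, node_count } = parse(lex("((+ 1) 2)"));
    console.log(node_count); // 5, matching the updated arithmetic test below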

src/whitespace.ts

Lines changed: 17 additions & 0 deletions

@@ -50,6 +50,23 @@ export function check_whitespace(tokens: readonly Token[]): boolean {
     return true;
 }
 
+export function remove_whitespace(tokens: readonly Token[]): Token[] {
+    let output: Token[] = [];
+
+    let new_i: number = 0;
+    for (let i = 0; i < tokens.length; i++) {
+        const tk = tokens[i];
+        if (!is_token_whitespace(tk)) {
+            let temp = tk;
+            temp.id = new_i;
+            output.push(temp);
+            new_i++;
+        }
+    }
+
+    return output;
+}
+
 function make_whitespace(token_number: number, character_offset: number): TokenWhitespace {
     return { tag: 'Token', lexeme: 'WHITESPACE', id: token_number, offset: character_offset, value: ' ' };
 }
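
One detail worth noting: the assignment "let temp = tk" copies the object reference, not the token, so "temp.id = new_i" renumbers the id of the caller's token objects even though the parameter is typed readonly Token[] (readonly only guards the array, not its elements). If that in-place renumbering is ever unwanted, a non-mutating variant could look like the following sketch (hypothetical code, not part of this commit; the name remove_whitespace_copy is made up):

    export function remove_whitespace_copy(tokens: readonly Token[]): Token[] {
        const output: Token[] = [];
        let new_i: number = 0;
        for (const tk of tokens) {
            if (!is_token_whitespace(tk)) {
                // shallow-copy the kept token and renumber the copy,
                // leaving the caller's token stream untouched
                output.push({ ...tk, id: new_i });
                new_i++;
            }
        }
        return output;
    }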

test/flatten.test.ts

Lines changed: 45 additions & 37 deletions

@@ -1,5 +1,5 @@
 import { describe, it, expect } from 'vitest'
-import { lex } from '../src/lexer'
+import { lex, Token } from '../src/lexer'
 import { parse } from '../src/parser';
 import { Flat_Expression, Flat_AST } from "../src/flat_ast";
 import { flatten } from '../src/flatten';
@@ -8,7 +8,8 @@ describe('convert atoms', () => {
 
     it('must parse "true" to a boolean', () => {
         const text: string = 'true';
-        const parsed = parse(lex(text));
+        const lexed: Token[] = lex(text);
+        const parsed = parse(lexed);
         const node_count: number = parsed.node_count;
         const flat_ast: Flat_AST = flatten(parsed.ast, parsed.node_count);
 
@@ -23,7 +24,8 @@ describe('convert atoms', () => {
 
     it('must parse "-0.1" to a number'), () => {
         const text: string = '-0.1';
-        const parsed = parse(lex(text));
+        const lexed: Token[] = lex(text);
+        const parsed = parse(lexed);
         const node_count: number = parsed.node_count;
         const flat_ast: Flat_AST = flatten(parsed.ast, parsed.node_count);
 
@@ -39,16 +41,17 @@ describe('convert atoms', () => {
 describe('expressions', () => {
     it('must produce a valid AST for arithmetic expressions', () => {
         const text: string = "((+ 1) 2)";
-        const parsed = parse(lex(text));
+        const lexed: Token[] = lex(text);
+        const parsed = parse(lexed);
         const node_count: number = parsed.node_count;
         const flat_ast: Flat_AST = flatten(parsed.ast, parsed.node_count);
 
         const expected: Flat_Expression[] = [
             {id: 0, token: 0, tag: 'Flat_Call', body: {id: 1}, arg: {id: 4}},
             {id: 1, token: 1, tag: 'Flat_Call', body: {id: 2}, arg: {id: 3}},
             {id: 2, token: 2, tag: 'Flat_Identifier', name: '+'},
-            {id: 3, token: 4, tag: 'Flat_Literal', value: 1},
-            {id: 4, token: 7, tag: 'Flat_Literal', value: 2},
+            {id: 3, token: 3, tag: 'Flat_Literal', value: 1},
+            {id: 4, token: 5, tag: 'Flat_Literal', value: 2},
         ];
 
         expect(node_count).toBe(5);
@@ -58,16 +61,17 @@ describe('expressions', () => {
 
     it('must produce a valid AST for a simple lambda expression', () => {
         const text: string = "((lambda x x) 42)";
-        const parsed = parse(lex(text));
+        const lexed: Token[] = lex(text);
+        const parsed = parse(lexed);
         const node_count: number = parsed.node_count;
         const flat_ast: Flat_AST = flatten(parsed.ast, parsed.node_count);
 
         const expected: Flat_Expression[] = [
             {id: 0, token: 0, tag: 'Flat_Call', body: {id: 1}, arg: {id: 4}},
             {id: 1, token: 1, tag: 'Flat_Lambda', binding: {id: 2}, body: {id: 3}},
-            {id: 2, token: 4, tag: 'Flat_Binding', name: 'x'},
-            {id: 3, token: 6, tag: 'Flat_Identifier', name: 'x'},
-            {id: 4, token: 9, tag: 'Flat_Literal', value: 42},
+            {id: 2, token: 3, tag: 'Flat_Binding', name: 'x'},
+            {id: 3, token: 4, tag: 'Flat_Identifier', name: 'x'},
+            {id: 4, token: 6, tag: 'Flat_Literal', value: 42},
         ];
 
         expect(node_count).toBe(5);
@@ -77,20 +81,21 @@ describe('expressions', () => {
 
     it('must produce a valid AST for lambda expressions', () => {
         const text: string = "(((lambda a (lambda b a)) 1) 2)";
-        const parsed = parse(lex(text));
+        const lexed: Token[] = lex(text);
+        const parsed = parse(lexed);
         const node_count: number = parsed.node_count;
         const flat_ast: Flat_AST = flatten(parsed.ast, parsed.node_count);
 
         const expected: Flat_Expression[] = [
             {id: 0, token: 0, tag: 'Flat_Call', body: {id: 1}, arg: {id: 8}},
             {id: 1, token: 1, tag: 'Flat_Call', body: {id: 2}, arg: {id: 7}},
             {id: 2, token: 2, tag: 'Flat_Lambda', binding: {id: 3}, body: {id: 4}},
-            {id: 3, token: 5, tag: 'Flat_Binding', name: 'a'},
-            {id: 4, token: 7, tag: 'Flat_Lambda', binding: {id: 5}, body: {id: 6}},
-            {id: 5, token: 10, tag: 'Flat_Binding', name: 'b'},
-            {id: 6, token: 12, tag: 'Flat_Identifier', name: 'a'},
-            {id: 7, token: 16, tag: 'Flat_Literal', value: 1},
-            {id: 8, token: 19, tag: 'Flat_Literal', value: 2}
+            {id: 3, token: 4, tag: 'Flat_Binding', name: 'a'},
+            {id: 4, token: 5, tag: 'Flat_Lambda', binding: {id: 5}, body: {id: 6}},
+            {id: 5, token: 7, tag: 'Flat_Binding', name: 'b'},
+            {id: 6, token: 8, tag: 'Flat_Identifier', name: 'a'},
+            {id: 7, token: 11, tag: 'Flat_Literal', value: 1},
+            {id: 8, token: 13, tag: 'Flat_Literal', value: 2}
         ];
 
         expect(node_count).toBe(9);
@@ -100,15 +105,16 @@ describe('expressions', () => {
 
     it('must produce a valid AST for let-bindings', () => {
         const text: string = "(let x 42 x)";
-        const parsed = parse(lex(text));
+        const lexed: Token[] = lex(text);
+        const parsed = parse(lexed);
         const node_count: number = parsed.node_count;
         const flat_ast: Flat_AST = flatten(parsed.ast, parsed.node_count);
 
         const expected: Flat_Expression[] = [
             {id: 0, token: 0, tag: 'Flat_Let', binding: {id: 1}, value: {id: 2}, body: {id: 3}},
-            {id: 1, token: 3, tag: 'Flat_Binding', name: 'x'},
-            {id: 2, token: 5, tag: 'Flat_Literal', value: 42},
-            {id: 3, token: 7, tag: 'Flat_Identifier', name: 'x'},
+            {id: 1, token: 2, tag: 'Flat_Binding', name: 'x'},
+            {id: 2, token: 3, tag: 'Flat_Literal', value: 42},
+            {id: 3, token: 4, tag: 'Flat_Identifier', name: 'x'},
         ];
 
         expect(node_count).toBe(4);
@@ -118,23 +124,24 @@ describe('expressions', () => {
 
     it('must produce a valid AST when let-binding to a function', () => {
         const text: string = "(let increment (lambda x ((+ 1) x)) (increment 41))";
-        const parsed = parse(lex(text));
+        const lexed: Token[] = lex(text);
+        const parsed = parse(lexed);
         const node_count: number = parsed.node_count;
         const flat_ast: Flat_AST = flatten(parsed.ast, parsed.node_count);
 
         const expected: Flat_Expression[] = [
             {id: 0, token: 0, tag: 'Flat_Let', binding: {id: 1}, value: {id: 2}, body: {id: 9}},
-            {id: 1, token: 3, tag: 'Flat_Binding', name: 'increment'},
-            {id: 2, token: 5, tag: 'Flat_Lambda', binding: {id: 3}, body: {id: 4}},
-            {id: 3, token: 8, tag: 'Flat_Binding', name: 'x'},
-            {id: 4, token: 10, tag: 'Flat_Call', body: {id: 5}, arg: {id: 8}}, // 8 is large
-            {id: 5, token: 11, tag: 'Flat_Call', body: {id: 6}, arg: {id: 7}},
-            {id: 6, token: 12, tag: 'Flat_Identifier', name: '+'},
-            {id: 7, token: 14, tag: 'Flat_Literal', value: 1},
-            {id: 8, token: 17, tag: 'Flat_Identifier', name: 'x'},
-            {id: 9, token: 21, tag: 'Flat_Call', body: {id: 10}, arg: {id: 11}},
-            {id: 10, token: 22, tag: 'Flat_Identifier', name: 'increment'},
-            {id: 11, token: 24, tag: 'Flat_Literal', value: 41},
+            {id: 1, token: 2, tag: 'Flat_Binding', name: 'increment'},
+            {id: 2, token: 3, tag: 'Flat_Lambda', binding: {id: 3}, body: {id: 4}},
+            {id: 3, token: 5, tag: 'Flat_Binding', name: 'x'},
+            {id: 4, token: 6, tag: 'Flat_Call', body: {id: 5}, arg: {id: 8}}, // 8 is large
+            {id: 5, token: 7, tag: 'Flat_Call', body: {id: 6}, arg: {id: 7}},
+            {id: 6, token: 8, tag: 'Flat_Identifier', name: '+'},
+            {id: 7, token: 9, tag: 'Flat_Literal', value: 1},
+            {id: 8, token: 11, tag: 'Flat_Identifier', name: 'x'},
+            {id: 9, token: 14, tag: 'Flat_Call', body: {id: 10}, arg: {id: 11}},
+            {id: 10, token: 15, tag: 'Flat_Identifier', name: 'increment'},
+            {id: 11, token: 16, tag: 'Flat_Literal', value: 41},
         ];
 
         expect(node_count).toBe(12);
@@ -144,15 +151,16 @@ describe('expressions', () => {
 
     it('must produce a valid AST for if-expressions', () => {
         const text: string = "(if true 42 0)";
-        const parsed = parse(lex(text));
+        const lexed: Token[] = lex(text);
+        const parsed = parse(lexed);
         const node_count: number = parsed.node_count;
         const flat_ast: Flat_AST = flatten(parsed.ast, parsed.node_count);
 
         const expected: Flat_Expression[] = [
             {id: 0, token: 0, tag: 'Flat_If', condition: {id: 1}, then_branch: {id: 2}, else_branch: {id: 3}},
-            {id: 1, token: 3, tag: 'Flat_Literal', value: true},
-            {id: 2, token: 5, tag: 'Flat_Literal', value: 42},
-            {id: 3, token: 7, tag: 'Flat_Literal', value: 0},
+            {id: 1, token: 2, tag: 'Flat_Literal', value: true},
+            {id: 2, token: 3, tag: 'Flat_Literal', value: 42},
+            {id: 3, token: 4, tag: 'Flat_Literal', value: 0},
        ];
 
        expect(node_count).toBe(4);
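
The smaller token indices in the updated expectations follow directly from the renumbering done by remove_whitespace. For "((+ 1) 2)", for instance, the old and new expected values imply the following token streams (an illustration, not part of the commit):

    // token ids as lexed:        (  (  +  ws  1  )  ws  2  )
    //                            0  1  2  3   4  5  6   7  8    ('1' at 4, '2' at 7)
    // after remove_whitespace:   (  (  +  1   )  2  )
    //                            0  1  2  3   4  5  6            ('1' at 3, '2' at 5)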
