Skip to content

Commit ed3db09

Browse files
committed
prototype an object-oriented recursive descent parser for ANF
1 parent adbe03f commit ed3db09

File tree

2 files changed

+545
-0
lines changed

2 files changed

+545
-0
lines changed

proto/oo_anf_parser.ts

Lines changed: 397 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,397 @@
1+
// Copyright (c) 2025 Marco Nikander
2+
3+
import { Item } from "../src/item";
4+
import { Lexeme, Token, TokenBoolean, TokenIdentifier, TokenNumber, TokenString, is_token } from "../src/lexer";
5+
import { remove_whitespace } from "../src/whitespace";
6+
7+
// ---------------------------------------------------------------------------
// AST node types produced by the parser.
// Every concrete node carries:
//   id  - optional node number,
//   tk  - index of the token at which the node starts,
//   tag - literal discriminator naming the node kind.
// ---------------------------------------------------------------------------

// Union types grouping the concrete node kinds.
export type _Literal = _Boolean | _Number | _String;
export type _Tail    = _Atomic | _Call | _Complex;
export type _Atomic  = _Literal | _Binding | _Reference | _Lambda | _Block;
export type _Complex = _IfThenElse;

// A block: a sequence of let-bindings followed by a tail expression.
export type _Block = {
    id?: number;
    tk: number;
    tag: '_Block';
    let_bindings: _LetBind[];
    tail: _Tail;
};

// A single let-binding: a binding together with the value bound to it.
export type _LetBind = {
    id?: number;
    tk: number;
    tag: '_LetBind';
    binding: _Binding;
    value: _Atomic | _Call;
};

// A lambda with one parameter binding and a block as its body.
export type _Lambda = {
    id?: number;
    tk: number;
    tag: '_Lambda';
    binding: _Binding;
    body: _Block;
};

// A call with one function expression and one argument expression.
export type _Call = {
    id?: number;
    tk: number;
    tag: '_Call';
    fn: _Atomic;
    arg: _Atomic;
};

// A conditional; both branches are blocks.
export type _IfThenElse = {
    id?: number;
    tk: number;
    tag: '_IfThenElse';
    condition: _Atomic | _Call;
    then_branch: _Block;
    else_branch: _Block;
};

// An identifier in binding position.
export type _Binding = {
    id?: number;
    tk: number;
    tag: '_Binding';
    name: string;
};

// An identifier in reference position.
export type _Reference = {
    id?: number;
    tk: number;
    tag: '_Reference';
    target: string;
};

// Literal nodes.
export type _Boolean = {
    id?: number;
    tk: number;
    tag: '_Boolean';
    value: boolean;
};

export type _Number = {
    id?: number;
    tk: number;
    tag: '_Number';
    value: number;
};

export type _String = {
    id?: number;
    tk: number;
    tag: '_String';
    value: string;
};
21+
22+
// ---------------------------------------------------------------------------
// Type guards for the AST node types. Each guard inspects only the `tag`
// of the item, either directly or through the other guards.
// ---------------------------------------------------------------------------

/** True if `expr` is a boolean, number, or string literal node. */
export function is_literal(expr: Item): expr is _Literal {
    return is_boolean(expr) || is_number(expr) || is_string(expr);
}

/** True if `expr` is an atomic, call, or complex node. */
export function is_tail(expr: Item): expr is _Tail {
    return is_atomic(expr) || is_call(expr) || is_complex(expr);
}

/** True if `expr` is a literal, binding, reference, lambda, or block node. */
export function is_atomic(expr: Item): expr is _Atomic {
    return is_literal(expr) || is_binding(expr) || is_reference(expr) || is_lambda(expr) || is_block(expr);
}

/** True if `expr` is a complex node; currently only if-then-else. */
export function is_complex(expr: Item): expr is _Complex {
    return is_if(expr);
}

/** True if `expr` is tagged '_Block'. */
export function is_block(expr: Item): expr is _Block {
    return expr.tag === '_Block';
}

/** True if `expr` is tagged '_LetBind'. */
export function is_let(expr: Item): expr is _LetBind {
    return expr.tag === '_LetBind';
}

/** True if `expr` is tagged '_Lambda'. */
export function is_lambda(expr: Item): expr is _Lambda {
    return expr.tag === '_Lambda';
}

/** True if `expr` is tagged '_Call'. */
export function is_call(expr: Item): expr is _Call {
    return expr.tag === '_Call';
}

/** True if `expr` is tagged '_IfThenElse'. */
export function is_if(expr: Item): expr is _IfThenElse {
    return expr.tag === '_IfThenElse';
}

/** True if `expr` is tagged '_Binding'. */
export function is_binding(expr: Item): expr is _Binding {
    return expr.tag === '_Binding';
}

/**
 * True if `expr` is tagged '_Reference'.
 * The predicate narrows to `_Reference`, matching the tag that is tested,
 * so callers get access to `target` rather than `name`.
 */
export function is_reference(expr: Item): expr is _Reference {
    return expr.tag === '_Reference';
}

/** True if `expr` is tagged '_Boolean'. */
export function is_boolean(expr: Item): expr is _Boolean {
    return expr.tag === '_Boolean';
}

/** True if `expr` is tagged '_Number'. */
export function is_number(expr: Item): expr is _Number {
    return expr.tag === '_Number';
}

/** True if `expr` is tagged '_String'. */
export function is_string(expr: Item): expr is _String {
    return expr.tag === '_String';
}
36+
37+
// Description of a parse failure: the index of the offending token and a message.
export type ParseError = {
    tag: 'ParseError';
    tk: number;
    message: string;
};
38+
39+
40+
//
// With look-ahead the parser can always decide which grammar rule to apply before committing to it.
// If an error still occurs after a rule has been chosen, there is no way back -> throw an exception.
// That simplifies the code drastically.
// The grammar must be free of ambiguity. This is achieved by introducing an ATOMIC_OR_CALL rule,
// which handles the ambiguity between ATOMIC and ATOMIC ATOMIC, i.e. a function call.
45+
46+
/**
 * Parse a token stream into a single program block.
 *
 * The tokens are first passed through `remove_whitespace`, then exactly one
 * block is parsed from the result.
 *
 * @param tokens the tokens to parse
 * @returns the root block of the program
 * @throws Error if tokens remain after the first block, or if the parser
 *         itself rejects the input
 */
export function parse(tokens: readonly Token[]): _Block {
    const parser = new ANF_Parser(remove_whitespace(tokens));
    const program: _Block = parser.block();

    // the whole input has been consumed, the program is complete
    if (parser.at_end()) {
        return program;
    }

    // anything left over after the first block would be a second program
    throw Error(`Expected input to be one program. A second program begins at token '${parser.peek().value}' of type '${parser.peek().lexeme}'.`);
}
57+
58+
/**
 * Recursive descent parser which turns a token stream into an AST.
 *
 * The grammar rule implemented by each parsing method is stated in the
 * comment above it. Each method decides with one token of look-ahead which
 * rule applies; once a rule is chosen, any mismatch is reported by throwing
 * an `Error`.
 *
 * Node ids are handed out from `node_count` at the moment a parsing method
 * is entered, i.e. a parent receives its id before its children do.
 */
export class ANF_Parser {
    // position of the current token within `tokens`
    index: number;
    // number of node ids handed out so far; the next id to be assigned
    node_count: number;
    readonly tokens: readonly Token[];

    constructor(tokens: readonly Token[]) {
        this.index = 0;
        this.node_count = 0;
        this.tokens = tokens;
    }

    // Helper functions

    // check:   check type of current token against an expectation
    // match:   check, and advance if it is a match
    // expect:  match, and throw if it is not a match

    // check if we have reached the end of the input
    at_end(): boolean {
        return is_token(this.peek(), 'EOF');
    }

    // look at the current token, throws if the index does not refer to a token
    peek(): Token {
        // an index which is negative or past the end yields 'undefined'
        const token: undefined | Token = this.tokens[this.index];
        if (token === undefined) {
            throw new Error(`Current token is out of bounds.`);
        }
        return token;
    }

    // look at the previous token, throws if there is no previous token
    previous(): Token {
        // at index 0 this reads position -1, which yields 'undefined' and is rejected
        const token: undefined | Token = this.tokens[this.index - 1];
        if (token === undefined) {
            throw new Error(`Previous token is out of bounds.`);
        }
        return token;
    }

    // move on to the next token
    advance() {
        this.index++;
    }

    // check type of current token against an expectation
    check(lexeme: Lexeme): undefined | Lexeme {
        return is_token(this.peek(), lexeme) ? lexeme : undefined;
    }

    // check the current token against an array of possibilities, returns the first one that matches
    check_any(lexemes: Lexeme[]): undefined | Lexeme {
        return lexemes.find((lexeme: Lexeme) => this.check(lexeme) !== undefined);
    }

    // check type of current token against an expectation, advance if it's a match
    match(lexeme: Lexeme): undefined | Lexeme {
        if (this.check(lexeme) === undefined) {
            return undefined;
        }
        this.advance();
        return lexeme;
    }

    // consume the current token if it has the expected type and return it, otherwise throw
    expect(lexeme: Lexeme): Token {
        const token: Token = this.peek();
        if (this.match(lexeme) === undefined) {
            throw this.report_expected([lexeme]);
        }
        return token;
    }

    // creates an 'Error' for mismatched tokens, which helps with consistent formatting and reduction of boiler-plate
    // the expected lexemes are listed as 'A', 'B', 'C'
    report_expected(lexemes: Lexeme[]): Error {
        const expected: string = lexemes.map((lexeme: Lexeme) => `'${lexeme}'`).join(', ');
        const actual: Token = this.peek();
        return new Error(`Expected ${expected} got '${actual.value}' of type '${actual.lexeme}'.`);
    }

    // BLOCK = open LET_STAR TAIL close
    block(): _Block {
        const id = this.node_count++;
        const tk = this.index;

        this.expect('OPEN');
        const lets: _LetBind[] = this.let_star();
        const tail: _Tail = this.tail();
        this.expect('CLOSE');

        return { id: id, tk: tk, tag: '_Block', let_bindings: lets, tail: tail };
    }

    // LET_STAR = *('let' BINDING '=' ATOMIC_OR_CALL 'in')
    let_star(): _LetBind[] {
        const bindings: _LetBind[] = [];

        while (this.check('LET') !== undefined) {
            const id = this.node_count++;
            const tk = this.index;

            this.advance(); // consume 'let', its presence was established by the loop condition
            const left: _Binding = this.binding();
            this.expect('ASSIGN');
            const right: _Atomic | _Call = this.atomic_or_call();
            this.expect('IN');

            bindings.push({ id: id, tk: tk, tag: '_LetBind', binding: left, value: right });
        }
        return bindings;
    }

    // LAMBDA = 'lambda' BINDING BLOCK
    lambda(): _Lambda {
        const id = this.node_count++;
        const tk = this.index;

        this.expect('LAMBDA');
        const binding: _Binding = this.binding();
        const body: _Block = this.block();

        return { id: id, tk: tk, tag: '_Lambda', binding: binding, body: body };
    }

    // IF = 'if' ATOMIC_OR_CALL 'then' BLOCK 'else' BLOCK
    if_then_else(): _IfThenElse {
        const id = this.node_count++;
        const tk = this.index;

        this.expect('IF');
        const condition: _Atomic | _Call = this.atomic_or_call();
        this.expect('THEN');
        const then_branch: _Block = this.block();
        this.expect('ELSE');
        const else_branch: _Block = this.block();

        return {
            id: id,
            tk: tk,
            tag: '_IfThenElse',
            condition: condition,
            then_branch: then_branch,
            else_branch: else_branch
        };
    }

    // TAIL = ATOMIC_OR_CALL | COMPLEX
    tail(): _Tail {
        if (this.is_token_atomic()) {
            return this.atomic_or_call();
        }
        if (this.is_token_complex()) {
            return this.complex();
        }
        throw this.report_expected(['BOOLEAN', 'NUMBER', 'STRING', 'IDENTIFIER', 'LAMBDA', 'OPEN', 'IF']);
    }

    // ATOMIC_OR_CALL = ATOMIC [ATOMIC]
    // An id is reserved before the first atom is parsed, so that a call is numbered before its children.
    // If no second atom follows, the first atom is returned as is and the reserved id stays unused.
    atomic_or_call(): _Atomic | _Call {
        const id = this.node_count++;
        const tk = this.index;

        const first_atom: _Atomic = this.atomic();
        if (!this.is_token_atomic()) {
            return first_atom;
        }

        const second_atom: _Atomic = this.atomic();
        return { id: id, tk: tk, tag: '_Call', fn: first_atom, arg: second_atom };
    }

    // ATOMIC = LITERAL | REFERENCE | LAMBDA | BLOCK
    atomic(): _Atomic {
        const tok: Token = this.peek();
        if (this.is_token_literal()) {
            return this.literal();
        }
        if (is_token(tok, 'IDENTIFIER')) {
            return this.reference();
        }
        if (is_token(tok, 'LAMBDA')) {
            return this.lambda();
        }
        if (is_token(tok, 'OPEN')) {
            return this.block();
        }
        throw this.report_expected(['BOOLEAN', 'NUMBER', 'STRING', 'IDENTIFIER', 'LAMBDA', 'OPEN']);
    }

    // does the current token start an ATOMIC?
    is_token_atomic(): undefined | Lexeme {
        return this.check_any(['BOOLEAN', 'NUMBER', 'STRING', 'IDENTIFIER', 'LAMBDA', 'OPEN']);
    }

    // COMPLEX = IF
    complex(): _Complex {
        if (is_token(this.peek(), 'IF')) {
            return this.if_then_else();
        }
        throw this.report_expected(['IF']);
    }

    // does the current token start a COMPLEX?
    is_token_complex(): undefined | Lexeme {
        return this.check_any(['IF']);
    }

    // BINDING = identifier
    binding(): _Binding {
        const id = this.node_count++;
        const tk = this.index;

        const token = this.expect('IDENTIFIER') as TokenIdentifier;
        return { id: id, tk: tk, tag: '_Binding', name: token.value };
    }

    // REFERENCE = identifier
    reference(): _Reference {
        const id = this.node_count++;
        const tk = this.index;

        const token = this.expect('IDENTIFIER') as TokenIdentifier;
        return { id: id, tk: tk, tag: '_Reference', target: token.value };
    }

    // LITERAL = boolean | number | string
    literal(): _Literal {
        const id = this.node_count++;
        const tk = this.index;

        // after a successful match the literal is the previous token
        if (this.match('BOOLEAN')) {
            return { id: id, tk: tk, tag: '_Boolean', value: (this.previous() as TokenBoolean).value };
        }
        if (this.match('NUMBER')) {
            return { id: id, tk: tk, tag: '_Number', value: (this.previous() as TokenNumber).value };
        }
        if (this.match('STRING')) {
            return { id: id, tk: tk, tag: '_String', value: (this.previous() as TokenString).value };
        }
        throw this.report_expected(['BOOLEAN', 'NUMBER', 'STRING']);
    }

    // does the current token start a LITERAL?
    is_token_literal(): undefined | Lexeme {
        return this.check_any(['BOOLEAN', 'NUMBER', 'STRING']);
    }
}

0 commit comments

Comments
 (0)