Skip to content

Commit 6cf8a10

Browse files
authored
Merge pull request #9 from regeda/tokenizer
Tokenizer
2 parents e4f2d72 + b62b95f commit 6cf8a10

File tree

22 files changed

+1673
-2786
lines changed

22 files changed

+1673
-2786
lines changed

Makefile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: gen-proto gen-fbs gen-lexer test bench bench-report escape-analysis
1+
.PHONY: gen-proto gen-fbs gen-tokenz test bench bench-report escape-analysis
22

33
default: test
44

@@ -10,11 +10,11 @@ gen-fbs:
1010
@rm -f bytecode/*.go
1111
@flatc -g -o . bytecode/proto.fbs
1212

13-
gen-lexer: gen-proto
14-
@ragel -Z -G2 lexer/lexer.go.rl -o lexer/lexer.go
15-
@goimports -w lexer/lexer.go
13+
gen-tokenz: gen-proto
14+
@ragel -Z -G2 tokenz/tokenz.go.rl -o tokenz/tokenz.go
15+
@goimports -w tokenz/tokenz.go
1616

17-
test: gen-fbs gen-lexer
17+
test: gen-fbs gen-tokenz
1818
@go test -v -cover ./...
1919

2020
bench: test

README.md

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Expr – a tiny stack-based virtual machine written in Go
2+
23
[![Build Status](https://travis-ci.com/regeda/expr.svg?branch=main)](https://travis-ci.com/regeda/expr)
34
[![codecov](https://codecov.io/gh/regeda/expr/branch/main/graph/badge.svg?token=99QXNC2IAO)](https://codecov.io/gh/regeda/expr)
45
[![Go Reference](https://pkg.go.dev/badge/github.com/regeda/expr.svg)](https://pkg.go.dev/github.com/regeda/expr)
@@ -90,16 +91,34 @@ contains([1, 2, 3], 4) // false
9091
```
9192

9293
## Architecture
93-
The architecture consists of 3 components:
94-
1. Lexer
95-
2. Compiler
96-
3. Virtual Machine
9794

98-
**The lexer** generates a syntax tree parsing the input text:
95+
The architecture consists of 4 components:
96+
1. Tokenizer
97+
2. Syntax Tree Builder
98+
3. Compiler
99+
4. Virtual Machine
100+
101+
**The Tokenizer** parses the input text:
99102
```
100103
join(",", ["a", "b"])
101104
```
102-
The resulted syntax tree:
105+
and returns the following tokens:
106+
```
107+
IDENT join
108+
PUNCT (
109+
STR ","
110+
PUNCT ,
111+
PUNCT [
112+
STR "a"
113+
PUNCT ,
114+
STR "b"
115+
PUNCT ]
116+
PUNCT )
117+
```
118+
119+
> The tokenizer is implemented using [Ragel State Machine Compiler](https://www.colm.net/open-source/ragel/).
120+
121+
**The Syntax Tree Builder** generates a syntax tree from tokens:
103122
```
104123
EXIT
105124
|-- CALL(join)
@@ -109,9 +128,9 @@ EXIT
109128
|-- STR("b")
110129
```
111130

112-
> The lexer is implemented using [Ragel State Machine Compiler](https://www.colm.net/open-source/ragel/). The syntax tree is described by [Protocol Buffers 3](https://developers.google.com/protocol-buffers/) to make it easy traversable by any programming language.
131+
> A schema of the syntax tree is described by [Protocol Buffers 3](https://developers.google.com/protocol-buffers/) to make it easily traversable by any programming language.
113132
114-
**The compiler** makes a bytecode from the syntax tree to make it executable by **a stack-based virtual machine**:
133+
**The Compiler** makes a bytecode from the syntax tree to make it executable by **a stack-based virtual machine**:
115134
```
116135
PUSH_STR ","
117136
PUSH_STR "a"
@@ -159,7 +178,7 @@ if err != nil {
159178
}
160179
// `addr` contains the result, see github.com/regeda/expr/memory.Addr
161180
```
162-
> `Exec` is **not designed** to be run in concurrent environment. However, you can define a pool of executors to consume them in the safe mode.
181+
> `Exec` is **not designed** to be run in a concurrent environment. However, you can define a pool of executors to consume them in the safe mode.
163182
164183
## Benchmark
165184

asm/asm.go

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,23 @@
11
package asm
22

33
import (
4+
"github.com/regeda/expr/ast"
45
"github.com/regeda/expr/compiler"
5-
"github.com/regeda/expr/lexer"
6+
"github.com/regeda/expr/tokenz"
67
)
78

89
type ASM struct {
9-
lex *lexer.Lexer
10-
comp *compiler.Compiler
11-
}
12-
13-
func New() *ASM {
14-
return &ASM{
15-
lex: lexer.New(),
16-
comp: compiler.New(),
17-
}
10+
comp compiler.Compiler
11+
tkz tokenz.Tokenz
12+
astb ast.Builder
1813
}
1914

2015
func (a *ASM) Assemble(code []byte) ([]byte, error) {
21-
ast, err := a.lex.Parse(code)
16+
tokens, err := a.tkz.Parse(code)
17+
if err != nil {
18+
return nil, err
19+
}
20+
ast, err := a.astb.Build(tokens)
2221
if err != nil {
2322
return nil, err
2423
}

ast/ast.pb.go

Lines changed: 31 additions & 27 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ast/ast.proto

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@ package ast;
44

55
message Node {
66
enum Token {
7-
EXIT = 0;
8-
CALL = 1;
9-
STR = 2;
10-
INT = 3;
11-
BOOL = 4;
12-
ARR = 5;
7+
EXIT = 0;
8+
CALL = 1;
9+
STR = 2;
10+
INT = 3;
11+
BOOL = 4;
12+
ARR = 5;
13+
IDENT = 6;
1314
}
1415

1516
Token token = 1;

ast/builder.go

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
package ast
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"strconv"
7+
8+
"github.com/pkg/errors"
9+
"github.com/regeda/expr/tokenz"
10+
)
11+
12+
// errTkPunctWrongLen is returned when a Punct token carries anything
// other than exactly one byte of data.
// Error string is lowercase per Go convention (no leading capital/"The").
var errTkPunctWrongLen = errors.New("token Punct should contain 1 byte of data")

var (
	// valueAfterPunct lists the punct bytes a value token may directly follow.
	valueAfterPunct = []byte{'[', '(', ','}
	// commaAfterPunct lists the punct bytes a comma may directly follow.
	commaAfterPunct = []byte{']', ')'}
)
20+
21+
type Builder struct {
22+
st stack
23+
}
24+
25+
func (b *Builder) Build(tokens []tokenz.Token) (*Node, error) {
26+
b.st.reset()
27+
28+
b.st.push(Exit())
29+
30+
btk := tokenz.Token{Tk: tokenz.TkNone}
31+
32+
for _, t := range tokens {
33+
switch t.Tk {
34+
case tokenz.TkInt:
35+
v, err := strconv.ParseInt(string(t.Dat), 10, 64)
36+
if err != nil {
37+
return nil, err
38+
}
39+
if !expectValueAfter(btk) {
40+
return nil, fmt.Errorf("unexpected integer after %v", btk)
41+
}
42+
b.st.nest(Int(v))
43+
case tokenz.TkStr:
44+
v, err := strconv.Unquote(string(t.Dat))
45+
if err != nil {
46+
return nil, errors.Wrapf(err, "strconv.Unquote %s", t.Dat)
47+
}
48+
if !expectValueAfter(btk) {
49+
return nil, fmt.Errorf("unexpected string after %v", btk)
50+
}
51+
b.st.nest(Str(v))
52+
case tokenz.TkIdent:
53+
if !expectValueAfter(btk) {
54+
return nil, fmt.Errorf("unexpected ident after %v", btk)
55+
}
56+
b.st.push(Ident(string(t.Dat)))
57+
case tokenz.TkTrue:
58+
if !expectValueAfter(btk) {
59+
return nil, fmt.Errorf("unexpected TRUE after %v", btk)
60+
}
61+
b.st.nest(True)
62+
case tokenz.TkFalse:
63+
if !expectValueAfter(btk) {
64+
return nil, fmt.Errorf("unexpected FALSE after %v", btk)
65+
}
66+
b.st.nest(False)
67+
case tokenz.TkPunct:
68+
if !t.DatLen(1) {
69+
return nil, errTkPunctWrongLen
70+
}
71+
switch t.Dat[0] {
72+
case '[':
73+
if !expectValueAfter(btk) {
74+
return nil, fmt.Errorf("unexpected array after %v", btk)
75+
}
76+
b.st.push(b.st.nest(Arr()))
77+
case ']':
78+
switch btk.Tk {
79+
case tokenz.TkInt, tokenz.TkStr, tokenz.TkTrue, tokenz.TkFalse, tokenz.TkPunct:
80+
default:
81+
return nil, fmt.Errorf("unexpected array closing after %v", btk)
82+
}
83+
n := b.st.pop()
84+
if n.Token != Node_ARR {
85+
return nil, fmt.Errorf("stack error: expected array, got %v", n.Token)
86+
}
87+
case '(':
88+
switch btk.Tk {
89+
case tokenz.TkIdent:
90+
n := b.st.pop()
91+
n.Token = Node_CALL
92+
b.st.push(b.st.nest(n))
93+
default:
94+
return nil, fmt.Errorf("unexpected invokation after %v", btk)
95+
}
96+
case ')':
97+
switch btk.Tk {
98+
case tokenz.TkInt, tokenz.TkStr, tokenz.TkTrue, tokenz.TkFalse, tokenz.TkPunct:
99+
default:
100+
return nil, fmt.Errorf("unexpected invokation closing after %v", btk)
101+
}
102+
n := b.st.pop()
103+
if n.Token != Node_CALL {
104+
return nil, fmt.Errorf("stack error: expected invokation, got %v", n.Token)
105+
}
106+
case ',':
107+
switch btk.Tk {
108+
case tokenz.TkInt, tokenz.TkStr, tokenz.TkTrue, tokenz.TkFalse:
109+
n := b.st.top()
110+
if n.Token != Node_CALL && n.Token != Node_ARR {
111+
return nil, fmt.Errorf("unexpected comma after %v", btk)
112+
}
113+
case tokenz.TkPunct:
114+
if bytes.IndexByte(commaAfterPunct, btk.Dat[0]) == -1 {
115+
return nil, fmt.Errorf("unexpected comma after %v", btk)
116+
}
117+
default:
118+
return nil, fmt.Errorf("unexpected comma after %v", btk)
119+
}
120+
default:
121+
return nil, fmt.Errorf("unexpected punct %s after %v", t.Dat, btk)
122+
}
123+
default:
124+
return nil, fmt.Errorf("unexpected token %v", t)
125+
}
126+
127+
btk = t
128+
}
129+
130+
if b.st.len() != 1 {
131+
return nil, fmt.Errorf("unexpected stack length %v", b.st.len())
132+
}
133+
134+
return b.st[0], nil
135+
}
136+
137+
func expectValueAfter(t tokenz.Token) bool {
138+
switch t.Tk {
139+
case tokenz.TkNone:
140+
return true
141+
case tokenz.TkPunct:
142+
return bytes.IndexByte(valueAfterPunct, t.Dat[0]) != -1
143+
default:
144+
return false
145+
}
146+
}

0 commit comments

Comments
 (0)