Skip to content

Commit 57998c1

Browse files
committed
Add lexer
0 parents  commit 57998c1

File tree

2 files changed

+463
-0
lines changed

2 files changed

+463
-0
lines changed

lexer.go

Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
package expr
2+
3+
import (
4+
"fmt"
5+
"strconv"
6+
"strings"
7+
"unicode"
8+
"unicode/utf8"
9+
)
10+
11+
// tokenKind identifies the lexical category of a token.
type tokenKind int
12+
13+
// Token kinds produced by the lexer.
const (
	name        tokenKind = iota // identifier
	number                       // numeric literal
	text                         // quoted string literal
	operator                     // symbolic or keyword operator
	punctuation                  // brackets and the characters . , ? :
	// eof is left untyped on purpose: next returns it as a rune and the
	// lexer also passes it to emit as a token kind.
	eof = -1
)
21+
22+
// token is a single lexical unit produced by the lexer.
type token struct {
	kind  tokenKind // category of the token
	value string    // token text; for text tokens, the unquoted string contents
	pos   int       // the lexer's start offset at the time the token was emitted
}
27+
28+
// is tests if token kind and value matches
29+
func (t token) is(kind tokenKind, values ...string) bool {
30+
var value *string
31+
if len(values) == 1 {
32+
value = &values[0]
33+
}
34+
return t.kind == kind && (value == nil || *value == t.value)
35+
}
36+
37+
func (t token) String() string {
38+
switch t.kind {
39+
case name:
40+
return fmt.Sprintf("name(%s)", t.value)
41+
case number:
42+
return fmt.Sprintf("number(%s)", t.value)
43+
case text:
44+
return fmt.Sprintf("text(%q)", t.value)
45+
case operator:
46+
return fmt.Sprintf("operator(%s)", t.value)
47+
case punctuation:
48+
return fmt.Sprintf("punctuation(%q)", t.value)
49+
case eof:
50+
return "EOF"
51+
default:
52+
return t.value
53+
}
54+
}
55+
56+
// stateFn is one state of the lexer: it is called with the lexer and returns
// the next state to run, or nil to stop.
type stateFn func(*lexer) stateFn
57+
58+
// lexer holds the scanning state for a single input string.
type lexer struct {
	input    string  // the string being scanned
	pos      int     // current byte offset in the input
	start    int     // byte offset where the pending token begins
	width    int     // byte width of the last rune read by next (0 at end of input)
	brackets []rune  // stack of opening brackets that are not yet closed
	tokens   []token // tokens scanned so far
	err      error   // error recorded by errorf, if any
}
67+
68+
// next returns the next rune in the input.
69+
func (l *lexer) next() rune {
70+
if l.pos >= len(l.input) {
71+
l.width = 0
72+
return eof
73+
}
74+
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
75+
l.width = w
76+
l.pos += l.width
77+
return r
78+
}
79+
80+
// peek returns but does not consume the next rune in the input.
81+
func (l *lexer) peek() rune {
82+
r := l.next()
83+
l.backup()
84+
return r
85+
}
86+
87+
// backup steps back one rune. Can only be called once per call of next.
88+
func (l *lexer) backup() {
89+
l.pos -= l.width
90+
}
91+
92+
func (l *lexer) word() string {
93+
return l.input[l.start:l.pos]
94+
}
95+
96+
func (l *lexer) emit(t tokenKind) {
97+
l.emitValue(t, l.word())
98+
}
99+
100+
func (l *lexer) emitValue(t tokenKind, value string) {
101+
c := len(l.tokens)
102+
// Special case for joining "not" and ".." operators
103+
if c > 0 && l.tokens[c-1].is(operator, "not") && t == operator && value == "in" {
104+
l.tokens[c-1].value = "not in"
105+
} else if c > 0 && l.tokens[c-1].is(punctuation, ".") && t == punctuation && value == "." {
106+
l.tokens[c-1].kind = operator
107+
l.tokens[c-1].value = ".."
108+
} else {
109+
l.tokens = append(l.tokens, token{
110+
kind: t,
111+
value: value,
112+
pos: l.start,
113+
})
114+
}
115+
l.start = l.pos
116+
}
117+
118+
// ignore skips over the pending input before this point.
119+
func (l *lexer) ignore() {
120+
l.start = l.pos
121+
}
122+
123+
func (l *lexer) accept(valid string) bool {
124+
if strings.ContainsRune(valid, l.next()) {
125+
return true
126+
}
127+
l.backup()
128+
return false
129+
}
130+
131+
func (l *lexer) acceptRun(valid string) {
132+
for strings.ContainsRune(valid, l.next()) {
133+
}
134+
l.backup()
135+
}
136+
137+
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
138+
l.err = &syntaxError{
139+
message: fmt.Sprintf(format, args...),
140+
input: l.input,
141+
pos: l.start,
142+
}
143+
return nil
144+
}
145+
146+
func lex(input string) ([]token, error) {
147+
l := &lexer{
148+
input: input,
149+
tokens: make([]token, 0),
150+
}
151+
for state := lexRoot; state != nil; {
152+
state = state(l)
153+
}
154+
return l.tokens, l.err
155+
}
156+
157+
// state functions
158+
159+
func lexRoot(l *lexer) stateFn {
160+
switch r := l.next(); {
161+
case r == eof:
162+
if len(l.brackets) > 0 {
163+
return l.errorf("unclosed %q", string(l.brackets[0]))
164+
}
165+
l.emit(eof)
166+
return nil
167+
case isSpace(r):
168+
l.ignore()
169+
return lexRoot
170+
case r == '\'' || r == '"':
171+
l.backup()
172+
return lexQuote
173+
case '0' <= r && r <= '9':
174+
l.backup()
175+
return lexNumber
176+
case strings.ContainsRune("([{", r):
177+
l.emit(punctuation)
178+
l.brackets = append(l.brackets, r)
179+
case strings.ContainsRune(")]}", r):
180+
if len(l.brackets) > 0 {
181+
bracket := l.brackets[len(l.brackets)-1]
182+
l.brackets = l.brackets[:len(l.brackets)-1]
183+
if isBracketMatch(bracket, r) {
184+
l.emit(punctuation)
185+
} else {
186+
return l.errorf("unclosed %q", string(bracket))
187+
}
188+
} else {
189+
return l.errorf("unexpected %q", string(r))
190+
}
191+
case strings.ContainsRune(".,?:", r):
192+
l.emit(punctuation)
193+
case strings.ContainsRune("!%&*+-/<=>^|~", r):
194+
l.backup()
195+
return lexOperator
196+
case isAlphaNumeric(r):
197+
l.backup()
198+
return lexName
199+
default:
200+
return l.errorf("unrecognized character: %#U", r)
201+
}
202+
return lexRoot
203+
}
204+
205+
func lexNumber(l *lexer) stateFn {
206+
if !l.scanNumber() {
207+
return l.errorf("bad number syntax: %q", l.word())
208+
}
209+
l.emit(number)
210+
return lexRoot
211+
}
212+
213+
func (l *lexer) scanNumber() bool {
214+
// Is it hex?
215+
digits := "0123456789"
216+
l.acceptRun(digits)
217+
if l.accept(".") {
218+
// Lookup for .. operator.
219+
if l.peek() == '.' {
220+
l.backup()
221+
return true
222+
}
223+
l.acceptRun(digits)
224+
}
225+
if l.accept("eE") {
226+
l.accept("+-")
227+
l.acceptRun("0123456789")
228+
}
229+
// Next thing mustn't be alphanumeric.
230+
if isAlphaNumeric(l.peek()) {
231+
l.next()
232+
return false
233+
}
234+
return true
235+
}
236+
237+
func lexQuote(l *lexer) stateFn {
238+
quote := l.next()
239+
Loop:
240+
for {
241+
switch l.next() {
242+
case '\\':
243+
if r := l.next(); r != eof && r != '\n' {
244+
break
245+
}
246+
fallthrough
247+
case eof:
248+
return l.errorf("unterminated string")
249+
case quote:
250+
break Loop
251+
}
252+
}
253+
word := strings.Trim(l.word(), `"'`)
254+
value, err := strconv.Unquote(`"` + word + `"`)
255+
if err != nil {
256+
return l.errorf("unquote error: %v", err)
257+
}
258+
l.emitValue(text, value)
259+
return lexRoot
260+
}
261+
262+
func lexOperator(l *lexer) stateFn {
263+
l.next()
264+
l.accept("|&=*")
265+
l.emit(operator)
266+
return lexRoot
267+
}
268+
269+
func lexName(l *lexer) stateFn {
270+
Loop:
271+
for {
272+
switch r := l.next(); {
273+
case isAlphaNumeric(r):
274+
// absorb.
275+
default:
276+
l.backup()
277+
switch l.word() {
278+
case "not":
279+
l.emit(operator)
280+
case "in":
281+
l.emit(operator)
282+
case "or":
283+
l.emit(operator)
284+
case "and":
285+
l.emit(operator)
286+
case "matches":
287+
l.emit(operator)
288+
default:
289+
l.emit(name)
290+
}
291+
break Loop
292+
}
293+
}
294+
return lexRoot
295+
}
296+
297+
// isSpace reports whether r is a space, tab, carriage return or newline.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t', '\r', '\n':
		return true
	}
	return false
}
300+
301+
func isAlphaNumeric(r rune) bool {
302+
return isAlphabetic(r) || unicode.IsDigit(r)
303+
}
304+
305+
// isAlphabetic reports whether r is an underscore or a Unicode letter.
func isAlphabetic(r rune) bool {
	if r == '_' {
		return true
	}
	return unicode.IsLetter(r)
}
308+
309+
// isBracketMatch reports whether close is the closing bracket that pairs
// with the opening bracket open: "()", "[]" or "{}".
func isBracketMatch(open, close rune) bool {
	switch open {
	case '(':
		return close == ')'
	case '[':
		return close == ']'
	case '{':
		return close == '}'
	}
	return false
}

0 commit comments

Comments
 (0)