-
-
Notifications
You must be signed in to change notification settings - Fork 455
Improve parser and lexer #811
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
f835edd
10e4040
89f608f
436b8ae
152f67d
2f4437c
a13ee0e
af94f4b
e2111b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,48 +1,36 @@ | ||
package file | ||
|
||
import ( | ||
"strings" | ||
"unicode/utf8" | ||
) | ||
import "strings" | ||
|
||
type Source []rune | ||
type Source struct { | ||
raw string | ||
} | ||
|
||
func NewSource(contents string) Source { | ||
return []rune(contents) | ||
return Source{ | ||
raw: contents, | ||
} | ||
} | ||
|
||
func (s Source) String() string { | ||
return string(s) | ||
return s.raw | ||
} | ||
|
||
func (s Source) Snippet(line int) (string, bool) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This method is no longer used but I'm keeping it in case someone was using it. |
||
if s == nil { | ||
if s.raw == "" { | ||
return "", false | ||
} | ||
lines := strings.Split(string(s), "\n") | ||
lineOffsets := make([]int, len(lines)) | ||
var offset int | ||
for i, line := range lines { | ||
offset = offset + utf8.RuneCountInString(line) + 1 | ||
lineOffsets[i] = offset | ||
} | ||
charStart, found := getLineOffset(lineOffsets, line) | ||
if !found || len(s) == 0 { | ||
return "", false | ||
var start int | ||
for i := 1; i < line; i++ { | ||
pos := strings.IndexByte(s.raw[start:], '\n') | ||
if pos < 0 { | ||
return "", false | ||
} | ||
start += pos + 1 | ||
} | ||
charEnd, found := getLineOffset(lineOffsets, line+1) | ||
if found { | ||
return string(s[charStart : charEnd-1]), true | ||
} | ||
return string(s[charStart:]), true | ||
} | ||
|
||
func getLineOffset(lineOffsets []int, line int) (int, bool) { | ||
if line == 1 { | ||
return 0, true | ||
} else if line > 1 && line <= len(lineOffsets) { | ||
offset := lineOffsets[line-2] | ||
return offset, true | ||
end := start + strings.IndexByte(s.raw[start:], '\n') | ||
if end < start { | ||
end = len(s.raw) | ||
} | ||
return -1, false | ||
return s.raw[start:end], true | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,18 +3,19 @@ package lexer | |
import ( | ||
"fmt" | ||
"strings" | ||
"unicode/utf8" | ||
|
||
"github.com/expr-lang/expr/file" | ||
) | ||
|
||
const minTokens = 10 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This allows for optimistically allocating tokens. Discussing the "right" value for this parameter can be a loss of time (everyone has an opinion on what should be the "right" value). |
||
|
||
func Lex(source file.Source) ([]Token, error) { | ||
raw := source.String() | ||
l := &lexer{ | ||
source: source, | ||
tokens: make([]Token, 0), | ||
start: 0, | ||
end: 0, | ||
raw: raw, | ||
tokens: make([]Token, 0, minTokens), | ||
} | ||
l.commit() | ||
|
||
for state := root; state != nil; { | ||
state = state(l) | ||
|
@@ -28,10 +29,15 @@ func Lex(source file.Source) ([]Token, error) { | |
} | ||
|
||
type lexer struct { | ||
source file.Source | ||
raw string | ||
tokens []Token | ||
start, end int | ||
err *file.Error | ||
start, end pos | ||
eof bool | ||
} | ||
|
||
type pos struct { | ||
byte, rune int | ||
} | ||
|
||
const eof rune = -1 | ||
|
@@ -41,23 +47,39 @@ func (l *lexer) commit() { | |
} | ||
|
||
func (l *lexer) next() rune { | ||
if l.end >= len(l.source) { | ||
l.end++ | ||
if l.end.byte >= len(l.raw) { | ||
l.eof = true | ||
return eof | ||
} | ||
r := l.source[l.end] | ||
l.end++ | ||
r, sz := utf8.DecodeRuneInString(l.raw[l.end.byte:]) | ||
l.end.rune++ | ||
l.end.byte += sz | ||
return r | ||
} | ||
|
||
func (l *lexer) peek() rune { | ||
r := l.next() | ||
l.backup() | ||
return r | ||
if l.end.byte < len(l.raw) { | ||
r, _ := utf8.DecodeRuneInString(l.raw[l.end.byte:]) | ||
return r | ||
} | ||
return eof | ||
} | ||
|
||
func (l *lexer) peekByte() (byte, bool) { | ||
if l.end.byte >= 0 && l.end.byte < len(l.raw) { | ||
return l.raw[l.end.byte], true | ||
} | ||
return 0, false | ||
} | ||
|
||
func (l *lexer) backup() { | ||
l.end-- | ||
if l.eof { | ||
l.eof = false | ||
} else if l.end.rune > 0 { | ||
_, sz := utf8.DecodeLastRuneInString(l.raw[:l.end.byte]) | ||
l.end.byte -= sz | ||
l.end.rune-- | ||
} | ||
} | ||
|
||
func (l *lexer) emit(t Kind) { | ||
|
@@ -66,19 +88,19 @@ func (l *lexer) emit(t Kind) { | |
|
||
func (l *lexer) emitValue(t Kind, value string) { | ||
l.tokens = append(l.tokens, Token{ | ||
Location: file.Location{From: l.start, To: l.end}, | ||
Location: file.Location{From: l.start.rune, To: l.end.rune}, | ||
Kind: t, | ||
Value: value, | ||
}) | ||
l.commit() | ||
} | ||
|
||
func (l *lexer) emitEOF() { | ||
from := l.end - 2 | ||
from := l.end.rune - 1 | ||
if from < 0 { | ||
from = 0 | ||
} | ||
to := l.end - 1 | ||
to := l.end.rune - 0 | ||
if to < 0 { | ||
to = 0 | ||
} | ||
|
@@ -94,60 +116,37 @@ func (l *lexer) skip() { | |
} | ||
|
||
func (l *lexer) word() string { | ||
// TODO: boundary check is NOT needed here, but for some reason CI fuzz tests are failing. | ||
if l.start > len(l.source) || l.end > len(l.source) { | ||
return "__invalid__" | ||
} | ||
return string(l.source[l.start:l.end]) | ||
return l.raw[l.start.byte:l.end.byte] | ||
} | ||
|
||
func (l *lexer) accept(valid string) bool { | ||
if strings.ContainsRune(valid, l.next()) { | ||
if strings.ContainsRune(valid, l.peek()) { | ||
l.next() | ||
return true | ||
} | ||
l.backup() | ||
return false | ||
} | ||
|
||
func (l *lexer) acceptRun(valid string) { | ||
for strings.ContainsRune(valid, l.next()) { | ||
for l.accept(valid) { | ||
} | ||
l.backup() | ||
} | ||
|
||
func (l *lexer) skipSpaces() { | ||
r := l.peek() | ||
for ; r == ' '; r = l.peek() { | ||
l.next() | ||
} | ||
l.acceptRun(" ") | ||
l.skip() | ||
} | ||
|
||
func (l *lexer) acceptWord(word string) bool { | ||
pos := l.end | ||
|
||
l.skipSpaces() | ||
|
||
for _, ch := range word { | ||
if l.next() != ch { | ||
l.end = pos | ||
return false | ||
} | ||
} | ||
if r := l.peek(); r != ' ' && r != eof { | ||
l.end = pos | ||
return false | ||
} | ||
|
||
return true | ||
} | ||
|
||
func (l *lexer) error(format string, args ...any) stateFn { | ||
if l.err == nil { // show first error | ||
end := l.end.rune | ||
if l.eof { | ||
end++ | ||
} | ||
l.err = &file.Error{ | ||
Location: file.Location{ | ||
From: l.end - 1, | ||
To: l.end, | ||
From: end - 1, | ||
To: end, | ||
}, | ||
Message: fmt.Sprintf(format, args...), | ||
} | ||
|
@@ -225,6 +224,6 @@ func (l *lexer) scanRawString(quote rune) (n int) { | |
ch = l.next() | ||
n++ | ||
} | ||
l.emitValue(String, string(l.source[l.start+1:l.end-1])) | ||
l.emitValue(String, l.raw[l.start.byte+1:l.end.byte-1]) | ||
return | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -335,6 +335,7 @@ literal not terminated (1:10) | |
früh ♥︎ | ||
unrecognized character: U+2665 '♥' (1:6) | ||
| früh ♥︎ | ||
| .....^ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This appeared to be a small bug. In the original code, if it found a rune longer than 1 byte it would skip the indicator line entirely. I imagine that wouldn't be good if we have something like:
In that case it wouldn't put the indicator line because it will find the '世' rune. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, I guess the problem is this rune width in terminal. |
||
` | ||
|
||
func TestLex_error(t *testing.T) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe just abandon Source altogether? And simply use string?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree, I can do that really quick or in a separate PR if you prefer for easier reviewing.