Commit 002464c

initial commit
1 parent 2cf36cf commit 002464c

File tree: 5 files changed, +436 −0 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
.idea

go.mod

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
module github.com/speakeasy-api/jsonpath

go 1.20

go.sum

Whitespace-only changes.

pkg/token.go

Lines changed: 303 additions & 0 deletions
@@ -0,0 +1,303 @@
package pkg

import (
	"fmt"
	"strconv"
	"strings"
)

// Token represents a lexical token in a JSONPath expression.
type Token int

// The list of tokens.
const (
	ILLEGAL Token = iota
	EOF
	LITERAL
	NUMBER
	STRING
	BOOLEAN
	NULL
	ROOT
	CURRENT
	WILDCARD
	RECURSIVE
	UNION
	CHILD
	SUBSCRIPT
	SLICE
	FILTER
	PAREN_LEFT
	PAREN_RIGHT
	BRACKET_LEFT
	BRACKET_RIGHT
	BRACE_LEFT
	BRACE_RIGHT
	COLON
	COMMA
	DOT
	PIPE
	QUESTION
)

var tokens = [...]string{
	ILLEGAL:       "ILLEGAL",
	EOF:           "EOF",
	LITERAL:       "LITERAL",
	NUMBER:        "NUMBER",
	STRING:        "STRING",
	BOOLEAN:       "BOOLEAN",
	NULL:          "NULL",
	ROOT:          "$",
	CURRENT:       "@",
	WILDCARD:      "*",
	RECURSIVE:     "..",
	UNION:         ",",
	CHILD:         ".",
	SUBSCRIPT:     "[]",
	SLICE:         ":",
	FILTER:        "?",
	PAREN_LEFT:    "(",
	PAREN_RIGHT:   ")",
	BRACKET_LEFT:  "[",
	BRACKET_RIGHT: "]",
	BRACE_LEFT:    "{",
	BRACE_RIGHT:   "}",
	COLON:         ":",
	COMMA:         ",",
	DOT:           ".",
	PIPE:          "|",
	QUESTION:      "?",
}

// String returns the string representation of the token.
func (tok Token) String() string {
	if tok >= 0 && tok < Token(len(tokens)) {
		return tokens[tok]
	}
	return "token(" + strconv.Itoa(int(tok)) + ")"
}

// ErrorString formats an error for the target token, quoting the offending
// line of input and pointing a caret at the token's column.
func (t Tokenizer) ErrorString(target TokenInfo, msg string) string {
	var errorBuilder strings.Builder

	// Write the error message with line and column information.
	errorBuilder.WriteString(fmt.Sprintf("Error at line %d, column %d: %s\n", target.Line, target.Column, msg))

	// Find the start of the line containing the target token by skipping
	// the first target.Line-1 newlines.
	lineStart := 0
	for i := 1; i < target.Line; i++ {
		if pos := strings.IndexByte(t.input[lineStart:], '\n'); pos != -1 {
			lineStart += pos + 1
		}
	}
	// Find the end of that line.
	lineEnd := len(t.input)
	if pos := strings.IndexByte(t.input[lineStart:], '\n'); pos != -1 {
		lineEnd = lineStart + pos
	}

	// Extract the line containing the target token.
	line := t.input[lineStart:lineEnd]
	errorBuilder.WriteString(line)
	errorBuilder.WriteString("\n")

	// Calculate the number of spaces before the target token.
	spaces := strings.Repeat(" ", target.Column)

	// Write the caret symbol pointing to the target token.
	errorBuilder.WriteString(spaces)
	errorBuilder.WriteString("^\n")

	return errorBuilder.String()
}

// TokenInfo represents a token and its associated information.
type TokenInfo struct {
	Token   Token
	Line    int
	Column  int
	Literal string
}

// Tokenizer represents a JSONPath tokenizer.
type Tokenizer struct {
	input  string
	pos    int
	line   int
	column int
	tokens []TokenInfo
}

// NewTokenizer creates a new JSONPath tokenizer for the given input string.
func NewTokenizer(input string) *Tokenizer {
	return &Tokenizer{
		input: input,
		line:  1,
	}
}

// Tokenize tokenizes the input string and returns a slice of TokenInfo.
func (t *Tokenizer) Tokenize() []TokenInfo {
	for t.pos < len(t.input) {
		t.skipWhitespace()
		if t.pos >= len(t.input) {
			break
		}

		switch ch := t.input[t.pos]; {
		case ch == '$':
			t.addToken(ROOT, "")
		case ch == '@':
			t.addToken(CURRENT, "")
		case ch == '*':
			t.addToken(WILDCARD, "")
		case ch == '.':
			if t.peek() == '.' {
				t.addToken(RECURSIVE, "")
				// Consume the second '.' of the ".." operator so it is
				// not re-tokenized as a CHILD token.
				t.pos++
				t.column++
			} else {
				t.addToken(CHILD, "")
			}
		case ch == ',':
			t.addToken(UNION, "")
		case ch == ':':
			t.addToken(SLICE, "")
		case ch == '?':
			t.addToken(FILTER, "")
		case ch == '(':
			t.addToken(PAREN_LEFT, "")
		case ch == ')':
			t.addToken(PAREN_RIGHT, "")
		case ch == '[':
			t.addToken(BRACKET_LEFT, "")
		case ch == ']':
			t.addToken(BRACKET_RIGHT, "")
		case ch == '{':
			t.addToken(BRACE_LEFT, "")
		case ch == '}':
			t.addToken(BRACE_RIGHT, "")
		case ch == '|':
			t.addToken(PIPE, "")
		case ch == '"' || ch == '\'':
			t.scanString(rune(ch))
		case isDigit(ch):
			t.scanNumber()
		case isLetter(ch):
			t.scanLiteral()
		default:
			t.addToken(ILLEGAL, string(ch))
		}
		t.pos++
		t.column++
	}

	t.addToken(EOF, "")
	return t.tokens
}

func (t *Tokenizer) addToken(token Token, literal string) {
	t.tokens = append(t.tokens, TokenInfo{
		Token:   token,
		Line:    t.line,
		Column:  t.column,
		Literal: literal,
	})
}

// scanString scans a quoted string literal, excluding the surrounding
// quotes. Escape sequences are not handled. An unterminated string is
// emitted as an ILLEGAL token.
func (t *Tokenizer) scanString(quote rune) {
	start := t.pos + 1
	for i := start; i < len(t.input); i++ {
		if t.input[i] == byte(quote) {
			t.addToken(STRING, t.input[start:i])
			t.pos = i
			t.column += i - start + 1
			return
		}
	}
	t.addToken(ILLEGAL, t.input[start:])
	t.pos = len(t.input) - 1
	t.column = len(t.input) - 1
}

// scanNumber scans a run of decimal digits as a NUMBER token.
func (t *Tokenizer) scanNumber() {
	start := t.pos
	for i := start; i < len(t.input); i++ {
		if !isDigit(t.input[i]) {
			t.addToken(NUMBER, t.input[start:i])
			t.pos = i - 1
			t.column += i - start - 1
			return
		}
	}
	t.addToken(NUMBER, t.input[start:])
	t.pos = len(t.input) - 1
	t.column = len(t.input) - 1
}

// scanLiteral scans a run of letters and classifies it as a BOOLEAN
// ("true"/"false"), NULL ("null"), or LITERAL token.
func (t *Tokenizer) scanLiteral() {
	start := t.pos
	for i := start; i < len(t.input); i++ {
		if !isLetter(t.input[i]) {
			literal := t.input[start:i]
			switch literal {
			case "true", "false":
				t.addToken(BOOLEAN, literal)
			case "null":
				t.addToken(NULL, literal)
			default:
				t.addToken(LITERAL, literal)
			}
			t.pos = i - 1
			t.column += i - start - 1
			return
		}
	}
	literal := t.input[start:]
	switch literal {
	case "true", "false":
		t.addToken(BOOLEAN, literal)
	case "null":
		t.addToken(NULL, literal)
	default:
		t.addToken(LITERAL, literal)
	}
	t.pos = len(t.input) - 1
	t.column = len(t.input) - 1
}

// skipWhitespace advances past spaces, tabs, carriage returns, and
// newlines, updating line and column tracking.
func (t *Tokenizer) skipWhitespace() {
	for t.pos < len(t.input) {
		ch := t.input[t.pos]
		if ch == '\n' {
			t.line++
			t.pos++
			t.column = 0
		} else if !isSpace(ch) {
			break
		} else {
			t.pos++
			t.column++
		}
	}
}

// peek returns the next byte of input without consuming it, or 0 at end
// of input.
func (t *Tokenizer) peek() byte {
	if t.pos+1 < len(t.input) {
		return t.input[t.pos+1]
	}
	return 0
}

func isDigit(ch byte) bool {
	return '0' <= ch && ch <= '9'
}

func isLetter(ch byte) bool {
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z'
}

func isSpace(ch byte) bool {
	return ch == ' ' || ch == '\t' || ch == '\r'
}
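
For reviewers, a minimal usage sketch of the new API (not part of this commit; the expression strings and the main package below are illustrative only):

package main

import (
	"fmt"

	"github.com/speakeasy-api/jsonpath/pkg"
)

func main() {
	// Tokenize a simple JSONPath expression and print each token.
	tokenizer := pkg.NewTokenizer("$.store.book[0].title")
	for _, info := range tokenizer.Tokenize() {
		fmt.Printf("line %d, col %d: %s %q\n", info.Line, info.Column, info.Token, info.Literal)
	}

	// ErrorString quotes the offending line and points a caret at the
	// token; '#' matches no case in Tokenize, so it becomes ILLEGAL.
	bad := pkg.NewTokenizer("$.store.book[#]")
	for _, info := range bad.Tokenize() {
		if info.Token == pkg.ILLEGAL {
			fmt.Print(bad.ErrorString(info, "unexpected character"))
			break
		}
	}
}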
