-def get_tokens(text):
+def get_tokens(filepath):
     tokens = []

+    with open(filepath, "r") as f:
+        text = f.read()
+
     text_len = len(text)

     i = 0
     while i < text_len:
         char = text[i]

         if char == "/":
-            i = tokenize_comment(i, text_len, text, tokens)
+            i = tokenize_comment(i, text_len, text, tokens, filepath)
         elif char == "\t":
-            i = tokenize_tabs(i, text_len, text, tokens)
+            i = tokenize_tabs(i, text_len, text, tokens, filepath)
         elif char == " ":
-            i = tokenize_spaces(i, text_len, text, tokens)
+            i = tokenize_spaces(i, text_len, text, tokens, filepath)
         elif char == "=":
-            i = tokenize_equals(i, text_len, text, tokens)
+            i = tokenize_equals(i, text_len, text, tokens, filepath)
         elif char == "\n":
-            i = tokenize_newline(i, text_len, text, tokens)
+            i = tokenize_newline(i, text_len, text, tokens, filepath)
         else:
-            i = tokenize_word(i, text_len, text, tokens)
+            i = tokenize_word(i, text_len, text, tokens, filepath)

     return tokens


-def get_token(type_, content):
-    return { "type": type_, "content": content }
+def get_token(type_, content, i, filepath):
+    return { "type": type_, "content": content, "index": i, "filepath": filepath }


-def tokenize_comment(i, text_len, text, tokens):
+def tokenize_comment(i, text_len, text, tokens, filepath):
     if i + 1 < text_len and text[i + 1] == "/":
-        return tokenize_single_line_comment(i, text_len, text, tokens)
+        return tokenize_single_line_comment(i, text_len, text, tokens, filepath)
     else:
-        return tokenize_multi_line_comment(i, text_len, text, tokens)
+        return tokenize_multi_line_comment(i, text_len, text, tokens, filepath)


-def tokenize_single_line_comment(i, text_len, text, tokens):
+def tokenize_single_line_comment(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] != "\n":
         token += text[i]
         i += 1

-    tokens.append(get_token("EXTRA", token))
+    tokens.append(get_token("EXTRA", token, i, filepath))

     return i


-def tokenize_multi_line_comment(i, text_len, text, tokens):
+def tokenize_multi_line_comment(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and not (text[i] == "*" and i + 1 < text_len and text[i + 1] == "/"):
@@ -56,66 +59,66 @@ def tokenize_multi_line_comment(i, text_len, text, tokens):
     token += "*/"
     i += 2

-    tokens.append(get_token("EXTRA", token))
+    tokens.append(get_token("EXTRA", token, i, filepath))

     return i


-def tokenize_tabs(i, text_len, text, tokens):
+def tokenize_tabs(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] == "\t":
         token += text[i]
         i += 1

-    tokens.append(get_token("TABS", token))
+    tokens.append(get_token("TABS", token, i, filepath))

     return i


-def tokenize_spaces(i, text_len, text, tokens):
+def tokenize_spaces(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] == " ":
         token += text[i]
         i += 1

-    tokens.append(get_token("EXTRA", token))
+    tokens.append(get_token("EXTRA", token, i, filepath))

     return i


-def tokenize_equals(i, text_len, text, tokens):
+def tokenize_equals(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] == "=":
         token += text[i]
         i += 1

-    tokens.append(get_token("EQUALS", token))
+    tokens.append(get_token("EQUALS", token, i, filepath))

     return i


-def tokenize_newline(i, text_len, text, tokens):
+def tokenize_newline(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] == "\n":
         token += text[i]
         i += 1

-    tokens.append(get_token("NEWLINES", token))  # TODO: Maybe use "NEWLINE" instead of the plural version?
+    tokens.append(get_token("NEWLINES", token, i, filepath))  # TODO: Maybe use "NEWLINE" instead of the plural version?

     return i


-def tokenize_word(i, text_len, text, tokens):
+def tokenize_word(i, text_len, text, tokens, filepath):
     token = ""

     while i < text_len and text[i] not in ("\t =\n") and not (text[i] == "/" and i + 1 < text_len and text[i + 1] == "/"):
         token += text[i]
         i += 1

-    tokens.append(get_token("WORD", token))
+    tokens.append(get_token("WORD", token, i, filepath))

     return i
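
For reference, a minimal usage sketch of the updated entry point (not part of this commit). The module name `tokenizer`, the sample path, and the sample source text are assumptions made for illustration only:

# Usage sketch (assumption: the functions above live in a module named tokenizer).
import tokenizer

# get_tokens() now takes a file path instead of the text itself, so write a
# small hypothetical sample file first.
sample_path = "sample.src"  # hypothetical file name
with open(sample_path, "w") as f:
    f.write("x\t= 1 // trailing comment\n")

# Each token dict now carries "index" and "filepath" alongside "type" and "content".
for token in tokenizer.get_tokens(sample_path):
    print(token["type"], repr(token["content"]), token["index"], token["filepath"])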