@@ -103,7 +103,8 @@ def get_token_kind_desc(kind):
103
103
def char_code_at(s, pos):
    """Return the code point of the character at index ``pos`` of ``s``.

    Returns ``None`` when ``pos`` is negative or not less than ``len(s)``.
    ``None`` (rather than ``0``) keeps "no character here" distinct from a
    real NUL character, whose code point is ``0``.
    """
    if 0 <= pos < len(s):
        return ord(s[pos])

    return None
107
108
108
109
109
110
PUNCT_CODE_TO_KIND = {
@@ -122,6 +123,10 @@ def char_code_at(s, pos):
122
123
}
123
124
124
125
126
def print_char_code(code):
    """Return a printable representation of a character code.

    ``None`` (no character available) is rendered as ``'EOF'``. Any other
    code is converted to its character and JSON-encoded, so the result is
    double-quoted and control characters appear in escaped form.
    """
    if code is None:
        return 'EOF'

    try:
        to_char = unichr
    except NameError:
        # Python 3 has no ``unichr`` builtin; ``chr`` is its equivalent.
        to_char = chr

    return json.dumps(to_char(code))
128
+
129
+
125
130
def read_token (source , from_position ):
126
131
"""Gets the next token from the source starting at the given position.
127
132
@@ -132,30 +137,52 @@ def read_token(source, from_position):
132
137
body_length = len (body )
133
138
134
139
position = position_after_whitespace (body , from_position )
135
- code = char_code_at (body , position )
136
140
137
141
if position >= body_length :
138
142
return Token (TokenKind .EOF , position , position )
139
143
144
+ code = char_code_at (body , position )
145
+
146
+ if code < 0x0020 and code not in (0x0009 , 0x000A , 0x000D ):
147
+ raise LanguageError (
148
+ source , position ,
149
+ u'Invalid character {}.' .format (print_char_code (code ))
150
+ )
151
+
140
152
kind = PUNCT_CODE_TO_KIND .get (code )
141
153
if kind is not None :
142
154
return Token (kind , position , position + 1 )
143
155
144
156
if code == 46 : # .
145
- if char_code_at (body , position + 1 ) == 46 and \
146
- char_code_at (body , position + 2 ) == 46 :
157
+ if char_code_at (body , position + 1 ) == char_code_at (body , position + 2 ) == 46 :
147
158
return Token (TokenKind .SPREAD , position , position + 3 )
159
+
148
160
elif 65 <= code <= 90 or code == 95 or 97 <= code <= 122 :
149
161
# A-Z, _, a-z
150
162
return read_name (source , position )
163
+
151
164
elif code == 45 or 48 <= code <= 57 : # -, 0-9
152
165
return read_number (source , position , code )
166
+
153
167
elif code == 34 : # "
154
168
return read_string (source , position )
155
169
156
170
raise LanguageError (
157
171
source , position ,
158
- u'Unexpected character {}' .format (json .dumps (body [position ])))
172
+ u'Unexpected character {}.' .format (print_char_code (code )))
173
+
174
# Character codes that position_after_whitespace steps over between tokens.
ignored_whitespace_characters = frozenset([
    # BOM
    0xFEFF,
    # White Space
    0x0009,  # tab
    0x0020,  # space
    # Line Terminator
    0x000A,  # new line
    0x000D,  # carriage return
    # Comma
    0x002C,
])
159
186
160
187
161
188
def position_after_whitespace (body , start_position ):
@@ -166,20 +193,16 @@ def position_after_whitespace(body, start_position):
166
193
position = start_position
167
194
while position < body_length :
168
195
code = char_code_at (body , position )
169
- if code in (
170
- 32 , # space
171
- 44 , # comma
172
- 160 , # '\xa0'
173
- 0x2028 , # line separator
174
- 0x2029 , # paragraph separator
175
- ) or (code > 8 and code < 14 ): # whitespace
196
+ if code in ignored_whitespace_characters :
176
197
position += 1
198
+
177
199
elif code == 35 : # #, skip comments
178
200
position += 1
179
201
while position < body_length :
180
202
code = char_code_at (body , position )
181
- if not code or code in (10 , 13 , 0x2028 , 0x2029 ):
203
+ if not ( code is not None and ( code > 0x001F or code == 0x0009 ) and code not in (0x000A , 0x000D ) ):
182
204
break
205
+
183
206
position += 1
184
207
else :
185
208
break
@@ -204,43 +227,34 @@ def read_number(source, start, first_code):
204
227
if code == 48 : # 0
205
228
position += 1
206
229
code = char_code_at (body , position )
207
- elif 49 <= code <= 57 : # 1 - 9
208
- position += 1
209
- code = char_code_at (body , position )
210
- while 48 <= code <= 57 : # 0 - 9
211
- position += 1
212
- code = char_code_at (body , position )
230
+
231
+ if code is not None and 48 <= code <= 57 :
232
+ raise LanguageError (
233
+ source ,
234
+ position ,
235
+ u'Invalid number, unexpected digit after 0: {}.' .format (print_char_code (code ))
236
+ )
213
237
else :
214
- raise LanguageError (source , position , 'Invalid number' )
238
+ position = read_digits (source , position , code )
239
+ code = char_code_at (body , position )
215
240
216
241
if code == 46 : # .
217
242
is_float = True
218
243
219
244
position += 1
220
245
code = char_code_at (body , position )
221
- if 48 <= code <= 57 : # 0 - 9
222
- position += 1
223
- code = char_code_at (body , position )
224
- while 48 <= code <= 57 : # 0 - 9
225
- position += 1
226
- code = char_code_at (body , position )
227
- else :
228
- raise LanguageError (source , position , 'Invalid number' )
246
+ position = read_digits (source , position , code )
247
+ code = char_code_at (body , position )
229
248
230
- if code == 101 : # e
249
+ if code in (69 , 101 ): # E e
250
+ is_float = True
251
+ position += 1
252
+ code = char_code_at (body , position )
253
+ if code in (43 , 45 ): # + -
231
254
position += 1
232
255
code = char_code_at (body , position )
233
- if code == 45 : # -
234
- position += 1
235
- code = char_code_at (body , position )
236
- if 48 <= code <= 57 : # 0 - 9
237
- position += 1
238
- code = char_code_at (body , position )
239
- while 48 <= code <= 57 : # 0 - 9
240
- position += 1
241
- code = char_code_at (body , position )
242
- else :
243
- raise LanguageError (source , position , 'Invalid number' )
256
+
257
+ position = read_digits (source , position , code )
244
258
245
259
return Token (
246
260
TokenKind .FLOAT if is_float else TokenKind .INT ,
@@ -250,6 +264,28 @@ def read_number(source, start, first_code):
250
264
)
251
265
252
266
267
def read_digits(source, start, first_code):
    """Consume a run of one or more ASCII digits.

    Args:
        source: object whose ``body`` attribute is the text being lexed.
        start: index in ``source.body`` where the first digit is expected.
        first_code: character code at ``start`` (``None`` if there is no
            character there).

    Returns:
        The index of the first character after the run of digits.

    Raises:
        LanguageError: if ``first_code`` is not a digit (0-9); the error is
            reported at ``start``.
    """
    if first_code is None or not 48 <= first_code <= 57:  # not 0 - 9
        raise LanguageError(
            source,
            start,
            u'Invalid number, expected digit but got: {}.'.format(print_char_code(first_code))
        )

    body = source.body

    # The first digit is already known to be valid; advance past it and
    # keep going while the following characters are digits as well.
    position = start + 1
    code = char_code_at(body, position)
    while code is not None and 48 <= code <= 57:  # 0 - 9
        position += 1
        code = char_code_at(body, position)

    return position
287
+
288
+
253
289
ESCAPED_CHAR_CODES = {
254
290
34 : '"' ,
255
291
47 : '/' ,
@@ -268,47 +304,73 @@ def read_string(source, start):
268
304
"([^"\\ \u000A \u000D \u2028 \u2029 ]|(\\ (u[0-9a-fA-F]{4}|["\\ /bfnrt])))*"
269
305
"""
270
306
body = source .body
307
+ body_length = len (body )
308
+
271
309
position = start + 1
272
310
chunk_start = position
273
- code = None
274
- value = u''
311
+ code = 0
312
+ value = []
313
+ append = value .append
275
314
276
- while position < len ( body ) :
315
+ while position < body_length :
277
316
code = char_code_at (body , position )
278
- if not code or code in (34 , 10 , 13 , 0x2028 , 0x2029 ):
317
+ if not (
318
+ code is not None and
319
+ code not in (
320
+ # LineTerminator
321
+ 0x000A , 0x000D ,
322
+ # Quote
323
+ 34
324
+ )
325
+ ):
279
326
break
327
+
328
+ if code < 0x0020 and code != 0x0009 :
329
+ raise LanguageError (
330
+ source ,
331
+ position ,
332
+ u'Invalid character within String: {}.' .format (print_char_code (code ))
333
+ )
334
+
280
335
position += 1
281
336
if code == 92 : # \
282
- value += body [chunk_start :position - 1 ]
337
+ append (body [chunk_start :position - 1 ])
338
+
283
339
code = char_code_at (body , position )
284
340
escaped = ESCAPED_CHAR_CODES .get (code )
285
341
if escaped is not None :
286
- value += escaped
287
- elif code == 117 :
342
+ append (escaped )
343
+
344
+ elif code == 117 : # u
288
345
char_code = uni_char_code (
289
346
char_code_at (body , position + 1 ) or 0 ,
290
347
char_code_at (body , position + 2 ) or 0 ,
291
348
char_code_at (body , position + 3 ) or 0 ,
292
349
char_code_at (body , position + 4 ) or 0 ,
293
350
)
351
+
294
352
if char_code < 0 :
295
353
raise LanguageError (
296
354
source , position ,
297
- 'Bad character escape sequence' )
298
- value += unichr (char_code )
355
+ u'Invalid character escape sequence: \\ u{}.' .format (body [position + 1 : position + 5 ])
356
+ )
357
+
358
+ append (unichr (char_code ))
299
359
position += 4
300
360
else :
301
361
raise LanguageError (
302
362
source , position ,
303
- 'Bad character escape sequence' )
363
+ u'Invalid character escape sequence: \\ {}.' .format (unichr (code ))
364
+ )
365
+
304
366
position += 1
305
367
chunk_start = position
306
368
307
- if code != 34 :
369
+ if code != 34 : # Quote (")
308
370
raise LanguageError (source , position , 'Unterminated string' )
309
371
310
- value += body [chunk_start :position ]
311
- return Token (TokenKind .STRING , start , position + 1 , value )
372
+ append ( body [chunk_start :position ])
373
+ return Token (TokenKind .STRING , start , position + 1 , u'' . join ( value ) )
312
374
313
375
314
376
def uni_char_code (a , b , c , d ):
@@ -348,15 +410,17 @@ def read_name(source, position):
348
410
body = source .body
349
411
body_length = len (body )
350
412
end = position + 1
351
- code = None
413
+
352
414
while end != body_length :
353
415
code = char_code_at (body , end )
354
- if not code or not (
416
+ if not ( code is not None and (
355
417
code == 95 or # _
356
418
48 <= code <= 57 or # 0-9
357
419
65 <= code <= 90 or # A-Z
358
420
97 <= code <= 122 # a-z
359
- ):
421
+ )) :
360
422
break
423
+
361
424
end += 1
425
+
362
426
return Token (TokenKind .NAME , position , end , body [position :end ])
0 commit comments