3737 , location_span_of_positions (lexbuf.lex_start_p, lexbuf.lex_curr_p) )
3838}
3939
40+ (*
41+ OCamllex does not know about Unicode; it operates purely on bytes.
42+ So we describe the (approximately) valid UTF-8 byte sequences explicitly, as follows.
43+ *)
44+ (* lead byte of a 2-byte sequence: 110xxxxx *)
45+ let utf8_head_byte2 = ['\xC0' - '\xDF']
46+ (* lead byte of a 3-byte sequence: 1110xxxx *)
47+ let utf8_head_byte3 = ['\xE0' - '\xEF']
48+ (* lead byte of a 4-byte sequence: 11110xxx *)
49+ let utf8_head_byte4 = ['\xF0' - '\xF7']
50+ (* continuation byte: 10xxxxxx *)
51+ let utf8_tail_byte = ['\x80' - '\xBF']
52+
53+ (* One-byte UTF-8 sequences are plain ASCII; these are the ASCII bytes accepted by the identifier pattern *)
54+ let ascii_allowed = ['0' - '9' 'A' - 'Z' '_' 'a' - 'z']
55+ (* Two-byte sequence: 5 + 6 = 11 bits of payload *)
56+ let utf8_2 = utf8_head_byte2 utf8_tail_byte
57+ (* Three-byte sequence: 4 + 6 + 6 = 16 bits of payload *)
58+ let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte
59+ (* Four-byte sequence: 3 + 6 + 6 + 6 = 21 bits of payload *)
60+ let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte
61+
62+ (* Any UTF-8-encoded code point outside the ASCII range.
63+ For simplicity this pattern is broader than strict UTF-8: it also matches overlong encodings, surrogates, and values above U+10FFFF.
64+ *)
65+ let utf8_nonascii = utf8_2 | utf8_3 | utf8_4
66+
67+ (* This pattern is deliberately over-permissive (for example, it allows a leading digit or underscore);
68+ the matched text is checked in the semantic action of the lexer rule that uses it.
69+ *)
70+ let identifier = (ascii_allowed | utf8_nonascii)+
71+
4072(* Some auxiliary definitions for variables and constants *)
4173let string_literal = '"' [^ '"' '\r' '\n' ]* '"' (* double-quoted and single-line; the body cannot contain a double quote *)
42- let identifier = ['a' - 'z' 'A' - 'Z' ] ['a' - 'z' 'A' - 'Z' '0' - '9' '_' ]* (* TODO: We should probably expand the alphabet *)
4374
4475let integer_constant = ['0' - '9' ]+ ('_' ['0' - '9' ]+ )* (* digit groups separated by single underscores, e.g. 1_000; no leading, trailing, or doubled underscore *)
4576
@@ -198,8 +229,10 @@ rule token = parse
198229 | string_literal as s { lexer_logger (" string_literal " ^ s) ;
199230 Parser. STRINGLITERAL (lexeme lexbuf) }
200231 | identifier as id { lexer_logger (" identifier " ^ id) ;
201- lexer_pos_logger (lexeme_start_p lexbuf);
202- Parser. IDENTIFIER (lexeme lexbuf) }
232+ let loc = (lexeme_start_p lexbuf) in
233+ lexer_pos_logger loc;
234+ let canonical_id = Unicode. validate_identifier loc id in
235+ Parser. IDENTIFIER (canonical_id) }
203236(* End of file *)
204237 | eof { lexer_logger " eof" ;
205238 if Preprocessor. size () = 1
@@ -210,7 +243,8 @@ rule token = parse
210243
211244 | _ { raise (Errors. SyntaxError
212245 (Errors. Lexing
213- (location_of_position
246+ (" Invalid character found." ,
247+ location_of_position
214248 (lexeme_start_p
215249 (current_buffer () ))))) }
216250
0 commit comments