@@ -49,7 +49,7 @@ let mk_string buf s =
4949 | 't' -> '\t'
5050 | '\\' -> '\\'
5151 | '\' ' -> '\' '
52- | '\ "' -> '\ "'
52+ | '"' -> '"'
5353 | 'u' ->
5454 let j = ! i + 2 in
5555 begin
@@ -103,13 +103,13 @@ let frac = [%sedlex.regexp? num]
103103
104104let float =
105105 [% sedlex.regexp?
106- ( Opt sign, num, '.' , Opt frac
107- | Opt sign, num, Opt ('.' , Opt frac), ('e' | 'E' ), Opt sign, num
108- | Opt sign, " 0x" , hexnum, '.' , Opt hexfrac
109- | Opt sign, " 0x" , hexnum, Opt ('.' , Opt hexfrac), ('p' | 'P' ), Opt sign, num
110- | Opt sign, " inf"
111- | Opt sign, " nan"
112- | Opt sign, " nan:" , " 0x" , hexnum )]
106+ ( Opt sign, num, '.' , Opt frac
107+ | Opt sign, num, Opt ('.' , Opt frac), ('e' | 'E' ), Opt sign, num
108+ | Opt sign, " 0x" , hexnum, '.' , Opt hexfrac
109+ | Opt sign, " 0x" , hexnum, Opt ('.' , Opt hexfrac), ('p' | 'P' ), Opt sign, num
110+ | Opt sign, " inf"
111+ | Opt sign, " nan"
112+ | Opt sign, " nan:" , " 0x" , hexnum )]
113113
114114let nat = [% sedlex.regexp? num | " 0x" , hexnum]
115115
@@ -119,15 +119,38 @@ let num = [%sedlex.regexp? float | int | nat]
119119
120120let id_char =
121121 [% sedlex.regexp?
122- ( '0' .. '9'
123- | 'a' .. 'z'
124- | 'A' .. 'Z'
125- | '!' | '#' | '$' | '%' | '&' | '\' ' | '*' | '+' | '-' | '.' | '/' | ':'
126- | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '_' | '`' | '|' | '~' )]
122+ ( '0' .. '9'
123+ | 'a' .. 'z'
124+ | 'A' .. 'Z'
125+ | '!' | '#' | '$' | '%' | '&' | '\' ' | '*' | '+' | '-' | '.' | '/' | ':'
126+ | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '_' | '`' | '|' | '~' )]
127127
128128let string_elem = [% sedlex.regexp? Sub (any, " \" " ) | " \\\" " ]
129129
130- let name = [% sedlex.regexp? " \" " , Star string_elem, " \" " ]
130+ let utf8cont = [% sedlex.regexp? '\x80' .. '\xbf' ]
131+
132+ let utf8enc =
133+ [% sedlex.regexp?
134+ ( '\xc2' .. '\xdf' , utf8cont
135+ | '\xe0' , '\xa0' .. '\xbf' , utf8cont
136+ | '\xed' , '\x80' .. '\x9f' , utf8cont
137+ | ('\xe1' .. '\xec' | '\xee' .. '\xef' ), utf8cont, utf8cont
138+ | '\xf0' , '\x90' .. '\xbf' , utf8cont, utf8cont
139+ | '\xf4' , '\x80' .. '\x8f' , utf8cont, utf8cont
140+ | '\xf1' .. '\xf3' , utf8cont, utf8cont, utf8cont )]
141+
142+ let escape = [% sedlex.regexp? 'n' | 'r' | 't' | '\\' | '\' ' | '"' ]
143+
144+ let character =
145+ [% sedlex.regexp?
146+ ( Sub (any, ('"' | '\\' | '\x00' .. '\x1f' | '\x7f' .. '\xff' ))
147+ | utf8enc
148+ | '\\' , escape
149+ | '\\' , hexdigit, hexdigit
150+ | '\\' , " u{" , hexnum, '}' )]
151+
152+ (* TODO: use character here instead of string_elem ? *)
153+ let name = [% sedlex.regexp? '"' , Star string_elem, '"' ]
131154
132155let operator =
133156 [% sedlex.regexp? Plus ('0' .. '9' | 'a' .. 'z' | '.' | '_' | ':' ), Star name]
@@ -142,7 +165,7 @@ let bad_num = [%sedlex.regexp? num, Plus id]
142165
143166let annot_atom =
144167 [% sedlex.regexp?
145- Plus id_char | num | name | ',' | ';' | '[' | ']' | '{' | '}' ]
168+ Plus id_char | num | name | ',' | ';' | '[' | ']' | '{' | '}' ]
146169
147170let keywords =
148171 let tbl = Hashtbl. create 512 in
@@ -449,33 +472,34 @@ let keywords =
449472 tbl
450473
451474let rec token buf =
475+ (* Fmt.epr "LXM = %S@\n" (Utf8.lexeme buf); *)
452476 match % sedlex buf with
453477 | Plus any_blank -> token buf
454478 | bad_num | bad_id | bad_name -> unknown_operator buf
455479 | num -> NUM (Utf8. lexeme buf)
456- | operator -> begin
480+ | operator ->
457481 let operator = Utf8. lexeme buf in
458482 match Hashtbl. find_opt keywords operator with
459483 | None -> unknown_operator buf
460484 | Some v -> v
461- end
462- (* comment *)
463- | ";;" ->
464- single_comment buf;
465- token buf
466- | "(;" ->
467- comment buf;
485+ end
486+ (* comment *)
487+ | ";;" ->
488+ single_comment buf;
489+ token buf
490+ | "(;" ->
491+ comment buf;
492+ token buf
493+ (* custom annotation *)
494+ | "(@" , name ->
495+ let annotid = Utf8. lexeme buf in
496+ let annotid = String. sub annotid 3 (String. length annotid - 4 ) in
497+ let annotid = mk_string buf annotid in
498+ if String. equal " " annotid then raise Empty_annotation_id
499+ else
500+ let items = Sexp. List (annot buf) in
501+ Annot. (record_annot annotid items);
468502 token buf
469- (* custom annotation *)
470- | "(@" , name ->
471- let annotid = Utf8. lexeme buf in
472- let annotid = String. sub annotid 3 (String. length annotid - 4 ) in
473- let annotid = mk_string buf annotid in
474- if String. equal " " annotid then raise Empty_annotation_id
475- else
476- let items = Sexp. List (annot buf) in
477- Annot. (record_annot annotid items);
478- token buf
479503 | "(@" , Plus id_char ->
480504 let annotid = Utf8. lexeme buf in
481505 let annotid = String. sub annotid 2 (String. length annotid - 2 ) in
@@ -501,7 +525,6 @@ let rec token buf =
501525 NAME name
502526 | "\" " , Star string_elem -> raise Unclosed_string
503527 | eof -> EOF
504- (* | "" -> EOF *)
505528 | any -> unknown_operator buf
506529 | _ -> unknown_operator buf
507530
0 commit comments