@@ -8,14 +8,31 @@ let pp_uchar ppf u =
88 if u_int < 128 then Fmt. string ppf (Char. chr u_int |> Char. escaped)
99 else Fmt. pf ppf " U+%04X" u_int
1010
11+ let is_ascii s =
12+ let rec loop max b i =
13+ if i > max then true
14+ else if Bytes. get_uint8 b i < 128 then loop max b (i + 1 )
15+ else false in
16+ let b = Bytes. of_string s in
17+ loop (Bytes. length b - 1 ) b 0
18+
19+ let validate_ascii_id ~loc id =
20+ Debugging. lexer_logger (" ascii id: " ^ id);
21+ let first = String. get_uint8 id 0 in
22+ if
23+ (first > = Char. code 'A' && first < = Char. code 'Z' )
24+ || (first > = Char. code 'a' && first < = Char. code 'z' )
25+ then id
26+ else error ~loc " Invalid character found."
27+
1128(* Validation based on the
1229 Unicode Standard Annex #31: Unicode Identifiers and Syntax
1330 https://www.unicode.org/reports/tr31 *)
1431
15- let validate_identifier loc id =
16- (* sanity check *)
32+ let validate_utf8_id ~loc id =
1733 if not (String. is_valid_utf_8 id) then
18- error " Identifier is not valid UTF-8 string" ~loc ;
34+ error ~loc " Identifier is not valid UTF-8 string" ;
35+ Debugging. lexer_logger (" unicode id: " ^ id);
1936 (* normalize to NFKC as recommended *)
2037 let id = Uunf_string. normalize_utf_8 `NFKC id in
2138 let out = Buffer. create 24 in
@@ -40,6 +57,9 @@ let validate_identifier loc id =
4057 let res_id = Buffer. contents out in
4158 (if not (String. equal res_id id) then
4259 Core. (
43- Common.FatalError. fatal_error_msg
60+ Common.ICE. internal_compiler_error
4461 [% message " Failed to properly encode id during lexing!" (id : string )]));
4562 id
63+
64+ let validate_identifier loc id =
65+ if is_ascii id then validate_ascii_id ~loc id else validate_utf8_id ~loc id
0 commit comments