First pass at unicode support

WardBrian · WardBrian · commit 092bf0665a7b · 2025-03-13T14:27:09.000-04:00
diff --git a/src/frontend/Errors.ml b/src/frontend/Errors.ml
@@ -4,7 +4,7 @@ open Core
 
 (** Our type of syntax error information *)
 type syntax_error =
-  | Lexing of Middle.Location.t
+  | Lexing of string * Middle.Location.t
   | UnexpectedEOF of Middle.Location.t
   | Include of string * Middle.Location.t
   | Parsing of string * Middle.Location_span.t
@@ -57,12 +57,12 @@ let pp_syntax_error ?printed_filename ?code ppf = function
         (Middle.Location_span.to_string ?printed_filename loc_span)
         (pp_context_with_message ?code)
         (message, loc_span.begin_loc)
-  | Lexing loc ->
+  | Lexing (message, loc) ->
       Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
         (Middle.Location.to_string ?printed_filename
            {loc with col_num= loc.col_num - 1})
         (pp_context_with_message ?code)
-        ("Invalid character found.", loc)
+        (message, loc)
   | UnexpectedEOF loc ->
       Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
         (Middle.Location.to_string ?printed_filename
diff --git a/src/frontend/Errors.mli b/src/frontend/Errors.mli
@@ -2,7 +2,7 @@
 
 (** Our type of syntax error information *)
 type syntax_error =
-  | Lexing of Middle.Location.t
+  | Lexing of string * Middle.Location.t
   | UnexpectedEOF of Middle.Location.t
   | Include of string * Middle.Location.t
   | Parsing of string * Middle.Location_span.t
diff --git a/src/frontend/Unicode.ml b/src/frontend/Unicode.ml
@@ -0,0 +1,45 @@
+(** [error ~loc msg] aborts lexing by raising [Errors.SyntaxError] with a
+    [Lexing] payload built from [msg] and the location derived from the
+    position [loc]. It never returns normally. *)
+let error ~loc msg =
+  let location = Preprocessor.location_of_position loc in
+  let failure = Errors.Lexing (msg, location) in
+  raise (Errors.SyntaxError failure)
+
+(** [pp_uchar ppf u] prints the code point [u] on [ppf]. Code points below
+    128 are printed as the [Char.escaped] form of the corresponding
+    character; all others are printed as ["U+"] followed by the code point in
+    uppercase hexadecimal, zero-padded to at least four digits. *)
+let pp_uchar ppf u =
+  match Uchar.to_int u with
+  | code when code >= 128 -> Fmt.pf ppf "U+%04X" code
+  | code -> code |> Char.chr |> Char.escaped |> Fmt.string ppf
+
+(** [validate_identifier loc id] checks that the lexeme [id] is an acceptable
+    identifier and returns its canonical spelling.
+
+    Validation is based on the Unicode Standard Annex #31: Unicode
+    Identifiers and Syntax (https://www.unicode.org/reports/tr31):
+    - [id] must be a valid UTF-8 string;
+    - [id] is normalized to NFKC;
+    - the first code point of the normalized string must be XID_Start, and
+      every code point must be XID_Continue.
+
+    @param loc position used for the location of any reported error
+    @param id the identifier text as matched by the lexer
+    @return the NFKC-normalized identifier
+    @raise Errors.SyntaxError
+      (with a [Lexing] payload) if [id] is not valid UTF-8 or contains a code
+      point that is not allowed at its position *)
+let validate_identifier loc id =
+  (* sanity check *)
+  if not (String.is_valid_utf_8 id) then
+    error ~loc "Identifier is not valid UTF-8 string";
+  (* normalize to NFKC as recommended *)
+  let id = Uunf_string.normalize_utf_8 `NFKC id in
+  let len = String.length id in
+  let out = Buffer.create 24 in
+  (* Walk the normalized string code point by code point. [pos] is a byte
+     offset into the normalized string, and it is this byte offset that the
+     error message reports. The walk stops on an ordered bound check rather
+     than on physical (in)equality of the two integers. *)
+  let rec check pos =
+    if pos < len then (
+      let decode = String.get_utf_8_uchar id pos in
+      let uchar = Uchar.utf_decode_uchar decode in
+      Buffer.add_utf_8_uchar out uchar;
+      if Int.equal pos 0 && not (Uucp.Id.is_xid_start uchar) then
+        error ~loc (Fmt.str "Invalid character: '%a'" pp_uchar uchar)
+      else if not (Uucp.Id.is_xid_continue uchar) then
+        error ~loc
+          (Fmt.str "Invalid character in identifier at offset %d: '%a'" pos
+             pp_uchar uchar)
+      else check (pos + Uchar.utf_decode_length decode))
+  in
+  check 0;
+  (* another sanity check: re-encoding the decoded code points must give back
+     exactly the normalized string *)
+  let res_id = Buffer.contents out in
+  (if not (String.equal res_id id) then
+     Core.(
+       Common.FatalError.fatal_error_msg
+         [%message "Failed to properly encode id during lexing!" (id : string)]));
+  id
diff --git a/src/frontend/dune b/src/frontend/dune
@@ -1,14 +1,20 @@
 (library
  (name frontend)
  (public_name stanc.frontend)
- (libraries core menhirLib yojson fmt middle stan_math_signatures)
+ ; uunf (NFKC normalization) and uucp (XID_Start / XID_Continue lookups) are
+ ; used by Unicode.ml
+ (libraries core menhirLib uunf uucp yojson fmt middle stan_math_signatures)
  (instrumentation
   (backend bisect_ppx))
  (inline_tests)
  (preprocess
   (pps ppx_jane ppx_deriving.fold ppx_deriving.map)))
 
-(ocamllex lexer)
+; Hand-written stand-in for the (ocamllex lexer) stanza so that extra flags
+; can be passed to ocamllex: -ml encodes the automaton as OCaml functions
+; instead of tables, and -q suppresses ocamllex's informational output so the
+; build stays as quiet as it was with the built-in stanza.
+(rule
+ (target lexer.ml)
+ (deps lexer.mll)
+ (action
+  (chdir
+   %{workspace_root}
+   (run %{bin:ocamllex} -ml -q -o %{target} %{deps}))))
 
 (rule
  (targets parsing_errors.ml)
diff --git a/src/frontend/lexer.mll b/src/frontend/lexer.mll
@@ -37,9 +37,40 @@
          , location_span_of_positions (lexbuf.lex_start_p, lexbuf.lex_curr_p) )
 }
 
+(*
+  OCamllex does not know about Unicode; it just operates over bytes.
+  So we describe the byte shapes of UTF-8 encoded code points by hand:
+  a head byte announces the sequence length, tail bytes carry the rest.
+*)
+(* 110xxxxx: head byte of a 2-byte sequence *)
+let utf8_head_byte2 = ['\192'-'\223']
+(* 1110xxxx: head byte of a 3-byte sequence *)
+let utf8_head_byte3 = ['\224'-'\239']
+(* 11110xxx: head byte of a 4-byte sequence *)
+let utf8_head_byte4 = ['\240'-'\247']
+(* 10xxxxxx: continuation byte *)
+let utf8_tail_byte = ['\128'-'\191']
+
+(* utf8_1 is ascii; only these ASCII characters may appear in identifiers *)
+let ascii_allowed = ['a'-'z' 'A'-'Z' '0'-'9' '_']
+(* 11 bits of payload *)
+let utf8_2 = utf8_head_byte2 utf8_tail_byte
+(* 16 bits of payload *)
+let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte
+(* 21 bits of payload *)
+let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte
+
+(* Any UTF-8-encoded code point, outside the ASCII range.
+  This set includes more than it should for simplicity: the patterns only
+  check the bit layout, so e.g. overlong encodings match as well.
+*)
+let utf8_nonascii = utf8_2 | utf8_3 | utf8_4
+
+(* Identifiers here are overly permissive (they may even start with a digit
+   or '_'), and are checked in the semantic action of the rule that matches
+   here.
+*)
+let identifier = (ascii_allowed | utf8_nonascii)+
+
 (* Some auxiliary definition for variables and constants *)
 let string_literal = '"' [^ '"' '\r' '\n']* '"'
-let identifier = ['a'-'z' 'A'-'Z'] ['a'-'z' 'A'-'Z' '0'-'9' '_']*   (* TODO: We should probably expand the alphabet *)
 
 let integer_constant =  ['0'-'9']+ ('_' ['0'-'9']+)*
 
@@ -198,8 +229,10 @@ rule token = parse
   | string_literal as s       { lexer_logger ("string_literal " ^ s) ;
                                 Parser.STRINGLITERAL (lexeme lexbuf) }
   | identifier as id          { lexer_logger ("identifier " ^ id) ;
-                                lexer_pos_logger (lexeme_start_p lexbuf);
-                                Parser.IDENTIFIER (lexeme lexbuf) }
+                                let loc = (lexeme_start_p lexbuf) in
+                                lexer_pos_logger loc;
+                                let canonical_id = Unicode.validate_identifier loc id in
+                                Parser.IDENTIFIER (canonical_id) }
 (* End of file *)
   | eof                       { lexer_logger "eof" ;
                                 if Preprocessor.size () = 1
@@ -210,7 +243,8 @@ rule token = parse
 
   | _                         { raise (Errors.SyntaxError
                                         (Errors.Lexing
-                                          (location_of_position
+                                          ("Invalid character found.",
+                                           location_of_position
                                             (lexeme_start_p
                                               (current_buffer ()))))) }
 
diff --git a/test/integration/bad/lang/unicode_normalization.stan b/test/integration/bad/lang/unicode_normalization.stan
@@ -0,0 +1,5 @@
+data {
+  real ñabc;
+  // Same name as above, but in a different Unicode encoding; it should still be rejected!
+  real ñabc;
+}
diff --git a/test/integration/good/lang/basic_unicode.stan b/test/integration/good/lang/basic_unicode.stan
@@ -0,0 +1,14 @@
+// Schools model whose variables are named with non-ASCII (Greek) letters;
+// placed under good/ because these identifiers are expected to be accepted.
+data {
+  int<lower=0> J;             // number of schools
+  array[J] real y;                  // estimated treatment effect (school j)
+  array[J] real<lower=0> σ;         // std err of effect estimate (school j)
+}
+parameters {
+  real μ;
+  array[J] real θ;
+  real<lower=0> τ;
+}
+model {
+  θ ~ normal(μ, τ);
+  y ~ normal(θ, σ);
+}