stan-dev · WardBrian · Feb 15, 2024 · Feb 15, 2024 · Feb 15, 2024 · Feb 15, 2024
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -50,9 +50,9 @@ def runPerformanceTests(String testsPath, String stancFlags = ""){
         cd cmdstan; make clean-all;
     """
 
-    if (stancFlags?.trim()) {
-        sh "cd performance-tests-cmdstan/cmdstan && echo 'STANCFLAGS= $stancFlags' >> make/local"
-    }
+    // if (stancFlags?.trim()) {
+        sh "cd performance-tests-cmdstan/cmdstan && echo 'STANCFLAGS= --allow-unicode $stancFlags' >> make/local"
+    // }
 
     sh """
         cd performance-tests-cmdstan/cmdstan
@@ -115,7 +115,7 @@ pipeline {
     }
     environment {
         CXX = 'clang++-6.0'
-        MACOS_SWITCH = 'stanc3-4.14'
+        MACOS_SWITCH = 'stanc3-4.14-unicode'
         PARALLEL = 4
         GIT_AUTHOR_NAME = 'Stan Jenkins'
         GIT_AUTHOR_EMAIL = '[email protected]'
@@ -803,7 +803,7 @@ pipeline {
                                 dir '.'
                                 label 'linux && emulation'
                                 args "${qemuArchFlag(ARCHITECTURE)} --group-add=987 --group-add=980 --group-add=988 --entrypoint='' -v /var/run/docker.sock:/var/run/docker.sock"
-                                additionalBuildArgs  "${qemuArchFlag(ARCHITECTURE)} --build-arg PUID=\$(id -u) --build-arg PGID=\$(id -g)"
+                                additionalBuildArgs  "${qemuArchFlag(ARCHITECTURE)} --ulimit stack=67108864 --build-arg PUID=\$(id -u) --build-arg PGID=\$(id -g)"
                             }
                         }
                         steps {

diff --git a/dune-project b/dune-project
@@ -26,6 +26,10 @@
    (= 2.1.0))
   (cmdliner
    (= 1.3.0))
+  (uucp
+   (= 16.0.0))
+  (uunf
+   (= 16.0.0))
   (ocamlformat
    (and
     :with-test

diff --git a/scripts/install_build_deps.sh b/scripts/install_build_deps.sh
@@ -7,6 +7,7 @@ eval $(opam env)
 
 opam pin -y core v0.16.0 --no-action
 
-opam install -y dune core.v0.16.0 menhir.20230608 ppx_deriving.5.2.1 fmt.0.9.0 yojson.2.1.0 cmdliner.1.3.0
+opam install -y dune core.v0.16.0 menhir.20230608 ppx_deriving.5.2.1 fmt.0.9.0 yojson.2.1.0\
+ cmdliner.1.3.0 uucp.16.0.0 uunf.16.0.0
 
 eval $(opam env)
diff --git a/scripts/install_build_deps_windows.sh b/scripts/install_build_deps_windows.sh
@@ -9,14 +9,12 @@ eval $(opam env)
 # Add windows repository
 opam repository add windows http://github.com/ocaml-cross/opam-cross-windows.git
 
-# Request the compiler to be built with flambda optimizers
-opam install -y conf-flambda-windows
-
 # Install the compiler
 opam install -y "ocaml-windows64=4.14.1"
 
 # Install dependencies
 opam install -y core.v0.16.1 core-windows.v0.16.1 menhir.20230608 menhir-windows.20230608 ppx_deriving.5.2.1 ppx_deriving-windows.5.2.1\
- fmt.0.9.0 fmt-windows.0.9.0 yojson.2.1.0 yojson-windows.2.1.0 cmdliner.1.3.0 cmdliner-windows.1.3.0
+ fmt.0.9.0 fmt-windows.0.9.0 yojson.2.1.0 yojson-windows.2.1.0 cmdliner.1.3.0 cmdliner-windows.1.3.0 uucp.16.0.0 uucp-windows.16.0.0\
+ uunf.16.0.0 uunf-windows.16.0.0
 
 eval $(opam env)
diff --git a/src/common/Unicode.ml b/src/common/Unicode.ml
@@ -0,0 +1,107 @@
+let pp_uchar ppf u =
+  let u_int = Uchar.to_int u in
+  if u_int < 128 then Fmt.string ppf (Char.chr u_int |> Char.escaped)
+  else Fmt.pf ppf "U+%04X" u_int
+
+let is_ascii s =
+  let rec loop max b i =
+    if i > max then true
+    else if Bytes.get_uint8 b i < 128 then loop max b (i + 1)
+    else false in
+  let b = Bytes.of_string s in
+  loop (Bytes.length b - 1) b 0
+
+let normalize = Uunf_string.normalize_utf_8 `NFKC
+
+let foldi_uchars ~f acc str =
+  let len = String.length str in
+  let rec loop pos acc =
+    if pos == len then acc
+    else
+      let decode = String.get_utf_8_uchar str pos in
+      let char_length = Uchar.utf_decode_length decode in
+      let uchar = Uchar.utf_decode_uchar decode in
+      let acc = f acc pos uchar in
+      loop (pos + char_length) acc in
+  loop 0 acc
+
+let iteri_uchars ~f str =
+  let f' buf pos c =
+    f pos c;
+    Buffer.add_utf_8_uchar buf c;
+    buf in
+  let s_after =
+    Buffer.contents
+    @@ foldi_uchars ~f:f' (Buffer.create (String.length str)) str in
+  (* another sanity check *)
+  if not (String.equal str s_after) then
+    Core.(
+      ICE.internal_compiler_error
+        [%message
+          "Failed to round-trip unicode string!"
+            (str : string)
+            (s_after : string)])
+
+(* WIP:
+
+   While not strictly necessary, there are some additional restrictions which
+   are good to implement for validation and preventing strings that are visually
+   identical from being distinct identifiers.
+   A good summary can be found here: https://perl11.org/blog/unicode-identifiers.html
+
+   Most of these are only a problem if you assume maliciousness of the user,
+   so they may not be important for an initial version in Stan.
+*)
+
+(* Defined in https://www.unicode.org/reports/tr39/#Confusable_Detection *)
+let confusable x y =
+  let skeleton x =
+    let x = Uunf_string.normalize_utf_8 `NFD x in
+    let f acc _ c =
+      if Uucp.Gen.is_default_ignorable c then ()
+      else
+        (* TODO!! replace with prototype - need data? *)
+        Buffer.add_utf_8_uchar acc c;
+      acc in
+    let buf = foldi_uchars ~f (Buffer.create (String.length x)) x in
+    let x = Buffer.contents buf in
+    let x = Uunf_string.normalize_utf_8 `NFD x in
+    x in
+  String.compare (skeleton x) (skeleton y)
+
+module ScriptSet = Set.Make (Uucp.Script)
+
+(** copied from UUCP's definition of [Uucp.Script.t] *)
+let all =
+  ScriptSet.of_list
+    [ `Adlm; `Aghb; `Ahom; `Arab; `Armi; `Armn; `Avst; `Bali; `Bamu; `Bass; `Batk
+    ; `Beng; `Bhks; `Bopo; `Brah; `Brai; `Bugi; `Buhd; `Cakm; `Cans; `Cari
+    ; `Cham; `Cher; `Chrs; `Copt; `Cpmn; `Cprt; `Cyrl; `Deva; `Diak; `Dogr
+    ; `Dsrt; `Dupl; `Egyp; `Elba; `Elym; `Ethi; `Geor; `Glag; `Gong; `Gonm
+    ; `Goth; `Gran; `Grek; `Gujr; `Guru; `Hang; `Hani; `Hano; `Hatr; `Hebr
+    ; `Hira; `Hluw; `Hmng; `Hmnp; `Hrkt; `Hung; `Ital; `Java; `Kali; `Kana
+    ; `Kawi; `Khar; `Khmr; `Khoj; `Knda; `Kthi; `Kits; `Lana; `Laoo; `Latn
+    ; `Lepc; `Limb; `Lina; `Linb; `Lisu; `Lyci; `Lydi; `Mahj; `Maka; `Mand
+    ; `Mani; `Marc; `Medf; `Mend; `Merc; `Mero; `Mlym; `Modi; `Mong; `Mroo
+    ; `Mtei; `Mult; `Mymr; `Nagm; `Nand; `Narb; `Nbat; `Newa; `Nkoo; `Nshu
+    ; `Ogam; `Olck; `Orkh; `Orya; `Osge; `Osma; `Ougr; `Palm; `Pauc; `Perm
+    ; `Phag; `Phli; `Phlp; `Phnx; `Plrd; `Prti; `Qaai; `Rjng; `Rohg; `Runr
+    ; `Samr; `Sarb; `Saur; `Sgnw; `Shaw; `Shrd; `Sidd; `Sind; `Sinh; `Sogd
+    ; `Sogo; `Sora; `Soyo; `Sund; `Sylo; `Syrc; `Tagb; `Takr; `Tale; `Talu
+    ; `Taml; `Tang; `Tavt; `Telu; `Tfng; `Tglg; `Thaa; `Thai; `Tibt; `Tirh
+    ; `Tnsa; `Toto; `Ugar; `Vaii; `Vith; `Wara; `Wcho; `Xpeo; `Xsux; `Yezi
+    ; `Yiii; `Zanb; `Zinh; `Zyyy; `Zzzz ]
+
+let extended s =
+  if ScriptSet.mem `Zyyy s || ScriptSet.mem `Zinh s then all else s
+
+(* Defined in https://www.unicode.org/reports/tr39/#Restriction_Level_Detection *)
+let restriction_level x =
+  let f acc _ c =
+    let scripts =
+      Uucp.Script.script_extensions c |> ScriptSet.of_list |> extended in
+    scripts :: acc in
+  let soss = foldi_uchars ~f [] x in
+  let resolved = List.fold_right ScriptSet.inter soss all in
+  if not @@ ScriptSet.is_empty resolved then `Single
+  else `Unrestricted (* TODO implement levels 3-5 *)
diff --git a/src/common/dune b/src/common/dune
@@ -1,7 +1,7 @@
 (library
  (name common)
  (public_name stanc.common)
- (libraries core fmt)
+ (libraries core fmt uunf uucp)
  (instrumentation
   (backend bisect_ppx))
  (inline_tests)

diff --git a/src/driver/Entry.ml b/src/driver/Entry.ml
@@ -31,6 +31,7 @@ let set_model_name model_name =
 
 let reset_mutable_states model_name (flags : Flags.t) =
   Common.Gensym.reset_danger_use_cautiously ();
+  Identifiers.allow_unicode := flags.allow_unicode;
   Include_files.include_provider := flags.include_source;
   set_model_name model_name;
   Typechecker.check_that_all_functions_have_definition :=

diff --git a/src/driver/Flags.ml b/src/driver/Flags.ml
@@ -7,6 +7,7 @@ type t =
   ; standalone_functions: bool
   ; use_opencl: bool
   ; include_source: Frontend.Include_files.t
+  ; allow_unicode: bool
   ; info: bool
   ; version: bool
   ; auto_format: bool
@@ -49,6 +50,7 @@ let default =
   ; standalone_functions= false
   ; use_opencl= false
   ; include_source= Frontend.Include_files.FileSystemPaths []
+  ; allow_unicode= false
   ; info= false
   ; version= false
   ; auto_format= false

diff --git a/src/driver/Flags.mli b/src/driver/Flags.mli
@@ -9,6 +9,7 @@ type t =
   ; standalone_functions: bool
   ; use_opencl: bool
   ; include_source: Frontend.Include_files.t
+  ; allow_unicode: bool
         (* ------------------------- *)
         (* flags which switch compiler "modes" *)
   ; info: bool

diff --git a/src/frontend/Errors.ml b/src/frontend/Errors.ml
@@ -4,7 +4,7 @@ open Core
 
 (** Our type of syntax error information *)
 type syntax_error =
-  | Lexing of Middle.Location.t
+  | Lexing of string * Middle.Location.t
   | UnexpectedEOF of Middle.Location.t
   | Include of string * Middle.Location.t
   | Parsing of string * Middle.Location_span.t
@@ -57,12 +57,12 @@ let pp_syntax_error ?printed_filename ?code ppf = function
         (Middle.Location_span.to_string ?printed_filename loc_span)
         (pp_context_with_message ?code)
         (message, loc_span.begin_loc)
-  | Lexing loc ->
+  | Lexing (message, loc) ->
       Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
         (Middle.Location.to_string ?printed_filename
            {loc with col_num= loc.col_num - 1})
         (pp_context_with_message ?code)
-        ("Invalid character found.", loc)
+        (message, loc)
   | UnexpectedEOF loc ->
       Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
         (Middle.Location.to_string ?printed_filename

diff --git a/src/frontend/Errors.mli b/src/frontend/Errors.mli
@@ -2,7 +2,7 @@
 
 (** Our type of syntax error information *)
 type syntax_error =
-  | Lexing of Middle.Location.t
+  | Lexing of string * Middle.Location.t
   | UnexpectedEOF of Middle.Location.t
   | Include of string * Middle.Location.t
   | Parsing of string * Middle.Location_span.t

diff --git a/src/frontend/Identifiers.ml b/src/frontend/Identifiers.ml
@@ -0,0 +1,45 @@
+open Common.Unicode
+
+let allow_unicode = ref false
+
+let error ~loc msg =
+  raise
+    (Errors.SyntaxError
+       (Errors.Lexing (msg, Preprocessor.location_of_position loc)))
+
+let validate_ascii_id ~loc id =
+  Debugging.lexer_logger ("ascii id: " ^ id);
+  let first = String.get_uint8 id 0 in
+  if
+    (first >= Char.code 'A' && first <= Char.code 'Z')
+    || (first >= Char.code 'a' && first <= Char.code 'z')
+  then id
+  else error ~loc "Invalid character found."
+
+(* Validation based on the
+   Unicode Standard Annex #31: Unicode Identifiers and Syntax
+   https://www.unicode.org/reports/tr31 *)
+
+let validate_utf8_id ~loc id =
+  if not !allow_unicode then
+    error ~loc
+      "Unicode identifiers are not supported without the (experimental) \
+       allow-unicode flag";
+  if not (String.is_valid_utf_8 id) then
+    error ~loc "Identifier is not valid UTF-8 string";
+  Debugging.lexer_logger ("unicode id: " ^ id);
+  (* normalize to NFKC as recommended *)
+  let id = normalize id in
+  let f pos uchar =
+    if pos == 0 then (
+      if not (Uucp.Id.is_xid_start uchar) then
+        error ~loc (Fmt.str "Invalid character: '%a'" pp_uchar uchar))
+    else if not (Uucp.Id.is_xid_continue uchar) then
+      error ~loc
+        (Fmt.str "Invalid character in identifier at offset %d: '%a'" pos
+           pp_uchar uchar) in
+  iteri_uchars ~f id;
+  id
+
+let validate loc id =
+  if is_ascii id then validate_ascii_id ~loc id else validate_utf8_id ~loc id
diff --git a/src/frontend/Identifiers.mli b/src/frontend/Identifiers.mli
@@ -0,0 +1,2 @@
+val allow_unicode : bool ref
+val validate : Lexing.position -> string -> string
diff --git a/src/frontend/dune b/src/frontend/dune
@@ -1,14 +1,20 @@
 (library
  (name frontend)
  (public_name stanc.frontend)
- (libraries core menhirLib yojson fmt middle stan_math_signatures)
+ (libraries core menhirLib uucp yojson fmt middle stan_math_signatures)
  (instrumentation
   (backend bisect_ppx))
  (inline_tests)
  (preprocess
   (pps ppx_jane ppx_deriving.fold ppx_deriving.map)))
 
-(ocamllex lexer)
+(rule
+ (target lexer.ml)
+ (deps lexer.mll)
+ (action
+  (chdir
+   %{workspace_root}
+   (run %{bin:ocamllex} -ml -o %{target} %{deps}))))
 
 (rule
  (targets parsing_errors.ml)

diff --git a/src/frontend/lexer.mll b/src/frontend/lexer.mll
@@ -37,9 +37,40 @@
          , location_span_of_positions (lexbuf.lex_start_p, lexbuf.lex_curr_p) )
 }
 
+(*
+  OCamllex does not know about unicode, it just operates over bytes.
+  So, we can define all 'valid' byte sequences for UTF-8 like so
+*)
+(* 110xxxxx *)
+let utf8_head_byte2 = ['\192'-'\223']
+(* 1110xxxx *)
+let utf8_head_byte3 = ['\224'-'\239']
+(* 11110xxx *)
+let utf8_head_byte4 = ['\240'-'\247']
+(* 10xxxxxx *)
+let utf8_tail_byte = ['\128'-'\191']
+
+(* utf8_1 is ascii *)
+let ascii_allowed = ['a'-'z' 'A'-'Z' '0'-'9' '_']
+(* 11 bits of payload *)
+let utf8_2 = utf8_head_byte2 utf8_tail_byte
+(* 16 bits of payload *)
+let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte
+(* 21 bits of payload *)
+let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte
+
+(* Any UTF-8-encoded code point, outside the ASCII range.
+  This set includes more than it should for simplicity.
+*)
+let utf8_nonascii = utf8_2 | utf8_3 | utf8_4
+
+(* identifiers here are overly permissive, and are checked
+   in the semantic action of the rule that matches here.
+*)
+let identifier = (ascii_allowed | utf8_nonascii)+
+
 (* Some auxiliary definition for variables and constants *)
 let string_literal = '"' [^ '"' '\r' '\n']* '"'
-let identifier = ['a'-'z' 'A'-'Z'] ['a'-'z' 'A'-'Z' '0'-'9' '_']*   (* TODO: We should probably expand the alphabet *)
 
 let integer_constant =  ['0'-'9']+ ('_' ['0'-'9']+)*
 
@@ -198,8 +229,10 @@ rule token = parse
   | string_literal as s       { lexer_logger ("string_literal " ^ s) ;
                                 Parser.STRINGLITERAL (lexeme lexbuf) }
   | identifier as id          { lexer_logger ("identifier " ^ id) ;
-                                lexer_pos_logger (lexeme_start_p lexbuf);
-                                Parser.IDENTIFIER (lexeme lexbuf) }
+                                let loc = (lexeme_start_p lexbuf) in
+                                lexer_pos_logger loc;
+                                let canonical_id = Identifiers.validate loc id in
+                                Parser.IDENTIFIER (canonical_id) }
 (* End of file *)
   | eof                       { lexer_logger "eof" ;
                                 if Preprocessor.size () = 1
@@ -210,7 +243,8 @@ rule token = parse
 
   | _                         { raise (Errors.SyntaxError
                                         (Errors.Lexing
-                                          (location_of_position
+                                          ("Invalid character found.",
+                                           location_of_position
                                             (lexeme_start_p
                                               (current_buffer ()))))) }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		val allow_unicode : bool ref
		val validate : Lexing.position -> string -> string