Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ def runPerformanceTests(String testsPath, String stancFlags = ""){
cd cmdstan; make clean-all;
"""

if (stancFlags?.trim()) {
sh "cd performance-tests-cmdstan/cmdstan && echo 'STANCFLAGS= $stancFlags' >> make/local"
}
// if (stancFlags?.trim()) {
sh "cd performance-tests-cmdstan/cmdstan && echo 'STANCFLAGS= --allow-unicode $stancFlags' >> make/local"
// }

sh """
cd performance-tests-cmdstan/cmdstan
Expand Down Expand Up @@ -115,7 +115,7 @@ pipeline {
}
environment {
CXX = 'clang++-6.0'
MACOS_SWITCH = 'stanc3-4.14'
MACOS_SWITCH = 'stanc3-4.14-unicode'
PARALLEL = 4
GIT_AUTHOR_NAME = 'Stan Jenkins'
GIT_AUTHOR_EMAIL = '[email protected]'
Expand Down Expand Up @@ -803,7 +803,7 @@ pipeline {
dir '.'
label 'linux && emulation'
args "${qemuArchFlag(ARCHITECTURE)} --group-add=987 --group-add=980 --group-add=988 --entrypoint='' -v /var/run/docker.sock:/var/run/docker.sock"
additionalBuildArgs "${qemuArchFlag(ARCHITECTURE)} --build-arg PUID=\$(id -u) --build-arg PGID=\$(id -g)"
additionalBuildArgs "${qemuArchFlag(ARCHITECTURE)} --ulimit stack=67108864 --build-arg PUID=\$(id -u) --build-arg PGID=\$(id -g)"
}
}
steps {
Expand Down
4 changes: 4 additions & 0 deletions dune-project
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
(= 2.1.0))
(cmdliner
(= 1.3.0))
(uucp
(= 16.0.0))
(uunf
(= 16.0.0))
(ocamlformat
(and
:with-test
Expand Down
3 changes: 2 additions & 1 deletion scripts/install_build_deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ eval $(opam env)

opam pin -y core v0.16.0 --no-action

opam install -y dune core.v0.16.0 menhir.20230608 ppx_deriving.5.2.1 fmt.0.9.0 yojson.2.1.0 cmdliner.1.3.0
opam install -y dune core.v0.16.0 menhir.20230608 ppx_deriving.5.2.1 fmt.0.9.0 yojson.2.1.0\
cmdliner.1.3.0 uucp.16.0.0 uunf.16.0.0

eval $(opam env)
6 changes: 2 additions & 4 deletions scripts/install_build_deps_windows.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@ eval $(opam env)
# Add windows repository
opam repository add windows http://github.com/ocaml-cross/opam-cross-windows.git

# Request the compiler to be built with flambda optimizers
opam install -y conf-flambda-windows

# Install the compiler
opam install -y "ocaml-windows64=4.14.1"

# Install dependencies
opam install -y core.v0.16.1 core-windows.v0.16.1 menhir.20230608 menhir-windows.20230608 ppx_deriving.5.2.1 ppx_deriving-windows.5.2.1\
fmt.0.9.0 fmt-windows.0.9.0 yojson.2.1.0 yojson-windows.2.1.0 cmdliner.1.3.0 cmdliner-windows.1.3.0
fmt.0.9.0 fmt-windows.0.9.0 yojson.2.1.0 yojson-windows.2.1.0 cmdliner.1.3.0 cmdliner-windows.1.3.0 uucp.16.0.0 uucp-windows.16.0.0\
uunf.16.0.0 uunf-windows.16.0.0

eval $(opam env)
107 changes: 107 additions & 0 deletions src/common/Unicode.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
let pp_uchar ppf u =
let u_int = Uchar.to_int u in
if u_int < 128 then Fmt.string ppf (Char.chr u_int |> Char.escaped)
else Fmt.pf ppf "U+%04X" u_int

let is_ascii s =
let rec loop max b i =
if i > max then true
else if Bytes.get_uint8 b i < 128 then loop max b (i + 1)
else false in
let b = Bytes.of_string s in
loop (Bytes.length b - 1) b 0

let normalize = Uunf_string.normalize_utf_8 `NFKC

let foldi_uchars ~f acc str =
let len = String.length str in
let rec loop pos acc =
if pos == len then acc
else
let decode = String.get_utf_8_uchar str pos in
let char_length = Uchar.utf_decode_length decode in
let uchar = Uchar.utf_decode_uchar decode in
let acc = f acc pos uchar in
loop (pos + char_length) acc in
loop 0 acc

let iteri_uchars ~f str =
let f' buf pos c =
f pos c;
Buffer.add_utf_8_uchar buf c;
buf in
let s_after =
Buffer.contents
@@ foldi_uchars ~f:f' (Buffer.create (String.length str)) str in
(* another sanity check *)
if not (String.equal str s_after) then
Core.(

Check warning on line 38 in src/common/Unicode.ml

View check run for this annotation

Codecov / codecov/patch

src/common/Unicode.ml#L38

Added line #L38 was not covered by tests
ICE.internal_compiler_error
[%message
"Failed to round-trip unicode string!"
(str : string)
(s_after : string)])

(* WIP:

While not strictly necessary, there are some additional restrictions which
are good to implement for validation and preventing strings that are visually
identical from being distinct identifiers.
A good summary can be found here: https://perl11.org/blog/unicode-identifiers.html

Most of these are only a problem if you assume maliciousness of the user,
so they may not be important for an initial version in Stan.
*)

(* Defined in https://www.unicode.org/reports/tr39/#Confusable_Detection *)
let confusable x y =
let skeleton x =
let x = Uunf_string.normalize_utf_8 `NFD x in
let f acc _ c =
if Uucp.Gen.is_default_ignorable c then ()

Check warning on line 61 in src/common/Unicode.ml

View check run for this annotation

Codecov / codecov/patch

src/common/Unicode.ml#L58-L61

Added lines #L58 - L61 were not covered by tests
else
(* TODO!! replace with prototype - need data? *)
Buffer.add_utf_8_uchar acc c;

Check warning on line 64 in src/common/Unicode.ml

View check run for this annotation

Codecov / codecov/patch

src/common/Unicode.ml#L64

Added line #L64 was not covered by tests
acc in
let buf = foldi_uchars ~f (Buffer.create (String.length x)) x in
let x = Buffer.contents buf in
let x = Uunf_string.normalize_utf_8 `NFD x in
x in
String.compare (skeleton x) (skeleton y)

Check warning on line 70 in src/common/Unicode.ml

View check run for this annotation

Codecov / codecov/patch

src/common/Unicode.ml#L66-L70

Added lines #L66 - L70 were not covered by tests

module ScriptSet = Set.Make (Uucp.Script)

(** copied from UUCP's definition of [Uucp.Script.t] *)
let all =
ScriptSet.of_list
[ `Adlm; `Aghb; `Ahom; `Arab; `Armi; `Armn; `Avst; `Bali; `Bamu; `Bass; `Batk
; `Beng; `Bhks; `Bopo; `Brah; `Brai; `Bugi; `Buhd; `Cakm; `Cans; `Cari
; `Cham; `Cher; `Chrs; `Copt; `Cpmn; `Cprt; `Cyrl; `Deva; `Diak; `Dogr
; `Dsrt; `Dupl; `Egyp; `Elba; `Elym; `Ethi; `Geor; `Glag; `Gong; `Gonm
; `Goth; `Gran; `Grek; `Gujr; `Guru; `Hang; `Hani; `Hano; `Hatr; `Hebr
; `Hira; `Hluw; `Hmng; `Hmnp; `Hrkt; `Hung; `Ital; `Java; `Kali; `Kana
; `Kawi; `Khar; `Khmr; `Khoj; `Knda; `Kthi; `Kits; `Lana; `Laoo; `Latn
; `Lepc; `Limb; `Lina; `Linb; `Lisu; `Lyci; `Lydi; `Mahj; `Maka; `Mand
; `Mani; `Marc; `Medf; `Mend; `Merc; `Mero; `Mlym; `Modi; `Mong; `Mroo
; `Mtei; `Mult; `Mymr; `Nagm; `Nand; `Narb; `Nbat; `Newa; `Nkoo; `Nshu
; `Ogam; `Olck; `Orkh; `Orya; `Osge; `Osma; `Ougr; `Palm; `Pauc; `Perm
; `Phag; `Phli; `Phlp; `Phnx; `Plrd; `Prti; `Qaai; `Rjng; `Rohg; `Runr
; `Samr; `Sarb; `Saur; `Sgnw; `Shaw; `Shrd; `Sidd; `Sind; `Sinh; `Sogd
; `Sogo; `Sora; `Soyo; `Sund; `Sylo; `Syrc; `Tagb; `Takr; `Tale; `Talu
; `Taml; `Tang; `Tavt; `Telu; `Tfng; `Tglg; `Thaa; `Thai; `Tibt; `Tirh
; `Tnsa; `Toto; `Ugar; `Vaii; `Vith; `Wara; `Wcho; `Xpeo; `Xsux; `Yezi
; `Yiii; `Zanb; `Zinh; `Zyyy; `Zzzz ]

let extended s =
if ScriptSet.mem `Zyyy s || ScriptSet.mem `Zinh s then all else s

Check warning on line 96 in src/common/Unicode.ml

View check run for this annotation

Codecov / codecov/patch

src/common/Unicode.ml#L96

Added line #L96 was not covered by tests

(* Defined in https://www.unicode.org/reports/tr39/#Restriction_Level_Detection *)
let restriction_level x =
let f acc _ c =
let scripts =
Uucp.Script.script_extensions c |> ScriptSet.of_list |> extended in
scripts :: acc in

Check warning on line 103 in src/common/Unicode.ml

View check run for this annotation

Codecov / codecov/patch

src/common/Unicode.ml#L100-L103

Added lines #L100 - L103 were not covered by tests
let soss = foldi_uchars ~f [] x in
let resolved = List.fold_right ScriptSet.inter soss all in
if not @@ ScriptSet.is_empty resolved then `Single
else `Unrestricted (* TODO implement levels 3-5 *)

Check warning on line 107 in src/common/Unicode.ml

View check run for this annotation

Codecov / codecov/patch

src/common/Unicode.ml#L105-L107

Added lines #L105 - L107 were not covered by tests
2 changes: 1 addition & 1 deletion src/common/dune
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
(library
(name common)
(public_name stanc.common)
(libraries core fmt)
(libraries core fmt uunf uucp)
(instrumentation
(backend bisect_ppx))
(inline_tests)
Expand Down
1 change: 1 addition & 0 deletions src/driver/Entry.ml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ let set_model_name model_name =

let reset_mutable_states model_name (flags : Flags.t) =
Common.Gensym.reset_danger_use_cautiously ();
Identifiers.allow_unicode := flags.allow_unicode;
Include_files.include_provider := flags.include_source;
set_model_name model_name;
Typechecker.check_that_all_functions_have_definition :=
Expand Down
2 changes: 2 additions & 0 deletions src/driver/Flags.ml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ type t =
; standalone_functions: bool
; use_opencl: bool
; include_source: Frontend.Include_files.t
; allow_unicode: bool
; info: bool
; version: bool
; auto_format: bool
Expand Down Expand Up @@ -49,6 +50,7 @@ let default =
; standalone_functions= false
; use_opencl= false
; include_source= Frontend.Include_files.FileSystemPaths []
; allow_unicode= false
; info= false
; version= false
; auto_format= false
Expand Down
1 change: 1 addition & 0 deletions src/driver/Flags.mli
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ type t =
; standalone_functions: bool
; use_opencl: bool
; include_source: Frontend.Include_files.t
; allow_unicode: bool
(* ------------------------- *)
(* flags which switch compiler "modes" *)
; info: bool
Expand Down
6 changes: 3 additions & 3 deletions src/frontend/Errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ open Core

(** Our type of syntax error information *)
type syntax_error =
| Lexing of Middle.Location.t
| Lexing of string * Middle.Location.t
| UnexpectedEOF of Middle.Location.t
| Include of string * Middle.Location.t
| Parsing of string * Middle.Location_span.t
Expand Down Expand Up @@ -57,12 +57,12 @@ let pp_syntax_error ?printed_filename ?code ppf = function
(Middle.Location_span.to_string ?printed_filename loc_span)
(pp_context_with_message ?code)
(message, loc_span.begin_loc)
| Lexing loc ->
| Lexing (message, loc) ->
Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
(Middle.Location.to_string ?printed_filename
{loc with col_num= loc.col_num - 1})
(pp_context_with_message ?code)
("Invalid character found.", loc)
(message, loc)
| UnexpectedEOF loc ->
Fmt.pf ppf "Syntax error in %s, lexing error:@,%a@."
(Middle.Location.to_string ?printed_filename
Expand Down
2 changes: 1 addition & 1 deletion src/frontend/Errors.mli
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

(** Our type of syntax error information *)
type syntax_error =
| Lexing of Middle.Location.t
| Lexing of string * Middle.Location.t
| UnexpectedEOF of Middle.Location.t
| Include of string * Middle.Location.t
| Parsing of string * Middle.Location_span.t
Expand Down
45 changes: 45 additions & 0 deletions src/frontend/Identifiers.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
open Common.Unicode

let allow_unicode = ref false

let error ~loc msg =
raise
(Errors.SyntaxError
(Errors.Lexing (msg, Preprocessor.location_of_position loc)))

let validate_ascii_id ~loc id =
Debugging.lexer_logger ("ascii id: " ^ id);
let first = String.get_uint8 id 0 in
if
(first >= Char.code 'A' && first <= Char.code 'Z')
|| (first >= Char.code 'a' && first <= Char.code 'z')
then id
else error ~loc "Invalid character found."

(* Validation based on the
Unicode Standard Annex #31: Unicode Identifiers and Syntax
https://www.unicode.org/reports/tr31 *)

let validate_utf8_id ~loc id =
if not !allow_unicode then
error ~loc

Check warning on line 25 in src/frontend/Identifiers.ml

View check run for this annotation

Codecov / codecov/patch

src/frontend/Identifiers.ml#L25

Added line #L25 was not covered by tests
"Unicode identifiers are not supported without the (experimental) \
allow-unicode flag";
if not (String.is_valid_utf_8 id) then
error ~loc "Identifier is not valid UTF-8 string";

Check warning on line 29 in src/frontend/Identifiers.ml

View check run for this annotation

Codecov / codecov/patch

src/frontend/Identifiers.ml#L29

Added line #L29 was not covered by tests
Debugging.lexer_logger ("unicode id: " ^ id);
(* normalize to NFKC as recommended *)
let id = normalize id in
let f pos uchar =
if pos == 0 then (
if not (Uucp.Id.is_xid_start uchar) then
error ~loc (Fmt.str "Invalid character: '%a'" pp_uchar uchar))
else if not (Uucp.Id.is_xid_continue uchar) then
error ~loc
(Fmt.str "Invalid character in identifier at offset %d: '%a'" pos
pp_uchar uchar) in
iteri_uchars ~f id;
id

let validate loc id =
if is_ascii id then validate_ascii_id ~loc id else validate_utf8_id ~loc id
2 changes: 2 additions & 0 deletions src/frontend/Identifiers.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
val allow_unicode : bool ref
val validate : Lexing.position -> string -> string
10 changes: 8 additions & 2 deletions src/frontend/dune
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
(library
(name frontend)
(public_name stanc.frontend)
(libraries core menhirLib yojson fmt middle stan_math_signatures)
(libraries core menhirLib uucp yojson fmt middle stan_math_signatures)
(instrumentation
(backend bisect_ppx))
(inline_tests)
(preprocess
(pps ppx_jane ppx_deriving.fold ppx_deriving.map)))

(ocamllex lexer)
(rule
(target lexer.ml)
(deps lexer.mll)
(action
(chdir
%{workspace_root}
(run %{bin:ocamllex} -ml -o %{target} %{deps}))))

(rule
(targets parsing_errors.ml)
Expand Down
42 changes: 38 additions & 4 deletions src/frontend/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,40 @@
, location_span_of_positions (lexbuf.lex_start_p, lexbuf.lex_curr_p) )
}

(*
OCamllex does not know about unicode, it just operates over bytes.
So, we can define all 'valid' byte sequences for UTF-8 like so
*)
(* 110xxxxx *)
let utf8_head_byte2 = ['\192'-'\223']
(* 1110xxxx *)
let utf8_head_byte3 = ['\224'-'\239']
(* 11110xxx *)
let utf8_head_byte4 = ['\240'-'\247']
(* 10xxxxxx *)
let utf8_tail_byte = ['\128'-'\191']

(* utf8_1 is ascii *)
let ascii_allowed = ['a'-'z' 'A'-'Z' '0'-'9' '_']
(* 11 bits of payload *)
let utf8_2 = utf8_head_byte2 utf8_tail_byte
(* 16 bits of payload *)
let utf8_3 = utf8_head_byte3 utf8_tail_byte utf8_tail_byte
(* 21 bits of payload *)
let utf8_4 = utf8_head_byte4 utf8_tail_byte utf8_tail_byte utf8_tail_byte

(* Any UTF-8-encoded code point, outside the ASCII range.
This set includes more than it should for simplicity.
*)
let utf8_nonascii = utf8_2 | utf8_3 | utf8_4

(* identifiers here are overly permissive, and are checked
in the semantic action of the rule that matches here.
*)
let identifier = (ascii_allowed | utf8_nonascii)+

(* Some auxiliary definition for variables and constants *)
let string_literal = '"' [^ '"' '\r' '\n']* '"'
let identifier = ['a'-'z' 'A'-'Z'] ['a'-'z' 'A'-'Z' '0'-'9' '_']* (* TODO: We should probably expand the alphabet *)

let integer_constant = ['0'-'9']+ ('_' ['0'-'9']+)*

Expand Down Expand Up @@ -198,8 +229,10 @@ rule token = parse
| string_literal as s { lexer_logger ("string_literal " ^ s) ;
Parser.STRINGLITERAL (lexeme lexbuf) }
| identifier as id { lexer_logger ("identifier " ^ id) ;
lexer_pos_logger (lexeme_start_p lexbuf);
Parser.IDENTIFIER (lexeme lexbuf) }
let loc = (lexeme_start_p lexbuf) in
lexer_pos_logger loc;
let canonical_id = Identifiers.validate loc id in
Parser.IDENTIFIER (canonical_id) }
(* End of file *)
| eof { lexer_logger "eof" ;
if Preprocessor.size () = 1
Expand All @@ -210,7 +243,8 @@ rule token = parse

| _ { raise (Errors.SyntaxError
(Errors.Lexing
(location_of_position
("Invalid character found.",
location_of_position
(lexeme_start_p
(current_buffer ()))))) }

Expand Down
Loading