Skip to content

Commit 25625ce

Browse files
committed
Reorganize, code-gen UCNs
1 parent 0ba1518 commit 25625ce

File tree

9 files changed

+99
-81
lines changed

9 files changed

+99
-81
lines changed

src/common/Unicode.ml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
let pp_uchar ppf u =
2+
let u_int = Uchar.to_int u in
3+
if u_int < 128 then Fmt.string ppf (Char.chr u_int |> Char.escaped)
4+
else Fmt.pf ppf "U+%04X" u_int
5+
6+
let is_ascii s =
7+
let rec loop max b i =
8+
if i > max then true
9+
else if Bytes.get_uint8 b i < 128 then loop max b (i + 1)
10+
else false in
11+
let b = Bytes.of_string s in
12+
loop (Bytes.length b - 1) b 0
13+
14+
let normalize = Uunf_string.normalize_utf_8 `NFKC
15+
16+
let iter_uchars s f =
17+
let len = String.length s in
18+
let out = Buffer.create len in
19+
let pos = ref 0 in
20+
(* move through code point by code point *)
21+
while !pos != len do
22+
let decode = String.get_utf_8_uchar s !pos in
23+
let char_length = Uchar.utf_decode_length decode in
24+
let uchar = Uchar.utf_decode_uchar decode in
25+
Buffer.add_utf_8_uchar out uchar;
26+
f !pos uchar;
27+
pos := !pos + char_length
28+
done;
29+
(* another sanity check *)
30+
let s_after = Buffer.contents out in
31+
if not (String.equal s s_after) then
32+
Core.(
33+
ICE.internal_compiler_error
34+
[%message
35+
"Failed to round-trip unicode string!" (s : string) (s_after : string)])

src/common/dune

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
(library
22
(name common)
33
(public_name stanc.common)
4-
(libraries core fmt)
4+
(libraries core fmt uunf)
55
(instrumentation
66
(backend bisect_ppx))
77
(inline_tests)

src/driver/Entry.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ let set_model_name model_name =
3131

3232
let reset_mutable_states model_name (flags : Flags.t) =
3333
Common.Gensym.reset_danger_use_cautiously ();
34-
Unicode.allow_unicode := flags.allow_unicode;
34+
Identifiers.allow_unicode := flags.allow_unicode;
3535
Include_files.include_provider := flags.include_source;
3636
set_model_name model_name;
3737
Typechecker.check_that_all_functions_have_definition :=

src/frontend/Identifiers.ml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
open Common.Unicode
2+
3+
let allow_unicode = ref false
4+
5+
let error ~loc msg =
6+
raise
7+
(Errors.SyntaxError
8+
(Errors.Lexing (msg, Preprocessor.location_of_position loc)))
9+
10+
let validate_ascii_id ~loc id =
11+
Debugging.lexer_logger ("ascii id: " ^ id);
12+
let first = String.get_uint8 id 0 in
13+
if
14+
(first >= Char.code 'A' && first <= Char.code 'Z')
15+
|| (first >= Char.code 'a' && first <= Char.code 'z')
16+
then id
17+
else error ~loc "Invalid character found."
18+
19+
(* Validation based on the
20+
Unicode Standard Annex #31: Unicode Identifiers and Syntax
21+
https://www.unicode.org/reports/tr31 *)
22+
23+
let validate_utf8_id ~loc id =
24+
if not !allow_unicode then
25+
error ~loc
26+
"Unicode identifiers are not supported without the (experimental) \
27+
allow-unicode flag";
28+
if not (String.is_valid_utf_8 id) then
29+
error ~loc "Identifier is not valid UTF-8 string";
30+
Debugging.lexer_logger ("unicode id: " ^ id);
31+
(* normalize to NFKC as recommended *)
32+
let id = normalize id in
33+
let f pos uchar =
34+
if pos == 0 then (
35+
if not (Uucp.Id.is_xid_start uchar) then
36+
error ~loc (Fmt.str "Invalid character: '%a'" pp_uchar uchar))
37+
else if not (Uucp.Id.is_xid_continue uchar) then
38+
error ~loc
39+
(Fmt.str "Invalid character in identifier at offset %d: '%a'" pos
40+
pp_uchar uchar) in
41+
iter_uchars id f;
42+
id
43+
44+
let validate loc id =
45+
if is_ascii id then validate_ascii_id ~loc id else validate_utf8_id ~loc id

src/frontend/Identifiers.mli

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
val allow_unicode : bool ref
2+
val validate : Lexing.position -> string -> string

src/frontend/Unicode.ml

Lines changed: 0 additions & 71 deletions
This file was deleted.

src/frontend/dune

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
(library
22
(name frontend)
33
(public_name stanc.frontend)
4-
(libraries core menhirLib uunf uucp yojson fmt middle stan_math_signatures)
4+
(libraries core menhirLib uucp yojson fmt middle stan_math_signatures)
55
(instrumentation
66
(backend bisect_ppx))
77
(inline_tests)

src/frontend/lexer.mll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ rule token = parse
231231
| identifier as id { lexer_logger ("identifier " ^ id) ;
232232
let loc = (lexeme_start_p lexbuf) in
233233
lexer_pos_logger loc;
234-
let canonical_id = Unicode.validate_identifier loc id in
234+
let canonical_id = Identifiers.validate loc id in
235235
Parser.IDENTIFIER (canonical_id) }
236236
(* End of file *)
237237
| eof { lexer_logger "eof" ;

src/stan_math_backend/Cpp.ml

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,14 @@ module Printing = struct
389389
open Fmt
390390

391391
let trailing_space (t : 'a Fmt.t) : 'a Fmt.t = fun ppf -> pf ppf "%a@ " t
392-
let pp_identifier ppf = string ppf
392+
393+
let pp_identifier ppf s =
394+
if Common.Unicode.is_ascii s then string ppf s
395+
else
396+
(* so called "Universal character names" - not required on newer compilers
397+
but hopefully more backward-compatible *)
398+
let f _ c = Fmt.pf ppf "\\u%04X" (Uchar.to_scalar c) in
399+
Common.Unicode.iter_uchars s f
393400

394401
let rec pp_type_ ppf t =
395402
match t with
@@ -473,8 +480,8 @@ module Printing = struct
473480
pf ppf "<@,%a>" (list ~sep:comma pp_type_) types in
474481
match e with
475482
| Literal s -> pf ppf "%s" s
476-
| Var id -> string ppf id
477-
| VarRef id -> pf ppf "&%s" id
483+
| Var id -> pp_identifier ppf id
484+
| VarRef id -> pf ppf "&%a" pp_identifier id
478485
| Parens e -> pf ppf "(%a)" pp_expr e
479486
| Cast (t, e) -> pf ppf "@[(%a)@ %a@]" pp_type_ t pp_expr e
480487
| Constructor (t, es) ->
@@ -491,7 +498,7 @@ module Printing = struct
491498
| StreamInsertion (e, es) ->
492499
pf ppf "%a <<@[@ %a@]" pp_expr e (list ~sep:comma pp_expr) es
493500
| FunCall (fn, tys, es) ->
494-
pf ppf "@[<hov 2>%s%a(@,%a@])" fn maybe_templates tys
501+
pf ppf "@[<hov 2>%a%a(@,%a@])" pp_identifier fn maybe_templates tys
495502
(list ~sep:comma pp_expr) es
496503
| MethodCall (e, fn, tys, es) ->
497504
pf ppf "@[<hov 2>%a.%s%a(%a)@]" pp_expr e fn maybe_templates tys
@@ -519,8 +526,8 @@ module Printing = struct
519526
pf ppf "{@[<hov>%a@]}" (list ~sep:comma pp_expr) es in
520527
let static = if static then "static " else "" in
521528
let constexpr = if constexpr then "constexpr " else "" in
522-
pf ppf "@[<hov 2>%s%s%a@ %s%a@]" static constexpr pp_type_ type_ name
523-
pp_init init
529+
pf ppf "@[<hov 2>%s%s%a@ %a%a@]" static constexpr pp_type_ type_
530+
pp_identifier name pp_init init
524531

525532
let rec pp_stmt ppf s =
526533
match s with

0 commit comments

Comments
 (0)