Skip to content

Commit 0012e20

Browse files
committed
Add some notes on potential further validation
Likely needs more library support: dbuenzli/uucp#25
1 parent 2e8b66c commit 0012e20

File tree

11 files changed

+134
-23
lines changed

11 files changed

+134
-23
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ pipeline {
115115
}
116116
environment {
117117
CXX = 'clang++-6.0'
118-
MACOS_SWITCH = 'stanc3-4.14'
118+
MACOS_SWITCH = 'stanc3-4.14-unicode'
119119
PARALLEL = 4
120120
GIT_AUTHOR_NAME = 'Stan Jenkins'
121121
GIT_AUTHOR_EMAIL = '[email protected]'

scripts/install_build_deps.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ eval $(opam env)
77

88
opam pin -y core v0.16.0 --no-action
99

10-
opam install -y dune core.v0.16.0 menhir.20230608 ppx_deriving.5.2.1 fmt.0.9.0 yojson.2.1.0 cmdliner.1.3.0
10+
opam install -y dune core.v0.16.0 menhir.20230608 ppx_deriving.5.2.1 fmt.0.9.0 yojson.2.1.0\
11+
cmdliner.1.3.0 uucp.16.0.0 uunf.16.0.0
1112

1213
eval $(opam env)

scripts/install_build_deps_windows.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ opam install -y "ocaml-windows64=4.14.1"
1717

1818
# Install dependencies
1919
opam install -y core.v0.16.1 core-windows.v0.16.1 menhir.20230608 menhir-windows.20230608 ppx_deriving.5.2.1 ppx_deriving-windows.5.2.1\
20-
fmt.0.9.0 fmt-windows.0.9.0 yojson.2.1.0 yojson-windows.2.1.0 cmdliner.1.3.0 cmdliner-windows.1.3.0
20+
fmt.0.9.0 fmt-windows.0.9.0 yojson.2.1.0 yojson-windows.2.1.0 cmdliner.1.3.0 cmdliner-windows.1.3.0 uucp.16.0.0 uucp-windows.16.0.0\
21+
uunf.16.0.0 uunf-windows.16.0.0
2122

2223
eval $(opam env)

src/common/Unicode.ml

Lines changed: 88 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,23 +13,95 @@ let is_ascii s =
1313

1414
let normalize = Uunf_string.normalize_utf_8 `NFKC
1515

16-
let iter_uchars s f =
17-
let len = String.length s in
18-
let out = Buffer.create len in
19-
let pos = ref 0 in
20-
(* move through code point by code point *)
21-
while !pos != len do
22-
let decode = String.get_utf_8_uchar s !pos in
23-
let char_length = Uchar.utf_decode_length decode in
24-
let uchar = Uchar.utf_decode_uchar decode in
25-
Buffer.add_utf_8_uchar out uchar;
26-
f !pos uchar;
27-
pos := !pos + char_length
28-
done;
16+
let foldi_uchars ~f acc str =
17+
let len = String.length str in
18+
let rec loop pos acc =
19+
if pos == len then acc
20+
else
21+
let decode = String.get_utf_8_uchar str pos in
22+
let char_length = Uchar.utf_decode_length decode in
23+
let uchar = Uchar.utf_decode_uchar decode in
24+
let acc = f acc pos uchar in
25+
loop (pos + char_length) acc in
26+
loop 0 acc
27+
28+
let iteri_uchars ~f str =
29+
let f' buf pos c =
30+
f pos c;
31+
Buffer.add_utf_8_uchar buf c;
32+
buf in
33+
let s_after =
34+
Buffer.contents
35+
@@ foldi_uchars ~f:f' (Buffer.create (String.length str)) str in
2936
(* another sanity check *)
30-
let s_after = Buffer.contents out in
31-
if not (String.equal s s_after) then
37+
if not (String.equal str s_after) then
3238
Core.(
3339
ICE.internal_compiler_error
3440
[%message
35-
"Failed to round-trip unicode string!" (s : string) (s_after : string)])
41+
"Failed to round-trip unicode string!"
42+
(str : string)
43+
(s_after : string)])
44+
45+
(* WIP:
46+
47+
While not strictly necessary, there are some additional restrictions which
48+
are good to implement for validation and preventing strings that are visually
49+
identical from being distinct identifiers.
50+
A good summary can be found here: https://perl11.org/blog/unicode-identifiers.html
51+
52+
Most of these are only a problem if you assume maliciousness of the user,
53+
so they may not be important for an initial version in Stan.
54+
*)
55+
56+
(* Defined in https://www.unicode.org/reports/tr39/#Confusable_Detection *)
57+
let confusable x y =
58+
let skeleton x =
59+
let x = Uunf_string.normalize_utf_8 `NFD x in
60+
let f acc _ c =
61+
if Uucp.Gen.is_default_ignorable c then ()
62+
else
63+
(* TODO!! replace with prototype - need data? *)
64+
Buffer.add_utf_8_uchar acc c;
65+
acc in
66+
let buf = foldi_uchars ~f (Buffer.create (String.length x)) x in
67+
let x = Buffer.contents buf in
68+
let x = Uunf_string.normalize_utf_8 `NFD x in
69+
x in
70+
String.compare (skeleton x) (skeleton y)
71+
72+
module ScriptSet = Set.Make (Uucp.Script)
73+
74+
(** copied from UUCP's definition of [Uucp.Script.t] *)
75+
let all =
76+
ScriptSet.of_list
77+
[ `Adlm; `Aghb; `Ahom; `Arab; `Armi; `Armn; `Avst; `Bali; `Bamu; `Bass; `Batk
78+
; `Beng; `Bhks; `Bopo; `Brah; `Brai; `Bugi; `Buhd; `Cakm; `Cans; `Cari
79+
; `Cham; `Cher; `Chrs; `Copt; `Cpmn; `Cprt; `Cyrl; `Deva; `Diak; `Dogr
80+
; `Dsrt; `Dupl; `Egyp; `Elba; `Elym; `Ethi; `Geor; `Glag; `Gong; `Gonm
81+
; `Goth; `Gran; `Grek; `Gujr; `Guru; `Hang; `Hani; `Hano; `Hatr; `Hebr
82+
; `Hira; `Hluw; `Hmng; `Hmnp; `Hrkt; `Hung; `Ital; `Java; `Kali; `Kana
83+
; `Kawi; `Khar; `Khmr; `Khoj; `Knda; `Kthi; `Kits; `Lana; `Laoo; `Latn
84+
; `Lepc; `Limb; `Lina; `Linb; `Lisu; `Lyci; `Lydi; `Mahj; `Maka; `Mand
85+
; `Mani; `Marc; `Medf; `Mend; `Merc; `Mero; `Mlym; `Modi; `Mong; `Mroo
86+
; `Mtei; `Mult; `Mymr; `Nagm; `Nand; `Narb; `Nbat; `Newa; `Nkoo; `Nshu
87+
; `Ogam; `Olck; `Orkh; `Orya; `Osge; `Osma; `Ougr; `Palm; `Pauc; `Perm
88+
; `Phag; `Phli; `Phlp; `Phnx; `Plrd; `Prti; `Qaai; `Rjng; `Rohg; `Runr
89+
; `Samr; `Sarb; `Saur; `Sgnw; `Shaw; `Shrd; `Sidd; `Sind; `Sinh; `Sogd
90+
; `Sogo; `Sora; `Soyo; `Sund; `Sylo; `Syrc; `Tagb; `Takr; `Tale; `Talu
91+
; `Taml; `Tang; `Tavt; `Telu; `Tfng; `Tglg; `Thaa; `Thai; `Tibt; `Tirh
92+
; `Tnsa; `Toto; `Ugar; `Vaii; `Vith; `Wara; `Wcho; `Xpeo; `Xsux; `Yezi
93+
; `Yiii; `Zanb; `Zinh; `Zyyy; `Zzzz ]
94+
95+
let extended s =
96+
if ScriptSet.mem `Zyyy s || ScriptSet.mem `Zinh s then all else s
97+
98+
(* Defined in https://www.unicode.org/reports/tr39/#Restriction_Level_Detection *)
99+
let restriction_level x =
100+
let f acc _ c =
101+
let scripts =
102+
Uucp.Script.script_extensions c |> ScriptSet.of_list |> extended in
103+
scripts :: acc in
104+
let soss = foldi_uchars ~f [] x in
105+
let resolved = List.fold_right ScriptSet.inter soss all in
106+
if not @@ ScriptSet.is_empty resolved then `Single
107+
else `Unrestricted (* TODO implement levels 3-5 *)

src/common/dune

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
(library
22
(name common)
33
(public_name stanc.common)
4-
(libraries core fmt uunf)
4+
(libraries core fmt uunf uucp)
55
(instrumentation
66
(backend bisect_ppx))
77
(inline_tests)

src/frontend/Identifiers.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ let validate_utf8_id ~loc id =
3838
error ~loc
3939
(Fmt.str "Invalid character in identifier at offset %d: '%a'" pos
4040
pp_uchar uchar) in
41-
iter_uchars id f;
41+
iteri_uchars ~f id;
4242
id
4343

4444
let validate loc id =

src/stan_math_backend/Cpp.ml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -395,8 +395,11 @@ module Printing = struct
395395
else
396396
(* so called "Universal character names" - not required on newer compilers
397397
but hopefully more backward-compatible *)
398-
let f _ c = Fmt.pf ppf "\\u%04X" (Uchar.to_scalar c) in
399-
Common.Unicode.iter_uchars s f
398+
let f _ c =
399+
let uchar_int = Uchar.to_scalar c in
400+
if uchar_int < 128 then Fmt.char ppf (Char.of_int_exn uchar_int)
401+
else Fmt.pf ppf "\\u%04X" (Uchar.to_scalar c) in
402+
Common.Unicode.iteri_uchars ~f s
400403

401404
let rec pp_type_ ppf t =
402405
match t with
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
data {
2+
// not in XID data
3+
real ℧;
4+
}

test/integration/bad/unicode/stanc.expected

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
$ ../../../../../install/default/bin/stanc --allow-unicode invalid-character.stan
2+
Syntax error in 'invalid-character.stan', line 3, column 6, lexing error:
3+
-------------------------------------------------
4+
1: data {
5+
2: // not in XID data
6+
3: real ℧;
7+
^
8+
4: }
9+
-------------------------------------------------
10+
11+
Invalid character: 'U+2127'
12+
[exit 1]
113
$ ../../../../../install/default/bin/stanc --allow-unicode invalid-utf8-1.stan
214
Syntax error in 'invalid-utf8-1.stan', line 2, column 5, lexing error:
315
-------------------------------------------------
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
2+
data {
3+
// https://www.unicode.org/reports/tr39/#Confusable_Detection
4+
// greek
5+
real Γ;
6+
// cyrillic
7+
real Г;
8+
}

0 commit comments

Comments
 (0)