Skip to content

Commit 456d0d7

Browse files
committed
Add some notes on potential further validation
Likely needs more library support: dbuenzli/uucp#25
1 parent 28dcd6e commit 456d0d7

File tree

6 files changed

+101
-1
lines changed

6 files changed

+101
-1
lines changed

src/common/Unicode.ml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,69 @@ let iter_uchars s f =
3333
ICE.internal_compiler_error
3434
[%message
3535
"Failed to round-trip unicode string!" (s : string) (s_after : string)])
36+
37+
(* WIP:
38+
39+
While not strictly necessary, there are some additional restrictions which
40+
are good to implement for validation and preventing strings that are visually
41+
identical from being distinct identifiers.
42+
A good summary can be found here: https://perl11.org/blog/unicode-identifiers.html
43+
44+
Most of these are only a problem if you assume maliciousness of the user,
45+
so they may not be important for an initial version in Stan.
46+
*)
47+
48+
(* Confusable detection as defined in
   https://www.unicode.org/reports/tr39/#Confusable_Detection

   [confusable x y] builds a "skeleton" for each argument and compares the two
   skeletons with [String.compare]. The result is therefore an [int]: [0] when
   the skeletons are byte-for-byte identical, non-zero otherwise.

   A skeleton is computed by normalizing to NFD, dropping every character for
   which [Uucp.Gen.is_default_ignorable] holds, and normalizing to NFD again.
   The prototype-mapping step is still missing (see the TODO below).

   NOTE(review): the name reads like a predicate, but the value returned is a
   comparison result; callers must test it against [0]. *)
let confusable x y =
  let skeleton str =
    let decomposed = Uunf_string.normalize_utf_8 `NFD str in
    let kept = Buffer.create (String.length decomposed) in
    let keep_visible _ uchar =
      (* TODO!! replace with prototype - need data? *)
      if not (Uucp.Gen.is_default_ignorable uchar) then
        Buffer.add_utf_8_uchar kept uchar in
    iter_uchars decomposed keep_visible;
    (* Removing characters can leave the text un-normalized, so NFD is
       applied a second time to the filtered result. *)
    Uunf_string.normalize_utf_8 `NFD (Buffer.contents kept) in
  String.compare (skeleton x) (skeleton y)
63+
64+
(* Sets whose elements are Unicode script tags of type [Uucp.Script.t].
   NOTE(review): [Set.Make] needs its argument to provide a comparison
   function; confirm that the pinned uucp version exposes one on
   [Uucp.Script]. *)
module ScriptSet = Set.Make (Uucp.Script)
65+
66+
(** The set containing every script tag. The tag list is copied from UUCP's
    definition of [Uucp.Script.t] and has to be kept in sync with it by hand. *)
let all =
  [ `Adlm; `Aghb; `Ahom; `Arab; `Armi; `Armn; `Avst
  ; `Bali; `Bamu; `Bass; `Batk; `Beng; `Bhks; `Bopo; `Brah; `Brai; `Bugi; `Buhd
  ; `Cakm; `Cans; `Cari; `Cham; `Cher; `Chrs; `Copt; `Cpmn; `Cprt; `Cyrl
  ; `Deva; `Diak; `Dogr; `Dsrt; `Dupl
  ; `Egyp; `Elba; `Elym; `Ethi
  ; `Geor; `Glag; `Gong; `Gonm; `Goth; `Gran; `Grek; `Gujr; `Guru
  ; `Hang; `Hani; `Hano; `Hatr; `Hebr; `Hira; `Hluw; `Hmng; `Hmnp; `Hrkt; `Hung
  ; `Ital; `Java
  ; `Kali; `Kana; `Kawi; `Khar; `Khmr; `Khoj; `Knda; `Kthi; `Kits
  ; `Lana; `Laoo; `Latn; `Lepc; `Limb; `Lina; `Linb; `Lisu; `Lyci; `Lydi
  ; `Mahj; `Maka; `Mand; `Mani; `Marc; `Medf; `Mend; `Merc; `Mero; `Mlym; `Modi
  ; `Mong; `Mroo; `Mtei; `Mult; `Mymr
  ; `Nagm; `Nand; `Narb; `Nbat; `Newa; `Nkoo; `Nshu
  ; `Ogam; `Olck; `Orkh; `Orya; `Osge; `Osma; `Ougr
  ; `Palm; `Pauc; `Perm; `Phag; `Phli; `Phlp; `Phnx; `Plrd; `Prti
  ; `Qaai; `Rjng; `Rohg; `Runr
  ; `Samr; `Sarb; `Saur; `Sgnw; `Shaw; `Shrd; `Sidd; `Sind; `Sinh; `Sogd; `Sogo
  ; `Sora; `Soyo; `Sund; `Sylo; `Syrc
  ; `Tagb; `Takr; `Tale; `Talu; `Taml; `Tang; `Tavt; `Telu; `Tfng; `Tglg; `Thaa
  ; `Thai; `Tibt; `Tirh; `Tnsa; `Toto
  ; `Ugar; `Vaii; `Vith; `Wara; `Wcho; `Xpeo; `Xsux; `Yezi; `Yiii
  ; `Zanb; `Zinh; `Zyyy; `Zzzz ]
  |> ScriptSet.of_list
86+
87+
(* [extended s] widens a script set: when [s] contains [`Zyyy] or [`Zinh] the
   result is [all]; any other set is returned unchanged. *)
let extended s =
  match (ScriptSet.mem `Zyyy s, ScriptSet.mem `Zinh s) with
  | false, false -> s
  | _ -> all
89+
90+
(* Restriction level detection as defined in
   https://www.unicode.org/reports/tr39/#Restriction_Level_Detection

   [restriction_level x] visits every character of [x] through [iter_uchars],
   takes the character's script extensions as a [ScriptSet.t], widens that set
   with [extended], and intersects the widened sets of all characters.

   Returns [`Single] when the intersection is non-empty, i.e. at least one
   script is shared by every character, and [`Unrestricted] otherwise. A string
   with no characters yields [`Single], because the intersection starts from
   [all].

   The intersection is maintained incrementally, so no per-character list of
   sets is built and no non-tail-recursive fold over such a list is needed. *)
let restriction_level x =
  let resolved = ref all in
  let narrow _ c =
    let scripts =
      Uucp.Script.script_extensions c |> ScriptSet.of_list |> extended in
    resolved := ScriptSet.inter scripts !resolved in
  iter_uchars x narrow;
  if ScriptSet.is_empty !resolved then `Unrestricted
    (* TODO implement levels 3-5 *)
  else `Single

src/common/dune

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
(library
22
(name common)
33
(public_name stanc.common)
4-
(libraries core fmt uunf)
4+
(libraries core fmt uunf uucp)
55
(instrumentation
66
(backend bisect_ppx))
77
(inline_tests)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
data {
2+
// not in XID data
3+
real ℧;
4+
}

test/integration/bad/unicode/stanc.expected

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
$ ../../../../../install/default/bin/stanc --allow-unicode invalid-character.stan
2+
Syntax error in 'invalid-character.stan', line 3, column 6, lexing error:
3+
-------------------------------------------------
4+
1: data {
5+
2: // not in XID data
6+
3: real ℧;
7+
^
8+
4: }
9+
-------------------------------------------------
10+
11+
Invalid character: 'U+2127'
12+
[exit 1]
113
$ ../../../../../install/default/bin/stanc --allow-unicode invalid-utf8-1.stan
214
Syntax error in 'invalid-utf8-1.stan', line 2, column 5, lexing error:
315
-------------------------------------------------
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
2+
data {
3+
// https://www.unicode.org/reports/tr39/#Confusable_Detection
4+
// greek
5+
real Γ;
6+
// cyrillic
7+
real Г;
8+
}

test/integration/good/unicode/pretty.expected

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,16 @@ model {
1414
y ~ normal(θ, σ);
1515
}
1616

17+
[exit 0]
18+
$ ../../../../../install/default/bin/stanc --auto-format --allow-unicode mixed-scripts.stan
19+
data {
20+
// https://www.unicode.org/reports/tr39/#Confusable_Detection
21+
// greek
22+
real Γ;
23+
// cyrillic
24+
real Г;
25+
}
26+
1727
[exit 0]
1828
$ ../../../../../install/default/bin/stanc --auto-format --allow-unicode unicode_special_funs.stan
1929
functions {

0 commit comments

Comments
 (0)