@@ -33,3 +33,69 @@ let iter_uchars s f =
3333 ICE. internal_compiler_error
3434 [% message
3535 " Failed to round-trip unicode string!" (s : string ) (s_after : string )])
36+
37+ (* WIP:
38+
39+ While not strictly necessary, there are some additional restrictions which
40+ are good to implement for validation and preventing strings that are visually
41+ identical from being distinct identifiers.
42+ A good summary can be found here: https://perl11.org/blog/unicode-identifiers.html
43+
44+ Most of these are only a problem if you assume maliciousness of the user,
45+ so they may not be important for an initial version in Stan.
46+ *)
47+
48+ (* Defined in https://www.unicode.org/reports/tr39/#Confusable_Detection *)
49+ let confusable x y =
50+ let skeleton x =
51+ let x = Uunf_string. normalize_utf_8 `NFD x in
52+ let out = Buffer. create (String. length x) in
53+ let f _ c =
54+ if Uucp.Gen. is_default_ignorable c then ()
55+ else
56+ (* TODO!! replace with prototype - need data? *)
57+ Buffer. add_utf_8_uchar out c in
58+ iter_uchars x f;
59+ let x = Buffer. contents out in
60+ let x = Uunf_string. normalize_utf_8 `NFD x in
61+ x in
62+ String. compare (skeleton x) (skeleton y)
63+
64+ module ScriptSet = Set. Make (Uucp. Script )
65+
66+ (* * copied from UUCP's definition of [Uucp.Script.t] *)
67+ let all =
68+ ScriptSet. of_list
69+ [ `Adlm ; `Aghb ; `Ahom ; `Arab ; `Armi ; `Armn ; `Avst ; `Bali ; `Bamu ; `Bass ; `Batk
70+ ; `Beng ; `Bhks ; `Bopo ; `Brah ; `Brai ; `Bugi ; `Buhd ; `Cakm ; `Cans ; `Cari
71+ ; `Cham ; `Cher ; `Chrs ; `Copt ; `Cpmn ; `Cprt ; `Cyrl ; `Deva ; `Diak ; `Dogr
72+ ; `Dsrt ; `Dupl ; `Egyp ; `Elba ; `Elym ; `Ethi ; `Geor ; `Glag ; `Gong ; `Gonm
73+ ; `Goth ; `Gran ; `Grek ; `Gujr ; `Guru ; `Hang ; `Hani ; `Hano ; `Hatr ; `Hebr
74+ ; `Hira ; `Hluw ; `Hmng ; `Hmnp ; `Hrkt ; `Hung ; `Ital ; `Java ; `Kali ; `Kana
75+ ; `Kawi ; `Khar ; `Khmr ; `Khoj ; `Knda ; `Kthi ; `Kits ; `Lana ; `Laoo ; `Latn
76+ ; `Lepc ; `Limb ; `Lina ; `Linb ; `Lisu ; `Lyci ; `Lydi ; `Mahj ; `Maka ; `Mand
77+ ; `Mani ; `Marc ; `Medf ; `Mend ; `Merc ; `Mero ; `Mlym ; `Modi ; `Mong ; `Mroo
78+ ; `Mtei ; `Mult ; `Mymr ; `Nagm ; `Nand ; `Narb ; `Nbat ; `Newa ; `Nkoo ; `Nshu
79+ ; `Ogam ; `Olck ; `Orkh ; `Orya ; `Osge ; `Osma ; `Ougr ; `Palm ; `Pauc ; `Perm
80+ ; `Phag ; `Phli ; `Phlp ; `Phnx ; `Plrd ; `Prti ; `Qaai ; `Rjng ; `Rohg ; `Runr
81+ ; `Samr ; `Sarb ; `Saur ; `Sgnw ; `Shaw ; `Shrd ; `Sidd ; `Sind ; `Sinh ; `Sogd
82+ ; `Sogo ; `Sora ; `Soyo ; `Sund ; `Sylo ; `Syrc ; `Tagb ; `Takr ; `Tale ; `Talu
83+ ; `Taml ; `Tang ; `Tavt ; `Telu ; `Tfng ; `Tglg ; `Thaa ; `Thai ; `Tibt ; `Tirh
84+ ; `Tnsa ; `Toto ; `Ugar ; `Vaii ; `Vith ; `Wara ; `Wcho ; `Xpeo ; `Xsux ; `Yezi
85+ ; `Yiii ; `Zanb ; `Zinh ; `Zyyy ; `Zzzz ]
86+
87+ let extended s =
88+ if ScriptSet. mem `Zyyy s || ScriptSet. mem `Zinh s then all else s
89+
90+ (* Defined in https://www.unicode.org/reports/tr39/#Restriction_Level_Detection *)
91+ let restriction_level x =
92+ let soss = ref [] in
93+ let f _ c =
94+ let scripts =
95+ Uucp.Script. script_extensions c |> ScriptSet. of_list |> extended in
96+ soss := scripts :: ! soss;
97+ () in
98+ iter_uchars x f;
99+ let resolved = List. fold_right ScriptSet. inter ! soss all in
100+ if not @@ ScriptSet. is_empty resolved then `Single
101+ else `Unrestricted (* TODO implement levels 3-5 *)
0 commit comments