@@ -13,23 +13,95 @@ let is_ascii s =
1313
1414let normalize = Uunf_string. normalize_utf_8 `NFKC
1515
16- let iter_uchars s f =
17- let len = String. length s in
18- let out = Buffer. create len in
19- let pos = ref 0 in
20- (* move through code point by code point *)
21- while ! pos != len do
22- let decode = String. get_utf_8_uchar s ! pos in
23- let char_length = Uchar. utf_decode_length decode in
24- let uchar = Uchar. utf_decode_uchar decode in
25- Buffer. add_utf_8_uchar out uchar;
26- f ! pos uchar;
27- pos := ! pos + char_length
28- done ;
16+ let foldi_uchars ~f acc str =
17+ let len = String. length str in
18+ let rec loop pos acc =
19+ if pos == len then acc
20+ else
21+ let decode = String. get_utf_8_uchar str pos in
22+ let char_length = Uchar. utf_decode_length decode in
23+ let uchar = Uchar. utf_decode_uchar decode in
24+ let acc = f acc pos uchar in
25+ loop (pos + char_length) acc in
26+ loop 0 acc
27+
28+ let iteri_uchars ~f str =
29+ let f' buf pos c =
30+ f pos c;
31+ Buffer. add_utf_8_uchar buf c;
32+ buf in
33+ let s_after =
34+ Buffer. contents
35+ @@ foldi_uchars ~f: f' (Buffer. create (String. length str)) str in
2936 (* another sanity check *)
30- let s_after = Buffer. contents out in
31- if not (String. equal s s_after) then
37+ if not (String. equal str s_after) then
3238 Core. (
3339 ICE. internal_compiler_error
3440 [% message
35- " Failed to round-trip unicode string!" (s : string ) (s_after : string )])
41+ " Failed to round-trip unicode string!"
42+ (str : string )
43+ (s_after : string )])
44+
45+ (* WIP:
46+
47+ While not strictly necessary, there are some additional restrictions which
48+ are good to implement for validation and preventing strings that are visually
49+ identical from being distinct identifiers.
50+ A good summary can be found here: https://perl11.org/blog/unicode-identifiers.html
51+
52+ Most of these are only a problem if you assume maliciousness of the user,
53+ so they may not be important for an initial version in Stan.
54+ *)
55+
56+ (* Defined in https://www.unicode.org/reports/tr39/#Confusable_Detection *)
57+ let confusable x y =
58+ let skeleton x =
59+ let x = Uunf_string. normalize_utf_8 `NFD x in
60+ let f acc _ c =
61+ if Uucp.Gen. is_default_ignorable c then ()
62+ else
63+ (* TODO!! replace with prototype - need data? *)
64+ Buffer. add_utf_8_uchar acc c;
65+ acc in
66+ let buf = foldi_uchars ~f (Buffer. create (String. length x)) x in
67+ let x = Buffer. contents buf in
68+ let x = Uunf_string. normalize_utf_8 `NFD x in
69+ x in
70+ String. compare (skeleton x) (skeleton y)
71+
72+ module ScriptSet = Set. Make (Uucp. Script )
73+
74+ (* * copied from UUCP's definition of [Uucp.Script.t] *)
75+ let all =
76+ ScriptSet. of_list
77+ [ `Adlm ; `Aghb ; `Ahom ; `Arab ; `Armi ; `Armn ; `Avst ; `Bali ; `Bamu ; `Bass ; `Batk
78+ ; `Beng ; `Bhks ; `Bopo ; `Brah ; `Brai ; `Bugi ; `Buhd ; `Cakm ; `Cans ; `Cari
79+ ; `Cham ; `Cher ; `Chrs ; `Copt ; `Cpmn ; `Cprt ; `Cyrl ; `Deva ; `Diak ; `Dogr
80+ ; `Dsrt ; `Dupl ; `Egyp ; `Elba ; `Elym ; `Ethi ; `Geor ; `Glag ; `Gong ; `Gonm
81+ ; `Goth ; `Gran ; `Grek ; `Gujr ; `Guru ; `Hang ; `Hani ; `Hano ; `Hatr ; `Hebr
82+ ; `Hira ; `Hluw ; `Hmng ; `Hmnp ; `Hrkt ; `Hung ; `Ital ; `Java ; `Kali ; `Kana
83+ ; `Kawi ; `Khar ; `Khmr ; `Khoj ; `Knda ; `Kthi ; `Kits ; `Lana ; `Laoo ; `Latn
84+ ; `Lepc ; `Limb ; `Lina ; `Linb ; `Lisu ; `Lyci ; `Lydi ; `Mahj ; `Maka ; `Mand
85+ ; `Mani ; `Marc ; `Medf ; `Mend ; `Merc ; `Mero ; `Mlym ; `Modi ; `Mong ; `Mroo
86+ ; `Mtei ; `Mult ; `Mymr ; `Nagm ; `Nand ; `Narb ; `Nbat ; `Newa ; `Nkoo ; `Nshu
87+ ; `Ogam ; `Olck ; `Orkh ; `Orya ; `Osge ; `Osma ; `Ougr ; `Palm ; `Pauc ; `Perm
88+ ; `Phag ; `Phli ; `Phlp ; `Phnx ; `Plrd ; `Prti ; `Qaai ; `Rjng ; `Rohg ; `Runr
89+ ; `Samr ; `Sarb ; `Saur ; `Sgnw ; `Shaw ; `Shrd ; `Sidd ; `Sind ; `Sinh ; `Sogd
90+ ; `Sogo ; `Sora ; `Soyo ; `Sund ; `Sylo ; `Syrc ; `Tagb ; `Takr ; `Tale ; `Talu
91+ ; `Taml ; `Tang ; `Tavt ; `Telu ; `Tfng ; `Tglg ; `Thaa ; `Thai ; `Tibt ; `Tirh
92+ ; `Tnsa ; `Toto ; `Ugar ; `Vaii ; `Vith ; `Wara ; `Wcho ; `Xpeo ; `Xsux ; `Yezi
93+ ; `Yiii ; `Zanb ; `Zinh ; `Zyyy ; `Zzzz ]
94+
95+ let extended s =
96+ if ScriptSet. mem `Zyyy s || ScriptSet. mem `Zinh s then all else s
97+
98+ (* Defined in https://www.unicode.org/reports/tr39/#Restriction_Level_Detection *)
99+ let restriction_level x =
100+ let f acc _ c =
101+ let scripts =
102+ Uucp.Script. script_extensions c |> ScriptSet. of_list |> extended in
103+ scripts :: acc in
104+ let soss = foldi_uchars ~f [] x in
105+ let resolved = List. fold_right ScriptSet. inter soss all in
106+ if not @@ ScriptSet. is_empty resolved then `Single
107+ else `Unrestricted (* TODO implement levels 3-5 *)
0 commit comments