@@ -6,52 +6,39 @@ const VER = UInt32(1)
6
6
7
7
# Directory under which the generated table file is stored (see `savfile` below)
# NOTE(review): zero-argument `Pkg.dir()` is the Julia 0.6-era API — confirm that the
# Julia versions this script targets still provide it
const datapath = joinpath(Pkg.dir(), "Unicode_Entities", "data")
# Remote location handed to `load_unicode_data` (presumably the download base URL)
const dpath = "http://ftp.unicode.org/Public/UNIDATA/"
# Name of the input data file handed to `make_tables`
const inpname = "UnicodeData.txt"
# Name of the generated table file, created inside `datapath`
const fname = "unicode.dat"
# One-element flag vector; its use is not visible in this part of the file
const disp = [false]
11
12
12
- function sortsplit! {T} (index:: Vector{UInt16} , vec:: Vector{Tuple{T, UInt16}} , base)
13
- sort! (vec)
14
- len = length (vec)
15
- valvec = Vector {T} (len)
16
- indvec = Vector {UInt16} (len)
17
- for (i, val) in enumerate (vec)
18
- valvec[i], ind = val
19
- indvec[i] = ind
20
- index[ind] = UInt16 (base + i)
21
- end
22
- base += len
23
- valvec, indvec, base
24
- end
25
-
26
13
# Placeholder used for tuple slots that carry no text
const _empty_string = " "
27
14
# Tuple returned by `process_line` for records it decides not to store (first element false)
const _empty_val = (false , ' ' , _empty_string, _empty_string)
28
15
29
"""
    process_line(vec::Vector{<:AbstractString})

Convert one already-split data record into a `(flag, codepoint, name, alias)` tuple.

`vec[1]` is the code point written in hexadecimal, `vec[2]` the character name and
`vec[11]` the alias field. `_empty_val` is returned for records that are not stored:

- records with fewer than 11 fields, or with an empty name;
- bracketed names (`<...>`) other than `<control>`;
- names that merely end in `-` followed by the hexadecimal code point;
- code points in one of the explicitly skipped ranges, or above `0x1FFFF`.

A `<control>` record yields `(alias != "", codepoint, _empty_string, alias)`; any other
stored record yields `(true, codepoint, name, alias)`. The code point is a `UInt32`.
"""
function process_line(vec::Vector{T}) where {T<:AbstractString}
    length(vec) < 11 && return _empty_val
    num = vec[1]
    str = vec[2]
    alias = vec[11]
    # A record without a name has nothing to store (and indexing into it would throw)
    isempty(str) && return _empty_val
    # NOTE(review): positional base is the Julia 0.6 spelling; Julia >= 0.7 uses `base=16`
    ch = parse(UInt32, num, 16)
    if startswith(str, "<")
        # Of the bracketed pseudo-names only <control> is kept, and only through its alias
        return str == "<control>" ? (alias != "", ch, _empty_string, alias) : _empty_val
    end
    # Don't save names that simply end with '-' plus the hex representation
    # (something must precede the '-', hence the size check)
    sizeof(str) > sizeof(num) + 1 && endswith(str, string('-', num)) && return _empty_val
    # Everything in the BMP is kept; the checks below only concern higher code points
    ch <= 0x0ffff && return (true, ch, str, alias)
    # Ignore characters in Linear B range (0x10000-0x100ff)
    0x10000 <= ch < 0x10100 && return _empty_val
    # Ignore characters in Linear A range (0x10600-0x107ff)
    0x10600 <= ch < 0x10800 && return _empty_val
    # Ignore characters in hieroglyph range (0x13000-0x14fff)
    0x13000 <= ch < 0x15000 && return _empty_val
    # Ignore characters in Tangut range (0x17000-0x18fff)
    0x17000 <= ch < 0x19000 && return _empty_val
    # Ignore characters in 0x1d000-0x1d2ff (includes the Greek vocal/instrumental notation)
    0x1d000 <= ch < 0x1d300 && return _empty_val
    # Don't worry about characters outside of BMP/SMP1
    ch > 0x1FFFF && return _empty_val
    return (true, ch, str, alias)
end
57
44
@@ -66,10 +53,10 @@ function load_unicode_data(datapath, dpath, fname)
66
53
download (src, lname)
67
54
println (" Saved to: " , lname)
68
55
end
69
- symnam = Vector { String} ()
70
- symval = Vector {Char} ()
71
- aliasnam = Vector { String} ()
72
- aliasval = Vector {Char} ()
56
+ symnam = String[]
57
+ symval = UInt32[]
58
+ aliasnam = String[]
59
+ aliasval = UInt32[]
73
60
count = lines = aliascnt = 0
74
61
open (lname, " r" ) do f
75
62
while (l = chomp (readline (f))) != " "
107
94
108
95
function split_tables (srtval)
109
96
# BMP characters
110
- l16 = Vector { Tuple{UInt16, UInt16}} ()
97
+ l16 = Tuple{UInt16, UInt16}[]
111
98
# non-BMP characters (in range 0x10000 - 0x1ffff)
112
- l32 = Vector { Tuple{UInt16, UInt16}} ()
99
+ l32 = Tuple{UInt16, UInt16}[]
113
100
114
101
for (i, ch) in enumerate (srtval)
115
- ch > ' \U 1ffff ' && error (" Character $ch too large: $( UInt32 (ch)) " )
116
- push! (ch > ' \u ffff ' ? l32 : l16, (ch% UInt16, i))
102
+ ch > 0x1ffff && error (" Character $ch too large" )
103
+ push! (ch > 0x0ffff ? l32 : l16, (ch% UInt16, i))
117
104
end
118
105
119
106
# We now have 2 vectors, one for single BMP characters, the other for SMP-1 characters
@@ -124,7 +111,7 @@ function split_tables(srtval)
124
111
# in each table to the index into the name table (so that we can find multiple names for
125
112
# each character)
126
113
127
- indvec = Vector {UInt16} ( length (srtval))
114
+ indvec = create_vector (UInt16, length (srtval))
128
115
vec16, ind16, base32 = sortsplit! (indvec, l16, 0 )
129
116
vec32, ind32, base2c = sortsplit! (indvec, l32, base32)
130
117
@@ -149,7 +136,7 @@ function create_map(wrd_vec, wrd_dict, tab1, tab2)
149
136
wrdmap, map1, map2
150
137
end
151
138
152
- keepword (str) = ! ismatch ( r" ^[A-Z0-9\- ]+$" , str )
139
# True when `str` is NOT made up solely of the characters in the regex class below
# (upper-case ASCII letters, digits, '-'), as reported by the `_contains` helper.
# NOTE(review): `_contains` is defined elsewhere; presumably a regex-match test — confirm
+ keepword (str) = ! _contains (str, r" ^[A-Z0-9\- ]+$" )
153
140
154
141
function outseg! (out, str)
155
142
for ch in str
@@ -158,7 +145,7 @@ function outseg!(out, str)
158
145
end
159
146
160
147
function packword (inpvec:: Vector , wrdmap, wrd_vec, wrd_dict)
161
- out = Vector { UInt8} ()
148
+ out = UInt8[]
162
149
hasparts = false
163
150
prevw = 0x0000
164
151
for val16 in inpvec
@@ -196,22 +183,22 @@ function packword(inpvec::Vector, wrdmap, wrd_vec, wrd_dict)
196
183
out
197
184
end
198
185
199
- function split_words {T<:AbstractString} (input:: Vector{T } )
186
+ function split_words (input:: Vector{<:AbstractString } )
200
187
wrd_dict = Dict {String, Int} ()
201
- wrd_vec = Vector { String} ()
202
- wrd_frq = Vector { Int} ()
203
- wrd_loc = Vector { UInt16} ()
204
- str_vec = Vector {Vector{ UInt16}} ( length (input))
188
+ wrd_vec = String[]
189
+ wrd_frq = Int[]
190
+ wrd_loc = UInt16[]
191
+ str_vec = create_vector ( Vector{UInt16}, length (input))
205
192
#=
206
193
part_dic = Dict{String, Int}()
207
- part_vec = Vector{ String}()
208
- part_frq = Vector{ Int}()
209
- wrd_parts = Vector{Vector{ UInt16}}()
194
+ part_vec = String[]
195
+ part_frq = Int[]
196
+ wrd_parts = Vector{UInt16}[]
210
197
=#
211
198
ind = 0
212
199
for (i, wrd) in enumerate (input)
213
200
allwrds = split (wrd, ' ' )
214
- outwrds = Vector { UInt16} ()
201
+ outwrds = UInt16[]
215
202
for onewrd in allwrds
216
203
val = get (wrd_dict, onewrd, 0 )
217
204
if val == 0
@@ -253,8 +240,8 @@ function split_words{T<:AbstractString}(input::Vector{T})
253
240
# This has indexes into wrd_vec for words that will end up as 2-bytes
254
241
table2 = [wrdsav[i][2 ] for i= 203 : length (wrdsav)]
255
242
# Calculate the savings of remaining words, i.e. frequency * (length-2) (some will become 0)
256
- savfrq = Vector { Int} ()
257
- savval = Vector { UInt16} ()
243
+ savfrq = Int[]
244
+ savval = UInt16[]
258
245
for i in table2
259
246
savings = (wrd_frq[i]- 1 )* (sizeof (wrd_vec[i])- 2 )
260
247
if savings > 2 || keepword (wrd_vec[i])
@@ -267,31 +254,35 @@ function split_words{T<:AbstractString}(input::Vector{T})
267
254
wrd_map, map1, map2 = create_map (wrd_vec, wrd_dict, table1, savval)
268
255
269
256
# Pack words
270
- ent_map = Vector {Vector{ UInt8}} ( length (str_vec))
257
+ ent_map = create_vector ( Vector{UInt8}, length (str_vec))
271
258
for (i, vec16) in enumerate (str_vec)
272
259
ent_map[i] = packword (vec16, wrd_map, wrd_vec, wrd_dict)
273
260
end
274
261
PackedTable (ent_map), StrTable (map1), StrTable (map2)
275
262
end
276
263
277
"""
    make_tables(datapath, dpath, fname)

Load the Unicode data via `load_unicode_data(datapath, dpath, fname)` and build the
lookup tables from it.

The names are sorted (values are reordered to match), packed with `split_words`, and the
code points are split into index tables with `split_tables`.

Returns the tuple
`(VER, timestamp, src, base32, entmap, indvec, map1, map2, vec16, ind16, vec32, ind32)`,
where `timestamp` is `string(now())` and `src` is the third value returned by
`load_unicode_data`. Nothing is written to disk here; saving is left to the caller.
"""
function make_tables(datapath, dpath, fname)
    symnam, symval, src = load_unicode_data(datapath, dpath, fname)
    # Sort by name and keep the values aligned with their names
    srtind = sortperm(symnam)
    srtnam = symnam[srtind]
    srtval = symval[srtind]
    entmap, map1, map2 = split_words(srtnam)
    println("Creating tables")
    base32, indvec, vec16, ind16, vec32, ind32 = split_tables(srtval)
    return (VER, string(now()), src, base32, entmap, indvec, map1, map2,
            vec16, ind16, vec32, ind32)
end
274
+
275
# Build the table file once: nothing is regenerated when it already exists
savfile = joinpath(datapath, fname)
tup = nothing
if !isfile(savfile)
    # `try` is used as an expression so that `tup` is assigned at top level
    # (no `global` declaration needed); a failed build leaves it as `nothing`
    tup = try
        make_tables(datapath, dpath, inpname)
    catch ex
        println(sprint(showerror, ex, catch_backtrace()))
        nothing
    end
    # Only write the file after a successful build, so that a failure does not leave
    # behind a bogus file that would suppress regeneration on the next run
    if tup !== nothing
        println("Saving tables to ", savfile)
        StrTables.save(savfile, tup)
    end
end
println("Done")
0 commit comments