|
| 1 | +# License is MIT: https://github.com/JuliaString/Unicode_Entities/LICENSE.md |
| 2 | + |
| 3 | +using StrTables |
| 4 | + |
# Format version; saved as the first element of the generated table file
const VER = 0x00000001

# Directory holding the Unicode data file and the generated table file
const datapath = joinpath(Pkg.dir(), "Unicode_Entities", "data")

# Remote location and name of the Unicode character data file
const dpath = "ftp://ftp.unicode.org/Public/UNIDATA/"
const fname = "UnicodeData.txt"

# One-element vector used as a mutable flag; set `disp[1] = true` for trace output
const disp = Bool[false]
| 11 | + |
# Sort `vec` (pairs of a value and an index) in place, then split it into a vector of
# values and a vector of indices, both in sorted order.
# For the pair that ends up at sorted position `i`, `index[pair index]` is set to `base + i`.
# Returns (values, indices, base + length(vec)).
function sortsplit!{T}(index::Vector{UInt16}, vec::Vector{Tuple{T, UInt16}}, base)
    sort!(vec)
    valvec = T[pair[1] for pair in vec]
    indvec = UInt16[pair[2] for pair in vec]
    for (pos, ind) in enumerate(indvec)
        index[ind] = UInt16(base + pos)
    end
    valvec, indvec, base + length(vec)
end
| 25 | + |
const _empty_string = ""
# Result for lines that are skipped: the first element is the "keep this line" flag
const _empty_val = (false, ' ', _empty_string, _empty_string)

# Examine the fields of one line (already split on ';') and decide whether to keep it.
# Field 1 is the code point in hex, field 2 the name, field 11 the alias.
# Returns (keep, character, name, alias); skipped lines return `_empty_val`.
function process_line{T<:AbstractString}(vec::Vector{T})
    length(vec) < 11 && return _empty_val
    hex = vec[1]
    name = vec[2]
    alias = vec[11]
    ch = Char(parse(UInt32, hex, 16))
    if name[1] == '<'
        # Of the bracketed names only "<control>" is used, and then only for its alias
        name == "<control>" || return _empty_val
        return (alias != "", ch, _empty_string, alias)
    end
    # Skip names that end in '-' followed by the hex code point itself
    dash = sizeof(name) - length(hex)
    if dash > 1 && name[dash] == '-' && name[dash+1:end] == hex
        return _empty_val
    end
    # All remaining BMP characters are kept
    ch <= '\uffff' && return (true, ch, name, alias)
    # Nothing outside of BMP/SMP1 is kept
    ch > '\U1FFFF' && return _empty_val
    # Half-open ranges [lo, hi) that are not represented:
    #   Linear B (0x10000-0x100ff), Linear A (0x10600-0x107ff),
    #   hieroglyphs (0x13000-0x14fff), Tangut (0x17000-0x18fff),
    #   and 0x1d000-0x1d2ff
    # NOTE(review): the original comment for the last range named only the Greek
    # vocal/instrumental range (0x1d200-0x1d2ff), but the test starts at 0x1d000;
    # confirm which lower bound is intended.
    skipped = (('\U10000', '\U10100'),
               ('\U10600', '\U10800'),
               ('\U13000', '\U15000'),
               ('\U17000', '\U19000'),
               ('\U1d000', '\U1d300'))
    for (lo, hi) in skipped
        lo <= ch < hi && return _empty_val
    end
    (true, ch, name, alias)
end
| 57 | + |
# Read the Unicode data file and collect the entity names with their characters.
#
# The file `fname` is looked for in directory `datapath`; when it is not there, it is
# first downloaded from `dpath` and saved into `datapath`.
# Each line is classified by `process_line`; kept lines contribute their name (when not
# empty) and their alias (when not empty).  Aliases that are identical to one of the
# names are dropped, the remaining aliases are appended after the names.
#
# Returns (names, characters, source), where source is the local path or the URL read.
function load_unicode_data(datapath, dpath, fname)
    lname = joinpath(datapath, fname)
    if isfile(lname)
        println("Loading Unicode Data: ", lname)
        src = lname
    else
        src = string(dpath, fname)
        println("Downloading Unicode Data: ", src)
        # The download is written to lname, so its directory has to exist
        isdir(datapath) || mkpath(datapath)
        download(src, lname)
        println("Saved to: ", lname)
    end
    symnam = Vector{String}()
    symval = Vector{Char}()
    aliasnam = Vector{String}()
    aliasval = Vector{Char}()
    count = lines = aliascnt = 0
    open(lname, "r") do f
        # Reading stops at the first empty line (or at end of file)
        while (l = chomp(readline(f))) != ""
            lines += 1
            flg, ch, str, alias = process_line(split(l, ";"))
            disp[] && println('#', lines, '\t', Int(flg), " ", l)
            flg || continue
            # Control characters are returned with an empty name and only an alias,
            # so the name of this line (not the name vector) is what must be tested
            if str != ""
                count += 1
                push!(symnam, str)
                push!(symval, ch)
            end
            if alias != ""
                aliascnt += 1
                push!(aliasnam, alias)
                push!(aliasval, ch)
            end
        end
    end
    # Check for duplicates: an alias that is already present as a name is not added
    names = Set{String}(symnam)
    dupcnt = 0
    for (str,ch) in zip(aliasnam, aliasval)
        if str in names
            dupcnt += 1
        else
            push!(symnam, str)
            push!(symval, ch)
        end
    end
    println("Removed ",dupcnt," duplicate aliases")
    println("Finished loading ", count, " + ", aliascnt-dupcnt, " entities on ", lines, " lines")
    symnam, symval, src
end
| 107 | + |
# Split the characters in `srtval` (in name-table order) into a BMP table and a table
# for the range 0x10000 - 0x1ffff, and build the index vectors connecting them with
# the name table.  Characters above 0x1ffff raise an error.
#
# Returns (base32, indvec, vec16, ind16, vec32, ind32):
#   base32       - number of BMP entries, as UInt32
#   indvec       - for each name, the position of its character in the combined tables
#   vec16, vec32 - sorted low 16 bits of the characters of each table
#   ind16, ind32 - for each table entry, the index into the name table
function split_tables(srtval)
    # (low 16 bits of the character, index into the name table), BMP characters
    bmp = Vector{Tuple{UInt16, UInt16}}()
    # the same for non-BMP characters (in range 0x10000 - 0x1ffff)
    smp = Vector{Tuple{UInt16, UInt16}}()

    for (pos, ch) in enumerate(srtval)
        if ch > '\U1ffff'
            error("Character $ch too large: $(UInt32(ch))")
        end
        entry = (ch%UInt16, pos)
        if ch > '\uffff'
            push!(smp, entry)
        else
            push!(bmp, entry)
        end
    end

    # Two things are needed from the pairs collected above:
    # - a vector as long as the name table that gives, for every name, the position of
    #   its character in one of the tables (to get from a name to its character)
    # - for each table, a vector sorted like the table that gives the index into the
    #   name table (so that several names of one character can be found)
    # The SMP-1 positions are numbered after the BMP ones.

    indvec = Vector{UInt16}(length(srtval))
    vec16, ind16, base32 = sortsplit!(indvec, bmp, 0)
    vec32, ind32 = sortsplit!(indvec, smp, base32)

    base32%UInt32, indvec, vec16, ind16, vec32, ind32
end
| 133 | + |
# Give codes to the words selected by `inp` (indices into `wrd_vec`).
# The selected words are sorted; the word at sorted position `i` gets the code
# `i + off` (truncated to UInt16), stored in `wrdmap` at the index `wrd_dict` has for it.
# Returns the sorted words.
function update_map!(wrdmap, inp, off, wrd_vec, wrd_dict)
    sorted = sort(wrd_vec[inp])
    for i = 1:length(sorted)
        wrdmap[wrd_dict[sorted[i]]] = (i+off)%UInt16
    end
    sorted
end
| 144 | + |
# Build the word -> code map for all words in `wrd_vec`.
# Words selected by `tab1` get codes starting at 54, words selected by `tab2` get
# codes starting at 256; all other words keep code 0.
# Returns (code map, sorted words of tab1, sorted words of tab2).
function create_map(wrd_vec, wrd_dict, tab1, tab2)
    wrdmap = fill(0x0000, length(wrd_vec))
    map1 = update_map!(wrdmap, tab1, 53, wrd_vec, wrd_dict)
    map2 = update_map!(wrdmap, tab2, 255, wrd_vec, wrd_dict)
    (wrdmap, map1, map2)
end
| 151 | + |
# False when `str` is made up only of upper-case ASCII letters, digits and hyphens
# (at least one of them), true for anything else
function keepword(str)
    !ismatch(r"^[A-Z0-9\-]+$", str)
end
| 153 | + |
# Append the characters of `str` to `out`, one byte per character:
# '-' becomes 0x01, '0'-'9' become 2-11 and 'A'-'Z' become 12-37
function outseg!(out, str)
    for ch in str
        if ch == '-'
            push!(out, 0x01)
        else
            # digits are shifted down by 0x2e, everything else by 0x35
            shift = (ch-'0') <= 9 ? 0x2e : 0x35
            push!(out, ch%UInt8 - shift)
        end
    end
end
| 159 | + |
# Pack one name, given as a vector of word indices, into a byte vector.
# A word with a code above 0x00ff is written as two bytes (high byte + 37, low byte),
# a word with any other non-zero code as one byte.  A word without a code is spelled
# out with `outseg!`; when it contains hyphens, each hyphen-separated part is written
# with its own code if it has one, with 0x01 between the parts.
# Returns the packed bytes.
function packword(inpvec::Vector, wrdmap, wrd_vec, wrd_dict)
    out = Vector{UInt8}()
    hasparts = false
    prevw = 0x0000
    for val16 in inpvec
        w = wrdmap[val16]
        if w == 0x0000
            str = wrd_vec[val16]
            # A 0x00 separator is written when the previous word had no code as well,
            # or when this word starts with a hyphen (never at the very beginning)
            if !isempty(out) && (prevw < 0x26 || str[1] == '-')
                push!(out, 0x00)
            end
            if search(str, '-') == 0
                outseg!(out, str)
            else
                parts = split(str, '-')
                hasparts = true
                disp[] && print(parts)
                nparts = length(parts)
                for (pos, seg) in enumerate(parts)
                    pwrd = get(wrd_dict, seg, 0)
                    # parts that are unknown or have no code are spelled out
                    wp = pwrd == 0 ? 0x0000 : wrdmap[pwrd]
                    if wp == 0
                        outseg!(out, seg)
                    elseif wp > 0x00ff
                        push!(out, ((wp>>>8)+37)%UInt8, wp%UInt8)
                    else
                        push!(out, wp%UInt8)
                    end
                    pos == nparts || push!(out, 0x01)
                end
            end
        elseif w > 0x00ff
            push!(out, ((w>>>8)+37)%UInt8, w%UInt8)
        else
            push!(out, w%UInt8)
        end
        prevw = w
    end
    if hasparts && disp[]
        println("\t",out)
    end
    out
end
| 198 | + |
# Split the names in `input` into words, pick the words worth putting into tables,
# and pack every name into a byte vector using the resulting word codes.
#
# Every distinct space-separated word gets an index; for a word containing hyphens,
# its non-empty hyphen-separated parts are registered as words too.  Frequencies are
# counted for both.  The (up to) 202 words with the largest savings get the codes
# handed out first by `create_map`, the rest of the candidates the later codes.
#
# Returns (PackedTable of the packed names, StrTable of the first word table,
#          StrTable of the second word table).
function split_words{T<:AbstractString}(input::Vector{T})
    wrd_dict = Dict{String, Int}()
    wrd_vec = Vector{String}()
    wrd_frq = Vector{Int}()
    str_vec = Vector{Vector{UInt16}}(length(input))
    ind = 0
    for (i, wrd) in enumerate(input)
        allwrds = split(wrd, ' ')
        outwrds = Vector{UInt16}()
        for onewrd in allwrds
            val = get(wrd_dict, onewrd, 0)
            if val == 0
                val = (ind += 1)
                disp[] && println(val, '\t', i, '\t', onewrd)
                push!(wrd_vec, onewrd)
                push!(wrd_frq, 0)
                wrd_dict[onewrd] = val
                # The parts of a hyphenated word are registered (and counted) only
                # the first time the whole word is seen
                if search(onewrd, '-') != 0
                    allparts = split(onewrd, '-')
                    for part in allparts
                        part == "" && continue
                        if (vp = get(wrd_dict, part, 0)) == 0
                            vp = (ind += 1)
                            disp[] && println("\tparts:\t", vp, '\t', i, '\t', part)
                            push!(wrd_vec, part)
                            push!(wrd_frq, 0)
                            wrd_dict[part] = vp
                        end
                        wrd_frq[vp] += 1
                    end
                end
            end
            wrd_frq[val] += 1
            push!(outwrds, val)
        end
        str_vec[i] = outwrds
    end

    # Calculate the savings of each candidate word, i.e. (frequency-1) * (length-1).
    # Candidates are words seen more than once, or words for which keepword is true.
    wrdsav = sort([((wrd_frq[i]-1)*(sizeof(wrd_vec[i])-1), i)
                   for i = 1:length(wrd_vec) if wrd_frq[i]>1 || keepword(wrd_vec[i])],
                  rev=true)
    # Take the top 256-(16+38) = 202 words, or all of them when there are fewer
    nsingle = min(202, length(wrdsav))
    # This has indexes into wrd_vec for words that will end up as 1-byte
    table1 = Int[wrdsav[i][2] for i = 1:nsingle]
    # This has indexes into wrd_vec for words that will end up as 2-bytes
    table2 = Int[wrdsav[i][2] for i = nsingle+1:length(wrdsav)]
    # Of the remaining words keep those whose savings, now (frequency-1) * (length-2),
    # are above 2, and those for which keepword is true
    savval = Vector{UInt16}()
    for i in table2
        savings = (wrd_frq[i]-1)*(sizeof(wrd_vec[i])-2)
        if savings > 2 || keepword(wrd_vec[i])
            push!(savval, i)
        end
    end

    # For every word in wrd_vec, create its code: 0 for words that are spelled out,
    # otherwise a code from one of the two tables
    wrd_map, map1, map2 = create_map(wrd_vec, wrd_dict, table1, savval)

    # Pack words
    ent_map = Vector{Vector{UInt8}}(length(str_vec))
    for (i, vec16) in enumerate(str_vec)
        ent_map[i] = packword(vec16, wrd_map, wrd_vec, wrd_dict)
    end
    PackedTable(ent_map), StrTable(map1), StrTable(map2)
end
| 276 | + |
# Load the Unicode data, build all tables and save them to `savfile`.
# Any error is caught and printed together with its backtrace instead of being rethrown.
function make_tables(savfile, datapath, dpath, fname)
    try
        symnam, symval, src = load_unicode_data(datapath, dpath, fname)
        # Put names and characters into name order
        order = sortperm(symnam)
        srtnam = symnam[order]
        srtval = symval[order]
        entmap, map1, map2 = split_words(srtnam)
        println("Creating tables")
        base32, indvec, vec16, ind16, vec32, ind32 = split_tables(srtval)
        println("Saving tables to ", savfile)
        # Version, creation time and data source come first, then the tables
        tables = (VER, string(now()), src, base32, entmap, indvec, map1, map2,
                  vec16, ind16, vec32, ind32)
        StrTables.save(savfile, tables)
        println("Done")
    catch ex
        println("Error in make_tables: ", sprint(showerror, ex, catch_backtrace()))
    end
end
| 295 | + |
# The tables are only built when the table file does not exist yet
savfile = joinpath(datapath, "unicode.dat")
!isfile(savfile) && make_tables(savfile, datapath, dpath, fname)
0 commit comments