
Commit b18a63c

Support using Unicode entity names in Julia
1 parent 324b87f commit b18a63c


8 files changed: 553 additions and 7 deletions


LICENSE.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 The Unicode_Entities.jl package is licensed under the MIT "Expat" License:
 
-> Copyright (c) 2017: ScottPJones.
+> Copyright (c) 2017: Gandalf Software, Inc. (Scott Paul Jones) and other contributors
 >
 > Permission is hereby granted, free of charge, to any person obtaining a copy
 > of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 # Unicode_Entities
 
-[![Build Status](https://travis-ci.org/ScottPJones/Unicode_Entities.jl.svg?branch=master)](https://travis-ci.org/ScottPJones/Unicode_Entities.jl)
+[![Build Status](https://travis-ci.org/JuliaString/Unicode_Entities.jl.svg?branch=master)](https://travis-ci.org/JuliaString/Unicode_Entities.jl)
 
-[![Coverage Status](https://coveralls.io/repos/ScottPJones/Unicode_Entities.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/ScottPJones/Unicode_Entities.jl?branch=master)
+[![Coverage Status](https://coveralls.io/repos/JuliaString/Unicode_Entities.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/JuliaString/Unicode_Entities.jl?branch=master)
 
-[![codecov.io](http://codecov.io/github/ScottPJones/Unicode_Entities.jl/coverage.svg?branch=master)](http://codecov.io/github/ScottPJones/Unicode_Entities.jl?branch=master)
+[![codecov.io](http://codecov.io/github/JuliaString/Unicode_Entities.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaString/Unicode_Entities.jl?branch=master)

REQUIRE

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 julia 0.5
+StrTables

data/README.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Data files for Unicode_Entities

# Input:

* ``UnicodeData.txt``: From ftp://ftp.unicode.org/Public/UNIDATA

# Output:

* ``unicode.dat``
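
For reference, each record in ``UnicodeData.txt`` is one line of fifteen semicolon-separated fields; the build script below uses only field 1 (the code point, in hex), field 2 (the character name) and field 11 (the Unicode 1.0 name, treated as an alias). A minimal parsing sketch in the same Julia 0.5/0.6 style as deps/build.jl, with an illustrative record (not taken from this commit):

    line   = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;"
    fields = split(line, ";")
    ch     = Char(parse(UInt32, fields[1], 16))  # 'A'
    name   = fields[2]                           # "LATIN CAPITAL LETTER A"
    alias  = fields[11]                          # Unicode 1.0 name; empty for most characters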

deps/build.jl

Lines changed: 297 additions & 0 deletions
@@ -0,0 +1,297 @@
# License is MIT: https://github.com/JuliaString/Unicode_Entities/LICENSE.md

using StrTables

const VER = UInt32(1)

const datapath = joinpath(Pkg.dir(), "Unicode_Entities", "data")
const dpath = "ftp://ftp.unicode.org/Public/UNIDATA/"
const fname = "UnicodeData.txt"
const disp = [false]

# Sort the (value, name-index) pairs, give them the codes base+1, base+2, ...,
# and record each entry's new code in `index` (indexed by the original name index)
function sortsplit!{T}(index::Vector{UInt16}, vec::Vector{Tuple{T, UInt16}}, base)
    sort!(vec)
    len = length(vec)
    valvec = Vector{T}(len)
    indvec = Vector{UInt16}(len)
    for (i, val) in enumerate(vec)
        valvec[i], ind = val
        indvec[i] = ind
        index[ind] = UInt16(base + i)
    end
    base += len
    valvec, indvec, base
end

const _empty_string = ""
const _empty_val = (false, ' ', _empty_string, _empty_string)

# Parse one semicolon-split UnicodeData.txt record, returning (keep, char, name, alias)
function process_line{T<:AbstractString}(vec::Vector{T})
    length(vec) < 11 && return _empty_val
    num = vec[1]
    str = vec[2]
    alias = vec[11]
    ch = Char(parse(UInt32, num, 16))
    str[1] == '<' &&
        return str == "<control>" ? (alias != "", ch, _empty_string, alias) : _empty_val
    # Don't save names that simply contain the hex representation
    len = length(num)
    pos = sizeof(str) - len
    pos > 1 && str[pos] == '-' && str[pos+1:end] == num && return _empty_val
    # Check for some characters we won't represent, all outside of the BMP range
    ch <= '\uffff' && return (true, ch, str, alias)
    # Ignore characters in the Linear B range (0x10000-0x100ff)
    '\U10000' <= ch < '\U10100' && return _empty_val
    # Ignore characters in the Linear A range (0x10600-0x107ff)
    '\U10600' <= ch < '\U10800' && return _empty_val
    # Ignore characters in the hieroglyph range (0x13000-0x14fff)
    '\U13000' <= ch < '\U15000' && return _empty_val
    # Ignore characters in the Tangut range (0x17000-0x18fff)
    '\U17000' <= ch < '\U19000' && return _empty_val
    # Ignore characters in the musical notation ranges (0x1d000-0x1d2ff)
    '\U1d000' <= ch < '\U1d300' && return _empty_val
    # Don't worry about characters outside of BMP/SMP-1
    ch > '\U1FFFF' && return _empty_val
    (true, ch, str, alias)
end

# Read (or first download) UnicodeData.txt, returning the name and character vectors
# plus the source that was used
function load_unicode_data(datapath, dpath, fname)
    lname = joinpath(datapath, fname)
    if isfile(lname)
        println("Loading Unicode Data: ", lname)
        src = lname
    else
        src = string(dpath, fname)
        println("Downloading Unicode Data: ", src)
        download(src, lname)
        println("Saved to: ", lname)
    end
    symnam = Vector{String}()
    symval = Vector{Char}()
    aliasnam = Vector{String}()
    aliasval = Vector{Char}()
    count = lines = aliascnt = 0
    open(lname, "r") do f
        while (l = chomp(readline(f))) != ""
            lines += 1
            flg, ch, str, alias = process_line(split(l, ";"))
            disp[] && println('#', lines, '\t', Int(flg), " ", l)
            flg || continue
            if str != "" # <control> entries have no name here; they are added below via their alias
                count += 1
                push!(symnam, str)
                push!(symval, ch)
            end
            if alias != ""
                aliascnt += 1
                push!(aliasnam, alias)
                push!(aliasval, ch)
            end
        end
    end
    # Check for duplicates
    names = Set{String}(symnam)
    dupcnt = 0
    for (str, ch) in zip(aliasnam, aliasval)
        if str in names
            dupcnt += 1
        else
            push!(symnam, str)
            push!(symval, ch)
        end
    end
    println("Removed ", dupcnt, " duplicate aliases")
    println("Finished loading ", count, " + ", aliascnt-dupcnt, " entities on ", lines, " lines")
    symnam, symval, src
end

# Split the sorted character values into a BMP table and an SMP-1 table,
# and build the index vectors that tie them back to the name table
function split_tables(srtval)
    # BMP characters
    l16 = Vector{Tuple{UInt16, UInt16}}()
    # non-BMP characters (in range 0x10000 - 0x1ffff)
    l32 = Vector{Tuple{UInt16, UInt16}}()

    for (i, ch) in enumerate(srtval)
        ch > '\U1ffff' && error("Character $ch too large: $(UInt32(ch))")
        push!(ch > '\uffff' ? l32 : l16, (ch%UInt16, i))
    end

    # We now have 2 vectors, one for BMP characters, the other for SMP-1 characters;
    # each entry has the value and an index into the name table.
    # We need to create a vector the same size as the name table, that gives the index
    # into one of the tables, in order to go from names to the output character.
    # We also need, for each of the tables, a sorted vector that goes from the indices
    # in each table to the index into the name table (so that we can find multiple names
    # for each character).

    indvec = Vector{UInt16}(length(srtval))
    vec16, ind16, base32 = sortsplit!(indvec, l16, 0)
    vec32, ind32, base2c = sortsplit!(indvec, l32, base32)

    base32%UInt32, indvec, vec16, ind16, vec32, ind32
end

# Give the words selected by `inp` the codes off+1, off+2, ... in alphabetical order,
# recording each word's code in `wrdmap`; return the alphabetically sorted word list
function update_map!(wrdmap, inp, off, wrd_vec, wrd_dict)
    wrd = wrd_vec[inp]
    srt = sortperm(wrd)
    tab = inp[srt]
    map = wrd_vec[tab]
    for (i, v) in enumerate(map)
        wrdmap[wrd_dict[v]] = (i+off)%UInt16
    end
    map
end

# Words in `tab1` get the 1-byte codes 54-255, words in `tab2` get 2-byte codes from 256 up;
# every other word keeps code 0 and is spelled out character by character
function create_map(wrd_vec, wrd_dict, tab1, tab2)
    wrdmap = zeros(UInt16, length(wrd_vec))
    map1 = update_map!(wrdmap, tab1, 53, wrd_vec, wrd_dict)
    map2 = update_map!(wrdmap, tab2, 255, wrd_vec, wrd_dict)
    wrdmap, map1, map2
end

# Words that cannot be spelled out by `outseg!` (i.e. containing anything other than
# A-Z, 0-9 and '-') must always be given a word code
keepword(str) = !ismatch(r"^[A-Z0-9\-]+$", str)

# Spell out a segment character by character: '-' -> 0x01, '0'-'9' -> 0x02-0x0b, 'A'-'Z' -> 0x0c-0x25
function outseg!(out, str)
    for ch in str
        push!(out, ch == '-' ? 0x01 : (ch%UInt8 - ((ch-'0')<=9 ? 0x2e : 0x35)))
    end
end

# Pack one name (given as a vector of word indices) into bytes: coded words become one
# or two bytes, everything else is spelled out with `outseg!`, and 0x00 is inserted
# where spelled-out words would otherwise run together
function packword(inpvec::Vector, wrdmap, wrd_vec, wrd_dict)
    out = Vector{UInt8}()
    hasparts = false
    prevw = 0x0000
    for val16 in inpvec
        w = wrdmap[val16]
        if w > 0x00ff
            push!(out, ((w>>>8)+37)%UInt8, w%UInt8)
        elseif w != 0x0000
            push!(out, w%UInt8)
        else
            str = wrd_vec[val16]
            !isempty(out) && (prevw < 0x26 || str[1] == '-') && push!(out, 0x00)
            if search(str, '-') != 0
                parts = split(str, '-')
                hasparts = true
                disp[] && print(parts)
                len = length(parts)
                for pos = 1:len
                    seg = parts[pos]
                    if (pwrd = get(wrd_dict, seg, 0)) == 0 || (wp = wrdmap[pwrd]) == 0
                        outseg!(out, seg)
                    elseif wp > 0x00ff
                        push!(out, ((wp>>>8)+37)%UInt8, wp%UInt8)
                    else
                        push!(out, wp%UInt8)
                    end
                    pos != len && push!(out, 0x01)
                end
            else
                outseg!(out, str)
            end
        end
        prevw = w
    end
    hasparts && disp[] && println("\t", out)
    out
end

# Split each name into space-separated words (and '-'-separated parts), count word
# frequencies, decide which words get 1-byte or 2-byte codes, and pack every name
function split_words{T<:AbstractString}(input::Vector{T})
    wrd_dict = Dict{String, Int}()
    wrd_vec = Vector{String}()
    wrd_frq = Vector{Int}()
    wrd_loc = Vector{UInt16}()
    str_vec = Vector{Vector{UInt16}}(length(input))
    #=
    part_dic = Dict{String, Int}()
    part_vec = Vector{String}()
    part_frq = Vector{Int}()
    wrd_parts = Vector{Vector{UInt16}}()
    =#
    ind = 0
    for (i, wrd) in enumerate(input)
        allwrds = split(wrd, ' ')
        outwrds = Vector{UInt16}()
        for onewrd in allwrds
            val = get(wrd_dict, onewrd, 0)
            if val == 0
                val = (ind += 1)
                disp[] && println(val, '\t', i, '\t', onewrd)
                push!(wrd_vec, onewrd)
                push!(wrd_frq, 0)
                push!(wrd_loc, i) # location first found (may be only location)
                wrd_dict[onewrd] = val
                if search(onewrd, '-') != 0
                    allparts = split(onewrd, '-')
                    for part in allparts
                        part == "" && continue
                        if (vp = get(wrd_dict, part, 0)) == 0
                            vp = (ind += 1)
                            disp[] && println("\tparts:\t", vp, '\t', i, '\t', part)
                            push!(wrd_vec, part)
                            push!(wrd_frq, 0)
                            push!(wrd_loc, i) # location first found (may be only location)
                            wrd_dict[part] = vp
                        end
                        wrd_frq[vp] += 1
                    end
                end
            end
            wrd_frq[val] += 1
            push!(outwrds, val)
        end
        str_vec[i] = outwrds
    end

    # Calculate the savings of each word, i.e. frequency * (length-1), where freq > 1;
    # take the top 256-(16+38) = 202 words
    wrdsav = sort([((wrd_frq[i]-1)*(sizeof(wrd_vec[i])-1), i)
                   for i = 1:length(wrd_vec) if wrd_frq[i]>1 || keepword(wrd_vec[i])],
                  rev=true)
    # This has indexes into wrd_vec for words that will end up as 1-byte
    table1 = [wrdsav[i][2] for i=1:202]
    # This has indexes into wrd_vec for words that will end up as 2-bytes
    table2 = [wrdsav[i][2] for i=203:length(wrdsav)]
    # Calculate the savings of the remaining words, i.e. frequency * (length-2) (some will become 0)
    savfrq = Vector{Int}()
    savval = Vector{UInt16}()
    for i in table2
        savings = (wrd_frq[i]-1)*(sizeof(wrd_vec[i])-2)
        if savings > 2 || keepword(wrd_vec[i])
            push!(savfrq, savings)
            push!(savval, i)
        end
    end

    # For every word in wrd_vec, create an entry that maps it into one of the code ranges
    # 0-37, 38-53, 54-255, or 256 and above
    wrd_map, map1, map2 = create_map(wrd_vec, wrd_dict, table1, savval)

    # Pack words
    ent_map = Vector{Vector{UInt8}}(length(str_vec))
    for (i, vec16) in enumerate(str_vec)
        ent_map[i] = packword(vec16, wrd_map, wrd_vec, wrd_dict)
    end
    PackedTable(ent_map), StrTable(map1), StrTable(map2)
end

function make_tables(savfile, datapath, dpath, fname)
    try
        symnam, symval, src = load_unicode_data(datapath, dpath, fname)
        srtind = sortperm(symnam)
        srtnam = symnam[srtind]
        srtval = symval[srtind]
        entmap, map1, map2 = split_words(srtnam)
        println("Creating tables")
        base32, indvec, vec16, ind16, vec32, ind32 = split_tables(srtval)
        println("Saving tables to ", savfile)
        StrTables.save(savfile,
                       (VER, string(now()), src, base32, entmap, indvec, map1, map2,
                        vec16, ind16, vec32, ind32))
        println("Done")
    catch ex
        println("Error in make_tables: ", sprint(showerror, ex, catch_backtrace()))
    end
end

savfile = joinpath(datapath, "unicode.dat")
!isfile(savfile) && make_tables(savfile, datapath, dpath, fname)
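
For orientation, the tuple saved above keeps the packed, alphabetically sorted names together with the index vectors from `split_tables`: `indvec[i]` holds, for the i-th sorted name, a combined index that selects either the BMP table `vec16` or the SMP-1 table `vec32`. A sketch of that name-to-character step, assuming the in-memory values just produced by `make_tables` (an illustration only, not part of the package API):

    # Sketch: recover the character for the name at sorted position i, given
    # base32, indvec, vec16 and vec32 as returned by split_tables above
    function char_for_name(i, base32, indvec, vec16, vec32)
        idx = indvec[i]                 # combined index assigned by sortsplit!
        idx <= base32 ? Char(vec16[idx]) :                  # BMP code point stored directly
                        Char(0x10000 + vec32[idx - base32]) # SMP-1 code point stored as its low 16 bits
    end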

0 commit comments
