Skip to content

Commit 6603081

Browse files
authored
Merge pull request #4 from JuliaString/spj/v7update
Further updates for v0.7
2 parents da1ad4e + 3eb3cb4 commit 6603081

File tree

5 files changed: 84 additions (+84) and 75 deletions (-75).

.travis.yml

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,19 @@ os:
44
- linux
55
- osx
66
julia:
7-
- release
7+
- 0.6
88
- nightly
99
notifications:
1010
email: false
11+
git:
12+
depth: 99999999
13+
14+
## uncomment the following lines to allow failures on nightly julia
15+
## (tests will run but not make your overall status red)
16+
#matrix:
17+
# allow_failures:
18+
# - julia: nightly
19+
1120
# uncomment the following lines to override the default test script
1221
script:
1322
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi

REQUIRE

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
1-
julia 0.5
1+
julia 0.6
22
StrTables

appveyor.yml

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,16 @@
11
environment:
22
matrix:
3-
- JULIAVERSION: "julialang/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
4-
- JULIAVERSION: "julialang/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
5-
- JULIAVERSION: "julianightlies/bin/winnt/x86/julia-latest-win32.exe"
6-
- JULIAVERSION: "julianightlies/bin/winnt/x64/julia-latest-win64.exe"
3+
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
4+
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
5+
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
6+
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
7+
8+
## uncomment the following lines to allow failures on nightly julia
9+
## (tests will run but not make your overall status red)
10+
#matrix:
11+
# allow_failures:
12+
# - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
13+
# - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
714

815
branches:
916
only:
@@ -17,9 +24,15 @@ notifications:
1724
on_build_status_changed: false
1825

1926
install:
27+
- ps: "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12"
28+
# If there's a newer build queued for the same PR, cancel this one
29+
- ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod `
30+
https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | `
31+
Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { `
32+
throw "There are newer queued builds for this pull request, failing early." }
2033
# Download most recent Julia Windows binary
2134
- ps: (new-object net.webclient).DownloadFile(
22-
$("http://s3.amazonaws.com/"+$env:JULIAVERSION),
35+
$env:JULIA_URL,
2336
"C:\projects\julia-binary.exe")
2437
# Run installer silently, output to C:\projects\julia
2538
- C:\projects\julia-binary.exe /S /D=C:\projects\julia

deps/build.jl

Lines changed: 55 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -6,52 +6,39 @@ const VER = UInt32(1)
66

77
const datapath = joinpath(Pkg.dir(), "Unicode_Entities", "data")
88
const dpath = "http://ftp.unicode.org/Public/UNIDATA/"
9-
const fname = "UnicodeData.txt"
9+
const inpname = "UnicodeData.txt"
10+
const fname = "unicode.dat"
1011
const disp = [false]
1112

12-
function sortsplit!{T}(index::Vector{UInt16}, vec::Vector{Tuple{T, UInt16}}, base)
13-
sort!(vec)
14-
len = length(vec)
15-
valvec = Vector{T}(len)
16-
indvec = Vector{UInt16}(len)
17-
for (i, val) in enumerate(vec)
18-
valvec[i], ind = val
19-
indvec[i] = ind
20-
index[ind] = UInt16(base + i)
21-
end
22-
base += len
23-
valvec, indvec, base
24-
end
25-
2613
const _empty_string = ""
2714
const _empty_val = (false, ' ', _empty_string, _empty_string)
2815

29-
function process_line{T<:AbstractString}(vec::Vector{T})
16+
function process_line(vec::Vector{T}) where {T<:AbstractString}
3017
length(vec) < 11 && return _empty_val
3118
num = vec[1]
3219
str = vec[2]
3320
alias = vec[11]
34-
ch = Char(parse(UInt32, num, 16))
21+
ch = parse(UInt32, num, 16)
3522
str[1] == '<' &&
3623
return str == "<control>" ? (alias != "", ch, _empty_string, alias) : _empty_val
3724
# Don't save names that simply contain hex representation
3825
len = length(num)
3926
pos = sizeof(str) - len
4027
pos > 1 && str[pos] == '-' && str[pos+1:end] == num && return _empty_val
4128
# Check for some characters we won't represent, all outside of BMP range
42-
ch <= '\uffff' && return (true, ch, str, alias)
29+
ch <= 0x0ffff && return (true, ch, str, alias)
4330
# Ignore characters in Linear B range (0x10000-0x100ff)
44-
'\U10000' <= ch < '\U10100' && return _empty_val
31+
0x10000 <= ch < 0x10100 && return _empty_val
4532
# Ignore characters in Linear A range (0x10600-0x107ff)
46-
'\U10600' <= ch < '\U10800' && return _empty_val
33+
0x10600 <= ch < 0x10800 && return _empty_val
4734
# Ignore characters in hieroglyph range (0x13000-0x14fff)
48-
'\U13000' <= ch < '\U15000' && return _empty_val
35+
0x13000 <= ch < 0x15000 && return _empty_val
4936
# Ignore characters in Tangut range (0x17000-0x18fff)
50-
'\U17000' <= ch < '\U19000' && return _empty_val
37+
0x17000 <= ch < 0x19000 && return _empty_val
5138
# Ignore characters in the musical notation ranges, including Greek vocal/instrumental (0x1d000-0x1d2ff)
52-
'\U1d000' <= ch < '\U1d300' && return _empty_val
39+
0x1d000 <= ch < 0x1d300 && return _empty_val
5340
# Don't worry about characters outside of BMP/SMP1
54-
ch > '\U1FFFF' && return _empty_val
41+
ch > 0x1FFFF && return _empty_val
5542
(true, ch, str, alias)
5643
end
5744

@@ -66,10 +53,10 @@ function load_unicode_data(datapath, dpath, fname)
6653
download(src, lname)
6754
println("Saved to: ", lname)
6855
end
69-
symnam = Vector{String}()
70-
symval = Vector{Char}()
71-
aliasnam = Vector{String}()
72-
aliasval = Vector{Char}()
56+
symnam = String[]
57+
symval = UInt32[]
58+
aliasnam = String[]
59+
aliasval = UInt32[]
7360
count = lines = aliascnt = 0
7461
open(lname, "r") do f
7562
while (l = chomp(readline(f))) != ""
@@ -107,13 +94,13 @@ end
10794

10895
function split_tables(srtval)
10996
# BMP characters
110-
l16 = Vector{Tuple{UInt16, UInt16}}()
97+
l16 = Tuple{UInt16, UInt16}[]
11198
# non-BMP characters (in range 0x10000 - 0x1ffff)
112-
l32 = Vector{Tuple{UInt16, UInt16}}()
99+
l32 = Tuple{UInt16, UInt16}[]
113100

114101
for (i, ch) in enumerate(srtval)
115-
ch > '\U1ffff' && error("Character $ch too large: $(UInt32(ch))")
116-
push!(ch > '\uffff' ? l32 : l16, (ch%UInt16, i))
102+
ch > 0x1ffff && error("Character $ch too large")
103+
push!(ch > 0x0ffff ? l32 : l16, (ch%UInt16, i))
117104
end
118105

119106
# We now have 2 vectors, one for single BMP characters, the other for SMP-1 characters
@@ -124,7 +111,7 @@ function split_tables(srtval)
124111
# in each table to the index into the name table (so that we can find multiple names for
125112
# each character)
126113

127-
indvec = Vector{UInt16}(length(srtval))
114+
indvec = create_vector(UInt16, length(srtval))
128115
vec16, ind16, base32 = sortsplit!(indvec, l16, 0)
129116
vec32, ind32, base2c = sortsplit!(indvec, l32, base32)
130117

@@ -149,7 +136,7 @@ function create_map(wrd_vec, wrd_dict, tab1, tab2)
149136
wrdmap, map1, map2
150137
end
151138

152-
keepword(str) = !ismatch(r"^[A-Z0-9\-]+$", str)
139+
keepword(str) = !_contains(str, r"^[A-Z0-9\-]+$")
153140

154141
function outseg!(out, str)
155142
for ch in str
@@ -158,7 +145,7 @@ function outseg!(out, str)
158145
end
159146

160147
function packword(inpvec::Vector, wrdmap, wrd_vec, wrd_dict)
161-
out = Vector{UInt8}()
148+
out = UInt8[]
162149
hasparts = false
163150
prevw = 0x0000
164151
for val16 in inpvec
@@ -196,22 +183,22 @@ function packword(inpvec::Vector, wrdmap, wrd_vec, wrd_dict)
196183
out
197184
end
198185

199-
function split_words{T<:AbstractString}(input::Vector{T})
186+
function split_words(input::Vector{<:AbstractString})
200187
wrd_dict = Dict{String, Int}()
201-
wrd_vec = Vector{String}()
202-
wrd_frq = Vector{Int}()
203-
wrd_loc = Vector{UInt16}()
204-
str_vec = Vector{Vector{UInt16}}(length(input))
188+
wrd_vec = String[]
189+
wrd_frq = Int[]
190+
wrd_loc = UInt16[]
191+
str_vec = create_vector(Vector{UInt16}, length(input))
205192
#=
206193
part_dic = Dict{String, Int}()
207-
part_vec = Vector{String}()
208-
part_frq = Vector{Int}()
209-
wrd_parts = Vector{Vector{UInt16}}()
194+
part_vec = String[]
195+
part_frq = Int[]
196+
wrd_parts = Vector{UInt16}[]
210197
=#
211198
ind = 0
212199
for (i, wrd) in enumerate(input)
213200
allwrds = split(wrd, ' ')
214-
outwrds = Vector{UInt16}()
201+
outwrds = UInt16[]
215202
for onewrd in allwrds
216203
val = get(wrd_dict, onewrd, 0)
217204
if val == 0
@@ -253,8 +240,8 @@ function split_words{T<:AbstractString}(input::Vector{T})
253240
# This has indexes into wrd_vec for words that will end up as 2-bytes
254241
table2 = [wrdsav[i][2] for i=203:length(wrdsav)]
255242
# Calculate the savings of remaining words, i.e. (frequency-1) * (length-2) (some will become 0)
256-
savfrq = Vector{Int}()
257-
savval = Vector{UInt16}()
243+
savfrq = Int[]
244+
savval = UInt16[]
258245
for i in table2
259246
savings = (wrd_frq[i]-1)*(sizeof(wrd_vec[i])-2)
260247
if savings > 2 || keepword(wrd_vec[i])
@@ -267,31 +254,35 @@ function split_words{T<:AbstractString}(input::Vector{T})
267254
wrd_map, map1, map2 = create_map(wrd_vec, wrd_dict, table1, savval)
268255

269256
# Pack words
270-
ent_map = Vector{Vector{UInt8}}(length(str_vec))
257+
ent_map = create_vector(Vector{UInt8}, length(str_vec))
271258
for (i, vec16) in enumerate(str_vec)
272259
ent_map[i] = packword(vec16, wrd_map, wrd_vec, wrd_dict)
273260
end
274261
PackedTable(ent_map), StrTable(map1), StrTable(map2)
275262
end
276263

277-
function make_tables(savfile, datapath, dpath, fname)
264+
function make_tables(datapath, dpath, fname)
265+
symnam, symval, src = load_unicode_data(datapath, dpath, fname)
266+
srtind = sortperm(symnam)
267+
srtnam = symnam[srtind]
268+
srtval = symval[srtind]
269+
entmap, map1, map2 = split_words(srtnam)
270+
println("Creating tables")
271+
base32, indvec, vec16, ind16, vec32, ind32 = split_tables(srtval)
272+
(VER, string(now()), src, base32, entmap, indvec, map1, map2, vec16, ind16, vec32, ind32)
273+
end
274+
275+
println("Creating tables")
276+
savfile = joinpath(datapath, fname)
277+
tup = nothing
278+
if !isfile(savfile)
278279
try
279-
symnam, symval, src = load_unicode_data(datapath, dpath, fname)
280-
srtind = sortperm(symnam)
281-
srtnam = symnam[srtind]
282-
srtval = symval[srtind]
283-
entmap, map1, map2 = split_words(srtnam)
284-
println("Creating tables")
285-
base32, indvec, vec16, ind16, vec32, ind32 = split_tables(srtval)
286-
println("Saving tables to ", savfile)
287-
StrTables.save(savfile,
288-
(VER, string(now()), src, base32, entmap, indvec, map1, map2,
289-
vec16, ind16, vec32, ind32))
290-
println("Done")
280+
global tup
281+
tup = make_tables(datapath, dpath, inpname)
291282
catch ex
292-
println("Error in make_tables: ", sprint(showerror, ex, catch_backtrace()))
283+
println(sprint(showerror, ex, catch_backtrace()))
293284
end
285+
println("Saving tables to ", savfile)
286+
StrTables.save(savfile, tup)
294287
end
295-
296-
savfile = joinpath(datapath, "unicode.dat")
297-
!isfile(savfile) && make_tables(savfile, datapath, dpath, fname)
288+
println("Done")

src/Unicode_Entities.jl

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,10 +15,6 @@ module Unicode_Entities
1515

1616
using StrTables
1717

18-
if isdefined(Base, :Unicode)
19-
using Base.Unicode: uppercase
20-
end
21-
2218
struct PackedEntities{S,T} <: AbstractPackedTable{String}
2319
offsetvec::Vector{T}
2420
namtab::Vector{S}

0 commit comments

Comments
 (0)