Skip to content

Commit bcb97ec

Browse files
authored
Merge pull request #71 from Eight1911/julia-0.7
resolve #66, fix Julia 0.7 deprecation warnings
2 parents e6a7e5a + c64b296 commit bcb97ec

File tree

22 files changed

+217
-117
lines changed

22 files changed

+217
-117
lines changed

.DS_Store

8 KB
Binary file not shown.

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
3+
makefile

.travis.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,12 @@ language: julia
22
os:
33
- linux
44
julia:
5-
- 0.6
65
- nightly
76
notifications:
87
email: false
98
sudo: false
109
script:
1110
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
12-
- julia -e 'Pkg.clone(pwd()); Pkg.build("DecisionTree"); Pkg.test("DecisionTree"; coverage=true)'
11+
- julia -e 'import Pkg; Pkg.clone(pwd()); Pkg.build("DecisionTree"); Pkg.test("DecisionTree"; coverage=true)'
1312
after_success:
14-
- julia -e 'cd(Pkg.dir("DecisionTree")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
13+
- julia -e 'import Pkg; cd(Pkg.dir("DecisionTree")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'

src/DecisionTree.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ depth(leaf::Leaf) = 0
7171
depth(tree::Node) = 1 + max(depth(tree.left), depth(tree.right))
7272

7373
function print_tree(leaf::Leaf, depth=-1, indent=0)
74-
matches = find(leaf.values .== leaf.majority)
74+
matches = findall(leaf.values .== leaf.majority)
7575
ratio = string(length(matches)) * "/" * string(length(leaf.values))
7676
println("$(leaf.majority) : $(ratio)")
7777
end

src/classification/main.jl

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# Utilities
22

3+
34
include("tree.jl")
5+
import Distributed
6+
import Random
47

58
# Returns a dict ("Label1" => 1, "Label2" => 2, "Label3" => 3, ...)
6-
label_index(labels) = Dict([Pair(v => k) for (k, v) in enumerate(labels)])
9+
label_index(labels) = Dict(v => k for (k, v) in enumerate(labels))
710

811
## Helper function. Counts the votes.
912
## Returns a vector of probabilities (eg. [0.2, 0.6, 0.2]) which is in the same
@@ -26,7 +29,7 @@ end
2629
function stack_function_results(row_fun::Function, X::Matrix)
2730
N = size(X, 1)
2831
N_cols = length(row_fun(X[1, :])) # gets the number of columns
29-
out = Array{Float64}(N, N_cols)
32+
out = Array{Float64}(undef, N, N_cols)
3033
for i in 1:N
3134
out[i, :] = row_fun(X[i, :])
3235
end
@@ -53,7 +56,7 @@ function _split_neg_z1_loss(labels::Vector, features::Matrix, weights::Vector)
5356
end
5457

5558
function build_stump(labels::Vector, features::Matrix, weights=[0];
56-
rng=Base.GLOBAL_RNG)
59+
rng=Random.GLOBAL_RNG)
5760
if weights == [0]
5861
return build_tree(labels, features, 0, 1)
5962
end
@@ -72,8 +75,8 @@ end
7275

7376
function build_tree(labels::Vector, features::Matrix, n_subfeatures=0, max_depth=-1,
7477
min_samples_leaf=1, min_samples_split=2, min_purity_increase=0.0;
75-
rng=Base.GLOBAL_RNG)
76-
rng = mk_rng(rng)::AbstractRNG
78+
rng=Random.GLOBAL_RNG)
79+
rng = mk_rng(rng)::Random.AbstractRNG
7780
if max_depth < -1
7881
error("Unexpected value for max_depth: $(max_depth) (expected: max_depth >= 0, or max_depth = -1 for infinite depth)")
7982
end
@@ -121,7 +124,7 @@ function prune_tree(tree::LeafOrNode, purity_thresh=1.0)
121124
elseif N == 2 ## a stump
122125
all_labels = [tree.left.values; tree.right.values]
123126
majority = majority_vote(all_labels)
124-
matches = find(all_labels .== majority)
127+
matches = findall(all_labels .== majority)
125128
purity = length(matches) / length(all_labels)
126129
if purity >= purity_thresh
127130
return Leaf(majority, all_labels)
@@ -156,7 +159,7 @@ end
156159

157160
function apply_tree(tree::LeafOrNode, features::Matrix)
158161
N = size(features,1)
159-
predictions = Array{Any}(N)
162+
predictions = Array{Any}(undef, N)
160163
for i in 1:N
161164
predictions[i] = apply_tree(tree, features[i, :])
162165
end
@@ -191,12 +194,13 @@ end
191194
apply_tree_proba(tree::LeafOrNode, features::Matrix, labels) =
192195
stack_function_results(row->apply_tree_proba(tree, row, labels), features)
193196

194-
function build_forest(labels::Vector, features::Matrix, n_subfeatures=0, n_trees=10, partial_sampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
195-
rng = mk_rng(rng)::AbstractRNG
197+
function build_forest(labels::Vector, features::Matrix, n_subfeatures=0, n_trees=10, partial_sampling=0.7, max_depth=-1; rng=Random.GLOBAL_RNG)
198+
rng = mk_rng(rng)::Random.AbstractRNG
196199
partial_sampling = partial_sampling > 1.0 ? 1.0 : partial_sampling
197200
Nlabels = length(labels)
198201
Nsamples = _int(partial_sampling * Nlabels)
199-
forest = @parallel (vcat) for i in 1:n_trees
202+
203+
forest = @Distributed.distributed (vcat) for i in 1:n_trees
200204
inds = rand(rng, 1:Nlabels, Nsamples)
201205
build_tree(labels[inds], features[inds,:], n_subfeatures, max_depth;
202206
rng=rng)
@@ -206,7 +210,7 @@ end
206210

207211
function apply_forest(forest::Ensemble, features::Vector)
208212
n_trees = length(forest)
209-
votes = Array{Any}(n_trees)
213+
votes = Array{Any}(undef, n_trees)
210214
for i in 1:n_trees
211215
votes[i] = apply_tree(forest.trees[i], features)
212216
end
@@ -219,7 +223,7 @@ end
219223

220224
function apply_forest(forest::Ensemble, features::Matrix)
221225
N = size(features,1)
222-
predictions = Array{Any}(N)
226+
predictions = Array{Any}(undef, N)
223227
for i in 1:N
224228
predictions[i] = apply_forest(forest, features[i, :])
225229
end
@@ -247,7 +251,7 @@ apply_forest_proba(forest::Ensemble, features::Matrix, labels) =
247251
stack_function_results(row->apply_forest_proba(forest, row, labels),
248252
features)
249253

250-
function build_adaboost_stumps(labels::Vector, features::Matrix, n_iterations::Integer; rng=Base.GLOBAL_RNG)
254+
function build_adaboost_stumps(labels::Vector, features::Matrix, n_iterations::Integer; rng=Random.GLOBAL_RNG)
251255
N = length(labels)
252256
weights = ones(N) / N
253257
stumps = Node[]
@@ -290,7 +294,7 @@ end
290294

291295
function apply_adaboost_stumps(stumps::Ensemble, coeffs::Vector{Float64}, features::Matrix)
292296
N = size(features,1)
293-
predictions = Array{Any}(N)
297+
predictions = Array{Any}(undef, N)
294298
for i in 1:N
295299
predictions[i] = apply_adaboost_stumps(stumps, coeffs, features[i,:])
296300
end

src/classification/tree.jl

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
module treeclassifier
88
include("../util.jl")
9+
import Random
910

1011
export fit
1112

@@ -57,7 +58,7 @@ module treeclassifier
5758
ncr :: Array{Int64}, # ncr maintains the counts of labels on the right
5859
Xf :: Array{T},
5960
Yf :: Array{Int64},
60-
rng :: AbstractRNG) where T <: Any
61+
rng :: Random.AbstractRNG) where T <: Any
6162
region = node.region
6263
n_samples = length(region)
6364
r_start = region.start - 1
@@ -69,7 +70,7 @@ module treeclassifier
6970
@inbounds nc[Y[indX[i]]] += 1
7071
end
7172

72-
node.label = indmax(nc)
73+
node.label = argmax(nc)
7374

7475
if (min_samples_leaf * 2 > n_samples
7576
|| min_samples_split > n_samples
@@ -96,7 +97,6 @@ module treeclassifier
9697
# only sample n_features used features
9798
# is a hypergeometric random variable
9899
total_features = size(X, 2)
99-
100100
# this is the total number of features that we expect to not
101101
# be one of the known constant features. since we know exactly
102102
# what the non constant features are, we can sample at 'non_constants_used'
@@ -127,6 +127,7 @@ module treeclassifier
127127
nl, nr = 0, n_samples
128128
lo, hi = 0, 0
129129
is_constant = true
130+
last_f = Xf[1]
130131
while hi < n_samples
131132
lo = hi + 1
132133
curr_f = Xf[lo]
@@ -279,7 +280,7 @@ module treeclassifier
279280
label_dict[label_list[i]] = i
280281
end
281282

282-
_Y = Array{Int64}(length(Y))
283+
_Y = Array{Int64}(undef, length(Y))
283284
@inbounds for i in 1:length(Y)
284285
_Y[i] = label_dict[Y[i]]
285286
end
@@ -294,7 +295,7 @@ module treeclassifier
294295
min_samples_leaf :: Int64,
295296
min_samples_split :: Int64,
296297
min_purity_increase :: Float64;
297-
rng=Base.GLOBAL_RNG :: AbstractRNG) where T <: Any
298+
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where T <: Any
298299
n_samples, n_features = size(X)
299300
label_list, _Y = assign(Y)
300301
n_classes = Int64(length(label_list))
@@ -312,11 +313,11 @@ module treeclassifier
312313
end
313314
stack = NodeMeta[ tree.root ]
314315

315-
nc = Array{Int64}(n_classes)
316-
ncl = Array{Int64}(n_classes)
317-
ncr = Array{Int64}(n_classes)
318-
Xf = Array{T}(n_samples)
319-
Yf = Array{Int64}(n_samples)
316+
nc = Array{Int64}(undef, n_classes)
317+
ncl = Array{Int64}(undef, n_classes)
318+
ncr = Array{Int64}(undef, n_classes)
319+
Xf = Array{T}(undef, n_samples)
320+
Yf = Array{Int64}(undef, n_samples)
320321
@inbounds while length(stack) > 0
321322
node = pop!(stack)
322323
_split!(

src/measures.jl

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
using LinearAlgebra
2+
using Random
3+
14
struct ConfusionMatrix
25
classes::Vector
36
matrix::Matrix{Int}
@@ -16,24 +19,24 @@ function show(io::IO, cm::ConfusionMatrix)
1619
show(io, cm.kappa)
1720
end
1821

19-
function _hist_add!{T}(counts::Dict{T,Int}, labels::Vector{T}, region::UnitRange{Int})
22+
function _hist_add!(counts::Dict{T, Int}, labels::Vector{T}, region::UnitRange{Int}) where T
2023
for i in region
2124
lbl = labels[i]
2225
counts[lbl] = get(counts, lbl, 0) + 1
2326
end
2427
return counts
2528
end
2629

27-
_hist{T}(labels::Vector{T}, region::UnitRange{Int} = 1:endof(labels)) =
30+
_hist(labels::Vector{T}, region::UnitRange{Int} = 1:lastindex(labels)) where T =
2831
_hist_add!(Dict{T,Int}(), labels, region)
2932

30-
function _neg_z1_loss{T<:Real}(labels::Vector, weights::Vector{T})
33+
function _neg_z1_loss(labels::Vector, weights::Vector{T}) where T <: Real
3134
missmatches = labels .!= majority_vote(labels)
3235
loss = sum(weights[missmatches])
3336
return -loss
3437
end
3538

36-
function _weighted_error{T<:Real}(actual::Vector, predicted::Vector, weights::Vector{T})
39+
function _weighted_error(actual::Vector, predicted::Vector, weights::Vector{T}) where T <: Real
3740
mismatches = actual .!= predicted
3841
err = sum(weights[mismatches]) / sum(weights)
3942
return err
@@ -65,15 +68,15 @@ function confusion_matrix(actual::Vector, predicted::Vector)
6568
classes = sort(unique([actual; predicted]))
6669
N = length(classes)
6770
for i in 1:N
68-
_actual[actual .== classes[i]] = i
69-
_predicted[predicted .== classes[i]] = i
71+
_actual[actual .== classes[i]] .= i
72+
_predicted[predicted .== classes[i]] .= i
7073
end
7174
CM = zeros(Int,N,N)
7275
for i in zip(_actual, _predicted)
7376
CM[i[1],i[2]] += 1
7477
end
75-
accuracy = trace(CM) / sum(CM)
76-
prob_chance = (sum(CM,1) * sum(CM,2))[1] / sum(CM)^2
78+
accuracy = LinearAlgebra.tr(CM) / sum(CM)
79+
prob_chance = (sum(CM,dims=1) * sum(CM,dims=2))[1] / sum(CM)^2
7780
kappa = (accuracy - prob_chance) / (1.0 - prob_chance)
7881
return ConfusionMatrix(classes, CM, accuracy, kappa)
7982
end
@@ -94,11 +97,11 @@ function _nfoldCV(classifier::Symbol, labels, features, args...)
9497
end
9598
N = length(labels)
9699
ntest = _int(floor(N / nfolds))
97-
inds = randperm(N)
100+
inds = Random.randperm(N)
98101
accuracy = zeros(nfolds)
99102
for i in 1:nfolds
100103
test_inds = falses(N)
101-
test_inds[(i - 1) * ntest + 1 : i * ntest] = true
104+
test_inds[(i - 1) * ntest + 1 : i * ntest] .= true
102105
train_inds = (!).(test_inds)
103106
test_features = features[inds[test_inds],:]
104107
test_labels = labels[inds[test_inds]]
@@ -144,7 +147,29 @@ function R2(actual, predicted)
144147
return 1.0 - ss_residual/ss_total
145148
end
146149

147-
function _nfoldCV{T<:Float64}(regressor::Symbol, labels::Vector{T}, features::Matrix, args...)
150+
# Pearson's Correlation Coefficient
151+
function cor(x, y)
152+
@assert(length(x) == length(y))
153+
@assert(length(x) > 1)
154+
155+
n = length(x)
156+
157+
x_mean = sum(x) / n
158+
y_mean = sum(y) / n
159+
160+
x_centered = x .- x_mean
161+
y_centered = y .- y_mean
162+
163+
x_var = sum(x_centered .^ 2)
164+
y_var = sum(y_centered .^ 2)
165+
166+
xy_cov = sum(x_centered .* y_centered)
167+
168+
return xy_cov / sqrt(x_var * y_var)
169+
170+
end
171+
172+
function _nfoldCV(regressor::Symbol, labels::Vector{T}, features::Matrix, args...) where T <: Float64
148173
nfolds = args[end]
149174
if nfolds < 2
150175
return nothing
@@ -159,11 +184,11 @@ function _nfoldCV{T<:Float64}(regressor::Symbol, labels::Vector{T}, features::Ma
159184
end
160185
N = length(labels)
161186
ntest = _int(floor(N / nfolds))
162-
inds = randperm(N)
187+
inds = Random.randperm(N)
163188
R2s = zeros(nfolds)
164189
for i in 1:nfolds
165190
test_inds = falses(N)
166-
test_inds[(i - 1) * ntest + 1 : i * ntest] = true
191+
test_inds[(i - 1) * ntest + 1 : i * ntest] .= true
167192
train_inds = (!).(test_inds)
168193
test_features = features[inds[test_inds],:]
169194
test_labels = labels[inds[test_inds]]
@@ -189,7 +214,10 @@ function _nfoldCV{T<:Float64}(regressor::Symbol, labels::Vector{T}, features::Ma
189214
return R2s
190215
end
191216

192-
nfoldCV_tree{T<:Float64}(labels::Vector{T}, features::Matrix, nfolds::Integer, maxlabels::Integer=5) = _nfoldCV(:tree, labels, features, maxlabels, nfolds)
193-
nfoldCV_forest{T<:Float64}(labels::Vector{T}, features::Matrix, n_subfeatures::Integer, n_trees::Integer, nfolds::Integer, maxlabels::Integer=5, partial_sampling=0.7) = _nfoldCV(:forest, labels, features, n_subfeatures, n_trees, maxlabels, partial_sampling, nfolds)
217+
nfoldCV_tree(labels::Vector{T}, features::Matrix, nfolds::Integer, maxlabels::Integer = 5) where T <: Float64 =
218+
_nfoldCV(:tree, labels, features, maxlabels, nfolds)
219+
220+
nfoldCV_forest(labels::Vector{T}, features::Matrix, n_subfeatures::Integer, n_trees::Integer, nfolds::Integer, maxlabels::Integer = 5, partial_sampling = 0.7) where T <: Float64 =
221+
_nfoldCV(:forest, labels, features, n_subfeatures, n_trees, maxlabels, partial_sampling, nfolds)
194222

195223

src/regression/bench.jl

Whitespace-only changes.

src/regression/main.jl

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
11
include("tree.jl")
2+
import Random
3+
import Distributed
24

35
# Convenience functions - make a Random Number Generator object
4-
mk_rng(rng::AbstractRNG) = rng
5-
mk_rng(seed::Int) = MersenneTwister(seed)
6+
mk_rng(rng::Random.AbstractRNG) = rng
7+
mk_rng(seed::Int) = Random.MersenneTwister(seed)
68

7-
function build_stump{T<:Float64}(labels::Vector{T}, features::Matrix; rng=Base.GLOBAL_RNG)
9+
function build_stump(labels::Vector{T}, features::Matrix; rng = Random.GLOBAL_RNG) where T <: Float64
810
return build_tree(labels, features, 1, 0, 1)
911
end
1012

11-
function build_tree{T<:Float64}(
12-
labels::Vector{T}, features::Matrix, min_samples_leaf=5, n_subfeatures=0,
13-
max_depth=-1, min_samples_split=2, min_purity_increase=0.0;
14-
rng=Base.GLOBAL_RNG)
15-
rng = mk_rng(rng)::AbstractRNG
13+
function build_tree(
14+
labels::Vector{T}, features::Matrix, min_samples_leaf = 5,
15+
n_subfeatures = 0, max_depth = -1, min_samples_split = 2,
16+
min_purity_increase = 0.0; rng = Random.GLOBAL_RNG) where T <: Float64
17+
rng = mk_rng(rng)::Random.AbstractRNG
1618
if max_depth < -1
1719
error("Unexpected value for max_depth: $(max_depth) (expected: max_depth >= 0, or max_depth = -1 for infinite depth)")
1820
end
@@ -42,12 +44,15 @@ function build_tree{T<:Float64}(
4244
return _convert(t)
4345
end
4446

45-
function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, n_subfeatures=0, n_trees=10, min_samples_leaf=5, partial_sampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
46-
rng = mk_rng(rng)::AbstractRNG
47+
function build_forest(
48+
labels::Vector{T}, features::Matrix, n_subfeatures = 0, n_trees = 10,
49+
min_samples_leaf = 5, partial_sampling = 0.7, max_depth = -1;
50+
rng = Random.GLOBAL_RNG) where T <: Float64
51+
rng = mk_rng(rng)::Random.AbstractRNG
4752
partial_sampling = partial_sampling > 1.0 ? 1.0 : partial_sampling
4853
Nlabels = length(labels)
4954
Nsamples = _int(partial_sampling * Nlabels)
50-
forest = @parallel (vcat) for i in 1:n_trees
55+
forest = @Distributed.distributed (vcat) for i in 1:n_trees
5156
inds = rand(rng, 1:Nlabels, Nsamples)
5257
build_tree(labels[inds], features[inds,:], min_samples_leaf, n_subfeatures, max_depth; rng=rng)
5358
end

0 commit comments

Comments
 (0)