From c85116a1e2bb42fdd81bfa5123317352823ba24b Mon Sep 17 00:00:00 2001 From: PGS62 Date: Sat, 30 Mar 2024 10:52:18 +0000 Subject: [PATCH 01/24] Rewrite pairwise, corkendall and corspearman --- .vscode/settings.json | 1 + src/pairwise.jl | 327 +++++++++++------ src/rankcorr.jl | 744 ++++++++++++++++++++++++++++++-------- test/pairwise.jl | 285 +++++++++------ test/rankcorr.jl | 519 ++++++++++++++++++-------- test/rankcorrspeedtest.jl | 98 +++++ 6 files changed, 1449 insertions(+), 525 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 test/rankcorrspeedtest.jl diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/src/pairwise.jl b/src/pairwise.jl index de1d47544..d2775fe7e 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -1,25 +1,53 @@ -function _pairwise!(::Val{:none}, f, dest::AbstractMatrix, x, y, symmetric::Bool) - @inbounds for (i, xi) in enumerate(x), (j, yj) in enumerate(y) - symmetric && i > j && continue - - # For performance, diagonal is special-cased - if f === cor && eltype(dest) !== Union{} && i == j && xi === yj - # TODO: float() will not be needed after JuliaLang/Statistics.jl#61 - dest[i, j] = float(cor(xi)) - else - dest[i, j] = f(xi, yj) - end +function _pairwise!(::Val{:none}, f, dest::AbstractMatrix{V}, x, y, + symmetric::Bool) where {V} + + nr, nc = size(dest) + + # Swap x and y for more efficient threaded loop. + if nr < nc + dest′ = reshape(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:none), f, dest′, y, x, symmetric) + dest .= transpose(dest′) + return dest end - if symmetric - m, n = size(dest) - @inbounds for j in 1:n, i in (j+1):m - dest[i, j] = dest[j, i] + + #cor and friends are special-cased. + iscor = (f in (corkendall, corspearman, cor)) + (iscor || f == cov) && (symmetric = x === y) + #cov(x) is faster than cov(x, x) + (f == cov) && (f = ((x, y) -> x === y ? cov(x) : cov(x, y))) + + Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + for j in subset + for i = 1:(symmetric ? j : nc) + # For performance, diagonal is special-cased + if iscor && i==j && y[i] === x[j] && V !== Union{} + dest[j, i] = V === Missing ? missing : 1.0 + else + dest[j, i] = f(x[j], y[i]) + end + symmetric && (dest[i, j] = dest[j, i]) + end end end return dest end -function check_vectors(x, y, skipmissing::Symbol) +#Input validation for both pairwise and pairwise! +function check_vectors(x, y, skipmissing::Symbol, symmetric::Bool) + + if symmetric && x !== y + throw(ArgumentError("symmetric=true only makes sense passing " * + "a single set of variables (x === y)")) + end + + if !(skipmissing in (:none, :pairwise, :listwise)) + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) + end + + #When skipmissing is :none, elements of x/y can have unequal length. + skipmissing == :none && return + m = length(x) n = length(y) if !(all(xi -> xi isa AbstractVector, x) && all(yi -> yi isa AbstractVector, y)) @@ -46,76 +74,59 @@ function check_vectors(x, y, skipmissing::Symbol) end end -function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix, x, y, symmetric::Bool) - check_vectors(x, y, :pairwise) - @inbounds for (j, yj) in enumerate(y) - ynminds = .!ismissing.(yj) - @inbounds for (i, xi) in enumerate(x) - symmetric && i > j && continue +function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetric::Bool) where {V} - if xi === yj - ynm = view(yj, ynminds) - # For performance, diagonal is special-cased - if f === cor && eltype(dest) !== Union{} && i == j - # TODO: float() will not be needed after JuliaLang/Statistics.jl#61 - dest[i, j] = float(cor(xi)) + nr, nc = size(dest) + m = length(x) == 0 ? 0 : length(first(x)) + + # Swap x and y for more efficient threaded loop. + if nr < nc + dest′ = reshape(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:pairwise), f, dest′, y, x, symmetric) + dest .= transpose(dest′) + return dest + end + + #cor and friends are special-cased. + iscor = (f in (corkendall, corspearman, cor)) + (iscor || f == cov) && (symmetric = x === y) + #cov(x) is faster than cov(x, x) + (f == cov) && (f = ((x, y) -> x === y ? cov(x) : cov(x, y))) + + nmtx = promoted_nmtype(x)[] + nmty = promoted_nmtype(y)[] + + Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + for j in subset + scratch_fx = task_local_vector(:scratch_fx, nmtx, m) + scratch_fy = task_local_vector(:scratch_fy, nmty, m) + for i = 1:(symmetric ? j : nc) + if iscor && i == j && y[i] === x[j] && V !== Union{} && V!== Missing + dest[j, i] = 1.0 else - dest[i, j] = f(ynm, ynm) + dest[j, i] = f(handle_pairwise(x[j], y[i]; scratch_fx, scratch_fy)...) end - else - nminds = .!ismissing.(xi) .& ynminds - xnm = view(xi, nminds) - ynm = view(yj, nminds) - dest[i, j] = f(xnm, ynm) + symmetric && (dest[i, j] = dest[j, i]) end end end - if symmetric - m, n = size(dest) - @inbounds for j in 1:n, i in (j+1):m - dest[i, j] = dest[j, i] - end - end return dest end function _pairwise!(::Val{:listwise}, f, dest::AbstractMatrix, x, y, symmetric::Bool) - check_vectors(x, y, :listwise) - nminds = .!ismissing.(first(x)) - @inbounds for xi in Iterators.drop(x, 1) - nminds .&= .!ismissing.(xi) - end - if x !== y - @inbounds for yj in y - nminds .&= .!ismissing.(yj) - end - end - - # Computing integer indices once for all vectors is faster - nminds′ = findall(nminds) - # TODO: check whether wrapping views in a custom array type which asserts - # that entries cannot be `missing` (similar to `skipmissing`) - # could offer better performance - return _pairwise!(Val(:none), f, dest, - [view(xi, nminds′) for xi in x], - [view(yi, nminds′) for yi in y], - symmetric) + return _pairwise!(Val(:none), f, dest, handle_listwise(x, y)..., symmetric) end -function _pairwise!(f, dest::AbstractMatrix, x, y; - symmetric::Bool=false, skipmissing::Symbol=:none) - if !(skipmissing in (:none, :pairwise, :listwise)) - throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) - end +function _pairwise!(f, dest::AbstractMatrix, x, y; symmetric::Bool=false, + skipmissing::Symbol=:none) - x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x) - y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? y : collect(y) + x′ = x isa Union{AbstractArray,Tuple,NamedTuple} ? x : collect(x) + y′ = y isa Union{AbstractArray,Tuple,NamedTuple} ? y : collect(y) m = length(x′) n = length(y′) size(dest) != (m, n) && throw(DimensionMismatch("dest has dimensions $(size(dest)) but expected ($m, $n)")) - Base.has_offset_axes(dest) && throw("dest indices must start at 1") return _pairwise!(Val(skipmissing), f, dest, x′, y′, symmetric) @@ -134,7 +145,7 @@ end # Identical to `Base.promote_typejoin` except that it uses `promote_type` # instead of `typejoin` to combine members of `Union` types -function promote_type_union(::Type{T}) where T +function promote_type_union(::Type{T}) where {T} if T === Union{} return Union{} elseif T isa UnionAll @@ -149,14 +160,14 @@ function promote_type_union(::Type{T}) where T end function _pairwise(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmissing} - x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x) - y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? y : collect(y) + x′ = x isa Union{AbstractArray,Tuple,NamedTuple} ? x : collect(x) + y′ = y isa Union{AbstractArray,Tuple,NamedTuple} ? y : collect(y) m = length(x′) n = length(y′) - T = Core.Compiler.return_type(f, Tuple{eltype(x′), eltype(y′)}) - Tsm = Core.Compiler.return_type((x, y) -> f(disallowmissing(x), disallowmissing(y)), - Tuple{eltype(x′), eltype(y′)}) + T = Core.Compiler.return_type(f, Tuple{eltype(x′),eltype(y′)}) + Tsm = Core.Compiler.return_type((x, y) -> f(handle_pairwise(x, y)...), + Tuple{eltype(x′),eltype(y′)}) if skipmissing === :none dest = Matrix{T}(undef, m, n) @@ -178,7 +189,7 @@ function _pairwise(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmiss # but using `promote_type` rather than `promote_typejoin`) U = mapreduce(typeof, promote_type, dest) # V is inferred (contrary to U), but it only gives an upper bound for U - V = promote_type_union(Union{T, Tsm}) + V = promote_type_union(Union{T,Tsm}) return convert(Matrix{U}, dest)::Matrix{<:V} end end @@ -193,15 +204,15 @@ entries in `x` and columns to entries in `y`, and `dest` must therefore be of size `length(x) × length(y)`. If `y` is omitted then `x` is crossed with itself. -As a special case, if `f` is `cor`, diagonal cells for which entries -from `x` and `y` are identical (according to `===`) are set to one even -in the presence `missing`, `NaN` or `Inf` entries. +As a special case, if `f` is `cor`, `corspearman` or `corkendall`, diagonal cells for +which entries from `x` and `y` are identical (according to `===`) are set to one even in the +presence `missing`, `NaN` or `Inf` entries. # Keyword arguments - `symmetric::Bool=false`: If `true`, `f` is only called to compute for the lower triangle of the matrix, and these values are copied - to fill the upper triangle. Only allowed when `y` is omitted. - Defaults to `true` when `f` is `cor` or `cov`. + to fill the upper triangle. Only allowed when `y` is omitted and ignored (taken as `true`) + if `f` is `cov`, `cor`, `corkendall` or `corspearman`. - `skipmissing::Symbol=:none`: If `:none` (the default), missing values in inputs are passed to `f` without any modification. Use `:pairwise` to skip entries with a `missing` value in either @@ -245,11 +256,7 @@ julia> dest """ function pairwise!(f, dest::AbstractMatrix, x, y=x; symmetric::Bool=false, skipmissing::Symbol=:none) - if symmetric && x !== y - throw(ArgumentError("symmetric=true only makes sense passing " * - "a single set of variables (x === y)")) - end - + check_vectors(x, y, skipmissing, symmetric) return _pairwise!(f, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) end @@ -262,15 +269,15 @@ of entries in iterators `x` and `y`. Rows correspond to entries in `x` and columns to entries in `y`. If `y` is omitted then a square matrix crossing `x` with itself is returned. -As a special case, if `f` is `cor`, diagonal cells for which entries -from `x` and `y` are identical (according to `===`) are set to one even -in the presence `missing`, `NaN` or `Inf` entries. +As a special case, if `f` is `cor`, `corspearman` or `corkendall`, diagonal cells for +which entries from `x` and `y` are identical (according to `===`) are set to one even in the +presence `missing`, `NaN` or `Inf` entries. # Keyword arguments - `symmetric::Bool=false`: If `true`, `f` is only called to compute for the lower triangle of the matrix, and these values are copied - to fill the upper triangle. Only allowed when `y` is omitted. - Defaults to `true` when `f` is `cor` or `cov`. + to fill the upper triangle. Only allowed when `y` is omitted and ignored (taken as `true`) + if `f` is `cov`, `cor`, `corkendall` or `corspearman`. - `skipmissing::Symbol=:none`: If `:none` (the default), missing values in inputs are passed to `f` without any modification. Use `:pairwise` to skip entries with a `missing` value in either @@ -307,34 +314,130 @@ julia> pairwise(cor, eachcol(y), skipmissing=:pairwise) ``` """ function pairwise(f, x, y=x; symmetric::Bool=false, skipmissing::Symbol=:none) - if symmetric && x !== y - throw(ArgumentError("symmetric=true only makes sense passing " * - "a single set of variables (x === y)")) - end - + check_vectors(x, y, skipmissing, symmetric) return _pairwise(Val(skipmissing), f, x, y, symmetric) end -# cov(x) is faster than cov(x, x) -_cov(x, y) = x === y ? cov(x) : cov(x, y) +# Auxiliary functions for pairwise + +promoted_type(x) = mapreduce(eltype, promote_type, x, init=Union{}) +promoted_nmtype(x) = mapreduce(nonmissingtype ∘ eltype, promote_type, x, init=Union{}) + +""" + handle_listwise(x, y) + +Remove missings in a listwise manner. Assumes `x` and `y` are vectors of iterables which +have been validated via `check_vectors`. + +## Example +```julia-repl +julia> a = [6,7,8,9,10,missing];b = [4,5,6,7,missing,8];c = [2,3,4,missing,5,6];d = [1,2,3,4,5,6]; + +julia> StatsBase.handle_listwise([a,b],[c,d]) +([[6, 7, 8], [4, 5, 6]], [[2, 3, 4], [1, 2, 3]]) +``` +""" +function handle_listwise(x, y) + if !(missing isa promoted_type(x) || missing isa promoted_type(y)) + return x, y + end + + nminds = .!ismissing.(first(x)) + @inbounds for xi in Iterators.drop(x, 1) + nminds .&= .!ismissing.(xi) + end + if x !== y + @inbounds for yj in y + nminds .&= .!ismissing.(yj) + end + end + + # Computing integer indices once for all vectors is faster + nminds′ = findall(nminds) + # TODO: check whether wrapping views in a custom array type which asserts + # that entries cannot be `missing` (similar to `skipmissing`) + # could offer better performance -pairwise!(::typeof(cov), dest::AbstractMatrix, x, y; - symmetric::Bool=false, skipmissing::Symbol=:none) = - pairwise!(_cov, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) + x′ = [disallowmissing(view(xi, nminds′)) for xi in x] + if x === y + return x′, x′ + else + y′ = [disallowmissing(view(yi, nminds′)) for yi in y] + return x′, y′ + end +end -pairwise(::typeof(cov), x, y; symmetric::Bool=false, skipmissing::Symbol=:none) = - pairwise(_cov, x, y, symmetric=symmetric, skipmissing=skipmissing) +""" + handle_pairwise(x::AbstractVector, y::AbstractVector; + scratch_fx::AbstractVector=similar(x, nonmissingtype(eltype(x))), + scratch_fy::AbstractVector=similar(y, nonmissingtype(eltype(y)))) -pairwise!(::typeof(cov), dest::AbstractMatrix, x; - symmetric::Bool=true, skipmissing::Symbol=:none) = - pairwise!(_cov, dest, x, x, symmetric=symmetric, skipmissing=skipmissing) +Return a pair `(a,b)`, filtered copies of `(x,y)`, in which elements `x[i]` and +`y[i]` are excluded if `ismissing(x[i])||ismissing(y[i])`. +""" +function handle_pairwise(x::AbstractVector, y::AbstractVector; + scratch_fx::AbstractVector=similar(x, nonmissingtype(eltype(x))), + scratch_fy::AbstractVector=similar(y, nonmissingtype(eltype(y)))) + + axes(x) == axes(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) + lb = first(axes(x, 1)) + j = lb - 1 + @inbounds for i in eachindex(x) + if !(ismissing(x[i]) || ismissing(y[i])) + j += 1 + scratch_fx[j] = x[i] + scratch_fy[j] = y[i] + end + end + + return view(scratch_fx, lb:j), view(scratch_fy, lb:j) +end -pairwise(::typeof(cov), x; symmetric::Bool=true, skipmissing::Symbol=:none) = - pairwise(_cov, x, x, symmetric=symmetric, skipmissing=skipmissing) +#=Condition a) makes equal_sum_subsets useful for load-balancing the multi-threaded section +of _pairwise! in the non-symmetric case, and condition b) for the symmetric case.=# +""" + equal_sum_subsets(n::Int, num_subsets::Int)::Vector{Vector{Int}} + +Divide the integers 1:n into a number of subsets such that a) each subset has +(approximately) the same number of elements; and b) the sum of the elements in each subset +is nearly equal. If `n` is a multiple of `2 * num_subsets` both conditions hold exactly. + +## Example +```julia-repl +julia> StatsBase.equal_sum_subsets(30,5) +5-element Vector{Vector{Int64}}: + [30, 21, 20, 11, 10, 1] + [29, 22, 19, 12, 9, 2] + [28, 23, 18, 13, 8, 3] + [27, 24, 17, 14, 7, 4] + [26, 25, 16, 15, 6, 5] +``` +""" +function equal_sum_subsets(n::Int, num_subsets::Int)::Vector{Vector{Int}} + subsets = [Int[] for _ in 1:min(n, num_subsets)] + writeto, scanup = 1, true + for i = n:-1:1 + push!(subsets[writeto], i) + if scanup && writeto == num_subsets + scanup = false + elseif (!scanup) && writeto == 1 + scanup = true + else + writeto += scanup ? 1 : -1 + end + end + return subsets +end -pairwise!(::typeof(cor), dest::AbstractMatrix, x; - symmetric::Bool=true, skipmissing::Symbol=:none) = - pairwise!(cor, dest, x, x, symmetric=symmetric, skipmissing=skipmissing) +""" + task_local_vector(key::Symbol, similarto::AbstractArray{V}, + length::Int)::Vector{V} where {V} -pairwise(::typeof(cor), x; symmetric::Bool=true, skipmissing::Symbol=:none) = - pairwise(cor, x, x, symmetric=symmetric, skipmissing=skipmissing) +Retrieve from task local storage a vector of length `length` and matching the element +type of `similarto`, with initialisation on first call during a task. +""" +function task_local_vector(key::Symbol, similarto::AbstractArray{V}, + length::Int)::Vector{V} where {V} + haskey(task_local_storage(), key) || task_local_storage(key, similar(similarto, length)) + return task_local_storage(key) +end \ No newline at end of file diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 95c5cb03e..1d5b3871a 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -11,110 +11,385 @@ ####################################### """ - corspearman(x, y=x) + corspearman(x, y=x; skipmissing::Symbol=:none) Compute Spearman's rank correlation coefficient. If `x` and `y` are vectors, the output is a float, otherwise it's a matrix corresponding to the pairwise correlations of the columns of `x` and `y`. + +Uses multiple threads when either `x` or `y` is a matrix and `skipmissing` is `:pairwise`. + +# Keyword argument + +- `skipmissing::Symbol=:none`: If `:none` (the default), `missing` entries in `x` or `y` + give rise to `missing` entries in the return. If `:pairwise` when calculating an element + of the return, both `i`th entries of the input vectors are skipped if either is missing. + If `:listwise` the `i`th rows of both `x` and `y` are skipped if `missing` appears in + either; note that this might skip a high proportion of entries. Only allowed when `x` or + `y` is a matrix. """ -function corspearman(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}) - n = length(x) - n == length(y) || throw(DimensionMismatch("vectors must have same length")) - (any(isnan, x) || any(isnan, y)) && return NaN - return cor(tiedrank(x), tiedrank(y)) +function corspearman(x::AbstractVector, y::AbstractVector; skipmissing::Symbol=:none) + check_rankcor_args(x, y, skipmissing, false) + if x === y + return corspearman(x) + else + return corspearman_kernel!(x, y, skipmissing) + end end -function corspearman(X::AbstractMatrix{<:Real}, y::AbstractVector{<:Real}) - size(X, 1) == length(y) || - throw(DimensionMismatch("X and y have inconsistent dimensions")) - n = size(X, 2) - C = Matrix{Float64}(I, n, 1) - any(isnan, y) && return fill!(C, NaN) - yrank = tiedrank(y) - for j = 1:n - Xj = view(X, :, j) - if any(isnan, Xj) - C[j,1] = NaN - else - Xjrank = tiedrank(Xj) - C[j,1] = cor(Xjrank, yrank) +function corspearman(x::AbstractMatrix, y::AbstractVector; skipmissing::Symbol=:none) + return corspearman(x, reshape(y, (length(y), 1)); skipmissing) +end + +function corspearman(x::AbstractVector, y::AbstractMatrix; skipmissing::Symbol=:none) + return corspearman(reshape(x, (length(x), 1)), y; skipmissing) +end + +function corspearman(x::AbstractMatrix, y::AbstractMatrix=x; + skipmissing::Symbol=:none) + check_rankcor_args(x, y, skipmissing, true) + return pairwise(corspearman, eachcol(x), eachcol(y); skipmissing) +end + +function corspearman(x::AbstractVector{T}) where {T} + return T === Missing ? missing : 1.0 +end + +function _pairwise!(::Val{:listwise}, f::typeof(corspearman), dest::AbstractMatrix, x, y, + symmetric::Bool) + return _pairwise!(Val(:none), f, dest, handle_listwise(x, y)..., symmetric) +end + +function _pairwise!(::Val{:none}, f::typeof(corspearman), + dest::AbstractMatrix, x, y, symmetric::Bool) + + symmetric = x === y + if symmetric && promoted_type(x) === Missing + offdiag = (length(x[1]) < 2) ? NaN : missing + @inbounds for i in axes(dest, 1), j in axes(dest, 2) + dest[i, j] = i == j ? missing : offdiag end + return dest + end + + ranksx = ranks_matrix(x) + if symmetric + dest .= _cor(ranksx, ranksx) + else + ranksy = ranks_matrix(y) + dest .= _cor(ranksx, ranksy) + end + + #=When elements x[i] and y[i] are identical (according to `===`) then dest[i,i] should + be 1.0 even in the presence of missing and NaN values. But the return from `_cor` does + not respect that requirement. So amend.=# + autocor = (eltype(dest) === Missing && skipmissing == :none) ? missing : 1.0 + @inbounds for i in 1:min(size(dest, 1), size(dest, 2)) + x[i] === y[i] && (dest[i, i] = autocor) end - return C + return dest end -function corspearman(x::AbstractVector{<:Real}, Y::AbstractMatrix{<:Real}) - size(Y, 1) == length(x) || - throw(DimensionMismatch("x and Y have inconsistent dimensions")) - n = size(Y, 2) - C = Matrix{Float64}(I, 1, n) - any(isnan, x) && return fill!(C, NaN) - xrank = tiedrank(x) - for j = 1:n - Yj = view(Y, :, j) - if any(isnan, Yj) - C[1,j] = NaN - else - Yjrank = tiedrank(Yj) - C[1,j] = cor(xrank, Yjrank) +function _pairwise!(::Val{:pairwise}, f::typeof(corspearman), + dest::AbstractMatrix{V}, x, y, symmetric::Bool) where {V} + + nr, nc = size(dest) + m = length(x) == 0 ? 0 : length(first(x)) + symmetric = x === y + + # Swap x and y for more efficient threaded loop. + if nr < nc + dest′ = reshape(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:pairwise), f, dest′, y, x, symmetric) + dest .= transpose(dest′) + return dest + end + + tempx = sortperm_matrix(x) + tempy = symmetric ? tempx : sortperm_matrix(y) + + int64 = Int64[] + fl64 = Float64[] + nmtx = promoted_nmtype(x)[] + nmty = promoted_nmtype(y)[] + Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + + for j in subset + + inds = task_local_vector(:inds, int64, m) + spnmx = task_local_vector(:spnmx, int64, m) + spnmy = task_local_vector(:spnmy, int64, m) + nmx = task_local_vector(:nmx, nmtx, m) + nmy = task_local_vector(:nmy, nmty, m) + ranksx = task_local_vector(:ranksx, fl64, m) + ranksy = task_local_vector(:ranksy, fl64, m) + + for i = 1:(symmetric ? j : nc) + # For performance, diagonal is special-cased + if x[j] === y[i] && i == j && V !== Union{} + if missing isa V && eltype(x[j]) == Missing + dest[j, i] = missing + else + dest[j, i] = 1.0 + end + else + dest[j, i] = corspearman_kernel!(x[j], y[i], :pairwise, + view(tempx, :, j), view(tempy, :, i), inds, spnmx, spnmy, nmx, + nmy, ranksx, ranksy) + end + symmetric && (dest[i, j] = dest[j, i]) + end end end - return C + return dest end -function corspearman(X::AbstractMatrix{<:Real}) - n = size(X, 2) - C = Matrix{Float64}(I, n, n) - anynan = Vector{Bool}(undef, n) - for j = 1:n - Xj = view(X, :, j) - anynan[j] = any(isnan, Xj) - if anynan[j] - C[:,j] .= NaN - C[j,:] .= NaN - C[j,j] = 1 - continue - end - Xjrank = tiedrank(Xj) - for i = 1:(j-1) - Xi = view(X, :, i) - if anynan[i] - C[i,j] = C[j,i] = NaN +""" + corspearman_kernel!(x::AbstractVector{T}, y::AbstractVector{U}, + skipmissing::Symbol, sortpermx=sortperm(x), sortpermy=sortperm(y), + inds=zeros(Int64, length(x)), spnmx=zeros(Int64, length(x)), + spnmy=zeros(Int64, length(x)), nmx=similar(x, nonmissingtype(T)), + nmy=similar(y, nonmissingtype(U)), ranksx=similar(x, Float64), + ranksy=similar(y, Float64)) where {T,U} + +Compute Spearman's rank correlation coefficient between `x` and `y` with no allocations if +all arguments are provided. + +All vector arguments must have the same axes. +- `sortpermx`: The sort permutation of `x`. +- `sortpermy`: The sort permutation of `y`. +- `inds` ... `ranksy`: Pre-allocated "scratch" arguments. + +## Example +```julia-repl +julia> using StatsBase, BenchmarkTools, Random + +julia> Random.seed!(1); + +julia> x = ifelse.(rand(1000) .< 0.05,missing,randn(1000));y = ifelse.(rand(1000) .< 0.05,\ +missing,randn(1000)); + +julia> sortpermx=sortperm(x);sortpermy=sortperm(y);inds=zeros(Int64,1000);\ +spnmx=zeros(Int64,1000);spnmy=zeros(Int64,1000); + +julia> nmx=zeros(1000);nmy=zeros(1000);ranksx=similar(x,Float64);ranksy=similar(y,Float64); + +julia> @btime corspearman_kernel!(\$x,\$y,:pairwise,\$sortpermx,\$sortpermy,\ +\$inds,\$spnmx,\$spnmy,\$nmx,\$nmy,\$ranksx,\$ranksy) +4.671 μs (0 allocations: 0 bytes) +-0.016058512110609713 +``` +""" +function corspearman_kernel!(x::AbstractVector{T}, y::AbstractVector{U}, + skipmissing::Symbol, sortpermx=sortperm(x), sortpermy=sortperm(y), + inds=zeros(Int64, length(x)), spnmx=zeros(Int64, length(x)), + spnmy=zeros(Int64, length(x)), nmx=similar(x, nonmissingtype(T)), + nmy=similar(y, nonmissingtype(U)), ranksx=similar(x, Float64), + ranksy=similar(y, Float64)) where {T,U} + + (axes(x) == axes(sortpermx) == axes(y) == axes(sortpermy) == axes(inds) == + axes(spnmx) == axes(spnmy) == axes(nmx) == axes(nmy) == axes(ranksx) == + axes(ranksy)) || throw(ArgumentError("Axes of inputs must match")) + + if skipmissing == :pairwise + + lb = first(axes(x, 1)) + k = lb + #= Process (x,y) to obtain (nmx,nmy) by filtering out elements at position k if + either x[k] or y[k] is missing. inds provides the mapping of elements of x (or y) to + elements of nmx (or nmy) i.e. x[k] maps to nmx[inds[k]]. inds is then used to obtain + spnmx and spnmy more efficiently than calling sortperm(nmx) and sortperm(nmy). + =# + @inbounds for i in axes(x, 1) + if !(ismissing(x[i]) || ismissing(y[i])) + inds[i] = k + nmx[k] = x[i] + nmy[k] = y[i] + k += 1 else - Xirank = tiedrank(Xi) - C[i,j] = C[j,i] = cor(Xjrank, Xirank) + inds[i] = lb - 1 + end + end + + nnm = k - 1 + if nnm <= 1 + return NaN + end + nmx = view(nmx, lb:nnm) + nmy = view(nmy, lb:nnm) + + if any(_isnan, nmx) || any(_isnan, nmy) + return NaN + end + + k = lb + @inbounds for i in axes(x, 1) + if (inds[sortpermx[i]]) != lb - 1 + spnmx[k] = inds[sortpermx[i]] + k += 1 + end + end + spnmx = view(spnmx, lb:nnm) + + k = lb + @inbounds for i in axes(y, 1) + if (inds[sortpermy[i]]) != lb - 1 + spnmy[k] = inds[sortpermy[i]] + k += 1 end end + spnmy = view(spnmy, lb:nnm) + + if nnm <= 1 + return NaN + end + + _tiedrank!(view(ranksx, 1:nnm), nmx, spnmx) + _tiedrank!(view(ranksy, 1:nnm), nmy, spnmy) + + return cor(view(ranksx, 1:nnm), view(ranksy, 1:nnm)) + + else + if length(x) <= 1 + return NaN + elseif skipmissing == :none && (missing isa T || missing isa U) && + (any(ismissing, x) || any(ismissing, y)) + return missing + elseif any(_isnan, x) || any(_isnan, y) + return NaN + end + + _tiedrank!(ranksx, x, sortpermx) + _tiedrank!(ranksy, y, sortpermy) + return cor(ranksx, ranksy) end - return C end -function corspearman(X::AbstractMatrix{<:Real}, Y::AbstractMatrix{<:Real}) - size(X, 1) == size(Y, 1) || - throw(ArgumentError("number of rows in each array must match")) - nr = size(X, 2) - nc = size(Y, 2) - C = Matrix{Float64}(undef, nr, nc) - for j = 1:nr - Xj = view(X, :, j) - if any(isnan, Xj) - C[j,:] .= NaN - continue +# Auxiliary functions for Spearman's rank correlation + +""" + _cor(x, y) +Work-around various "unhelpful" features of cor: +a) Ensures that on-diagonal elements of the return are as they should be in the symmetric +case i.e. 1.0 unless eltype(x) is Missing in which case on-diagonal elements should be +missing. +b) Ensure that _cor(a,b) is NaN when a and b are vectors of equal length less than 2 +c) Works around some edge-case bugs in cor's handling of `missing` where the function throws +if `x` and `y` are matrices but nevertheless looping around the columns of `x` and `y` +works. https://github.com/JuliaStats/Statistics.jl/issues/63 + +# Example +```julia-repl +julia> x = y = fill(missing,2,2) +2×2 Matrix{Missing}: + missing missing + missing missing + +julia> Statistics.cor(x,y) +ERROR: MethodError: no method matching copy(::Missing) + +julia> StatsBase._cor(x,y) +2×2 Matrix{Union{Missing, Float64}}: + 1.0 missing + missing 1.0 + +julia> + +``` +""" +function _cor(ranksx::AbstractMatrix{T}, ranksy::AbstractMatrix{U}) where {T,U} + symmetric = ranksx === ranksy + + if size(ranksx, 1) < 2 + if symmetric && T === Missing + return ifelse.(axes(ranksx, 2) .== axes(ranksx, 2)', missing, NaN) + elseif symmetric + return ifelse.(axes(ranksx, 2) .== axes(ranksx, 2)', 1.0, NaN) + else + return fill(NaN, size(ranksx, 2), size(ranksy, 2)) end - Xjrank = tiedrank(Xj) - for i = 1:nc - Yi = view(Y, :, i) - if any(isnan, Yi) - C[j,i] = NaN - else - Yirank = tiedrank(Yi) - C[j,i] = cor(Xjrank, Yirank) + end + try + if symmetric + return cor(ranksx) + else + return cor(ranksx, ranksy) + end + catch + #=This catch block is hit when e.g. + ranksx === ranksy = [missing missing;missing missing] + =# + nr, nc = size(ranksx, 2), size(ranksy, 2) + if ranksx === ranksy && T === Missing + return fill(missing, nr, nc) + elseif missing isa T || missing isa U + C = ones(Union{Missing,Float64}, nr, nc) + else + C = ones(Float64, nr, nc) + end + + for j = (symmetric ? 2 : 1):nr + for i = 1:(symmetric ? j - 1 : nc) + C[j, i] = cor(view(ranksx, :, j), view(ranksy, :, i)) + symmetric && (C[i, j] = C[j, i]) end end + return C + end +end + +""" + sortperm_matrix(x) + +Given `x`, a vector of vectors, return a matrix who's ith column is the sort permutation of +the ith element of x. +""" +function sortperm_matrix(x) + m = length(x) == 0 ? 0 : length(first(x)) + nc = length(x) + int64 = Int64[] + out = Array{Int}(undef, m, nc) + + Threads.@threads for i in 1:nc + ints = task_local_vector(:ints, int64, m) + sortperm!(ints, x[i]) + out[:, i] .= ints end - return C + return out end +""" + ranks_matrix(x) + +Given `x`, a vector of vectors, return a matrix such that the (Pearson) correlaton between +columns of the return is the Spearman rank correlation between the elements of x. +""" +function ranks_matrix(x) + + m = length(x) == 0 ? 0 : length(first(x)) + nc = length(x) + int64 = Int64[] + + if promoted_type(x) === Missing + return fill(missing, m, nc) + end + + out = Array{Union{Missing,Int,Float64}}(undef, m, nc) + + Threads.@threads for i in 1:nc + ints = task_local_vector(:ints, int64, m) + + if any(ismissing, x[i]) + out[:, i] .= missing + elseif any(_isnan, x[i]) + out[:, i] .= NaN + else + sortperm!(ints, x[i]) + _tiedrank!(view(out, :, i), x[i], ints) + end + end + return out +end ####################################### # @@ -122,17 +397,202 @@ end # ####################################### +""" + corkendall(x, y=x; skipmissing::Symbol=:none) + +Compute Kendall's rank correlation coefficient, τ. `x` and `y` must be either vectors or +matrices, and entries may be `missing`. + +Uses multiple threads when either `x` or `y` is a matrix. + +# Keyword argument + +- `skipmissing::Symbol=:none`: If `:none` (the default), `missing` entries in `x` or `y` + give rise to `missing` entries in the return. If `:pairwise` when calculating an element + of the return, both `i`th entries of the input vectors are skipped if either is missing. + If `:listwise` the `i`th rows of both `x` and `y` are skipped if `missing` appears in + either; note that this might skip a high proportion of entries. Only allowed when `x` or + `y` is a matrix. +""" +function corkendall(x::AbstractMatrix, y::AbstractMatrix=x; + skipmissing::Symbol=:none) + check_rankcor_args(x, y, skipmissing, true) + return pairwise(corkendall, eachcol(x), eachcol(y); skipmissing) +end + +function corkendall(x::AbstractVector, y::AbstractVector; + skipmissing::Symbol=:none) + check_rankcor_args(x, y, skipmissing, false) + if x === y + return corkendall(x) + else + x = copy(x) + permx = sortperm(x) + permute!(x, permx) + return corkendall_kernel!(x, y, permx, skipmissing) + end +end + +#= corkendall returns a vector in this case, inconsistent with with Statistics.cor and +StatsBase.corspearman, but consistent with StatsBase.corkendall. + =# +function corkendall(x::AbstractMatrix, y::AbstractVector; skipmissing::Symbol=:none) + return vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing)) +end + +function corkendall(x::AbstractVector, y::AbstractMatrix; skipmissing::Symbol=:none) + return corkendall(reshape(x, (length(x), 1)), y; skipmissing) +end + +function corkendall(x::AbstractVector{T}) where {T} + return T === Missing ? missing : 1.0 +end + +function _pairwise!(::Val{:none}, f::typeof(corkendall), dest::AbstractMatrix, x, y, + symmetric::Bool) + return corkendall_loop!(:none, f, dest, x, y, symmetric) +end + +function _pairwise!(::Val{:pairwise}, f::typeof(corkendall), dest::AbstractMatrix, x, y, + symmetric::Bool) + return corkendall_loop!(:pairwise, f, dest, x, y, symmetric) +end + +function _pairwise!(::Val{:listwise}, f::typeof(corkendall), dest::AbstractMatrix, x, y, + symmetric::Bool) + return corkendall_loop!(:none, f, dest, handle_listwise(x, y)..., symmetric) +end + +function corkendall_loop!(skipmissing::Symbol, f::typeof(corkendall), dest::AbstractMatrix{V}, + x, y, symmetric::Bool) where {V} + + nr, nc = size(dest) + m = length(x) == 0 ? 0 : length(first(x)) + + # Swap x and y for more efficient threaded loop. + if nr < nc + dest′ = reshape(dest, size(dest, 2), size(dest, 1)) + corkendall_loop!(skipmissing, f, dest′, y, x, symmetric) + dest .= transpose(dest′) + return dest + end + + intvec = Int[] + t = promoted_type(x)[] + u = promoted_type(y)[] + t′ = promoted_nmtype(x)[] + u′ = promoted_nmtype(y)[] + + symmetric = x === y + + Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + + for j in subset + + sortedxj = task_local_vector(:sortedxj, t, m) + scratch_py = task_local_vector(:scratch_py, u, m) + yi = task_local_vector(:yi, u, m) + permx = task_local_vector(:permx, intvec, m) + # Ensuring missing is not an element type of scratch_sy, scratch_fx, scratch_fy + # gives improved performance. + scratch_sy = task_local_vector(:scratch_sy, u′, m) + scratch_fx = task_local_vector(:scratch_fx, t′, m) + scratch_fy = task_local_vector(:scratch_fy, t′, m) + + sortperm!(permx, x[j]) + @inbounds for k in eachindex(sortedxj) + sortedxj[k] = x[j][permx[k]] + end + + for i = 1:(symmetric ? j : nc) + # For performance, diagonal is special-cased + if x[j] === y[i] && i == j && V !== Union{} + if missing isa V && eltype(x[j]) == Missing + dest[j, i] = missing + else + dest[j, i] = 1.0 + end + else + yi .= y[i] + dest[j, i] = corkendall_kernel!(sortedxj, yi, permx, skipmissing; + scratch_py, scratch_sy, scratch_fx, scratch_fy) + end + symmetric && (dest[i, j] = dest[j, i]) + end + end + end + return dest +end + +# Auxiliary functions for Kendall's rank correlation + # Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” # Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. # JSTOR, www.jstor.org/stable/2282833. -function corkendall!(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}, permx::AbstractArray{<:Integer}=sortperm(x)) - if any(isnan, x) || any(isnan, y) return NaN end - n = length(x) - if n != length(y) error("Vectors must have same length") end - # Initial sorting - permute!(x, permx) - permute!(y, permx) +function check_rankcor_args(x, y, skipmissing, allowlistwise::Bool) + Base.require_one_based_indexing(x, y) + size(x, 1) == size(y, 1) || + throw(DimensionMismatch("x and y have inconsistent dimensions")) + if allowlistwise + skipmissing in (:none, :pairwise, :listwise) || + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise, \ + but got :$skipmissing")) + else + skipmissing in (:none, :pairwise) || + throw(ArgumentError("skipmissing must be either :none or :pairwise, but \ + got :$skipmissing")) + end +end + +""" + corkendall_kernel!(sortedx::AbstractVector, y::AbstractVector, + permx::AbstractVector{<:Integer}, skipmissing::Symbol; + scratch_py::AbstractVector=similar(y), + scratch_sy::AbstractVector=similar(y), + scratch_fx::AbstractVector=similar(sortedx), + scratch_fy::AbstractVector=similar(y)) + +Kendall correlation between two vectors but omitting the initial sorting of the first +argument. Calculating Kendall correlation between `x` and `y` is thus a two stage process: +a) sort `x` to get `sortedx`; b) call this function on `sortedx` and `y`, with +subsequent arguments: +- `permx`: The permutation that sorts `x` to yield `sortedx`. +- `scratch_py`: A vector used to permute `y` without allocation. +- `scratch_sy`: A vector used to sort `y` without allocation. +- `scratch_fx, scratch_fy`: Vectors used to filter `missing`s from `x` and `y` without + allocation. +""" +function corkendall_kernel!(sortedx::AbstractVector{T}, y::AbstractVector{U}, + permx::AbstractVector{<:Integer}, skipmissing::Symbol; + scratch_py::AbstractVector{V}=similar(y), + scratch_sy::AbstractVector=similar(y), + scratch_fx::AbstractVector=similar(sortedx), + scratch_fy::AbstractVector=similar(y)) where {T,U,V} + + length(sortedx) >= 2 || return NaN + + if skipmissing == :none + if missing isa T && any(ismissing, sortedx) + return missing + elseif missing isa U && any(ismissing, y) + return missing + end + end + + @inbounds for i in eachindex(y) + scratch_py[i] = y[permx[i]] + end + + if missing isa T || missing isa V + sortedx, permutedy = handle_pairwise(sortedx, scratch_py; scratch_fx, scratch_fy) + else + permutedy = scratch_py + end + + (any(_isnan, sortedx) || any(_isnan, permutedy)) && return NaN + + n = length(sortedx) # Use widen to avoid overflows on both 32bit and 64bit npairs = div(widen(n) * (n - 1), 2) @@ -140,80 +600,35 @@ function corkendall!(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}, permx k = 0 @inbounds for i = 2:n - if x[i - 1] == x[i] + if sortedx[i-1] == sortedx[i] k += 1 elseif k > 0 - # Sort the corresponding chunk of y, so the rows of hcat(x,y) are - # sorted first on x, then (where x values are tied) on y. Hence - # double ties can be counted by calling countties. - sort!(view(y, (i - k - 1):(i - 1))) + #= + Sort the corresponding chunk of permutedy, so rows of hcat(sortedx,permutedy) + are sorted first on sortedx, then (where sortedx values are tied) on permutedy. + Hence double ties can be counted by calling countties. + =# + sort!(view(permutedy, (i-k-1):(i-1))) ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here - ndoubleties += countties(y, i - k - 1, i - 1) + ndoubleties += countties(permutedy, i - k - 1, i - 1) k = 0 end end if k > 0 - sort!(view(y, (n - k):n)) + sort!(view(permutedy, (n-k):n)) ntiesx += div(widen(k) * (k + 1), 2) - ndoubleties += countties(y, n - k, n) + ndoubleties += countties(permutedy, n - k, n) end - nswaps = merge_sort!(y, 1, n) - ntiesy = countties(y, 1, n) + nswaps = merge_sort!(permutedy, 1, n, scratch_sy) + ntiesy = countties(permutedy, 1, n) # Calls to float below prevent possible overflow errors when - # length(x) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) - (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / - sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) -end - -""" - corkendall(x, y=x) - -Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either -matrices or vectors. -""" -corkendall(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}) = corkendall!(copy(x), copy(y)) - -function corkendall(X::AbstractMatrix{<:Real}, y::AbstractVector{<:Real}) - permy = sortperm(y) - return([corkendall!(copy(y), X[:,i], permy) for i in 1:size(X, 2)]) -end - -function corkendall(x::AbstractVector{<:Real}, Y::AbstractMatrix{<:Real}) - n = size(Y, 2) - permx = sortperm(x) - return(reshape([corkendall!(copy(x), Y[:,i], permx) for i in 1:n], 1, n)) -end - -function corkendall(X::AbstractMatrix{<:Real}) - n = size(X, 2) - C = Matrix{Float64}(I, n, n) - for j = 2:n - permx = sortperm(X[:,j]) - for i = 1:j - 1 - C[j,i] = corkendall!(X[:,j], X[:,i], permx) - C[i,j] = C[j,i] - end - end - return C + # length(sortedx) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) + return (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / + sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) end -function corkendall(X::AbstractMatrix{<:Real}, Y::AbstractMatrix{<:Real}) - nr = size(X, 2) - nc = size(Y, 2) - C = Matrix{Float64}(undef, nr, nc) - for j = 1:nr - permx = sortperm(X[:,j]) - for i = 1:nc - C[j,i] = corkendall!(X[:,j], Y[:,i], permx) - end - end - return C -end - -# Auxiliary functions for Kendall's rank correlation - """ countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) @@ -224,8 +639,8 @@ function countties(x::AbstractVector, lo::Integer, hi::Integer) # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) thistiecount = result = widen(0) checkbounds(x, lo:hi) - @inbounds for i = (lo + 1):hi - if x[i] == x[i - 1] + @inbounds for i = (lo+1):hi + if x[i] == x[i-1] thistiecount += 1 elseif thistiecount > 0 result += div(thistiecount * (thistiecount + 1), 2) @@ -236,7 +651,7 @@ function countties(x::AbstractVector, lo::Integer, hi::Integer) if thistiecount > 0 result += div(thistiecount * (thistiecount + 1), 2) end - result + return result end # Tests appear to show that a value of 64 is optimal, @@ -246,12 +661,15 @@ const SMALL_THRESHOLD = 64 # merge_sort! copied from Julia Base # (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) """ - merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) + merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, + t::AbstractVector=similar(v, 0)) Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort +distance. """ -function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) +function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, + t::AbstractVector=similar(v, 0)) # Use of widen below prevents possible overflow errors when # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) nswaps = widen(0) @@ -261,7 +679,7 @@ function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVec m = midpoint(lo, hi) (length(t) < m - lo + 1) && resize!(t, m - lo + 1) - nswaps = merge_sort!(v, lo, m, t) + nswaps = merge_sort!(v, lo, m, t) nswaps += merge_sort!(v, m + 1, hi, t) i, j = 1, lo @@ -294,25 +712,28 @@ end # insertion_sort! and midpoint copied from Julia Base # (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) -midpoint(lo::T, hi::T) where T <: Integer = lo + ((hi - lo) >>> 0x01) +midpoint(lo::T, hi::T) where {T<:Integer} = lo + ((hi - lo) >>> 0x01) midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) """ insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort +distance. """ function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) - if lo == hi return widen(0) end + if lo == hi + return widen(0) + end nswaps = widen(0) - @inbounds for i = lo + 1:hi + @inbounds for i = lo+1:hi j = i x = v[i] while j > lo - if x < v[j - 1] + if x < v[j-1] nswaps += 1 - v[j] = v[j - 1] + v[j] = v[j-1] j -= 1 continue end @@ -322,3 +743,12 @@ function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) end return nswaps end + +# Auxiliary functions for both Kendall's and Spearman's rank correlations + +# _isnan required so that corkendall and corspearman have correct handling of NaNs and +# can also accept arguments with element type for which isnan is not defined but isless is +# is defined, so that rank correlation makes sense. +_isnan(x::T) where {T<:Number} = isnan(x) +_isnan(x) = false + diff --git a/test/pairwise.jl b/test/pairwise.jl index 59e081b08..35ecd8279 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -9,8 +9,14 @@ Random.seed!(1) # to avoid using specialized method arbitrary_fun(x, y) = cor(x, y) -@testset "pairwise and pairwise! with $f" for f in (arbitrary_fun, cor, cov) +@testset "pairwise and pairwise! with $f" for f in (arbitrary_fun, cor, cov, corkendall, corspearman) + + isrankcor = f in (corkendall, corspearman) + iscor = isrankcor || f == cor + throwsforzerolengthinput = f in (arbitrary_fun, cor, cov) + @testset "basic interface" begin + x = [rand(10) for _ in 1:4] y = [rand(Float32, 10) for _ in 1:5] # to test case where inference of returned eltype fails @@ -23,24 +29,35 @@ arbitrary_fun(x, y) = cor(x, y) @test res == res2 == [f(xi, yi) for xi in x, yi in y] res = pairwise(f, y, z) - @test res isa Matrix{Float32} - res2 = zeros(Float32, size(res)) - @test pairwise!(f, res2, y, z) === res2 - @test res == res2 == [f(yi, zi) for yi in y, zi in z] + if isrankcor + @test res isa Matrix{Float64} + @test res == [f(yi, zi) for yi in y, zi in z] + else + @test res isa Matrix{Float32} + res2 = zeros(Float32, size(res)) + @test pairwise!(f, res2, y, z) === res2 + @test res == res2 == [f(yi, zi) for yi in y, zi in z] + end res = pairwise(f, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]]) @test res isa Matrix{Float64} res2 = zeros(AbstractFloat, size(res)) @test pairwise!(f, res2, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]]) === res2 - @test res == res2 == - [f(xi, yi) for xi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]), - yi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0])] + + res3 = [f(xi, yi) for xi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]), + yi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0])] + @test res == res2 + @test all(isapprox.(res2, res3)) + @test res isa Matrix{Float64} @inferred pairwise(f, x, y) - - @test_throws Union{ArgumentError,MethodError} pairwise(f, [Int[]], [Int[]]) - @test_throws Union{ArgumentError,MethodError} pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + if throwsforzerolengthinput + # @test_throws Union{ArgumentError,MethodError} pairwise(f, [Int[]], [Int[]]) + @test_throws CompositeException pairwise(f, [Int[]], [Int[]]) + #@test_throws Union{ArgumentError,MethodError} pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + @test_throws CompositeException pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + end res = pairwise(f, [], []) @test size(res) == (0, 0) @@ -69,7 +86,18 @@ arbitrary_fun(x, y) = cor(x, y) @test_throws DimensionMismatch pairwise!(f, zeros(1, 2), x, y) @test_throws DimensionMismatch pairwise!(f, zeros(1, 2), [], []) @test_throws DimensionMismatch pairwise!(f, zeros(0, 0), - [], [[1, 2], [2, 3]]) + [], [[1, 2], [2, 3]]) + end + + if f in (cor, corkendall, corspearman) + @testset "handling === elements" begin + xm = [missing, 1, 3, 2] + xn = [NaN, 1, 3, 2] + xmn = [missing, NaN, 1, 3, 2] + @test isequal(pairwise(f, [xm, xm], [xm, copy(xm)]), [1.0 missing; missing missing]) + @test isequal(pairwise(f, [xn, xn], [xn, copy(xn)]), [1.0 NaN; NaN NaN]) + @test isequal(pairwise(f, [xmn, xmn], [xmn, copy(xmn)]), [1.0 missing; missing missing]) + end end @testset "missing values handling interface" begin @@ -79,77 +107,99 @@ arbitrary_fun(x, y) = cor(x, y) res = pairwise(f, xm, ym) @test res isa Matrix{Missing} - res2 = zeros(Union{Float64, Missing}, size(res)) + res2 = zeros(Union{Float64,Missing}, size(res)) @test pairwise!(f, res2, xm, ym) === res2 @test res ≅ res2 ≅ [missing for xi in xm, yi in ym] res = pairwise(f, xm, ym, skipmissing=:pairwise) @test res isa Matrix{Float64} - res2 = zeros(Union{Float64, Missing}, size(res)) + res2 = zeros(Union{Float64,Missing}, size(res)) @test pairwise!(f, res2, xm, ym, skipmissing=:pairwise) === res2 @test res ≅ res2 - @test isapprox(res, [f(collect.(skipmissings(xi, yi))...) for xi in xm, yi in ym], - rtol=1e-6) + # Use myskipmissings rather than skipmissings to avoid deprecation warning + function myskipmissings(x, y) + which = @. !(ismissing(x) || ismissing(y)) + return x[which], y[which] + end + + #@test isapprox(res, [f(collect.(skipmissings(xi, yi))...) for xi in xm, yi in ym], + # rtol=1e-6) + @test isapprox(res, [f(myskipmissings(xi, yi)...) for xi in xm, yi in ym], + rtol=1e-6) res = pairwise(f, ym, zm, skipmissing=:pairwise) - @test res isa Matrix{Float32} - res2 = zeros(Union{Float32, Missing}, size(res)) - @test pairwise!(f, res2, ym, zm, skipmissing=:pairwise) === res2 - @test res ≅ res2 - @test isapprox(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm], - rtol=1e-6) + if isrankcor + @test res isa Matrix{Float64} + # @test isequal(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm]) + @test isequal(res, [f(myskipmissings(yi, zi)...) for yi in ym, zi in zm]) + else + @test res isa Matrix{Float32} + res2 = zeros(Union{Float32,Missing}, size(res)) + @test pairwise!(f, res2, ym, zm, skipmissing=:pairwise) === res2 + @test res ≅ res2 + #@test isapprox(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm], + # rtol=1e-6) + @test isapprox(res, [f(myskipmissings(yi, zi)...) for yi in ym, zi in zm], + rtol=1e-6) + end nminds = mapreduce(x -> .!ismissing.(x), - (x, y) -> x .& y, - [xm; ym]) + (x, y) -> x .& y, + [xm; ym]) res = pairwise(f, xm, ym, skipmissing=:listwise) @test res isa Matrix{Float64} - res2 = zeros(Union{Float64, Missing}, size(res)) + res2 = zeros(Union{Float64,Missing}, size(res)) @test pairwise!(f, res2, xm, ym, skipmissing=:listwise) === res2 @test res ≅ res2 @test isapprox(res, [f(view(xi, nminds), view(yi, nminds)) for xi in xm, yi in ym], - rtol=1e-6) + rtol=1e-6) if VERSION >= v"1.6.0-DEV" # inference of cor fails so use an inferrable function - # to check that pairwise itself is inferrable + # to check that StatsBase.pairwise itself is inferrable for skipmissing in (:none, :pairwise, :listwise) g(x, y=x) = pairwise((x, y) -> x[1] * y[1], x, y, skipmissing=skipmissing) - @test Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) == - Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}, - Vector{Vector{Union{Float64, Missing}}}}) == - Matrix{<: Union{Float64, Missing}} + @test Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64,Missing}}}}) == + Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64,Missing}}}, + Vector{Vector{Union{Float64,Missing}}}}) == + Matrix{<:Union{Float64,Missing}} if skipmissing in (:pairwise, :listwise) - @test_broken Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) == - Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}, - Vector{Vector{Union{Float64, Missing}}}}) == - Matrix{Float64} + @test_broken Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64,Missing}}}}) == + Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64,Missing}}}, + Vector{Vector{Union{Float64,Missing}}}}) == + Matrix{Float64} end end end @test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:something) - @test_throws ArgumentError pairwise!(f, zeros(Union{Float64, Missing}, - length(xm), length(ym)), xm, ym, - skipmissing=:something) + @test_throws ArgumentError pairwise!(f, zeros(Union{Float64,Missing}, + length(xm), length(ym)), xm, ym, + skipmissing=:something) # variable with only missings xm = [fill(missing, 10), rand(10)] ym = [rand(10), rand(10)] res = pairwise(f, xm, ym) - @test res isa Matrix{Union{Float64, Missing}} - res2 = zeros(Union{Float64, Missing}, size(res)) + @test res isa Matrix{Union{Float64,Missing}} + res2 = zeros(Union{Float64,Missing}, size(res)) @test pairwise!(f, res2, xm, ym) === res2 @test res ≅ res2 ≅ [f(xi, yi) for xi in xm, yi in ym] if VERSION >= v"1.5" # Fails with UndefVarError on Julia 1.0 - @test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:pairwise) - @test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:listwise) - - res = zeros(Union{Float64, Missing}, length(xm), length(ym)) - @test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:pairwise) - @test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:listwise) + if throwsforzerolengthinput + #@test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:pairwise) + @test_throws CompositeException pairwise(f, xm, ym, skipmissing=:pairwise) + #@test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:listwise) + @test_throws CompositeException pairwise(f, xm, ym, skipmissing=:listwise) + + res = zeros(Union{Float64,Missing}, length(xm), length(ym)) + #@test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:pairwise) + @test_throws CompositeException pairwise!(f, res, xm, ym, skipmissing=:pairwise) + #@test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:listwise) + @test_throws CompositeException pairwise!(f, res, xm, ym, skipmissing=:listwise) + end end for sm in (:pairwise, :listwise) @@ -162,12 +212,11 @@ arbitrary_fun(x, y) = cor(x, y) @testset "iterators" begin x = (v for v in [rand(10) for _ in 1:4]) y = (v for v in [rand(10) for _ in 1:4]) - res = @inferred pairwise(f, x, y) + res2 = zeros(size(res)) @test pairwise!(f, res2, x, y) === res2 @test res == res2 == pairwise(f, collect(x), collect(y)) - res = @inferred(pairwise(f, x)) res2 = zeros(size(res)) @test pairwise!(f, res2, x) === res2 @@ -179,13 +228,13 @@ arbitrary_fun(x, y) = cor(x, y) y = (Iterators.drop(v, 1) for v in [rand(10) for _ in 1:4]) @test pairwise((x, y) -> f(collect(x), collect(y)), x, y) == - [f(collect(xi), collect(yi)) for xi in x, yi in y] + [f(collect(xi), collect(yi)) for xi in x, yi in y] @test pairwise((x, y) -> f(collect(x), collect(y)), x) == - [f(collect(xi1), collect(xi2)) for xi1 in x, xi2 in x] + [f(collect(xi1), collect(xi2)) for xi1 in x, xi2 in x] @test_throws ArgumentError pairwise((x, y) -> f(collect(x), collect(y)), x, y, - skipmissing=:pairwise) + skipmissing=:pairwise) @test_throws ArgumentError pairwise((x, y) -> f(collect(x), collect(y)), x, y, - skipmissing=:listwise) + skipmissing=:listwise) end @testset "two-argument method" begin @@ -201,8 +250,8 @@ arbitrary_fun(x, y) = cor(x, y) y = [rand(10) for _ in 1:4] @test pairwise(f, x, x, symmetric=true) == - pairwise(f, x, symmetric=true) == - Symmetric(pairwise(f, x, x), :U) + pairwise(f, x, symmetric=true) == + Symmetric(pairwise(f, x, x), :U) res = zeros(4, 4) res2 = zeros(4, 4) @@ -214,47 +263,64 @@ arbitrary_fun(x, y) = cor(x, y) @test_throws ArgumentError pairwise!(f, res, x, y, symmetric=true) end - @testset "cor corner cases" begin - # Integer inputs must give a Float64 output - res = pairwise(cor, [[1, 2, 3], [1, 5, 2]]) - @test res isa Matrix{Float64} - @test res == [cor(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]), - yi in ([1, 2, 3], [1, 5, 2])] - - # NaNs are ignored for the diagonal - res = pairwise(cor, [[1, 2, NaN], [1, 5, 2]]) - @test res isa Matrix{Float64} - @test res ≅ [1.0 NaN - NaN 1.0] - - # missings are ignored for the diagonal - res = pairwise(cor, [[1, 2, 7], [1, 5, missing]]) - @test res isa Matrix{Union{Float64, Missing}} - @test res ≅ [1.0 missing - missing 1.0] - res = pairwise(cor, Vector{Union{Int, Missing}}[[missing, missing, missing], - [missing, missing, missing]]) - @test res isa Matrix{Union{Float64, Missing}} - @test res ≅ [1.0 missing - missing 1.0] - if VERSION >= v"1.5" - # except when eltype is Missing - res = pairwise(cor, [[missing, missing, missing], - [missing, missing, missing]]) - @test res isa Matrix{Missing} - @test res ≅ [missing missing - missing missing] - end + if iscor + @testset "$f corner cases" begin + # Integer inputs must give a Float64 output + res = pairwise(f, [[1, 2, 3], [1, 5, 2]]) + @test res isa Matrix{Float64} + if f == corspearman + @test isapprox(res, [f(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]), + yi in ([1, 2, 3], [1, 5, 2])]) + else + @test res == [f(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]), + yi in ([1, 2, 3], [1, 5, 2])] + end - for sm in (:pairwise, :listwise) - res = pairwise(cor, [[1, 2, NaN, 4], [1, 5, 5, missing]], skipmissing=sm) + # NaNs are ignored for the diagonal + res = pairwise(f, [[1, 2, NaN], [1, 5, 2]]) @test res isa Matrix{Float64} @test res ≅ [1.0 NaN - NaN 1.0] + NaN 1.0] + + # missings are ignored for the diagonal + res = pairwise(f, [[1, 2, 7], [1, 5, missing]]) + @test res isa Matrix{Union{Float64,Missing}} + @test res ≅ [1.0 missing + missing 1.0] + res = pairwise(f, Vector{Union{Int,Missing}}[[missing, missing, missing], + [missing, missing, missing]]) + @test res isa Matrix{Union{Float64,Missing}} + @test res ≅ [1.0 missing + missing 1.0] + + #NaNs and missings are ignored for the "diagonal" of a not-square result. + v = [missing, 1, 2, 3] + @test isequal(pairwise(f, [v, v], [v, v, v]), + [1.0 missing missing; missing 1.0 missing]) + w = [NaN, 1, 2, 3] + @test isequal(pairwise(f, [w, w], [w, w, w]), [1.0 NaN NaN; NaN 1.0 NaN]) + if VERSION >= v"1.5" - @test_throws ArgumentError pairwise(cor, [[missing, missing, missing], - [missing, missing, missing]], - skipmissing=sm) + # except when eltype is Missing + res = pairwise(f, [[missing, missing, missing], + [missing, missing, missing]]) + @test res isa Matrix{Missing} + @test res ≅ [missing missing + missing missing] + end + + for sm in (:pairwise, :listwise) + res = pairwise(f, [[1, 2, NaN, 4], [1, 5, 5, missing]], skipmissing=sm) + @test res isa Matrix{Float64} + @test res ≅ [1.0 NaN + NaN 1.0] + if VERSION >= v"1.5" + if f == cor + @test_throws CompositeException pairwise(f, [[missing, missing, missing], + [missing, missing, missing]], + skipmissing=sm) + end + end end end end @@ -262,30 +328,45 @@ arbitrary_fun(x, y) = cor(x, y) @testset "promote_type_union" begin @test StatsBase.promote_type_union(Int) === Int @test StatsBase.promote_type_union(Real) === Real - @test StatsBase.promote_type_union(Union{Int, Float64}) === Float64 - @test StatsBase.promote_type_union(Union{Int, Missing}) === Union{Int, Missing} - @test StatsBase.promote_type_union(Union{Int, String}) === Any + @test StatsBase.promote_type_union(Union{Int,Float64}) === Float64 + @test StatsBase.promote_type_union(Union{Int,Missing}) === Union{Int,Missing} + @test StatsBase.promote_type_union(Union{Int,String}) === Any @test StatsBase.promote_type_union(Vector) === Any @test StatsBase.promote_type_union(Union{}) === Union{} if VERSION >= v"1.6.0-DEV" - @test StatsBase.promote_type_union(Tuple{Union{Int, Float64}}) === - Tuple{Real} + @test StatsBase.promote_type_union(Tuple{Union{Int,Float64}}) === + Tuple{Real} else - @test StatsBase.promote_type_union(Tuple{Union{Int, Float64}}) === - Any + @test StatsBase.promote_type_union(Tuple{Union{Int,Float64}}) === + Any end end @testset "type-unstable corner case (#771)" begin - v = [rand(5) for _=1:10] + v = [rand(5) for _ = 1:10] function f(v) pairwise(v) do x, y (x[1] < 0 ? nothing : - x[1] > y[1] ? 1 : 1.5, - 0) + x[1] > y[1] ? 1 : 1.5, + 0) end end res = f(v) - @test res isa Matrix{Tuple{Real, Int}} + @test res isa Matrix{Tuple{Real,Int}} + end +end + +@testset "pairwise against inputs with non-numeric elements" begin + for f in (corkendall, corspearman) + x = [["a", "b", "c"], ["c", "b", "a"]] + @test pairwise(f, x) ≈ [1.0 -1.0; -1.0 1.0] end +end + +@testset "pairwise with vectors of unequal length" begin + f(x, y) = length(x) - length(y) + x = [randn(i) for i in 0:3] + @test isequal(pairwise(f, x), (0:3) .- (0:3)') + @test_throws ArgumentError pairwise(f, x, skipmissing=:pairwise) + @test_throws ArgumentError pairwise(f, x, skipmissing=:listwise) end \ No newline at end of file diff --git a/test/rankcorr.jl b/test/rankcorr.jl index dc0207ee1..484474d99 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -1,161 +1,372 @@ -using StatsBase +using StatsBase, Random using Test -X = Float64[1 0; 2 1; 3 0; 4 1; 5 10] -Y = Float64[5 5 6; 3 4 1; 4 0 4; 2 6 1; 5 7 10] - -x1 = X[:,1] -x2 = X[:,2] -y = Y[:,1] - -# corspearman - -@test corspearman(x1, y) ≈ -0.102597835208515 -@test corspearman(x2, y) ≈ -0.081110710565381 - -@test corspearman(X, y) ≈ [-0.102597835208515, -0.081110710565381] -@test corspearman(y, X) ≈ [-0.102597835208515 -0.081110710565381] - -c11 = corspearman(x1, x1) -c12 = corspearman(x1, x2) -c22 = corspearman(x2, x2) -@test c11 ≈ 1.0 -@test c22 ≈ 1.0 -@test corspearman(X, X) ≈ [c11 c12; c12 c22] -@test corspearman(X) ≈ [c11 c12; c12 c22] - -@test corspearman(X, Y) == - [corspearman(X[:,i], Y[:,j]) for i in axes(X, 2), j in axes(Y, 2)] - -# corkendall - -# Check error, handling of NaN, Inf etc -@test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4], [1,2,3]) -@test isnan(corkendall([1,2], [3,NaN])) -@test isnan(corkendall([1,1,1], [1,2,3])) -@test corkendall([-Inf,-0.0,Inf],[1,2,3]) == 1.0 - -# Test, with exact equality, some known results. -# AbstractVector{<:Real}, AbstractVector{<:Real} -@test corkendall(x1, y) == -1/sqrt(90) -@test corkendall(x2, y) == -1/sqrt(72) -# AbstractMatrix{<:Real}, AbstractVector{<:Real} -@test corkendall(X, y) == [-1/sqrt(90), -1/sqrt(72)] -# AbstractVector{<:Real}, AbstractMatrix{<:Real} -@test corkendall(y, X) == [-1/sqrt(90) -1/sqrt(72)] - -# n = 78_000 tests for overflow errors on 32 bit -# Testing for overflow errors on 64bit would require n be too large for practicality -# This also tests merge_sort! since n is (much) greater than SMALL_THRESHOLD. -n = 78_000 -# Test with many repeats -@test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1/sqrt(90) -@test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1/sqrt(72) -@test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1/sqrt(90), -1/sqrt(72)] -@test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1/sqrt(90) -1/sqrt(72)] -@test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 - -# Test with no repeats, note testing for exact equality -@test corkendall(collect(1:n), collect(1:n)) == 1.0 -@test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 - -# All elements identical should yield NaN -@test isnan(corkendall(repeat([1], n), collect(1:n))) - -c11 = corkendall(x1, x1) -c12 = corkendall(x1, x2) -c22 = corkendall(x2, x2) - -# AbstractMatrix{<:Real}, AbstractMatrix{<:Real} -@test corkendall(X, X) ≈ [c11 c12; c12 c22] -# AbstractMatrix{<:Real} -@test corkendall(X) ≈ [c11 c12; c12 c22] - -@test c11 == 1.0 -@test c22 == 1.0 -@test c12 == 3/sqrt(20) - -# Finished testing for overflow, so redefine n for speedier tests -n = 100 - -@test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] -@test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] - -# All eight three-element permutations -z = [1 1 1; - 1 1 2; - 1 2 2; - 1 2 2; - 1 2 1; - 2 1 2; - 1 1 2; - 2 2 2] - -@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(z[:,1], z) == [1 0 1/3] -@test corkendall(z, z[:,1]) == [1; 0; 1/3] - -z = float(z) -@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(z[:,1], z) == [1 0 1/3] -@test corkendall(z, z[:,1]) == [1; 0; 1/3] - -w = repeat(z, n) -@test corkendall(w) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(w, w) == [1 0 1/3; 0 1 0; 1/3 0 1] -@test corkendall(w[:,1], w) == [1 0 1/3] -@test corkendall(w, w[:,1]) == [1; 0; 1/3] - -StatsBase.midpoint(1,10) == 5 -StatsBase.midpoint(1,widen(10)) == 5 - - -# NaN handling - -Xnan = copy(X) -Xnan[1,1] = NaN -Ynan = copy(Y) -Ynan[2,1] = NaN - -for f in (corspearman, corkendall) - @test isnan(f([1.0, NaN, 2.0], [2.0, 1.0, 3.4])) - @test all(isnan, f([1.0, NaN], [1 2; 3 4])) - @test all(isnan, f([1 2; 3 4], [1.0, NaN])) - @test isequal(f([1 NaN; NaN 4]), [1 NaN; NaN 1]) - @test all(isnan, f([1 NaN; NaN 4], [1 NaN; NaN 4])) - @test all(isnan, f([1 NaN; NaN 4], [NaN 1; NaN 4])) - - @test isequal(f(Xnan, Ynan), - [f(Xnan[:,i], Ynan[:,j]) for i in axes(Xnan, 2), j in axes(Ynan, 2)]) - @test isequal(f(Xnan), - [i == j ? 1.0 : f(Xnan[:,i], Xnan[:,j]) - for i in axes(Xnan, 2), j in axes(Xnan, 2)]) - for k in 1:2 - @test isequal(f(Xnan[:,k], Ynan), - [f(Xnan[:,k], Ynan[:,j]) for i in 1:1, j in axes(Ynan, 2)]) - # TODO: fix corkendall (PR#659) - if f === corspearman - @test isequal(f(Xnan, Ynan[:,k]), - [f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2), j in 1:1]) - else - @test isequal(f(Xnan, Ynan[:,k]), - [f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2)]) - end - end +@testset "rank_correlation_auxiliary_fns" begin + + #Auxiliary functions for corkendall + n = 100 + x = [1, 2, 3, missing, 4] + y = [missing, 1, 2, 3, 4] + u = [missing, missing, 1, 2] + v = [3, 4, missing, missing] + + mx = [1 2 + missing 3 + 4 missing + missing missing + 5 6] + + @test StatsBase.handle_pairwise(x, y) == ([2, 3, 4], [1, 2, 4]) + @test StatsBase.handle_pairwise(float.(x), y) == ([2.0, 3.0, 4.0], [1, 2, 4]) + @test StatsBase.handle_pairwise(x, float.(y)) == ([2, 3, 4], [1.0, 2.0, 4.0]) + @test StatsBase.handle_pairwise(u, v) == (Int64[], Int64[]) + + v = collect(100:-1:1) + StatsBase.insertion_sort!(v, 1, n) + @test v == 1:n + + v = collect(1000:-1:1) + StatsBase.merge_sort!(v, 1, 1000) + @test v == 1:1000 + + @test StatsBase.midpoint(1, 10) == 5 + @test StatsBase.midpoint(1, widen(10)) == 5 + + @test StatsBase.equal_sum_subsets(0, 1) == Vector{Int64}[] + @test sum.(StatsBase.equal_sum_subsets(100, 5)) == repeat([1010], 5) + @test sort(vcat(StatsBase.equal_sum_subsets(500, 7)...)) == collect(1:500) + end +#= +Edge cases (to refer to when writing tests) +=========================================== +In the symmetric case (x===y) on-diagonal terms should be 1.0 apart from the special case of +eltype(x) === Missing && skipmissing == :none, in which case the on-diagonal terms should be +missing. +Otherwise, if x and y are equal-length vectors with length(x)<2 +f(x,y) == NaN +otherwise if x and y are equal-length vectors and either contains missing +f(x,y) == missing +otherwise if x and y are equal-length vectors and either contains NaN +f(x,y) == NaN +otherwise +f(x,y) = the required Kendall or Spearman correlation. + +This behaviour is close to but not quite identical to Statistics.cor. e.g.: + +julia> cor(fill(missing,3,2)) +2×2 Matrix{Missing}: + missing missing + missing missing + +julia> corkendall(fill(missing,3,2)) #SAME behavior as cor +2×2 Matrix{Missing}: + missing missing + missing missing + +julia> cor(Matrix{Union{Missing,Float64}}(missing,5,3)) +3×3 Matrix{Missing}: + missing missing missing + missing missing missing + missing missing missing + +julia> corkendall(Matrix{Union{Missing,Float64}}(missing,5,3)) #DIFFERENT behaviour from cor +3×3 Matrix{Union{Missing, Float64}}: + 1.0 missing missing + missing 1.0 missing + missing missing 1.0 +=# + +@testset "$f" for f in (corkendall, corspearman) + + n = 100 + big_n = 78_000 + X = Float64[1 0; 2 1; 3 0; 4 1; 5 10] + Y = Float64[5 5 6; 3 4 1; 4 0 4; 2 6 1; 5 7 10] + Xm = [1 0; missing 1; 2 1; 3 0; 4 1; 5 10] + Ym = [5 5 6; 1 2 3; 3 4 1; 4 0 4; 2 6 1; 5 7 10] + xm = [missing, missing, missing, missing, missing] + xmm = hcat(xm, xm) + a = [5, 2, 3, 4, 1] + b = [1, 4, 2, 3, 5] + + x1 = X[:, 1] + x2 = X[:, 2] + y1 = Y[:, 1] + + c11 = f(x1, x1) + c12 = f(x1, x2) + c22 = f(x2, x2) + + # Test some known results + if f == corkendall + # Vector, Vector + @test f(x1, y1) == -1 / sqrt(90) + @test f(x2, y1) == -1 / sqrt(72) + # Matrix, Vector + @test f(X, y1) == [-1 / sqrt(90), -1 / sqrt(72)] + # Vector, Matrix + @test f(y1, X) == [-1 / sqrt(90) -1 / sqrt(72)] + + # big_n = 78_000 tests for overflow errors on 32 bit + # Testing for overflow errors on 64 bit is not practical + # This also tests merge_sort! since big_n is (much) greater than SMALL_THRESHOLD. + # Test with many repeats + @test f(repeat(x1, big_n), repeat(y1, big_n)) ≈ -1 / sqrt(90) + @test f(repeat(x2, big_n), repeat(y1, big_n)) ≈ -1 / sqrt(72) + @test f(repeat(X, big_n), repeat(y1, big_n)) ≈ [-1 / sqrt(90), -1 / sqrt(72)] + @test f(repeat(y1, big_n), repeat(X, big_n)) ≈ [-1 / sqrt(90) -1 / sqrt(72)] + elseif f == corspearman + @test corspearman(x1, y1) ≈ -1 / sqrt(95) + @test corspearman(repeat(x1, 1000), repeat(y1, 1000)) ≈ -1 / sqrt(95) + @test corspearman(vcat(missing, x1, missing), vcat(missing, y1, missing), skipmissing=:pairwise) ≈ -1 / sqrt(95) + @test corspearman(x2, y1) ≈ -3 / sqrt(1368) + @test corspearman(X, y1) ≈ [-1 / sqrt(95), -3 / sqrt(1368)] + @test corspearman(y1, X) ≈ [-1 / sqrt(95) -3 / sqrt(1368)] + end + + # Matrix, Matrix + @test f(X, X) ≈ [c11 c12; c12 c22] + # Matrix + @test f(X) ≈ [c11 c12; c12 c22] + + @test c11 == 1.0 + @test c22 == 1.0 + if f == corkendall + @test c12 == 3 / sqrt(20) + elseif f == corspearman + @test c12 == 7 / sqrt(90) + end + @test f(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] + @test f(repeat(X, n)) ≈ [c11 c12; c12 c22] + @test f(X, Y) ≈ + [f(X[:, i], Y[:, j]) for i in axes(X, 2), j in axes(Y, 2)] + + @test f(vcat(missing, a), vcat(missing, b), skipmissing=:pairwise) == f(a, b) + @test f(vcat(a, missing), vcat(missing, b), skipmissing=:pairwise) == + f(a[2:end], b[1:(end-1)]) + @test f(hcat(vcat(a, missing), vcat(missing, b)), skipmissing=:listwise) == + f(hcat(a[2:end], b[1:(end-1)])) + @test f(Xm, Xm, skipmissing=:pairwise) == f(X, X) + @test f(Xm, Xm, skipmissing=:listwise) == f(X, X) + @test f(Xm, Ym, skipmissing=:listwise) == f(X, Y) + if f == corkendall + @test f(Xm, Ym, skipmissing=:pairwise) ≈ [-1/√90 0.4 1/√90; -2/√154 7/√165 -1/√154] + end + @test isnan(f([1, 2, 3, 4, 5], xm, skipmissing=:pairwise)) + @test isnan(f(xm, [1, 2, 3, 4, 5], skipmissing=:pairwise)) + @test isequal(f(xmm, skipmissing=:pairwise), [1.0 NaN; NaN 1.0]) + @test isequal(f(xmm, skipmissing=:none), [missing missing; missing missing]) + @test isequal(f(xmm, xmm, skipmissing=:none), [missing missing; missing missing]) + @test isequal(f(xmm, copy(xmm), skipmissing=:none), + [missing missing; missing missing]) + @test isequal(f(xmm, xmm, skipmissing=:listwise), [1.0 NaN; NaN 1.0]) + @test isequal(f(xmm, copy(xmm), skipmissing=:listwise), [NaN NaN; NaN NaN]) + @test isequal(f(xmm, copy(xmm), skipmissing=:pairwise), [NaN NaN; NaN NaN]) + @test ismissing(f([1, 2, 3, 4, 5], xm, skipmissing=:none)) + @test ismissing(f([1, 2, 3, 4, 5], xm, skipmissing=:none)) + @test isequal(f(xmm, skipmissing=:none), [missing missing; missing missing]) + @test isequal(f(xmm, copy(xmm), skipmissing=:none), + [missing missing; missing missing]) + @test isequal(f(hcat(Y, xm), skipmissing=:none), vcat(hcat(f(Y, skipmissing=:none), + [missing, missing, missing]), [missing missing missing 1.0])) + @test_throws ArgumentError f([1, 2, 3, 4], [4, 3, 2, 1], skipmissing=:listwise) + + # All elements identical should yield NaN + @test isnan(f(repeat([1], n), collect(1:n))) + @test f(repeat([0, 1, 1, 0], n), repeat([1, 0, 1, 0], n)) == 0.0 + + # Test with no repeats + @test f(collect(1:n), collect(1:n)) == 1.0 + @test f(collect(1:n), reverse(collect(1:n))) == -1.0 + + # Inf and -Inf work ok + @test f([-Inf, -0.0, Inf], [1, 2, 3]) == 1.0 + + #Interaction of NaN and missing with skipmissing argument + nan_and_missing = hcat(fill(NaN, 10), fill(missing, 10), + vcat(fill(NaN, 5), fill(missing, 5))) + @test isequal(f(nan_and_missing, skipmissing=:none), + [1.0 missing missing; missing 1.0 missing; missing missing 1.0]) + @test isequal(f(nan_and_missing, copy(nan_and_missing), skipmissing=:none), + [NaN missing missing; missing missing missing; missing missing missing]) + @test isequal(f(nan_and_missing, skipmissing=:pairwise), + [1.0 NaN NaN; NaN 1.0 NaN; NaN NaN 1.0]) + @test isequal(f(nan_and_missing, copy(nan_and_missing), skipmissing=:pairwise), + fill(NaN, 3, 3)) + @test isequal(f(nan_and_missing, skipmissing=:listwise), + [1.0 NaN NaN; NaN 1.0 NaN; NaN NaN 1.0]) + @test isequal(f(nan_and_missing, copy(nan_and_missing), skipmissing=:listwise), + fill(NaN, 3, 3)) + + #Reject nonsense skipmissing argument + @test_throws ArgumentError f(X; skipmissing=:foo) + @test_throws ArgumentError f(Xm; skipmissing=:foo) + + # Inputs have fewer than 2 rows + @test isequal(f([], []), NaN) + @test isequal(f(fill(1, 0, 2), fill(1, 0, 2)), [NaN NaN; NaN NaN]) + @test isequal(f(fill(1, 0, 2)), [1.0 NaN; NaN 1.0]) + @test isequal(f([1;;], [1;;]), [NaN;;]) + @test isequal(f([1;;]), [1.0;;]) + @test isequal(f([missing], [missing]), NaN) + @test isequal(f([1], [1]), NaN) + @test isequal(f([NaN], [NaN]), NaN) + @test isequal(f([1], [1]), NaN) + @test isequal(f([]), 1.0) + @test isequal(f([1]), 1.0) + @test isequal(f([NaN]), 1.0) + @test isequal(f([missing]), missing) + @test isequal(f([missing]), missing) + @test isequal(f([missing], [missing missing]), [NaN NaN]) + @test isequal(f([missing missing]), [missing NaN; NaN missing]) + @test isequal(f([missing missing], [missing missing]), [NaN NaN; NaN NaN]) + + # Exercise catch block in method _cor (when f === corspearman) + @test isequal(f(vcat([1 2 3], fill(missing, 2, 3))), + [1.0 missing missing; missing 1.0 missing; missing missing 1.0]) + # Exercise "correction" of on-diagonal terms in method + # _pairwise!(::Val{:none}, f::typeof(corspearman),... + @test isequal(f([missing missing; 1 1; 1 1]), [1.0 missing; missing 1.0]) + + # Works for not-numbers + @test isequal(f(["a", "b", "c"], ["a", "b", "c"]), 1.0) + @test isequal(f(["a", "b", "c"], ["c", "b", "a"]), -1.0) + @test (f(["a" "z"; "b" "y"; "c" "x"]) ≈ [1.0 -1.0; -1.0 1.0]) + @test (f(["a" 3; "b" 2; "c" 1]) ≈ [1.0 -1.0; -1.0 1.0]) + + #Works for zero size input + @test isequal(f([;;]), [;;]) + @test isequal(f([;;], [;;]), [;;]) + @test isequal(f([;;], [;;], skipmissing=:pairwise), [;;]) + @test isequal(f([;;], [;;], skipmissing=:listwise), [;;]) + + # Wrong dimensions + @test_throws DimensionMismatch f([1], [1, 2]) + @test_throws DimensionMismatch f([1], [1 2; 3 4]) + @test_throws DimensionMismatch f([1 2; 3 4], [1]) + @test_throws DimensionMismatch f([1 2; 3 4; 4 6], [1 2; 3 4]) + + # All eight three-element permutations, for these cases corspearman and corkendall + # have the same return values + z = [1 1 1; + 1 1 2; + 1 2 2; + 1 2 2; + 1 2 1; + 2 1 2; + 1 1 2; + 2 2 2] + + @test f(z) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(z, z) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(z[:, 1], z) ≈ [1 0 1 / 3] + @test f(z, z[:, 1]) ≈ [1; 0; 1 / 3] + + z = float(z) + @test f(z) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(z, z) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(z[:, 1], z) ≈ [1 0 1 / 3] + @test f(z, z[:, 1]) ≈ [1; 0; 1 / 3] + + w = repeat(z, n) + @test f(w) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(w, w) ≈ [1 0 1/3; 0 1 0; 1/3 0 1] + @test f(w[:, 1], w) ≈ [1 0 1 / 3] + @test f(w, w[:, 1]) ≈ [1; 0; 1 / 3] + + # NaN handling + Xnan = copy(X) + Xnan[1, 1] = NaN + Ynan = copy(Y) + Ynan[2, 1] = NaN + + @test isnan(f([1.0, NaN, 2.0], [2.0, 1.0, 3.4])) + @test all(isnan, f([1.0, NaN], [1 2; 3 4])) + @test all(isnan, f([1 2; 3 4], [1.0, NaN])) + @test isequal(f([1 NaN; NaN 4]), [1 NaN; NaN 1]) + @test all(isnan, f([1 NaN; NaN 4], [1 NaN; NaN 4])) + @test all(isnan, f([1 NaN; NaN 4], [NaN 1; NaN 4])) + + @test isequal(f(Xnan, Ynan), + [f(Xnan[:, i], Ynan[:, j]) for i in axes(Xnan, 2), j in axes(Ynan, 2)]) + @test isequal(f(Xnan), + [i == j ? 1.0 : f(Xnan[:, i], Xnan[:, j]) + for i in axes(Xnan, 2), j in axes(Xnan, 2)]) + for k in 1:2 + @test isequal(f(Xnan[:, k], Ynan), + [f(Xnan[:, k], Ynan[:, j]) for i in 1:1, j in axes(Ynan, 2)]) + # TODO: fix corkendall (PR#659) + if f === corspearman + @test isequal(f(Xnan, Ynan[:, k]), + [f(Xnan[:, i], Ynan[:, k]) for i in axes(Xnan, 2), j in 1:1]) + else + @test isequal(f(Xnan, Ynan[:, k]), + [f(Xnan[:, i], Ynan[:, k]) for i in axes(Xnan, 2)]) + end + end + +end -# Wrong dimensions +# Don't artificially boost coverage stats when checking for mutation and allocation size +# COV_EXCL_START +@testset "Check no mutation in $f" for f in (corkendall, corspearman) + + nr = 50 + nc = 5 + cutoff = min(0.1, 10 / (nr * nc)) + X = randn(nr, nc) + x = randn(nr) + Xm = ifelse.(rand(nr, nc) .< cutoff, missing, randn(nr, nc)) + xm = ifelse.(rand(nr) .< cutoff, missing, randn(nr)) + Y = randn(nr, nc) + y = randn(nr) + Ym = ifelse.(rand(nr, nc) .< cutoff, missing, randn(nr, nc)) + ym = ifelse.(rand(nr) .< cutoff, missing, randn(nr)) + + for arg1 in (X, x, Xm, xm) + for arg2 in (Y, y, Ym, ym) + for skipmissing in (:pairwise, :none, :listwise) + for f in (corkendall, corspearman) + if !((arg1 isa Vector) && (arg2 isa Vector) && skipmissing == :listwise) + copy1 = copy(arg1) + copy2 = copy(arg2) + res = f(arg1, arg2; skipmissing) + @test isequal(arg1, copy1) + @test isequal(arg2, copy2) + end + end + end + end + end +end -@test_throws DimensionMismatch corspearman([1], [1, 2]) -@test_throws DimensionMismatch corspearman([1], [1 2; 3 4]) -@test_throws DimensionMismatch corspearman([1 2; 3 4], [1]) -@test_throws ArgumentError corspearman([1 2; 3 4; 4 6], [1 2; 3 4]) +@testset "corkendall and corspearman allocations" begin + + Random.seed!(1) + x = rand(1000, 10) + xm = ifelse.(x .< 0.1, missing, x) + #compile + corkendall(x) + corkendall(xm, skipmissing=:listwise) + corkendall(xm, skipmissing=:pairwise) + corspearman(x) + corspearman(xm, skipmissing=:listwise) + corspearman(xm, skipmissing=:pairwise) + x = rand(1000, 100) + xm = ifelse.(x .< 0.01, missing, x) + + #=When executing code such as corkendall(x) allocations are approximately affine in the + number of threads, thanks to use of task-local storage. The tests below have a "safety + factor" of 1.2 against the expected size of allocations. + =# + @test (@allocated corkendall(x)) < (897_808 + Threads.nthreads() * 58_044) * 1.2 + @test (@allocated corkendall(xm,skipmissing=:listwise)) < (1_119_392 + Threads.nthreads() * 22_172) * 1.2 + @test (@allocated corkendall(xm,skipmissing=:pairwise)) < (892_112 + Threads.nthreads() * 61_116) * 1.2 + @test (@allocated corspearman(x)) < (2_678_448 + Threads.nthreads() * 9_128) * 1.2 + @test (@allocated corspearman(xm,skipmissing=:listwise)) < (1_803_712 + Threads.nthreads() * 3_992) * 1.2 + @test (@allocated corspearman(xm,skipmissing=:pairwise)) < (1_692_208 + Threads.nthreads() * 67_172) * 1.2 -# TODO: fix corkendall to match corspearman (PR#659) -@test_throws ErrorException corkendall([1], [1, 2]) -@test_throws ErrorException corkendall([1], [1 2; 3 4]) -@test_throws ErrorException corkendall([1 2; 3 4], [1]) -@test_throws ErrorException corkendall([1 2; 3 4; 4 6], [1 2; 3 4]) +end +# COV_EXCL_STOP \ No newline at end of file diff --git a/test/rankcorrspeedtest.jl b/test/rankcorrspeedtest.jl new file mode 100644 index 000000000..1657dd29f --- /dev/null +++ b/test/rankcorrspeedtest.jl @@ -0,0 +1,98 @@ +using Random, StatsBase, Dates, BenchmarkTools + +using Random; +x = rand(MersenneTwister(0), 1000, 10); +xm = ifelse.(x .< 0.05, missing, x); + +#compile... +res_1 = corkendall(x) +res_2 = corkendall(xm; skipmissing=:pairwise) +res_3 = pairwise(corkendall, eachcol(xm); skipmissing=:pairwise) +res_4 = corkendall(xm; skipmissing=:listwise) +res_5 = pairwise(corkendall, eachcol(xm); skipmissing=:listwise) +res_6 = corkendall(xm; skipmissing=:none) +res_7 = pairwise(corkendall, eachcol(xm), skipmissing=:none) +res_8 = corspearman(x) +res_9 = corspearman(xm; skipmissing=:pairwise) +res_10 = pairwise(corspearman, eachcol(xm); skipmissing=:pairwise) +res_11 = corspearman(xm; skipmissing=:listwise) +res_12 = pairwise(corspearman, eachcol(xm); skipmissing=:listwise) +res_13 = corspearman(xm; skipmissing=:none) +res_14 = pairwise(corspearman, eachcol(xm), skipmissing=:none) + +@assert res_2 == res_3 +@assert res_4 == res_5 +@assert res_9 == res_10 +@assert res_11 ≈ res_12 + +x = rand(MersenneTwister(0), 1000, 1000); +xm = ifelse.(x .< 0.05, missing, x); + +println("="^130) +@show(Dates.now()) +@show ENV["COMPUTERNAME"] +println("Julia Version $VERSION") +@show Threads.nthreads() +@show size(x) +@show typeof(x) +@show size(xm) +@show typeof(xm) + +print("corkendall(\$x) ") +@btime res_1 = corkendall($x); +print("corkendall(\$xm; skipmissing=:pairwise) ") +@btime res_2 = corkendall($xm; skipmissing=:pairwise); +print("pairwise(corkendall,eachcol(\$xm); skipmissing=:pairwise) ") +@btime res_3 = pairwise(corkendall, eachcol($xm); skipmissing=:pairwise); +print("corkendall(\$xm; skipmissing=:listwise) ") +@btime res_4 = corkendall($xm; skipmissing=:listwise); +print("pairwise(corkendall,eachcol(\$xm); skipmissing=:listwise) ") +@btime res_5 = pairwise(corkendall, eachcol($xm); skipmissing=:listwise); +print("corkendall(\$xm; skipmissing=:none) ") +@btime res_6 = corkendall($xm; skipmissing=:none); +print("pairwise(corkendall,eachcol(\$xm),skipmissing=:none) ") +@btime res_7 = pairwise(corkendall, eachcol($xm), skipmissing=:none); +print("corspearman(\$x) ") +@btime res_8 = corspearman($x); +print("corspearman(\$xm; skipmissing=:pairwise) ") +@btime res_9 = corspearman($xm; skipmissing=:pairwise); +print("pairwise(corspearman,eachcol(\$xm); skipmissing=:pairwise)") +@btime res_10 = pairwise(corspearman, eachcol($xm); skipmissing=:pairwise); +print("corspearman(\$xm; skipmissing=:listwise) ") +@btime res_11 = corspearman($xm; skipmissing=:listwise); +print("pairwise(corspearman,eachcol(\$xm); skipmissing=:listwise)") +@btime res_12 = pairwise(corspearman, eachcol($xm); skipmissing=:listwise); +print("corspearman(\$xm; skipmissing=:none) ") +@btime res_13 = corspearman($xm; skipmissing=:none); +print("pairwise(corspearman,eachcol(\$xm),skipmissing=:none) ") +@btime res_14 = pairwise(corspearman, eachcol($xm), skipmissing=:none); + +println("="^130) + +#= +================================================================================================================================== +Dates.now() = DateTime("2024-03-30T10:16:29.303") +ENV["COMPUTERNAME"] = "PHILIP-LAPTOP" +Julia Version 1.10.2 +Threads.nthreads() = 8 +size(x) = (1000, 1000) +typeof(x) = Matrix{Float64} +size(xm) = (1000, 1000) +typeof(xm) = Matrix{Union{Missing, Float64}} +corkendall($x) 5.758 s (1193 allocations: 15.84 MiB) +corkendall($xm; skipmissing=:pairwise) 5.250 s (1193 allocations: 15.51 MiB) +pairwise(corkendall,eachcol($xm); skipmissing=:pairwise) 5.541 s (1161 allocations: 15.51 MiB) +corkendall($xm; skipmissing=:listwise) 9.694 ms (2194 allocations: 11.82 MiB) +pairwise(corkendall,eachcol($xm); skipmissing=:listwise) 9.863 ms (2162 allocations: 11.82 MiB) +corkendall($xm; skipmissing=:none) 258.379 ms (1204 allocations: 16.47 MiB) +pairwise(corkendall,eachcol($xm),skipmissing=:none) 238.871 ms (1202 allocations: 16.47 MiB) +corspearman($x) 35.905 ms (1127 allocations: 39.31 MiB) +corspearman($xm; skipmissing=:pairwise) 1.971 s (1260 allocations: 23.19 MiB) +pairwise(corspearman,eachcol($xm); skipmissing=:pairwise) 1.706 s (1228 allocations: 23.18 MiB) +corspearman($xm; skipmissing=:listwise) 4.466 ms (2113 allocations: 19.43 MiB) +pairwise(corspearman,eachcol($xm); skipmissing=:listwise) 4.432 ms (2081 allocations: 19.43 MiB) +corspearman($xm; skipmissing=:none) 20.017 ms (1625 allocations: 17.27 MiB) +pairwise(corspearman,eachcol($xm),skipmissing=:none) 20.085 ms (1623 allocations: 17.27 MiB) +================================================================================================================================== + +=# \ No newline at end of file From 8ced336285531c493ce318b2993f82dc25aa2240 Mon Sep 17 00:00:00 2001 From: PGS62 Date: Sat, 30 Mar 2024 11:25:09 +0000 Subject: [PATCH 02/24] Delete rankcorrspeedtest.jl --- test/rankcorrspeedtest.jl | 98 --------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 test/rankcorrspeedtest.jl diff --git a/test/rankcorrspeedtest.jl b/test/rankcorrspeedtest.jl deleted file mode 100644 index 1657dd29f..000000000 --- a/test/rankcorrspeedtest.jl +++ /dev/null @@ -1,98 +0,0 @@ -using Random, StatsBase, Dates, BenchmarkTools - -using Random; -x = rand(MersenneTwister(0), 1000, 10); -xm = ifelse.(x .< 0.05, missing, x); - -#compile... -res_1 = corkendall(x) -res_2 = corkendall(xm; skipmissing=:pairwise) -res_3 = pairwise(corkendall, eachcol(xm); skipmissing=:pairwise) -res_4 = corkendall(xm; skipmissing=:listwise) -res_5 = pairwise(corkendall, eachcol(xm); skipmissing=:listwise) -res_6 = corkendall(xm; skipmissing=:none) -res_7 = pairwise(corkendall, eachcol(xm), skipmissing=:none) -res_8 = corspearman(x) -res_9 = corspearman(xm; skipmissing=:pairwise) -res_10 = pairwise(corspearman, eachcol(xm); skipmissing=:pairwise) -res_11 = corspearman(xm; skipmissing=:listwise) -res_12 = pairwise(corspearman, eachcol(xm); skipmissing=:listwise) -res_13 = corspearman(xm; skipmissing=:none) -res_14 = pairwise(corspearman, eachcol(xm), skipmissing=:none) - -@assert res_2 == res_3 -@assert res_4 == res_5 -@assert res_9 == res_10 -@assert res_11 ≈ res_12 - -x = rand(MersenneTwister(0), 1000, 1000); -xm = ifelse.(x .< 0.05, missing, x); - -println("="^130) -@show(Dates.now()) -@show ENV["COMPUTERNAME"] -println("Julia Version $VERSION") -@show Threads.nthreads() -@show size(x) -@show typeof(x) -@show size(xm) -@show typeof(xm) - -print("corkendall(\$x) ") -@btime res_1 = corkendall($x); -print("corkendall(\$xm; skipmissing=:pairwise) ") -@btime res_2 = corkendall($xm; skipmissing=:pairwise); -print("pairwise(corkendall,eachcol(\$xm); skipmissing=:pairwise) ") -@btime res_3 = pairwise(corkendall, eachcol($xm); skipmissing=:pairwise); -print("corkendall(\$xm; skipmissing=:listwise) ") -@btime res_4 = corkendall($xm; skipmissing=:listwise); -print("pairwise(corkendall,eachcol(\$xm); skipmissing=:listwise) ") -@btime res_5 = pairwise(corkendall, eachcol($xm); skipmissing=:listwise); -print("corkendall(\$xm; skipmissing=:none) ") -@btime res_6 = corkendall($xm; skipmissing=:none); -print("pairwise(corkendall,eachcol(\$xm),skipmissing=:none) ") -@btime res_7 = pairwise(corkendall, eachcol($xm), skipmissing=:none); -print("corspearman(\$x) ") -@btime res_8 = corspearman($x); -print("corspearman(\$xm; skipmissing=:pairwise) ") -@btime res_9 = corspearman($xm; skipmissing=:pairwise); -print("pairwise(corspearman,eachcol(\$xm); skipmissing=:pairwise)") -@btime res_10 = pairwise(corspearman, eachcol($xm); skipmissing=:pairwise); -print("corspearman(\$xm; skipmissing=:listwise) ") -@btime res_11 = corspearman($xm; skipmissing=:listwise); -print("pairwise(corspearman,eachcol(\$xm); skipmissing=:listwise)") -@btime res_12 = pairwise(corspearman, eachcol($xm); skipmissing=:listwise); -print("corspearman(\$xm; skipmissing=:none) ") -@btime res_13 = corspearman($xm; skipmissing=:none); -print("pairwise(corspearman,eachcol(\$xm),skipmissing=:none) ") -@btime res_14 = pairwise(corspearman, eachcol($xm), skipmissing=:none); - -println("="^130) - -#= -================================================================================================================================== -Dates.now() = DateTime("2024-03-30T10:16:29.303") -ENV["COMPUTERNAME"] = "PHILIP-LAPTOP" -Julia Version 1.10.2 -Threads.nthreads() = 8 -size(x) = (1000, 1000) -typeof(x) = Matrix{Float64} -size(xm) = (1000, 1000) -typeof(xm) = Matrix{Union{Missing, Float64}} -corkendall($x) 5.758 s (1193 allocations: 15.84 MiB) -corkendall($xm; skipmissing=:pairwise) 5.250 s (1193 allocations: 15.51 MiB) -pairwise(corkendall,eachcol($xm); skipmissing=:pairwise) 5.541 s (1161 allocations: 15.51 MiB) -corkendall($xm; skipmissing=:listwise) 9.694 ms (2194 allocations: 11.82 MiB) -pairwise(corkendall,eachcol($xm); skipmissing=:listwise) 9.863 ms (2162 allocations: 11.82 MiB) -corkendall($xm; skipmissing=:none) 258.379 ms (1204 allocations: 16.47 MiB) -pairwise(corkendall,eachcol($xm),skipmissing=:none) 238.871 ms (1202 allocations: 16.47 MiB) -corspearman($x) 35.905 ms (1127 allocations: 39.31 MiB) -corspearman($xm; skipmissing=:pairwise) 1.971 s (1260 allocations: 23.19 MiB) -pairwise(corspearman,eachcol($xm); skipmissing=:pairwise) 1.706 s (1228 allocations: 23.18 MiB) -corspearman($xm; skipmissing=:listwise) 4.466 ms (2113 allocations: 19.43 MiB) -pairwise(corspearman,eachcol($xm); skipmissing=:listwise) 4.432 ms (2081 allocations: 19.43 MiB) -corspearman($xm; skipmissing=:none) 20.017 ms (1625 allocations: 17.27 MiB) -pairwise(corspearman,eachcol($xm),skipmissing=:none) 20.085 ms (1623 allocations: 17.27 MiB) -================================================================================================================================== - -=# \ No newline at end of file From 6a8fecda3e91cc00caa39f2452ae186c74bf1080 Mon Sep 17 00:00:00 2001 From: PGS62 Date: Tue, 2 Apr 2024 09:00:01 +0100 Subject: [PATCH 03/24] Partial solution to test failures on Julia 1.0.5 --- .vscode/settings.json | 3 ++- src/pairwise.jl | 2 +- src/rankcorr.jl | 36 +++++++++++++++++------------------- test/rankcorr.jl | 20 +++++++++++--------- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 9e26dfeeb..7a73a41bf 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1 +1,2 @@ -{} \ No newline at end of file +{ +} \ No newline at end of file diff --git a/src/pairwise.jl b/src/pairwise.jl index d2775fe7e..025566b16 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -104,7 +104,7 @@ function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetri if iscor && i == j && y[i] === x[j] && V !== Union{} && V!== Missing dest[j, i] = 1.0 else - dest[j, i] = f(handle_pairwise(x[j], y[i]; scratch_fx, scratch_fy)...) + dest[j, i] = f(handle_pairwise(x[j], y[i]; scratch_fx = scratch_fx, scratch_fy = scratch_fy)...) end symmetric && (dest[i, j] = dest[j, i]) end diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 1d5b3871a..338839025 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -38,17 +38,17 @@ function corspearman(x::AbstractVector, y::AbstractVector; skipmissing::Symbol=: end function corspearman(x::AbstractMatrix, y::AbstractVector; skipmissing::Symbol=:none) - return corspearman(x, reshape(y, (length(y), 1)); skipmissing) + return corspearman(x, reshape(y, (length(y), 1)); skipmissing=skipmissing) end function corspearman(x::AbstractVector, y::AbstractMatrix; skipmissing::Symbol=:none) - return corspearman(reshape(x, (length(x), 1)), y; skipmissing) + return corspearman(reshape(x, (length(x), 1)), y; skipmissing=skipmissing) end function corspearman(x::AbstractMatrix, y::AbstractMatrix=x; skipmissing::Symbol=:none) check_rankcor_args(x, y, skipmissing, true) - return pairwise(corspearman, eachcol(x), eachcol(y); skipmissing) + return pairwise(corspearman, _eachcol(x), _eachcol(y); skipmissing=skipmissing) end function corspearman(x::AbstractVector{T}) where {T} @@ -166,16 +166,13 @@ julia> using StatsBase, BenchmarkTools, Random julia> Random.seed!(1); -julia> x = ifelse.(rand(1000) .< 0.05,missing,randn(1000));y = ifelse.(rand(1000) .< 0.05,\ -missing,randn(1000)); +julia> x = ifelse.(rand(1000) .< 0.05,missing,randn(1000));y = ifelse.(rand(1000) .< 0.05, missing,randn(1000)); -julia> sortpermx=sortperm(x);sortpermy=sortperm(y);inds=zeros(Int64,1000);\ -spnmx=zeros(Int64,1000);spnmy=zeros(Int64,1000); +julia> sortpermx=sortperm(x);sortpermy=sortperm(y);inds=zeros(Int64,1000);spnmx=zeros(Int64,1000);spnmy=zeros(Int64,1000); julia> nmx=zeros(1000);nmy=zeros(1000);ranksx=similar(x,Float64);ranksy=similar(y,Float64); -julia> @btime corspearman_kernel!(\$x,\$y,:pairwise,\$sortpermx,\$sortpermy,\ -\$inds,\$spnmx,\$spnmy,\$nmx,\$nmy,\$ranksx,\$ranksy) +julia> @btime corspearman_kernel!(\$x,\$y,:pairwise,\$sortpermx,\$sortpermy,\$inds,\$spnmx,\$spnmy,\$nmx,\$nmy,\$ranksx,\$ranksy) 4.671 μs (0 allocations: 0 bytes) -0.016058512110609713 ``` @@ -417,7 +414,7 @@ Uses multiple threads when either `x` or `y` is a matrix. function corkendall(x::AbstractMatrix, y::AbstractMatrix=x; skipmissing::Symbol=:none) check_rankcor_args(x, y, skipmissing, true) - return pairwise(corkendall, eachcol(x), eachcol(y); skipmissing) + return pairwise(corkendall, _eachcol(x), _eachcol(y); skipmissing=skipmissing) end function corkendall(x::AbstractVector, y::AbstractVector; @@ -433,15 +430,17 @@ function corkendall(x::AbstractVector, y::AbstractVector; end end +_eachcol(x) = VERSION >= v"1.1" ? eachcol(x) : [view(x,:,j) for j in axes(x,2)] + #= corkendall returns a vector in this case, inconsistent with with Statistics.cor and StatsBase.corspearman, but consistent with StatsBase.corkendall. =# function corkendall(x::AbstractMatrix, y::AbstractVector; skipmissing::Symbol=:none) - return vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing)) + return vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing=skipmissing)) end function corkendall(x::AbstractVector, y::AbstractMatrix; skipmissing::Symbol=:none) - return corkendall(reshape(x, (length(x), 1)), y; skipmissing) + return corkendall(reshape(x, (length(x), 1)), y; skipmissing=skipmissing) end function corkendall(x::AbstractVector{T}) where {T} @@ -515,7 +514,8 @@ function corkendall_loop!(skipmissing::Symbol, f::typeof(corkendall), dest::Abst else yi .= y[i] dest[j, i] = corkendall_kernel!(sortedxj, yi, permx, skipmissing; - scratch_py, scratch_sy, scratch_fx, scratch_fy) + scratch_py=scratch_py, scratch_sy=scratch_sy, scratch_fx=scratch_fx, + scratch_fy=scratch_fy) end symmetric && (dest[i, j] = dest[j, i]) end @@ -531,17 +531,15 @@ end # JSTOR, www.jstor.org/stable/2282833. function check_rankcor_args(x, y, skipmissing, allowlistwise::Bool) - Base.require_one_based_indexing(x, y) + # Base.require_one_based_indexing(x, y) #TODO find how to reject non-1-based in a way that works in Julia 1.0.5 size(x, 1) == size(y, 1) || throw(DimensionMismatch("x and y have inconsistent dimensions")) if allowlistwise skipmissing in (:none, :pairwise, :listwise) || - throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise, \ - but got :$skipmissing")) + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise, but got :$skipmissing")) else skipmissing in (:none, :pairwise) || - throw(ArgumentError("skipmissing must be either :none or :pairwise, but \ - got :$skipmissing")) + throw(ArgumentError("skipmissing must be either :none or :pairwise, but got :$skipmissing")) end end @@ -585,7 +583,7 @@ function corkendall_kernel!(sortedx::AbstractVector{T}, y::AbstractVector{U}, end if missing isa T || missing isa V - sortedx, permutedy = handle_pairwise(sortedx, scratch_py; scratch_fx, scratch_fy) + sortedx, permutedy = handle_pairwise(sortedx, scratch_py; scratch_fx=scratch_fx, scratch_fy=scratch_fy) else permutedy = scratch_py end diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 484474d99..719870d3d 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -236,11 +236,13 @@ julia> corkendall(Matrix{Union{Missing,Float64}}(missing,5,3)) #DIFFERENT behavi @test (f(["a" "z"; "b" "y"; "c" "x"]) ≈ [1.0 -1.0; -1.0 1.0]) @test (f(["a" 3; "b" 2; "c" 1]) ≈ [1.0 -1.0; -1.0 1.0]) - #Works for zero size input - @test isequal(f([;;]), [;;]) - @test isequal(f([;;], [;;]), [;;]) - @test isequal(f([;;], [;;], skipmissing=:pairwise), [;;]) - @test isequal(f([;;], [;;], skipmissing=:listwise), [;;]) + #Works for zero size input ( [;;] not compatible with Julia 1.0.5) + let nada = Array{Any,2}(undef, 0, 0) + @test isequal(f(nada), nada) + @test isequal(f(nada, nada), nada) + @test isequal(f(nada, nada, skipmissing=:pairwise), nada) + @test isequal(f(nada, nada, skipmissing=:listwise), nada) + end # Wrong dimensions @test_throws DimensionMismatch f([1], [1, 2]) @@ -362,11 +364,11 @@ end factor" of 1.2 against the expected size of allocations. =# @test (@allocated corkendall(x)) < (897_808 + Threads.nthreads() * 58_044) * 1.2 - @test (@allocated corkendall(xm,skipmissing=:listwise)) < (1_119_392 + Threads.nthreads() * 22_172) * 1.2 - @test (@allocated corkendall(xm,skipmissing=:pairwise)) < (892_112 + Threads.nthreads() * 61_116) * 1.2 + @test (@allocated corkendall(xm, skipmissing=:listwise)) < (1_119_392 + Threads.nthreads() * 22_172) * 1.2 + @test (@allocated corkendall(xm, skipmissing=:pairwise)) < (892_112 + Threads.nthreads() * 61_116) * 1.2 @test (@allocated corspearman(x)) < (2_678_448 + Threads.nthreads() * 9_128) * 1.2 - @test (@allocated corspearman(xm,skipmissing=:listwise)) < (1_803_712 + Threads.nthreads() * 3_992) * 1.2 - @test (@allocated corspearman(xm,skipmissing=:pairwise)) < (1_692_208 + Threads.nthreads() * 67_172) * 1.2 + @test (@allocated corspearman(xm, skipmissing=:listwise)) < (1_803_712 + Threads.nthreads() * 3_992) * 1.2 + @test (@allocated corspearman(xm, skipmissing=:pairwise)) < (1_692_208 + Threads.nthreads() * 67_172) * 1.2 end # COV_EXCL_STOP \ No newline at end of file From eff32e4c0760201d63cfaa9d561558479740181d Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:12:57 +0100 Subject: [PATCH 04/24] Reverted to using eachcol in the hope that support for Julia 1.0.5 will be dropped. --- src/rankcorr.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 338839025..ca59753ca 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -48,7 +48,7 @@ end function corspearman(x::AbstractMatrix, y::AbstractMatrix=x; skipmissing::Symbol=:none) check_rankcor_args(x, y, skipmissing, true) - return pairwise(corspearman, _eachcol(x), _eachcol(y); skipmissing=skipmissing) + return pairwise(corspearman, eachcol(x), eachcol(y); skipmissing=skipmissing) end function corspearman(x::AbstractVector{T}) where {T} @@ -414,7 +414,7 @@ Uses multiple threads when either `x` or `y` is a matrix. function corkendall(x::AbstractMatrix, y::AbstractMatrix=x; skipmissing::Symbol=:none) check_rankcor_args(x, y, skipmissing, true) - return pairwise(corkendall, _eachcol(x), _eachcol(y); skipmissing=skipmissing) + return pairwise(corkendall, eachcol(x), eachcol(y); skipmissing=skipmissing) end function corkendall(x::AbstractVector, y::AbstractVector; @@ -430,8 +430,6 @@ function corkendall(x::AbstractVector, y::AbstractVector; end end -_eachcol(x) = VERSION >= v"1.1" ? eachcol(x) : [view(x,:,j) for j in axes(x,2)] - #= corkendall returns a vector in this case, inconsistent with with Statistics.cor and StatsBase.corspearman, but consistent with StatsBase.corkendall. =# From 1a4d064d63cd2b09f3153e5524656c07d407d077 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:13:16 +0100 Subject: [PATCH 05/24] Deleted commented out tests --- test/pairwise.jl | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/test/pairwise.jl b/test/pairwise.jl index 35ecd8279..6a3bf1df1 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -53,9 +53,7 @@ arbitrary_fun(x, y) = cor(x, y) @inferred pairwise(f, x, y) if throwsforzerolengthinput - # @test_throws Union{ArgumentError,MethodError} pairwise(f, [Int[]], [Int[]]) @test_throws CompositeException pairwise(f, [Int[]], [Int[]]) - #@test_throws Union{ArgumentError,MethodError} pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) @test_throws CompositeException pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) end @@ -122,23 +120,18 @@ arbitrary_fun(x, y) = cor(x, y) return x[which], y[which] end - #@test isapprox(res, [f(collect.(skipmissings(xi, yi))...) for xi in xm, yi in ym], - # rtol=1e-6) @test isapprox(res, [f(myskipmissings(xi, yi)...) for xi in xm, yi in ym], rtol=1e-6) res = pairwise(f, ym, zm, skipmissing=:pairwise) if isrankcor @test res isa Matrix{Float64} - # @test isequal(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm]) @test isequal(res, [f(myskipmissings(yi, zi)...) for yi in ym, zi in zm]) else @test res isa Matrix{Float32} res2 = zeros(Union{Float32,Missing}, size(res)) @test pairwise!(f, res2, ym, zm, skipmissing=:pairwise) === res2 @test res ≅ res2 - #@test isapprox(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm], - # rtol=1e-6) @test isapprox(res, [f(myskipmissings(yi, zi)...) for yi in ym, zi in zm], rtol=1e-6) @@ -189,15 +182,11 @@ arbitrary_fun(x, y) = cor(x, y) if VERSION >= v"1.5" # Fails with UndefVarError on Julia 1.0 if throwsforzerolengthinput - #@test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:pairwise) @test_throws CompositeException pairwise(f, xm, ym, skipmissing=:pairwise) - #@test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:listwise) @test_throws CompositeException pairwise(f, xm, ym, skipmissing=:listwise) res = zeros(Union{Float64,Missing}, length(xm), length(ym)) - #@test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:pairwise) @test_throws CompositeException pairwise!(f, res, xm, ym, skipmissing=:pairwise) - #@test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:listwise) @test_throws CompositeException pairwise!(f, res, xm, ym, skipmissing=:listwise) end end From 893d4b67e8de28ac4ec6f48b0c812073dd21c1d6 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:16:44 +0100 Subject: [PATCH 06/24] Delete settings.json --- .vscode/settings.json | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 7a73a41bf..000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,2 +0,0 @@ -{ -} \ No newline at end of file From e2c9ef59e98593ef29c14924f030a28088865aab Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:19:45 +0100 Subject: [PATCH 07/24] Update pairwise.jl in response to some of devmotion's comments --- src/pairwise.jl | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 025566b16..c8bbf2357 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -5,15 +5,15 @@ function _pairwise!(::Val{:none}, f, dest::AbstractMatrix{V}, x, y, # Swap x and y for more efficient threaded loop. if nr < nc - dest′ = reshape(dest, size(dest, 2), size(dest, 1)) - _pairwise!(Val(:none), f, dest′, y, x, symmetric) - dest .= transpose(dest′) + dest2 = reshape(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:none), f, dest2, y, x, symmetric) + dest .= transpose(dest2) return dest end #cor and friends are special-cased. - iscor = (f in (corkendall, corspearman, cor)) - (iscor || f == cov) && (symmetric = x === y) + diag_is_1 = (f in (corkendall, corspearman, cor)) + (diag_is_1 || f == cov) && (symmetric = x === y) #cov(x) is faster than cov(x, x) (f == cov) && (f = ((x, y) -> x === y ? cov(x) : cov(x, y))) @@ -21,13 +21,13 @@ function _pairwise!(::Val{:none}, f, dest::AbstractMatrix{V}, x, y, for j in subset for i = 1:(symmetric ? j : nc) # For performance, diagonal is special-cased - if iscor && i==j && y[i] === x[j] && V !== Union{} + if diag_is_1 && i == j && y[i] === x[j] && V !== Union{} dest[j, i] = V === Missing ? missing : 1.0 else dest[j, i] = f(x[j], y[i]) end - symmetric && (dest[i, j] = dest[j, i]) end + symmetric && LinearAlgebra.copytri!(dest, 'L') end end return dest @@ -81,15 +81,15 @@ function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetri # Swap x and y for more efficient threaded loop. if nr < nc - dest′ = reshape(dest, size(dest, 2), size(dest, 1)) - _pairwise!(Val(:pairwise), f, dest′, y, x, symmetric) - dest .= transpose(dest′) + dest2 = reshape(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:pairwise), f, dest2, y, x, symmetric) + dest .= transpose(dest2) return dest end #cor and friends are special-cased. - iscor = (f in (corkendall, corspearman, cor)) - (iscor || f == cov) && (symmetric = x === y) + diag_is_1 = (f in (corkendall, corspearman, cor)) + (diag_is_1 || f == cov) && (symmetric = x === y) #cov(x) is faster than cov(x, x) (f == cov) && (f = ((x, y) -> x === y ? cov(x) : cov(x, y))) @@ -97,17 +97,17 @@ function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetri nmty = promoted_nmtype(y)[] Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + scratch_fx = task_local_vector(:scratch_fx, nmtx, m) + scratch_fy = task_local_vector(:scratch_fy, nmty, m) for j in subset - scratch_fx = task_local_vector(:scratch_fx, nmtx, m) - scratch_fy = task_local_vector(:scratch_fy, nmty, m) for i = 1:(symmetric ? j : nc) - if iscor && i == j && y[i] === x[j] && V !== Union{} && V!== Missing + if diag_is_1 && i == j && y[i] === x[j] && V !== Union{} && V !== Missing dest[j, i] = 1.0 else - dest[j, i] = f(handle_pairwise(x[j], y[i]; scratch_fx = scratch_fx, scratch_fy = scratch_fy)...) + dest[j, i] = f(handle_pairwise(x[j], y[i]; scratch_fx=scratch_fx, scratch_fy=scratch_fy)...) end - symmetric && (dest[i, j] = dest[j, i]) end + symmetric && LinearAlgebra.copytri!(dest, 'L') end end return dest @@ -255,7 +255,7 @@ julia> dest ``` """ function pairwise!(f, dest::AbstractMatrix, x, y=x; - symmetric::Bool=false, skipmissing::Symbol=:none) + symmetric::Bool=false, skipmissing::Symbol=:none) check_vectors(x, y, skipmissing, symmetric) return _pairwise!(f, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) end From 4c2e7a06988256d3c5200f5e860940a0f6fadb2d Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:37:37 +0100 Subject: [PATCH 08/24] Was calling copytry! too soon. Fixed that. --- src/pairwise.jl | 4 ++-- src/rankcorr.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index c8bbf2357..b0b79448a 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -27,9 +27,9 @@ function _pairwise!(::Val{:none}, f, dest::AbstractMatrix{V}, x, y, dest[j, i] = f(x[j], y[i]) end end - symmetric && LinearAlgebra.copytri!(dest, 'L') end end + symmetric && LinearAlgebra.copytri!(dest, 'L') return dest end @@ -107,9 +107,9 @@ function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetri dest[j, i] = f(handle_pairwise(x[j], y[i]; scratch_fx=scratch_fx, scratch_fy=scratch_fy)...) end end - symmetric && LinearAlgebra.copytri!(dest, 'L') end end + symmetric && LinearAlgebra.copytri!(dest, 'L') return dest end diff --git a/src/rankcorr.jl b/src/rankcorr.jl index ca59753ca..e94db2f04 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -137,10 +137,10 @@ function _pairwise!(::Val{:pairwise}, f::typeof(corspearman), view(tempx, :, j), view(tempy, :, i), inds, spnmx, spnmy, nmx, nmy, ranksx, ranksy) end - symmetric && (dest[i, j] = dest[j, i]) end end end + symmetric && LinearAlgebra.copytri!(dest, 'L') return dest end @@ -515,10 +515,10 @@ function corkendall_loop!(skipmissing::Symbol, f::typeof(corkendall), dest::Abst scratch_py=scratch_py, scratch_sy=scratch_sy, scratch_fx=scratch_fx, scratch_fy=scratch_fy) end - symmetric && (dest[i, j] = dest[j, i]) end end end + symmetric && LinearAlgebra.copytri!(dest, 'L') return dest end From 2119804bb83450a500b41349f7766de6a24c5692 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Tue, 2 Apr 2024 18:22:41 +0100 Subject: [PATCH 09/24] Update pairwise.jl --- src/pairwise.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index b0b79448a..f55823219 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -381,16 +381,16 @@ function handle_pairwise(x::AbstractVector, y::AbstractVector; axes(x) == axes(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) lb = first(axes(x, 1)) - j = lb - 1 + j = lb @inbounds for i in eachindex(x) if !(ismissing(x[i]) || ismissing(y[i])) - j += 1 scratch_fx[j] = x[i] scratch_fy[j] = y[i] + j += 1 end end - return view(scratch_fx, lb:j), view(scratch_fy, lb:j) + return view(scratch_fx, lb:(j-1)), view(scratch_fy, lb:(j-1)) end #=Condition a) makes equal_sum_subsets useful for load-balancing the multi-threaded section From 5da5943902e992e2dbb9851a02cf6833ffef20e0 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:47:20 +0100 Subject: [PATCH 10/24] nested loops more friendly to Julia being column-major --- src/pairwise.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index f55823219..11dbbcda5 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -4,8 +4,8 @@ function _pairwise!(::Val{:none}, f, dest::AbstractMatrix{V}, x, y, nr, nc = size(dest) # Swap x and y for more efficient threaded loop. - if nr < nc - dest2 = reshape(dest, size(dest, 2), size(dest, 1)) + if nc < nr + dest2 = similar(dest, size(dest, 2), size(dest, 1)) _pairwise!(Val(:none), f, dest2, y, x, symmetric) dest .= transpose(dest2) return dest @@ -17,14 +17,14 @@ function _pairwise!(::Val{:none}, f, dest::AbstractMatrix{V}, x, y, #cov(x) is faster than cov(x, x) (f == cov) && (f = ((x, y) -> x === y ? cov(x) : cov(x, y))) - Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + Threads.@threads for subset in equal_sum_subsets(nc, Threads.nthreads()) for j in subset - for i = 1:(symmetric ? j : nc) + for i = (symmetric ? j : 1):nr # For performance, diagonal is special-cased - if diag_is_1 && i == j && y[i] === x[j] && V !== Union{} - dest[j, i] = V === Missing ? missing : 1.0 + if diag_is_1 && i == j && x[i] === y[j] + dest[i, j] = V === Missing ? missing : 1.0 else - dest[j, i] = f(x[j], y[i]) + dest[i, j] = f(x[i], y[j]) end end end @@ -80,8 +80,8 @@ function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetri m = length(x) == 0 ? 0 : length(first(x)) # Swap x and y for more efficient threaded loop. - if nr < nc - dest2 = reshape(dest, size(dest, 2), size(dest, 1)) + if nc < nr + dest2 = similar(dest, size(dest, 2), size(dest, 1)) _pairwise!(Val(:pairwise), f, dest2, y, x, symmetric) dest .= transpose(dest2) return dest @@ -96,15 +96,15 @@ function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetri nmtx = promoted_nmtype(x)[] nmty = promoted_nmtype(y)[] - Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + Threads.@threads for subset in equal_sum_subsets(nc, Threads.nthreads()) scratch_fx = task_local_vector(:scratch_fx, nmtx, m) scratch_fy = task_local_vector(:scratch_fy, nmty, m) for j in subset - for i = 1:(symmetric ? j : nc) - if diag_is_1 && i == j && y[i] === x[j] && V !== Union{} && V !== Missing - dest[j, i] = 1.0 + for i = (symmetric ? j : 1):nr + if diag_is_1 && i == j && x[i] === y[j] && V !== Missing + dest[i, j] = 1.0 else - dest[j, i] = f(handle_pairwise(x[j], y[i]; scratch_fx=scratch_fx, scratch_fy=scratch_fy)...) + dest[i, j] = f(handle_pairwise(x[i], y[j]; scratch_fx=scratch_fx, scratch_fy=scratch_fy)...) end end end From baf193b72446c217a158193bbe8b5280c8a996fd Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:48:01 +0100 Subject: [PATCH 11/24] flipped i and j loop variables. reshape -> similar --- src/rankcorr.jl | 53 ++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index e94db2f04..2b06d9896 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -99,9 +99,9 @@ function _pairwise!(::Val{:pairwise}, f::typeof(corspearman), # Swap x and y for more efficient threaded loop. if nr < nc - dest′ = reshape(dest, size(dest, 2), size(dest, 1)) - _pairwise!(Val(:pairwise), f, dest′, y, x, symmetric) - dest .= transpose(dest′) + dest2 = similar(dest, size(dest, 2), size(dest, 1)) + _pairwise!(Val(:pairwise), f, dest2, y, x, symmetric) + dest .= transpose(dest2) return dest end @@ -114,7 +114,7 @@ function _pairwise!(::Val{:pairwise}, f::typeof(corspearman), nmty = promoted_nmtype(y)[] Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) - for j in subset + for i in subset inds = task_local_vector(:inds, int64, m) spnmx = task_local_vector(:spnmx, int64, m) @@ -124,17 +124,17 @@ function _pairwise!(::Val{:pairwise}, f::typeof(corspearman), ranksx = task_local_vector(:ranksx, fl64, m) ranksy = task_local_vector(:ranksy, fl64, m) - for i = 1:(symmetric ? j : nc) + for j = 1:(symmetric ? i : nc) # For performance, diagonal is special-cased - if x[j] === y[i] && i == j && V !== Union{} - if missing isa V && eltype(x[j]) == Missing - dest[j, i] = missing + if x[i] === y[j] && i == j + if missing isa V && eltype(x[i]) == Missing + dest[i, j] = missing else - dest[j, i] = 1.0 + dest[i, j] = 1.0 end else - dest[j, i] = corspearman_kernel!(x[j], y[i], :pairwise, - view(tempx, :, j), view(tempy, :, i), inds, spnmx, spnmy, nmx, + dest[i, j] = corspearman_kernel!(x[i], y[j], :pairwise, + view(tempx, :, i), view(tempy, :, j), inds, spnmx, spnmy, nmx, nmy, ranksx, ranksy) end end @@ -468,9 +468,9 @@ function corkendall_loop!(skipmissing::Symbol, f::typeof(corkendall), dest::Abst # Swap x and y for more efficient threaded loop. if nr < nc - dest′ = reshape(dest, size(dest, 2), size(dest, 1)) - corkendall_loop!(skipmissing, f, dest′, y, x, symmetric) - dest .= transpose(dest′) + dest2 = similar(dest, size(dest, 2), size(dest, 1)) + corkendall_loop!(skipmissing, f, dest2, y, x, symmetric) + dest .= transpose(dest2) return dest end @@ -484,11 +484,11 @@ function corkendall_loop!(skipmissing::Symbol, f::typeof(corkendall), dest::Abst Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) - for j in subset + for i in subset sortedxj = task_local_vector(:sortedxj, t, m) scratch_py = task_local_vector(:scratch_py, u, m) - yi = task_local_vector(:yi, u, m) + yj = task_local_vector(:yj, u, m) permx = task_local_vector(:permx, intvec, m) # Ensuring missing is not an element type of scratch_sy, scratch_fx, scratch_fy # gives improved performance. @@ -496,22 +496,22 @@ function corkendall_loop!(skipmissing::Symbol, f::typeof(corkendall), dest::Abst scratch_fx = task_local_vector(:scratch_fx, t′, m) scratch_fy = task_local_vector(:scratch_fy, t′, m) - sortperm!(permx, x[j]) + sortperm!(permx, x[i]) @inbounds for k in eachindex(sortedxj) - sortedxj[k] = x[j][permx[k]] + sortedxj[k] = x[i][permx[k]] end - for i = 1:(symmetric ? j : nc) + for j = 1:(symmetric ? i : nc) # For performance, diagonal is special-cased - if x[j] === y[i] && i == j && V !== Union{} - if missing isa V && eltype(x[j]) == Missing - dest[j, i] = missing + if x[i] === y[j] && j == i + if missing isa V && eltype(x[i]) == Missing + dest[i, j] = missing else - dest[j, i] = 1.0 + dest[i, j] = 1.0 end else - yi .= y[i] - dest[j, i] = corkendall_kernel!(sortedxj, yi, permx, skipmissing; + yj .= y[j] + dest[i, j] = corkendall_kernel!(sortedxj, yj, permx, skipmissing; scratch_py=scratch_py, scratch_sy=scratch_sy, scratch_fx=scratch_fx, scratch_fy=scratch_fy) end @@ -746,5 +746,4 @@ end # can also accept arguments with element type for which isnan is not defined but isless is # is defined, so that rank correlation makes sense. _isnan(x::T) where {T<:Number} = isnan(x) -_isnan(x) = false - +_isnan(x) = false \ No newline at end of file From a1d9ae994ffdc5868b34de47bdb42daa163b34e3 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:33:53 +0100 Subject: [PATCH 12/24] Added iterators EqualSumSubsets and TwoStepRange --- src/pairwise.jl | 139 +++++++++++++++++++++++++++++++++++------------ src/rankcorr.jl | 4 +- test/rankcorr.jl | 13 ++++- 3 files changed, 115 insertions(+), 41 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 11dbbcda5..641f02db8 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -17,7 +17,7 @@ function _pairwise!(::Val{:none}, f, dest::AbstractMatrix{V}, x, y, #cov(x) is faster than cov(x, x) (f == cov) && (f = ((x, y) -> x === y ? cov(x) : cov(x, y))) - Threads.@threads for subset in equal_sum_subsets(nc, Threads.nthreads()) + Threads.@threads for subset in EqualSumSubsets(nc, Threads.nthreads()) for j in subset for i = (symmetric ? j : 1):nr # For performance, diagonal is special-cased @@ -96,7 +96,7 @@ function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix{V}, x, y, symmetri nmtx = promoted_nmtype(x)[] nmty = promoted_nmtype(y)[] - Threads.@threads for subset in equal_sum_subsets(nc, Threads.nthreads()) + Threads.@threads for subset in EqualSumSubsets(nc, Threads.nthreads()) scratch_fx = task_local_vector(:scratch_fx, nmtx, m) scratch_fy = task_local_vector(:scratch_fy, nmty, m) for j in subset @@ -393,51 +393,118 @@ function handle_pairwise(x::AbstractVector, y::AbstractVector; return view(scratch_fx, lb:(j-1)), view(scratch_fy, lb:(j-1)) end -#=Condition a) makes equal_sum_subsets useful for load-balancing the multi-threaded section -of _pairwise! in the non-symmetric case, and condition b) for the symmetric case.=# """ - equal_sum_subsets(n::Int, num_subsets::Int)::Vector{Vector{Int}} + task_local_vector(key::Symbol, similarto::AbstractArray{V}, + length::Int)::Vector{V} where {V} + +Retrieve from task local storage a vector of length `length` and matching the element +type of `similarto`, with initialisation on first call during a task. +""" +function task_local_vector(key::Symbol, similarto::AbstractArray{V}, + length::Int)::Vector{V} where {V} + haskey(task_local_storage(), key) || task_local_storage(key, similar(similarto, length)) + return task_local_storage(key) +end -Divide the integers 1:n into a number of subsets such that a) each subset has -(approximately) the same number of elements; and b) the sum of the elements in each subset -is nearly equal. If `n` is a multiple of `2 * num_subsets` both conditions hold exactly. +""" + EqualSumSubsets + +An iterator that partitions the integers 1 to n into `num_subsets` "subsets" (of type +TwoStepRange) such that a) each subset is of nearly equal length; and b) the sum of the +elements in each subset is nearly equal. If `n` is a multiple of `2 * num_subsets` both +conditions hold exactly. ## Example ```julia-repl -julia> StatsBase.equal_sum_subsets(30,5) -5-element Vector{Vector{Int64}}: - [30, 21, 20, 11, 10, 1] - [29, 22, 19, 12, 9, 2] - [28, 23, 18, 13, 8, 3] - [27, 24, 17, 14, 7, 4] - [26, 25, 16, 15, 6, 5] +julia> for s in StatsBase.EqualSumSubsets(30,5) +println((collect(s), sum(s))) +end +([30, 21, 20, 11, 10, 1], 93) +([29, 22, 19, 12, 9, 2], 93) +([28, 23, 18, 13, 8, 3], 93) +([27, 24, 17, 14, 7, 4], 93) +([26, 25, 16, 15, 6, 5], 93) + +#Check for correct partitioning, in this case of integers 1:1000 into 17 subsets. +julia> sort(vcat([collect(s) for s in StatsBase.EqualSumSubsets(1000,17)]...))==1:1000 +true + ``` """ -function equal_sum_subsets(n::Int, num_subsets::Int)::Vector{Vector{Int}} - subsets = [Int[] for _ in 1:min(n, num_subsets)] - writeto, scanup = 1, true - for i = n:-1:1 - push!(subsets[writeto], i) - if scanup && writeto == num_subsets - scanup = false - elseif (!scanup) && writeto == 1 - scanup = true - else - writeto += scanup ? 1 : -1 - end +struct EqualSumSubsets + n::Int64 + num_subsets::Int64 + + function EqualSumSubsets(n, num_subsets) + n >= 0 || throw("n must not be negative, but got $n") + num_subsets > 0 || throw("num_subsets must be positive, but got $num_subsets") + return new(n, num_subsets) end - return subsets + +end + +Base.eltype(::EqualSumSubsets) = TwoStepRange +Base.length(x::EqualSumSubsets) = min(x.n, x.num_subsets) +Base.firstindex(::EqualSumSubsets) = 1 + +function Base.iterate(ess::EqualSumSubsets, state::Int64=1) + state > length(ess) && return nothing + return getindex(ess, state), state + 1 +end + +function Base.getindex(ess::EqualSumSubsets, i::Int64=1) + n, nss = ess.n, ess.num_subsets + step1 = 2i - 2nss - 1 + step2 = 1 - 2i + return TwoStepRange(n - i + 1, step1, step2) end """ - task_local_vector(key::Symbol, similarto::AbstractArray{V}, - length::Int)::Vector{V} where {V} +TwoStepRange -Retrieve from task local storage a vector of length `length` and matching the element -type of `similarto`, with initialisation on first call during a task. +Range with a starting value of `start`, stop value of `1` and a step that alternates +between `step1` and `step2`. `start` must be positive and `step1` and `step2` must be +negative. + +# Examples +```jldoctest +julia> collect(StatsBase.TwoStepRange(30,-7,-3)) +6-element Vector{Int64}: + 30 + 23 + 20 + 13 + 10 + 3 + +``` """ -function task_local_vector(key::Symbol, similarto::AbstractArray{V}, - length::Int)::Vector{V} where {V} - haskey(task_local_storage(), key) || task_local_storage(key, similar(similarto, length)) - return task_local_storage(key) +struct TwoStepRange + start::Int64 + step1::Int64 + step2::Int64 + + function TwoStepRange(start, step1, step2) + start > 0 || throw("start must be positive, but got $start") + step1 < 0 || throw("step1 must be negative, but got $step1") + step2 < 0 || throw("step2 must be negative, but got $step2") + return new(start, step1, step2) + end +end + +Base.eltype(::TwoStepRange) = Int64 + +function Base.length(tsr::TwoStepRange) + return length((tsr.start):(tsr.step1+tsr.step2):1) + + length((tsr.start+tsr.step1):(tsr.step1+tsr.step2):1) +end + +function Base.iterate(tsr::TwoStepRange, i::Int64=1) + (i > length(tsr)) && return nothing + return getindex(tsr, i), i + 1 +end + +function Base.getindex(tsr::TwoStepRange, i::Int64=1)::Int64 + a, b = divrem(i - 1, 2) + return tsr.start + (tsr.step1 + tsr.step2) * a + b * tsr.step1 end \ No newline at end of file diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 2b06d9896..dca82770e 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -112,7 +112,7 @@ function _pairwise!(::Val{:pairwise}, f::typeof(corspearman), fl64 = Float64[] nmtx = promoted_nmtype(x)[] nmty = promoted_nmtype(y)[] - Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + Threads.@threads for subset in EqualSumSubsets(nr, Threads.nthreads()) for i in subset @@ -482,7 +482,7 @@ function corkendall_loop!(skipmissing::Symbol, f::typeof(corkendall), dest::Abst symmetric = x === y - Threads.@threads for subset in equal_sum_subsets(nr, Threads.nthreads()) + Threads.@threads for subset in EqualSumSubsets(nr, Threads.nthreads()) for i in subset diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 719870d3d..9cd9ee1ce 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -32,9 +32,16 @@ using Test @test StatsBase.midpoint(1, 10) == 5 @test StatsBase.midpoint(1, widen(10)) == 5 - @test StatsBase.equal_sum_subsets(0, 1) == Vector{Int64}[] - @test sum.(StatsBase.equal_sum_subsets(100, 5)) == repeat([1010], 5) - @test sort(vcat(StatsBase.equal_sum_subsets(500, 7)...)) == collect(1:500) + for n in 1:200, nss in 1:7 + #check is a partition + @test sort(vcat([collect(s) for s in StatsBase.EqualSumSubsets(n, nss)]...)) == 1:n + #check near-equal lengths + lengths = [length(s) for s in StatsBase.EqualSumSubsets(n, nss)] + @test (maximum(lengths) - minimum(lengths)) <= 1 + #check near-equal sums + sums = [sum(collect(s)) for s in StatsBase.EqualSumSubsets(n, nss)] + @test (maximum(sums) - minimum(sums)) < nss + end end From 951b3916ca2da28898d296b8539e57ba9498bcbe Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:27:35 +0100 Subject: [PATCH 13/24] Int64 -> Int to work on 32bit --- src/pairwise.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/pairwise.jl b/src/pairwise.jl index 641f02db8..96a204b57 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -222,7 +222,7 @@ presence `missing`, `NaN` or `Inf` entries. Only allowed when entries in `x` and `y` are vectors. # Examples -```jldoctest +```julia-repl julia> using StatsBase, Statistics julia> dest = zeros(3, 3); @@ -287,7 +287,7 @@ presence `missing`, `NaN` or `Inf` entries. Only allowed when entries in `x` and `y` are vectors. # Examples -```jldoctest +```julia-repl julia> using StatsBase, Statistics julia> x = [1 3 7 @@ -432,8 +432,8 @@ true ``` """ struct EqualSumSubsets - n::Int64 - num_subsets::Int64 + n::Int + num_subsets::Int function EqualSumSubsets(n, num_subsets) n >= 0 || throw("n must not be negative, but got $n") @@ -447,12 +447,12 @@ Base.eltype(::EqualSumSubsets) = TwoStepRange Base.length(x::EqualSumSubsets) = min(x.n, x.num_subsets) Base.firstindex(::EqualSumSubsets) = 1 -function Base.iterate(ess::EqualSumSubsets, state::Int64=1) +function Base.iterate(ess::EqualSumSubsets, state::Int=1) state > length(ess) && return nothing return getindex(ess, state), state + 1 end -function Base.getindex(ess::EqualSumSubsets, i::Int64=1) +function Base.getindex(ess::EqualSumSubsets, i::Int=1) n, nss = ess.n, ess.num_subsets step1 = 2i - 2nss - 1 step2 = 1 - 2i @@ -467,7 +467,7 @@ between `step1` and `step2`. `start` must be positive and `step1` and `step2` mu negative. # Examples -```jldoctest +```julia-repl julia> collect(StatsBase.TwoStepRange(30,-7,-3)) 6-element Vector{Int64}: 30 @@ -480,9 +480,9 @@ julia> collect(StatsBase.TwoStepRange(30,-7,-3)) ``` """ struct TwoStepRange - start::Int64 - step1::Int64 - step2::Int64 + start::Int + step1::Int + step2::Int function TwoStepRange(start, step1, step2) start > 0 || throw("start must be positive, but got $start") @@ -492,19 +492,19 @@ struct TwoStepRange end end -Base.eltype(::TwoStepRange) = Int64 +Base.eltype(::TwoStepRange) = Int function Base.length(tsr::TwoStepRange) return length((tsr.start):(tsr.step1+tsr.step2):1) + length((tsr.start+tsr.step1):(tsr.step1+tsr.step2):1) end -function Base.iterate(tsr::TwoStepRange, i::Int64=1) +function Base.iterate(tsr::TwoStepRange, i::Int=1) (i > length(tsr)) && return nothing return getindex(tsr, i), i + 1 end -function Base.getindex(tsr::TwoStepRange, i::Int64=1)::Int64 +function Base.getindex(tsr::TwoStepRange, i::Int=1)::Int a, b = divrem(i - 1, 2) return tsr.start + (tsr.step1 + tsr.step2) * a + b * tsr.step1 end \ No newline at end of file From 75d206a31c1a34978ecb2e33de44d13fd56d2cee Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:28:03 +0100 Subject: [PATCH 14/24] Update tests --- test/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 9cd9ee1ce..af79131b8 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -32,7 +32,7 @@ using Test @test StatsBase.midpoint(1, 10) == 5 @test StatsBase.midpoint(1, widen(10)) == 5 - for n in 1:200, nss in 1:7 + for n in vcat(1:5, 10:20:90,1000), nss in [1, 4, 8, 20, 32, 64] #check is a partition @test sort(vcat([collect(s) for s in StatsBase.EqualSumSubsets(n, nss)]...)) == 1:n #check near-equal lengths From 56d27cc2dc3b994aafd47df1d1d69c9062ec6c81 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:39:44 +0100 Subject: [PATCH 15/24] Update tests --- test/rankcorr.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index af79131b8..381752e45 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -370,12 +370,12 @@ end number of threads, thanks to use of task-local storage. The tests below have a "safety factor" of 1.2 against the expected size of allocations. =# - @test (@allocated corkendall(x)) < (897_808 + Threads.nthreads() * 58_044) * 1.2 - @test (@allocated corkendall(xm, skipmissing=:listwise)) < (1_119_392 + Threads.nthreads() * 22_172) * 1.2 - @test (@allocated corkendall(xm, skipmissing=:pairwise)) < (892_112 + Threads.nthreads() * 61_116) * 1.2 + @test (@allocated corkendall(x)) < (896_144 + Threads.nthreads() * 57_976) * 1.2 + @test (@allocated corkendall(xm,skipmissing=:listwise)) < (1_117_728 + Threads.nthreads() * 22_104) * 1.2 + @test (@allocated corkendall(xm,skipmissing=:pairwise)) < (890_448 + Threads.nthreads() * 61_048) * 1.2 @test (@allocated corspearman(x)) < (2_678_448 + Threads.nthreads() * 9_128) * 1.2 - @test (@allocated corspearman(xm, skipmissing=:listwise)) < (1_803_712 + Threads.nthreads() * 3_992) * 1.2 - @test (@allocated corspearman(xm, skipmissing=:pairwise)) < (1_692_208 + Threads.nthreads() * 67_172) * 1.2 - + @test (@allocated corspearman(xm,skipmissing=:listwise)) < (1_803_712 + Threads.nthreads() * 3_992) * 1.2 + @test (@allocated corspearman(xm,skipmissing=:pairwise)) < (1_690_544 + Threads.nthreads() * 67_104) * 1.2 + end # COV_EXCL_STOP \ No newline at end of file From 682859a1179870ae207ea5e5502ccd3a598edff4 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 5 Apr 2024 18:10:35 +0100 Subject: [PATCH 16/24] Tests now pass on Julia 1.8.5 No longer use eachcol when VERSION < v"1.9" --- src/rankcorr.jl | 18 +++++++++++++++--- test/pairwise.jl | 14 +++++++------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index dca82770e..eb9c4c211 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -48,7 +48,11 @@ end function corspearman(x::AbstractMatrix, y::AbstractMatrix=x; skipmissing::Symbol=:none) check_rankcor_args(x, y, skipmissing, true) - return pairwise(corspearman, eachcol(x), eachcol(y); skipmissing=skipmissing) + if x === y + return pairwise(corspearman, _eachcol(x); skipmissing=skipmissing) + else + return pairwise(corspearman, _eachcol(x), _eachcol(y); skipmissing=skipmissing) + end end function corspearman(x::AbstractVector{T}) where {T} @@ -414,7 +418,11 @@ Uses multiple threads when either `x` or `y` is a matrix. function corkendall(x::AbstractMatrix, y::AbstractMatrix=x; skipmissing::Symbol=:none) check_rankcor_args(x, y, skipmissing, true) - return pairwise(corkendall, eachcol(x), eachcol(y); skipmissing=skipmissing) + if x === y + return pairwise(corkendall, _eachcol(x); skipmissing=skipmissing) + else + return pairwise(corkendall, _eachcol(x), _eachcol(y); skipmissing=skipmissing) + end end function corkendall(x::AbstractVector, y::AbstractVector; @@ -746,4 +754,8 @@ end # can also accept arguments with element type for which isnan is not defined but isless is # is defined, so that rank correlation makes sense. _isnan(x::T) where {T<:Number} = isnan(x) -_isnan(x) = false \ No newline at end of file +_isnan(x) = false + +# eachcol was added in Julia 1.1 but for Julia 1.8, keys(eachcol(x)) fails for any Matrix x +# which causes `check_vectors` to fail. Solution is to use the comprehension below. +_eachcol(x) = VERSION < v"1.9" ? [view(x, :, i) for i in axes(x, 2)] : eachcol(x) \ No newline at end of file diff --git a/test/pairwise.jl b/test/pairwise.jl index 6a3bf1df1..77ab279ea 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -53,8 +53,8 @@ arbitrary_fun(x, y) = cor(x, y) @inferred pairwise(f, x, y) if throwsforzerolengthinput - @test_throws CompositeException pairwise(f, [Int[]], [Int[]]) - @test_throws CompositeException pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + @test_throws Union{CompositeException,TaskFailedException} pairwise(f, [Int[]], [Int[]]) + @test_throws Union{CompositeException,TaskFailedException} pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) end res = pairwise(f, [], []) @@ -182,12 +182,12 @@ arbitrary_fun(x, y) = cor(x, y) if VERSION >= v"1.5" # Fails with UndefVarError on Julia 1.0 if throwsforzerolengthinput - @test_throws CompositeException pairwise(f, xm, ym, skipmissing=:pairwise) - @test_throws CompositeException pairwise(f, xm, ym, skipmissing=:listwise) + @test_throws Union{CompositeException,TaskFailedException} pairwise(f, xm, ym, skipmissing=:pairwise) + @test_throws Union{CompositeException,TaskFailedException} pairwise(f, xm, ym, skipmissing=:listwise) res = zeros(Union{Float64,Missing}, length(xm), length(ym)) - @test_throws CompositeException pairwise!(f, res, xm, ym, skipmissing=:pairwise) - @test_throws CompositeException pairwise!(f, res, xm, ym, skipmissing=:listwise) + @test_throws Union{CompositeException,TaskFailedException} pairwise!(f, res, xm, ym, skipmissing=:pairwise) + @test_throws Union{CompositeException,TaskFailedException} pairwise!(f, res, xm, ym, skipmissing=:listwise) end end @@ -305,7 +305,7 @@ arbitrary_fun(x, y) = cor(x, y) NaN 1.0] if VERSION >= v"1.5" if f == cor - @test_throws CompositeException pairwise(f, [[missing, missing, missing], + @test_throws Union{CompositeException,TaskFailedException} pairwise(f, [[missing, missing, missing], [missing, missing, missing]], skipmissing=sm) end From 9c039dd39edd122e950e5448617de40ee1530f66 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 5 Apr 2024 19:00:28 +0100 Subject: [PATCH 17/24] Compatibility with Julia 1.6 --- test/pairwise.jl | 6 +++++- test/rankcorr.jl | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/pairwise.jl b/test/pairwise.jl index 77ab279ea..6bfb483c6 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -116,7 +116,11 @@ arbitrary_fun(x, y) = cor(x, y) @test res ≅ res2 # Use myskipmissings rather than skipmissings to avoid deprecation warning function myskipmissings(x, y) - which = @. !(ismissing(x) || ismissing(y)) + #which = @. !(ismissing(x) || ismissing(y)) # not compatible with Julia 1.6 + which = similar(x,Bool) + for i in eachindex(x) + which[i] = !(ismissing(x[i])|| ismissing(y[i])) + end return x[which], y[which] end diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 381752e45..e2e025dc1 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -215,8 +215,8 @@ julia> corkendall(Matrix{Union{Missing,Float64}}(missing,5,3)) #DIFFERENT behavi @test isequal(f([], []), NaN) @test isequal(f(fill(1, 0, 2), fill(1, 0, 2)), [NaN NaN; NaN NaN]) @test isequal(f(fill(1, 0, 2)), [1.0 NaN; NaN 1.0]) - @test isequal(f([1;;], [1;;]), [NaN;;]) - @test isequal(f([1;;]), [1.0;;]) + @test isequal(f(reshape([1],(1,1)), reshape([1],(1,1))), reshape([NaN],(1,1))) + @test isequal(f(reshape([1],(1,1))), reshape([1.0],(1,1))) @test isequal(f([missing], [missing]), NaN) @test isequal(f([1], [1]), NaN) @test isequal(f([NaN], [NaN]), NaN) From f53b7b50903ba931aeca7aa73254f890a8d97290 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 5 Apr 2024 21:50:34 +0100 Subject: [PATCH 18/24] Test against Julia 1.6, 1.7, 1.8 and 1.9 to find possible new minimum version --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74e4c14cd..0b3acc82f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,6 +13,10 @@ jobs: matrix: version: - '1.0' + - '1.6' + - '1.7' + - '1.8' + - '1.9' - '1' # automatically expands to the latest stable 1.x release of Julia - 'nightly' os: From 88bf8f60f16f4a266488878fdcbec6b798648bd7 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 5 Apr 2024 22:16:24 +0100 Subject: [PATCH 19/24] Test on Julia 1.1, 1.2, 1.3. 1.4 and 1.5 --- .github/workflows/ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0b3acc82f..dce83bfd1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,11 +12,11 @@ jobs: fail-fast: false matrix: version: - - '1.0' - - '1.6' - - '1.7' - - '1.8' - - '1.9' + - '1.1' + - '1.2' + - '1.3' + - '1.4' + - '1.5' - '1' # automatically expands to the latest stable 1.x release of Julia - 'nightly' os: From ddb1b516c123352d6dc2818447af9be0ebeb92b7 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 5 Apr 2024 22:56:51 +0100 Subject: [PATCH 20/24] Demonstrate that 1.6 could be new minimum version --- .github/workflows/ci.yml | 6 +----- Project.toml | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dce83bfd1..433237ecc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,11 +12,7 @@ jobs: fail-fast: false matrix: version: - - '1.1' - - '1.2' - - '1.3' - - '1.4' - - '1.5' + - '1.6' - '1' # automatically expands to the latest stable 1.x release of Julia - 'nightly' os: diff --git a/Project.toml b/Project.toml index 5e6bffed1..076dd40d4 100644 --- a/Project.toml +++ b/Project.toml @@ -24,7 +24,7 @@ Missings = "0.3, 0.4, 1.0" SortingAlgorithms = "0.3, 1.0" Statistics = "1" StatsAPI = "1.2" -julia = "1" +julia = "1.6" [extras] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" From f67e867250d6eda98b556ff728ce5f1393ad248d Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Sun, 7 Apr 2024 12:41:14 +0100 Subject: [PATCH 21/24] Corrected call to keyword argument --- test/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index e2e025dc1..be0aeaa97 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -341,7 +341,7 @@ end if !((arg1 isa Vector) && (arg2 isa Vector) && skipmissing == :listwise) copy1 = copy(arg1) copy2 = copy(arg2) - res = f(arg1, arg2; skipmissing) + res = f(arg1, arg2; skipmissing=skipmissing) @test isequal(arg1, copy1) @test isequal(arg2, copy2) end From 246f32d5f6b152d41a68a9b2442f7f675bee5a6f Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Wed, 10 Apr 2024 15:23:18 +0100 Subject: [PATCH 22/24] Improved test coverage (src/pairwise.jl#L84-L87) --- test/pairwise.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/pairwise.jl b/test/pairwise.jl index 6bfb483c6..24ede4492 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -85,6 +85,10 @@ arbitrary_fun(x, y) = cor(x, y) @test_throws DimensionMismatch pairwise!(f, zeros(1, 2), [], []) @test_throws DimensionMismatch pairwise!(f, zeros(0, 0), [], [[1, 2], [2, 3]]) + + @test pairwise(f, x, y) == (pairwise(f, y, x))' + @test pairwise(f, x, y, skipmissing=:pairwise) == (pairwise(f, y, x, skipmissing=:pairwise))' + @test pairwise(f, x, y, skipmissing=:listwise) == (pairwise(f, y, x, skipmissing=:listwise))' end if f in (cor, corkendall, corspearman) @@ -117,9 +121,9 @@ arbitrary_fun(x, y) = cor(x, y) # Use myskipmissings rather than skipmissings to avoid deprecation warning function myskipmissings(x, y) #which = @. !(ismissing(x) || ismissing(y)) # not compatible with Julia 1.6 - which = similar(x,Bool) + which = similar(x, Bool) for i in eachindex(x) - which[i] = !(ismissing(x[i])|| ismissing(y[i])) + which[i] = !(ismissing(x[i]) || ismissing(y[i])) end return x[which], y[which] end From 46d5655d8287cc41ad25096b812b554666108df5 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:15:21 +0100 Subject: [PATCH 23/24] Improved test coverage (StatsBase._cor) --- test/rankcorr.jl | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index be0aeaa97..c399ea16f 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -21,6 +21,24 @@ using Test @test StatsBase.handle_pairwise(x, float.(y)) == ([2, 3, 4], [1.0, 2.0, 4.0]) @test StatsBase.handle_pairwise(u, v) == (Int64[], Int64[]) + ranksx = [missing missing] + ranksy = [1 2] + @test isequal(StatsBase._cor(ranksx, ranksy), [NaN NaN; NaN NaN]) + @test isequal(StatsBase._cor(ranksx, ranksx), [missing NaN; NaN missing]) + @test isequal(StatsBase._cor(ranksy, ranksy), [1.0 NaN; NaN 1.0]) + + ranksx = [missing missing; missing missing] + @test isequal(StatsBase._cor(ranksx, ranksx), [missing missing; missing missing]) + + Random.seed!(1) + ranksx = rand(10,5) ;ranksy = rand(10,6) + @test StatsBase._cor(ranksx,ranksx) == cor(ranksx) + @test StatsBase._cor(ranksx,ranksy) == cor(ranksx,ranksy) + + ranksx = fill(missing,3,3);ranksy = hcat(ranksx,[1,2,3]) + @test isequal(StatsBase._cor(ranksx,ranksx),fill(missing,3,3)) + @test isequal(StatsBase._cor(ranksx,ranksy),fill(missing,3,4)) + v = collect(100:-1:1) StatsBase.insertion_sort!(v, 1, n) @test v == 1:n @@ -32,7 +50,7 @@ using Test @test StatsBase.midpoint(1, 10) == 5 @test StatsBase.midpoint(1, widen(10)) == 5 - for n in vcat(1:5, 10:20:90,1000), nss in [1, 4, 8, 20, 32, 64] + for n in vcat(1:5, 10:20:90, 1000), nss in [1, 4, 8, 20, 32, 64] #check is a partition @test sort(vcat([collect(s) for s in StatsBase.EqualSumSubsets(n, nss)]...)) == 1:n #check near-equal lengths @@ -215,8 +233,8 @@ julia> corkendall(Matrix{Union{Missing,Float64}}(missing,5,3)) #DIFFERENT behavi @test isequal(f([], []), NaN) @test isequal(f(fill(1, 0, 2), fill(1, 0, 2)), [NaN NaN; NaN NaN]) @test isequal(f(fill(1, 0, 2)), [1.0 NaN; NaN 1.0]) - @test isequal(f(reshape([1],(1,1)), reshape([1],(1,1))), reshape([NaN],(1,1))) - @test isequal(f(reshape([1],(1,1))), reshape([1.0],(1,1))) + @test isequal(f(reshape([1], (1, 1)), reshape([1], (1, 1))), reshape([NaN], (1, 1))) + @test isequal(f(reshape([1], (1, 1))), reshape([1.0], (1, 1))) @test isequal(f([missing], [missing]), NaN) @test isequal(f([1], [1]), NaN) @test isequal(f([NaN], [NaN]), NaN) @@ -371,11 +389,11 @@ end factor" of 1.2 against the expected size of allocations. =# @test (@allocated corkendall(x)) < (896_144 + Threads.nthreads() * 57_976) * 1.2 - @test (@allocated corkendall(xm,skipmissing=:listwise)) < (1_117_728 + Threads.nthreads() * 22_104) * 1.2 - @test (@allocated corkendall(xm,skipmissing=:pairwise)) < (890_448 + Threads.nthreads() * 61_048) * 1.2 + @test (@allocated corkendall(xm, skipmissing=:listwise)) < (1_117_728 + Threads.nthreads() * 22_104) * 1.2 + @test (@allocated corkendall(xm, skipmissing=:pairwise)) < (890_448 + Threads.nthreads() * 61_048) * 1.2 @test (@allocated corspearman(x)) < (2_678_448 + Threads.nthreads() * 9_128) * 1.2 - @test (@allocated corspearman(xm,skipmissing=:listwise)) < (1_803_712 + Threads.nthreads() * 3_992) * 1.2 - @test (@allocated corspearman(xm,skipmissing=:pairwise)) < (1_690_544 + Threads.nthreads() * 67_104) * 1.2 - + @test (@allocated corspearman(xm, skipmissing=:listwise)) < (1_803_712 + Threads.nthreads() * 3_992) * 1.2 + @test (@allocated corspearman(xm, skipmissing=:pairwise)) < (1_690_544 + Threads.nthreads() * 67_104) * 1.2 + end # COV_EXCL_STOP \ No newline at end of file From e1395b8c00af90ed4ae1b28a65002ac26c3853cf Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Thu, 11 Apr 2024 11:58:14 +0100 Subject: [PATCH 24/24] Trivial change to re-run tests --- test/rankcorr.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index c399ea16f..a08070408 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -1,5 +1,4 @@ -using StatsBase, Random -using Test +using StatsBase, Random, Test @testset "rank_correlation_auxiliary_fns" begin