Skip to content

Commit f876eb6

Browse files
committed
Cleanup
1 parent 41b00c4 commit f876eb6

File tree

3 files changed

+78
-180
lines changed

3 files changed

+78
-180
lines changed

src/Statistics.jl

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,13 +211,16 @@ _mean(A::AbstractArray, dims, w::AbstractArray) =
211211
realXcY(x::Real, y::Real) = x*y
212212
realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y)
213213

214-
var(iterable; corrected::Bool=true, mean=nothing) = _var(iterable, corrected, mean)
214+
function var(iterable; corrected::Bool=true, mean=nothing)
215+
s, count = _sumsq(iterable, mean)
216+
s / (count - Int(corrected))
217+
end
215218

216-
function _var(iterable, corrected::Bool, mean)
219+
function _sumsq(iterable, mean)
217220
y = iterate(iterable)
218221
if y === nothing
219222
T = eltype(iterable)
220-
return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN)
223+
return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN), 0
221224
end
222225
count = 1
223226
value, state = y
@@ -235,7 +238,7 @@ function _var(iterable, corrected::Bool, mean)
235238
S = S + realXcY(value - M, value - new_M)
236239
M = new_M
237240
end
238-
return S / (count - Int(corrected))
241+
return S, count
239242
elseif isa(mean, Number) # mean provided
240243
# Cannot use a compensated version, e.g. the one from
241244
# "Updating Formulae and a Pairwise Algorithm for Computing Sample Variances."
@@ -249,7 +252,7 @@ function _var(iterable, corrected::Bool, mean)
249252
count += 1
250253
sum2 += abs2(value - mean)
251254
end
252-
return sum2 / (count - Int(corrected))
255+
return sum2, count
253256
else
254257
throw(ArgumentError("invalid value of mean, $(mean)::$(typeof(mean))"))
255258
end
@@ -356,7 +359,7 @@ varm(A::AbstractArray, m; corrected::Bool=true, dims=:,
356359
_varm(A, m, corrected, dims, weights)
357360

358361
varm(iterable, m; corrected::Bool=true) =
359-
_var(iterable, corrected, m)
362+
var(iterable, mean=m, corrected=corrected)
360363

361364
_varm(A::AbstractArray, m, corrected::Bool, dims, w::Nothing) =
362365
varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, dims), A, m, w, corrected=corrected)

src/scalarstats.jl

Lines changed: 24 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -172,16 +172,17 @@ Return the `p`th percentile of a collection `x`, i.e. `quantile(x, p / 100)`.
172172
"""
173173
percentile(x, p) = quantile(x, p * 0.01)
174174

175+
# TODO: move to same place as other quantile methods
175176
"""
176-
nquantile(x, n::Integer)
177+
quantile(x, n::Integer)
177178
178179
Return the n-quantiles of collection `x`, i.e. the values which
179180
partition `v` into `n` subsets of nearly equal size.
180181
181-
Equivalent to `quantile(x, [0:n]/n)`. For example, `nquantiles(x, 5)`
182+
Equivalent to `quantile(x, [0:n]/n)`. For example, `quantile(x, 5)`
182183
returns a vector of quantiles, respectively at `[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]`.
183184
"""
184-
nquantile(x, n::Integer) = quantile(x, (0:n)/n)
185+
quantile(x, n::Integer) = quantile(x, (0:n)/n)
185186

186187

187188
#############################
@@ -199,16 +200,15 @@ The minimum and maximum of `x` are computed in one pass using `extrema`.
199200
"""
200201
span(x) = ((a, b) = extrema(x); a:b)
201202

202-
# Variation coefficient: std / mean
203+
# Coefficient of variation: std / mean
203204
"""
204205
variation(x, m=mean(x))
205206
206207
Return the coefficient of variation of collection `x`, optionally specifying
207208
a precomputed mean `m`. The coefficient of variation is the ratio of the
208209
standard deviation to the mean.
209210
"""
210-
variation(x, m) = stdm(x, m) / m
211-
variation(x) = ((m, s) = mean_and_std(x); s/m)
211+
variation(x, m=mean(x)) = std(x, mean=m) / m
212212

213213
# Standard error of the mean: std / sqrt(len)
214214
# Code taken from var in the Statistics stdlib module
@@ -221,34 +221,13 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y)
221221
sem(x)
222222
223223
Return the standard error of the mean of collection `x`,
224-
i.e. `sqrt(var(x, corrected=true) / length(x))`.
224+
i.e. `std(x, corrected=true) / sqrt(length(x))`.
225225
"""
226226
function sem(x)
227-
y = iterate(x)
228-
if y === nothing
229-
T = eltype(x)
230-
# Return the NaN of the type that we would get, had this collection
231-
# contained any elements (this is consistent with std)
232-
return oftype(sqrt((abs2(zero(T)) + abs2(zero(T)))/2), NaN)
233-
end
234-
count = 1
235-
value, state = y
236-
y = iterate(x, state)
237-
# Use Welford algorithm as seen in (among other places)
238-
# Knuth's TAOCP, Vol 2, page 232, 3rd edition.
239-
M = value / 1
240-
S = real(zero(M))
241-
while y !== nothing
242-
value, state = y
243-
y = iterate(x, state)
244-
count += 1
245-
new_M = M + (value - M) / count
246-
S = S + realXcY(value - M, value - new_M)
247-
M = new_M
248-
end
249-
var = S / (count - 1)
250-
return sqrt(var/count)
227+
s, count = _sumsq(iterable, mean)
228+
sqrt((s / (count - 1)) / count)
251229
end
230+
sem(x::AbstractArray) = sqrt(var(x, corrected=true) / length(x))
252231

253232
# Median absolute deviation
254233
@irrational mad_constant 1.4826022185056018 BigFloat("1.482602218505601860547076529360423431326703202590312896536266275245674447622701")
@@ -355,7 +334,7 @@ matrix of `X`.
355334
genvar(X::AbstractMatrix) = size(X, 2) == 1 ? var(vec(X)) : det(cov(X))
356335
genvar(itr) = var(itr)
357336

358-
# Total variation
337+
# Total variance
359338
"""
360339
totalvar(X)
361340
@@ -367,114 +346,6 @@ of the covariance matrix of `X`.
367346
totalvar(X::AbstractMatrix) = sum(var(X, dims=1))
368347
totalvar(itr) = var(itr)
369348

370-
#############################
371-
#
372-
# Z-scores
373-
#
374-
#############################
375-
376-
function _zscore!(Z::AbstractArray, X::AbstractArray, μ::Real, σ::Real)
377-
# Z and X are assumed to have the same size
378-
= inv(σ)
379-
if μ == zero(μ)
380-
for i = 1 : length(X)
381-
@inbounds Z[i] = X[i] *
382-
end
383-
else
384-
for i = 1 : length(X)
385-
@inbounds Z[i] = (X[i] - μ) *
386-
end
387-
end
388-
return Z
389-
end
390-
391-
@generated function _zscore!(Z::AbstractArray{S,N}, X::AbstractArray{T,N},
392-
μ::AbstractArray, σ::AbstractArray) where {S,T,N}
393-
quote
394-
# Z and X are assumed to have the same size
395-
# μ and σ are assumed to have the same size, that is compatible with size(X)
396-
siz1 = size(X, 1)
397-
@nextract $N ud d->size(μ, d)
398-
if size(μ, 1) == 1 && siz1 > 1
399-
@nloops $N i d->(d>1 ? (1:size(X,d)) : (1:1)) d->(j_d = ud_d ==1 ? 1 : i_d) begin
400-
v = (@nref $N μ j)
401-
c = inv(@nref $N σ j)
402-
for i_1 = 1:siz1
403-
(@nref $N Z i) = ((@nref $N X i) - v) * c
404-
end
405-
end
406-
else
407-
@nloops $N i X d->(j_d = ud_d ==1 ? 1 : i_d) begin
408-
(@nref $N Z i) = ((@nref $N X i) - (@nref $N μ j)) / (@nref $N σ j)
409-
end
410-
end
411-
return Z
412-
end
413-
end
414-
415-
function _zscore_chksize(X::AbstractArray, μ::AbstractArray, σ::AbstractArray)
416-
size(μ) == size(σ) || throw(DimensionMismatch("μ and σ should have the same size."))
417-
for i=1:ndims(X)
418-
dμ_i = size(μ,i)
419-
(dμ_i == 1 || dμ_i == size(X,i)) || throw(DimensionMismatch("X and μ have incompatible sizes."))
420-
end
421-
end
422-
423-
424-
"""
425-
zscore!([Z], X, μ, σ)
426-
427-
Compute the z-scores of an array `X` with mean `μ` and standard deviation `σ`.
428-
z-scores are the signed number of standard deviations above the mean that an
429-
observation lies, i.e. ``(x - μ) / σ``.
430-
431-
If a destination array `Z` is provided, the scores are stored
432-
in `Z` and it must have the same shape as `X`. Otherwise `X` is overwritten.
433-
"""
434-
function zscore!(Z::AbstractArray{ZT}, X::AbstractArray{T}, μ::Real, σ::Real) where {ZT<:AbstractFloat,T<:Real}
435-
size(Z) == size(X) || throw(DimensionMismatch("Z and X must have the same size."))
436-
_zscore!(Z, X, μ, σ)
437-
end
438-
439-
function zscore!(Z::AbstractArray{<:AbstractFloat}, X::AbstractArray{<:Real},
440-
μ::AbstractArray{<:Real}, σ::AbstractArray{<:Real})
441-
size(Z) == size(X) || throw(DimensionMismatch("Z and X must have the same size."))
442-
_zscore_chksize(X, μ, σ)
443-
_zscore!(Z, X, μ, σ)
444-
end
445-
446-
zscore!(X::AbstractArray{<:AbstractFloat}, μ::Real, σ::Real) = _zscore!(X, X, μ, σ)
447-
448-
zscore!(X::AbstractArray{<:AbstractFloat}, μ::AbstractArray{<:Real}, σ::AbstractArray{<:Real}) =
449-
(_zscore_chksize(X, μ, σ); _zscore!(X, X, μ, σ))
450-
451-
452-
"""
453-
zscore(X, [μ, σ])
454-
455-
Compute the z-scores of `X`, optionally specifying a precomputed mean `μ` and
456-
standard deviation `σ`. z-scores are the signed number of standard deviations
457-
above the mean that an observation lies, i.e. ``(x - μ) / σ``.
458-
459-
`μ` and `σ` should be both scalars or both arrays. The computation is broadcasting.
460-
In particular, when `μ` and `σ` are arrays, they should have the same size, and
461-
`size(μ, i) == 1 || size(μ, i) == size(X, i)` for each dimension.
462-
"""
463-
function zscore(X::AbstractArray{T}, μ::Real, σ::Real) where T<:Real
464-
ZT = typeof((zero(T) - zero(μ)) / one(σ))
465-
_zscore!(Array{ZT}(undef, size(X)), X, μ, σ)
466-
end
467-
468-
function zscore(X::AbstractArray{T}, μ::AbstractArray{U}, σ::AbstractArray{S}) where {T<:Real,U<:Real,S<:Real}
469-
_zscore_chksize(X, μ, σ)
470-
ZT = typeof((zero(T) - zero(U)) / one(S))
471-
_zscore!(Array{ZT}(undef, size(X)), X, μ, σ)
472-
end
473-
474-
zscore(X::AbstractArray{<:Real}) = ((μ, σ) = mean_and_std(X); zscore(X, μ, σ))
475-
zscore(X::AbstractArray{<:Real}, dim::Int) = ((μ, σ) = mean_and_std(X, dim); zscore(X, μ, σ))
476-
477-
478349

479350
#############################
480351
#
@@ -521,7 +392,7 @@ function renyientropy(p::AbstractArray{T}, α::Real) where T<:Real
521392
end
522393
end
523394
s = s / scale
524-
elseif (isinf))
395+
elseif isinf(α)
525396
s = -log(maximum(p))
526397
else # a normal Rényi entropy
527398
for i = 1:length(p)
@@ -586,7 +457,7 @@ kldivergence(p::AbstractArray{T}, q::AbstractArray{T}, b::Real) where {T<:Real}
586457

587458
#############################
588459
#
589-
# summary
460+
# Summary Statistics
590461
#
591462
#############################
592463

@@ -599,17 +470,18 @@ struct SummaryStats{T<:Union{AbstractFloat,Missing}}
599470
max::T
600471
nobs::Int
601472
nmiss::Int
473+
isnumeric::Bool
602474
end
603475

604476

605477
"""
606-
summarystats(a)
478+
describe(a)
607479
608480
Compute summary statistics for a real-valued array `a`. Returns a
609481
`SummaryStats` object containing the mean, minimum, 25th percentile,
610482
median, 75th percentile, and maxmimum.
611483
"""
612-
function summarystats(a::AbstractArray{T}) where T<:Union{Real,Missing}
484+
function describe(a::AbstractArray{T}) where T<:Union{Real,Missing}
613485
# `mean` doesn't fail on empty input but rather returns `NaN`, so we can use the
614486
# return type to populate the `SummaryStats` structure.
615487
s = T >: Missing ? collect(skipmissing(a)) : a
@@ -624,14 +496,20 @@ function summarystats(a::AbstractArray{T}) where T<:Union{Real,Missing}
624496
else
625497
quantile(s, [0.00, 0.25, 0.50, 0.75, 1.00])
626498
end
627-
SummaryStats{R}(m, qs..., n, n - ns)
499+
SummaryStats{R}(m, qs..., n, n - ns, true)
500+
end
501+
502+
function describe(a::AbstractArray{T}) where T
503+
nmiss = T >: Missing ? count(ismissing, a) : 0
504+
SummaryStats{R}(NaN, NaN, NaN, NaN, NaN, length(a), nmiss, false)
628505
end
629506

630507
function Base.show(io::IO, ss::SummaryStats)
631-
println(io, "Summary Stats:")
508+
println(io, "Summary Statistics:")
632509
@printf(io, "Length: %i\n", ss.nobs)
633510
ss.nobs > 0 || return
634511
@printf(io, "Missing Count: %i\n", ss.nmiss)
512+
ss.isnumeric || return
635513
@printf(io, "Mean: %.6f\n", ss.mean)
636514
@printf(io, "Minimum: %.6f\n", ss.min)
637515
@printf(io, "1st Quartile: %.6f\n", ss.q25)

0 commit comments

Comments
 (0)