Adjust eweights calculation to avoid precision issues (#509)

rofinn · web-flow · commit 1678fd1d12f6 · 2021-11-19T15:06:09.000-06:00
* Adjust eweights calculation to avoid precision issues.

The modified function is equivalent to dividing all weights by the largest value.

```
julia&gt; x = [
        0.3
        0.42857142857142855
        0.6122448979591837
        0.8746355685131197
        1.249479383590171
        1.7849705479859588
        2.549957925694227
        3.642797036706039
        5.203995766722913
        7.434279666747019
        ]
10-element Array{Float64,1}:
 0.3
 0.42857142857142855
 0.6122448979591837
 0.8746355685131197
 1.249479383590171
 1.7849705479859588
 2.549957925694227
 3.642797036706039
 5.203995766722913
 7.434279666747019

julia&gt; x ./ last(x)
10-element Array{Float64,1}:
 0.04035360699999998
 0.057648009999999965
 0.08235429999999996
 0.11764899999999996
 0.16806999999999994
 0.24009999999999995
 0.34299999999999997
 0.49
 0.7
 1.0

julia&gt; using StatsBase
[ Info: Recompiling stale cache file /Users/rory/.julia/compiled/v1.0/StatsBase/EZjIG.ji for StatsBase [2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91]

julia&gt; eweights(1:10, 0.3)
10-element Weights{Float64,Float64,Array{Float64,1}}:
 0.04035360699999998
 0.05764800999999997
 0.08235429999999996
 0.11764899999999996
 0.16806999999999994
 0.24009999999999995
 0.3429999999999999
 0.48999999999999994
 0.7
 1.0
```

* Fix eweight tests.

* Add a couple links to the docstring.

* Address review comments and properly deprecate unscaled behaviour with a `scaled::DepBool` kwarg.

* Rename scaled =&gt; scale.

* Fixup docstring slightly and don't make (t, λ, n) method a public method.

* More docstring updates.

* Reword `n` explanation

* Bump patch release
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "StatsBase"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 authors = ["JuliaStats"]
-version = "0.33.12"
+version = "0.33.13"
 
 [deps]
 DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
diff --git a/src/common.jl b/src/common.jl
@@ -23,9 +23,9 @@ const RealFP = Union{Float32, Float64}
 # A convenient typealias for deprecating default corrected Bool
 const DepBool = Union{Bool, Nothing}
 
-function depcheck(fname::Symbol, b::DepBool)
-    if b == nothing
-        msg = "$fname will default to corrected=true in the future. Use corrected=false for previous behaviour."
+function depcheck(fname::Symbol, varname::Symbol, b::DepBool)
+    if b === nothing
+        msg = "$fname will default to $varname=true in the future. Use $varname=false for previous behaviour."
         Base.depwarn(msg, fname)
         false
     else
diff --git a/src/cov.jl b/src/cov.jl
@@ -92,11 +92,11 @@ _scattermatm(x::DenseMatrix, wv::AbstractWeights, mean, dims::Int) =
 ## weighted cov
 covm(x::DenseMatrix, mean, w::AbstractWeights, dims::Int=1;
      corrected::DepBool=nothing) =
-    rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, corrected)))
+    rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, :corrected, corrected)))
 
 
 cov(x::DenseMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) =
-    covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, corrected))
+    covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, :corrected, corrected))
 
 function corm(x::DenseMatrix, mean, w::AbstractWeights, vardim::Int=1)
     c = covm(x, mean, w, vardim; corrected=false)
@@ -120,7 +120,7 @@ end
 function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, dims::Int=1;
                       corrected::DepBool=nothing)
     m = mean(x, wv, dims=dims)
-    return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, corrected))
+    return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, :corrected, corrected))
 end
 
 """
diff --git a/src/moments.jl b/src/moments.jl
@@ -19,7 +19,7 @@ the population variance is computed by replacing
 * `Weights`: `ArgumentError` (bias correction not supported)
 """
 varm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) =
-    _moment2(v, w, m; corrected=depcheck(:varm, corrected))
+    _moment2(v, w, m; corrected=depcheck(:varm, :corrected, corrected))
 
 """
     var(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false)
@@ -40,7 +40,7 @@ replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights
 """
 function var(v::RealArray, w::AbstractWeights; mean=nothing,
                   corrected::DepBool=nothing)
-    corrected = depcheck(:var, corrected)
+    corrected = depcheck(:var, :corrected, corrected)
 
     if mean == nothing
         varm(v, w, Statistics.mean(v, w); corrected=corrected)
@@ -53,14 +53,14 @@ end
 
 function varm!(R::AbstractArray, A::RealArray, w::AbstractWeights, M::RealArray,
                     dim::Int; corrected::DepBool=nothing)
-    corrected = depcheck(:varm!, corrected)
+    corrected = depcheck(:varm!, :corrected, corrected)
     rmul!(_wsum_centralize!(R, abs2, A, convert(Vector, w), M, dim, true),
           varcorrection(w, corrected))
 end
 
 function var!(R::AbstractArray, A::RealArray, w::AbstractWeights, dims::Int;
               mean=nothing, corrected::DepBool=nothing)
-    corrected = depcheck(:var!, corrected)
+    corrected = depcheck(:var!, :corrected, corrected)
 
     if mean == 0
         varm!(R, A, w, Base.reducedim_initarray(A, dims, 0, eltype(R)), dims;
@@ -84,14 +84,14 @@ end
 
 function varm(A::RealArray, w::AbstractWeights, M::RealArray, dim::Int;
                    corrected::DepBool=nothing)
-    corrected = depcheck(:varm, corrected)
+    corrected = depcheck(:varm, :corrected, corrected)
     varm!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, M,
                dim; corrected=corrected)
 end
 
 function var(A::RealArray, w::AbstractWeights, dim::Int; mean=nothing,
                   corrected::DepBool=nothing)
-    corrected = depcheck(:var, corrected)
+    corrected = depcheck(:var, :corrected, corrected)
     var!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, dim;
          mean=mean, corrected=corrected)
 end
@@ -115,7 +115,7 @@ dependent on the type of weights used:
 * `Weights`: `ArgumentError` (bias correction not supported)
 """
 stdm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) =
-    sqrt(varm(v, w, m, corrected=depcheck(:stdm, corrected)))
+    sqrt(varm(v, w, m, corrected=depcheck(:stdm, :corrected, corrected)))
 
 """
     std(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false)
@@ -136,18 +136,18 @@ weights used:
 * `Weights`: `ArgumentError` (bias correction not supported)
 """
 std(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=nothing) =
-    sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, corrected)))
+    sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, :corrected, corrected)))
 
 stdm(v::RealArray, m::RealArray, dim::Int; corrected::DepBool=nothing) =
-    sqrt!(varm(v, m, dims=dim, corrected=depcheck(:stdm, corrected)))
+    sqrt!(varm(v, m, dims=dim, corrected=depcheck(:stdm, :corrected, corrected)))
 
 stdm(v::RealArray, w::AbstractWeights, m::RealArray, dim::Int;
           corrected::DepBool=nothing) =
-    sqrt.(varm(v, w, m, dim; corrected=depcheck(:stdm, corrected)))
+    sqrt.(varm(v, w, m, dim; corrected=depcheck(:stdm, :corrected, corrected)))
 
 std(v::RealArray, w::AbstractWeights, dim::Int; mean=nothing,
          corrected::DepBool=nothing) =
-    sqrt.(var(v, w, dim; mean=mean, corrected=depcheck(:std, corrected)))
+    sqrt.(var(v, w, dim; mean=mean, corrected=depcheck(:std, :corrected, corrected)))
 
 ##### Fused statistics
 """
@@ -183,12 +183,12 @@ end
 
 function mean_and_var(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing)
     m = mean(x, w)
-    v = varm(x, w, m; corrected=depcheck(:mean_and_var, corrected))
+    v = varm(x, w, m; corrected=depcheck(:mean_and_var, :corrected, corrected))
     m, v
 end
 function mean_and_std(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing)
     m = mean(x, w)
-    s = stdm(x, w, m; corrected=depcheck(:mean_and_std, corrected))
+    s = stdm(x, w, m; corrected=depcheck(:mean_and_std, :corrected, corrected))
     m, s
 end
 
@@ -208,13 +208,13 @@ end
 function mean_and_var(x::RealArray, w::AbstractWeights, dims::Int;
                       corrected::DepBool=nothing)
     m = mean(x, w, dims=dims)
-    v = varm(x, w, m, dims; corrected=depcheck(:mean_and_var, corrected))
+    v = varm(x, w, m, dims; corrected=depcheck(:mean_and_var, :corrected, corrected))
     m, v
 end
 function mean_and_std(x::RealArray, w::AbstractWeights, dims::Int;
                       corrected::DepBool=nothing)
     m = mean(x, w, dims=dims)
-    s = stdm(x, w, m, dims; corrected=depcheck(:mean_and_std, corrected))
+    s = stdm(x, w, m, dims; corrected=depcheck(:mean_and_std, :corrected, corrected))
     m, s
 end
 
diff --git a/src/weights.jl b/src/weights.jl
@@ -205,59 +205,82 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs))
 end
 
 """
-    eweights(t::AbstractVector{<:Integer}, λ::Real)
-    eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real) where T
-    eweights(n::Integer, λ::Real)
+    eweights(t::AbstractVector{<:Integer}, λ::Real; scale=false)
+    eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real; scale=false) where T
+    eweights(n::Integer, λ::Real; scale=false)
 
 Construct a [`Weights`](@ref) vector which assigns exponentially decreasing weights to past
-observations, which in this case corresponds to larger integer values `i` in `t`.
-If an integer `n` is provided, weights are generated for values from 1 to `n`
-(equivalent to `t = 1:n`).
+observations (larger integer values `i` in `t`).
+The integer value `n` represents the number of past observations to consider.
+`n` defaults to `maximum(t) - minimum(t) + 1` if only `t` is passed in
+and the elements are integers, and to `length(r)` if a superset range `r` is also passed in.
+If `n` is explicitly passed instead of `t`, `t` defaults to `1:n`.
 
-For each element `i` in `t` the weight value is computed as:
+If `scale` is `true` then for each element `i` in `t` the weight value is computed as:
+
+``(1 - λ)^{n - i}``
+
+If `scale` is `false` then each value is computed as:
 
 ``λ (1 - λ)^{1 - i}``
 
 # Arguments
 
 - `t::AbstractVector`: temporal indices or timestamps
 - `r::StepRange`: a larger range to use when constructing weights from a subset of timestamps
-- `n::Integer`: if provided instead of `t`, temporal indices are taken to be `1:n`
+- `n::Integer`: the number of past events to consider
 - `λ::Real`: a smoothing factor or rate parameter such that ``0 < λ ≤ 1``.
   As this value approaches 0, the resulting weights will be almost equal,
   while values closer to 1 will put greater weight on the tail elements of the vector.
 
+# Keyword arguments
+
+- `scale::Bool`: Return the weights scaled to between 0 and 1 (default: false)
+
 # Examples
 ```julia-repl
-julia> eweights(1:10, 0.3)
+julia> eweights(1:10, 0.3; scale=true)
 10-element Weights{Float64,Float64,Array{Float64,1}}:
- 0.3
- 0.42857142857142855
- 0.6122448979591837
- 0.8746355685131197
- 1.249479383590171
- 1.7849705479859588
- 2.549957925694227
- 3.642797036706039
- 5.203995766722913
- 7.434279666747019
+ 0.04035360699999998
+ 0.05764800999999997
+ 0.08235429999999996
+ 0.11764899999999996
+ 0.16806999999999994
+ 0.24009999999999995
+ 0.3429999999999999
+ 0.48999999999999994
+ 0.7
+ 1.0
 ```
-"""
-function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer
+# Links
+- https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average
+- https://en.wikipedia.org/wiki/Exponential_smoothing
+"""
+function eweights(t::AbstractVector{<:Integer}, λ::Real; kwargs...)
+    isempty(t) && return Weights(copy(t), 0)
+    (lo, hi) = extrema(t)
+    return _eweights(t, λ, hi - lo + 1; kwargs...)
+end
+
+eweights(n::Integer, λ::Real; kwargs...) = _eweights(1:n, λ, n; kwargs...)
+eweights(t::AbstractVector, r::AbstractRange, λ::Real; kwargs...) =
+    _eweights(something.(indexin(t, r)), λ, length(r); kwargs...)
+
+function _eweights(t::AbstractVector{<:Integer}, λ::Real, n::Integer; scale::DepBool=nothing)
     0 < λ <= 1 || throw(ArgumentError("Smoothing factor must be between 0 and 1"))
+    f = depcheck(:eweights, :scale, scale) ? _scaled_eweight : _unscaled_eweight
 
     w0 = map(t) do i
         i > 0 || throw(ArgumentError("Time indices must be non-zero positive integers"))
-        λ * (1 - λ)^(1 - i)
+        f(i, λ, n)
     end
 
     s = sum(w0)
     Weights(w0, s)
 end
 
-eweights(n::Integer, λ::Real) = eweights(1:n, λ)
-eweights(t::AbstractVector, r::AbstractRange, λ::Real) =
-    eweights(something.(indexin(t, r)), λ)
+_unscaled_eweight(i, λ, n) = λ * (1 - λ)^(1 - i)
+_scaled_eweight(i, λ, n) = (1 - λ)^(n - i)
 
 # NOTE: no variance correction is implemented for exponential weights
 
@@ -310,7 +333,7 @@ julia> uweights(3)
  1
  1
  1
- 
+
 julia> uweights(Float64, 3)
 3-element UnitWeights{Float64}:
  1.0
diff --git a/test/weights.jl b/test/weights.jl
@@ -502,53 +502,62 @@ end
 end
 
 @testset "Exponential Weights" begin
+    λ = 0.2
     @testset "Usage" begin
-        θ = 5.25
-        λ = 1 - exp(-1 / θ)     # simple conversion for the more common/readable method
-        v = [λ*(1-λ)^(1-i) for i = 1:4]
+        v = [(1 - λ) ^ (4 - i) for i = 1:4]
         w = Weights(v)
 
-        @test round.(w, digits=4) == [0.1734, 0.2098, 0.2539, 0.3071]
+        @test round.(w, digits=4) == [0.512, 0.64, 0.8, 1.0]
 
         @testset "basic" begin
-            @test eweights(1:4, λ) ≈ w
+            @test eweights(1:4, λ; scale=true) ≈ w
         end
 
         @testset "1:n" begin
-            @test eweights(4, λ) ≈ w
+            @test eweights(4, λ; scale=true) ≈ w
         end
 
         @testset "indexin" begin
-            v = [λ*(1-λ)^(1-i) for i = 1:10]
+            v = [(1 - λ) ^ (10 - i) for i = 1:10]
 
             # Test that we should be able to skip indices easily
-            @test eweights([1, 3, 5, 7], 1:10, λ) ≈ Weights(v[[1, 3, 5, 7]])
+            @test eweights([1, 3, 5, 7], 1:10, λ; scale=true) ≈ Weights(v[[1, 3, 5, 7]])
 
             # This should also work with actual time types
             t1 = DateTime(2019, 1, 1, 1)
             tx = t1 + Hour(7)
-            tn = DateTime(2019, 1, 2, 1)
+            tn = DateTime(2019, 1, 1, 10)
 
-            @test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ) ≈ Weights(v[[1, 3, 5, 7]])
+            @test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ; scale=true) ≈ Weights(v[[1, 3, 5, 7]])
         end
     end
 
     @testset "Empty" begin
-        @test eweights(0, 0.3) == Weights(Float64[])
-        @test eweights(1:0, 0.3) == Weights(Float64[])
-        @test eweights(Int[], 1:10, 0.4) == Weights(Float64[])
+        @test eweights(0, 0.3; scale=true) == Weights(Float64[])
+        @test eweights(1:0, 0.3; scale=true) == Weights(Float64[])
+        @test eweights(Int[], 1:10, 0.4; scale=true) == Weights(Float64[])
     end
 
     @testset "Failure Conditions" begin
         # λ > 1.0
-        @test_throws ArgumentError eweights(1, 1.1)
+        @test_throws ArgumentError eweights(1, 1.1; scale=true)
 
         # time indices are not all positive non-zero integers
-        @test_throws ArgumentError eweights([0, 1, 2, 3], 0.3)
+        @test_throws ArgumentError eweights([0, 1, 2, 3], 0.3; scale=true)
 
         # Passing in an array of bools will work because Bool <: Integer,
         # but any `false` values will trigger the same argument error as 0.0
-        @test_throws ArgumentError eweights([true, false, true, true], 0.3)
+        @test_throws ArgumentError eweights([true, false, true, true], 0.3; scale=true)
+    end
+
+    @testset "scale=false" begin
+        v = [λ * (1 - λ)^(1 - i) for i = 1:4]
+        w = Weights(v)
+
+        @test round.(w, digits=4) == [0.2, 0.25, 0.3125, 0.3906]
+
+        wv = eweights(1:10, λ; scale=false)
+        @test eweights(1:10, λ; scale=true) ≈ wv / maximum(wv)
     end
 end