add PSIS for vectors, use entropy-based ESS (#50)

Carlos Parada · web-flow · commit 8650285a02a5 · 2021-09-26T17:09:08.000-07:00
diff --git a/Project.toml b/Project.toml
@@ -6,6 +6,7 @@ version = "0.6.6"
 [deps]
 AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+Lazy = "50d2b5c4-7a5e-59d5-8109-a42b560f39c0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
 MCMCDiagnosticTools = "be115224-59cd-429b-ad48-344e309966f0"
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -11,4 +11,9 @@ Documentation for [ParetoSmooth](https://github.com/TuringLang/ParetoSmooth.jl).
 
 ```@autodocs
 Modules = [ParetoSmooth]
+Private = false
+```
+
+```@docs
+naive_lpd
 ```
diff --git a/src/ESS.jl b/src/ESS.jl
@@ -11,12 +11,16 @@ export relative_eff, psis_ess, sup_ess
 
 Calculate the relative efficiency of an MCMC chain, i.e. the effective sample size divided
 by the nominal sample size.
+
+# Arguments 
+
+  - `sample::AbstractArray{<:Real, 3}`: An array of log-likelihood values.
 """
 function relative_eff(sample::AbstractArray{<:Real, 3}; maxlag=size(sample, 2), kwargs...)
     dims = size(sample)
     post_sample_size = dims[2] * dims[3]
-    ess_sample = inv.(permutedims(sample, [2, 1, 3]))
-    ess, = MCMCDiagnosticTools.ess_rhat(ess_sample; method=method, maxlag=dims[2])
+    ess_sample = permutedims(sample, [2, 1, 3])
+    ess, = MCMCDiagnosticTools.ess_rhat(ess_sample; maxlag=dims[2], kwargs...)
     r_eff = ess / post_sample_size
     return r_eff
 end
@@ -34,28 +38,20 @@ distance of the proposal and target distributions.
 
 # Arguments
 
-  - `weights`: A set of importance sampling weights derived from PSIS.
+  - `weights`: A set of normalized importance sampling weights derived from PSIS.
   - `r_eff`: The relative efficiency of the MCMC chains from which PSIS samples were derived.
 
 See `?relative_eff` to calculate `r_eff`.
 """
-function psis_ess(
-    weights::AbstractVector{T}, r_eff::AbstractVector{T}
-) where {T <: Union{Real, Missing}}
-    @tullio sum_of_squares := weights[x]^2
-    return r_eff ./ sum_of_squares
-end
-
-
 function psis_ess(
     weights::AbstractMatrix{T}, r_eff::AbstractVector{T}
-) where {T <: Union{Real, Missing}}
-    @tullio sum_of_squares[x] := weights[x, y]^2
+) where {T <: Real}
+    @tullio sum_of_squares[x] := xlogx(weights[x, y]) |> exp
     return r_eff ./ sum_of_squares
 end
 
 
-function psis_ess(weights::AbstractMatrix{<:Union{Real, Missing}})
+function psis_ess(weights::AbstractMatrix{<:Real})
     @warn "PSIS ESS not adjusted based on MCMC ESS. MCSE and ESS estimates " *
           "will be overoptimistic if samples are autocorrelated."
     return psis_ess(weights, ones(size(weights)))
@@ -77,7 +73,7 @@ L-∞ norm.
   - `r_eff`: The relative efficiency of the MCMC chains; see also [`relative_eff`]@ref.
 """
 function sup_ess(
-    weights::AbstractMatrix{T}, r_eff::V
-) where {T<:Real, V<:AbstractVector{T}}
+    weights::AbstractMatrix{T}, r_eff::AbstractVector{T}
+) where {T<:Real}
     return inv.(dropdims(maximum(weights; dims=2); dims=2)) .* r_eff
 end
diff --git a/src/ImportanceSampling.jl b/src/ImportanceSampling.jl
@@ -24,7 +24,9 @@ A struct containing the results of Pareto-smoothed importance sampling.
 
 # Fields
 
-  - `weights`: A vector of smoothed, truncated, and normalized importance sampling weights.
+  - `log_weights`: A vector of smoothed and truncated but *unnormalized* importance sampling
+    weights.
+  - `weights`: A lazily-evaluated vector of smoothed, truncated, and normalized weights.
   - `pareto_k`: Estimates of the shape parameter `k` of the generalized Pareto distribution.
   - `ess`: Estimated effective sample size for each LOO evaluation, based on the variance of
     the weights.
@@ -102,7 +104,8 @@ See also: [`relative_eff`]@ref, [`psis_loo`]@ref, [`psis_ess`]@ref.
 function psis(
     log_ratios::AbstractArray{<:Real, 3};
     r_eff::AbstractVector{<:Real}=similar(log_ratios, 0),
-    source::Union{AbstractString, Symbol}="mcmc"
+    source::Union{AbstractString, Symbol}="mcmc",
+    log_weights::Bool=true
 )
 
     source = lowercase(String(source))
@@ -114,12 +117,8 @@ function psis(
     # Reshape to matrix (easier to deal with)
     log_ratios = reshape(log_ratios, data_size, post_sample_size)
     r_eff = _generate_r_eff(log_ratios, dims, r_eff, source)
-    weights = similar(log_ratios)
-    # Shift ratios by maximum to prevent overflow
-    @. weights = exp(log_ratios - $maximum(log_ratios; dims=2))
-    
-    r_eff = _generate_r_eff(weights, dims, r_eff, source)
     _check_input_validity_psis(reshape(log_ratios, dims), r_eff)
+    weights = @. exp(log_ratios - $maximum(log_ratios; dims=2))
 
     tail_length = Vector{Int}(undef, data_size)
     ξ = similar(r_eff)
@@ -159,36 +158,44 @@ function psis(
 end
 
 
-function psis(is_ratios::AbstractVector{<:Real}, args...)
+function psis(is_ratios::AbstractVector{<:Real}, args...; kwargs...)
     new_ratios = copy(is_ratios)
-    ξ = psis!(new_ratios)
+    ξ = psis!(new_ratios, kwargs...)
     return new_ratios, ξ
 end
 
 
 
 """
-    psis!(is_ratios::AbstractVector{<:Real}, tail_length::Integer) -> Real
-    psis!(is_ratios::AbstractVector{<:Real}, r_eff::Real) -> Real
+    psis!(is_ratios::AbstractVector{<:Real}, tail_length::Integer; log_weights=false) -> Real
+    psis!(is_ratios::AbstractVector{<:Real}, r_eff::Real; log_weights=false) -> Real
 
 Do PSIS on a single vector, smoothing its tail values *in place* before returning the 
-estimated tail value.
+estimated shape parameter of the generalized Pareto distribution (the `pareto_k` value).
+This *does not* normalize the log-weights.
 
 # Arguments
 
   - `is_ratios::AbstractVector{<:Real}`: A vector of importance sampling ratios,
     scaled to have a maximum of 1.
-  - `r_eff::AbstractVector{<:Real}`: A vector of relative effective sample sizes if .
+  - `r_eff::Real`: The relative effective sample size, used for choosing the tail
+    length. It may be passed in place of an explicit `tail_length`, in which
+    case `psis!` will automatically calculate the correct tail length.
+  - `log_weights::Bool`: A boolean indicating whether the input vector is a vector of log
+    ratios, rather than raw importance sampling ratios.
 
 # Returns
 
-  - `T<:Real`: ξ, the shape parameter for the GPD; big numbers indicate thick tails.
+  - `Real`: ξ, the shape parameter for the GPD. Bigger numbers indicate thicker tails.
 
 # Notes
 
-Unlike `psis`, `psis!` performs no checks to make sure the input values are valid.
+Unlike the methods for arrays, `psis!` performs no checks to make sure the input values are 
+valid.
 """
-function psis!(is_ratios::AbstractVector{<:Real}, tail_length::Integer)
+function psis!(is_ratios::AbstractVector{<:Real}, tail_length::Integer; 
+    log_weights::Bool=false
+)
 
     len = length(is_ratios)
     tail_start = len - tail_length + 1  # index of smallest tail value
@@ -199,6 +206,10 @@ function psis!(is_ratios::AbstractVector{<:Real}, tail_length::Integer)
     is_ratios .= first.(ratio_index)
     @views tail = is_ratios[tail_start:len]
     _check_tail(tail)
+    if log_weights 
+        biggest = maximum(tail)
+        @. tail = exp(tail - biggest)
+    end
 
     # Get value just before the tail starts:
     cutoff = is_ratios[tail_start - 1]
@@ -209,7 +220,11 @@ function psis!(is_ratios::AbstractVector{<:Real}, tail_length::Integer)
     # unsort the ratios to their original position:
     invpermute!(is_ratios, last.(ratio_index))
 
-    return ξ::T
+    if log_weights 
+        @. tail = log(tail + biggest)
+    end
+
+    return ξ
 end
 
 
@@ -222,7 +237,8 @@ end
 """
     _def_tail_length(log_ratios::AbstractVector, r_eff::Real) -> Integer
 
-Define the tail length as in Vehtari et al. (2019).
+Define the tail length as in Vehtari et al. (2019), with the small addition that the tail
+must be a multiple of `32*bit_length` (which improves performance).
 """
 function _def_tail_length(length::Integer, r_eff::Real=1)
     return min(cld(length, 5), ceil(3 * sqrt(length / r_eff))) |> Int
@@ -322,7 +338,7 @@ end
 Check the tail to make sure a GPD fit is possible.
 """
 function _check_tail(tail::AbstractVector{T}) where {T <: Real}
-    if maximum(tail) ≈ minimum(tail)
+    if tail[end] ≈ tail[1]
         throw(
             ArgumentError(
                 "Unable to fit generalized Pareto distribution: all tail values are the " *
@@ -333,7 +349,7 @@ function _check_tail(tail::AbstractVector{T}) where {T <: Real}
         throw(
             ArgumentError(
                 "Unable to fit generalized Pareto distribution: tail length was too " *
-                "short. Likely causese are: \n$LIKELY_ERROR_CAUSES",
+                "short. Likely causes are: \n$LIKELY_ERROR_CAUSES",
             ),
         )
     end
diff --git a/src/NaiveLPD.jl b/src/NaiveLPD.jl
@@ -2,7 +2,10 @@
     naive_lpd(log_likelihood::AbstractArray{<:Real}[, chain_index])
 
 Calculate the naive (in-sample) estimate of the expected log probability density, otherwise
-known as the in-sample Bayes score. Not recommended for most uses.
+known as the in-sample Bayes score. This method yields heavily biased results, and we advise
+against using it; it is included only for pedagogical purposes.
+
+This method is unexported and can only be accessed by calling `ParetoSmooth.naive_lpd`.
 
 # Arguments
   - $LOG_LIK_ARR