Merge pull request #40 from bkamins/patch-1

OkonSamuel · web-flow · commit e8e4aa6d76ed · 2023-02-02T21:18:18.000+01:00
Improve implementation of pdf
diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml
@@ -4,6 +4,22 @@ on:
     types:
       - created
   workflow_dispatch:
+    inputs:
+      lookback:
+        default: 3
+permissions:
+  actions: read
+  checks: read
+  contents: write
+  deployments: read
+  issues: read
+  discussions: read
+  packages: read
+  pages: read
+  pull-requests: read
+  repository-projects: read
+  security-events: read
+  statuses: read
 jobs:
   TagBot:
     if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,13 +17,17 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.3'
           - '1.6'
           - '1' # automatically expands to the latest stable 1.x release of Julia.
+          - 'nightly'
         os:
           - ubuntu-latest
         arch:
           - x64
+        include:
+          - os: windows-latest
+            version: '1'
+            arch: x86
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "CategoricalDistributions"
 uuid = "af321ab8-2d2e-40a6-b165-3d674595d28e"
 authors = ["Anthony D. Blaom <anthony.blaom@gmail.com>"]
-version = "0.1.9"
+version = "0.1.10"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -19,7 +19,7 @@ Missings = "0.4, 1"
 OrderedCollections = "1.1"
 ScientificTypes = "3.0"
 UnicodePlots = "2, 3"
-julia = "1.3"
+julia = "1.6"
 
 [extras]
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/src/arithmetic.jl b/src/arithmetic.jl
@@ -1,7 +1,7 @@
 # ## ARITHMETIC
 
 const ERR_DIFFERENT_SAMPLE_SPACES = ArgumentError(
-    "Adding two `UnivariateFinite` objects whose "*
+    "Adding two `UnivariateFinite` objects whose " *
     "sample spaces have different labellings is not allowed. ")
 
 import Base: +, *, /, -
diff --git a/src/arrays.jl b/src/arrays.jl
@@ -14,6 +14,11 @@ function Base.getindex(u::UniFinArr{<:Any,<:Any,R,P,N},
     return UnivariateFinite(u.scitype, u.decoder, prob_given_ref)
 end
 
+function Base.getindex(u::UniFinArr, idx::CartesianIndex)
+    checkbounds(u, idx)
+    return u[Tuple(idx)...]
+end
+
 function Base.getindex(u::UniFinArr{<:Any,<:Any,R,P,N},
                        I...) where {R,P,N}
     prob_given_ref = LittleDict{R,Array{P,N}}()
@@ -35,9 +40,9 @@ end
 # TODO: return an exception without throwing it:
 
 _err_incompatible_levels() = throw(DomainError(
-    "Cannot concatenate `UnivariateFiniteArray`s with "*
-    "different categorical levels (classes), "*
-    "or whose levels, when ordered, are not  "*
+    "Cannot concatenate `UnivariateFiniteArray`s with " *
+    "different categorical levels (classes), " *
+    "or whose levels, when ordered, are not  " *
     "consistently ordered. "))
 
 # terminology:
@@ -61,14 +66,12 @@ function Base.cat(us::UniFinArr{S,V,R,P,N}...;
     for i in 2:length(us)
         isordered(us[i]) == ordered || _err_incompatible_levels()
         if ordered
-            classes(us[i]) ==
-                _classes|| _err_incompatible_levels()
+            classes(us[i]) == _classes || _err_incompatible_levels()
         else
-            Set(classes(us[i])) ==
-                Set(_classes) || _err_incompatible_levels()
+            Set(classes(us[i])) == Set(_classes) || _err_incompatible_levels()
         end
-        support_with_duplicates =
-            vcat(support_with_duplicates, Dist.support(us[i]))
+        support_with_duplicates = vcat(support_with_duplicates,
+                                       Dist.support(us[i]))
     end
     _support = unique(support_with_duplicates) # no-longer categorical!
 
@@ -99,14 +102,12 @@ for func in [:pdf, :logpdf]
     eval(quote
         function Distributions.$func(
             u::AbstractArray{UnivariateFinite{S,V,R,P},N},
-            C::AbstractVector{<:Union{
-                V,
-                CategoricalValue{V,R}}}) where {S,V,R,P,N}
+            C::AbstractVector) where {S,V,R,P,N}
 
-            #ret = Array{P,N+1}(undef, size(u)..., length(C))
             ret = zeros(P, size(u)..., length(C))
-            for i in eachindex(C)
-                ret[fill(:,N)...,i] .= broadcast($func, u, C[i])
+            # note that we do not require C to use 1-based indexing
+            for (i, c) in enumerate(C)
+                ret[fill(:,N)..., i] .= broadcast($func, u, c)
             end
             return ret
         end
@@ -126,7 +127,7 @@ end
 # returns `x[i]` for `Array` inputs `x`
 # For non-Array inputs returns `zero(dtype)`
 #This avoids using an if statement
-_getindex(x::Array,i, dtype)=x[i]
+_getindex(x::Array, i, dtype)=x[i]
 _getindex(::Nothing, i, dtype) = zero(dtype)
 
 # pdf.(u, cv)
@@ -135,19 +136,23 @@ function Base.Broadcast.broadcasted(
     u::UniFinArr{S,V,R,P,N},
     cv::CategoricalValue) where {S,V,R,P,N}
 
-    cv in classes(u) || throw(err_missing_class(cv))
+    # locate `cv` in classes(u), comparing by unwrapped value; `findfirst`
+    # returns `nothing` (never 0) when there is no match, so test for that
+    cv_loc = findfirst(==(cv), classes(u))
+    cv_loc === nothing && throw(err_missing_class(cv))
 
     f() = zeros(P, size(u)) #default caller function
 
     return Base.Broadcast.Broadcasted(
         identity,
-        (get(f, u.prob_given_ref, int(cv)),)
+        (get(f, u.prob_given_ref, cv_loc),)
         )
 end
+
 Base.Broadcast.broadcasted(
     ::typeof(pdf),
     u::UniFinArr{S,V,R,P,N},
-    ::Missing) where {S,V,R,P,N} = Missings.missings(P, length(u))
+    ::Missing) where {S,V,R,P,N} = Missings.missings(P, size(u))
 
 # pdf.(u, v)
 function Base.Broadcast.broadcasted(
@@ -160,17 +165,15 @@ function Base.Broadcast.broadcasted(
     length(u) == length(v) ||throw(DimensionMismatch(
         "Arrays could not be broadcast to a common size; "*
         "got a dimension with lengths $(length(u)) and $(length(v))"))
-    for cv in v
-        ismissing(cv) || cv in classes(u) || throw(err_missing_class(cv))
-    end
 
-    # will use linear indexing:
-    v_flat = ((v[i], i) for i in 1:length(v))
+    v_loc_flat = map(enumerate(v)) do (i, x)
+        loc = ismissing(x) ? missing : findfirst(==(x), classes(u))
+        loc === nothing ? throw(err_missing_class(x)) : (loc, i)  # no such class in u
+    end
 
-    getter((cv, i), dtype) =
-        _getindex(get(u.prob_given_ref, int(cv), nothing), i, dtype)
+    getter((cv_loc, i), dtype) = _getindex(get(u.prob_given_ref, cv_loc, nothing), i, dtype)
     getter(::Tuple{Missing,Any}, dtype) = missing
-    ret_flat = getter.(v_flat, P)
+    ret_flat = getter.(v_loc_flat, P)
     return reshape(ret_flat, size(u))
 end
 
@@ -243,10 +246,10 @@ function Base.Broadcast.broadcasted(::typeof(mode),
     mode_flat = map(1:length(u)) do i
         max_prob = maximum(dic[ref][i] for ref in keys(dic))
         m = zero(R)
-        
-        # `maximum` of any iterable containing `NaN` would return `NaN` 
+
+        # `maximum` of any iterable containing `NaN` would return `NaN`
         # For this case the index `m` won't be updated in the loop as relations
-        # involving NaN as one of it's argument always returns false 
+        # involving NaN as one of their arguments always return false
         # (e.g `==(NaN, NaN)` returns false)
         throw_nan_error_if_needed(max_prob)
         for ref in keys(dic)
@@ -269,9 +272,7 @@ const ERR_EMPTY_UNIVARIATE_FINITE = ArgumentError(
     "No `UnivariateFinite` object found from which to extract classes. ")
 
 function classes(yhat::AbstractArray{<:Union{Missing,UnivariateFinite}})
-    i = findfirst(x->!ismissing(x), yhat)
+    i = findfirst(!ismissing, yhat)
     i === nothing && throw(ERR_EMPTY_UNIVARIATE_FINITE)
     return classes(yhat[i])
 end
-
-
diff --git a/src/methods.jl b/src/methods.jl
@@ -164,40 +164,24 @@ One can also do weighted fits:
 
 See also `classes`, `support`.
 """
-function Dist.pdf(
-    d::UnivariateFinite{S,V,R,P},
-    cv::CategoricalValue,
-) where {S,V,R,P}
-    return get(d.prob_given_ref, int(cv), zero(P))
-end
-Dist.pdf(d::UnivariateFinite{S,V}, c::V) where {S,V} = _pdf(d, c)
-Dist.pdf(::UnivariateFinite{S,V}, ::Missing) where {S,V} = missing
-
-# Avoid method ambiguity errors with Dist >= 0.24
-Dist.pdf(d::UnivariateFinite{S,V}, c::V) where {S,V<:Real} = _pdf(d, c)
+Dist.pdf(::UnivariateFinite, ::Missing) = missing
 
-function _pdf(d::UnivariateFinite, c)
+function Dist.pdf(d::UnivariateFinite{S,V,R,P}, c) where {S,V,R,P}
     _classes = classes(d)
     c in _classes || throw(DomainError("Value $c not in pool. "))
     pool = CategoricalArrays.pool(_classes)
-    class = pool[get(pool, c)]
-    return pdf(d, class)
+    return get(d.prob_given_ref, get(pool, c), zero(P))
 end
 
-Dist.logpdf(d::UnivariateFinite, cv::CategoricalValue) = log(pdf(d,cv))
-Dist.logpdf(d::UnivariateFinite{S,V}, c::V) where {S,V} = log(pdf(d, c))
-Dist.logpdf(::UnivariateFinite{S,V}, ::Missing) where {S,V} = missing
-
-# Avoid method ambiguity errors with Dist >= 0.24
-Dist.logpdf(d::UnivariateFinite{S,V}, c::V) where {S,V<:Real} = log(pdf(d, c))
+Dist.logpdf(d::UnivariateFinite, c) = log(pdf(d, c))
 
 function Dist.mode(d::UnivariateFinite)
     dic = d.prob_given_ref
     p = values(dic)
     max_prob = maximum(p)
     m = first(first(dic)) # mode, just some ref for now
 
-    # `maximum` of any iterable containing `NaN` would return `NaN` 
+    # `maximum` of any iterable containing `NaN` would return `NaN`
     # For this case the index `m` won't be updated in the loop below as relations
     # involving NaN as one of it's arguments always returns false
     # (e.g `==(NaN, NaN)` returns false)
diff --git a/test/arrays.jl b/test/arrays.jl
@@ -290,4 +290,69 @@ end
 
 end
 
+function ≅(x::T, y::T) where {T<:UnivariateFinite}
+    return x.decoder == y.decoder &&
+           x.prob_given_ref == y.prob_given_ref &&
+           x.scitype == y.scitype
+end
+
+@testset "CartesianIndex" begin
+    v = categorical(["a", "b"], ordered=true)
+    m = UnivariateFinite(v, rand(rng, 5, 2), augment=true)
+    @test m[1, 1] ≅ m[CartesianIndex(1, 1)] ≅ m[CartesianIndex(1, 1, 1)]
+    @test_throws BoundsError m[CartesianIndex(1)]
+    @test all(zip(Matrix(m), copy(m), m)) do (x, y, z)
+        return x ≅ y ≅ z
+    end
+    @test Matrix(m) isa Matrix
+    # TODO: probably it would be better for copy to keep it
+    #       UnivariateFiniteArray but it would be breaking
+    @test copy(m) isa Matrix
+    @test similar(m) isa Matrix
+end
+
+@testset "broadcasted pdf" begin
+    v = categorical(["a", "b"], ordered=true)
+    v2 = categorical(["a", "b"], ordered=true, levels=["b", "a"])
+    x = UnivariateFinite(v, rand(rng, 5), augment=true)
+    @test pdf.(x, v[1]) == pdf.(x, v2[1]) == pdf.(x, "a")
+    @test pdf.(x, v[2]) == pdf.(x, v2[2]) == pdf.(x, "b")
+
+    x = UnivariateFinite(v, rand(rng, 5, 2), augment=true)
+    @test size(pdf.(x, missing)) == (5, 2)
+
+    v3 = categorical(["a" "b"], ordered=true)
+    v4 = categorical(["a" "b"], ordered=true, levels=["b", "a"])
+    # note that v5 and v6 have the same shape and contents as v3 and v4
+    # just they are Matrix{Any} not CategoricalMatrix
+    v5 = Any[v3[1] v3[2]]
+    v6 = Any[v4[1] v4[2]]
+    x = UnivariateFinite(v, hcat([0.1, 0.2]), augment=true)
+
+    # these tests show that now we have corrected refpools
+    # but still there is an inconsistency in behavior
+    @test pdf.(x, v) == hcat([0.9, 0.2])
+    @test pdf.(x, v2) == hcat([0.9, 0.2])
+    @test pdf.(x, v3) == hcat([0.9, 0.2])
+    @test pdf.(x, v4) == hcat([0.9, 0.2])
+    @test pdf.(x, v5) == [0.9 0.1; 0.8 0.2]
+    @test pdf.(x, v6) == [0.9 0.1; 0.8 0.2]
+end
+
+@testset "pdf with various types" begin
+    v = categorical(["a", "b"], ordered=true)
+    a = view("a", 1:1) # quite common case when splitting strings
+    b = view("b", 1:1)
+    x = UnivariateFinite(v, [0.1, 0.2, 0.3], augment=true)
+    @test pdf.(x, a) == pdf.(x, "a") == pdf.(x, v[1])
+    @test logpdf.(x, a) == logpdf.(x, "a") == logpdf.(x, v[1])
+    @test pdf(x, [a, b]) == pdf(x, ["a", "b"]) == pdf(x, v)
+    @test logpdf(x, [a, b]) == logpdf(x, ["a", "b"]) == logpdf(x, v)
+
+    x = UnivariateFinite(v, 0.1, augment=true)
+    @test pdf.(x, a) == pdf.(x, "a") == pdf.(x, v[1]) == 0.9
+    @test logpdf.(x, a) == logpdf.(x, "a") == logpdf.(x, v[1]) == log(0.9)
+    @test pdf(x, a) == pdf(x, "a") == pdf(x, v[1]) == 0.9
+    @test logpdf(x, a) == logpdf(x, "a") == logpdf(x, v[1]) == log(0.9)
+end
 true
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -9,6 +9,12 @@ rng = StableRNGs.StableRNG(123)
 
 import CategoricalDistributions: classes, decoder, int
 
+ambiguities_vec = Test.detect_ambiguities(CategoricalDistributions,
+                                          recursive=true)
+if !isempty(ambiguities_vec)
+    @warn "$(length(ambiguities_vec)) method ambiguities detected"
+end
+
 @testset "utilities" begin
      @test include("utilities.jl")
 end