diff --git a/Project.toml b/Project.toml index f3d264e..c70244d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LearnDataFrontEnds" uuid = "5cca22a3-9356-470e-ba1b-8268d0135a4b" authors = ["Anthony D. Blaom "] -version = "0.1.2" +version = "0.2.0" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" @@ -11,7 +11,7 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -CategoricalArrays = "0.10" +CategoricalArrays = "1" LearnAPI = "0.2, 1, 2" MLCore = "1.0.0" StatsModels = "0.7.4" diff --git a/docs/src/quick_start.md b/docs/src/quick_start.md index 6305c68..aa33708 100644 --- a/docs/src/quick_start.md +++ b/docs/src/quick_start.md @@ -4,7 +4,7 @@ - [Supervised classifiers](@ref) - [Transformers](@ref) - Refer to the front end [docstrings](@ref front_ends) for options ignored below. + Refer to the front end [docstrings](@ref front_ends) for options ignored below. ## Supervised regressors @@ -31,20 +31,20 @@ Your [`LearnAPI.fit`](@ref) implementation will then look like this: ```julia function LearnAPI.fit( - learner::MyLearner, - observations::Obs; - verbosity=1, - ) - X = observations.features # p x n matrix - y = observations.target # n-vector (use `Saffron(multitarget=true)` for matrix) - feature_names = observations.names + learner::MyLearner, + observations::Obs; + verbosity=1, + ) + X = observations.features # p x n matrix + y = observations.target # n-vector (use `Saffron(multitarget=true)` for matrix) + feature_names = observations.names - # do stuff with `X`, `y` and `feature_names`: - ... + # do stuff with `X`, `y` and `feature_names`: + ... end LearnAPI.fit(learner::MyLearner, data; kwargs...) = - LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...) + LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...) 
 ```

For each [`KindOfProxy`](@ref) subtype `K` to be supported (e.g., `Point`), your
@@ -52,14 +52,14 @@ For each [`KindOfProxy`](@ref) subtype `K` to be supported (e.g., `Point`), your

 ```julia
 function LearnAPI.predict(model::MyModel, :K, observations::Obs)
-    X = observations.features # p x n matrix
-    names = observations.names # if really needed
+    X = observations.features # p x n matrix
+    names = observations.names # if really needed

-    # do stuff with `X`:
-    ...
+    # do stuff with `X`:
+    ...
 end
 LearnAPI.predict(model::MyModel, kind_of_proxy, X) =
-    LearnAPI.predict(model, kind_of_proxy, obs(model, X))
+    LearnAPI.predict(model, kind_of_proxy, obs(model, X))
 ```

 ## Supervised classifiers

@@ -94,13 +94,13 @@ function LearnAPI.fit(
     X = observations.features # p x n matrix
     y = observations.target # n-vector
     decoder = observations.decoder
-    classes_seen = observatioins.classes_seen
+    levels_seen = observations.levels_seen
     feature_names = observations.names

     # do stuff with `X`, `y` and `feature_names`:
-    # return a `model` object which also stores the `decoder` and/or `classes_seen`
-    # to make them available to `predict`.
-    ...
+    # return a `model` object which also stores the `decoder` and/or `levels_seen`
+    # to make them available to `predict`.
+    ...
 end
 LearnAPI.fit(learner::MyLearner, data; kwargs...) =
     LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...)
@@ -116,10 +116,10 @@ function LearnAPI.predict(model::MyModel, :K, observations::Obs)

     # Do stuff with `X` and `model` to obtain raw `predictions` (a vector of integer
     # codes for `K = Point`, or an `n x c` matrix of probabilities for `K = Distribution`).
-    # Extract `decoder` or `classes_seen` from `model`.
+    # Extract `decoder` or `levels_seen` from `model`.
     # For `K = Point`, return `decoder.(predictions)`.
     # For `K = Distribution`, return, say,
-    # `CategoricalDistributions.Univariate(classes_seen, predictions)`.
+    # `CategoricalDistributions.UnivariateFinite(levels_seen, predictions)`.
     ...
end LearnAPI.predict(model::MyModel, kind_of_proxy, X) = LearnAPI.predict(model, @@ -152,29 +152,29 @@ Your [`LearnAPI.fit`](@ref) implementation will then look like this: ```julia function LearnAPI.fit( - learner::MyLearner, - observations::Obs; - verbosity=1, - ) - x = observations.features # p x n matrix - feature_names = observations.names - - # do stuff with `x` and `feature_names`: - ... + learner::MyLearner, + observations::Obs; + verbosity=1, + ) + x = observations.features # p x n matrix + feature_names = observations.names + + # do stuff with `x` and `feature_names`: + ... end LearnAPI.fit(learner::MyLearner, data; kwargs...) = - LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...) + LearnAPI.fit(learner, LearnAPI.obs(learner, data); kwargs...) ``` Your [`LearnAPI.transform`](@ref) implementation will look like this: ```julia function LearnAPI.transform(model::MyModel, observations::Obs) - x = observations.features # p x n matrix - feature_names = observations.names # if really needed + x = observations.features # p x n matrix + feature_names = observations.names # if really needed - # do stuff with `x`: - ... + # do stuff with `x`: + ... 
 end
 LearnAPI.transform(model::MyModel, X) = LearnAPI.transform(model, obs(model, X))
 ```
diff --git a/docs/src/reference.md b/docs/src/reference.md
index afd44d4..a8c5be3 100644
--- a/docs/src/reference.md
+++ b/docs/src/reference.md
@@ -15,6 +15,5 @@ LearnDataFrontEnds.feature_names
 LearnDataFrontEnds.swapdims
 LearnDataFrontEnds.decoder
 LearnDataFrontEnds.decompose
-LearnDataFrontEnds.classes
 LearnDataFrontEnds.canonify
 ```
diff --git a/src/backends.jl b/src/backends.jl
index c0aa7e6..94ec163 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -31,10 +31,10 @@ If [`Sage`](@ref)`(multitarget=..., code_type=...)` has been implemented, then
 `observations.target` has an integer element type controlled by `code_type`, and we
 additionally have:

-- `observations.classes`: A categorical vector of the ordered target classes, as actually
-  seen in the user-supplied target, with the full pool of classes available by applying
-  `Categorical.levels` to the result. The corresponding integer codes will be
-  `sort(unique(observations.target))`.
+- `observations.levels_seen`: A categorical vector of the ordered target levels, as actually
+  seen in the user-supplied target. The corresponding integer codes will be
+  `sort(unique(observations.target))`. To get the full pool of levels, apply
+  `CategoricalArrays.levels` to `observations.levels_seen`; see the example below.

 - `observations.decoder`: A callable function that converts an integer code back to the
   original `CategoricalValue` it represents.

@@ -42,6 +42,75 @@ additionally have:
 Pass the first onto `predict` for making probabilistic predictions, and the second for
 point predictions; see [`Sage`](@ref) for details.
+# Extended help + +In the example below, `observations` implements the full `Obs` interface described above, +for a learner implementing the `Sage` front end: + +```julia-repl +using LearnAPI, LearnDataFrontEnds, LearnTestAPI +using CategoricalDistributions, CategoricalArrays, DataFrames +X = DataFrame(rand(10, 3), :auto) +y = categorical(collect("ababababac")) +learner = LearnTestAPI.ConstantClassifier() +observations = obs(learner, (X[1:9,:], y[1:9])) + +julia> observations.features +3×9 Matrix{Float64}: + 0.234043 0.526468 0.227417 0.956471 … 0.00587146 0.169291 0.353518 0.402631 + 0.631083 0.151317 0.781049 0.00320728 0.756519 0.15317 0.452169 0.127005 + 0.285315 0.347433 0.69174 0.516915 0.900343 0.404006 0.448986 0.962649 + +julia> yint = observations.target +9-element Vector{UInt32}: + 0x00000001 + 0x00000002 + 0x00000001 + 0x00000002 + 0x00000001 + 0x00000002 + 0x00000001 + 0x00000002 + 0x00000001 + +julia> observations.levels_seen +2-element CategoricalArray{Char,1,UInt32}: + 'a' + 'b' + +julia> sort(unique(observations.target)) +2-element Vector{UInt32}: + 0x00000001 + 0x00000002 + +julia> observations.levels_seen |> levels +3-element CategoricalArray{Char,1,UInt32}: + 'a' + 'b' + 'c' + +julia> observations.decoder.(yint) +9-element CategoricalArray{Char,1,UInt32}: + 'a' + 'b' + 'a' + 'b' + 'a' + 'b' + 'a' + 'b' + 'a' + +julia> d = UnivariateFinite(observations.levels_seen, [0.4, 0.6]) +UnivariateFinite{Multiclass{3}}(a=>0.4, b=>0.6) + +julia> levels(d) +3-element CategoricalArray{Char,1,UInt32}: + 'a' + 'b' + 'c' +``` + """ abstract type Obs end @@ -111,7 +180,7 @@ struct SageObs{F,T,E,D} <: Obs features::F # p x n names::Vector{Symbol} target::T - classes_seen::CategoricalArrays.CategoricalVector{E} + levels_seen::CategoricalArrays.CategoricalVector{E} decoder::D end @@ -122,8 +191,8 @@ function Base.show(io::IO, ::MIME"text/plain", observations::SageObs) println(io, " features :: $(typeof(A))($(size(A)))") println(io, " names: $(observations.names)") 
println(io, " target :: $(typeof(y))($(size(y)))") - println(io, " classes_seen: "* - "$(CategoricalArrays.unwrap.(observations.classes_seen)) "* + println(io, " levels_seen: "* + "$(CategoricalArrays.unwrap.(observations.levels_seen)) "* "(categorical vector with complete pool)") print(io, " decoder: ") end @@ -133,7 +202,7 @@ Base.getindex(observations::SageObs, idx) = MLCore.getobs(observations.features, idx), observations.names, MLCore.getobs(observations.target, idx), - observations.classes_seen, + observations.levels_seen, observations.decoder, ) diff --git a/src/saffron.jl b/src/saffron.jl index 716700f..3e953fd 100644 --- a/src/saffron.jl +++ b/src/saffron.jl @@ -150,13 +150,13 @@ function finalize(x, names, y, int) # here `int` is `levelcode` or `refcode` fu CategoricalArrays.CategoricalArray, SubArray{<:Any, <:Any, <:CategoricalArrays.CategoricalArray}, } || throw(ERR_EXPECTED_CATEGORICAL) - l = LearnDataFrontEnds.classes(y) + l = CategoricalArrays.levels(y) u = unique(y) mask = map(in(u), l) - _classes_seen = l[mask] + _levels_seen = l[mask] _decoder = LearnDataFrontEnds.decoder(l) - return SageObs(x, names, int.(y), _classes_seen, _decoder) + return SageObs(x, names, int.(y), _levels_seen, _decoder) end # for input `(x::AbstractMatrix, y::MatrixOrVector)`: diff --git a/src/sage.jl b/src/sage.jl index 89d536e..42ebabe 100644 --- a/src/sage.jl +++ b/src/sage.jl @@ -104,12 +104,12 @@ function LearnAPI.fit( X = observations.features # p x n matrix y = observations.target # n-vector or q x n matrix decoder = observations.decoder - classes_seen = observations.classes_seen + levels_seen = observations.levels_seen feature_names = observations.names # do stuff with `X`, `y` and `feature_names`: # return a `model` object which also stores the `decoder` and/or - # `classes_seen` to make them available to `predict`. + # `levels_seen` to make them available to `predict`. ... 
 end

@@ -127,10 +127,10 @@ function LearnAPI.predict(model::MyModel, :K, observations::Obs)

     # Do stuff with `X` and `model` to obtain raw `predictions` (a vector of integer
     # codes for `K = Point`, or an `n x c` matrix of probabilities for `K = Distribution`).
-    # Extract `decoder` or `classes_seen` from `model`.
+    # Extract `decoder` or `levels_seen` from `model`.
     # For `K = Point`, return `decoder.(predictions)`.
     # For `K = Distribution`, return, say,
-    # `CategoricalDistributions.Univariate(classes_seen, predictions)`.
+    # `CategoricalDistributions.UnivariateFinite(levels_seen, predictions)`.
     ...
 end
 LearnAPI.predict(model::MyModel, kind_of_proxy, X) = LearnAPI.predict(model,
diff --git a/src/tools.jl b/src/tools.jl
index 14b557b..2925d4d 100644
--- a/src/tools.jl
+++ b/src/tools.jl
@@ -112,58 +112,8 @@ function decompose(X, v, _targets::NTuple)
     return swapdims(A, v), collect(names), swapdims(B, v)
 end

-"""
-    classes(x)
-
-*Private method.*
-
-Return, as a `CategoricalVector`, all the categorical elements with
-the same pool as `CategoricalValue` `x` (including `x`), with an
-ordering consistent with the pool. Note that `x in classes(x)` is
-always true.
-
-Not to be confused with `levels(x.pool)`. See the example below.
-
-Also, overloaded for `x` a `CategoricalArray`, `CategoricalPool`, and for views of
-`CategoricalArray`.
- - julia> v = categorical(['c', 'b', 'c', 'a']) - 4-element CategoricalArrays.CategoricalArray{Char,1,UInt32}: - 'c' - 'b' - 'c' - 'a' - - julia> levels(v) - 3-element Array{Char,1}: - 'a' - 'b' - 'c' - - julia> x = v[4] - CategoricalArrays.CategoricalValue{Char,UInt32} 'a' - - julia> classes(x) - 3-element CategoricalArrays.CategoricalArray{Char,1,UInt32}: - 'a' - 'b' - 'c' - - julia> levels(x.pool) - 3-element Array{Char,1}: - 'a' - 'b' - 'c' - -""" -classes(p::CategoricalArrays.CategoricalPool) = [p[i] for i in 1:length(p)] -classes(x::CategoricalArrays.CategoricalValue) = classes(CategoricalArrays.pool(x)) -classes(v::CategoricalArrays.CategoricalArray) = classes(CategoricalArrays.pool(v)) -classes(v::SubArray{<:Any, <:Any, <:CategoricalArrays.CategoricalArray}) = classes(parent(v)) - - struct CategoricalDecoder{V,R} - classes::CategoricalArrays.CategoricalVector{ + levels::CategoricalArrays.CategoricalVector{ V, R, V, @@ -193,7 +143,7 @@ pool as `x`. *Warning:* There is no guarantee that `levelcode.(d.(u)) == u` will always holds. 
""" -decoder(x) = CategoricalDecoder(classes(x)) +decoder(x) = CategoricalDecoder(CategoricalArrays.levels(x)) (d::CategoricalDecoder{V,R})(i::Integer) where {V,R} = - CategoricalArrays.CategoricalValue{V,R}(d.classes[i]) + CategoricalArrays.CategoricalValue{V,R}(d.levels[i]) diff --git a/test/backends.jl b/test/backends.jl index d98ec93..9f04217 100644 --- a/test/backends.jl +++ b/test/backends.jl @@ -7,7 +7,7 @@ import CategoricalArrays y = [3, 2, 1] names = [:x1, :x2] ycat = CategoricalArrays.categorical(y) - c = LearnDataFrontEnds.classes(ycat) + c = CategoricalArrays.levels(ycat) d = LearnDataFrontEnds.decoder(ycat) mime = MIME"text/plain"() @@ -18,7 +18,7 @@ import CategoricalArrays "[:x1, :x2]\n target :: Vector{Int64}((3,))" @test sprint(show, mime, LearnDataFrontEnds.SageObs(x, names, y, c, d)) == "SageObs\n features :: Matrix{Int64}((2, 3))\n names: "* - "[:x1, :x2]\n target :: Vector{Int64}((3,))\n classes_seen: "* + "[:x1, :x2]\n target :: Vector{Int64}((3,))\n levels_seen: "* "[1, 2, 3] (categorical vector with complete pool)\n decoder: " end diff --git a/test/sage.jl b/test/sage.jl index e755b13..d5c1385 100644 --- a/test/sage.jl +++ b/test/sage.jl @@ -62,8 +62,8 @@ f = @formula(t ~ c + a) @test o.names == [:x1, :x2] @test o.target == repeat([1, 2, 1], n) @test eltype(o.target) == Int - @test o.classes_seen == CA.levels(y)[1:2] - @test o.classes_seen isa CA.CategoricalArray + @test o.levels_seen == CA.levels(y)[1:2] + @test o.levels_seen isa CA.CategoricalArray yy = o.decoder.(o.target) @test yy == y @test yy isa CA.CategoricalVector @@ -87,8 +87,8 @@ end @test o.names == [:c, :a] @test o.target == repeat([1, 2, 1], n) @test eltype(o.target) == Int - @test o.classes_seen == CA.levels(y)[1:2] - @test o.classes_seen isa CA.CategoricalArray + @test o.levels_seen == CA.levels(y)[1:2] + @test o.levels_seen isa CA.CategoricalArray yy = o.decoder.(o.target) @test yy == y @test yy isa CA.CategoricalVector @@ -112,8 +112,8 @@ end @test o.features == x @test 
o.target == repeat([1, 2, 1], n) @test eltype(o.target) == Int - @test o.classes_seen == CA.levels(y)[1:2] - @test o.classes_seen isa CA.CategoricalArray + @test o.levels_seen == CA.levels(y)[1:2] + @test o.levels_seen isa CA.CategoricalArray yy = o.decoder.(o.target) @test yy == y @test yy isa CA.CategoricalVector @@ -138,8 +138,8 @@ end @test o.names == [:c, :a] @test o.target == repeat([1, 2, 1], n) @test eltype(o.target) == Int - @test o.classes_seen == CA.levels(y)[1:2] - @test o.classes_seen isa CA.CategoricalArray + @test o.levels_seen == CA.levels(y)[1:2] + @test o.levels_seen isa CA.CategoricalArray yy = o.decoder.(o.target) @test yy == y @test yy isa CA.CategoricalVector @@ -163,8 +163,8 @@ end @test o.names == [:c, :a] @test o.target == repeat([1, 2, 1], n) @test eltype(o.target) == Int - @test o.classes_seen == CA.levels(y)[1:2] - @test o.classes_seen isa CA.CategoricalArray + @test o.levels_seen == CA.levels(y)[1:2] + @test o.levels_seen isa CA.CategoricalArray yy = o.decoder.(o.target) @test yy == y @test yy isa CA.CategoricalVector @@ -223,7 +223,7 @@ struct ConstantClassifierFitted learner::ConstantClassifier probabilities names::Vector{Symbol} - classes_seen + levels_seen codes_seen decoder end @@ -256,7 +256,7 @@ function LearnAPI.fit(learner::ConstantClassifier, observations::Obs; verbosity= y = observations.target # integer "codes" names = observations.names - classes_seen = observations.classes_seen + levels_seen = observations.levels_seen codes_seen = sort(unique(y)) decoder = observations.decoder @@ -268,7 +268,7 @@ function LearnAPI.fit(learner::ConstantClassifier, observations::Obs; verbosity= learner, probabilities, names, - classes_seen, + levels_seen, codes_seen, decoder, ) @@ -290,7 +290,7 @@ function LearnAPI.predict(model::ConstantClassifierFitted, ::Distribution, obser probs = model.probabilities # repeat vertically to get rows of a matrix: probs_matrix = reshape(repeat(probs, n), (length(probs), n))' - return 
CategoricalDistributions.UnivariateFinite(model.classes_seen, probs_matrix) + return CategoricalDistributions.UnivariateFinite(model.levels_seen, probs_matrix) end LearnAPI.predict(model::ConstantClassifierFitted, ::Distribution, Xnew) = predict(model, Distribution(), obs(model, Xnew)) diff --git a/test/tools.jl b/test/tools.jl index a810341..061aea5 100644 --- a/test/tools.jl +++ b/test/tools.jl @@ -8,8 +8,9 @@ import LearnDataFrontEnds: DoView, DontView, Multitarget, Unitarget using CategoricalArrays using Random using StableRNGs -import LearnDataFrontEnds: classes, decoder +import LearnDataFrontEnds: decoder +# developers, use this to work locally: # include("_some_learners.jl") @testset "decompose" begin @@ -79,17 +80,6 @@ end rng = StableRNGs.StableRNG(123) -@testset "classes" begin - v = categorical(collect("asqfasqffqsaaaa"), ordered=true) - @test classes(v[1]) == levels(v) - @test classes(v) == levels(v) - levels!(v, reverse(levels(v))) - @test classes(v[1]) == levels(v) - @test classes(v) == levels(v) - vsub = view(v, 1:2) - @test classes(vsub) == classes(v) -end - const int = CategoricalArrays.refcode @testset "decoder" begin @@ -123,7 +113,7 @@ const int = CategoricalArrays.refcode e = decoder(y) @test e.(int.(W)) == W - @test int.(classes(y)) == 1:length(classes(x)) + @test int.(levels(y)) == 1:length(levels(x)) v = categorical(['a', 'b', 'c'], ordered=true) end