🎨 Add ContrastEncoder

EssamWisam · EssamWisam · commit 0a37ea6614e7 · 2024-07-24T14:24:58.000-05:00
diff --git a/Project.toml b/Project.toml
@@ -9,6 +9,7 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
@@ -31,8 +32,9 @@ julia = "1.6.7"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs"]
+test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"]
diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl
@@ -6,13 +6,15 @@ using CategoricalArrays
 using MLJModelInterface
 using TableOperations
 using StatsBase
+using LinearAlgebra
 # Other transformers
 using Combinatorics
 import Distributions
 using Parameters
 using Dates
 using OrderedCollections
 
+
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
@@ -23,17 +25,18 @@ include("utils.jl")
 include("encoders/target_encoding/errors.jl")
 include("encoders/target_encoding/target_encoding.jl")
 include("encoders/target_encoding/interface_mlj.jl")
-export target_encoder_fit, target_encoder_transform, TargetEncoder
+export  TargetEncoder
 
 # Ordinal encoding
 include("encoders/ordinal_encoding/ordinal_encoding.jl")
 include("encoders/ordinal_encoding/interface_mlj.jl")
-export ordinal_encoder_fit, ordinal_encoder_transform, OrdinalEncoder
+export  OrdinalEncoder
 
 # Frequency encoding
 include("encoders/frequency_encoding/frequency_encoding.jl")
 include("encoders/frequency_encoding/interface_mlj.jl")
 export frequency_encoder_fit, frequency_encoder_transform, FrequencyEncoder
+export  FrequencyEncoder
 
 # Cardinality reduction
 include("transformers/cardinality_reducer/cardinality_reducer.jl")
diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl
@@ -0,0 +1,161 @@
+include("errors.jl")
+
+"""
+** Private Method **
+Build the dummy-coding contrast matrix. This helper and the four that follow construct the
+    contrast matrices for dummy, sum, backward/forward difference and Helmert coding.
+`k` is the number of levels in the feature; each returned contrast matrix has size (k, k-1).
+"""
+### 1. Dummy Coding
+function get_dummy_contrast(k)
+    return Matrix{Float64}(I, k, k - 1)
+end
+
+
+### 2. Sum Coding: identity rows for the first k-1 levels, a row of -1 for the last level
+function get_sum_contrast(k)
+    contrast = Matrix{Float64}(I, k, k - 1)
+    fill!(view(contrast, size(contrast, 1), :), -1.0)
+    return contrast
+end
+
+### 3. Backward Difference Coding
+function create_backward_vector(index::Int, length::Int)
+    # Column i = index for k = length levels; start from [i/k i/k i/k .. i/k i/k]
+    vec = fill(index / length, length)
+    # lower the first i entries, giving [-(k-i)/k .. -(k-i)/k i/k .. i/k]
+    vec[1:index] .= index / length - 1
+    return vec
+end
+function get_backward_diff_contrast(k)
+    # `init` yields an empty (k, 0) Float64 matrix for k == 1, where `hcat()` gives `Any[]`
+    return reduce(hcat, (create_backward_vector(i, k) for i in 1:k-1); init = zeros(k, 0))
+end
+
+### 4. Forward Difference Coding
+# The forward-difference contrasts are the backward-difference contrasts with every sign
+# flipped, so the matrix is obtained by negating the backward one.
+get_forward_diff_contrast(k) = -get_backward_diff_contrast(k)
+
+### 5. Helmert Coding
+function create_helmert_vector(index::Int, length::Int)
+    # Column i = index of k = length levels: [-1 .. -1 i 0 .. 0] (i leading -1s, then i, then 0s)
+    vec = -ones(length)
+    vec[index+1] = index
+    # zero the tail after position i+1 (nothing to do when i+1 is the last position)
+    if index + 2 <= length
+        vec[index+2:end] .= 0.0
+    end
+    return vec
+end
+function get_helmert_contrast(k)
+    # `init` yields an empty (k, 0) Float64 matrix for k == 1, where `hcat()` gives `Any[]`
+    return reduce(hcat, (create_helmert_vector(i, k) for i in 1:k-1); init = zeros(k, 0))
+end
+
+"""
+** Private Method **
+
+Fit a contrast encoding scheme on given data in `X`.
+
+# Arguments
+
+  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
+  - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+  - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
+  If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
+  contrast encoding scheme for each feature
+  - `buildmatrix=nothing`: A function that takes a vector of levels and the number of levels as input and should return a contrast or hypothesis matrix.
+  Required (an `ArgumentError` is thrown otherwise) if `mode` is, or contains, `:contrast` or `:hypothesis`; not used by the other modes.
+  - `ignore=true`: Whether to exclude or include the features given in `features`
+  - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+
+# Returns (in a dict)
+
+  - `vec_given_feat_val`: Maps each level for each column in the selected categorical features to a vector
+  - `encoded_features`: The subset of the categorical features of X that were encoded
+"""
+function contrast_encoder_fit(
+    X,
+    features::AbstractVector{Symbol} = Symbol[];
+    mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
+    buildmatrix = nothing,
+    ignore::Bool = true,
+    ordered_factor::Bool = false,
+)
+    # mode may be a vector only when features are listed explicitly, one mode per feature
+    mode_is_vector = mode isa AbstractVector
+    if mode_is_vector
+        ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
+        length(features) == length(mode) ||
+            throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
+    end
+
+    # buildmatrix must be given if any requested mode is :contrast or :hypothesis
+    requested_modes = mode_is_vector ? mode : (mode,)
+    if any(in((:contrast, :hypothesis)), requested_modes)
+        buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
+    end
+
+    # 1. Build, per feature, the map from each level to its row of the contrast matrix
+    function feature_mapper(col, name)
+        feat_levels = levels(col)
+        k = length(feat_levels)
+        feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
+        if feat_mode == :contrast
+            contrastmatrix = buildmatrix(feat_levels, k)
+            size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
+        elseif feat_mode == :hypothesis
+            hypothesismatrix = buildmatrix(feat_levels, k)
+            size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
+            contrastmatrix = pinv(hypothesismatrix)
+        elseif feat_mode == :dummy
+            contrastmatrix = get_dummy_contrast(k)
+        elseif feat_mode == :sum
+            contrastmatrix = get_sum_contrast(k)
+        elseif feat_mode == :backward_diff
+            contrastmatrix = get_backward_diff_contrast(k)
+        elseif feat_mode == :forward_diff
+            contrastmatrix = get_forward_diff_contrast(k)
+        elseif feat_mode == :helmert
+            contrastmatrix = get_helmert_contrast(k)
+        else
+            throw(ArgumentError("Mode $feat_mode is not supported."))
+        end
+
+        # row l of the contrast matrix encodes the l-th level; returned directly so the closure does not capture the outer local
+        return Dict(level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
+    end
+
+    # 2. Pass it to generic_fit
+    vec_given_feat_val, encoded_features = generic_fit(
+        X, features; ignore = ignore, ordered_factor = ordered_factor,
+        feature_mapper = feature_mapper,
+    )
+
+    cache = Dict(
+        :vec_given_feat_val  => vec_given_feat_val,
+        :encoded_features => encoded_features,
+    )
+
+    return cache
+end
+
+"""
+** Private Method **
+
+Apply a fitted contrast encoder: replace the levels of the selected categorical features by their contrast vectors.
+
+# Arguments
+
+  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
+  - `cache`: The dictionary returned by `contrast_encoder_fit`; only its `:vec_given_feat_val` entry is read
+
+# Returns
+
+  - `X_tr`: The table with the selected features encoded by contrast encoding.
+"""
+function contrast_encoder_transform(X, cache::Dict)
+    level_vectors = cache[:vec_given_feat_val]
+    return generic_transform(X, level_vectors; single_feat = false)
+end
diff --git a/src/encoders/contrast_encoder/errors.jl b/src/encoders/contrast_encoder/errors.jl
@@ -0,0 +1,5 @@
+MATRIX_SIZE_ERROR(k, matrix_size, feat_name)= "In ContrastEncoder, a categorical variable with $k levels should have a contrast matrix of size ($k, $(k - 1)). However, the given contrast matrix by `buildmatrix` is $matrix_size for feature $feat_name."
+MATRIX_SIZE_ERROR_HYP(k, matrix_size, feat_name)= "In ContrastEncoder, a categorical variable with $k levels should have a hypothesis matrix of size ($(k - 1), $k). However, the given hypothesis matrix by `buildmatrix` is $matrix_size for feature $feat_name."
+const IGNORE_MUST_FALSE_VEC_MODE = "In ContrastEncoder with mode given as a vector of symbols, the ignore argument must be set to false and features must be explictly specified in features."
+const BUILDFUNC_MUST_BE_SPECIFIED = "In ContrastEncoder with mode=:contrast or mode=:hypothesis, the `buildmatrix` argument must be specified."
+LENGTH_MISMATCH_VEC_MODE(len_mode, len_feat) = "In ContrastEncoder with mode given as a vector of symbols, the length of the features argument must match the number of specified modes. However, the method received $(len_mode) modes and $(len_feat) features."
diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl
@@ -0,0 +1,154 @@
+### ContrastEncoding with MLJ Interface
+
+# 1. Interface Struct
+mutable struct ContrastEncoder{AS<:AbstractVector{Symbol}} <: Unsupervised
+    features::AS                # names of the features to exclude or include
+    ignore::Bool                # whether `features` is an exclusion (true) or inclusion list
+    mode::Union{Symbol, AS}     # one coding scheme for all features, or one per feature
+    buildmatrix                 # untyped (Any): callable returning a contrast/hypothesis matrix
+    ordered_factor::Bool        # whether `OrderedFactor` features are encoded as well
+end
+
+# 2. Constructor
+# Keyword-only: every hyper-parameter is optional and falls back to the default shown
+function ContrastEncoder(;
+    features = Symbol[], ignore = true, mode = :dummy,
+    buildmatrix = nothing, ordered_factor = false,
+)
+    # hand the values to the positional constructor in field order
+    model = ContrastEncoder(features, ignore, mode, buildmatrix, ordered_factor)
+    return model
+end
+
+
+# 4. Fitted parameters (for user access): the fit result exposed under a named field
+function MMI.fitted_params(::ContrastEncoder, fitresult)
+    return (vec_given_feat_val = fitresult,)
+end
+
+# 5. Fit method
+function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X)
+    # delegate to the table-level routine, forwarding every hyper-parameter
+    fitted = contrast_encoder_fit(
+        X, transformer.features;
+        ignore = transformer.ignore,
+        mode = transformer.mode,
+        buildmatrix = transformer.buildmatrix,
+        ordered_factor = transformer.ordered_factor,
+    )
+    # fit result: the level => vector maps; report: only the list of encoded features
+    fitresult = fitted[:vec_given_feat_val]
+    report = (encoded_features = fitted[:encoded_features],)
+    # the cache slot of the returned triple is left empty
+    return fitresult, nothing, report
+end
+
+
+# 6. Transform method
+function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew)
+    # contrast_encoder_transform reads the level => vector maps from a Dict under the
+    # :vec_given_feat_val key, so wrap the fit result accordingly before delegating
+    wrapped_fitresult = Dict(
+        :vec_given_feat_val => fitresult,
+    )
+    return contrast_encoder_transform(Xnew, wrapped_fitresult)
+end
+
+# 8. Extra metadata: package-level traits first, then model-level traits
+MMI.metadata_pkg(
+    ContrastEncoder;
+    package_name = "MLJTransforms",
+    package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6",
+    package_url = "https://github.com/JuliaAI/MLJTransforms.jl",
+    is_pure_julia = true,
+)
+# Model traits: table input and output scitypes, and the path used to load the model
+MMI.metadata_model(
+    ContrastEncoder;
+    input_scitype = Table,
+    output_scitype = Table,
+    load_path = "MLJTransforms.ContrastEncoder",
+)
+
+
+"""
+$(MMI.doc_header(ContrastEncoder))
+
+`ContrastEncoder` implements various contrast encoding methods including dummy, sum, backward/forward difference, and Helmert coding and
+    supports more generic coding methods by specifying a function that returns a contrast or hypothesis matrix.
+
+# Training data
+
+In MLJ (or MLJBase) bind an instance of the unsupervised `model` to data with
+
+    mach = machine(model, X)
+
+Here:
+
+- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
+   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
+   check scitypes.
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
+If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
+contrast encoding scheme for each feature
+- `buildmatrix=nothing`: A function that takes a vector of levels and the number of levels as input and should return a contrast or hypothesis matrix.
+Only relevant if `mode` is `:contrast` or `:hypothesis`.
+- `ignore=true`: Whether to exclude or include the features given in `features`
+- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+
+# Operations
+
+- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and
+   return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
+   are always left unchanged.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vec_given_feat_val`: A dictionary that maps each level for each column in a subset of the categorical features of X to its contrast vector (the row of the contrast matrix for that level).
+
+# Report
+
+The fields of `report(mach)` are:
+
+- `encoded_features`: The subset of the categorical features of X that were encoded
+
+# Examples
+
+```julia
+using MLJ
+
+# Define categorical dataset
+X = (name   = categorical(["Ben", "John", "Mary", "John"]),
+height = [1.85, 1.67, 1.5, 1.67],
+favnum = categorical([7, 5, 10, 1]),
+age    = [23, 23, 14, 23])
+
+# Check scitype coercions:
+schema(X)
+
+encoder =  ContrastEncoder(features=[:name, :favnum]; ignore=false, mode = [:dummy, :helmert])
+mach = fit!(machine(encoder, X))
+Xnew = transform(mach, X)
+
+julia> Xnew
+    (name_1 = [1.0, 0.0, 0.0, 0.0],
+    name_2 = [0.0, 1.0, 0.0, 1.0],
+    height = [1.85, 1.67, 1.5, 1.67],
+    favnum_1 = [0.0, 1.0, 0.0, -1.0],
+    favnum_2 = [2.0, -1.0, 0.0, -1.0],
+    favnum_3 = [-1.0, -1.0, 3.0, -1.0],
+    age = [23, 23, 14, 23],)
+```
+
+See also
+[`OneHotEncoder`](@ref)
+"""
+ContrastEncoder
diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl
diff --git a/test/runtests.jl b/test/runtests.jl