JuliaAI
diff --git a/‎Project.toml‎
Lines changed: 1 addition & 1 deletion b/‎Project.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/MLJTransforms.jl‎
Lines changed: 24 additions & 3 deletions b/‎src/MLJTransforms.jl‎
Lines changed: 24 additions & 3 deletions
diff --git a/‎src/encoders/contrast_encoder/contrast_encoder.jl‎
Lines changed: 163 additions & 0 deletions b/‎src/encoders/contrast_encoder/contrast_encoder.jl‎
Lines changed: 163 additions & 0 deletions
diff --git a/‎src/encoders/contrast_encoder/errors.jl‎
Lines changed: 5 additions & 0 deletions b/‎src/encoders/contrast_encoder/errors.jl‎
Lines changed: 5 additions & 0 deletions
@@ -33,8 +33,8 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 
 [targets]
 test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"]
@@ -6,13 +6,16 @@ using CategoricalArrays
 using MLJModelInterface
 using TableOperations
 using StatsBase
+using LinearAlgebra
+
 # Other transformers
 using Combinatorics
 import Distributions
 using Parameters
 using Dates
 using OrderedCollections
 
+
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
@@ -23,13 +26,11 @@ include("utils.jl")
 include("encoders/target_encoding/errors.jl")
 include("encoders/target_encoding/target_encoding.jl")
 include("encoders/target_encoding/interface_mlj.jl")
-export target_encoder_fit, target_encoder_transform, TargetEncoder
 export  TargetEncoder
 
 # Ordinal encoding
 include("encoders/ordinal_encoding/ordinal_encoding.jl")
 include("encoders/ordinal_encoding/interface_mlj.jl")
-export ordinal_encoder_fit, ordinal_encoder_transform, OrdinalEncoder
 export  OrdinalEncoder
 
 # Frequency encoding
@@ -47,4 +48,24 @@ include("encoders/missingness_encoding/missingness_encoding.jl")
 include("encoders/missingness_encoding/interface_mlj.jl")
 export  MissingnessEncoder
 
-end
+# Contrast encoder
+include("encoders/contrast_encoder/contrast_encoder.jl")
+include("encoders/contrast_encoder/interface_mlj.jl")
+export ContrastEncoder
+
+# MLJModels transformers
+include("transformers/other_transformers/continuous_encoder.jl")
+include("transformers/other_transformers/interaction_transformer.jl")
+include("transformers/other_transformers/univariate_time_type_to_continuous.jl")
+include("transformers/other_transformers/fill_imputer.jl")
+include("transformers/other_transformers/one_hot_encoder.jl")
+include("transformers/other_transformers/standardizer.jl")
+include("transformers/other_transformers/univariate_boxcox_transformer.jl")
+include("transformers/other_transformers/univariate_discretizer.jl")
+include("transformers/other_transformers/metadata_shared.jl")
+
+export UnivariateDiscretizer,
+    UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer,
+    OneHotEncoder, ContinuousEncoder, FillImputer, UnivariateFillImputer,
+    UnivariateTimeTypeToContinuous, InteractionTransformer
+end
@@ -0,0 +1,163 @@
+include("errors.jl")
+
+"""
+** Private Method **
+
+Return the dummy coding contrast matrix for a feature with `k` levels.
+
+This and the four coding schemes that follow it in this file (sum coding, backward/forward
+difference coding and Helmert coding) each return a contrast matrix of dimensions `(k, k-1)`.
+For dummy coding the matrix is a rectangular identity, so its last row is all zeros.
+"""
+### 1. Dummy Coding
+function get_dummy_contrast(k)
+    n_rows, n_cols = k, k - 1
+    return Matrix{Float64}(I, n_rows, n_cols)
+end
+
+
+### 2. Sum Coding
+"""
+** Private Method **
+
+Return the sum coding contrast matrix of dimensions `(k, k-1)` for a feature with `k` levels:
+a rectangular identity whose last row is overwritten with `-1.0`.
+"""
+function get_sum_contrast(k)
+    contrast = Matrix{Float64}(I, k, k - 1)
+    last_row = lastindex(contrast, 1)
+    for j in axes(contrast, 2)
+        contrast[last_row, j] = -1.0
+    end
+    return contrast
+end
+
+### 3. Backward Difference Coding
+"""
+** Private Method **
+
+Return column `index` of the backward difference contrast matrix for a feature with
+`length` levels: the first `index` entries equal `index/length - 1` and the remaining
+entries equal `index/length`.
+"""
+function create_backward_vector(index::Integer, length::Integer)
+    # start from [i/k i/k ... i/k]
+    col = fill(index / length, length)
+
+    # the first i entries become i/k - 1, i.e. -(k-i)/k
+    col[1:index] .= index / length - 1
+    return col
+end
+
+"""
+** Private Method **
+
+Return the backward difference contrast matrix of dimensions `(k, k-1)` for a feature with
+`k` levels. The matrix is preallocated so that `k = 1` yields an empty `(1, 0)` matrix, as
+the dummy and sum coding helpers do, instead of the empty vector that `hcat()` would give.
+"""
+function get_backward_diff_contrast(k)
+    contrast = Matrix{Float64}(undef, k, max(k - 1, 0))
+    for i in axes(contrast, 2)
+        contrast[:, i] = create_backward_vector(i, k)
+    end
+    return contrast
+end
+
+### 4. Forward Difference Coding
+"""
+** Private Method **
+
+Return the forward difference contrast matrix for a feature with `k` levels, obtained by
+negating the backward difference contrast matrix.
+"""
+function get_forward_diff_contrast(k)
+    backward = get_backward_diff_contrast(k)
+    return -backward
+end
+
+### 5. Helmert Coding
+"""
+** Private Method **
+
+Return column `index` of the Helmert contrast matrix for a feature with `length` levels:
+the first `index` entries are `-1`, entry `index + 1` equals `index`, and any remaining
+entries are `0`.
+"""
+function create_helmert_vector(index::Integer, length::Integer)
+    # [0 0 0 .. 0 0]
+    col = zeros(length)
+    # [-1 .. -1 0 .. 0]
+    col[1:index] .= -1.0
+    # [-1 .. -1 i 0 .. 0]
+    col[index+1] = index
+    return col
+end
+
+"""
+** Private Method **
+
+Return the Helmert contrast matrix of dimensions `(k, k-1)` for a feature with `k` levels.
+The matrix is preallocated so that `k = 1` yields an empty `(1, 0)` matrix, as the dummy
+and sum coding helpers do, instead of the empty vector that `hcat()` would give.
+"""
+function get_helmert_contrast(k)
+    contrast = Matrix{Float64}(undef, k, max(k - 1, 0))
+    for i in axes(contrast, 2)
+        contrast[:, i] = create_helmert_vector(i, k)
+    end
+    return contrast
+end
+
+"""
+** Private Method **
+
+Fit a contrast encoding scheme on given data in `X`.
+
+# Arguments
+
+  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
+  - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+  - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
+  If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
+  contrast encoding scheme for each feature
+  - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
+  where `colname` is the name of the feature and `k` is its number of levels, and which returns a contrast or
+  hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is
+  (or contains) `:contrast` or `:hypothesis`.
+  - `ignore=true`: Whether to exclude or include the features given in `features`
+  - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+
+# Returns (in a dict)
+
+  - `:vector_given_value_given_feature`: Maps each level for each column in the selected categorical features to a vector
+    (the row of that feature's contrast matrix belonging to the level)
+  - `:encoded_features`: The subset of the categorical features of X that were encoded
+
+# Throws
+
+  - `ArgumentError` if `mode` is a vector while `ignore=true`, if `mode` is a vector whose length differs from that of `features`,
+    if `:contrast` or `:hypothesis` is requested without `buildmatrix`, if `buildmatrix` returns a matrix of the wrong size,
+    or if a requested mode is not supported.
+"""
+function contrast_encoder_fit(
+    X,
+    features::AbstractVector{Symbol} = Symbol[];
+    mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
+    buildmatrix = nothing,
+    ignore::Bool = true,
+    ordered_factor::Bool = false,
+)
+    # A vector-valued `mode` assigns one scheme per entry of `features`, so the features
+    # must be listed explicitly and both lengths must agree. Any `AbstractVector` accepted
+    # by the signature is treated as a vector here, not only `Vector{Symbol}`.
+    mode_is_vector = mode isa AbstractVector
+    if mode_is_vector
+        ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
+        length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
+    end
+
+    # buildmatrix must be specified as soon as any requested mode is :contrast or :hypothesis,
+    # whether `mode` is a single symbol or a vector of symbols
+    requested_modes = mode_is_vector ? mode : (mode,)
+    if buildmatrix === nothing && any(in((:contrast, :hypothesis)), requested_modes)
+        throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
+    end
+
+    # 1. Define how a single feature is mapped: build its (k, k-1) contrast matrix according
+    # to the requested mode and associate each level with the corresponding matrix row.
+    # An unsupported mode is rejected here.
+    function feature_mapper(col, name)
+        feat_levels = levels(col)
+        k = length(feat_levels)
+        feat_mode = mode_is_vector ? mode[findfirst(isequal(name), features)] : mode
+        if feat_mode == :contrast
+            contrastmatrix = buildmatrix(name, k)
+            size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
+        elseif feat_mode == :hypothesis
+            hypothesismatrix = buildmatrix(name, k)
+            size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
+            # the contrast matrix is the pseudo-inverse of the (k-1, k) hypothesis matrix
+            contrastmatrix = pinv(hypothesismatrix)
+        elseif feat_mode == :dummy
+            contrastmatrix = get_dummy_contrast(k)
+        elseif feat_mode == :sum
+            contrastmatrix = get_sum_contrast(k)
+        elseif feat_mode == :backward_diff
+            contrastmatrix = get_backward_diff_contrast(k)
+        elseif feat_mode == :forward_diff
+            contrastmatrix = get_forward_diff_contrast(k)
+        elseif feat_mode == :helmert
+            contrastmatrix = get_helmert_contrast(k)
+        else
+            throw(ArgumentError("Mode $feat_mode is not supported."))
+        end
+
+        return Dict(level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
+    end
+
+    # 2. Pass it to generic_fit
+    vector_given_value_given_feature, encoded_features = generic_fit(
+        X, features; ignore = ignore, ordered_factor = ordered_factor,
+        feature_mapper = feature_mapper,
+    )
+
+    cache = Dict(
+        :vector_given_value_given_feature  => vector_given_value_given_feature,
+        :encoded_features => encoded_features,
+    )
+
+    return cache
+end
+
+"""
+** Private Method **
+
+Encode the levels of the selected categorical variables of `X` with a previously fitted
+contrast encoder.
+
+# Arguments
+
+  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
+  - `cache`: The dictionary returned by `contrast_encoder_fit`
+
+# Returns
+
+  - `X_tr`: The table obtained from `X` once the selected features have been contrast encoded.
+"""
+function contrast_encoder_transform(X, cache::Dict)
+    # each level maps to a vector rather than a scalar, hence `single_feat = false`
+    return generic_transform(
+        X,
+        cache[:vector_given_value_given_feature];
+        single_feat = false,
+    )
+end
@@ -0,0 +1,5 @@
+# Message for a `buildmatrix` contrast matrix whose size is not (k, k-1).
+# `$(k - 1)` is interpolated as an expression so the expected size prints as e.g. "(3, 2)" rather than "(3, 3-1)".
+MATRIX_SIZE_ERROR(k, matrix_size, feat_name) = "In ContrastEncoder, a categorical variable with $k levels should have a contrast matrix of size ($k, $(k - 1)). However, the contrast matrix returned by `buildmatrix` is $matrix_size for feature $feat_name."
+# Message for a `buildmatrix` hypothesis matrix whose size is not (k-1, k).
+# `$(k - 1)` is interpolated as an expression so the expected size prints as e.g. "(2, 3)" rather than "(3-1, 3)".
+MATRIX_SIZE_ERROR_HYP(k, matrix_size, feat_name) = "In ContrastEncoder, a categorical variable with $k levels should have a hypothesis matrix of size ($(k - 1), $k). However, the given hypothesis matrix returned by `buildmatrix` is $matrix_size for feature $feat_name."
+# Message used when `mode` is a vector of symbols but `ignore` is not `false`.
+# Declared `const` so that the global has a fixed type.
+const IGNORE_MUST_FALSE_VEC_MODE = "In ContrastEncoder with mode given as a vector of symbols, the ignore argument must be set to false and features must be explicitly specified in features."
+# Message used when `:contrast` or `:hypothesis` is requested without a `buildmatrix` callable.
+# Declared `const` so that the global has a fixed type.
+const BUILDFUNC_MUST_BE_SPECIFIED = "In ContrastEncoder with mode=:contrast or mode=:hypothesis, the `buildmatrix` argument must be specified."
+# Message used when `mode` is a vector whose length differs from the number of listed features.
+function LENGTH_MISMATCH_VEC_MODE(len_mode, len_feat)
+    return "In ContrastEncoder with mode given as a vector of symbols, the length of the features argument must match the number of specified modes. However, the method received $(len_mode) modes and $(len_feat) features."
+end