Commit 972da5c

Merge pull request #9 from JuliaAI/Contrast-Encoding
🎨 Add ContrastEncoder
2 parents aaa9b49 + 9381b01 commit 972da5c

6 files changed: +636 −2 lines changed

src/MLJTransforms.jl

Lines changed: 10 additions & 2 deletions
@@ -15,6 +15,7 @@ using Parameters
 using Dates
 using OrderedCollections
 
+
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
@@ -25,22 +26,29 @@ include("utils.jl")
 include("encoders/target_encoding/errors.jl")
 include("encoders/target_encoding/target_encoding.jl")
 include("encoders/target_encoding/interface_mlj.jl")
-export target_encoder_fit, target_encoder_transform, TargetEncoder
+export TargetEncoder
 
 # Ordinal encoding
 include("encoders/ordinal_encoding/ordinal_encoding.jl")
 include("encoders/ordinal_encoding/interface_mlj.jl")
-export ordinal_encoder_fit, ordinal_encoder_transform, OrdinalEncoder
+export OrdinalEncoder
 
 # Frequency encoding
 include("encoders/frequency_encoding/frequency_encoding.jl")
 include("encoders/frequency_encoding/interface_mlj.jl")
 export frequency_encoder_fit, frequency_encoder_transform, FrequencyEncoder
+export FrequencyEncoder
 
 # Cardinality reduction
 include("transformers/cardinality_reducer/cardinality_reducer.jl")
 include("transformers/cardinality_reducer/interface_mlj.jl")
 export cardinality_reducer_fit, cardinality_reducer_transform, CardinalityReducer
+
+# Contrast encoder
+include("encoders/contrast_encoder/contrast_encoder.jl")
+include("encoders/contrast_encoder/interface_mlj.jl")
+export ContrastEncoder
+
 # MLJModels transformers
 include("transformers/other_transformers/continuous_encoder.jl")
 include("transformers/other_transformers/interaction_transformer.jl")
src/encoders/contrast_encoder/contrast_encoder.jl

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
include("errors.jl")

"""
** Private Method **

This and the following four methods implement the contrast matrices for dummy coding, sum coding,
backward/forward difference coding and Helmert coding.
Here `k` is the number of levels in the feature and the returned contrast matrix has dimensions (k, k-1).
"""
### 1. Dummy Coding
function get_dummy_contrast(k)
    return Matrix(1.0I, k, k-1)
end


### 2. Sum Coding
function get_sum_contrast(k)
    C = Matrix(1.0I, k, k-1)
    C[end, :] .= -1.0
    return C
end

### 3. Backward Difference Coding
function create_backward_vector(index::Int, length::Int)
    # [i/k i/k i/k .. i/k i/k]
    vec = ones(length) .* index / length

    # [-(k-i)/k -(k-i)/k .. -(k-i)/k i/k .. i/k]
    vec[1:index] .= index/length - 1
    return vec
end
function get_backward_diff_contrast(k)
    return hcat([create_backward_vector(i, k) for i in 1:k-1]...)
end

### 4. Forward Difference Coding
function get_forward_diff_contrast(k)
    return -get_backward_diff_contrast(k)
end

### 5. Helmert Coding
function create_helmert_vector(index::Int, length::Int)
    # [-1 -1 -1 .. -1 -1]
    vec = -ones(length)
    # [-1 .. -1 i -1 .. -1]
    vec[index+1] = index
    # [-1 .. -1 i 0 .. 0]
    if index + 2 <= length
        vec[index+2:end] .= 0.0
    end
    return vec
end
function get_helmert_contrast(k)
    return hcat([create_helmert_vector(i, k) for i in 1:k-1]...)
end

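# To make the shapes concrete, this is roughly what the helpers above return for a
# feature with k = 4 levels (row i corresponds to the i-th entry of `levels(col)`;
# values shown for illustration only):
#
#   get_dummy_contrast(4)      get_sum_contrast(4)       get_helmert_contrast(4)
#   [1.0  0.0  0.0;            [ 1.0   0.0   0.0;        [-1.0  -1.0  -1.0;
#    0.0  1.0  0.0;              0.0   1.0   0.0;          1.0  -1.0  -1.0;
#    0.0  0.0  1.0;              0.0   0.0   1.0;          0.0   2.0  -1.0;
#    0.0  0.0  0.0]             -1.0  -1.0  -1.0]          0.0   0.0   3.0]
#
#   get_backward_diff_contrast(4)
#   [-0.75  -0.5  -0.25;
#     0.25  -0.5  -0.25;
#     0.25   0.5  -0.25;
#     0.25   0.5   0.75]
#
# get_forward_diff_contrast(4) is the negative of the backward-difference matrix.
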
"""
58+
** Private Method **
59+
60+
Fit a contrast encoing scheme on given data in `X`.
61+
62+
# Arguments
63+
64+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
65+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
66+
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
67+
If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
68+
contrast encoding scheme for each feature
69+
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
70+
where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
71+
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
72+
- `ignore=true`: Whether to exclude or includes the features given in `features`
73+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
74+
75+
# Returns (in a dict)
76+
77+
- `vec_given_feat_level`: Maps each level for each column in the selected categorical features to a vector
78+
- `encoded_features`: The subset of the categorical features of X that were encoded
79+
"""
function contrast_encoder_fit(
    X,
    features::AbstractVector{Symbol} = Symbol[];
    mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
    buildmatrix = nothing,
    ignore::Bool = true,
    ordered_factor::Bool = false,
)
    # mode should be a vector only if features is a vector of the same length
    mode_is_vector = false
    if mode isa Vector{Symbol}
        mode_is_vector = true
        ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
        length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
    end

    # buildmatrix should be specified if mode is :contrast or :hypothesis
    if mode in (:contrast, :hypothesis)
        buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
    end


    # 1. Define feature_mapper, which maps each level of a feature to its contrast vector;
    # mode must be one of :contrast, :dummy, :sum, :backward_diff, :forward_diff, :helmert or :hypothesis
    function feature_mapper(col, name)
        feat_levels = levels(col)
        k = length(feat_levels)
        feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
        if feat_mode == :contrast
            contrastmatrix = buildmatrix(name, k)
            size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
        elseif feat_mode == :hypothesis
            hypothesismatrix = buildmatrix(name, k)
            size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
            contrastmatrix = pinv(hypothesismatrix)
        elseif feat_mode == :dummy
            contrastmatrix = get_dummy_contrast(k)
        elseif feat_mode == :sum
            contrastmatrix = get_sum_contrast(k)
        elseif feat_mode == :backward_diff
            contrastmatrix = get_backward_diff_contrast(k)
        elseif feat_mode == :forward_diff
            contrastmatrix = get_forward_diff_contrast(k)
        elseif feat_mode == :helmert
            contrastmatrix = get_helmert_contrast(k)
        else
            throw(ArgumentError("Mode $feat_mode is not supported."))
        end

        vector_given_value_given_feature = Dict(level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
        return vector_given_value_given_feature
    end

    # 2. Pass it to generic_fit
    vector_given_value_given_feature, encoded_features = generic_fit(
        X, features; ignore = ignore, ordered_factor = ordered_factor,
        feature_mapper = feature_mapper,
    )

    cache = Dict(
        :vector_given_value_given_feature => vector_given_value_given_feature,
        :encoded_features => encoded_features,
    )

    return cache
end

"""
147+
** Private Method **
148+
149+
Use a fitted contrast encoder to encode the levels of selected categorical variables with contrast encoding.
150+
151+
# Arguments
152+
153+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
154+
- `cache`: The output of `contrast_encoder_fit`
155+
156+
# Returns
157+
158+
- `X_tr`: The table with selected features after the selected features are encoded by contrast encoding.
159+
"""
160+
function contrast_encoder_transform(X, cache::Dict)
161+
vector_given_value_given_feature = cache[:vector_given_value_given_feature]
162+
return generic_transform(X, vector_given_value_given_feature, single_feat = false)
163+
end
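
For orientation, here is a minimal sketch of how the two private helpers compose outside of the MLJ interface. The table and column names are made up for illustration, the helpers are internal (not exported), and the exact layout of the returned cache follows from `generic_fit` in utils.jl:

```julia
using MLJTransforms: contrast_encoder_fit, contrast_encoder_transform  # internal helpers
using CategoricalArrays

# toy table with one categorical column (k = 3 levels: "A", "B", "C")
X = (grade = categorical(["A", "B", "C", "B"]), score = [1.0, 2.0, 3.0, 2.0])

# fit a sum-coding scheme on :grade only
cache = contrast_encoder_fit(X, [:grade]; ignore = false, mode = :sum)

# with sum coding, "A" => [1.0, 0.0], "B" => [0.0, 1.0], "C" => [-1.0, -1.0]
cache[:vector_given_value_given_feature]

# :grade is replaced by two numeric columns (grade_1, grade_2); :score is untouched
Xt = contrast_encoder_transform(X, cache)
```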
src/encoders/contrast_encoder/errors.jl

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
MATRIX_SIZE_ERROR(k, matrix_size, feat_name) = "In ContrastEncoder, a categorical variable with $k levels should have a contrast matrix of size ($k, $(k-1)). However, the contrast matrix returned by `buildmatrix` is $matrix_size for feature $feat_name."
MATRIX_SIZE_ERROR_HYP(k, matrix_size, feat_name) = "In ContrastEncoder, a categorical variable with $k levels should have a hypothesis matrix of size ($(k-1), $k). However, the hypothesis matrix returned by `buildmatrix` is $matrix_size for feature $feat_name."
IGNORE_MUST_FALSE_VEC_MODE = "In ContrastEncoder, when mode is given as a vector of symbols, the ignore argument must be set to false and the features to encode must be explicitly specified in features."
BUILDFUNC_MUST_BE_SPECIFIED = "In ContrastEncoder, when mode=:contrast or mode=:hypothesis, the `buildmatrix` argument must be specified."
LENGTH_MISMATCH_VEC_MODE(len_mode, len_feat) = "In ContrastEncoder, when mode is given as a vector of symbols, the length of the features argument must match the number of specified modes. However, the method received $(len_mode) modes and $(len_feat) features."
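
As a quick illustration of when the size check above fires, here is a sketch using a hypothetical `buildmatrix` that deliberately returns the wrong shape (assumes the fit helper from contrast_encoder.jl and that the error surfaces as an `ArgumentError`):

```julia
using MLJTransforms: contrast_encoder_fit  # internal helper
using CategoricalArrays

X = (grade = categorical(["A", "B", "C"]),)

# a (k, k) matrix where a (k, k-1) contrast matrix is expected
badmatrix(name, k) = zeros(k, k)

try
    contrast_encoder_fit(X, [:grade]; ignore = false, mode = :contrast, buildmatrix = badmatrix)
catch err
    # expected to be the MATRIX_SIZE_ERROR message for k = 3 and size (3, 3)
    println(err.msg)
end
```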
src/encoders/contrast_encoder/interface_mlj.jl

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
### ContrastEncoding with MLJ Interface

# 1. Interface Struct
mutable struct ContrastEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
    features::AS
    ignore::Bool
    mode::Union{Symbol, AS}
    buildmatrix::Any
    ordered_factor::Bool
end;

# 2. Constructor
function ContrastEncoder(;
    features = Symbol[],
    ignore = true,
    mode = :dummy,
    buildmatrix = nothing,
    ordered_factor = false,
)
    return ContrastEncoder(features, ignore, mode, buildmatrix, ordered_factor)
end;


# 4. Fitted parameters (for user access)
MMI.fitted_params(::ContrastEncoder, fitresult) = (
    vector_given_value_given_feature = fitresult,
)

# 5. Fit method
function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X)
    generic_cache = contrast_encoder_fit(
        X,
        transformer.features;
        ignore = transformer.ignore,
        mode = transformer.mode,
        buildmatrix = transformer.buildmatrix,
        ordered_factor = transformer.ordered_factor,
    )
    fitresult = generic_cache[:vector_given_value_given_feature]

    report = (encoded_features = generic_cache[:encoded_features],)    # report only has list of encoded features
    cache = nothing
    return fitresult, cache, report
end;


# 6. Transform method
function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew)
    generic_cache = Dict(
        :vector_given_value_given_feature => fitresult,
    )
    Xnew_transf = contrast_encoder_transform(Xnew, generic_cache)
    return Xnew_transf
end

# 8. Extra metadata
MMI.metadata_pkg(
    ContrastEncoder,
    package_name = "MLJTransforms",
    package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6",
    package_url = "https://github.com/JuliaAI/MLJTransforms.jl",
    is_pure_julia = true,
)

MMI.metadata_model(
    ContrastEncoder,
    input_scitype = Table,
    output_scitype = Table,
    load_path = "MLJTransforms.ContrastEncoder",
)

"""
75+
$(MMI.doc_header(ContrastEncoder))
76+
77+
`ContrastEncoder` implements the following contrast encoding methods for
78+
categorical features: dummy, sum, backward/forward difference, and Helmert coding.
79+
More generally, users can specify a custom contrast or hypothesis matrix, and each feature
80+
can be encoded using a different method.
81+
82+
# Training data
83+
84+
In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
85+
86+
mach = machine(model, X)
87+
88+
Here:
89+
90+
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
91+
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
92+
check scitypes.
93+
94+
Train the machine using `fit!(mach, rows=...)`.
95+
96+
# Hyper-parameters
97+
98+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
99+
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
100+
If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
101+
contrast encoding scheme for each feature
102+
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
103+
where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
104+
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
105+
- `ignore=true`: Whether to exclude or includes the features given in `features`
106+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
107+
108+
# Operations
109+
110+
- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or `OrderedFactor features of `Xnew` specified by hyper-parameters, and
111+
return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
112+
are always left unchanged.
113+
114+
# Fitted parameters
115+
116+
The fields of `fitted_params(mach)` are:
117+
118+
- `vector_given_value_given_feature`: A dictionary that maps each level for each column in a subset of the categorical features of X into its frequency.
119+
120+
# Report
121+
122+
The fields of `report(mach)` are:
123+
124+
- `encoded_features`: The subset of the categorical features of X that were encoded
125+
126+
# Examples
127+
128+
```julia
129+
using MLJ
130+
131+
# Define categorical dataset
132+
X = (
133+
name = categorical(["Ben", "John", "Mary", "John"]),
134+
height = [1.85, 1.67, 1.5, 1.67],
135+
favnum = categorical([7, 5, 10, 1]),
136+
age = [23, 23, 14, 23],
137+
)
138+
139+
# Check scitype coercions:
140+
schema(X)
141+
142+
encoder = ContrastEncoder(
143+
features = [:name, :favnum],
144+
ignore = false,
145+
mode = [:dummy, :helmert],
146+
)
147+
mach = fit!(machine(encoder, X))
148+
Xnew = transform(mach, X)
149+
150+
julia > Xnew
151+
(name_1 = [1.0, 0.0, 0.0, 0.0],
152+
name_2 = [0.0, 1.0, 0.0, 1.0],
153+
height = [1.85, 1.67, 1.5, 1.67],
154+
favnum_1 = [0.0, 1.0, 0.0, -1.0],
155+
favnum_2 = [2.0, -1.0, 0.0, -1.0],
156+
favnum_3 = [-1.0, -1.0, 3.0, -1.0],
157+
age = [23, 23, 14, 23],)
158+
```
159+
160+
See also
161+
[`OneHotEncoder`](@ref)
162+
"""
163+
ContrastEncoder
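
The docstring example covers the built-in schemes; the custom-matrix path (`mode=:contrast` with a user-supplied `buildmatrix`) is sketched below, mirroring the setup of the example above. The matrix values and the helper name `mymatrix` are illustrative only; `buildmatrix` receives the feature name and its number of levels `k`, and must return a `(k, k-1)` matrix whose rows follow `levels(col)` order.

```julia
using MLJ

X = (name = categorical(["Ben", "John", "Mary", "John"]),
     height = [1.85, 1.67, 1.5, 1.67])

# illustrative 3×2 contrast matrix for the three levels "Ben", "John", "Mary"
mymatrix(name, k) = [1.0  0.0;
                     0.0  1.0;
                     1.0  1.0]

encoder = ContrastEncoder(
    features = [:name],
    ignore = false,
    mode = :contrast,
    buildmatrix = mymatrix,
)
mach = fit!(machine(encoder, X))
transform(mach, X)   # :name is replaced by name_1, name_2 built from the rows of mymatrix
```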
