Skip to content

Commit 0a37ea6

Browse files
committed
🎨 Add ContrastEncoder
1 parent eff5da6 commit 0a37ea6

File tree

7 files changed

+624
-4
lines changed

7 files changed

+624
-4
lines changed

Project.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
99
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
1010
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
1111
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
12+
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1213
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
1314
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
1415
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
@@ -31,8 +32,9 @@ julia = "1.6.7"
3132
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
3233
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
3334
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
34-
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
3535
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
36+
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
37+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
3638

3739
[targets]
38-
test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs"]
40+
test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"]

src/MLJTransforms.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@ using CategoricalArrays
66
using MLJModelInterface
77
using TableOperations
88
using StatsBase
9+
using LinearAlgebra
910
# Other transformers
1011
using Combinatorics
1112
import Distributions
1213
using Parameters
1314
using Dates
1415
using OrderedCollections
1516

17+
1618
const MMI = MLJModelInterface
1719

1820
# Functions of generic use across transformers
@@ -23,17 +25,18 @@ include("utils.jl")
2325
include("encoders/target_encoding/errors.jl")
2426
include("encoders/target_encoding/target_encoding.jl")
2527
include("encoders/target_encoding/interface_mlj.jl")
26-
export target_encoder_fit, target_encoder_transform, TargetEncoder
28+
export TargetEncoder
2729

2830
# Ordinal encoding
2931
include("encoders/ordinal_encoding/ordinal_encoding.jl")
3032
include("encoders/ordinal_encoding/interface_mlj.jl")
31-
export ordinal_encoder_fit, ordinal_encoder_transform, OrdinalEncoder
33+
export OrdinalEncoder
3234

3335
# Frequency encoding
3436
include("encoders/frequency_encoding/frequency_encoding.jl")
3537
include("encoders/frequency_encoding/interface_mlj.jl")
3638
export frequency_encoder_fit, frequency_encoder_transform, FrequencyEncoder
39+
export FrequencyEncoder
3740

3841
# Cardinality reduction
3942
include("transformers/cardinality_reducer/cardinality_reducer.jl")
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
include("errors.jl")
2+
3+
"""
4+
** Private Method **
5+
This and the following four methods implement the contrast matrix for dummy coding, sum coding,
6+
backward/forward difference coding and helmert coding.
7+
Where `k` is the number of levels in the feature and the returned contrast matrix has dimensions (k,k-1).
8+
"""
9+
### 1. Dummy Coding
10+
function get_dummy_contrast(k)
    # Leading k×(k-1) block of the identity matrix: level i (i < k) maps to the i-th unit
    # vector and the last level maps to the all-zeros row.
    n_columns = k - 1
    return Matrix{Float64}(I, k, n_columns)
end
13+
14+
15+
### 2. Sum Coding
"""
** Private Method **

Return the `(k, k-1)` sum-coding contrast matrix: the leading rows form an identity block
and the last row is filled with `-1.0`.
"""
function get_sum_contrast(k)
    contrast = Matrix{Float64}(I, k, k - 1)
    # The last level is encoded as -1 in every column
    contrast[k, :] .= -1.0
    return contrast
end
21+
22+
### 3. Backward Difference Coding
"""
** Private Method **

Return column `index` of the backward-difference contrast matrix for a feature with `k`
levels: the first `index` entries are `index/k - 1` (i.e. `-(k - index)/k`) and the
remaining entries are `index/k`.
"""
function create_backward_vector(index::Int, k::Int)
    # [i/k i/k i/k .. i/k i/k]
    column = fill(index / k, k)

    # [-(k-i)/k -(k-i)/k .. i/k i/k]
    column[1:index] .= index / k - 1
    return column
end

"""
** Private Method **

Return the `(k, k-1)` backward-difference contrast matrix as a `Matrix{Float64}`.

The matrix is preallocated and filled column by column (rather than splatting a vector of
columns into `hcat`), so a single-level feature (`k = 1`) yields a proper `1×0` matrix
instead of an empty `Vector{Any}`.
"""
function get_backward_diff_contrast(k)
    # `max` keeps the degenerate `k = 0` case well-defined (a 0×0 matrix)
    contrast = Matrix{Float64}(undef, k, max(k - 1, 0))
    for i in 1:(k - 1)
        contrast[:, i] = create_backward_vector(i, k)
    end
    return contrast
end
34+
35+
### 4. Forward Difference Coding
"""
** Private Method **

Return the forward-difference contrast matrix for `k` levels, computed as the negation of
the backward-difference contrast matrix.
"""
function get_forward_diff_contrast(k)
    backward = get_backward_diff_contrast(k)
    return -backward
end
39+
40+
### 5. Helmert Coding
"""
** Private Method **

Return column `index` of the Helmert contrast matrix for a feature with `k` levels:
`index` entries equal to `-1`, followed by the value `index`, followed by zeros.
"""
function create_helmert_vector(index::Int, k::Int)
    # [0 0 0 .. 0 0]
    column = zeros(k)
    # [-1 -1 .. -1 0 .. 0]
    column[1:index] .= -1.0
    # [-1 -1 .. -1 i 0 .. 0]
    column[index + 1] = index
    return column
end

"""
** Private Method **

Return the `(k, k-1)` Helmert contrast matrix as a `Matrix{Float64}`.

The matrix is preallocated and filled column by column (rather than splatting a vector of
columns into `hcat`), so a single-level feature (`k = 1`) yields a proper `1×0` matrix
instead of an empty `Vector{Any}`.
"""
function get_helmert_contrast(k)
    # `max` keeps the degenerate `k = 0` case well-defined (a 0×0 matrix)
    contrast = Matrix{Float64}(undef, k, max(k - 1, 0))
    for i in 1:(k - 1)
        contrast[:, i] = create_helmert_vector(i, k)
    end
    return contrast
end
55+
56+
"""
57+
** Private Method **
58+
59+
Fit a contrast encoding scheme on given data in `X`.
60+
61+
# Arguments
62+
63+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
64+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
65+
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
66+
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
67+
contrast encoding scheme for each feature
68+
- `buildmatrix=nothing`: A function that takes a vector of levels and the number of levels as input and should return a contrast or hypothesis matrix.
69+
Only relevant if `mode` is `:contrast` or `:hypothesis`.
70+
- `ignore=true`: Whether to exclude or include the features given in `features`
71+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
72+
73+
# Returns (in a dict)
74+
75+
- `vec_given_feat_val`: Maps each level for each column in the selected categorical features to a vector (the corresponding row of that feature's contrast matrix)
76+
- `encoded_features`: The subset of the categorical features of X that were encoded
77+
"""
78+
function contrast_encoder_fit(
    X,
    features::AbstractVector{Symbol} = Symbol[];
    mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
    buildmatrix = nothing,
    ignore::Bool = true,
    ordered_factor::Bool = false,
)
    # A per-feature `mode` is only meaningful when the features to encode are listed
    # explicitly, with exactly one mode per feature. Test against `AbstractVector` (what
    # the signature accepts) rather than `Vector`, so views and other vector types are
    # validated as well instead of silently falling through to "mode not supported".
    mode_is_vector = mode isa AbstractVector
    if mode_is_vector
        ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
        length(features) == length(mode) || throw(
            ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))),
        )
    end

    # `buildmatrix` must be given as soon as ANY requested mode is :contrast or
    # :hypothesis, including when `mode` is a vector of per-feature modes.
    requested_modes = mode_is_vector ? mode : (mode,)
    if buildmatrix === nothing && any(in((:contrast, :hypothesis)), requested_modes)
        throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
    end

    # 1. Define how the levels of one feature are mapped to rows of its contrast matrix
    function feature_mapper(col, name)
        feat_levels = levels(col)
        k = length(feat_levels)
        feat_mode = mode_is_vector ? mode[findfirst(isequal(name), features)] : mode

        contrastmatrix = if feat_mode == :contrast
            user_matrix = buildmatrix(feat_levels, k)
            size(user_matrix) == (k, k - 1) ||
                throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(user_matrix), name)))
            user_matrix
        elseif feat_mode == :hypothesis
            hypothesismatrix = buildmatrix(feat_levels, k)
            size(hypothesismatrix) == (k - 1, k) || throw(
                ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)),
            )
            # The contrast matrix is the pseudo-inverse of the hypothesis matrix
            pinv(hypothesismatrix)
        elseif feat_mode == :dummy
            get_dummy_contrast(k)
        elseif feat_mode == :sum
            get_sum_contrast(k)
        elseif feat_mode == :backward_diff
            get_backward_diff_contrast(k)
        elseif feat_mode == :forward_diff
            get_forward_diff_contrast(k)
        elseif feat_mode == :helmert
            get_helmert_contrast(k)
        else
            throw(ArgumentError("Mode $feat_mode is not supported."))
        end

        # The l-th level is encoded by the l-th row of the contrast matrix
        return Dict(
            level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
        )
    end

    # 2. Pass it to generic_fit
    vec_given_feat_val, encoded_features = generic_fit(
        X, features; ignore = ignore, ordered_factor = ordered_factor,
        feature_mapper = feature_mapper,
    )

    return Dict(
        :vec_given_feat_val => vec_given_feat_val,
        :encoded_features => encoded_features,
    )
end
143+
144+
"""
145+
** Private Method **
146+
147+
Use a fitted contrast encoder to encode the levels of selected categorical variables with contrast encoding.
148+
149+
# Arguments
150+
151+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
152+
- `cache`: The output of `contrast_encoder_fit`
153+
154+
# Returns
155+
156+
- `X_tr`: The table with selected features after the selected features are encoded by contrast encoding.
157+
"""
158+
function contrast_encoder_transform(X, cache::Dict)
    # Each level maps to a vector rather than a scalar, hence `single_feat = false`
    return generic_transform(X, cache[:vec_given_feat_val], single_feat = false)
end
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Error messages thrown (wrapped in `ArgumentError`) by `ContrastEncoder` argument validation.

# `buildmatrix` returned a contrast matrix whose size is not (k, k-1).
# `$(k - 1)` is interpolated as a value; `$k-1` would print literally as e.g. "3-1".
MATRIX_SIZE_ERROR(k, matrix_size, feat_name) = "In ContrastEncoder, a categorical variable with $k levels should have a contrast matrix of size ($k, $(k - 1)). However, the given contrast matrix by `buildmatrix` is $matrix_size for feature $feat_name."
# `buildmatrix` returned a hypothesis matrix whose size is not (k-1, k).
MATRIX_SIZE_ERROR_HYP(k, matrix_size, feat_name) = "In ContrastEncoder, a categorical variable with $k levels should have a hypothesis matrix of size ($(k - 1), $k). However, the given hypothesis matrix by `buildmatrix` is $matrix_size for feature $feat_name."
# `mode` was given as a vector while `ignore=true`.
const IGNORE_MUST_FALSE_VEC_MODE = "In ContrastEncoder with mode given as a vector of symbols, the ignore argument must be set to false and features must be explictly specified in features."
# `mode` is `:contrast` or `:hypothesis` but no `buildmatrix` function was supplied.
const BUILDFUNC_MUST_BE_SPECIFIED = "In ContrastEncoder with mode=:contrast or mode=:hypothesis, the `buildmatrix` argument must be specified."
# `mode` was given as a vector whose length differs from that of `features`.
LENGTH_MISMATCH_VEC_MODE(len_mode, len_feat) = "In ContrastEncoder with mode given as a vector of symbols, the length of the features argument must match the number of specified modes. However, the method received $(len_mode) modes and $(len_feat) features."
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
### ContrastEncoding with MLJ Interface
2+
3+
# 1. Interface Struct
4+
mutable struct ContrastEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
    # Names of the categorical features to include in, or exclude from, encoding
    features::AS
    # Whether the features listed in `features` are excluded (`true`) or included (`false`)
    ignore::Bool
    # A single coding scheme, or one scheme per entry of `features`
    mode::Union{Symbol, AS}
    # Function returning a contrast or hypothesis matrix; `nothing` if unused
    buildmatrix::Any
    # Whether `OrderedFactor` features are encoded too
    ordered_factor::Bool
end

# 2. Constructor
# Keyword constructor supplying the default value of every hyper-parameter.
ContrastEncoder(;
    features = Symbol[],
    ignore = true,
    mode = :dummy,
    buildmatrix = nothing,
    ordered_factor = false,
) = ContrastEncoder(features, ignore, mode, buildmatrix, ordered_factor)
22+
23+
24+
# 4. Fitted parameters (for user access)
25+
# Expose the fit result to the user under the name `vec_given_feat_val`.
function MMI.fitted_params(::ContrastEncoder, fitresult)
    return (vec_given_feat_val = fitresult,)
end
28+
29+
# 5. Fit method
30+
function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X)
    # Forward the hyper-parameters of the model to the private fit routine
    fit_output = contrast_encoder_fit(
        X,
        transformer.features;
        ignore = transformer.ignore,
        mode = transformer.mode,
        buildmatrix = transformer.buildmatrix,
        ordered_factor = transformer.ordered_factor,
    )

    # The fit result is the level-to-vector mapping; the report only lists the
    # features that were encoded. No cache is kept.
    fitresult = fit_output[:vec_given_feat_val]
    report = (encoded_features = fit_output[:encoded_features],)
    return fitresult, nothing, report
end
45+
46+
47+
# 6. Transform method
48+
function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew)
    # Re-wrap the fit result in the dictionary layout `contrast_encoder_transform` expects
    return contrast_encoder_transform(Xnew, Dict(:vec_given_feat_val => fitresult))
end
56+
57+
# 8. Extra metadata
58+
# Package-level metadata for the model
MMI.metadata_pkg(
    ContrastEncoder;
    package_name = "MLJTransforms",
    package_uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6",
    package_url = "https://github.com/JuliaAI/MLJTransforms.jl",
    is_pure_julia = true,
)

# Model-level metadata: tables in, tables out
MMI.metadata_model(
    ContrastEncoder;
    input_scitype = Table,
    output_scitype = Table,
    load_path = "MLJTransforms.ContrastEncoder",
)
72+
73+
74+
"""
75+
$(MMI.doc_header(ContrastEncoder))
76+
77+
`ContrastEncoder` implements various contrast encoding methods including dummy, sum, backward/forward difference, and helmert coding and
78+
supports more generic coding methods by specifying a function that returns a contrast or hypothesis matrix.
79+
80+
# Training data
81+
82+
In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
83+
84+
mach = machine(model, X)
85+
86+
Here:
87+
88+
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
89+
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
90+
check scitypes.
91+
92+
Train the machine using `fit!(mach, rows=...)`.
93+
94+
# Hyper-parameters
95+
96+
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
97+
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
98+
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
99+
contrast encoding scheme for each feature
100+
- `buildmatrix=nothing`: A function that takes a vector of levels and the number of levels as input and should return a contrast or hypothesis matrix.
101+
Only relevant if `mode` is `:contrast` or `:hypothesis`.
102+
- `ignore=true`: Whether to exclude or include the features given in `features`
103+
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
104+
105+
# Operations
106+
107+
- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and
108+
return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
109+
are always left unchanged.
110+
111+
# Fitted parameters
112+
113+
The fields of `fitted_params(mach)` are:
114+
115+
- `vec_given_feat_val`: A dictionary that maps each level for each column in a subset of the categorical features of X into its contrast vector (the corresponding row of that feature's contrast matrix).
116+
117+
# Report
118+
119+
The fields of `report(mach)` are:
120+
121+
- `encoded_features`: The subset of the categorical features of X that were encoded
122+
123+
# Examples
124+
125+
```julia
126+
using MLJ
127+
128+
# Define categorical dataset
129+
X = (name = categorical(["Ben", "John", "Mary", "John"]),
130+
height = [1.85, 1.67, 1.5, 1.67],
131+
favnum = categorical([7, 5, 10, 1]),
132+
age = [23, 23, 14, 23])
133+
134+
# Check scitype coercions:
135+
schema(X)
136+
137+
encoder = ContrastEncoder(features=[:name, :favnum]; ignore=false, mode = [:dummy, :helmert])
138+
mach = fit!(machine(encoder, X))
139+
Xnew = transform(mach, X)
140+
141+
julia> Xnew
142+
(name_1 = [1.0, 0.0, 0.0, 0.0],
143+
name_2 = [0.0, 1.0, 0.0, 1.0],
144+
height = [1.85, 1.67, 1.5, 1.67],
145+
favnum_1 = [0.0, 1.0, 0.0, -1.0],
146+
favnum_2 = [2.0, -1.0, 0.0, -1.0],
147+
favnum_3 = [-1.0, -1.0, 3.0, -1.0],
148+
age = [23, 23, 14, 23],)
149+
```
150+
151+
See also
152+
[`OneHotEncoder`](@ref)
153+
"""
154+
ContrastEncoder

0 commit comments

Comments
 (0)