Skip to content

Commit eea0466

Browse files
authored
Merge branch 'main' into missingness-encoder
2 parents c30e7d7 + 972da5c commit eea0466

24 files changed

+3284
-4
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
3333
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
3434
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
3535
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
36-
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
3736
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
37+
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
3838

3939
[targets]
4040
test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"]

src/MLJTransforms.jl

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,16 @@ using CategoricalArrays
66
using MLJModelInterface
77
using TableOperations
88
using StatsBase
9+
using LinearAlgebra
10+
911
# Other transformers
1012
using Combinatorics
1113
import Distributions
1214
using Parameters
1315
using Dates
1416
using OrderedCollections
1517

18+
1619
const MMI = MLJModelInterface
1720

1821
# Functions of generic use across transformers
@@ -23,13 +26,11 @@ include("utils.jl")
2326
include("encoders/target_encoding/errors.jl")
2427
include("encoders/target_encoding/target_encoding.jl")
2528
include("encoders/target_encoding/interface_mlj.jl")
26-
export target_encoder_fit, target_encoder_transform, TargetEncoder
2729
export TargetEncoder
2830

2931
# Ordinal encoding
3032
include("encoders/ordinal_encoding/ordinal_encoding.jl")
3133
include("encoders/ordinal_encoding/interface_mlj.jl")
32-
export ordinal_encoder_fit, ordinal_encoder_transform, OrdinalEncoder
3334
export OrdinalEncoder
3435

3536
# Frequency encoding
@@ -47,4 +48,24 @@ include("encoders/missingness_encoding/missingness_encoding.jl")
4748
include("encoders/missingness_encoding/interface_mlj.jl")
4849
export MissingnessEncoder
4950

50-
end
51+
# Contrast encoder
52+
include("encoders/contrast_encoder/contrast_encoder.jl")
53+
include("encoders/contrast_encoder/interface_mlj.jl")
54+
export ContrastEncoder
55+
56+
# MLJModels transformers
57+
include("transformers/other_transformers/continuous_encoder.jl")
58+
include("transformers/other_transformers/interaction_transformer.jl")
59+
include("transformers/other_transformers/univariate_time_type_to_continuous.jl")
60+
include("transformers/other_transformers/fill_imputer.jl")
61+
include("transformers/other_transformers/one_hot_encoder.jl")
62+
include("transformers/other_transformers/standardizer.jl")
63+
include("transformers/other_transformers/univariate_boxcox_transformer.jl")
64+
include("transformers/other_transformers/univariate_discretizer.jl")
65+
include("transformers/other_transformers/metadata_shared.jl")
66+
67+
export UnivariateDiscretizer,
68+
UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer,
69+
OneHotEncoder, ContinuousEncoder, FillImputer, UnivariateFillImputer,
70+
UnivariateTimeTypeToContinuous, InteractionTransformer
71+
end
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
include("errors.jl")
2+
3+
"""
**Private method.**

This and the following four methods implement the contrast matrix for dummy coding,
sum coding, backward/forward difference coding and Helmert coding, where `k` is the
number of levels in the feature and the returned contrast matrix has dimensions
`(k, k-1)`.
"""
### 1. Dummy Coding
function get_dummy_contrast(k)
    # First k-1 columns of the k×k identity: level i (i < k) maps to the i-th unit
    # vector, while the last level maps to the all-zero row.
    return Matrix{Float64}(I, k, k - 1)
end
14+
15+
16+
### 2. Sum Coding
17+
"""
    get_sum_contrast(k)

Return the `(k, k-1)` sum-coding contrast matrix: a `(k-1, k-1)` identity stacked on
top of a single row filled with `-1.0`.
"""
function get_sum_contrast(k)
    identity_rows = Matrix{Float64}(I, k - 1, k - 1)
    last_level_row = fill(-1.0, 1, k - 1)
    return vcat(identity_rows, last_level_row)
end
22+
23+
### 3. Backward Difference Coding
24+
"""
    create_backward_vector(index::Int, length::Int)

Build column `index` of the backward difference contrast matrix for a feature with
`length` levels: the first `index` entries equal `index/length - 1` and the remaining
entries equal `index/length`.
"""
function create_backward_vector(index::Int, length::Int)
    # [i/k i/k i/k .. i/k i/k]
    column = fill(index / length, length)

    # [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k]
    column[1:index] .= index / length - 1
    return column
end

"""
    get_backward_diff_contrast(k)

Return the `(k, k-1)` backward difference contrast matrix for a feature with `k`
levels.

The matrix is filled column by column rather than by splatting a vector of columns
into `hcat`; this way a single-level feature (`k == 1`) yields an empty `(1, 0)`
`Matrix{Float64}`, consistent with the dummy and sum coding helpers, instead of the
0-element `Vector{Any}` that `hcat()` returns when called with no arguments.
"""
function get_backward_diff_contrast(k)
    n_columns = max(k - 1, 0)
    contrast = Matrix{Float64}(undef, k, n_columns)
    for i in 1:n_columns
        contrast[:, i] = create_backward_vector(i, k)
    end
    return contrast
end
35+
36+
### 4. Forward Difference Coding
37+
"""
    get_forward_diff_contrast(k)

Return the forward difference contrast matrix for a feature with `k` levels, computed
as the negation of the backward difference contrast matrix.
"""
function get_forward_diff_contrast(k)
    backward = get_backward_diff_contrast(k)
    return -backward
end
40+
41+
### 5. Helmert Coding
42+
"""
    create_helmert_vector(index::Int, length::Int)

Build column `index` of the Helmert contrast matrix for a feature with `length`
levels: the first `index` entries are `-1`, entry `index + 1` equals `index`, and all
later entries are `0`.
"""
function create_helmert_vector(index::Int, length::Int)
    # [ 0 0 0 .. 0 0]
    column = zeros(length)
    # [ -1 -1 -1 0 .. 0 0]
    column[1:index] .= -1.0
    # [ -1 -1 -1 i .. 0 0]
    column[index + 1] = index
    return column
end

"""
    get_helmert_contrast(k)

Return the `(k, k-1)` Helmert contrast matrix for a feature with `k` levels.

The matrix is filled column by column rather than by splatting a vector of columns
into `hcat`; this way a single-level feature (`k == 1`) yields an empty `(1, 0)`
`Matrix{Float64}`, consistent with the dummy and sum coding helpers, instead of the
0-element `Vector{Any}` that `hcat()` returns when called with no arguments.
"""
function get_helmert_contrast(k)
    n_columns = max(k - 1, 0)
    contrast = Matrix{Float64}(undef, k, n_columns)
    for i in 1:n_columns
        contrast[:, i] = create_helmert_vector(i, k)
    end
    return contrast
end
56+
57+
"""
**Private method.**

Fit a contrast encoding scheme on given data in `X`.

# Arguments

- `X`: A table where the elements of the categorical features have
  [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or
  `OrderedFactor`
- `features=[]`: A list of names of categorical features given as symbols to exclude
  or include from encoding
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`,
  `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
  If `ignore=false` (features to be encoded are listed explicitly in `features`), then
  this can be a vector of the same length as `features` to specify a different
  contrast encoding scheme for each feature
- `buildmatrix=nothing`: A function or other callable with signature
  `buildmatrix(colname, k)`, where `colname` is the name of the feature and `k` is its
  number of levels, and which returns a contrast matrix of size `(k, k-1)` or a
  hypothesis matrix of size `(k-1, k)` with row/column ordering consistent with the
  ordering of `levels(col)`. Only relevant if `mode` is (or contains) `:contrast` or
  `:hypothesis`.
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them

# Returns (in a dict)

- `:vector_given_value_given_feature`: Maps each level for each column in the selected
  categorical features to a vector (the corresponding row of the contrast matrix)
- `:encoded_features`: The subset of the categorical features of X that were encoded

# Throws

- `ArgumentError` if `mode` is a vector while `ignore=true`, or while its length
  differs from that of `features`
- `ArgumentError` if any requested mode is `:contrast` or `:hypothesis` and
  `buildmatrix` is `nothing`
- `ArgumentError` if `buildmatrix` returns a matrix of the wrong size, or if a mode is
  not one of the supported symbols
"""
function contrast_encoder_fit(
    X,
    features::AbstractVector{Symbol} = Symbol[];
    mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
    buildmatrix = nothing,
    ignore::Bool = true,
    ordered_factor::Bool = false,
)
    # mode should be a vector only if features is a vector of the same length.
    # Test against AbstractVector (the declared type) so that non-`Vector` inputs such
    # as views are treated as per-feature modes too.
    mode_is_vector = mode isa AbstractVector
    if mode_is_vector
        ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
        length(features) == length(mode) || throw(
            ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))),
        )
    end

    # buildmatrix should be specified if any requested mode is :contrast or
    # :hypothesis; this also covers the case where mode is given per feature.
    requested_modes = mode_is_vector ? mode : (mode,)
    if any(in((:contrast, :hypothesis)), requested_modes)
        buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
    end

    # 1. Define the feature mapper: builds the contrast matrix for one column and maps
    # each of its levels to the matching row of that matrix.
    function feature_mapper(col, name)
        feat_levels = levels(col)
        k = length(feat_levels)
        feat_mode = mode_is_vector ? mode[findfirst(isequal(name), features)] : mode
        if feat_mode == :contrast
            contrastmatrix = buildmatrix(name, k)
            size(contrastmatrix) == (k, k - 1) || throw(
                ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)),
            )
        elseif feat_mode == :hypothesis
            hypothesismatrix = buildmatrix(name, k)
            size(hypothesismatrix) == (k - 1, k) || throw(
                ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)),
            )
            # The contrast matrix is the pseudo-inverse of the hypothesis matrix
            contrastmatrix = pinv(hypothesismatrix)
        elseif feat_mode == :dummy
            contrastmatrix = get_dummy_contrast(k)
        elseif feat_mode == :sum
            contrastmatrix = get_sum_contrast(k)
        elseif feat_mode == :backward_diff
            contrastmatrix = get_backward_diff_contrast(k)
        elseif feat_mode == :forward_diff
            contrastmatrix = get_forward_diff_contrast(k)
        elseif feat_mode == :helmert
            contrastmatrix = get_helmert_contrast(k)
        else
            throw(ArgumentError("Mode $feat_mode is not supported."))
        end

        vector_given_value_given_feature = Dict(
            level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
        )
        return vector_given_value_given_feature
    end

    # 2. Pass it to generic_fit
    vector_given_value_given_feature, encoded_features = generic_fit(
        X, features; ignore = ignore, ordered_factor = ordered_factor,
        feature_mapper = feature_mapper,
    )

    cache = Dict(
        :vector_given_value_given_feature => vector_given_value_given_feature,
        :encoded_features => encoded_features,
    )

    return cache
end
145+
146+
"""
**Private method.**

Apply a fitted contrast encoder to `X`, replacing the levels of the selected
categorical variables with their contrast-encoding vectors.

# Arguments

- `X`: A table where the elements of the categorical features have
  [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or
  `OrderedFactor`
- `cache`: The output of `contrast_encoder_fit`

# Returns

- `X_tr`: The table with selected features after the selected features are encoded by
  contrast encoding.
"""
function contrast_encoder_transform(X, cache::Dict)
    # Per-feature mapping from level to contrast vector, as stored by the fit step
    level_to_vector = cache[:vector_given_value_given_feature]
    return generic_transform(X, level_to_vector; single_feat = false)
end
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Error messages raised (wrapped in `ArgumentError`) by the contrast encoder.
# The size messages interpolate `$(k - 1)` so that the expected dimension is shown as
# a number (e.g. `(3, 2)`) rather than as the literal text `3-1`.

# `buildmatrix` returned a contrast matrix whose size is not (k, k-1).
MATRIX_SIZE_ERROR(k, matrix_size, feat_name) =
    "In ContrastEncoder, a categorical variable with $k levels should have a contrast matrix of size ($k, $(k - 1)). However, the contrast matrix returned by `buildmatrix` is $matrix_size for feature $feat_name."

# `buildmatrix` returned a hypothesis matrix whose size is not (k-1, k).
MATRIX_SIZE_ERROR_HYP(k, matrix_size, feat_name) =
    "In ContrastEncoder, a categorical variable with $k levels should have a hypothesis matrix of size ($(k - 1), $k). However, the given hypothesis matrix returned by `buildmatrix` is $matrix_size for feature $feat_name."

# `mode` was given as a vector while `ignore=true`.
const IGNORE_MUST_FALSE_VEC_MODE = "In ContrastEncoder with mode given as a vector of symbols, the ignore argument must be set to false and features must be explictly specified in features."

# `mode` requires a user-supplied matrix but `buildmatrix` was not provided.
const BUILDFUNC_MUST_BE_SPECIFIED = "In ContrastEncoder with mode=:contrast or mode=:hypothesis, the `buildmatrix` argument must be specified."

# `mode` was given as a vector whose length differs from that of `features`.
LENGTH_MISMATCH_VEC_MODE(len_mode, len_feat) =
    "In ContrastEncoder with mode given as a vector of symbols, the length of the features argument must match the number of specified modes. However, the method received $(len_mode) modes and $(len_feat) features."

0 commit comments

Comments
 (0)