Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/MLJTransforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ using OrderedCollections
const MMI = MLJModelInterface

# Functions of generic use across transformers
include("common_docs.jl")
include("generic.jl")
include("utils.jl")

Expand Down
27 changes: 27 additions & 0 deletions src/common_docs.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Shared docstring fragments. Each constant holds one Markdown bullet that is
# interpolated into the docstrings of the encoder `*_fit` functions and of the MLJ
# interface types, so that arguments common to several encoders are described once.
# Names and defaults are wrapped in backticks so they render as code, matching the
# other bullets in the docstrings these fragments are interpolated into.

# The `X` argument, as described in the plain (non-MLJ) `*_fit` docstrings.
const X_doc = """
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
`Multiclass` or `OrderedFactor`
"""

# The `X` argument, as described in the MLJ interface docstrings.
const X_doc_mlj = """
- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
check scitypes.
"""

# The `features` keyword / hyper-parameter.
const features_doc = """
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding,
according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
or a callable that returns true for features to be included/excluded
"""

# The `ignore` keyword / hyper-parameter.
const ignore_doc = """
- `ignore=true`: Whether to exclude or include the features given in `features`
"""

# The `ordered_factor` keyword / hyper-parameter.
const ordered_factor_doc = """
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
"""

# The `encoded_features` entry of a fit result / report.
const encoded_features_doc = """
- `encoded_features`: The subset of the categorical features of `X` that were encoded
"""

# The `cache` argument of a transform function.
# NOTE(review): this text names `contrast_encoder_fit` specifically although it lives
# with the shared fragments; confirm it is only interpolated into the contrast encoder.
const cache_doc = """
- `cache`: The output of `contrast_encoder_fit`
"""

67 changes: 38 additions & 29 deletions src/encoders/contrast_encoder/contrast_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ Where `k` is the number of levels in the feature and the returned contrast matri
"""
### 1. Dummy Coding
function get_dummy_contrast(k)
    # Dummy coding for `k` levels: a k x (k - 1) matrix whose leading block is the
    # identity, so each of the first k - 1 levels gets a unit row and the last
    # level gets the all-zero row.
    return Matrix(1.0I, k, k - 1)
end


### 2. Sum Coding
function get_sum_contrast(k)
    # Sum coding for `k` levels: start from the k x (k - 1) identity-like matrix
    # used for dummy coding, then overwrite the last level's row with -1.0 in
    # every column.
    C = Matrix(1.0I, k, k - 1)
    C[end, :] .= -1.0
    return C
end
Expand All @@ -26,7 +26,7 @@ function create_backward_vector(index::Int, length::Int)
vec = ones(length) .* index / length

# [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k]
vec[1:index] .= index/length - 1
vec[1:index] .= index / length - 1
return vec
end
function get_backward_diff_contrast(k)
Expand Down Expand Up @@ -61,21 +61,21 @@ Fit a contrast encoding scheme on given data in `X`.

# Arguments

- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
$X_doc
$features_doc
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
contrast encoding scheme for each feature
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
where `colname` is the name of the feature and `k` is the number of its levels, and which returns a contrast or
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
contrast encoding scheme for each feature
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
where `colname` is the name of the feature and `k` is the number of its levels, and which returns a contrast or
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
$ignore_doc
$ordered_factor_doc

# Returns (in a dict)
# Returns as a named-tuple

- `vector_given_value_given_feature`: Maps each level for each column in the selected categorical features to a vector
- `encoded_features`: The subset of the categorical features of X that were encoded
$encoded_features_doc
"""
function contrast_encoder_fit(
X,
Expand All @@ -90,9 +90,10 @@ function contrast_encoder_fit(
if mode isa Vector{Symbol}
mode_is_vector = true
ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
length(features) == length(mode) ||
throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
end

# buildmatrix should be specified if mode is :contrast or :hypothesis
if mode in (:contrast, :hypothesis)
buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
Expand All @@ -105,11 +106,13 @@ function contrast_encoder_fit(
k = length(feat_levels)
feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
if feat_mode == :contrast
contrastmatrix = buildmatrix(name, k)
size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
contrastmatrix = buildmatrix(name, k)
size(contrastmatrix) == (k, k - 1) ||
throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
elseif feat_mode == :hypothesis
hypothesismatrix = buildmatrix(name, k)
size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
hypothesismatrix = buildmatrix(name, k)
size(hypothesismatrix) == (k - 1, k) ||
throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
contrastmatrix = pinv(hypothesismatrix)
elseif feat_mode == :dummy
contrastmatrix = get_dummy_contrast(k)
Expand All @@ -125,7 +128,9 @@ function contrast_encoder_fit(
throw(ArgumentError("Mode $feat_mode is not supported."))
end

vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
vector_given_value_given_feature = OrderedDict(
level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
)
return vector_given_value_given_feature
end

Expand All @@ -134,10 +139,9 @@ function contrast_encoder_fit(
X, features; ignore = ignore, ordered_factor = ordered_factor,
feature_mapper = feature_mapper,
)

cache = Dict(
:vector_given_value_given_feature => vector_given_value_given_feature,
:encoded_features => encoded_features,
cache = (
vector_given_value_given_feature = vector_given_value_given_feature,
encoded_features = encoded_features,
)

return cache
Expand All @@ -157,7 +161,12 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia

- `X_tr`: The table with selected features after the selected features are encoded by contrast encoding.
"""
function contrast_encoder_transform(X, cache::Dict)
    # Dict-based cache: the per-feature, per-level contrast vectors are stored
    # under the symbol key `:vector_given_value_given_feature`.
    return generic_transform(
        X,
        cache[:vector_given_value_given_feature],
        single_feat = false;
        use_levelnames = true,
    )
end
function contrast_encoder_transform(X, cache::NamedTuple)
    # Named-tuple cache: read the per-feature, per-level contrast vectors from
    # the `vector_given_value_given_feature` field and pass them to the generic
    # transformer.
    level_vectors = cache.vector_given_value_given_feature
    return generic_transform(X, level_vectors; single_feat = false, use_levelnames = true)
end
21 changes: 9 additions & 12 deletions src/encoders/contrast_encoder/interface_mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,18 @@ function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X)
buildmatrix = transformer.buildmatrix,
ordered_factor = transformer.ordered_factor,
)
fitresult = generic_cache[:vector_given_value_given_feature]
fitresult = generic_cache.vector_given_value_given_feature

report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features
cache = nothing
return fitresult, cache, report
end;


# 6. Transform method
function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew)
generic_cache = Dict(
:vector_given_value_given_feature =>
fitresult,
generic_cache = (
vector_given_value_given_feature = fitresult,
)
Xnew_transf = contrast_encoder_transform(Xnew, generic_cache)
return Xnew_transf
Expand Down Expand Up @@ -87,23 +86,21 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with

Here:

- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
check scitypes.
$X_doc_mlj

Train the machine using `fit!(mach, rows=...)`.

# Hyper-parameters

- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
$features_doc
- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
contrast encoding scheme for each feature
- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
where `colname` is the name of the feature and `k` is the number of its levels, and which returns a contrast or
hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
$ignore_doc
$ordered_factor_doc

# Operations

Expand All @@ -121,7 +118,7 @@ The fields of `fitted_params(mach)` are:

The fields of `report(mach)` are:

- `encoded_features`: The subset of the categorical features of X that were encoded
$encoded_features_doc

# Examples

Expand Down
26 changes: 13 additions & 13 deletions src/encoders/frequency_encoding/frequency_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@ categorical features with their (normalized or raw) frequencies of occurrence in

# Arguments

- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
$X_doc
$features_doc
$ignore_doc
$ordered_factor_doc
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.

# Returns (in a dict)
# Returns as a named-tuple

- `statistic_given_feat_val`: The frequency of each level of each selected categorical feature
- `encoded_features`: The subset of the categorical features of X that were encoded
$encoded_features_doc
"""
function frequency_encoder_fit(
X,
Expand All @@ -39,11 +39,11 @@ function frequency_encoder_fit(
# 2. Pass it to generic_fit
statistic_given_feat_val, encoded_features = generic_fit(
X, features; ignore = ignore, ordered_factor = ordered_factor,
feature_mapper = feature_mapper,
)
cache = Dict(
:statistic_given_feat_val => statistic_given_feat_val,
:encoded_features => encoded_features,
feature_mapper = feature_mapper)

cache = (
statistic_given_feat_val = statistic_given_feat_val,
encoded_features = encoded_features,
)
return cache
end
Expand All @@ -62,7 +62,7 @@ Encode the levels of a categorical variable in a given table with their (normali

- `X_tr`: The table with selected features after the selected features are encoded by frequency encoding.
"""
function frequency_encoder_transform(X, cache::Dict)
    # Dict-based cache: the level frequencies are stored under the symbol key
    # `:statistic_given_feat_val`.
    statistic_given_feat_val = cache[:statistic_given_feat_val]
    return generic_transform(X, statistic_given_feat_val)
end

function frequency_encoder_transform(X, cache::NamedTuple)
    # Named-tuple cache: the level frequencies are stored in the
    # `statistic_given_feat_val` field.
    statistic_given_feat_val = cache.statistic_given_feat_val
    return generic_transform(X, statistic_given_feat_val)
end
23 changes: 10 additions & 13 deletions src/encoders/frequency_encoding/interface_mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,18 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
normalize = transformer.normalize,
output_type = transformer.output_type,
)
fitresult = generic_cache[:statistic_given_feat_val]
fitresult = generic_cache.statistic_given_feat_val

report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features
cache = nothing
return fitresult, cache, report
end;


# 6. Transform method
function MMI.transform(transformer::FrequencyEncoder, fitresult, Xnew)
generic_cache = Dict(
:statistic_given_feat_val =>
fitresult,
generic_cache = (
statistic_given_feat_val = fitresult,
)
Xnew_transf = frequency_encoder_transform(Xnew, generic_cache)
return Xnew_transf
Expand Down Expand Up @@ -87,18 +86,16 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with

Here:

- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
check scitypes.
$X_doc_mlj

Train the machine using `fit!(mach, rows=...)`.

# Hyper-parameters

- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
$features_doc
$ignore_doc
$ordered_factor_doc
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
- `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values.

# Operations
Expand All @@ -117,7 +114,7 @@ The fields of `fitted_params(mach)` are:

The fields of `report(mach)` are:

- `encoded_features`: The subset of the categorical features of X that were encoded
$encoded_features_doc

# Examples

Expand Down
21 changes: 9 additions & 12 deletions src/encoders/missingness_encoding/interface_mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,18 @@ function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X)
ordered_factor = transformer.ordered_factor,
label_for_missing = transformer.label_for_missing,
)
fitresult = generic_cache[:label_for_missing_given_feature]
fitresult = generic_cache.label_for_missing_given_feature

report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
report = (encoded_features = generic_cache.encoded_features,) # report only has list of encoded features
cache = nothing
return fitresult, cache, report
end;


# 6. Transform method
function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew)
generic_cache = Dict(
:label_for_missing_given_feature =>
fitresult,
generic_cache = (
label_for_missing_given_feature = fitresult,
)
Xnew_transf = missingness_encoder_transform(Xnew, generic_cache)
return Xnew_transf
Expand Down Expand Up @@ -91,17 +90,15 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with

Here:

- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
check scitypes.
$X_doc_mlj

Train the machine using `fit!(mach, rows=...)`.

# Hyper-parameters

- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
$features_doc
$ignore_doc
$ordered_factor_doc
- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
Expand All @@ -124,7 +121,7 @@ The fields of `fitted_params(mach)` are:

The fields of `report(mach)` are:

- `encoded_features`: The subset of the categorical features of X that were encoded
$encoded_features_doc

# Examples

Expand Down
Loading