Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ meh/*.ipynb
.DS_Store
/*.jl
scratchpad/
examples/test.jl
5 changes: 4 additions & 1 deletion src/encoders/frequency_encoding/frequency_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ function frequency_encoder_fit(
# 1. Define feature mapper
function feature_mapper(col, name)
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
feat_levels = levels(col)
statistic_given_feat_val = Dict{eltype(feat_levels), Float32}(
level => frequency_map[level] for level in feat_levels
)
return statistic_given_feat_val
end

Expand Down
20 changes: 13 additions & 7 deletions src/encoders/missingness_encoding/missingness_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ function missingness_encoder_fit(
features::AbstractVector{Symbol} = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
label_for_missing::Dict{<:Type, <:Any} = Dict(
label_for_missing::Dict{<:Type, <:Any} = Dict(
AbstractString => "missing",
Char => 'm',
),
Expand All @@ -40,8 +40,8 @@ function missingness_encoder_fit(

# 1. Define feature mapper
function feature_mapper(col, name)
col_type = nonmissingtype(eltype(col)).parameters[1]
feat_levels = levels(col; skipmissing=true)
feat_levels = levels(col; skipmissing = true)
col_type = nonmissingtype(eltype(feat_levels))

# Ensure column type is valid (can't test because never occurs)
# Converting array elements to strings before wrapping in a `CategoricalArray`, as...
Expand All @@ -58,7 +58,7 @@ function missingness_encoder_fit(

# Check no collision between values(label_for_missing) and feat_levels
for value in values(label_for_missing)
if !ismissing(value)
if !ismissing(value)
if value in feat_levels
throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
end
Expand All @@ -73,7 +73,7 @@ function missingness_encoder_fit(
break
end
end

# Nonmissing levels remain as is
label_for_missing_given_feature = Dict{Missing, col_type}()

Expand All @@ -91,7 +91,8 @@ function missingness_encoder_fit(

# 2. Pass it to generic_fit
label_for_missing_given_feature, encoded_features = generic_fit(
X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
X, features; ignore = ignore, ordered_factor = ordered_factor,
feature_mapper = feature_mapper,
)
cache = Dict(
:label_for_missing_given_feature => label_for_missing_given_feature,
Expand All @@ -117,6 +118,11 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e
"""
function missingness_encoder_transform(X, cache::Dict)
label_for_missing_given_feature = cache[:label_for_missing_given_feature]
return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true)
return generic_transform(
X,
label_for_missing_given_feature;
ignore_unknown = true,
ensure_categorical = true,
)
end

6 changes: 5 additions & 1 deletion src/encoders/ordinal_encoding/interface_mlj.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@ mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
features::AS
ignore::Bool
ordered_factor::Bool
op_dtype::Type
end;

# 2. Constructor
function OrdinalEncoder(;
features = Symbol[],
ignore = true,
ordered_factor = false,
op_dtype = Float32,
)
return OrdinalEncoder(features, ignore, ordered_factor)
return OrdinalEncoder(features, ignore, ordered_factor, op_dtype)
end;


Expand All @@ -29,6 +31,7 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X)
transformer.features;
ignore = transformer.ignore,
ordered_factor = transformer.ordered_factor,
op_dtype = transformer.op_dtype,
)
fitresult =
generic_cache[:index_given_feat_level]
Expand Down Expand Up @@ -92,6 +95,7 @@ Train the machine using `fit!(mach, rows=...)`.
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
- `op_dtype`: The numerical concrete type of the encoded features. Default is `Float32`.

# Operations

Expand Down
5 changes: 3 additions & 2 deletions src/encoders/ordinal_encoding/ordinal_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
- `ignore=true`: Whether to exclude or include the features given in `features`
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them

- `op_dtype`: The numerical concrete type of the encoded features. Default is `Float32`.
# Returns (in a dict)

- `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
Expand All @@ -21,12 +21,13 @@ function ordinal_encoder_fit(
features::AbstractVector{Symbol} = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
op_dtype::Type = Float32,
)
# 1. Define feature mapper
function feature_mapper(col, name)
feat_levels = levels(col)
index_given_feat_val =
Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels))
Dict{eltype(feat_levels), op_dtype}(value => index for (index, value) in enumerate(feat_levels))
return index_given_feat_val
end

Expand Down
3 changes: 2 additions & 1 deletion src/encoders/target_encoding/target_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,9 @@ function target_encoder_fit(

# 3. Define function to compute the new value(s) for each level given a column
function feature_mapper(col, name)
feat_levels = levels(col)
y_stat_given_feat_level_for_col =
Dict{Any, Union{AbstractFloat, AbstractVector{<:AbstractFloat}}}()
Dict{eltype(feat_levels), Any}()
for level in levels(col)
# Get the targets of an example that belong to this level
targets_for_level = y[col.==level]
Expand Down
50 changes: 33 additions & 17 deletions src/generic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,13 @@ function generic_fit(X,
feat_col = Tables.getcolumn(X, feat_name)
feat_type = elscitype(feat_col)
feat_has_allowed_type =
feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
feat_type <: Union{Missing, Multiclass} ||
(ordered_factor && feat_type <: Union{Missing, OrderedFactor})
if feat_has_allowed_type # then should be encoded
push!(encoded_features, feat_name)
# Compute the dict using the given feature_mapper function
mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
mapping_per_feat_level[feat_name] =
feature_mapper(feat_col, feat_name, args...; kwargs...)
end
end
return mapping_per_feat_level, encoded_features
Expand All @@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)

new_column_names = []
while conflict
suffix = repeat("_", count)
suffix = repeat("_", count)
new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
conflict = any(name -> name in existing_names, new_column_names)
count += 1
Expand All @@ -85,22 +87,29 @@ end
"""
**Private method.**

Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
a subset of categorical features of X into a scalar or a vector (as specified in single_feat)

- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a scalar (single_feat=true)
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a scalar (single_feat=true)

- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a set of k features where k is the length of the vector (single_feat=false)
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a set of k features where k is the length of the vector (single_feat=false)
- In both cases it attempts to preserve the type of the table.
- In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
assumption is necessary because any column in X must correspond to a constant number of features
assumption is necessary because any column in X must correspond to a constant number of features
in the output table (which is equal to k).
- Features not in the dictionary are mapped to themselves (i.e., not changed).
- Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
- Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
- If `ensure_categorical` is true, then any input categorical column will remain categorical
"""
function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
function generic_transform(
X,
mapping_per_feat_level;
single_feat = true,
ignore_unknown = false,
ensure_categorical = false,
)
feat_names = Tables.schema(X).names
new_feat_names = Symbol[]
new_cols = []
Expand All @@ -115,18 +124,25 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
if !issubset(test_levels, train_levels)
# get the levels in test that are not in train
lost_levels = setdiff(test_levels, train_levels)
error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
error(
"While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
)
end
end

if single_feat
level2scalar = mapping_per_feat_level[feat_name]
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
if ensure_categorical
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
else
new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
end

push!(new_cols, new_col)
push!(new_feat_names, feat_name)
else
level2vector = mapping_per_feat_level[feat_name]
new_multi_col = map(x->get(level2vector, x, x), col)
new_multi_col = map(x -> get(level2vector, x, x), col)
new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
push!(new_cols, new_multi_col...)

Expand All @@ -144,8 +160,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
end
end

transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
# Attempt to preserve table type
transformed_X = Tables.materializer(X)(transformed_X)
return transformed_X
end
end
19 changes: 12 additions & 7 deletions src/transformers/cardinality_reducer/cardinality_reducer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,20 @@ function cardinality_reducer_fit(
features::AbstractVector{Symbol} = Symbol[];
ignore::Bool = true,
ordered_factor::Bool = false,
min_frequency::Real = 3,
label_for_infrequent::Dict{<:Type, <:Any} = Dict(
min_frequency::Real = 3,
label_for_infrequent::Dict{<:Type, <:Any} = Dict(
AbstractString => "Other",
Char => 'O',
),
)
)
supportedtypes_list = [Char, AbstractString, Number]
supportedtypes = Union{supportedtypes_list...}

# 1. Define feature mapper
function feature_mapper(col, name)
val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
col_type = eltype(col).parameters[1]
feat_levels = levels(col)
col_type = eltype(feat_levels)

# Ensure column type is valid (can't test because never occurs)
# Converting array elements to strings before wrapping in a `CategoricalArray`, as...
Expand Down Expand Up @@ -88,7 +88,11 @@ function cardinality_reducer_fit(
elseif elgrandtype == Number
new_cat_given_col_val[level] = minimum(feat_levels) - 1
else
throw(ArgumentError(UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent)))
throw(
ArgumentError(
UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent),
),
)
end
end
end
Expand All @@ -98,7 +102,8 @@ function cardinality_reducer_fit(

# 2. Pass it to generic_fit
new_cat_given_col_val, encoded_features = generic_fit(
X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
X, features; ignore = ignore, ordered_factor = ordered_factor,
feature_mapper = feature_mapper,
)
cache = Dict(
:new_cat_given_col_val => new_cat_given_col_val,
Expand All @@ -125,5 +130,5 @@ Apply a fitted cardinality reducer to a table given the output of `cardinality_r
"""
function cardinality_reducer_transform(X, cache::Dict)
new_cat_given_col_val = cache[:new_cat_given_col_val]
return generic_transform(X, new_cat_given_col_val; ignore_unknown = true)
return generic_transform(X, new_cat_given_col_val; ignore_unknown = true, ensure_categorical = true)
end
38 changes: 36 additions & 2 deletions test/encoders/contrast_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ end
cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy)
k = length(levels(X.name))
contrast_matrix = get_dummy_contrast(k)
print()
for (i, level) in enumerate(levels(X.name))
println(cache[:vector_given_value_given_feature])
@test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :]
end
end
Expand Down Expand Up @@ -289,4 +287,40 @@ end

# Test report
@test report(mach) == (encoded_features = generic_cache[:encoded_features],)
end


@testset "Test Contrast Encoder Output Types" begin
X = (
name = categorical(["Ben", "John", "Mary", "John"]),
height = [1.85, 1.67, 1.5, 1.67],
favnum = categorical([7, 5, 10, 1]),
age = [23, 23, 14, 23],
)

methods = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]

for (i, method) in enumerate(methods)
encoder = ContrastEncoder(
features = [:name, :favnum],
ignore = false,
mode = method,
buildmatrix=matrix_func[i]
)
mach = fit!(machine(encoder, X))
Xnew = MMI.transform(mach, X)

# Test Consistency with Types
scs = schema(Xnew).scitypes
ts = schema(Xnew).types

# Check scitypes for previously continuous or categorical features
@test all(scs[1:end-1] .== Continuous)
@test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
# Check scitypes for previously Count feature
last_type, last_sctype = ts[end], scs[end]
@test last_type <: Integer && isconcretetype(last_type)
@test last_sctype <: Count
end
end
Loading
Loading