Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ meh/*.ipynb
.DS_Store
/*.jl
scratchpad/
examples/test.jl
5 changes: 4 additions & 1 deletion src/encoders/frequency_encoding/frequency_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ function frequency_encoder_fit(
# 1. Define feature mapper
# Build the `level => frequency` lookup for a single categorical column.
#
# Arguments
# - `col`:  the column whose levels are to be encoded.
# - `name`: the column name; accepted for the mapper interface but not used here.
#
# Returns a `Dict` keyed by the element type of `levels(col)` with `Float32`
# values: raw counts when `normalize` is false, proportions otherwise.
# `normalize` is read from the enclosing scope.
function feature_mapper(col, name)
frequency_map = normalize ? proportionmap(col) : countmap(col)
feat_levels = levels(col)
# `levels(col)` can report levels that have no occurrence in `col`; those are
# absent from `frequency_map`, so fall back to a frequency of 0 rather than
# raising a `KeyError` on lookup.
statistic_given_feat_val = Dict{eltype(feat_levels), Float32}(
level => get(frequency_map, level, 0) for level in feat_levels
)
return statistic_given_feat_val
end

Expand Down
50 changes: 33 additions & 17 deletions src/generic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,13 @@ function generic_fit(X,
feat_col = Tables.getcolumn(X, feat_name)
feat_type = elscitype(feat_col)
feat_has_allowed_type =
feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
feat_type <: Union{Missing, Multiclass} ||
(ordered_factor && feat_type <: Union{Missing, OrderedFactor})
if feat_has_allowed_type # then should be encoded
push!(encoded_features, feat_name)
# Compute the dict using the given feature_mapper function
mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
mapping_per_feat_level[feat_name] =
feature_mapper(feat_col, feat_name, args...; kwargs...)
end
end
return mapping_per_feat_level, encoded_features
Expand All @@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)

new_column_names = []
while conflict
suffix = repeat("_", count)
suffix = repeat("_", count)
new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
conflict = any(name -> name in existing_names, new_column_names)
count += 1
Expand All @@ -85,22 +87,29 @@ end
"""
**Private method.**

Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
a subset of categorical features of X into a scalar or a vector (as specified in single_feat)

- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a scalar (single_feat=true)
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a scalar (single_feat=true)

- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a set of k features where k is the length of the vector (single_feat=false)
- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
into a set of k features where k is the length of the vector (single_feat=false)
- In both cases it attempts to preserve the type of the table.
- In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such an
assumption is necessary because any column in X must correspond to a constant number of features
assumption is necessary because any column in X must correspond to a constant number of features
in the output table (which is equal to k).
- Features not in the dictionary are mapped to themselves (i.e., not changed).
- Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
- Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
- If `ensure_categorical` is true, then any input categorical column will remain categorical
"""
function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
function generic_transform(
X,
mapping_per_feat_level;
single_feat = true,
ignore_unknown = false,
ensure_categorical = false,
)
feat_names = Tables.schema(X).names
new_feat_names = Symbol[]
new_cols = []
Expand All @@ -115,18 +124,25 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
if !issubset(test_levels, train_levels)
# get the levels in test that are not in train
lost_levels = setdiff(test_levels, train_levels)
error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
error(
"While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
)
end
end

if single_feat
level2scalar = mapping_per_feat_level[feat_name]
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
if ensure_categorical
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
else
new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
end

push!(new_cols, new_col)
push!(new_feat_names, feat_name)
else
level2vector = mapping_per_feat_level[feat_name]
new_multi_col = map(x->get(level2vector, x, x), col)
new_multi_col = map(x -> get(level2vector, x, x), col)
new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
push!(new_cols, new_multi_col...)

Expand All @@ -144,8 +160,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
end
end

transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
# Attempt to preserve table type
transformed_X = Tables.materializer(X)(transformed_X)
return transformed_X
end
end
45 changes: 42 additions & 3 deletions test/encoders/frequency_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform
for norm in normalize
result = frequency_encoder_fit(X; normalize = norm)[:statistic_given_feat_val]
enc =
(col, level) -> ((norm) ? sum(col .== level) / length(col) : sum(col .== level))
(col, level) ->
Float32((norm) ? sum(col .== level) / length(col) : sum(col .== level))
true_output = Dict{Symbol, Dict{Any, Any}}(
:F => Dict(
"m" => enc(F_col, "m"),
Expand Down Expand Up @@ -44,7 +45,7 @@ end
X_tr = frequency_encoder_transform(X, cache)
enc =
(col, level) ->
((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level))
Float32((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level))

target = (
A = [enc(:A, X[:A][i]) for i in 1:10],
Expand Down Expand Up @@ -81,4 +82,42 @@ end
# Test report
@test report(mach) == (encoded_features = generic_cache[:encoded_features],)
end
end
end

@testset "Test Frequency Encoding Output Types" begin
    # Raw table: A, C and D become Multiclass, B stays Continuous and E becomes
    # an OrderedFactor once coerced below.
    X = (
        A = ["g", "b", "g", "r", "r"],
        B = [1.0, 2.0, 3.0, 4.0, 5.0],
        C = ["f", "f", "f", "m", "f"],
        D = [true, false, true, false, true],
        E = [1, 2, 3, 4, 5],
    )
    X = coerce(
        X,
        :A => Multiclass,
        :B => Continuous,
        :C => Multiclass,
        :D => Multiclass,
        :E => OrderedFactor,
    )

    # Schema of the coerced input; reused for the ordinal-column checks below.
    input_schema = schema(X)

    # Encode with raw counts and leave ordered factors untouched.
    freq_encoder = FrequencyEncoder(ordered_factor = false, normalize = false)
    fitted = fit!(machine(freq_encoder, X))
    X_encoded = MMI.transform(fitted, X)

    output_schema = schema(X_encoded)
    out_scitypes = output_schema.scitypes
    out_types = output_schema.types

    # Every column except the last must come out as a concrete float column
    # with Continuous scitype.
    @test all(==(Continuous), out_scitypes[1:end-1])
    @test all(out_types[1:end-1]) do T
        (T <: AbstractFloat) && isconcretetype(T)
    end

    # The ordinal column (last) must pass through unchanged.
    @test out_scitypes[end] === input_schema.scitypes[end]
    @test out_types[end] == input_schema.types[end]
end
2 changes: 1 addition & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ using StatsModels

# Other transformers
using Tables, CategoricalArrays
using ScientificTypes: scitype
using ScientificTypes: scitype, schema
using Statistics
using StableRNGs
stable_rng = StableRNGs.StableRNG(123)
Expand Down