Skip to content

Commit c401f3b

Browse files
authored
Merge branch 'main' into use-level-names-for-categories
2 parents a8b3296 + d9d28ed commit c401f3b

File tree

15 files changed

+281
-30
lines changed

15 files changed

+281
-30
lines changed

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,15 @@ function frequency_encoder_fit(
2424
ignore::Bool = true,
2525
ordered_factor::Bool = false,
2626
normalize::Bool = false,
27+
output_type::Type = Float32,
2728
)
2829
# 1. Define feature mapper
2930
function feature_mapper(col, name)
3031
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
31-
statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
32+
feat_levels = levels(col)
33+
statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
34+
level => frequency_map[level] for level in feat_levels
35+
)
3236
return statistic_given_feat_val
3337
end
3438

src/encoders/frequency_encoding/interface_mlj.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
66
ignore::Bool
77
ordered_factor::Bool
88
normalize::Bool
9+
output_type::Type
910
end;
1011

1112
# 2. Constructor
@@ -14,8 +15,9 @@ function FrequencyEncoder(;
1415
ignore = true,
1516
ordered_factor = false,
1617
normalize = false,
18+
output_type = Float32,
1719
)
18-
return FrequencyEncoder(features, ignore, ordered_factor, normalize)
20+
return FrequencyEncoder(features, ignore, ordered_factor, normalize, output_type)
1921
end;
2022

2123

@@ -32,6 +34,7 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
3234
ignore = transformer.ignore,
3335
ordered_factor = transformer.ordered_factor,
3436
normalize = transformer.normalize,
37+
output_type = transformer.output_type,
3538
)
3639
fitresult = generic_cache[:statistic_given_feat_val]
3740

@@ -96,6 +99,7 @@ Train the machine using `fit!(mach, rows=...)`.
9699
- `ignore=true`: Whether to exclude or include the features given in `features`
97100
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
98101
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
102+
- `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values.
99103
100104
# Operations
101105

src/encoders/missingness_encoding/missingness_encoding.jl

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ function missingness_encoder_fit(
3030
features::AbstractVector{Symbol} = Symbol[];
3131
ignore::Bool = true,
3232
ordered_factor::Bool = false,
33-
label_for_missing::Dict{<:Type, <:Any} = Dict(
33+
label_for_missing::Dict{<:Type, <:Any} = Dict(
3434
AbstractString => "missing",
3535
Char => 'm',
3636
),
@@ -40,8 +40,8 @@ function missingness_encoder_fit(
4040

4141
# 1. Define feature mapper
4242
function feature_mapper(col, name)
43-
col_type = nonmissingtype(eltype(col)).parameters[1]
44-
feat_levels = levels(col; skipmissing=true)
43+
feat_levels = levels(col; skipmissing = true)
44+
col_type = nonmissingtype(eltype(feat_levels))
4545

4646
# Ensure column type is valid (can't test because never occurs)
4747
# Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -58,7 +58,7 @@ function missingness_encoder_fit(
5858

5959
# Check no collision between keys(label_for_missing) and feat_levels
6060
for value in values(label_for_missing)
61-
if !ismissing(value)
61+
if !ismissing(value)
6262
if value in feat_levels
6363
throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
6464
end
@@ -73,7 +73,7 @@ function missingness_encoder_fit(
7373
break
7474
end
7575
end
76-
76+
7777
# Nonmissing levels remain as is
7878
label_for_missing_given_feature = Dict{Missing, col_type}()
7979

@@ -91,7 +91,8 @@ function missingness_encoder_fit(
9191

9292
# 2. Pass it to generic_fit
9393
label_for_missing_given_feature, encoded_features = generic_fit(
94-
X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
94+
X, features; ignore = ignore, ordered_factor = ordered_factor,
95+
feature_mapper = feature_mapper,
9596
)
9697
cache = Dict(
9798
:label_for_missing_given_feature => label_for_missing_given_feature,
@@ -117,6 +118,11 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e
117118
"""
118119
function missingness_encoder_transform(X, cache::Dict)
119120
label_for_missing_given_feature = cache[:label_for_missing_given_feature]
120-
return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true)
121+
return generic_transform(
122+
X,
123+
label_for_missing_given_feature;
124+
ignore_unknown = true,
125+
ensure_categorical = true,
126+
)
121127
end
122128

src/encoders/ordinal_encoding/interface_mlj.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@ mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
55
features::AS
66
ignore::Bool
77
ordered_factor::Bool
8+
output_type::Type
89
end;
910

1011
# 2. Constructor
1112
function OrdinalEncoder(;
1213
features = Symbol[],
1314
ignore = true,
1415
ordered_factor = false,
16+
output_type = Float32,
1517
)
16-
return OrdinalEncoder(features, ignore, ordered_factor)
18+
return OrdinalEncoder(features, ignore, ordered_factor, output_type)
1719
end;
1820

1921

@@ -29,6 +31,7 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X)
2931
transformer.features;
3032
ignore = transformer.ignore,
3133
ordered_factor = transformer.ordered_factor,
34+
output_type = transformer.output_type,
3235
)
3336
fitresult =
3437
generic_cache[:index_given_feat_level]
@@ -92,6 +95,7 @@ Train the machine using `fit!(mach, rows=...)`.
9295
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
9396
- `ignore=true`: Whether to exclude or include the features given in `features`
9497
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
98+
- `output_type`: The numerical concrete type of the encoded features. Default is `Float32`.
9599
96100
# Operations
97101

src/encoders/ordinal_encoding/ordinal_encoding.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as
1010
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
1111
- `ignore=true`: Whether to exclude or include the features given in `features`
1212
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
13-
13+
- `output_type`: The numerical concrete type of the encoded features. Default is `Float32`.
1414
# Returns (in a dict)
1515
1616
- `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
@@ -21,12 +21,13 @@ function ordinal_encoder_fit(
2121
features::AbstractVector{Symbol} = Symbol[];
2222
ignore::Bool = true,
2323
ordered_factor::Bool = false,
24+
output_type::Type = Float32,
2425
)
2526
# 1. Define feature mapper
2627
function feature_mapper(col, name)
2728
feat_levels = levels(col)
2829
index_given_feat_val =
29-
Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels))
30+
Dict{eltype(feat_levels), output_type}(value => index for (index, value) in enumerate(feat_levels))
3031
return index_given_feat_val
3132
end
3233

src/encoders/target_encoding/target_encoding.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,8 +166,9 @@ function target_encoder_fit(
166166

167167
# 3. Define function to compute the new value(s) for each level given a column
168168
function feature_mapper(col, name)
169+
feat_levels = levels(col)
169170
y_stat_given_feat_level_for_col =
170-
Dict{Any, Union{AbstractFloat, AbstractVector{<:AbstractFloat}}}()
171+
Dict{eltype(feat_levels), Any}()
171172
for level in levels(col)
172173
# Get the targets of an example that belong to this level
173174
targets_for_level = y[col.==level]

src/generic.jl

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ function generic_transform(
130130
ignore_unknown = false,
131131
use_levelnames = false,
132132
custom_levels = nothing,
133+
ensure_categorical = false,
133134
)
134135
feat_names = Tables.schema(X).names
135136
new_feat_names = Symbol[]
@@ -153,7 +154,12 @@ function generic_transform(
153154

154155
if single_feat
155156
level2scalar = mapping_per_feat_level[feat_name]
156-
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
157+
if ensure_categorical
158+
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
159+
else
160+
new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
161+
end
162+
157163
push!(new_cols, new_col)
158164
push!(new_feat_names, feat_name)
159165
else

src/transformers/cardinality_reducer/cardinality_reducer.jl

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,20 +35,20 @@ function cardinality_reducer_fit(
3535
features::AbstractVector{Symbol} = Symbol[];
3636
ignore::Bool = true,
3737
ordered_factor::Bool = false,
38-
min_frequency::Real = 3,
39-
label_for_infrequent::Dict{<:Type, <:Any} = Dict(
38+
min_frequency::Real = 3,
39+
label_for_infrequent::Dict{<:Type, <:Any} = Dict(
4040
AbstractString => "Other",
4141
Char => 'O',
4242
),
43-
)
43+
)
4444
supportedtypes_list = [Char, AbstractString, Number]
4545
supportedtypes = Union{supportedtypes_list...}
4646

4747
# 1. Define feature mapper
4848
function feature_mapper(col, name)
4949
val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
50-
col_type = eltype(col).parameters[1]
5150
feat_levels = levels(col)
51+
col_type = eltype(feat_levels)
5252

5353
# Ensure column type is valid (can't test because never occurs)
5454
# Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -88,7 +88,11 @@ function cardinality_reducer_fit(
8888
elseif elgrandtype == Number
8989
new_cat_given_col_val[level] = minimum(feat_levels) - 1
9090
else
91-
throw(ArgumentError(UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent)))
91+
throw(
92+
ArgumentError(
93+
UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent),
94+
),
95+
)
9296
end
9397
end
9498
end
@@ -98,7 +102,8 @@ function cardinality_reducer_fit(
98102

99103
# 2. Pass it to generic_fit
100104
new_cat_given_col_val, encoded_features = generic_fit(
101-
X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
105+
X, features; ignore = ignore, ordered_factor = ordered_factor,
106+
feature_mapper = feature_mapper,
102107
)
103108
cache = Dict(
104109
:new_cat_given_col_val => new_cat_given_col_val,
@@ -125,5 +130,5 @@ Apply a fitted cardinality reducer to a table given the output of `cardinality_r
125130
"""
126131
function cardinality_reducer_transform(X, cache::Dict)
127132
new_cat_given_col_val = cache[:new_cat_given_col_val]
128-
return generic_transform(X, new_cat_given_col_val; ignore_unknown = true)
133+
return generic_transform(X, new_cat_given_col_val; ignore_unknown = true, ensure_categorical = true)
129134
end

test/encoders/contrast_encoder.jl

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,7 @@ end
5151
cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy)
5252
k = length(levels(X.name))
5353
contrast_matrix = get_dummy_contrast(k)
54-
print()
5554
for (i, level) in enumerate(levels(X.name))
56-
println(cache[:vector_given_value_given_feature])
5755
@test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :]
5856
end
5957
end
@@ -289,4 +287,40 @@ end
289287

290288
# Test report
291289
@test report(mach) == (encoded_features = generic_cache[:encoded_features],)
290+
end
291+
292+
293+
@testset "Test Contrast Encoder Output Types" begin
294+
X = (
295+
name = categorical(["Ben", "John", "Mary", "John"]),
296+
height = [1.85, 1.67, 1.5, 1.67],
297+
favnum = categorical([7, 5, 10, 1]),
298+
age = [23, 23, 14, 23],
299+
)
300+
301+
methods = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
302+
matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
303+
304+
for (i, method) in enumerate(methods)
305+
encoder = ContrastEncoder(
306+
features = [:name, :favnum],
307+
ignore = false,
308+
mode = method,
309+
buildmatrix=matrix_func[i]
310+
)
311+
mach = fit!(machine(encoder, X))
312+
Xnew = MMI.transform(mach, X)
313+
314+
# Test Consistency with Types
315+
scs = schema(Xnew).scitypes
316+
ts = schema(Xnew).types
317+
318+
# Check scitypes for previously continuous or categorical features
319+
@test all(scs[1:end-1] .== Continuous)
320+
@test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
321+
# Check scitypes for previously Count feature
322+
last_type, last_sctype = ts[end], scs[end]
323+
@test last_type <: Integer && isconcretetype(last_type)
324+
@test last_sctype <: Count
325+
end
292326
end

test/encoders/frequency_encoder.jl

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform
99
for norm in normalize
1010
result = frequency_encoder_fit(X; normalize = norm)[:statistic_given_feat_val]
1111
enc =
12-
(col, level) -> ((norm) ? sum(col .== level) / length(col) : sum(col .== level))
12+
(col, level) ->
13+
Float32((norm) ? sum(col .== level) / length(col) : sum(col .== level))
1314
true_output = Dict{Symbol, Dict{Any, Any}}(
1415
:F => Dict(
1516
"m" => enc(F_col, "m"),
@@ -44,7 +45,7 @@ end
4445
X_tr = frequency_encoder_transform(X, cache)
4546
enc =
4647
(col, level) ->
47-
((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level))
48+
Float32((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level))
4849

4950
target = (
5051
A = [enc(:A, X[:A][i]) for i in 1:10],
@@ -81,4 +82,42 @@ end
8182
# Test report
8283
@test report(mach) == (encoded_features = generic_cache[:encoded_features],)
8384
end
84-
end
85+
end
86+
87+
@testset "Test Frequency Encoding Output Types" begin
88+
# Define categorical features
89+
A = ["g", "b", "g", "r", "r"]
90+
B = [1.0, 2.0, 3.0, 4.0, 5.0]
91+
C = ["f", "f", "f", "m", "f"]
92+
D = [true, false, true, false, true]
93+
E = [1, 2, 3, 4, 5]
94+
95+
# Combine into a named tuple
96+
X = (A = A, B = B, C = C, D = D, E = E)
97+
98+
# Coerce A, C, D to multiclass and B to continuous and E to ordinal
99+
X = coerce(X,
100+
:A => Multiclass,
101+
:B => Continuous,
102+
:C => Multiclass,
103+
:D => Multiclass,
104+
:E => OrderedFactor,
105+
)
106+
107+
# Check scitype coercions:
108+
schema(X)
109+
110+
encoder = FrequencyEncoder(ordered_factor = false, normalize = false)
111+
mach = fit!(machine(encoder, X))
112+
Xnew = MMI.transform(mach, X)
113+
114+
115+
scs = schema(Xnew).scitypes
116+
ts = schema(Xnew).types
117+
# Check scitypes correctness
118+
@test all(scs[1:end-1] .== Continuous)
119+
@test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
120+
# Ordinal column should be intact
121+
@test scs[end] === schema(X).scitypes[end]
122+
@test ts[end] == schema(X).types[end]
123+
end

0 commit comments

Comments
 (0)