Commit d9d28ed

Merge pull request #24 from JuliaAI/fix-encoder-output-types

Fix encoder output types

2 parents 14a5671 + 16758b4

File tree: 16 files changed, +308 −46 lines

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -27,3 +27,4 @@ meh/*.ipynb
 .DS_Store
 /*.jl
 scratchpad/
+examples/test.jl

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 5 additions & 1 deletion

@@ -24,11 +24,15 @@ function frequency_encoder_fit(
     ignore::Bool = true,
     ordered_factor::Bool = false,
     normalize::Bool = false,
+    output_type::Type = Float32,
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
         frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
-        statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
+        feat_levels = levels(col)
+        statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
+            level => frequency_map[level] for level in feat_levels
+        )
         return statistic_given_feat_val
     end
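The retyped dictionary in isolation, as a minimal sketch (assumes `CategoricalArrays` and `StatsBase`, which the encoder already uses; the variable names mirror the diff and are not a public API):

using CategoricalArrays, StatsBase

col = categorical(["a", "b", "a", "c"])
feat_levels = levels(col)
output_type = Float32
frequency_map = countmap(col)
# Keys are now typed by the levels' eltype and values by `output_type`,
# rather than the loose Dict{Any, Real} used previously:
statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
    level => frequency_map[level] for level in feat_levels
)
# yields Dict{String, Float32} with "a" => 2.0, "b" => 1.0, "c" => 1.0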

src/encoders/frequency_encoding/interface_mlj.jl

Lines changed: 5 additions & 1 deletion

@@ -6,6 +6,7 @@ mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
     ignore::Bool
     ordered_factor::Bool
     normalize::Bool
+    output_type::Type
 end;

 # 2. Constructor
@@ -14,8 +15,9 @@ function FrequencyEncoder(;
     ignore = true,
     ordered_factor = false,
     normalize = false,
+    output_type = Float32,
 )
-    return FrequencyEncoder(features, ignore, ordered_factor, normalize)
+    return FrequencyEncoder(features, ignore, ordered_factor, normalize, output_type)
 end;


@@ -32,6 +34,7 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
         ignore = transformer.ignore,
         ordered_factor = transformer.ordered_factor,
         normalize = transformer.normalize,
+        output_type = transformer.output_type,
     )
     fitresult = generic_cache[:statistic_given_feat_val]

@@ -96,6 +99,7 @@ Train the machine using `fit!(mach, rows=...)`.
 - `ignore=true`: Whether to exclude or include the features given in `features`
 - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
 - `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
+- `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values.

 # Operations
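A hypothetical end-to-end use of the new hyperparameter (assumes MLJ and the package exporting `FrequencyEncoder` are loaded; the data is made up):

using MLJ, CategoricalArrays

X = (grade = categorical(["A", "B", "A", "C"]),)
encoder = FrequencyEncoder(normalize = true, output_type = Float64)
mach = fit!(machine(encoder, X))
Xnew = transform(mach, X)
# The encoded :grade column now holds plain Float64 proportions.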

src/encoders/missingness_encoding/missingness_encoding.jl

Lines changed: 13 additions & 7 deletions

@@ -30,7 +30,7 @@ function missingness_encoder_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
-    label_for_missing::Dict{<:Type, <:Any} = Dict(
+    label_for_missing::Dict{<:Type, <:Any} = Dict(
         AbstractString => "missing",
         Char => 'm',
     ),
@@ -40,8 +40,8 @@ function missingness_encoder_fit(

     # 1. Define feature mapper
     function feature_mapper(col, name)
-        col_type = nonmissingtype(eltype(col)).parameters[1]
-        feat_levels = levels(col; skipmissing=true)
+        feat_levels = levels(col; skipmissing = true)
+        col_type = nonmissingtype(eltype(feat_levels))

         # Ensure column type is valid (can't test because never occurs)
         # Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -58,7 +58,7 @@ function missingness_encoder_fit(

         # Check no collision between keys(label_for_missing) and feat_levels
         for value in values(label_for_missing)
-            if !ismissing(value)
+            if !ismissing(value)
                 if value in feat_levels
                     throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
                 end
@@ -73,7 +73,7 @@ function missingness_encoder_fit(
                 break
             end
         end
-
+
        # Nonmissing levels remain as is
        label_for_missing_given_feature = Dict{Missing, col_type}()

@@ -91,7 +91,8 @@ function missingness_encoder_fit(

    # 2. Pass it to generic_fit
    label_for_missing_given_feature, encoded_features = generic_fit(
-        X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
+        X, features; ignore = ignore, ordered_factor = ordered_factor,
+        feature_mapper = feature_mapper,
    )
    cache = Dict(
        :label_for_missing_given_feature => label_for_missing_given_feature,
@@ -117,6 +118,11 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e
 """
 function missingness_encoder_transform(X, cache::Dict)
     label_for_missing_given_feature = cache[:label_for_missing_given_feature]
-    return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true)
+    return generic_transform(
+        X,
+        label_for_missing_given_feature;
+        ignore_unknown = true,
+        ensure_categorical = true,
+    )
 end
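Why the reordered `col_type` computation matters, as a sketch (mirrors the diff; `nonmissingtype` is from Base):

using CategoricalArrays

col = categorical(["x", missing, "y"])
# Old: nonmissingtype(eltype(col)).parameters[1] digs into the
# CategoricalValue type's parameters, which is fragile against
# CategoricalArrays internals.
# New: take the eltype of the levels themselves:
feat_levels = levels(col; skipmissing = true)   # ["x", "y"]
col_type = nonmissingtype(eltype(feat_levels))  # String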

src/encoders/ordinal_encoding/interface_mlj.jl

Lines changed: 5 additions & 1 deletion

@@ -5,15 +5,17 @@ mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
     features::AS
     ignore::Bool
     ordered_factor::Bool
+    output_type::Type
 end;

 # 2. Constructor
 function OrdinalEncoder(;
     features = Symbol[],
     ignore = true,
     ordered_factor = false,
+    output_type = Float32,
 )
-    return OrdinalEncoder(features, ignore, ordered_factor)
+    return OrdinalEncoder(features, ignore, ordered_factor, output_type)
 end;


@@ -29,6 +31,7 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X)
         transformer.features;
         ignore = transformer.ignore,
         ordered_factor = transformer.ordered_factor,
+        output_type = transformer.output_type,
     )
     fitresult =
         generic_cache[:index_given_feat_level]
@@ -92,6 +95,7 @@ Train the machine using `fit!(mach, rows=...)`.
 - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
 - `ignore=true`: Whether to exclude or includes the features given in `features`
 - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+- `output_type`: The numerical concrete type of the encoded features. Default is `Float32`.

 # Operations
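A hypothetical usage sketch for the new hyperparameter (assumes MLJ and the exporting package are loaded):

using MLJ, CategoricalArrays

X = (size = categorical(["S", "M", "L", "M"]),)
encoder = OrdinalEncoder(output_type = Float32)
mach = fit!(machine(encoder, X))
Xnew = transform(mach, X)
# Each level is replaced by its level index, stored as Float32.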

src/encoders/ordinal_encoding/ordinal_encoding.jl

Lines changed: 3 additions & 2 deletions

@@ -10,7 +10,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as
 - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
 - `ignore=true`: Whether to exclude or includes the features given in `features`
 - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
-
+- `dtype`: The numerical concrete type of the encoded features. Default is `Float32`.
 # Returns (in a dict)

 - `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
@@ -21,12 +21,13 @@ function ordinal_encoder_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
+    output_type::Type = Float32,
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
         feat_levels = levels(col)
         index_given_feat_val =
-            Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels))
+            Dict{eltype(feat_levels), output_type}(value => index for (index, value) in enumerate(feat_levels))
         return index_given_feat_val
     end
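The retyped mapping in isolation (mirrors the diff; not a public API):

using CategoricalArrays

col = categorical(["low", "med", "high"])
feat_levels = levels(col)   # sorted: ["high", "low", "med"]
index_given_feat_val = Dict{eltype(feat_levels), Float32}(
    value => index for (index, value) in enumerate(feat_levels)
)
# yields Dict{String, Float32}("high" => 1.0, "low" => 2.0, "med" => 3.0)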

src/encoders/target_encoding/target_encoding.jl

Lines changed: 2 additions & 1 deletion

@@ -166,8 +166,9 @@ function target_encoder_fit(

     # 3. Define function to compute the new value(s) for each level given a column
     function feature_mapper(col, name)
+        feat_levels = levels(col)
         y_stat_given_feat_level_for_col =
-            Dict{Any, Union{AbstractFloat, AbstractVector{<:AbstractFloat}}}()
+            Dict{eltype(feat_levels), Any}()
         for level in levels(col)
             # Get the targets of an example that belong to this level
             targets_for_level = y[col.==level]
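The value type stays `Any` here because the per-level statistic is a scalar for a continuous target but a vector of per-class statistics for a multiclass target, so only the key type can be pinned down. A hypothetical illustration (not package code):

y_stat_given_feat_level_for_col = Dict{String, Any}()
y_stat_given_feat_level_for_col["a"] = 0.75        # continuous target: scalar mean
y_stat_given_feat_level_for_col["b"] = [0.2, 0.8]  # multiclass target: per-class statistics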

src/generic.jl

Lines changed: 33 additions & 17 deletions

@@ -49,11 +49,13 @@ function generic_fit(X,
         feat_col = Tables.getcolumn(X, feat_name)
         feat_type = elscitype(feat_col)
         feat_has_allowed_type =
-            feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
+            feat_type <: Union{Missing, Multiclass} ||
+            (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
         if feat_has_allowed_type # then should be encoded
             push!(encoded_features, feat_name)
             # Compute the dict using the given feature_mapper function
-            mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
+            mapping_per_feat_level[feat_name] =
+                feature_mapper(feat_col, feat_name, args...; kwargs...)
         end
     end
     return mapping_per_feat_level, encoded_features
@@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)

     new_column_names = []
     while conflict
-        suffix = repeat("_", count)
+        suffix = repeat("_", count)
         new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
         conflict = any(name -> name in existing_names, new_column_names)
         count += 1
@@ -85,22 +87,29 @@ end
 """
 **Private method.**

-Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
+Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
 a subset of categorical features of X into a scalar or a vector (as specified in single_feat)

-- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
-  into a scalar (single_feat=true)
+- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+  into a scalar (single_feat=true)

-- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
-  into a set of k features where k is the length of the vector (single_feat=false)
+- transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+  into a set of k features where k is the length of the vector (single_feat=false)

 - In both cases it attempts to preserve the type of the table.
 - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
-  assumption is necessary because any column in X must correspond to a constant number of features
+  assumption is necessary because any column in X must correspond to a constant number of features
   in the output table (which is equal to k).
 - Features not in the dictionary are mapped to themselves (i.e., not changed).
-- Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
+- Levels not in the nested dictionary are mapped to themselves if `ignore unknown` is true else raise an error.
+- If `ensure_categorical` is true, then any input categorical column will remain categorical
 """
-function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
+function generic_transform(
+    X,
+    mapping_per_feat_level;
+    single_feat = true,
+    ignore_unknown = false,
+    ensure_categorical = false,
+)
     feat_names = Tables.schema(X).names
     new_feat_names = Symbol[]
     new_cols = []
@@ -115,18 +124,25 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
             if !issubset(test_levels, train_levels)
                 # get the levels in test that are not in train
                 lost_levels = setdiff(test_levels, train_levels)
-                error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
+                error(
+                    "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
+                )
             end
         end
-
+
         if single_feat
             level2scalar = mapping_per_feat_level[feat_name]
-            new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+            if ensure_categorical
+                new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+            else
+                new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
+            end
+
            push!(new_cols, new_col)
            push!(new_feat_names, feat_name)
        else
            level2vector = mapping_per_feat_level[feat_name]
-           new_multi_col = map(x->get(level2vector, x, x), col)
+           new_multi_col = map(x -> get(level2vector, x, x), col)
            new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
            push!(new_cols, new_multi_col...)

@@ -144,8 +160,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
         end
     end

-    transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
+    transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
     # Attempt to preserve table type
     transformed_X = Tables.materializer(X)(transformed_X)
     return transformed_X
-end
+end
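The `unwrap.(recode(...))` branch is the heart of the output-type fix: `recode` applied to a `CategoricalArray` returns another `CategoricalArray`, so encoded "numbers" were still `CategoricalValue`s. A minimal sketch (assumes only `CategoricalArrays`):

using CategoricalArrays

col = categorical(["a", "b", "a"])
recoded = recode(col, "a" => 1.0f0, "b" => 2.0f0)
# recoded isa CategoricalArray; its elements are CategoricalValue wrappers
plain = unwrap.(recoded)
# plain isa Vector{Float32}: [1.0, 2.0, 1.0]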

src/transformers/cardinality_reducer/cardinality_reducer.jl

Lines changed: 12 additions & 7 deletions

@@ -35,20 +35,20 @@ function cardinality_reducer_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
-    min_frequency::Real = 3,
-    label_for_infrequent::Dict{<:Type, <:Any} = Dict(
+    min_frequency::Real = 3,
+    label_for_infrequent::Dict{<:Type, <:Any} = Dict(
         AbstractString => "Other",
         Char => 'O',
     ),
-)
+)
     supportedtypes_list = [Char, AbstractString, Number]
     supportedtypes = Union{supportedtypes_list...}

     # 1. Define feature mapper
     function feature_mapper(col, name)
         val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
-        col_type = eltype(col).parameters[1]
         feat_levels = levels(col)
+        col_type = eltype(feat_levels)

         # Ensure column type is valid (can't test because never occurs)
         # Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -88,7 +88,11 @@ function cardinality_reducer_fit(
             elseif elgrandtype == Number
                 new_cat_given_col_val[level] = minimum(feat_levels) - 1
             else
-                throw(ArgumentError(UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent)))
+                throw(
+                    ArgumentError(
+                        UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent),
+                    ),
+                )
             end
         end
     end
@@ -98,7 +102,8 @@ function cardinality_reducer_fit(

     # 2. Pass it to generic_fit
     new_cat_given_col_val, encoded_features = generic_fit(
-        X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
+        X, features; ignore = ignore, ordered_factor = ordered_factor,
+        feature_mapper = feature_mapper,
     )
     cache = Dict(
         :new_cat_given_col_val => new_cat_given_col_val,
@@ -125,5 +130,5 @@ Apply a fitted cardinality reducer to a table given the output of `cardinality_r
 """
 function cardinality_reducer_transform(X, cache::Dict)
     new_cat_given_col_val = cache[:new_cat_given_col_val]
-    return generic_transform(X, new_cat_given_col_val; ignore_unknown = true)
+    return generic_transform(X, new_cat_given_col_val; ignore_unknown = true, ensure_categorical = true)
 end
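`ensure_categorical = true` is passed here (and in the missingness encoder) because these transformers map levels to other levels rather than to numbers, so the output column should stay categorical. A sketch of the single-feature path under that flag:

using CategoricalArrays

col = categorical(["red", "blue", "red", "teal"])
new_cat_given_col_val = Dict("teal" => "Other")  # infrequent level collapsed
new_col = recode(col, new_cat_given_col_val...)
# new_col is still a CategoricalArray; "teal" entries are now "Other"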

test/encoders/contrast_encoder.jl

Lines changed: 36 additions & 2 deletions

@@ -51,9 +51,7 @@ end
     cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy)
     k = length(levels(X.name))
     contrast_matrix = get_dummy_contrast(k)
-    print()
     for (i, level) in enumerate(levels(X.name))
-        println(cache[:vector_given_value_given_feature])
         @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :]
     end
 end
@@ -289,4 +287,40 @@ end

     # Test report
     @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
+end
+
+
+@testset "Test Contrast Encoder Output Types" begin
+    X = (
+        name = categorical(["Ben", "John", "Mary", "John"]),
+        height = [1.85, 1.67, 1.5, 1.67],
+        favnum = categorical([7, 5, 10, 1]),
+        age = [23, 23, 14, 23],
+    )
+
+    methods = [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
+    matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
+
+    for (i, method) in enumerate(methods)
+        encoder = ContrastEncoder(
+            features = [:name, :favnum],
+            ignore = false,
+            mode = method,
+            buildmatrix=matrix_func[i]
+        )
+        mach = fit!(machine(encoder, X))
+        Xnew = MMI.transform(mach, X)
+
+        # Test Consistency with Types
+        scs = schema(Xnew).scitypes
+        ts = schema(Xnew).types
+
+        # Check scitypes for previously continuos or categorical features
+        @test all(scs[1:end-1] .== Continuous)
+        @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+        # Check scitypes for previously Count feature
+        last_type, last_sctype = ts[end], scs[end]
+        @test last_type <: Integer && isconcretetype(last_type)
+        @test last_sctype <: Count
+    end
 end
