Skip to content

Commit 16758b4

Browse files
committed
✨ Add support for an output type in the frequency encoder
Depending on `normalize`, the encoded values are either integer counts or floating-point proportions, so let the caller choose the output type.
1 parent bab7664 commit 16758b4

File tree

5 files changed

+15
-10
lines changed

5 files changed

+15
-10
lines changed

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,13 @@ function frequency_encoder_fit(
2424
ignore::Bool = true,
2525
ordered_factor::Bool = false,
2626
normalize::Bool = false,
27+
output_type::Type = Float32,
2728
)
2829
# 1. Define feature mapper
2930
function feature_mapper(col, name)
3031
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
3132
feat_levels = levels(col)
32-
statistic_given_feat_val = Dict{eltype(feat_levels), Float32}(
33+
statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
3334
level => frequency_map[level] for level in feat_levels
3435
)
3536
return statistic_given_feat_val

src/encoders/frequency_encoding/interface_mlj.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mutable struct FrequencyEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
66
ignore::Bool
77
ordered_factor::Bool
88
normalize::Bool
9+
output_type::Type
910
end;
1011

1112
# 2. Constructor
@@ -14,8 +15,9 @@ function FrequencyEncoder(;
1415
ignore = true,
1516
ordered_factor = false,
1617
normalize = false,
18+
output_type = Float32,
1719
)
18-
return FrequencyEncoder(features, ignore, ordered_factor, normalize)
20+
return FrequencyEncoder(features, ignore, ordered_factor, normalize, output_type)
1921
end;
2022

2123

@@ -32,6 +34,7 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
3234
ignore = transformer.ignore,
3335
ordered_factor = transformer.ordered_factor,
3436
normalize = transformer.normalize,
37+
output_type = transformer.output_type,
3538
)
3639
fitresult = generic_cache[:statistic_given_feat_val]
3740

@@ -96,6 +99,7 @@ Train the machine using `fit!(mach, rows=...)`.
9699
- `ignore=true`: Whether to exclude or include the features given in `features`
97100
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
98101
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
102+
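- `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values. Integer types are only suitable when `normalize=false` (raw counts), since normalized frequencies are fractional.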
99103
100104
# Operations
101105

src/encoders/ordinal_encoding/interface_mlj.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,17 @@ mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
55
features::AS
66
ignore::Bool
77
ordered_factor::Bool
8-
op_dtype::Type
8+
output_type::Type
99
end;
1010

1111
# 2. Constructor
1212
function OrdinalEncoder(;
1313
features = Symbol[],
1414
ignore = true,
1515
ordered_factor = false,
16-
op_dtype = Float32,
16+
output_type = Float32,
1717
)
18-
return OrdinalEncoder(features, ignore, ordered_factor, op_dtype)
18+
return OrdinalEncoder(features, ignore, ordered_factor, output_type)
1919
end;
2020

2121

@@ -31,7 +31,7 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X)
3131
transformer.features;
3232
ignore = transformer.ignore,
3333
ordered_factor = transformer.ordered_factor,
34-
op_dtype = transformer.op_dtype,
34+
output_type = transformer.output_type,
3535
)
3636
fitresult =
3737
generic_cache[:index_given_feat_level]
@@ -95,7 +95,7 @@ Train the machine using `fit!(mach, rows=...)`.
9595
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
9696
- `ignore=true`: Whether to exclude or include the features given in `features`
9797
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
98-
- `op_dtype`: The numerical concrete type of the encoded features. Default is `Float32`.
98+
- `output_type=Float32`: The concrete numeric type of the encoded features.
9999
100100
# Operations
101101

src/encoders/ordinal_encoding/ordinal_encoding.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@ function ordinal_encoder_fit(
2121
features::AbstractVector{Symbol} = Symbol[];
2222
ignore::Bool = true,
2323
ordered_factor::Bool = false,
24-
op_dtype::Type = Float32,
24+
output_type::Type = Float32,
2525
)
2626
# 1. Define feature mapper
2727
function feature_mapper(col, name)
2828
feat_levels = levels(col)
2929
index_given_feat_val =
30-
Dict{eltype(feat_levels), op_dtype}(value => index for (index, value) in enumerate(feat_levels))
30+
Dict{eltype(feat_levels), output_type}(value => index for (index, value) in enumerate(feat_levels))
3131
return index_given_feat_val
3232
end
3333

test/encoders/ordinal_encoding.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ end
121121
schema(X).scitypes[end]
122122

123123
## Int32 case
124-
encoder = OrdinalEncoder(ordered_factor = false, op_dtype = Int32)
124+
encoder = OrdinalEncoder(ordered_factor = false, output_type = Int32)
125125
mach = fit!(machine(encoder, X))
126126
Xnew = MMI.transform(mach, X)
127127
scs = schema(Xnew).scitypes

0 commit comments

Comments
 (0)