Skip to content

Commit d0c67ac

Browse files
authored
Merge branch 'main' into add-callable-and-better-errors
2 parents 5e0af90 + 98893ea commit d0c67ac

File tree

21 files changed

+372
-72
lines changed

21 files changed

+372
-72
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ meh/*.ipynb
2727
.DS_Store
2828
/*.jl
2929
scratchpad/
30+
examples/test.jl

src/MLJTransforms.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ using MLJModelInterface
77
using TableOperations
88
using StatsBase
99
using LinearAlgebra
10-
10+
using OrderedCollections: OrderedDict
1111
# Other transformers
1212
using Combinatorics
1313
import Distributions

src/encoders/contrast_encoder/contrast_encoder.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ function contrast_encoder_fit(
125125
throw(ArgumentError("Mode $feat_mode is not supported."))
126126
end
127127

128-
vector_given_value_given_feature = Dict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
128+
vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
129129
return vector_given_value_given_feature
130130
end
131131

@@ -159,5 +159,5 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
159159
"""
160160
function contrast_encoder_transform(X, cache::Dict)
161161
vector_given_value_given_feature = cache[:vector_given_value_given_feature]
162-
return generic_transform(X, vector_given_value_given_feature, single_feat = false)
162+
return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true)
163163
end

src/encoders/contrast_encoder/interface_mlj.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,12 @@ mach = fit!(machine(encoder, X))
148148
Xnew = transform(mach, X)
149149
150150
julia> Xnew
151-
(name_1 = [1.0, 0.0, 0.0, 0.0],
152-
name_2 = [0.0, 1.0, 0.0, 1.0],
151+
(name_John = [1.0, 0.0, 0.0, 0.0],
152+
name_Mary = [0.0, 1.0, 0.0, 1.0],
153153
height = [1.85, 1.67, 1.5, 1.67],
154-
favnum_1 = [0.0, 1.0, 0.0, -1.0],
155-
favnum_2 = [2.0, -1.0, 0.0, -1.0],
156-
favnum_3 = [-1.0, -1.0, 3.0, -1.0],
154+
favnum_5 = [0.0, 1.0, 0.0, -1.0],
155+
favnum_7 = [2.0, -1.0, 0.0, -1.0],
156+
favnum_10 = [-1.0, -1.0, 3.0, -1.0],
157157
age = [23, 23, 14, 23],)
158158
```
159159

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,15 @@ function frequency_encoder_fit(
2424
ignore::Bool = true,
2525
ordered_factor::Bool = false,
2626
normalize::Bool = false,
27+
output_type::Type = Float32,
2728
)
2829
# 1. Define feature mapper
2930
function feature_mapper(col, name)
3031
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
31-
statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
32+
feat_levels = levels(col)
33+
statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
34+
level => frequency_map[level] for level in feat_levels
35+
)
3236
return statistic_given_feat_val
3337
end
3438

src/encoders/frequency_encoding/interface_mlj.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mutable struct FrequencyEncoder{A <: Any} <: Unsupervised
66
ignore::Bool
77
ordered_factor::Bool
88
normalize::Bool
9+
output_type::Type
910
end;
1011

1112
# 2. Constructor
@@ -14,8 +15,9 @@ function FrequencyEncoder(;
1415
ignore = true,
1516
ordered_factor = false,
1617
normalize = false,
18+
output_type = Float32,
1719
)
18-
return FrequencyEncoder(features, ignore, ordered_factor, normalize)
20+
return FrequencyEncoder(features, ignore, ordered_factor, normalize, output_type)
1921
end;
2022

2123

@@ -32,6 +34,7 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
3234
ignore = transformer.ignore,
3335
ordered_factor = transformer.ordered_factor,
3436
normalize = transformer.normalize,
37+
output_type = transformer.output_type,
3538
)
3639
fitresult = generic_cache[:statistic_given_feat_val]
3740

@@ -96,6 +99,7 @@ Train the machine using `fit!(mach, rows=...)`.
9699
- `ignore=true`: Whether to exclude or include the features given in `features`
97100
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
98101
- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
102+
- `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values.
99103
100104
# Operations
101105

src/encoders/missingness_encoding/missingness_encoding.jl

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ function missingness_encoder_fit(
3030
features = Symbol[];
3131
ignore::Bool = true,
3232
ordered_factor::Bool = false,
33-
label_for_missing::Dict{<:Type, <:Any} = Dict(
33+
label_for_missing::Dict{<:Type, <:Any} = Dict(
3434
AbstractString => "missing",
3535
Char => 'm',
3636
),
@@ -40,8 +40,8 @@ function missingness_encoder_fit(
4040

4141
# 1. Define feature mapper
4242
function feature_mapper(col, name)
43-
col_type = nonmissingtype(eltype(col)).parameters[1]
44-
feat_levels = levels(col; skipmissing=true)
43+
feat_levels = levels(col; skipmissing = true)
44+
col_type = nonmissingtype(eltype(feat_levels))
4545

4646
# Ensure column type is valid (can't test because never occurs)
4747
# Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -58,7 +58,7 @@ function missingness_encoder_fit(
5858

5959
# Check no collision between values(label_for_missing) and feat_levels
6060
for value in values(label_for_missing)
61-
if !ismissing(value)
61+
if !ismissing(value)
6262
if value in feat_levels
6363
throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
6464
end
@@ -73,7 +73,7 @@ function missingness_encoder_fit(
7373
break
7474
end
7575
end
76-
76+
7777
# Nonmissing levels remain as is
7878
label_for_missing_given_feature = Dict{Missing, col_type}()
7979

@@ -91,7 +91,8 @@ function missingness_encoder_fit(
9191

9292
# 2. Pass it to generic_fit
9393
label_for_missing_given_feature, encoded_features = generic_fit(
94-
X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
94+
X, features; ignore = ignore, ordered_factor = ordered_factor,
95+
feature_mapper = feature_mapper,
9596
)
9697
cache = Dict(
9798
:label_for_missing_given_feature => label_for_missing_given_feature,
@@ -117,6 +118,11 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e
117118
"""
118119
function missingness_encoder_transform(X, cache::Dict)
119120
label_for_missing_given_feature = cache[:label_for_missing_given_feature]
120-
return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true)
121+
return generic_transform(
122+
X,
123+
label_for_missing_given_feature;
124+
ignore_unknown = true,
125+
ensure_categorical = true,
126+
)
121127
end
122128

src/encoders/ordinal_encoding/interface_mlj.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@ mutable struct OrdinalEncoder{A <: Any} <: Unsupervised
55
features::A
66
ignore::Bool
77
ordered_factor::Bool
8+
output_type::Type
89
end;
910

1011
# 2. Constructor
1112
function OrdinalEncoder(;
1213
features = Symbol[],
1314
ignore = true,
1415
ordered_factor = false,
16+
output_type = Float32,
1517
)
16-
return OrdinalEncoder(features, ignore, ordered_factor)
18+
return OrdinalEncoder(features, ignore, ordered_factor, output_type)
1719
end;
1820

1921

@@ -29,6 +31,7 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X)
2931
transformer.features;
3032
ignore = transformer.ignore,
3133
ordered_factor = transformer.ordered_factor,
34+
output_type = transformer.output_type,
3235
)
3336
fitresult =
3437
generic_cache[:index_given_feat_level]
@@ -92,6 +95,7 @@ Train the machine using `fit!(mach, rows=...)`.
9295
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
9396
- `ignore=true`: Whether to exclude or include the features given in `features`
9497
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
98+
- `output_type`: The numerical concrete type of the encoded features. Default is `Float32`.
9599
96100
# Operations
97101

src/encoders/ordinal_encoding/ordinal_encoding.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as
1010
- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
1111
- `ignore=true`: Whether to exclude or include the features given in `features`
1212
- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
13-
13+
- `output_type=Float32`: The numerical concrete type of the encoded features. Default is `Float32`.
1414
# Returns (in a dict)
1515
1616
- `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
@@ -21,12 +21,13 @@ function ordinal_encoder_fit(
2121
features = Symbol[];
2222
ignore::Bool = true,
2323
ordered_factor::Bool = false,
24+
output_type::Type = Float32,
2425
)
2526
# 1. Define feature mapper
2627
function feature_mapper(col, name)
2728
feat_levels = levels(col)
2829
index_given_feat_val =
29-
Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels))
30+
Dict{eltype(feat_levels), output_type}(value => index for (index, value) in enumerate(feat_levels))
3031
return index_given_feat_val
3132
end
3233

src/encoders/target_encoding/interface_mlj.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ struct TargetEncoderResult{
5151
y_stat_given_feat_level::Dict{A, A}
5252
task::S # "Regression", "Classification"
5353
num_classes::I # num_classes in case of classification
54+
y_classes::A # y_classes in case of classification
55+
5456
end
5557

5658

@@ -76,6 +78,7 @@ function MMI.fit(transformer::TargetEncoder, verbosity::Int, X, y)
7678
generic_cache[:y_stat_given_feat_level],
7779
generic_cache[:task],
7880
generic_cache[:num_classes],
81+
generic_cache[:y_classes],
7982
)
8083
report = (encoded_features = generic_cache[:encoded_features],) # report only has list of encoded features
8184
cache = nothing
@@ -90,6 +93,7 @@ function MMI.transform(transformer::TargetEncoder, fitresult, Xnew)
9093
fitresult.y_stat_given_feat_level,
9194
:num_classes => fitresult.num_classes,
9295
:task => fitresult.task,
96+
:y_classes => fitresult.y_classes,
9397
)
9498
Xnew_transf = target_encoder_transform(Xnew, generic_cache)
9599
return Xnew_transf

0 commit comments

Comments
 (0)