Skip to content

Commit 09e14a2

Browse files
committed
adapt to new behaviour of CategoricalArrays.levels
1 parent 81c319e commit 09e14a2

File tree

16 files changed

+64
-59
lines changed

16 files changed

+64
-59
lines changed

src/MLJTransforms.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ using OrderedCollections
2020

2121
const MMI = MLJModelInterface
2222

23+
# `rawlevels` restores the old behaviour of `levels` (before CategoricalArrays 1.0,
# which began returning `CategoricalValue`s instead of raw values):
24+
rawlevels(A) = unwrap.(levels(A))
25+
2326
# Functions of generic use across transformers
2427
include("common_docs.jl")
2528
include("generic.jl")

src/encoders/contrast_encoder/contrast_encoder.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ function contrast_encoder_fit(
102102

103103
# ensure mode is one of :contrast, :dummy, :sum, :backward_diff, :forward_diff, :helmert, :polynomial, :hypothesis
104104
function feature_mapper(col, name)
105-
feat_levels = levels(col)
105+
feat_levels = rawlevels(col)
106106
k = length(feat_levels)
107107
feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
108108
if feat_mode == :contrast

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ function frequency_encoder_fit(
2929
# 1. Define feature mapper
3030
function feature_mapper(col, name)
3131
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
32-
feat_levels = levels(col)
32+
feat_levels = rawlevels(col)
3333
statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
3434
level => get(frequency_map, level, 0) for level in feat_levels
3535
)

src/encoders/missingness_encoding/missingness_encoding.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ function missingness_encoder_fit(
3939

4040
# 1. Define feature mapper
4141
function feature_mapper(col, name)
42-
feat_levels = levels(col; skipmissing = true)
42+
feat_levels = unwrap.(levels(col; skipmissing = true))
4343
col_type = nonmissingtype(eltype(feat_levels))
4444

4545
# Ensure column type is valid (can't test because never occurs)

src/encoders/ordinal_encoding/ordinal_encoding.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ function ordinal_encoder_fit(
2525
)
2626
# 1. Define feature mapper
2727
function feature_mapper(col, name)
28-
feat_levels = levels(col)
28+
feat_levels = rawlevels(col)
2929
index_given_feat_val =
3030
Dict{eltype(feat_levels), output_type}(
3131
value => index for (index, value) in enumerate(feat_levels)

src/encoders/target_encoding/target_encoding.jl

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,12 @@ function target_encoder_fit(
148148
"Your target must be Continuous/Count for regression or Multiclass/OrderedFactor for classification",
149149
)
150150

151-
# 2. Setup prior statistics
151+
# 2. Setup prior statistics
152152
if task == "Regression"
153153
y_mean = mean(y) # for mixing
154154
m == :auto && (y_var = std(y)^2) # for empirical Bayes estimation
155155
else
156-
y_classes = levels(y)
156+
y_classes = rawlevels(y)
157157
is_multiclass = length(y_classes) > 2
158158
if !is_multiclass # binary case
159159
y_prior = sum(y .== y_classes[1]) / length(y) # for mixing
@@ -165,10 +165,10 @@ function target_encoder_fit(
165165

166166
# 3. Define function to compute the new value(s) for each level given a column
167167
function feature_mapper(col, name)
168-
feat_levels = levels(col)
168+
feat_levels = rawlevels(col)
169169
y_stat_given_feat_level_for_col =
170170
Dict{eltype(feat_levels), Any}()
171-
for level in levels(col)
171+
for level in rawlevels(col)
172172
# Get the targets of an example that belong to this level
173173
targets_for_level = y[col.==level]
174174

@@ -230,14 +230,14 @@ end
230230
Transform given data with fitted target encoder cache.
231231
232232
# Arguments
233-
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
233+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
234234
`Multiclass` or `OrderedFactor`
235-
- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for
235+
- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for
236236
every categorical feature as well as other metadata needed for transform
237237
238238
# Returns
239239
- `X`: A table where the categorical features as specified during fitting are transformed by target encoding. Other features will remain
240-
the same. This will attempt to preserve the type of the table but may not succeed.
240+
the same. This will attempt to preserve the type of the table but may not succeed.
241241
"""
242242

243243
function target_encoder_transform(X, cache)
@@ -253,4 +253,3 @@ function target_encoder_transform(X, cache)
253253
use_levelnames = true,
254254
custom_levels = y_classes)
255255
end
256-

src/generic.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ function generic_transform(
207207
if feat_name in keys(mapping_per_feat_level)
208208
if !ignore_unknown
209209
train_levels = keys(mapping_per_feat_level[feat_name])
210-
test_levels = levels(col)
210+
test_levels = rawlevels(col)
211211
# test levels must be a subset of train levels
212212
if !issubset(test_levels, train_levels)
213213
# get the levels in test that are not in train

src/transformers/cardinality_reducer/cardinality_reducer.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ function cardinality_reducer_fit(
4646
# 1. Define feature mapper
4747
function feature_mapper(col, name)
4848
val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
49-
feat_levels = levels(col)
49+
feat_levels = rawlevels(col)
5050
col_type = eltype(feat_levels)
5151

5252
# Ensure column type is valid (can't test because never occurs)

src/transformers/other_transformers/one_hot_encoder.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ function MMI.fit(transformer::OneHotEncoder, verbosity::Int, X)
6161
if T <: allowed_scitypes && ftr in specified_features
6262
ref_name_pairs_given_feature[ftr] = Pair{<:Unsigned,Symbol}[]
6363
shift = transformer.drop_last ? 1 : 0
64-
levels = classes(col)
64+
levels = CategoricalArrays.levels(col)
6565
fitted_levels_given_feature[ftr] = levels
6666
if verbosity > 0
6767
@info "Spawning $(length(levels)-shift) sub-features "*
@@ -136,7 +136,7 @@ function MMI.transform(transformer::OneHotEncoder, fitresult, X)
136136
col = MMI.selectcols(X, ftr)
137137
if ftr in features_to_be_transformed
138138
Set(fitresult.fitted_levels_given_feature[ftr]) ==
139-
Set(classes(col)) ||
139+
Set(levels(col)) ||
140140
error("Found category level mismatch in feature `$(ftr)`. "*
141141
"Consider using `levels!` to ensure fitted and transforming "*
142142
"features have the same category levels.")
@@ -289,4 +289,4 @@ julia> schema(W)
289289
See also [`ContinuousEncoder`](@ref).
290290
291291
"""
292-
OneHotEncoder
292+
OneHotEncoder

test/encoders/contrast_encoder.jl

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,9 @@ end
8282
end
8383
# test that fit is correct for dummy Coding
8484
cache = contrast_encoder_fit(X, [:name]; ignore = false, mode = :dummy)
85-
k = length(levels(X.name))
85+
k = length(rawlevels(X.name))
8686
contrast_matrix = get_dummy_contrast(k)
87-
for (i, level) in enumerate(levels(X.name))
87+
for (i, level) in enumerate(rawlevels(X.name))
8888
@test cache.vector_given_value_given_feature[:name][level] == contrast_matrix[i, :]
8989
end
9090
end
@@ -110,9 +110,9 @@ end
110110
@test size(contrast_matrix_3) == (3, 2)
111111
# test that fit is correct for sum Coding
112112
cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :sum)
113-
k = length(levels(X.favnum))
113+
k = length(rawlevels(X.favnum))
114114
contrast_matrix = get_sum_contrast(k)
115-
for (i, level) in enumerate(levels(X.favnum))
115+
for (i, level) in enumerate(rawlevels(X.favnum))
116116
@test cache.vector_given_value_given_feature[:favnum][level] ==
117117
contrast_matrix[i, :]
118118
end
@@ -130,9 +130,9 @@ end
130130

131131
# Test that fit is correct for backward Coding
132132
cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :backward_diff)
133-
k = length(levels(X.favnum))
133+
k = length(rawlevels(X.favnum))
134134
contrast_matrix = get_backward_diff_contrast(k)
135-
for (i, level) in enumerate(levels(X.favnum))
135+
for (i, level) in enumerate(rawlevels(X.favnum))
136136
@test cache.vector_given_value_given_feature[:favnum][level] ==
137137
contrast_matrix[i, :]
138138
end
@@ -148,9 +148,9 @@ end
148148

149149
# Test that fit is correct for forward Coding
150150
cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :forward_diff)
151-
k = length(levels(X.favnum))
151+
k = length(rawlevels(X.favnum))
152152
contrast_matrix = get_forward_diff_contrast(k)
153-
for (i, level) in enumerate(levels(X.favnum))
153+
for (i, level) in enumerate(rawlevels(X.favnum))
154154
@test cache.vector_given_value_given_feature[:favnum][level] ==
155155
contrast_matrix[i, :]
156156
end
@@ -171,9 +171,9 @@ end
171171
0.0 0.0 3.0]
172172
# test that fit is correct for helmert Coding
173173
cache = contrast_encoder_fit(X, [:name, :favnum]; ignore = false, mode = :helmert)
174-
k = length(levels(X.name))
174+
k = length(rawlevels(X.name))
175175
contrast_matrix = get_helmert_contrast(k)
176-
for (i, level) in enumerate(levels(X.name))
176+
for (i, level) in enumerate(rawlevels(X.name))
177177
@test cache.vector_given_value_given_feature[:name][level] == contrast_matrix[i, :]
178178
end
179179
end
@@ -227,12 +227,12 @@ end
227227
contrasts = Dict(
228228
:name => StatsModels.HypothesisCoding(
229229
buildrandomhypothesis(nothing, 3);
230-
levels = levels(X.name),
230+
levels = rawlevels(X.name),
231231
labels = [],
232232
),
233233
:favnum => StatsModels.HypothesisCoding(
234234
buildrandomhypothesis(nothing, 4);
235-
levels = levels(X.favnum),
235+
levels = rawlevels(X.favnum),
236236
labels = [],
237237
),
238238
),
@@ -263,7 +263,7 @@ end
263263
StatsModels.HelmertCoding(),
264264
StatsModels.HypothesisCoding(
265265
buildrandomhypothesis(nothing, k);
266-
levels = (k == 3) ? levels(X.name) : levels(X.favnum),
266+
levels = (k == 3) ? rawlevels(X.name) : rawlevels(X.favnum),
267267
labels = [],
268268
),
269269
][ind]
@@ -304,7 +304,7 @@ end
304304
StatsModels.HelmertCoding(),
305305
StatsModels.HypothesisCoding(
306306
buildrandomhypothesis(nothing, k);
307-
levels = (k == 3) ? levels(X.name) : levels(X.favnum),
307+
levels = (k == 3) ? rawlevels(X.name) : rawlevels(X.favnum),
308308
labels = [],
309309
),
310310
][ind]

0 commit comments

Comments
 (0)