Skip to content

Commit f0140f7

Browse files
authored
Merge pull request #61 from JuliaAI/levels
Bump [compat] entry: CategoricalArrays="1.0"
2 parents 30e2020 + d094c0e commit f0140f7

File tree

19 files changed

+128
-108
lines changed

19 files changed

+128
-108
lines changed

.gitignore

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,5 @@ scratchpad/
3030
examples/test.jl
3131
catboost_info/**
3232
/catboost_info
33-
/catboost_info
34-
/docs/src/tutorials/adult_example/.CondaPkg
35-
/docs/src/tutorials/adult_example/catboost_info
33+
/docs/src/tutorials/**/.CondaPkg
34+
/docs/src/tutorials/**/catboost_info

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
2323

2424
[compat]
2525
BitBasis = "0.9"
26-
CategoricalArrays = "0.10"
26+
CategoricalArrays = "1"
2727
Combinatorics = "1"
2828
Dates = "1"
2929
Distributions = "0.25"

docs/Project.toml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
[deps]
2-
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
3-
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
42
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
53
DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8"
6-
MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
74
MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845"
85
MLJTransforms = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
96

src/MLJTransforms.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ using OrderedCollections
2020

2121
const MMI = MLJModelInterface
2222

23+
# old behaviour of `levels` (before CategoricalArrays 1.0):
24+
"""
    rawlevels(A)

Return `levels(A)` with `unwrap` broadcast over every element.

This reproduces the behaviour that `levels` had before CategoricalArrays 1.0.
"""
function rawlevels(A)
    return unwrap.(levels(A))
end
25+
2326
# Functions of generic use across transformers
2427
include("common_docs.jl")
2528
include("generic.jl")

src/encoders/contrast_encoder/contrast_encoder.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ function contrast_encoder_fit(
102102

103103
# ensure mode is one of :contrast, :dummy, :sum, :backward_diff, :forward_diff, :helmert, :polynomial, :hypothesis
104104
function feature_mapper(col, name)
105-
feat_levels = levels(col)
105+
feat_levels = rawlevels(col)
106106
k = length(feat_levels)
107107
feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
108108
if feat_mode == :contrast

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ function frequency_encoder_fit(
2929
# 1. Define feature mapper
3030
function feature_mapper(col, name)
3131
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
32-
feat_levels = levels(col)
32+
feat_levels = rawlevels(col)
3333
statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
3434
level => get(frequency_map, level, 0) for level in feat_levels
3535
)

src/encoders/missingness_encoding/missingness_encoding.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ function missingness_encoder_fit(
3939

4040
# 1. Define feature mapper
4141
function feature_mapper(col, name)
42-
feat_levels = levels(col; skipmissing = true)
42+
feat_levels = unwrap.(levels(col; skipmissing = true))
4343
col_type = nonmissingtype(eltype(feat_levels))
4444

4545
# Ensure column type is valid (can't test because never occurs)

src/encoders/ordinal_encoding/ordinal_encoding.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ function ordinal_encoder_fit(
2525
)
2626
# 1. Define feature mapper
2727
function feature_mapper(col, name)
28-
feat_levels = levels(col)
28+
feat_levels = rawlevels(col)
2929
index_given_feat_val =
3030
Dict{eltype(feat_levels), output_type}(
3131
value => index for (index, value) in enumerate(feat_levels)

src/encoders/target_encoding/target_encoding.jl

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,12 @@ function target_encoder_fit(
148148
"Your target must be Continuous/Count for regression or Multiclass/OrderedFactor for classification",
149149
)
150150

151-
# 2. Setup prior statistics
151+
# 2. Setup prior statistics
152152
if task == "Regression"
153153
y_mean = mean(y) # for mixing
154154
m == :auto && (y_var = std(y)^2) # for empirical Bayes estimation
155155
else
156-
y_classes = levels(y)
156+
y_classes = rawlevels(y)
157157
is_multiclass = length(y_classes) > 2
158158
if !is_multiclass # binary case
159159
y_prior = sum(y .== y_classes[1]) / length(y) # for mixing
@@ -165,10 +165,10 @@ function target_encoder_fit(
165165

166166
# 3. Define function to compute the new value(s) for each level given a column
167167
function feature_mapper(col, name)
168-
feat_levels = levels(col)
168+
feat_levels = rawlevels(col)
169169
y_stat_given_feat_level_for_col =
170170
Dict{eltype(feat_levels), Any}()
171-
for level in levels(col)
171+
for level in rawlevels(col)
172172
# Get the targets of an example that belong to this level
173173
targets_for_level = y[col.==level]
174174

@@ -230,14 +230,14 @@ end
230230
Transform given data with fitted target encoder cache.
231231
232232
# Arguments
233-
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
233+
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
234234
`Multiclass` or `OrderedFactor`
235-
- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for
235+
- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for
236236
every categorical feature as well as other metadata needed for transform
237237
238238
# Returns
239239
- `X`: A table where the categorical features as specified during fitting are transformed by target encoding. Other features will remain
240-
the same. This will attempt to preserve the type of the table but may not succeed.
240+
the same. This will attempt to preserve the type of the table but may not succeed.
241241
"""
242242

243243
function target_encoder_transform(X, cache)
@@ -253,4 +253,3 @@ function target_encoder_transform(X, cache)
253253
use_levelnames = true,
254254
custom_levels = y_classes)
255255
end
256-

src/generic.jl

Lines changed: 62 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@ generic_fit(X,
1212
)
1313
```
1414
15-
Given a `feature_mapper` (see definition below), this method applies
16-
`feature_mapper` across a specified subset of categorical columns in X and returns a dictionary
17-
whose keys are the feature names, and each value is the corresponding
18-
level‑to‑value mapping produced by `feature_mapper`.
15+
Given a `feature_mapper` (see definition below), this method applies `feature_mapper`
16+
across a specified subset of categorical columns in X and returns a dictionary whose keys
17+
are the feature names, and each value is the corresponding level‑to‑value mapping produced
18+
by `feature_mapper`.
1919
20-
In essence, it spares effort of looping over each column and applying the `feature_mapper` function manually as well as handling the feature selection logic.
20+
In essence, it spares the effort of looping over each column and applying the `feature_mapper`
21+
function manually as well as handling the feature selection logic.
2122
2223
2324
# Arguments
@@ -26,17 +27,22 @@ $X_doc
2627
$features_doc
2728
$ignore_doc
2829
$ordered_factor_doc
29-
- feature_mapper: function that, for a given vector (eg, corresponding to a categorical column from the dataset `X`),
30-
produces a mapping from each category level name in this vector to a scalar or vector according to specified transformation logic.
30+
31+
- `feature_mapper`: function that, for a given vector (e.g., corresponding to a categorical
32+
column from the dataset `X`), produces a mapping from each category level name in this
33+
vector to a scalar or vector according to specified transformation logic.
3134
3235
# Note
3336
34-
- Any additional arguments (whether keyword or not) provided to this function are passed to the `feature_mapper` function which
35-
is helpful when `feature_mapper` requires additional arguments to compute the mapping (eg, hyperparameters).
37+
- Any additional arguments (whether keyword or not) provided to this function are passed
38+
to the `feature_mapper` function which is helpful when `feature_mapper` requires
39+
additional arguments to compute the mapping (eg, hyperparameters).
3640
3741
# Returns
38-
- `mapping_per_feat_level`: Maps each level for each feature in a subset of the categorical features of
39-
X into a scalar or a vector.
42+
43+
- `mapping_per_feat_level`: Maps each level for each feature in a subset of the
44+
categorical features of X into a scalar or a vector.
45+
4046
$encoded_features_doc
4147
"""
4248
function generic_fit(X,
@@ -50,11 +56,11 @@ function generic_fit(X,
5056
# 1. Get X column types and names
5157
feat_names = Tables.schema(X).names
5258

53-
#2. Modify column_names based on features
59+
#2. Modify column_names based on features
5460
if features isa Symbol
5561
features = [features]
5662
end
57-
63+
5864
if features isa AbstractVector{Symbol}
5965
# Original behavior for vector of symbols
6066
feat_names =
@@ -94,8 +100,9 @@ end
94100
"""
95101
**Private method.**
96102
97-
Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if possible,
98-
feat_name_level_0, feat_name_level_1,..., feat_name_level_n
103+
Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if
104+
possible, feat_name_level_0, feat_name_level_1,..., feat_name_level_n
105+
99106
"""
100107
function generate_new_feat_names(
101108
feat_name,
@@ -115,7 +122,8 @@ function generate_new_feat_names(
115122
suffix = repeat("_", count)
116123
if use_levelnames
117124
# Always use the first num_inds level names
118-
new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
125+
new_column_names =
126+
[ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
119127
else
120128
# Always use numeric indices
121129
new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ]
@@ -144,34 +152,42 @@ generic_transform(
144152
```
145153
146154
147-
Apply a per‐level feature mapping to selected categorical columns in `X`, returning a new table of the same type.
155+
Apply a per‐level feature mapping to selected categorical columns in `X`, returning a new
156+
table of the same type.
148157
149158
# Arguments
150159
151160
$X_doc
152-
- `mapping_per_feat_level::Dict{Symbol,Dict}`:
153-
A dict whose keys are feature names (`Symbol`) and values are themselves dictionaries
154-
mapping each observed level to either a scalar (if `single_feat=true`) or a fixed‐length vector
155-
(if `single_feat=false`). Only columns whose names appear in `mapping_per_feat_level` are
156-
transformed; others pass through unchanged.
157-
- `single_feat::Bool=true`:
158-
If `true`, each input level is mapped to a single scalar feature; if `false`,
159-
each input level is mapped to a length‑`k` vector, producing `k` output columns.
160-
- `ignore_unknown::Bool=false`:
161-
If `false`, novel levels in `X` (not seen during fit) will raise an error;
162-
if `true`, novel levels will be left unchanged (identity mapping).
163-
- `use_levelnames::Bool=false`:
164-
When `single_feat=false`, controls naming of the expanded columns: `true`: use actual level names (e.g. `:color_red`, `:color_blue`),
165-
`false`: use numeric indices (e.g. `:color_1`, `:color_2`).
166-
- `custom_levels::Union{Nothing,Vector}`:
167-
If not `nothing`, overrides the names of levels used to generate feature names when `single_feat=false`.
168-
- `ensure_categorical::Bool=false`:
169-
Only when `single_feat=true` and if `true`, preserves the categorical type of the column after
170-
recoding (eg, feature should still be recognized as `Multiclass` after transformation)
161+
162+
- `mapping_per_feat_level::Dict{Symbol,Dict}`: A dict whose keys are feature names
163+
(`Symbol`) and values are themselves dictionaries mapping each observed level to either
164+
a scalar (if `single_feat=true`) or a fixed‐length vector (if
165+
`single_feat=false`). Only columns whose names appear in `mapping_per_feat_level` are
166+
transformed; others pass through unchanged.
167+
168+
- `single_feat::Bool=true`: If `true`, each input level is mapped to a single scalar
169+
feature; if `false`, each input level is mapped to a length‑`k` vector, producing `k`
170+
output columns.
171+
172+
- `ignore_unknown::Bool=false`: If `false`, novel levels in `X` (not seen during fit) will
173+
raise an error; if `true`, novel levels will be left unchanged (identity mapping).
174+
175+
- `use_levelnames::Bool=false`: When `single_feat=false`, controls naming of the expanded
176+
columns: `true`: use actual level names (e.g. `:color_red`, `:color_blue`), `false`:
177+
use numeric indices (e.g. `:color_1`, `:color_2`).
178+
179+
- `custom_levels::Union{Nothing,Vector}`: If not `nothing`, overrides the names of levels
180+
used to generate feature names when `single_feat=false`.
181+
182+
- `ensure_categorical::Bool=false`: Only when `single_feat=true` and if `true`, preserves
183+
the categorical type of the column after recoding (eg, feature should still be
184+
recognized as `Multiclass` after transformation)
171185
172186
# Returns
173187
174-
A new table of potentially similar to `X` but with categorical columns transformed according to `mapping_per_feat_level`.
188+
A new table, potentially of the same type as `X`, with categorical columns transformed
189+
according to `mapping_per_feat_level`.
190+
175191
"""
176192
function generic_transform(
177193
X,
@@ -191,13 +207,14 @@ function generic_transform(
191207
if feat_name in keys(mapping_per_feat_level)
192208
if !ignore_unknown
193209
train_levels = keys(mapping_per_feat_level[feat_name])
194-
test_levels = levels(col)
210+
test_levels = rawlevels(col)
195211
# test levels must be a subset of train levels
196212
if !issubset(test_levels, train_levels)
197213
# get the levels in test that are not in train
198214
lost_levels = setdiff(test_levels, train_levels)
199215
error(
200-
"While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
216+
"While transforming, found novel levels for the column "*
217+
"$(feat_name): $(lost_levels) that were not seen while training.",
201218
)
202219
end
203220
end
@@ -206,10 +223,11 @@ function generic_transform(
206223
level2scalar = mapping_per_feat_level[feat_name]
207224
if ensure_categorical
208225
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
209-
else
210-
new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
226+
else
227+
new_col =
228+
!isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
211229
end
212-
230+
213231
push!(new_cols, new_col)
214232
push!(new_feat_names, feat_name)
215233
else
@@ -221,7 +239,8 @@ function generic_transform(
221239
feat_names_with_inds = generate_new_feat_names(
222240
feat_name,
223241
length(first(mapping_per_feat_level[feat_name])[2]),
224-
(custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
242+
(custom_levels === nothing) ?
243+
keys(mapping_per_feat_level[feat_name]) : custom_levels,
225244
feat_names;
226245
use_levelnames = use_levelnames,
227246
)

0 commit comments

Comments
 (0)