Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,5 @@ scratchpad/
examples/test.jl
catboost_info/**
/catboost_info
/catboost_info
/docs/src/tutorials/adult_example/.CondaPkg
/docs/src/tutorials/adult_example/catboost_info
/docs/src/tutorials/**/.CondaPkg
/docs/src/tutorials/**/catboost_info
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
BitBasis = "0.9"
CategoricalArrays = "0.10"
CategoricalArrays = "1"
Combinatorics = "1"
Dates = "1"
Distributions = "0.25"
Expand Down
3 changes: 0 additions & 3 deletions docs/Project.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8"
MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845"
MLJTransforms = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"

Expand Down
3 changes: 3 additions & 0 deletions src/MLJTransforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ using OrderedCollections

const MMI = MLJModelInterface

# Reproduce the behaviour `levels` had before CategoricalArrays 1.0 by unwrapping each of
# the levels returned by `levels(A)`:
function rawlevels(A)
    wrapped_levels = levels(A)
    return unwrap.(wrapped_levels)
end

# Functions of generic use across transformers
include("common_docs.jl")
include("generic.jl")
Expand Down
2 changes: 1 addition & 1 deletion src/encoders/contrast_encoder/contrast_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ function contrast_encoder_fit(

# ensure mode is one of :contrast, :dummy, :sum, :backward_diff, :forward_diff, :helmert, :polynomial, :hypothesis
function feature_mapper(col, name)
feat_levels = levels(col)
feat_levels = rawlevels(col)
k = length(feat_levels)
feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
if feat_mode == :contrast
Expand Down
2 changes: 1 addition & 1 deletion src/encoders/frequency_encoding/frequency_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ function frequency_encoder_fit(
# 1. Define feature mapper
function feature_mapper(col, name)
frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
feat_levels = levels(col)
feat_levels = rawlevels(col)
statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
level => get(frequency_map, level, 0) for level in feat_levels
)
Expand Down
2 changes: 1 addition & 1 deletion src/encoders/missingness_encoding/missingness_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ function missingness_encoder_fit(

# 1. Define feature mapper
function feature_mapper(col, name)
feat_levels = levels(col; skipmissing = true)
feat_levels = unwrap.(levels(col; skipmissing = true))
col_type = nonmissingtype(eltype(feat_levels))

# Ensure column type is valid (can't test because never occurs)
Expand Down
2 changes: 1 addition & 1 deletion src/encoders/ordinal_encoding/ordinal_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ function ordinal_encoder_fit(
)
# 1. Define feature mapper
function feature_mapper(col, name)
feat_levels = levels(col)
feat_levels = rawlevels(col)
index_given_feat_val =
Dict{eltype(feat_levels), output_type}(
value => index for (index, value) in enumerate(feat_levels)
Expand Down
15 changes: 7 additions & 8 deletions src/encoders/target_encoding/target_encoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,12 @@ function target_encoder_fit(
"Your target must be Continuous/Count for regression or Multiclass/OrderedFactor for classification",
)

# 2. Setup prior statistics
# 2. Setup prior statistics
if task == "Regression"
y_mean = mean(y) # for mixing
m == :auto && (y_var = std(y)^2) # for empirical Bayes estimation
else
y_classes = levels(y)
y_classes = rawlevels(y)
is_multiclass = length(y_classes) > 2
if !is_multiclass # binary case
y_prior = sum(y .== y_classes[1]) / length(y) # for mixing
Expand All @@ -165,10 +165,10 @@ function target_encoder_fit(

# 3. Define function to compute the new value(s) for each level given a column
function feature_mapper(col, name)
feat_levels = levels(col)
feat_levels = rawlevels(col)
y_stat_given_feat_level_for_col =
Dict{eltype(feat_levels), Any}()
for level in levels(col)
for level in rawlevels(col)
# Get the targets of an example that belong to this level
targets_for_level = y[col.==level]

Expand Down Expand Up @@ -230,14 +230,14 @@ end
Transform given data with fitted target encoder cache.

# Arguments
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
`Multiclass` or `OrderedFactor`
- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for
- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for
every categorical feature as well as other metadata needed for transform

# Returns
- `X`: A table where the categorical features as specified during fitting are transformed by target encoding. Other features will remain
the same. This will attempt to preserve the type of the table but may not succeed.
the same. This will attempt to preserve the type of the table but may not succeed.
"""

function target_encoder_transform(X, cache)
Expand All @@ -253,4 +253,3 @@ function target_encoder_transform(X, cache)
use_levelnames = true,
custom_levels = y_classes)
end

105 changes: 62 additions & 43 deletions src/generic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ generic_fit(X,
)
```

Given a `feature_mapper` (see definition below), this method applies
`feature_mapper` across a specified subset of categorical columns in X and returns a dictionary
whose keys are the feature names, and each value is the corresponding
level‑to‑value mapping produced by `feature_mapper`.
Given a `feature_mapper` (see definition below), this method applies `feature_mapper`
across a specified subset of categorical columns in X and returns a dictionary whose keys
are the feature names, and each value is the corresponding level‑to‑value mapping produced
by `feature_mapper`.

In essence, it spares effort of looping over each column and applying the `feature_mapper` function manually as well as handling the feature selection logic.
In essence, it spares the effort of looping over each column and applying the
`feature_mapper` function manually, as well as handling the feature selection logic.


# Arguments
Expand All @@ -26,17 +27,22 @@ $X_doc
$features_doc
$ignore_doc
$ordered_factor_doc
- feature_mapper: function that, for a given vector (eg, corresponding to a categorical column from the dataset `X`),
produces a mapping from each category level name in this vector to a scalar or vector according to specified transformation logic.

- feature_mapper: function that, for a given vector (eg, corresponding to a categorical
column from the dataset `X`), produces a mapping from each category level name in this
vector to a scalar or vector according to specified transformation logic.

# Note

- Any additional arguments (whether keyword or not) provided to this function are passed to the `feature_mapper` function which
is helpful when `feature_mapper` requires additional arguments to compute the mapping (eg, hyperparameters).
- Any additional arguments (whether keyword or not) provided to this function are passed
to the `feature_mapper` function which is helpful when `feature_mapper` requires
additional arguments to compute the mapping (eg, hyperparameters).

# Returns
- `mapping_per_feat_level`: Maps each level for each feature in a subset of the categorical features of
X into a scalar or a vector.

- `mapping_per_feat_level`: Maps each level for each feature in a subset of the
categorical features of X into a scalar or a vector.

$encoded_features_doc
"""
function generic_fit(X,
Expand All @@ -50,11 +56,11 @@ function generic_fit(X,
# 1. Get X column types and names
feat_names = Tables.schema(X).names

#2. Modify column_names based on features
#2. Modify column_names based on features
if features isa Symbol
features = [features]
end

if features isa AbstractVector{Symbol}
# Original behavior for vector of symbols
feat_names =
Expand Down Expand Up @@ -94,8 +100,9 @@ end
"""
**Private method.**

Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if possible,
feat_name_level_0, feat_name_level_1,..., feat_name_level_n
Function to generate new feature names: feat_name_1, feat_name_2,..., feat_name_n or, if
possible, feat_name_level_1, feat_name_level_2,..., feat_name_level_n

"""
function generate_new_feat_names(
feat_name,
Expand All @@ -115,7 +122,8 @@ function generate_new_feat_names(
suffix = repeat("_", count)
if use_levelnames
# Always use the first num_inds level names
new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
new_column_names =
[ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
else
# Always use numeric indices
new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ]
Expand Down Expand Up @@ -144,34 +152,42 @@ generic_transform(
```


Apply a per‐level feature mapping to selected categorical columns in `X`, returning a new table of the same type.
Apply a per‐level feature mapping to selected categorical columns in `X`, returning a new
table of the same type.

# Arguments

$X_doc
- `mapping_per_feat_level::Dict{Symbol,Dict}`:
A dict whose keys are feature names (`Symbol`) and values are themselves dictionaries
mapping each observed level to either a scalar (if `single_feat=true`) or a fixed‐length vector
(if `single_feat=false`). Only columns whose names appear in `mapping_per_feat_level` are
transformed; others pass through unchanged.
- `single_feat::Bool=true`:
If `true`, each input level is mapped to a single scalar feature; if `false`,
each input level is mapped to a length‑`k` vector, producing `k` output columns.
- `ignore_unknown::Bool=false`:
If `false`, novel levels in `X` (not seen during fit) will raise an error;
if `true`, novel levels will be left unchanged (identity mapping).
- `use_levelnames::Bool=false`:
When `single_feat=false`, controls naming of the expanded columns: `true`: use actual level names (e.g. `:color_red`, `:color_blue`),
`false`: use numeric indices (e.g. `:color_1`, `:color_2`).
- `custom_levels::Union{Nothing,Vector}`:
If not `nothing`, overrides the names of levels used to generate feature names when `single_feat=false`.
- `ensure_categorical::Bool=false`:
Only when `single_feat=true` and if `true`, preserves the categorical type of the column after
recoding (eg, feature should still be recognized as `Multiclass` after transformation)

- `mapping_per_feat_level::Dict{Symbol,Dict}`: A dict whose keys are feature names
(`Symbol`) and values are themselves dictionaries mapping each observed level to either
a scalar (if `single_feat=true`) or a fixed‐length vector (if
`single_feat=false`). Only columns whose names appear in `mapping_per_feat_level` are
transformed; others pass through unchanged.

- `single_feat::Bool=true`: If `true`, each input level is mapped to a single scalar
feature; if `false`, each input level is mapped to a length‑`k` vector, producing `k`
output columns.

- `ignore_unknown::Bool=false`: If `false`, novel levels in `X` (not seen during fit) will
raise an error; if `true`, novel levels will be left unchanged (identity mapping).

- `use_levelnames::Bool=false`: When `single_feat=false`, controls naming of the expanded
columns: `true`: use actual level names (e.g. `:color_red`, `:color_blue`), `false`:
use numeric indices (e.g. `:color_1`, `:color_2`).

- `custom_levels::Union{Nothing,Vector}`: If not `nothing`, overrides the names of levels
used to generate feature names when `single_feat=false`.

- `ensure_categorical::Bool=false`: Only when `single_feat=true` and if `true`, preserves
the categorical type of the column after recoding (eg, feature should still be
recognized as `Multiclass` after transformation)

# Returns

A new table of potentially similar to `X` but with categorical columns transformed according to `mapping_per_feat_level`.
A new table, potentially similar to `X`, but with categorical columns transformed
according to `mapping_per_feat_level`.

"""
function generic_transform(
X,
Expand All @@ -191,13 +207,14 @@ function generic_transform(
if feat_name in keys(mapping_per_feat_level)
if !ignore_unknown
train_levels = keys(mapping_per_feat_level[feat_name])
test_levels = levels(col)
test_levels = rawlevels(col)
# test levels must be a subset of train levels
if !issubset(test_levels, train_levels)
# get the levels in test that are not in train
lost_levels = setdiff(test_levels, train_levels)
error(
"While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
"While transforming, found novel levels for the column "*
"$(feat_name): $(lost_levels) that were not seen while training.",
)
end
end
Expand All @@ -206,10 +223,11 @@ function generic_transform(
level2scalar = mapping_per_feat_level[feat_name]
if ensure_categorical
new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
else
new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
else
new_col =
!isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
end

push!(new_cols, new_col)
push!(new_feat_names, feat_name)
else
Expand All @@ -221,7 +239,8 @@ function generic_transform(
feat_names_with_inds = generate_new_feat_names(
feat_name,
length(first(mapping_per_feat_level[feat_name])[2]),
(custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
(custom_levels === nothing) ?
keys(mapping_per_feat_level[feat_name]) : custom_levels,
feat_names;
use_levelnames = use_levelnames,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ function cardinality_reducer_fit(
# 1. Define feature mapper
function feature_mapper(col, name)
val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
feat_levels = levels(col)
feat_levels = rawlevels(col)
col_type = eltype(feat_levels)

# Ensure column type is valid (can't test because never occurs)
Expand Down
6 changes: 3 additions & 3 deletions src/transformers/other_transformers/one_hot_encoder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ function MMI.fit(transformer::OneHotEncoder, verbosity::Int, X)
if T <: allowed_scitypes && ftr in specified_features
ref_name_pairs_given_feature[ftr] = Pair{<:Unsigned,Symbol}[]
shift = transformer.drop_last ? 1 : 0
levels = classes(col)
levels = CategoricalArrays.levels(col)
fitted_levels_given_feature[ftr] = levels
if verbosity > 0
@info "Spawning $(length(levels)-shift) sub-features "*
Expand Down Expand Up @@ -136,7 +136,7 @@ function MMI.transform(transformer::OneHotEncoder, fitresult, X)
col = MMI.selectcols(X, ftr)
if ftr in features_to_be_transformed
Set(fitresult.fitted_levels_given_feature[ftr]) ==
Set(classes(col)) ||
Set(levels(col)) ||
error("Found category level mismatch in feature `$(ftr)`. "*
"Consider using `levels!` to ensure fitted and transforming "*
"features have the same category levels.")
Expand Down Expand Up @@ -289,4 +289,4 @@ julia> schema(W)
See also [`ContinuousEncoder`](@ref).

"""
OneHotEncoder
OneHotEncoder
Loading
Loading