JuliaAI · EssamWisam · Nov 4, 2025 · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025
diff --git a/.gitignore b/.gitignore
@@ -30,6 +30,5 @@ scratchpad/
 examples/test.jl
 catboost_info/**
 /catboost_info
-/catboost_info
-/docs/src/tutorials/adult_example/.CondaPkg
-/docs/src/tutorials/adult_example/catboost_info
+/docs/src/tutorials/**/.CondaPkg
+/docs/src/tutorials/**/catboost_info
diff --git a/Project.toml b/Project.toml
@@ -23,7 +23,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
 BitBasis = "0.9"
-CategoricalArrays = "0.10"
+CategoricalArrays = "1"
 Combinatorics = "1"
 Dates = "1"
 Distributions = "0.25"

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,9 +1,6 @@
 [deps]
-CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8"
-MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
 MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845"
 MLJTransforms = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
 

diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl
@@ -20,6 +20,9 @@ using OrderedCollections
 
 const MMI = MLJModelInterface
 
+# levels of `A` as raw (unwrapped) values, as `levels` gave before CategoricalArrays 1.0:
+rawlevels(A) = unwrap.(levels(A))
+
 # Functions of generic use across transformers
 include("common_docs.jl")
 include("generic.jl")

diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl
@@ -102,7 +102,7 @@ function contrast_encoder_fit(
 
     # ensure mode is one of :contrast, :dummy, :sum, :backward_diff, :forward_diff, :helmert, :polynomial, :hypothesis
     function feature_mapper(col, name)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         k = length(feat_levels)
         feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
         if feat_mode == :contrast

diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl
@@ -29,7 +29,7 @@ function frequency_encoder_fit(
     # 1. Define feature mapper
     function feature_mapper(col, name)
         frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
             level => get(frequency_map, level, 0) for level in feat_levels
         )

diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl
@@ -39,7 +39,7 @@ function missingness_encoder_fit(
 
     # 1. Define feature mapper
     function feature_mapper(col, name)
-        feat_levels = levels(col; skipmissing = true)
+        feat_levels = unwrap.(levels(col; skipmissing = true))
         col_type = nonmissingtype(eltype(feat_levels))
 
         # Ensure column type is valid (can't test because never occurs)

diff --git a/src/encoders/ordinal_encoding/ordinal_encoding.jl b/src/encoders/ordinal_encoding/ordinal_encoding.jl
@@ -25,7 +25,7 @@ function ordinal_encoder_fit(
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         index_given_feat_val =
             Dict{eltype(feat_levels), output_type}(
                 value => index for (index, value) in enumerate(feat_levels)

diff --git a/src/encoders/target_encoding/target_encoding.jl b/src/encoders/target_encoding/target_encoding.jl
@@ -148,12 +148,12 @@ function target_encoder_fit(
         "Your target must be Continuous/Count for regression or Multiclass/OrderedFactor for classification",
     )
 
-    # 2. Setup prior statistics 
+    # 2. Setup prior statistics
     if task == "Regression"
         y_mean = mean(y)                             # for mixing
         m == :auto && (y_var = std(y)^2)              # for empirical Bayes estimation
     else
-        y_classes = levels(y)
+        y_classes = rawlevels(y)
         is_multiclass = length(y_classes) > 2
         if !is_multiclass       # binary case
             y_prior = sum(y .== y_classes[1]) / length(y)   # for mixing
@@ -165,10 +165,10 @@ function target_encoder_fit(
 
     # 3. Define function to compute the new value(s) for each level given a column
     function feature_mapper(col, name)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         y_stat_given_feat_level_for_col =
             Dict{eltype(feat_levels), Any}()
-        for level in levels(col)
+        for level in rawlevels(col)
             # Get the targets of an example that belong to this level
             targets_for_level = y[col.==level]
 
@@ -230,14 +230,14 @@ end
 Transform given data with fitted target encoder cache.
 
 # Arguments
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) 
+- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
 `Multiclass` or `OrderedFactor`
-- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for 
+- `cache`: A dictionary containing a dictionary `y_stat_given_feat_level` with the necessary statistics for
 every categorical feature as well as other metadata needed for transform
 
 # Returns
 - `X`: A table where the categorical features as specified during fitting are transformed by target encoding. Other features will remain
-    the same. This will attempt to preserve the type of the table but may not succeed. 
+    the same. This will attempt to preserve the type of the table but may not succeed.
 """
 
 function target_encoder_transform(X, cache)
@@ -253,4 +253,3 @@ function target_encoder_transform(X, cache)
         use_levelnames = true,
         custom_levels = y_classes)
 end
-
diff --git a/src/generic.jl b/src/generic.jl
@@ -12,12 +12,13 @@ generic_fit(X,
 )
 ```
 
-Given a `feature_mapper` (see definition below), this method applies 
-    `feature_mapper` across a specified subset of categorical columns in X and returns a dictionary 
-    whose keys are the feature names, and each value is the corresponding 
-    level‑to‑value mapping produced by `feature_mapper`. 
+Given a `feature_mapper` (see definition below), this method applies `feature_mapper`
+across a specified subset of categorical columns in X and returns a dictionary whose keys
+are the feature names, and each value is the corresponding level‑to‑value mapping produced
+by `feature_mapper`.
 
-In essence, it spares effort of looping over each column and applying the `feature_mapper` function manually as well as handling the feature selection logic.
+In essence, it spares the effort of looping over each column and applying the
+`feature_mapper` function manually, as well as handling the feature selection logic.
 
 
 # Arguments
@@ -26,17 +27,22 @@ $X_doc
 $features_doc
 $ignore_doc
 $ordered_factor_doc
-- feature_mapper: function that, for a given vector (eg, corresponding to a categorical column from the dataset `X`), 
-    produces a mapping from each category level name in this vector to a scalar or vector according to specified transformation logic.
+
+- `feature_mapper`: function that, for a given vector (e.g., one corresponding to a
+  categorical column from the dataset `X`), produces a mapping from each category level
+  name in this vector to a scalar or vector according to specified transformation logic.
 
 # Note
 
-- Any additional arguments (whether keyword or not) provided to this function are passed to the `feature_mapper` function which
-    is helpful when `feature_mapper` requires additional arguments to compute the mapping (eg, hyperparameters).
+- Any additional arguments (whether keyword or not) provided to this function are passed
+  to the `feature_mapper` function, which is helpful when `feature_mapper` requires
+  additional arguments to compute the mapping (e.g., hyperparameters).
 
 # Returns
-- `mapping_per_feat_level`: Maps each level for each feature in a subset of the categorical features of
-    X into a scalar or a vector. 
+
+- `mapping_per_feat_level`: Maps each level for each feature in a subset of the
+  categorical features of X into a scalar or a vector.
+
 $encoded_features_doc
 """
 function generic_fit(X,
@@ -50,11 +56,11 @@ function generic_fit(X,
     # 1. Get X column types and names
     feat_names = Tables.schema(X).names
 
-    #2.  Modify column_names based on features 
+    #2.  Modify column_names based on features
     if features isa Symbol
         features = [features]
     end
-    
+
     if features isa AbstractVector{Symbol}
         # Original behavior for vector of symbols
         feat_names =
@@ -94,8 +100,9 @@ end
 """
 **Private method.**
 
-Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if possible,
-feat_name_level_0, feat_name_level_1,..., feat_name_level_n
+Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n or if
+possible, feat_name_level_0, feat_name_level_1,..., feat_name_level_n
+
 """
 function generate_new_feat_names(
     feat_name,
@@ -115,7 +122,8 @@ function generate_new_feat_names(
         suffix = repeat("_", count)
         if use_levelnames
             # Always use the first num_inds level names
-            new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
+            new_column_names =
+                [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
         else
             # Always use numeric indices
             new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ]
@@ -144,34 +152,42 @@ generic_transform(
 ```
 
 
-Apply a per‐level feature mapping to selected categorical columns in `X`, returning a new table of the same type.
+Apply a per‐level feature mapping to selected categorical columns in `X`, returning a new
+table of the same type.
 
 # Arguments
 
 $X_doc
-- `mapping_per_feat_level::Dict{Symbol,Dict}`:
-    A dict whose keys are feature names (`Symbol`) and values are themselves dictionaries 
-    mapping each observed level to either a scalar (if `single_feat=true`) or a fixed‐length vector 
-        (if `single_feat=false`). Only columns whose names appear in `mapping_per_feat_level` are 
-            transformed; others pass through unchanged.
-- `single_feat::Bool=true`:
-    If `true`, each input level is mapped to a single scalar feature; if `false`,
-    each input level is mapped to a length‑`k` vector, producing `k` output columns.
-- `ignore_unknown::Bool=false`:
-    If `false`, novel levels in `X` (not seen during fit) will raise an error; 
-    if `true`, novel levels will be left unchanged (identity mapping).
-- `use_levelnames::Bool=false`:
-    When `single_feat=false`, controls naming of the expanded columns: `true`: use actual level names (e.g. `:color_red`, `:color_blue`), 
-    `false`: use numeric indices (e.g. `:color_1`, `:color_2`).
-- `custom_levels::Union{Nothing,Vector}`:
-    If not `nothing`, overrides the names of levels used to generate feature names when `single_feat=false`.
-- `ensure_categorical::Bool=false`:
-    Only when `single_feat=true` and if `true`, preserves the categorical type of the column after 
-        recoding (eg, feature should still be recognized as `Multiclass` after transformation)
+
+- `mapping_per_feat_level::Dict{Symbol,Dict}`: A dict whose keys are feature names
+   (`Symbol`) and values are themselves dictionaries mapping each observed level to either
+   a scalar (if `single_feat=true`) or a fixed‐length vector (if
+   `single_feat=false`). Only columns whose names appear in `mapping_per_feat_level` are
+   transformed; others pass through unchanged.
+
+- `single_feat::Bool=true`: If `true`, each input level is mapped to a single scalar
+   feature; if `false`, each input level is mapped to a length‑`k` vector, producing `k`
+   output columns.
+
+- `ignore_unknown::Bool=false`: If `false`, novel levels in `X` (not seen during fit) will
+   raise an error; if `true`, novel levels will be left unchanged (identity mapping).
+
+- `use_levelnames::Bool=false`: When `single_feat=false`, controls naming of the expanded
+   columns: `true`: use actual level names (e.g. `:color_red`, `:color_blue`), `false`:
+   use numeric indices (e.g. `:color_1`, `:color_2`).
+
+- `custom_levels::Union{Nothing,Vector}`: If not `nothing`, overrides the names of levels
+   used to generate feature names when `single_feat=false`.
+
+- `ensure_categorical::Bool=false`: Only applies when `single_feat=true`; if `true`,
+   preserves the categorical type of the column after recoding (e.g., the feature should
+   still be recognized as `Multiclass` after transformation).
 
 # Returns
 
-A new table of potentially similar to `X` but with categorical columns transformed according to `mapping_per_feat_level`.
+A new table, potentially similar to `X`, but with categorical columns transformed
+according to `mapping_per_feat_level`.
+
 """
 function generic_transform(
     X,
@@ -191,13 +207,14 @@ function generic_transform(
         if feat_name in keys(mapping_per_feat_level)
             if !ignore_unknown
                 train_levels = keys(mapping_per_feat_level[feat_name])
-                test_levels = levels(col)
+                test_levels = rawlevels(col)
                 # test levels must be a subset of train levels
                 if !issubset(test_levels, train_levels)
                     # get the levels in test that are not in train
                     lost_levels = setdiff(test_levels, train_levels)
                     error(
-                        "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
+                    "While transforming, found novel levels for the column "*
+                        "$(feat_name): $(lost_levels) that were not seen while training.",
                     )
                 end
             end
@@ -206,10 +223,11 @@ function generic_transform(
                 level2scalar = mapping_per_feat_level[feat_name]
                 if ensure_categorical
                     new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
-                else 
-                    new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
+                else
+                    new_col =
+                        !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
                 end
-               
+
                 push!(new_cols, new_col)
                 push!(new_feat_names, feat_name)
             else
@@ -221,7 +239,8 @@ function generic_transform(
                 feat_names_with_inds = generate_new_feat_names(
                     feat_name,
                     length(first(mapping_per_feat_level[feat_name])[2]),
-                    (custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
+                    (custom_levels === nothing) ?
+                        keys(mapping_per_feat_level[feat_name]) : custom_levels,
                     feat_names;
                     use_levelnames = use_levelnames,
                 )

diff --git a/src/transformers/cardinality_reducer/cardinality_reducer.jl b/src/transformers/cardinality_reducer/cardinality_reducer.jl
@@ -46,7 +46,7 @@ function cardinality_reducer_fit(
     # 1. Define feature mapper
     function feature_mapper(col, name)
         val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
-        feat_levels = levels(col)
+        feat_levels = rawlevels(col)
         col_type = eltype(feat_levels)
 
         # Ensure column type is valid (can't test because never occurs)

diff --git a/src/transformers/other_transformers/one_hot_encoder.jl b/src/transformers/other_transformers/one_hot_encoder.jl
@@ -61,7 +61,7 @@ function MMI.fit(transformer::OneHotEncoder, verbosity::Int, X)
         if T <: allowed_scitypes && ftr in specified_features
             ref_name_pairs_given_feature[ftr] = Pair{<:Unsigned,Symbol}[]
             shift = transformer.drop_last ? 1 : 0
-            levels = classes(col)
+            levels = CategoricalArrays.levels(col)
             fitted_levels_given_feature[ftr] = levels
             if verbosity > 0
                 @info "Spawning $(length(levels)-shift) sub-features "*
@@ -136,7 +136,7 @@ function MMI.transform(transformer::OneHotEncoder, fitresult, X)
         col = MMI.selectcols(X, ftr)
         if ftr in features_to_be_transformed
             Set(fitresult.fitted_levels_given_feature[ftr]) ==
-                Set(classes(col)) ||
+                Set(levels(col)) ||
             error("Found category level mismatch in feature `$(ftr)`. "*
             "Consider using `levels!` to ensure fitted and transforming "*
             "features have the same category levels.")
@@ -289,4 +289,4 @@ julia> schema(W)
 See also [`ContinuousEncoder`](@ref).
 
 """
-OneHotEncoder
+OneHotEncoder