JuliaAI · ablaom · Sep 23, 2025 · Aug 18, 2025 · Sep 1, 2025 · Sep 1, 2025
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -30,7 +30,7 @@ jobs:
         arch:
           - x64
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
       - uses: julia-actions/setup-julia@v2
         with:
           version: ${{ matrix.version }}

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJTransforms"
 uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
 authors = ["Essam <[email protected]> and contributors"]
-version = "0.1.1"
+version = "0.1.2"
 
 [deps]
 BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"

diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl
@@ -5,7 +5,7 @@ using Tables
 # https://github.com/JuliaAI/MLJBase.jl/issues/1002
 import ScientificTypes: elscitype, schema, coerce, ScientificTimeType
 using MLJModelInterface # exports `scitype`, which will call `ScientificTypes.scitype`,
-                        # once MLJBase is loaded (but this is not a dependency!)
+# once MLJBase is loaded (but this is not a dependency!)
 using CategoricalArrays
 using TableOperations
 using StatsBase
@@ -29,27 +29,27 @@ include("utils.jl")
 include("encoders/target_encoding/errors.jl")
 include("encoders/target_encoding/target_encoding.jl")
 include("encoders/target_encoding/interface_mlj.jl")
-export  TargetEncoder
+export TargetEncoder
 
 # Ordinal encoding
 include("encoders/ordinal_encoding/ordinal_encoding.jl")
 include("encoders/ordinal_encoding/interface_mlj.jl")
-export  OrdinalEncoder
+export OrdinalEncoder
 
 # Frequency encoding
 include("encoders/frequency_encoding/frequency_encoding.jl")
 include("encoders/frequency_encoding/interface_mlj.jl")
 export frequency_encoder_fit, frequency_encoder_transform, FrequencyEncoder
-export  FrequencyEncoder
+export FrequencyEncoder
 
 # Cardinality reduction
 include("transformers/cardinality_reducer/cardinality_reducer.jl")
 include("transformers/cardinality_reducer/interface_mlj.jl")
 export cardinality_reducer_fit, cardinality_reducer_transform, CardinalityReducer
-export  CardinalityReducer
+export CardinalityReducer
 include("encoders/missingness_encoding/missingness_encoding.jl")
 include("encoders/missingness_encoding/interface_mlj.jl")
-export  MissingnessEncoder
+export MissingnessEncoder
 
 # Contrast encoder
 include("encoders/contrast_encoder/contrast_encoder.jl")
@@ -65,7 +65,6 @@ include("transformers/other_transformers/one_hot_encoder.jl")
 include("transformers/other_transformers/standardizer.jl")
 include("transformers/other_transformers/univariate_boxcox_transformer.jl")
 include("transformers/other_transformers/univariate_discretizer.jl")
-include("transformers/other_transformers/metadata_shared.jl")
 
 export UnivariateDiscretizer,
     UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer,

diff --git a/src/common_docs.jl b/src/common_docs.jl
@@ -1,16 +1,18 @@
 const X_doc = """
-- X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) 
-    `Multiclass` or `OrderedFactor`
+- X: A table where the elements of the categorical features have
+  [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or
+  `OrderedFactor`
 """
 const X_doc_mlj = """
 - `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to 
-   check scitypes. 
+   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
+   check scitypes.
 """
 const features_doc = """
-- features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
-  according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
-  or a callable that returns true for features to be included/excluded
+- features=[]: A list of names of categorical features given as symbols to exclude or
+  include from encoding, according to the value of `ignore`, or a single symbol (which is
+  treated as a vector with one symbol), or a callable that returns true for features to be
+  included/excluded.
 """
 const ignore_doc = """
 - ignore=true: Whether to exclude or include the features given in `features`
@@ -24,4 +26,3 @@ const encoded_features_doc = """
 const cache_doc = """
 - `cache`: The output of `contrast_encoder_fit`
 """
-
diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl
@@ -73,10 +73,10 @@ MMI.metadata_model(
 """
 $(MMI.doc_header(ContrastEncoder))
 
-`ContrastEncoder` implements the following contrast encoding methods for 
-categorical features: dummy, sum, backward/forward difference, and Helmert coding. 
-More generally, users can specify a custom contrast or hypothesis matrix, and each feature 
-can be encoded using a different method.
+`ContrastEncoder` implements the following contrast encoding methods for categorical
+features: dummy, sum, backward/forward difference, and Helmert coding.  More generally,
+users can specify a custom contrast or hypothesis matrix, and each feature can be encoded
+using a different method.
 
 # Training data
 
@@ -93,26 +93,36 @@ Train the machine using `fit!(mach, rows=...)`.
 # Hyper-parameters
 
 $features_doc
-- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
-If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
-contrast encoding scheme for each feature
-- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, 
-where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or 
-hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
+
+- `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`,
+  `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`. If `ignore=false`
+  (features to be encoded are listed explicitly in `features`), then this can be a vector
+  of the same length as `features` to specify a different contrast encoding scheme for
+  each feature.
+
+- `buildmatrix=nothing`: A function or other callable with signature
+  `buildmatrix(colname, k)`, where `colname` is the name of the feature levels and `k` is
+  its length, and which returns a contrast or hypothesis matrix with row/column ordering
+  consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or
+  `:hypothesis`.
+
 $ignore_doc
+
 $ordered_factor_doc
 
 # Operations
 
-- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or `OrderedFactor features of `Xnew` specified by hyper-parameters, and 
-   return the new table. Features that are neither `Multiclass` nor `OrderedFactor`
-   are always left unchanged.
+- `transform(mach, Xnew)`: Apply contrast encoding to selected `Multiclass` or
+  `OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new
+  table. Features that are neither `Multiclass` nor `OrderedFactor` are always left
+  unchanged.
 
 # Fitted parameters
 
 The fields of `fitted_params(mach)` are:
 
-- `vector_given_value_given_feature`: A dictionary that maps each level for each column in a subset of the categorical features of X into its frequency.
+- `vector_given_value_given_feature`: A dictionary that maps each level for each column in
+  a subset of the categorical features of `X` into its contrast-encoding vector.
 
 # Report
 
@@ -138,7 +148,7 @@ schema(X)
 
 encoder =  ContrastEncoder(
     features = [:name, :favnum],
-    ignore = false, 
+    ignore = false,
     mode = [:dummy, :helmert],
 )
 mach = fit!(machine(encoder, X))
@@ -157,4 +167,4 @@ julia > Xnew
 See also
 [`OneHotEncoder`](@ref)
 """
-ContrastEncoder
+ContrastEncoder
diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl
@@ -77,10 +77,10 @@ MMI.metadata_model(
 """
 $(MMI.doc_header(MissingnessEncoder))
 
-`MissingnessEncoder` maps any missing level of a categorical feature into a new level (e.g., "Missing"). 
-By this, missingness will be treated as a new
-level by any subsequent model. This assumes that the categorical features have raw
-types that are in `Char`, `AbstractString`, and `Number`.
+`MissingnessEncoder` maps any missing level of a categorical feature into a new level
+(e.g., "Missing"). This way, missingness will be treated as a new level by any subsequent
+model. This assumes that the categorical features have raw types that subtype `Char`,
+`AbstractString`, or `Number`.
 
 # Training data
 
@@ -97,25 +97,32 @@ Train the machine using `fit!(mach, rows=...)`.
 # Hyper-parameters
 
 $features_doc
+
 $ignore_doc
+
 $ordered_factor_doc
-- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
-dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
-signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
-then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
-and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
+
+- `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char =>
+  'm', )`: A dictionary where the possible values for keys are the types in `Char`,
+  `AbstractString`, and `Number` and where each value signifies the new level to map into
+  given a column raw super type. By default, if the raw type of the column subtypes
+  `AbstractString` then missing values will be replaced with `"missing"` and if the raw
+  type subtypes `Char` then the new value is `'m'` and if the raw type subtypes `Number`
+  then the new value is the lowest value in the column - 1.
 
 # Operations
 
-- `transform(mach, Xnew)`: Apply cardinality reduction to selected `Multiclass` or `OrderedFactor` features of `Xnew` specified by hyper-parameters, and 
-   return the new table.   Features that are neither `Multiclass` nor `OrderedFactor`
-   are always left unchanged.
+- `transform(mach, Xnew)`: Apply missingness encoding to selected `Multiclass` or
+  `OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new
+  table.  Features that are neither `Multiclass` nor `OrderedFactor` are always left
+  unchanged.
 
 # Fitted parameters
 
 The fields of `fitted_params(mach)` are:
 
-- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
+- `label_for_missing_given_feature`: A dictionary that for each column, maps `missing`
+  into some value according to `label_for_missing`
 
 # Report
 
@@ -154,4 +161,4 @@ julia> Xnew
 See also
 [`CardinalityReducer`](@ref)
 """
-MissingnessEncoder
+MissingnessEncoder
diff --git a/src/encoders/target_encoding/interface_mlj.jl b/src/encoders/target_encoding/interface_mlj.jl
@@ -49,7 +49,7 @@ struct TargetEncoderResult{
 } <: MMI.MLJType
     # target statistic for each level of each categorical feature
     y_stat_given_feat_level::Dict{A, A}
-    task::S            # "Regression", "Classification" 
+    task::S            # "Regression", "Classification"
     num_classes::I     # num_classes in case of classification
     y_classes::A      # y_classes in case of classification
 
@@ -120,7 +120,7 @@ MMI.target_in_fit(::Type{<:TargetEncoder}) = true
 """
 $(MMI.doc_header(TargetEncoder))
 
-`TargetEncoder` implements target encoding as defined in [1] to encode categorical variables 
+`TargetEncoder` implements target encoding as defined in [1] to encode categorical variables
     into continuous ones using statistics from the target variable.
 
 # Training data
@@ -133,34 +133,42 @@ Here:
 
 $X_doc_mlj
 
-- `y` is the target, which can be any `AbstractVector` whose element
-  scitype is `Continuous` or `Count` for regression problems and 
-  `Multiclass` or `OrderedFactor` for classification problems; check the scitype with `schema(y)`
+- `y` is the target, which can be any `AbstractVector` whose element scitype is
+  `Continuous` or `Count` for regression problems and `Multiclass` or `OrderedFactor` for
+  classification problems; check the scitype with `scitype(y)`.
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
 $features_doc
+
 $ignore_doc
+
 $ordered_factor_doc
-- `λ`: Shrinkage hyperparameter used to mix between posterior and prior statistics as described in [1]
-- `m`: An integer hyperparameter to compute shrinkage as described in [1]. If `m=:auto` then m will be computed using
- empirical Bayes estimation as described in [1]
+
+- `λ`: Shrinkage hyperparameter used to mix between posterior and prior statistics as
+  described in [1]
+
+- `m`: An integer hyperparameter to compute shrinkage as described in [1]. If `m=:auto`
+  then `m` will be computed using empirical Bayes estimation as described in [1].
 
 # Operations
 
-- `transform(mach, Xnew)`: Apply target encoding to selected `Multiclass` or `OrderedFactor features of `Xnew` specified by hyper-parameters, and 
-   return the new table.   Features that are neither `Multiclass` nor `OrderedFactor`
-   are always left unchanged.
+- `transform(mach, Xnew)`: Apply target encoding to selected `Multiclass` or
+  `OrderedFactor` features of `Xnew` specified by hyper-parameters, and return the new
+  table.  Features that are neither `Multiclass` nor `OrderedFactor` are always left
+  unchanged.
 
 # Fitted parameters
 
 The fields of `fitted_params(mach)` are:
 
 - `task`: Whether the task is `Classification` or `Regression`
-- `y_statistic_given_feat_level`: A dictionary with the necessary statistics to encode each categorical feature. It maps each 
-    level in each categorical feature to a statistic computed over the target.
+
+- `y_statistic_given_feat_level`: A dictionary with the necessary statistics to encode
+  each categorical feature. It maps each level in each categorical feature to a statistic
+  computed over the target.
 
 # Report
 
@@ -174,13 +182,13 @@ $encoded_features_doc
 using MLJ
 
 # Define categorical features
-A = ["g", "b", "g", "r", "r",]  
+A = ["g", "b", "g", "r", "r",]
 B = [1.0, 2.0, 3.0, 4.0, 5.0,]
-C = ["f", "f", "f", "m", "f",]  
+C = ["f", "f", "f", "m", "f",]
 D = [true, false, true, false, true,]
 E = [1, 2, 3, 4, 5,]
 
-# Define the target variable 
+# Define the target variable
 y = ["c1", "c2", "c3", "c1", "c2",]
 
 # Combine into a named tuple
@@ -219,11 +227,11 @@ julia > schema(Xnew)
 ```
 
 # Reference
-[1] Micci-Barreca, Daniele. 
-    “A preprocessing scheme for high-cardinality categorical attributes in classification and prediction problems” 
+[1] Micci-Barreca, Daniele.
+    “A preprocessing scheme for high-cardinality categorical attributes in classification and prediction problems”
     SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32.
 
 See also
 [`OneHotEncoder`](@ref)
 """
-TargetEncoder
+TargetEncoder
diff --git a/src/generic.jl b/src/generic.jl
@@ -59,7 +59,7 @@ function generic_fit(X,
     # 4. Use feature mapper to compute the mapping of each level in each column
     encoded_features = Symbol[]# to store column that were actually encoded
     for feat_name in feat_names
-        feat_col = Tables.getcolumn(X, feat_name)
+        feat_col = MMI.selectcols(X, feat_name)
         feat_type = elscitype(feat_col)
         feat_has_allowed_type =
             feat_type <: Union{Missing, Multiclass} ||
@@ -149,7 +149,7 @@ function generic_transform(
     new_feat_names = Symbol[]
     new_cols = []
     for feat_name in feat_names
-        col = Tables.getcolumn(X, feat_name)
+        col = MMI.selectcols(X, feat_name)
         # Create the transformation function for each column
         if feat_name in keys(mapping_per_feat_level)
             if !ignore_unknown