JuliaAI · ablaom · May 31, 2025 · May 27, 2025 · May 27, 2025
diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl
@@ -19,6 +19,7 @@ using OrderedCollections
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
+include("common_docs.jl")
 include("generic.jl")
 include("utils.jl")
 

diff --git a/src/common_docs.jl b/src/common_docs.jl
@@ -0,0 +1,27 @@
+const X_doc = """
+- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
+    `Multiclass` or `OrderedFactor`
+"""
+const X_doc_mlj = """
+- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
+   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to 
+   check scitypes. 
+"""
+const features_doc = """
+- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding,
+  according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
+  or a callable that returns true for features to be included/excluded
+"""
+const ignore_doc = """
+- `ignore=true`: Whether to exclude or include the features given in `features`
+"""
+const ordered_factor_doc = """
+- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+"""
+const encoded_features_doc = """
+- `encoded_features`: The subset of the categorical features of `X` that were encoded
+"""
+const cache_doc = """
+- `cache`: The output of `contrast_encoder_fit`
+"""
+
diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl
@@ -9,13 +9,13 @@ Where `k` is the number of levels in the feature and the returned contrast matri
 """
 ### 1. Dummy Coding
 function get_dummy_contrast(k)
-    return Matrix(1.0I, k, k-1)
+    return Matrix(1.0I, k, k - 1)
 end
 
 
 ### 2. Sum Coding
 function get_sum_contrast(k)
-    C = Matrix(1.0I, k, k-1)
+    C = Matrix(1.0I, k, k - 1)
     C[end, :] .= -1.0
     return C
 end
@@ -26,7 +26,7 @@ function create_backward_vector(index::Int, length::Int)
     vec = ones(length) .* index / length
 
     # [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k]
-    vec[1:index] .= index/length - 1
+    vec[1:index] .= index / length - 1
     return vec
 end
 function get_backward_diff_contrast(k)
@@ -61,21 +61,21 @@ Fit a contrast encoing scheme on given data in `X`.
 
 # Arguments
 
-  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-  - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+  $X_doc
+  $features_doc
   - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
-  If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
-  contrast encoding scheme for each feature
-  - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, 
-  where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or 
-  hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-  - `ignore=true`: Whether to exclude or includes the features given in `features`
-  - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+    If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
+    contrast encoding scheme for each feature
+  - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
+    where `colname` is the name of the feature and `k` is the number of its levels, and which returns a contrast or
+    hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
+  $ignore_doc
+  $ordered_factor_doc
 
-# Returns (in a dict)
+# Returns (in a named tuple)
 
   - `vec_given_feat_level`: Maps each level for each column in the selected categorical features to a vector
-  - `encoded_features`: The subset of the categorical features of X that were encoded
+  $encoded_features_doc
 """
 function contrast_encoder_fit(
     X,
@@ -90,9 +90,10 @@ function contrast_encoder_fit(
     if mode isa Vector{Symbol}
         mode_is_vector = true
         ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
-        length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
+        length(features) == length(mode) ||
+            throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
     end
-    
+
     # buildmatrix should be specified if mode is :contrast or :hypothesis
     if mode in (:contrast, :hypothesis)
         buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
@@ -105,11 +106,13 @@ function contrast_encoder_fit(
         k = length(feat_levels)
         feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
         if feat_mode == :contrast
-            contrastmatrix = buildmatrix(name, k)            
-            size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
+            contrastmatrix = buildmatrix(name, k)
+            size(contrastmatrix) == (k, k - 1) ||
+                throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
         elseif feat_mode == :hypothesis
-            hypothesismatrix = buildmatrix(name, k) 
-            size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
+            hypothesismatrix = buildmatrix(name, k)
+            size(hypothesismatrix) == (k - 1, k) ||
+                throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
             contrastmatrix = pinv(hypothesismatrix)
         elseif feat_mode == :dummy
             contrastmatrix = get_dummy_contrast(k)
@@ -125,7 +128,9 @@ function contrast_encoder_fit(
             throw(ArgumentError("Mode $feat_mode is not supported."))
         end
 
-        vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
+        vector_given_value_given_feature = OrderedDict(
+            level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
+        )
         return vector_given_value_given_feature
     end
 
@@ -134,10 +139,9 @@ function contrast_encoder_fit(
         X, features; ignore = ignore, ordered_factor = ordered_factor,
         feature_mapper = feature_mapper,
     )
-
-    cache = Dict(
-        :vector_given_value_given_feature  => vector_given_value_given_feature,
-        :encoded_features => encoded_features,
+    cache = (
+        vector_given_value_given_feature = vector_given_value_given_feature,
+        encoded_features = encoded_features,
     )
 
     return cache
@@ -157,7 +161,12 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
 
   - `X_tr`: The table with selected features after the selected features are encoded by contrast encoding.
 """
-function contrast_encoder_transform(X, cache::Dict)
-    vector_given_value_given_feature = cache[:vector_given_value_given_feature]
-    return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true)
-end
+function contrast_encoder_transform(X, cache::NamedTuple)
+    vector_given_value_given_feature = cache.vector_given_value_given_feature
+    return generic_transform(
+        X,
+        vector_given_value_given_feature,
+        single_feat = false;
+        use_levelnames = true,
+    )
+end
diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl
@@ -36,19 +36,18 @@ function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X)
         buildmatrix = transformer.buildmatrix,
         ordered_factor = transformer.ordered_factor,
     )
-    fitresult = generic_cache[:vector_given_value_given_feature]
+    fitresult = generic_cache.vector_given_value_given_feature
 
-    report = (encoded_features = generic_cache[:encoded_features],)        # report only has list of encoded features
+    report = (encoded_features = generic_cache.encoded_features,)        # report only has list of encoded features
     cache = nothing
     return fitresult, cache, report
 end;
 
 
 # 6. Transform method
 function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew)
-    generic_cache = Dict(
-        :vector_given_value_given_feature =>
-            fitresult,
+    generic_cache = (
+        vector_given_value_given_feature = fitresult,
     )
     Xnew_transf = contrast_encoder_transform(Xnew, generic_cache)
     return Xnew_transf
@@ -87,23 +86,21 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to 
-   check scitypes. 
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+$features_doc
 - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
 If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
 contrast encoding scheme for each feature
 - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, 
 where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or 
 hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$ignore_doc
+$ordered_factor_doc
 
 # Operations
 
@@ -121,7 +118,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples
 

diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl
@@ -7,16 +7,16 @@ categorical features with their (normalized or raw) frequencies of occurrence in
 
 # Arguments
 
-  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-  - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-  - `ignore=true`: Whether to exclude or includes the features given in `features`
-  - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+  $X_doc
+  $features_doc
+  $ignore_doc
+  $ordered_factor_doc
   - `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
 
-# Returns (in a dict)
+# Returns (in a named tuple)
 
   - `statistic_given_feat_val`: The frequency of each level of each selected categorical feature
-  - `encoded_features`: The subset of the categorical features of X that were encoded
+  $encoded_features_doc
 """
 function frequency_encoder_fit(
     X,
@@ -39,11 +39,11 @@ function frequency_encoder_fit(
     # 2. Pass it to generic_fit
     statistic_given_feat_val, encoded_features = generic_fit(
         X, features; ignore = ignore, ordered_factor = ordered_factor,
-        feature_mapper = feature_mapper,
-    )
-    cache = Dict(
-        :statistic_given_feat_val => statistic_given_feat_val,
-        :encoded_features => encoded_features,
+        feature_mapper = feature_mapper)
+
+    cache = (
+        statistic_given_feat_val = statistic_given_feat_val,
+        encoded_features = encoded_features,
     )
     return cache
 end
@@ -62,7 +62,7 @@ Encode the levels of a categorical variable in a given table with their (normali
 
   - `X_tr`: The table with selected features after the selected features are encoded by frequency encoding.
 """
-function frequency_encoder_transform(X, cache::Dict)
-    statistic_given_feat_val = cache[:statistic_given_feat_val]
+function frequency_encoder_transform(X, cache::NamedTuple)
+    statistic_given_feat_val = cache.statistic_given_feat_val
     return generic_transform(X, statistic_given_feat_val)
 end
diff --git a/src/encoders/frequency_encoding/interface_mlj.jl b/src/encoders/frequency_encoding/interface_mlj.jl
@@ -36,19 +36,18 @@ function MMI.fit(transformer::FrequencyEncoder, verbosity::Int, X)
         normalize = transformer.normalize,
         output_type = transformer.output_type,
     )
-    fitresult = generic_cache[:statistic_given_feat_val]
+    fitresult = generic_cache.statistic_given_feat_val
 
-    report = (encoded_features = generic_cache[:encoded_features],)        # report only has list of encoded features
+    report = (encoded_features = generic_cache.encoded_features,)        # report only has list of encoded features
     cache = nothing
     return fitresult, cache, report
 end;
 
 
 # 6. Transform method
 function MMI.transform(transformer::FrequencyEncoder, fitresult, Xnew)
-    generic_cache = Dict(
-        :statistic_given_feat_val =>
-            fitresult,
+    generic_cache = (
+        statistic_given_feat_val = fitresult,
     )
     Xnew_transf = frequency_encoder_transform(Xnew, generic_cache)
     return Xnew_transf
@@ -87,18 +86,16 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to 
-   check scitypes. 
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or include the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
-- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
+$features_doc
+$ignore_doc
+$ordered_factor_doc
+- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
 - `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values.
 
 # Operations
@@ -117,7 +114,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples
 

diff --git a/src/encoders/missingness_encoding/interface_mlj.jl b/src/encoders/missingness_encoding/interface_mlj.jl
@@ -39,19 +39,18 @@ function MMI.fit(transformer::MissingnessEncoder, verbosity::Int, X)
         ordered_factor = transformer.ordered_factor,
         label_for_missing = transformer.label_for_missing,
     )
-    fitresult = generic_cache[:label_for_missing_given_feature]
+    fitresult = generic_cache.label_for_missing_given_feature
 
-    report = (encoded_features = generic_cache[:encoded_features],)        # report only has list of encoded features
+    report = (encoded_features = generic_cache.encoded_features,)        # report only has list of encoded features
     cache = nothing
     return fitresult, cache, report
 end;
 
 
 # 6. Transform method
 function MMI.transform(transformer::MissingnessEncoder, fitresult, Xnew)
-    generic_cache = Dict(
-        :label_for_missing_given_feature =>
-            fitresult,
+    generic_cache = (
+        label_for_missing_given_feature = fitresult,
     )
     Xnew_transf = missingness_encoder_transform(Xnew, generic_cache)
     return Xnew_transf
@@ -91,17 +90,15 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to 
-   check scitypes. 
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
 dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
 signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
@@ -124,7 +121,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples