
Commit 4744103

✨ Reduce docstring redundancy
1 parent 5358dce commit 4744103


15 files changed: 131 additions & 113 deletions


src/MLJTransforms.jl

Lines changed: 1 addition & 0 deletions

@@ -19,6 +19,7 @@ using OrderedCollections
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
+include("common_docs.jl")
 include("generic.jl")
 include("utils.jl")

src/common_docs.jl

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+const X_doc = """
+- X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
+  `Multiclass` or `OrderedFactor`
+"""
+const X_doc_mlj = """
+- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
+  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
+  check scitypes.
+"""
+const features_doc = """
+- features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
+  according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
+  or a callable that returns true for features to be included/excluded
+"""
+const ignore_doc = """
+- ignore=true: Whether to exclude or include the features given in `features`
+"""
+const ordered_factor_doc = """
+- ordered_factor=false: Whether to encode `OrderedFactor` or ignore them
+"""
+const encoded_features_doc = """
+- encoded_features: The subset of the categorical features of `X` that were encoded
+"""
+const cache_doc = """
+- `cache`: The output of `contrast_encoder_fit`
+"""
+
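For context, here is a minimal, hypothetical sketch (not part of this commit; `my_encoder_fit` is an invented name) of the pattern the new file enables: the shared fragments are spliced into each encoder's docstring with ordinary `$` string interpolation, exactly as the hunks below do.

    const X_doc = """
    - `X`: A table whose categorical features have scitype `Multiclass` or `OrderedFactor`
    """

    # The shared fragment is interpolated into the docstring attached to the function.
    """
        my_encoder_fit(X; ignore = true)

    Toy encoder used only to illustrate docstring interpolation.

    # Arguments
    $X_doc
    """
    function my_encoder_fit(X; ignore = true)
        # encoder-specific fitting logic would live here
    end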

src/encoders/contrast_encoder/contrast_encoder.jl

Lines changed: 33 additions & 23 deletions

@@ -9,13 +9,13 @@ Where `k` is the number of levels in the feature and the returned contrast matri
 """
 ### 1. Dummy Coding
 function get_dummy_contrast(k)
-    return Matrix(1.0I, k, k-1)
+    return Matrix(1.0I, k, k - 1)
 end
 
 
 ### 2. Sum Coding
 function get_sum_contrast(k)
-    C = Matrix(1.0I, k, k-1)
+    C = Matrix(1.0I, k, k - 1)
     C[end, :] .= -1.0
     return C
 end
@@ -26,7 +26,7 @@ function create_backward_vector(index::Int, length::Int)
     vec = ones(length) .* index / length
 
     # [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k]
-    vec[1:index] .= index/length - 1
+    vec[1:index] .= index / length - 1
     return vec
 end
 function get_backward_diff_contrast(k)
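For readers unfamiliar with these coding schemes, the following self-contained sketch reproduces the two helpers above and shows the matrices they return for k = 3 (row i is the contrast vector assigned to level i):

    using LinearAlgebra   # provides the uniform scaling object `I`

    # Dummy coding: k levels map to k - 1 columns; the last level gets the zero vector.
    get_dummy_contrast(k) = Matrix(1.0I, k, k - 1)

    # Sum coding: identical, except the last level gets a row of -1s.
    function get_sum_contrast(k)
        C = Matrix(1.0I, k, k - 1)
        C[end, :] .= -1.0
        return C
    end

    get_dummy_contrast(3)   # [1.0 0.0; 0.0 1.0; 0.0 0.0]
    get_sum_contrast(3)     # [1.0 0.0; 0.0 1.0; -1.0 -1.0]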
@@ -61,21 +61,21 @@ Fit a contrast encoing scheme on given data in `X`.
 
 # Arguments
 
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+$X_doc
+$features_doc
 - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
-  If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
-  contrast encoding scheme for each feature
-- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
-  where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
-  hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+  If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
+  contrast encoding scheme for each feature
+- `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
+  where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
+  hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
+$ignore_doc
+$ordered_factor_doc
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
 - `vec_given_feat_level`: Maps each level for each column in the selected categorical features to a vector
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 """
 function contrast_encoder_fit(
     X,
@@ -90,9 +90,10 @@ function contrast_encoder_fit(
     if mode isa Vector{Symbol}
         mode_is_vector = true
         ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
-        length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
+        length(features) == length(mode) ||
+            throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
     end
-
+
     # buildmatrix should be specified if mode is :contrast or :hypothesis
     if mode in (:contrast, :hypothesis)
         buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
@@ -105,11 +106,13 @@
         k = length(feat_levels)
         feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
         if feat_mode == :contrast
-            contrastmatrix = buildmatrix(name, k)
-            size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
+            contrastmatrix = buildmatrix(name, k)
+            size(contrastmatrix) == (k, k - 1) ||
+                throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
         elseif feat_mode == :hypothesis
-            hypothesismatrix = buildmatrix(name, k)
-            size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
+            hypothesismatrix = buildmatrix(name, k)
+            size(hypothesismatrix) == (k - 1, k) ||
+                throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
             contrastmatrix = pinv(hypothesismatrix)
         elseif feat_mode == :dummy
             contrastmatrix = get_dummy_contrast(k)
@@ -125,7 +128,9 @@
             throw(ArgumentError("Mode $feat_mode is not supported."))
         end
 
-        vector_given_value_given_feature = OrderedDict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
+        vector_given_value_given_feature = OrderedDict(
+            level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
+        )
         return vector_given_value_given_feature
     end
 
@@ -158,5 +163,10 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
 """
 function contrast_encoder_transform(X, cache::NamedTuple)
     vector_given_value_given_feature = cache.vector_given_value_given_feature
-    return generic_transform(X, vector_given_value_given_feature, single_feat = false; use_levelnames = true)
-end
+    return generic_transform(
+        X,
+        vector_given_value_given_feature,
+        single_feat = false;
+        use_levelnames = true,
+    )
+end

src/encoders/contrast_encoder/interface_mlj.jl

Lines changed: 5 additions & 7 deletions

@@ -86,23 +86,21 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
-  check scitypes.
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+$features_doc
 - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
   If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
   contrast encoding scheme for each feature
 - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
   where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or
   hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$ignore_doc
+$ordered_factor_doc
 
 # Operations
 
@@ -120,7 +118,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples
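Since the MLJ interface files in this commit all share the same docstring skeleton, here is one hedged usage sketch of the workflow those docstrings describe. The constructor name `ContrastEncoder` and the example data are assumptions for illustration (the constructor is not visible in these hunks); only the machine/`fit!`/`transform`/`report` pattern is taken from the docstring itself.

    using MLJ, MLJTransforms
    using DataFrames, CategoricalArrays

    X = DataFrame(grade = categorical(["A", "B", "C", "A"]), n = [1.0, 2.0, 3.0, 4.0])

    model = ContrastEncoder(mode = :dummy)   # assumed constructor name; `mode` is a documented hyper-parameter
    mach = machine(model, X)                 # bind the unsupervised model to the data
    fit!(mach)                               # train
    Xnew = transform(mach, X)                # `grade` is replaced by k - 1 contrast columns
    report(mach).encoded_features            # the encoded feature subset reported above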

src/encoders/frequency_encoding/frequency_encoding.jl

Lines changed: 10 additions & 10 deletions

@@ -7,16 +7,16 @@ categorical features with their (normalized or raw) frequencies of occurrence in
 
 # Arguments
 
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$X_doc
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
 - `statistic_given_feat_val`: The frequency of each level of each selected categorical feature
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 """
 function frequency_encoder_fit(
     X,
@@ -39,11 +39,11 @@ function frequency_encoder_fit(
     # 2. Pass it to generic_fit
     statistic_given_feat_val, encoded_features = generic_fit(
         X, features; ignore = ignore, ordered_factor = ordered_factor,
-        feature_mapper = feature_mapper,)
-
+        feature_mapper = feature_mapper)
+
     cache = (
-        statistic_given_feat_val = statistic_given_feat_val,
-        encoded_features = encoded_features,
+        statistic_given_feat_val = statistic_given_feat_val,
+        encoded_features = encoded_features,
     )
     return cache
 end
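For intuition, a small stand-alone sketch of the statistic this encoder fits; it mirrors the `normalize` option described above and is not the package's actual `feature_mapper`:

    using StatsBase: countmap

    col = ["a", "a", "b", "a", "c"]
    counts = countmap(col)                                          # Dict("a" => 3, "b" => 1, "c" => 1)
    raw        = Dict(level => Float32(cnt)               for (level, cnt) in counts)
    normalized = Dict(level => Float32(cnt / length(col)) for (level, cnt) in counts)
    # raw:        "a" => 3.0f0, "b" => 1.0f0, "c" => 1.0f0
    # normalized: "a" => 0.6f0, "b" => 0.2f0, "c" => 0.2f0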

src/encoders/frequency_encoding/interface_mlj.jl

Lines changed: 6 additions & 8 deletions

@@ -86,18 +86,16 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
-  check scitypes.
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or include the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
-- `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
+$features_doc
+$ignore_doc
+$ordered_factor_doc
+- ` normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
 - `output_type=Float32`: The type of the output values. The default is `Float32`, but you can set it to `Float64` or any other type that can hold the frequency values.
 
 # Operations
@@ -116,7 +114,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples

src/encoders/missingness_encoding/interface_mlj.jl

Lines changed: 5 additions & 7 deletions

@@ -90,17 +90,15 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
-  check scitypes.
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
   dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
   signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
@@ -123,7 +121,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples

src/encoders/missingness_encoding/missingness_encoding.jl

Lines changed: 6 additions & 7 deletions

@@ -9,21 +9,20 @@ types that are in `Char`, `AbstractString`, and `Number`.
 
 # Arguments
 
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/)
-  `Multiclass` or `OrderedFactor`
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$X_doc
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `label_for_missing::Dict{<:Type, <:Any}()= Dict( AbstractString => "missing", Char => 'm', )`: A
   dictionary where the possible values for keys are the types in `Char`, `AbstractString`, and `Number` and where each value
   signifies the new level to map into given a column raw super type. By default, if the raw type of the column subtypes `AbstractString`
   then missing values will be replaced with `"missing"` and if the raw type subtypes `Char` then the new value is `'m'`
   and if the raw type subtypes `Number` then the new value is the lowest value in the column - 1.
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
 - `label_for_missing_given_feature`: A dictionary that for each column, maps `missing` into some value according to `label_for_missing`
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 """
 function missingness_encoder_fit(
     X,
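To make the default `label_for_missing` behaviour concrete, a tiny illustrative sketch (not the package's code): a string-valued column has its `missing` entries replaced by the label registered for `AbstractString`:

    label_for_missing = Dict(AbstractString => "missing", Char => 'm')

    col = ["red", missing, "blue"]                    # raw element type subtypes AbstractString
    replacement = label_for_missing[AbstractString]   # "missing"
    encoded = map(x -> ismissing(x) ? replacement : x, col)
    # ["red", "missing", "blue"]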

src/encoders/ordinal_encoding/interface_mlj.jl

Lines changed: 5 additions & 7 deletions

@@ -82,17 +82,15 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-  have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to
-  check scitypes.
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `output_type`: The numerical concrete type of the encoded features. Default is `Float32`.
 
 # Operations
@@ -111,7 +109,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples

src/encoders/ordinal_encoding/ordinal_encoding.jl

Lines changed: 8 additions & 9 deletions

@@ -5,14 +5,13 @@
 Fit an encoder to encode the levels of categorical variables in a given table as integers (ordered arbitrarily).
 
 # Arguments
-
-- `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$X_doc
+$features_doc
+$ignore_doc
+$ordered_factor_doc
 - `dtype`: The numerical concrete type of the encoded features. Default is `Float32`.
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
 - `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
 - `encoded_features`: The subset of the categorical features of X that were encoded
@@ -37,10 +36,10 @@ function ordinal_encoder_fit(
     # 2. Pass it to generic_fit
     index_given_feat_level, encoded_features = generic_fit(
         X, features; ignore = ignore, ordered_factor = ordered_factor,
-        feature_mapper = feature_mapper,)
+        feature_mapper = feature_mapper)
     cache = (
-        index_given_feat_level = index_given_feat_level,
-        encoded_features = encoded_features,
+        index_given_feat_level = index_given_feat_level,
+        encoded_features = encoded_features,
     )
     return cache
 end
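For intuition, a stand-alone sketch of the mapping this encoder fits; the level order is arbitrary, as the docstring notes, and the real implementation goes through `generic_fit` and honours `dtype`:

    col = ["low", "high", "medium", "low"]
    lvls = sort(unique(col))                              # one arbitrary but reproducible ordering
    index_given_level = Dict(l => Float32(i) for (i, l) in enumerate(lvls))
    encoded = [index_given_level[x] for x in col]         # Float32[2.0, 1.0, 3.0, 2.0]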

0 commit comments
