JuliaAI · EssamWisam · Jun 18, 2025 · Feb 3, 2025 · Feb 7, 2025 · May 12, 2025
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -24,8 +24,7 @@ jobs:
       matrix:
         version:
           - '1.10'
-          - '1.6'
-          - 'nightly'
+          - '1'
         os:
           - ubuntu-latest
         arch:

diff --git a/.gitignore b/.gitignore
@@ -27,3 +27,4 @@ meh/*.ipynb
 .DS_Store
 /*.jl
 scratchpad/
+examples/test.jl
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJTransforms"
 uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
 authors = ["Essam <[email protected]> and contributors"]
-version = "1.0.0-DEV"
+version = "0.1.0"
 
 [deps]
 BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
@@ -20,13 +20,21 @@ TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
+BitBasis = "0.9"
 CategoricalArrays = "0.10"
 MLJModelInterface = "1.11"
+Combinatorics = "1"
+Dates = "1"
+Distributions = "0.25"
+LinearAlgebra = "1"
+OrderedCollections = "1"
+Parameters = "0.12"
 ScientificTypes = "3.0"
+Statistics = "1"
 StatsBase = "0.34"
 TableOperations = "1.2"
 Tables = "1.11"
-julia = "1.6.7"
+julia = "1.10"
 
 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"

diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl
@@ -7,7 +7,7 @@ using MLJModelInterface
 using TableOperations
 using StatsBase
 using LinearAlgebra
-
+using OrderedCollections: OrderedDict
 # Other transformers
 using Combinatorics
 import Distributions
@@ -19,6 +19,7 @@ using OrderedCollections
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
+include("common_docs.jl")
 include("generic.jl")
 include("utils.jl")
 

diff --git a/src/common_docs.jl b/src/common_docs.jl
@@ -0,0 +1,27 @@
+const X_doc = """
+- X: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) 
+    `Multiclass` or `OrderedFactor`
+"""
+const X_doc_mlj = """
+- `X` is any table of input features (e.g., a `DataFrame`). Features to be transformed must
+   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to 
+   check scitypes. 
+"""
+const features_doc = """
+- features=[]: A list of names of categorical features given as symbols to exclude or include from encoding,
+  according to the value of `ignore`, or a single symbol (which is treated as a vector with one symbol),
+  or a callable that returns true for features to be included/excluded
+"""
+const ignore_doc = """
+- ignore=true: Whether to exclude or include the features given in `features`
+"""
+const ordered_factor_doc = """
+- ordered_factor=false: Whether to encode `OrderedFactor` or ignore them
+"""
+const encoded_features_doc = """
+- encoded_features: The subset of the categorical features of `X` that were encoded
+"""
+const cache_doc = """
+- `cache`: The output of `contrast_encoder_fit`
+"""
+
diff --git a/src/encoders/contrast_encoder/contrast_encoder.jl b/src/encoders/contrast_encoder/contrast_encoder.jl
@@ -9,13 +9,13 @@ Where `k` is the number of levels in the feature and the returned contrast matri
 """
 ### 1. Dummy Coding
 function get_dummy_contrast(k)
-    return Matrix(1.0I, k, k-1)
+    return Matrix(1.0I, k, k - 1)
 end
 
 
 ### 2. Sum Coding
 function get_sum_contrast(k)
-    C = Matrix(1.0I, k, k-1)
+    C = Matrix(1.0I, k, k - 1)
     C[end, :] .= -1.0
     return C
 end
@@ -26,7 +26,7 @@ function create_backward_vector(index::Int, length::Int)
     vec = ones(length) .* index / length
 
     # [ -(k-i)/k -(k-i)/k -(k-i)/k .. i/k i/k]
-    vec[1:index] .= index/length - 1
+    vec[1:index] .= index / length - 1
     return vec
 end
 function get_backward_diff_contrast(k)
@@ -61,25 +61,25 @@ Fit a contrast encoing scheme on given data in `X`.
 
 # Arguments
 
-  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-  - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+    $X_doc
+    $features_doc
   - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
-  If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
-  contrast encoding scheme for each feature
-  - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, 
-  where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or 
-  hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-  - `ignore=true`: Whether to exclude or includes the features given in `features`
-  - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+    If `ignore=false` (features to be encoded are listed explicitly in `features`), then this can be a vector of the same length as `features` to specify a different
+    contrast encoding scheme for each feature
+  - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`,
+    where `colname` is the name of the feature and `k` is the number of its levels, and which returns a contrast or
+    hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
+    $ignore_doc
+    $ordered_factor_doc
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
   - `vec_given_feat_level`: Maps each level for each column in the selected categorical features to a vector
-  - `encoded_features`: The subset of the categorical features of X that were encoded
+  $encoded_features_doc
 """
 function contrast_encoder_fit(
     X,
-    features::AbstractVector{Symbol} = Symbol[];
+    features = Symbol[];
     mode::Union{Symbol, AbstractVector{Symbol}} = :dummy,
     buildmatrix = nothing,
     ignore::Bool = true,
@@ -90,9 +90,10 @@ function contrast_encoder_fit(
     if mode isa Vector{Symbol}
         mode_is_vector = true
         ignore && throw(ArgumentError(IGNORE_MUST_FALSE_VEC_MODE))
-        length(features) == length(mode) || throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
+        length(features) == length(mode) ||
+            throw(ArgumentError(LENGTH_MISMATCH_VEC_MODE(length(mode), length(features))))
     end
-    
+
     # buildmatrix should be specified if mode is :contrast or :hypothesis
     if mode in (:contrast, :hypothesis)
         buildmatrix === nothing && throw(ArgumentError(BUILDFUNC_MUST_BE_SPECIFIED))
@@ -105,11 +106,13 @@ function contrast_encoder_fit(
         k = length(feat_levels)
         feat_mode = (mode_is_vector) ? mode[findfirst(isequal(name), features)] : mode
         if feat_mode == :contrast
-            contrastmatrix = buildmatrix(name, k)            
-            size(contrastmatrix) == (k, k-1) || throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
+            contrastmatrix = buildmatrix(name, k)
+            size(contrastmatrix) == (k, k - 1) ||
+                throw(ArgumentError(MATRIX_SIZE_ERROR(k, size(contrastmatrix), name)))
         elseif feat_mode == :hypothesis
-            hypothesismatrix = buildmatrix(name, k) 
-            size(hypothesismatrix) == (k-1, k) || throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
+            hypothesismatrix = buildmatrix(name, k)
+            size(hypothesismatrix) == (k - 1, k) ||
+                throw(ArgumentError(MATRIX_SIZE_ERROR_HYP(k, size(hypothesismatrix), name)))
             contrastmatrix = pinv(hypothesismatrix)
         elseif feat_mode == :dummy
             contrastmatrix = get_dummy_contrast(k)
@@ -125,7 +128,9 @@ function contrast_encoder_fit(
             throw(ArgumentError("Mode $feat_mode is not supported."))
         end
 
-        vector_given_value_given_feature = Dict(level=>contrastmatrix[l, :] for (l, level) in enumerate(feat_levels))
+        vector_given_value_given_feature = OrderedDict(
+            level => contrastmatrix[l, :] for (l, level) in enumerate(feat_levels)
+        )
         return vector_given_value_given_feature
     end
 
@@ -134,10 +139,9 @@ function contrast_encoder_fit(
         X, features; ignore = ignore, ordered_factor = ordered_factor,
         feature_mapper = feature_mapper,
     )
-
-    cache = Dict(
-        :vector_given_value_given_feature  => vector_given_value_given_feature,
-        :encoded_features => encoded_features,
+    cache = (
+        vector_given_value_given_feature = vector_given_value_given_feature,
+        encoded_features = encoded_features,
     )
 
     return cache
@@ -157,7 +161,12 @@ Use a fitted contrast encoder to encode the levels of selected categorical varia
 
   - `X_tr`: The table with selected features after the selected features are encoded by contrast encoding.
 """
-function contrast_encoder_transform(X, cache::Dict)
-    vector_given_value_given_feature = cache[:vector_given_value_given_feature]
-    return generic_transform(X, vector_given_value_given_feature, single_feat = false)
-end
+function contrast_encoder_transform(X, cache::NamedTuple)
+    vector_given_value_given_feature = cache.vector_given_value_given_feature
+    return generic_transform(
+        X,
+        vector_given_value_given_feature,
+        single_feat = false;
+        use_levelnames = true,
+    )
+end
diff --git a/src/encoders/contrast_encoder/interface_mlj.jl b/src/encoders/contrast_encoder/interface_mlj.jl
@@ -1,11 +1,11 @@
 ### ContrastEncoding with MLJ Interface
 
 # 1. Interface Struct
-mutable struct ContrastEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
-    features::AS
+mutable struct ContrastEncoder{ASS <: Union{Symbol, AbstractVector{Symbol}}, A1 <: Any, A2 <: Any} <: Unsupervised
+    features::A1
     ignore::Bool
-    mode::Union{Symbol, AS}
-    buildmatrix::Any
+    mode:: ASS
+    buildmatrix::A2
     ordered_factor::Bool
 end;
 
@@ -36,19 +36,18 @@ function MMI.fit(transformer::ContrastEncoder, verbosity::Int, X)
         buildmatrix = transformer.buildmatrix,
         ordered_factor = transformer.ordered_factor,
     )
-    fitresult = generic_cache[:vector_given_value_given_feature]
+    fitresult = generic_cache.vector_given_value_given_feature
 
-    report = (encoded_features = generic_cache[:encoded_features],)        # report only has list of encoded features
+    report = (encoded_features = generic_cache.encoded_features,)        # report only has list of encoded features
     cache = nothing
     return fitresult, cache, report
 end;
 
 
 # 6. Transform method
 function MMI.transform(transformer::ContrastEncoder, fitresult, Xnew)
-    generic_cache = Dict(
-        :vector_given_value_given_feature =>
-            fitresult,
+    generic_cache = (
+        vector_given_value_given_feature = fitresult,
     )
     Xnew_transf = contrast_encoder_transform(Xnew, generic_cache)
     return Xnew_transf
@@ -87,23 +86,21 @@ In MLJ (or MLJBase) bind an instance unsupervised `model` to data with
 
 Here:
 
-- `X` is any table of input features (eg, a `DataFrame`). Features to be transformed must
-   have element scitype `Multiclass` or `OrderedFactor`. Use `schema(X)` to 
-   check scitypes. 
+$X_doc_mlj
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
+$features_doc
 - `mode=:dummy`: The type of encoding to use. Can be one of `:contrast`, `:dummy`, `:sum`, `:backward_diff`, `:forward_diff`, `:helmert` or `:hypothesis`.
 If `ignore=false` (features to be encoded are listed explictly in `features`), then this can be a vector of the same length as `features` to specify a different
 contrast encoding scheme for each feature
 - `buildmatrix=nothing`: A function or other callable with signature `buildmatrix(colname, k)`, 
 where `colname` is the name of the feature levels and `k` is it's length, and which returns contrast or 
 hypothesis matrix with row/column ordering consistent with the ordering of `levels(col)`. Only relevant if `mode` is `:contrast` or `:hypothesis`.
-- `ignore=true`: Whether to exclude or includes the features given in `features`
-- `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+$ignore_doc
+$ordered_factor_doc
 
 # Operations
 
@@ -121,7 +118,7 @@ The fields of `fitted_params(mach)` are:
 
 The fields of `report(mach)` are:
 
-- `encoded_features`: The subset of the categorical features of X that were encoded
+$encoded_features_doc
 
 # Examples
 
@@ -148,12 +145,12 @@ mach = fit!(machine(encoder, X))
 Xnew = transform(mach, X)
 
 julia > Xnew
-    (name_1 = [1.0, 0.0, 0.0, 0.0],
-    name_2 = [0.0, 1.0, 0.0, 1.0],
+    (name_John = [1.0, 0.0, 0.0, 0.0],
+    name_Mary = [0.0, 1.0, 0.0, 1.0],
     height = [1.85, 1.67, 1.5, 1.67],
-    favnum_1 = [0.0, 1.0, 0.0, -1.0],
-    favnum_2 = [2.0, -1.0, 0.0, -1.0],
-    favnum_3 = [-1.0, -1.0, 3.0, -1.0],
+    favnum_5 = [0.0, 1.0, 0.0, -1.0],
+    favnum_7 = [2.0, -1.0, 0.0, -1.0],
+    favnum_10 = [-1.0, -1.0, 3.0, -1.0],
     age = [23, 23, 14, 23],)
 ```
 

diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl
@@ -7,39 +7,43 @@ categorical features with their (normalized or raw) frequencies of occurrence in
 
 # Arguments
 
-  - `X`: A table where the elements of the categorical features have [scitypes](https://juliaai.github.io/ScientificTypes.jl/dev/) `Multiclass` or `OrderedFactor`
-  - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
-  - `ignore=true`: Whether to exclude or includes the features given in `features`
-  - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+    $X_doc
+    $features_doc
+    $ignore_doc
+    $ordered_factor_doc
   - `normalize=false`: Whether to use normalized frequencies that sum to 1 over category values or to use raw counts.
 
-# Returns (in a dict)
+# Returns as a named-tuple
 
   - `statistic_given_feat_val`: The frequency of each level of each selected categorical feature
-  - `encoded_features`: The subset of the categorical features of X that were encoded
+  $encoded_features_doc
 """
 function frequency_encoder_fit(
     X,
-    features::AbstractVector{Symbol} = Symbol[];
+    features = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
     normalize::Bool = false,
+    output_type::Type = Float32,
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
         frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
-        statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
+        feat_levels = levels(col)
+        statistic_given_feat_val = Dict{eltype(feat_levels), output_type}(
+            level => frequency_map[level] for level in feat_levels
+        )
         return statistic_given_feat_val
     end
 
     # 2. Pass it to generic_fit
     statistic_given_feat_val, encoded_features = generic_fit(
         X, features; ignore = ignore, ordered_factor = ordered_factor,
-        feature_mapper = feature_mapper,
-    )
-    cache = Dict(
-        :statistic_given_feat_val => statistic_given_feat_val,
-        :encoded_features => encoded_features,
+        feature_mapper = feature_mapper)
+
+    cache = (
+        statistic_given_feat_val = statistic_given_feat_val,
+        encoded_features = encoded_features,
     )
     return cache
 end
@@ -58,7 +62,7 @@ Encode the levels of a categorical variable in a given table with their (normali
 
   - `X_tr`: The table with selected features after the selected features are encoded by frequency encoding.
 """
-function frequency_encoder_transform(X, cache::Dict)
-    statistic_given_feat_val = cache[:statistic_given_feat_val]
+function frequency_encoder_transform(X, cache::NamedTuple)
+    statistic_given_feat_val = cache.statistic_given_feat_val
     return generic_transform(X, statistic_given_feat_val)
 end
-Original file line number
+Diff line change
@@ Expand Up / @@ -24,8 +24,7 @@ jobs: @@
           matrix:
             version:
               - '1.10'
-              - '1.6'
-              - 'nightly'
+              - '1'
             os:
               - ubuntu-latest
             arch:
@@ Expand Down @@