JuliaAI · EssamWisam · May 15, 2025 · May 12, 2025 · May 12, 2025 · May 12, 2025
diff --git a/.gitignore b/.gitignore
@@ -27,3 +27,4 @@ meh/*.ipynb
 .DS_Store
 /*.jl
 scratchpad/
+examples/test.jl
diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl
@@ -28,7 +28,10 @@ function frequency_encoder_fit(
     # 1. Define feature mapper
     function feature_mapper(col, name)
         frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
-        statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
+        feat_levels = levels(col)
+        statistic_given_feat_val = Dict{eltype(feat_levels), Float32}(
+            level => frequency_map[level] for level in feat_levels
+        )
         return statistic_given_feat_val
     end
 

diff --git a/src/encoders/missingness_encoding/missingness_encoding.jl b/src/encoders/missingness_encoding/missingness_encoding.jl
@@ -30,7 +30,7 @@ function missingness_encoder_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
-    label_for_missing::Dict{<:Type, <:Any} = Dict(    
+    label_for_missing::Dict{<:Type, <:Any} = Dict(
         AbstractString => "missing",
         Char => 'm',
     ),
@@ -40,8 +40,8 @@ function missingness_encoder_fit(
 
     # 1. Define feature mapper
     function feature_mapper(col, name)
-        col_type = nonmissingtype(eltype(col)).parameters[1]
-        feat_levels = levels(col; skipmissing=true)
+        feat_levels = levels(col; skipmissing = true)
+        col_type = nonmissingtype(eltype(feat_levels))
 
         # Ensure column type is valid (can't test because never occurs)
         # Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -58,7 +58,7 @@ function missingness_encoder_fit(
 
         # Check no collision between keys(label_for_missing) and feat_levels
         for value in values(label_for_missing)
-            if !ismissing(value) 
+            if !ismissing(value)
                 if value in feat_levels
                     throw(ArgumentError(COLLISION_NEW_VAL_ME(value)))
                 end
@@ -73,7 +73,7 @@ function missingness_encoder_fit(
                 break
             end
         end
-        
+
         # Nonmissing levels remain as is
         label_for_missing_given_feature = Dict{Missing, col_type}()
 
@@ -91,7 +91,8 @@ function missingness_encoder_fit(
 
     # 2. Pass it to generic_fit
     label_for_missing_given_feature, encoded_features = generic_fit(
-        X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
+        X, features; ignore = ignore, ordered_factor = ordered_factor,
+        feature_mapper = feature_mapper,
     )
     cache = Dict(
         :label_for_missing_given_feature => label_for_missing_given_feature,
@@ -117,6 +118,11 @@ Apply a fitted missingness encoder to a table given the output of `missingness_e
 """
 function missingness_encoder_transform(X, cache::Dict)
     label_for_missing_given_feature = cache[:label_for_missing_given_feature]
-    return generic_transform(X, label_for_missing_given_feature; ignore_unknown = true)
+    return generic_transform(
+        X,
+        label_for_missing_given_feature;
+        ignore_unknown = true,
+        ensure_categorical = true,
+    )
 end
 
diff --git a/src/encoders/ordinal_encoding/interface_mlj.jl b/src/encoders/ordinal_encoding/interface_mlj.jl
@@ -5,15 +5,17 @@ mutable struct OrdinalEncoder{AS <: AbstractVector{Symbol}} <: Unsupervised
     features::AS
     ignore::Bool
     ordered_factor::Bool
+    op_dtype::Type
 end;
 
 # 2. Constructor
 function OrdinalEncoder(;
     features = Symbol[],
     ignore = true,
     ordered_factor = false,
+    op_dtype = Float32,
 )
-    return OrdinalEncoder(features, ignore, ordered_factor)
+    return OrdinalEncoder(features, ignore, ordered_factor, op_dtype)
 end;
 
 
@@ -29,6 +31,7 @@ function MMI.fit(transformer::OrdinalEncoder, verbosity::Int, X)
         transformer.features;
         ignore = transformer.ignore,
         ordered_factor = transformer.ordered_factor,
+        op_dtype = transformer.op_dtype,
     )
     fitresult =
         generic_cache[:index_given_feat_level]
@@ -92,6 +95,7 @@ Train the machine using `fit!(mach, rows=...)`.
 - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
 - `ignore=true`: Whether to exclude or includes the features given in `features`
 - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
+- `op_dtype=Float32`: The concrete numeric type used for the encoded features
 
 # Operations
 

diff --git a/src/encoders/ordinal_encoding/ordinal_encoding.jl b/src/encoders/ordinal_encoding/ordinal_encoding.jl
@@ -10,7 +10,7 @@ Fit an encoder to encode the levels of categorical variables in a given table as
   - `features=[]`: A list of names of categorical features given as symbols to exclude or include from encoding
   - `ignore=true`: Whether to exclude or includes the features given in `features`
   - `ordered_factor=false`: Whether to encode `OrderedFactor` or ignore them
-
+  - `op_dtype=Float32`: The concrete numeric type used for the encoded features
 # Returns (in a dict)
 
   - `index_given_feat_level`: Maps each level for each column in a subset of the categorical features of X into an integer.
@@ -21,12 +21,13 @@ function ordinal_encoder_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
+    op_dtype::Type = Float32,
 )
     # 1. Define feature mapper
     function feature_mapper(col, name)
         feat_levels = levels(col)
         index_given_feat_val =
-            Dict{Any, Integer}(value => index for (index, value) in enumerate(feat_levels))
+            Dict{eltype(feat_levels), op_dtype}(value => index for (index, value) in enumerate(feat_levels))
         return index_given_feat_val
     end
 

diff --git a/src/encoders/target_encoding/target_encoding.jl b/src/encoders/target_encoding/target_encoding.jl
@@ -166,8 +166,9 @@ function target_encoder_fit(
 
     # 3. Define function to compute the new value(s) for each level given a column
     function feature_mapper(col, name)
+        feat_levels = levels(col)
         y_stat_given_feat_level_for_col =
-            Dict{Any, Union{AbstractFloat, AbstractVector{<:AbstractFloat}}}()
+            Dict{eltype(feat_levels), Any}()
         for level in levels(col)
             # Get the targets of an example that belong to this level
             targets_for_level = y[col.==level]

diff --git a/src/generic.jl b/src/generic.jl
@@ -49,11 +49,13 @@ function generic_fit(X,
         feat_col = Tables.getcolumn(X, feat_name)
         feat_type = elscitype(feat_col)
         feat_has_allowed_type =
-            feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
+            feat_type <: Union{Missing, Multiclass} ||
+            (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
         if feat_has_allowed_type  # then should be encoded
             push!(encoded_features, feat_name)
             # Compute the dict using the given feature_mapper function
-            mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
+            mapping_per_feat_level[feat_name] =
+                feature_mapper(feat_col, feat_name, args...; kwargs...)
         end
     end
     return mapping_per_feat_level, encoded_features
@@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)
 
     new_column_names = []
     while conflict
-        suffix = repeat("_", count)  
+        suffix = repeat("_", count)
         new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
         conflict = any(name -> name in existing_names, new_column_names)
         count += 1
@@ -85,22 +87,29 @@ end
 """
 **Private method.**
 
-Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in 
+Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
 a subset of categorical features of X into a scalar or a vector (as specified in single_feat)
 
-  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` 
-  into a scalar (single_feat=true)
+  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+    into a scalar (single_feat=true)
 
-  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` 
-  into a set of k features where k is the length of the vector (single_feat=false)
+  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+    into a set of k features where k is the length of the vector (single_feat=false)
   - In both cases it attempts to preserve the type of the table.
   - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
-    assumption is necessary because any column in X must correspond to a constant number of features 
+    assumption is necessary because any column in X must correspond to a constant number of features
     in the output table (which is equal to k).
   - Features not in the dictionary are mapped to themselves (i.e., not changed).
-  - Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
+  - Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
+  - If `ensure_categorical` is true (and `single_feat=true`), a recoded categorical column remains categorical; otherwise its recoded values are unwrapped
 """
-function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
+function generic_transform(
+    X,
+    mapping_per_feat_level;
+    single_feat = true,
+    ignore_unknown = false,
+    ensure_categorical = false,
+)
     feat_names = Tables.schema(X).names
     new_feat_names = Symbol[]
     new_cols = []
@@ -115,18 +124,25 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
                 if !issubset(test_levels, train_levels)
                     # get the levels in test that are not in train
                     lost_levels = setdiff(test_levels, train_levels)
-                    error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
+                    error(
+                        "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
+                    )
                 end
             end
-            
+
             if single_feat
                 level2scalar = mapping_per_feat_level[feat_name]
-                new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+                if ensure_categorical
+                    new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+                else 
+                    new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
+                end
+
                 push!(new_cols, new_col)
                 push!(new_feat_names, feat_name)
             else
                 level2vector = mapping_per_feat_level[feat_name]
-                new_multi_col = map(x->get(level2vector, x, x), col)
+                new_multi_col = map(x -> get(level2vector, x, x), col)
                 new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
                 push!(new_cols, new_multi_col...)
 
@@ -144,8 +160,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
         end
     end
 
-    transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
+    transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
     # Attempt to preserve table type
     transformed_X = Tables.materializer(X)(transformed_X)
     return transformed_X
-end
+end
diff --git a/src/transformers/cardinality_reducer/cardinality_reducer.jl b/src/transformers/cardinality_reducer/cardinality_reducer.jl
@@ -35,20 +35,20 @@ function cardinality_reducer_fit(
     features::AbstractVector{Symbol} = Symbol[];
     ignore::Bool = true,
     ordered_factor::Bool = false,
-    min_frequency::Real = 3,                        
-    label_for_infrequent::Dict{<:Type, <:Any} = Dict(    
+    min_frequency::Real = 3,
+    label_for_infrequent::Dict{<:Type, <:Any} = Dict(
         AbstractString => "Other",
         Char => 'O',
     ),
-)   
+)
     supportedtypes_list = [Char, AbstractString, Number]
     supportedtypes = Union{supportedtypes_list...}
 
     # 1. Define feature mapper
     function feature_mapper(col, name)
         val_to_freq = (min_frequency isa AbstractFloat) ? proportionmap(col) : countmap(col)
-        col_type = eltype(col).parameters[1]
         feat_levels = levels(col)
+        col_type = eltype(feat_levels)
 
         # Ensure column type is valid (can't test because never occurs)
         # Converting array elements to strings before wrapping in a `CategoricalArray`, as...
@@ -88,7 +88,11 @@ function cardinality_reducer_fit(
                     elseif elgrandtype == Number
                         new_cat_given_col_val[level] = minimum(feat_levels) - 1
                     else
-                        throw(ArgumentError(UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent)))
+                        throw(
+                            ArgumentError(
+                                UNSPECIFIED_COL_TYPE(col_type, label_for_infrequent),
+                            ),
+                        )
                     end
                 end
             end
@@ -98,7 +102,8 @@ function cardinality_reducer_fit(
 
     # 2. Pass it to generic_fit
     new_cat_given_col_val, encoded_features = generic_fit(
-        X, features; ignore = ignore, ordered_factor = ordered_factor, feature_mapper = feature_mapper,
+        X, features; ignore = ignore, ordered_factor = ordered_factor,
+        feature_mapper = feature_mapper,
     )
     cache = Dict(
         :new_cat_given_col_val => new_cat_given_col_val,
@@ -125,5 +130,5 @@ Apply a fitted cardinality reducer to a table given the output of `cardinality_r
 """
 function cardinality_reducer_transform(X, cache::Dict)
     new_cat_given_col_val = cache[:new_cat_given_col_val]
-    return generic_transform(X, new_cat_given_col_val; ignore_unknown = true)
+    return generic_transform(X, new_cat_given_col_val; ignore_unknown = true, ensure_categorical = true)
 end
diff --git a/test/encoders/contrast_encoder.jl b/test/encoders/contrast_encoder.jl
@@ -51,9 +51,7 @@ end
     cache = contrast_encoder_fit(X, [:name]; ignore=false, mode = :dummy)
     k = length(levels(X.name))
     contrast_matrix = get_dummy_contrast(k)
-    print()
     for (i, level) in enumerate(levels(X.name))
-        println(cache[:vector_given_value_given_feature])
         @test cache[:vector_given_value_given_feature][:name][level] == contrast_matrix[i, :]
     end
 end
@@ -289,4 +287,40 @@ end
 
     # Test report
     @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
+end
+
+
+@testset "Test Contrast Encoder Output Types" begin
+    X = (
+        name   = categorical(["Ben", "John", "Mary", "John"]),
+        height = [1.85, 1.67, 1.5, 1.67],
+        favnum = categorical([7, 5, 10, 1]),
+        age    = [23, 23, 14, 23],
+    )
+
+    methods =  [:contrast, :dummy, :sum, :backward_diff, :helmert, :hypothesis]
+    matrix_func = [buildrandomcontrast, nothing, nothing, nothing, nothing, buildrandomhypothesis]
+
+    for (i, method) in enumerate(methods)
+        encoder = ContrastEncoder(
+            features = [:name, :favnum],
+            ignore = false,
+            mode = method,
+            buildmatrix=matrix_func[i]
+        )
+        mach = fit!(machine(encoder, X))
+        Xnew = MMI.transform(mach, X)
+
+        # Test Consistency with Types
+        scs = schema(Xnew).scitypes
+        ts  = schema(Xnew).types
+
+        # Check scitypes and types of previously continuous or categorical features
+        @test all(scs[1:end-1] .== Continuous)
+        @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+        # Check the type and scitype of the feature that was previously Count
+        last_type, last_sctype = ts[end], scs[end]
+        @test last_type <: Integer && isconcretetype(last_type)
+        @test last_sctype <: Count
+    end
 end