✨ Fix frequency encoder output types

EssamWisam · EssamWisam · commit 7b577d707435 · 2025-05-12T17:18:22.000+03:00
diff --git a/.gitignore b/.gitignore
@@ -27,3 +27,4 @@ meh/*.ipynb
 .DS_Store
 /*.jl
 scratchpad/
+examples/test.jl
diff --git a/src/encoders/frequency_encoding/frequency_encoding.jl b/src/encoders/frequency_encoding/frequency_encoding.jl
@@ -28,7 +28,10 @@ function frequency_encoder_fit(
     # 1. Define feature mapper
     function feature_mapper(col, name)
         frequency_map = (!normalize) ? countmap(col) : proportionmap(col)
-        statistic_given_feat_val = Dict{Any, Real}(level=>frequency_map[level] for level in levels(col))
+        feat_levels = levels(col)
+        statistic_given_feat_val = Dict{eltype(feat_levels), Float32}(
+            level => frequency_map[level] for level in feat_levels
+        )
         return statistic_given_feat_val
     end
 
diff --git a/src/generic.jl b/src/generic.jl
@@ -49,11 +49,13 @@ function generic_fit(X,
         feat_col = Tables.getcolumn(X, feat_name)
         feat_type = elscitype(feat_col)
         feat_has_allowed_type =
-            feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
+            feat_type <: Union{Missing, Multiclass} ||
+            (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
         if feat_has_allowed_type  # then should be encoded
             push!(encoded_features, feat_name)
             # Compute the dict using the given feature_mapper function
-            mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
+            mapping_per_feat_level[feat_name] =
+                feature_mapper(feat_col, feat_name, args...; kwargs...)
         end
     end
     return mapping_per_feat_level, encoded_features
@@ -72,7 +74,7 @@ function generate_new_feat_names(feat_name, num_inds, existing_names)
 
     new_column_names = []
     while conflict
-        suffix = repeat("_", count)  
+        suffix = repeat("_", count)
         new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
         conflict = any(name -> name in existing_names, new_column_names)
         count += 1
@@ -85,22 +87,29 @@ end
 """
 **Private method.**
 
-Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in 
+Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
 a subset of categorical features of X into a scalar or a vector (as specified in single_feat)
 
-  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` 
-  into a scalar (single_feat=true)
+  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+    into a scalar (single_feat=true)
 
-  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` 
-  into a set of k features where k is the length of the vector (single_feat=false)
+  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+    into a set of k features where k is the length of the vector (single_feat=false)
   - In both cases it attempts to preserve the type of the table.
   - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
-    assumption is necessary because any column in X must correspond to a constant number of features 
+    assumption is necessary because any column in X must correspond to a constant number of features
     in the output table (which is equal to k).
   - Features not in the dictionary are mapped to themselves (i.e., not changed).
-  - Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
+  - Levels not in the nested dictionary are mapped to themselves if `ignore_unknown` is true; otherwise an error is raised.
+  - If `ensure_categorical` is true, then any input categorical column will remain categorical.
 """
-function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
+function generic_transform(
+    X,
+    mapping_per_feat_level;
+    single_feat = true,
+    ignore_unknown = false,
+    ensure_categorical = false,
+)
     feat_names = Tables.schema(X).names
     new_feat_names = Symbol[]
     new_cols = []
@@ -115,18 +124,25 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
                 if !issubset(test_levels, train_levels)
                     # get the levels in test that are not in train
                     lost_levels = setdiff(test_levels, train_levels)
-                    error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
+                    error(
+                        "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
+                    )
                 end
             end
-            
+
             if single_feat
                 level2scalar = mapping_per_feat_level[feat_name]
-                new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+                if ensure_categorical
+                    new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
+                else 
+                    new_col = !isempty(level2scalar) ? unwrap.(recode(col, level2scalar...)) : col
+                end
+               
                 push!(new_cols, new_col)
                 push!(new_feat_names, feat_name)
             else
                 level2vector = mapping_per_feat_level[feat_name]
-                new_multi_col = map(x->get(level2vector, x, x), col)
+                new_multi_col = map(x -> get(level2vector, x, x), col)
                 new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
                 push!(new_cols, new_multi_col...)
 
@@ -144,8 +160,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
         end
     end
 
-    transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
+    transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
     # Attempt to preserve table type
     transformed_X = Tables.materializer(X)(transformed_X)
     return transformed_X
-end
+end
diff --git a/test/encoders/frequency_encoder.jl b/test/encoders/frequency_encoder.jl
@@ -9,7 +9,8 @@ using MLJTransforms: frequency_encoder_fit, frequency_encoder_transform
     for norm in normalize
         result = frequency_encoder_fit(X; normalize = norm)[:statistic_given_feat_val]
         enc =
-            (col, level) -> ((norm) ? sum(col .== level) / length(col) : sum(col .== level))
+            (col, level) ->
+                Float32((norm) ? sum(col .== level) / length(col) : sum(col .== level))
         true_output = Dict{Symbol, Dict{Any, Any}}(
             :F => Dict(
                 "m" => enc(F_col, "m"),
@@ -44,7 +45,7 @@ end
         X_tr = frequency_encoder_transform(X, cache)
         enc =
             (col, level) ->
-                ((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level))
+                Float32((norm) ? sum(X[col] .== level) / length(X[col]) : sum(X[col] .== level))
 
         target = (
             A = [enc(:A, X[:A][i]) for i in 1:10],
@@ -81,4 +82,42 @@ end
         # Test report
         @test report(mach) == (encoded_features = generic_cache[:encoded_features],)
     end
-end
+end
+
+@testset "Test Frequency Encoding Output Types" begin
+    # Define categorical features
+    A = ["g", "b", "g", "r", "r"]
+    B = [1.0, 2.0, 3.0, 4.0, 5.0]
+    C = ["f", "f", "f", "m", "f"]
+    D = [true, false, true, false, true]
+    E = [1, 2, 3, 4, 5]
+
+    # Combine into a named tuple
+    X = (A = A, B = B, C = C, D = D, E = E)
+
+    # Coerce A, C, D to Multiclass, B to Continuous, and E to OrderedFactor
+    X = coerce(X,
+        :A => Multiclass,
+        :B => Continuous,
+        :C => Multiclass,
+        :D => Multiclass,
+        :E => OrderedFactor,
+    )
+
+    # Inspect the coerced scitypes (the result is not asserted on):
+    schema(X)
+
+    encoder = FrequencyEncoder(ordered_factor = false, normalize = false)
+    mach = fit!(machine(encoder, X))
+    Xnew = MMI.transform(mach, X)
+
+
+    scs = schema(Xnew).scitypes
+    ts  = schema(Xnew).types
+    # Check scitypes correctness
+    @test all(scs[1:end-1] .== Continuous)
+    @test all(t -> (t <: AbstractFloat) && isconcretetype(t), ts[1:end-1])
+    # Ordinal column should be intact
+    @test scs[end] === schema(X).scitypes[end]
+    @test ts[end] == schema(X).types[end]
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -15,7 +15,7 @@ using StatsModels
 
 # Other transformers
 using Tables, CategoricalArrays
-using ScientificTypes: scitype
+using ScientificTypes: scitype, schema
 using Statistics
 using StableRNGs
 stable_rng = StableRNGs.StableRNG(123)