✨ Support for using level names instead of indices for new columns in transform

EssamWisam · EssamWisam · commit a815b766f99d · 2025-05-13T16:27:43.000+03:00
diff --git a/.gitignore b/.gitignore
@@ -27,3 +27,4 @@ meh/*.ipynb
 .DS_Store
 /*.jl
 scratchpad/
+examples/test.jl
diff --git a/src/generic.jl b/src/generic.jl
@@ -49,11 +49,13 @@ function generic_fit(X,
         feat_col = Tables.getcolumn(X, feat_name)
         feat_type = elscitype(feat_col)
         feat_has_allowed_type =
-            feat_type <: Union{Missing, Multiclass} || (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
+            feat_type <: Union{Missing, Multiclass} ||
+            (ordered_factor && feat_type <: Union{Missing, OrderedFactor})
         if feat_has_allowed_type  # then should be encoded
             push!(encoded_features, feat_name)
             # Compute the dict using the given feature_mapper function
-            mapping_per_feat_level[feat_name] = feature_mapper(feat_col, feat_name, args...; kwargs...)
+            mapping_per_feat_level[feat_name] =
+                feature_mapper(feat_col, feat_name, args...; kwargs...)
         end
     end
     return mapping_per_feat_level, encoded_features
@@ -64,19 +66,37 @@ end
 """
 **Private method.**
 
-Function to generate new feature names: feat_name_0, feat_name_1,..., feat_name_n
+Function to generate new feature names: feat_name_1, feat_name_2,..., feat_name_n or, if
+`use_levelnames` is true, feat_name_level_1, feat_name_level_2,..., feat_name_level_n
 """
-function generate_new_feat_names(feat_name, num_inds, existing_names)
-    conflict = true        # will be kept true as long as there is a conflict
-    count = 1            # number of conflicts+1 = number of underscores
+function generate_new_feat_names(
+    feat_name,
+    num_inds,
+    levels,
+    existing_names;
+    use_levelnames = true,
+)
+    # Convert levels (e.g. KeySet or Tuple) to an indexable vector
+    levels_vec = collect(levels)
+
+    conflict = true        # true while there's a name clash
+    count = 1              # number of underscores in the suffix
+    new_column_names = Symbol[]
 
-    new_column_names = []
     while conflict
-        suffix = repeat("_", count)  
-        new_column_names = [Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds]
+        suffix = repeat("_", count)
+        if use_levelnames
+            # Always use the first num_inds level names
+            new_column_names = [ Symbol("$(feat_name)$(suffix)$(levels_vec[i])") for i in 1:num_inds ]
+        else
+            # Always use numeric indices
+            new_column_names = [ Symbol("$(feat_name)$(suffix)$i") for i in 1:num_inds ]
+        end
+        # Check for collisions
         conflict = any(name -> name in existing_names, new_column_names)
         count += 1
     end
+
     return new_column_names
 end
 
@@ -85,22 +105,32 @@ end
 """
 **Private method.**
 
-Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in 
+Given a table `X` and a dictionary `mapping_per_feat_level` which maps each level for each column in
 a subset of categorical features of X into a scalar or a vector (as specified in single_feat)
 
-  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` 
-  into a scalar (single_feat=true)
+  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+    into a scalar (single_feat=true)
 
-  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level` 
-  into a set of k features where k is the length of the vector (single_feat=false)
+  - transforms each value (some level) in each column in `X` using the function in `mapping_per_feat_level`
+    into a set of k features where k is the length of the vector (single_feat=false)
   - In both cases it attempts to preserve the type of the table.
   - In the latter case, it assumes that all levels under the same category are mapped to vectors of the same length. Such
-    assumption is necessary because any column in X must correspond to a constant number of features 
+    assumption is necessary because any column in X must correspond to a constant number of features
     in the output table (which is equal to k).
   - Features not in the dictionary are mapped to themselves (i.e., not changed).
   - Levels not in the nested dictionary are mapped to themselves if `identity_map_unknown` is true else raise an error.
+  - `use_levelnames`: if true and the transform generates multiple features (single_feat=false), the new feature names are
+    built from the level names; otherwise they are built from the numeric indices 1, 2, ..., k.
+  - `custom_levels`: if not `nothing`, these levels are used in place of the keys of the feature's mapping when naming the new features.
 """
-function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore_unknown = false)
+function generic_transform(
+    X,
+    mapping_per_feat_level;
+    single_feat = true,
+    ignore_unknown = false,
+    use_levelnames = false,
+    custom_levels = nothing,
+)
     feat_names = Tables.schema(X).names
     new_feat_names = Symbol[]
     new_cols = []
@@ -115,25 +145,29 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
                 if !issubset(test_levels, train_levels)
                     # get the levels in test that are not in train
                     lost_levels = setdiff(test_levels, train_levels)
-                    error("While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.")
+                    error(
+                        "While transforming, found novel levels for the column $(feat_name): $(lost_levels) that were not seen while training.",
+                    )
                 end
             end
-            
+
             if single_feat
                 level2scalar = mapping_per_feat_level[feat_name]
                 new_col = !isempty(level2scalar) ? recode(col, level2scalar...) : col
                 push!(new_cols, new_col)
                 push!(new_feat_names, feat_name)
             else
                 level2vector = mapping_per_feat_level[feat_name]
-                new_multi_col = map(x->get(level2vector, x, x), col)
+                new_multi_col = map(x -> get(level2vector, x, x), col)
                 new_multi_col = [col for col in eachrow(hcat(new_multi_col...))]
                 push!(new_cols, new_multi_col...)
 
                 feat_names_with_inds = generate_new_feat_names(
                     feat_name,
                     length(first(mapping_per_feat_level[feat_name])[2]),
-                    feat_names,
+                    (custom_levels === nothing) ? keys(mapping_per_feat_level[feat_name]) : custom_levels,
+                    feat_names;
+                    use_levelnames = use_levelnames,
                 )
                 push!(new_feat_names, feat_names_with_inds...)
             end
@@ -144,8 +178,8 @@ function generic_transform(X, mapping_per_feat_level; single_feat = true, ignore
         end
     end
 
-    transformed_X= NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
+    transformed_X = NamedTuple{tuple(new_feat_names...)}(tuple(new_cols)...)
     # Attempt to preserve table type
     transformed_X = Tables.materializer(X)(transformed_X)
     return transformed_X
-end
+end
diff --git a/test/generic.jl b/test/generic.jl
@@ -27,21 +27,38 @@ push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=false, retur
 push!(dataset_forms, create_dummy_dataset(:regression, as_dataframe=true, return_y=false))
 
 @testset "Generate New feature names Function Tests" begin
-    # Test 1: No initial conflicts
-    @testset "No Initial Conflicts" begin
-        existing_names = []
-        names = generate_new_feat_names("feat", 3, existing_names)
-        @test names == [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
+    levels = ("A", "B", "C")
+
+    # Test 1: No initial conflicts, indices mode (use_levelnames=false)
+    @testset "No Initial Conflicts (Indices)" begin
+        existing_names = Symbol[]
+        names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false)
+        @test names == [Symbol("feat_1"), Symbol("feat_2")]
+    end
+
+    # Test 2: No conflicts, level-names mode (default use_levelnames=true)
+    @testset "No Initial Conflicts (Level Names)" begin
+        existing_names = Symbol[]
+        names = generate_new_feat_names("feat", 3, levels, existing_names)
+        @test names == [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")]
     end
 
-    # Test 2: Handle initial conflict by adding underscores
-    @testset "Initial Conflict Resolution" begin
-        existing_names = [Symbol("feat_1"), Symbol("feat_2"), Symbol("feat_3")]
-        names = generate_new_feat_names("feat", 3, existing_names)
-        @test names == [Symbol("feat__1"), Symbol("feat__2"), Symbol("feat__3")]
+    # Test 3: Handle initial conflict by adding underscores (indices)
+    @testset "Initial Conflict Resolution (Indices)" begin
+        existing_names = [Symbol("feat_1"), Symbol("feat_2")]
+        names = generate_new_feat_names("feat", 2, levels, existing_names; use_levelnames=false)
+        @test names == [Symbol("feat__1"), Symbol("feat__2")]
+    end
+
+    # Test 4: Handle initial conflict by adding underscores (level names)
+    @testset "Initial Conflict Resolution (Level Names)" begin
+        existing_names = [Symbol("feat_A"), Symbol("feat_B"), Symbol("feat_C")]
+        names = generate_new_feat_names("feat", 3, levels, existing_names)
+        @test names == [Symbol("feat__A"), Symbol("feat__B"), Symbol("feat__C")]
     end
 end
 
+
 # Dummy encoder that maps each level to its hash (some arbitrary function)
 function dummy_encoder_fit(
     X,