# # Adult Income Prediction: Comparing Categorical Encoders

# **Julia version** is assumed to be 1.10.*

# This demonstration, along with the dataset, is available as a Jupyter notebook or Julia script
# [here](https://github.com/essamwise/MLJTransforms.jl/tree/main/docs/src/tutorials/wine_example).
#
# This tutorial compares different categorical encoding approaches on adult income prediction.
# We'll test OneHot, Frequency, and Cardinality Reduction encoders with CatBoost classification.
#
# **Why compare encoders?** Categorical variables with many levels (like occupation or education)
# can create high-dimensional sparse features. Different encoding strategies handle this
# challenge differently, affecting both model performance and training speed.
#
# **High Cardinality Challenge:** We've added a synthetic feature with 500 categories to
# demonstrate how encoders handle extreme cardinality - a common real-world scenario with
# features like customer IDs, product codes, or geographical subdivisions.

# packages are already activated by generate.jl

using MLJ, MLJTransforms, DataFrames, ScientificTypes
using Random, CSV, StatsBase, Plots, BenchmarkTools

# Import scitypes from MLJ to avoid any package version skew
using MLJ: OrderedFactor, Continuous, Multiclass

# ## Load and Prepare Data
# Load the Adult Income dataset. This dataset contains demographic information
# and the task is to predict whether a person makes over $50K per year.

# Load data with header and rename columns to the expected symbols
df = CSV.read("./adult.csv", DataFrame; header = true)
rename!(
    df,
    [
        :age,
        :workclass,
        :fnlwgt,
        :education,
        :education_num,
        :marital_status,
        :occupation,
        :relationship,
        :race,
        :sex,
        :capital_gain,
        :capital_loss,
        :hours_per_week,
        :native_country,
        :income,
    ],
)

first(df, 5)

# Clean the data by removing leading/trailing spaces and converting income to binary:
for col in [:workclass, :education, :marital_status, :occupation, :relationship,
    :race, :sex, :native_country, :income]
    df[!, col] = strip.(string.(df[!, col]))
end

# Convert income to binary (0 for <=50K, 1 for >50K)
df.income = ifelse.(df.income .== ">50K", 1, 0)

# Let's add a high-cardinality categorical feature to showcase encoder handling.
# Create a realistic frequency distribution: A1-A3 make up 90% of the data, A4-A500 the remaining 10%.
Random.seed!(42)
high_card_categories = ["A$i" for i in 1:500]

n_rows = nrow(df)
n_frequent = Int(round(0.9 * n_rows))  # 90% for A1, A2, A3
n_rare = n_rows - n_frequent           # 10% for A4-A500

frequent_samples = rand(["A1", "A2", "A3"], n_frequent)

rare_categories = ["A$i" for i in 4:500]
rare_samples = rand(rare_categories, n_rare)

# Combine and shuffle
all_samples = vcat(frequent_samples, rare_samples)
df.high_cardinality_feature = all_samples[randperm(n_rows)]

# Coerce categorical columns to appropriate scientific types.
# Apply explicit type coercions using fully qualified names
type_dict = Dict(
    :income => OrderedFactor,
    :age => Continuous,
    :fnlwgt => Continuous,
    :education_num => Continuous,
    :capital_gain => Continuous,
    :capital_loss => Continuous,
    :hours_per_week => Continuous,
    :workclass => Multiclass,
    :education => Multiclass,
    :marital_status => Multiclass,
    :occupation => Multiclass,
    :relationship => Multiclass,
    :race => Multiclass,
    :sex => Multiclass,
    :native_country => Multiclass,
    :high_cardinality_feature => Multiclass,
)
df = coerce(df, type_dict)

# Let's examine the cardinality of our categorical features:
categorical_cols = [:workclass, :education, :marital_status, :occupation,
    :relationship, :race, :sex, :native_country, :high_cardinality_feature]
println("Cardinality of categorical features:")
for col in categorical_cols
    n_unique = length(unique(df[!, col]))
    println("  $col: $n_unique unique values")
end
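
# As a quick, optional check of how skewed the synthetic feature is, we can list its most
# frequent levels with `countmap` from StatsBase (already loaded above); the exact counts
# depend on the random seed:
level_counts = sort(collect(countmap(df.high_cardinality_feature)), by = last, rev = true)
first(level_counts, 5)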

# ## Split Data
# Separate features (X) from target (y), then split into train/test sets:

y, X = unpack(df, ==(:income); rng = 123);
train, test = partition(eachindex(y), 0.8, shuffle = true, rng = 100);

# ## Setup Encoders and Model
# Load the required models and create different encoding strategies:

OneHot = @load OneHotEncoder pkg = MLJModels verbosity = 0
CatBoostClassifier = @load CatBoostClassifier pkg = CatBoost

# **Encoding Strategies:**
# 1. **OneHotEncoder**: Creates binary columns for each category
# 2. **FrequencyEncoder**: Replaces categories with their frequency counts
# 3. **CardinalityReducer**: Groups infrequent categories into a single level before encoding
#
# With the one-hot encoder, high cardinality is a concern because every extra category adds
# another column, leading to an explosion in the number of features.
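
# To make the frequency encoder concrete, here is a minimal sketch on a made-up toy column
# (the name `fruit` and its values are purely illustrative, not part of the dataset); with
# `normalize = false` each level should be replaced by its raw count:
toy = (fruit = coerce(["apple", "apple", "banana", "apple", "cherry"], Multiclass),)
toy_mach = machine(MLJTransforms.FrequencyEncoder(normalize = false), toy)
MLJ.fit!(toy_mach)
MLJ.transform(toy_mach, toy)  ## expect apple -> 3, banana -> 1, cherry -> 1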

card_reducer = MLJTransforms.CardinalityReducer(
    min_frequency = 0.15,
    ordered_factor = true,
    label_for_infrequent = Dict(
        AbstractString => "OtherItems",
        Char => 'O',
    ),
)
onehot_model = OneHot(drop_last = true, ordered_factor = true)
freq_model = MLJTransforms.FrequencyEncoder(normalize = false, ordered_factor = true)
cat = CatBoostClassifier();
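
# As a small, purely illustrative sketch of what the cardinality reducer does (assuming a
# float `min_frequency` is interpreted as a relative frequency, as in the settings above):
# levels below the threshold are lumped into the `label_for_infrequent` label for their type.
toy_hc = (code = coerce(["a", "a", "a", "a", "b", "b", "b", "c", "d"], Multiclass),)
red_mach = machine(card_reducer, toy_hc)
MLJ.fit!(red_mach)
MLJ.transform(red_mach, toy_hc)  ## "c" and "d" (~0.11 each, below 0.15) should become "OtherItems"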

# Create three different pipelines to compare:
pipelines = [
    ("CardRed + OneHot + CAT", card_reducer |> onehot_model |> cat),
    ("OneHot + CAT", onehot_model |> cat),
    ("FreqEnc + CAT", freq_model |> cat),
]

# ## Evaluate Pipelines with Proper Benchmarking
# Train each pipeline and measure both performance (accuracy) and training time using `@belapsed`:

# Prepare the results DataFrame
results = DataFrame(pipeline = String[], accuracy = Float64[], training_time = Float64[]);

for (name, pipe) in pipelines
    println("Training and benchmarking: $name")

    ## Train once to compute accuracy
    mach = machine(pipe, X, y)
    MLJ.fit!(mach, rows = train)
    predictions = MLJ.predict_mode(mach, rows = test)
    accuracy_value = MLJ.accuracy(predictions, y[test])

    ## Measure training time using @belapsed (returns Float64 seconds) with 5 samples
    ## Create a fresh machine inside the benchmark to avoid state sharing
    training_time =
        @belapsed MLJ.fit!(machine($pipe, $X, $y), rows = $train, force = true) samples = 5

    println("Training time (min over 5 samples): $(training_time) s")
    println("Accuracy: $(round(accuracy_value, digits = 4))\n")

    push!(results, (string(name), accuracy_value, training_time))
end

# Sort by accuracy (higher is better) and display results:
sort!(results, :accuracy, rev = true)
results

# ## Visualization
# Create side-by-side bar charts to compare both training time and model performance:

n = nrow(results)
# training time plot (seconds)
time_plot = bar(1:n, results.training_time;
    xticks = (1:n, results.pipeline),
    title = "Training Time (s)",
    xlabel = "Pipeline", ylabel = "Time (s)",
    xrotation = 8,
    legend = false,
    color = :lightblue,
)

# accuracy plot
accuracy_plot = bar(1:n, results.accuracy;
    xticks = (1:n, results.pipeline),
    title = "Classification Accuracy",
    xlabel = "Pipeline", ylabel = "Accuracy",
    xrotation = 8,
    legend = false,
    ylim = (0.0, 1.0),
    color = :lightcoral,
)

combined_plot = plot(time_plot, accuracy_plot; layout = (1, 2), size = (1200, 500))

# Save the plot
savefig(combined_plot, "adult_encoding_comparison.png"); #hide

#md # ![Encoding comparison: training time and accuracy](adult_encoding_comparison.png)

# ## Conclusion
#
# **Key Findings from Results:**
#
# **Training Time Performance (dramatic differences!):**
# - **FreqEnc + CAT**: 0.32 seconds - **fastest approach**
# - **CardRed + OneHot + CAT**: 0.57 seconds - **10x faster than pure OneHot**
# - **OneHot + CAT**: 5.85 seconds - **significantly slower due to high cardinality**
#
# **Accuracy:** In this example, accuracy is essentially the same across the three pipelines, while the savings in training time are substantial.

# Note that the cardinality reducer still yields a speed improvement if we omit the synthetic high-cardinality feature, but the gain is much smaller because the Adult dataset itself is not particularly high in cardinality.
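
# For reference, here is a minimal sketch (not run above) of how one could repeat the comparison
# without the synthetic feature: drop the column before unpacking and reuse the same pipeline
# loop. The names `df_no_hc`, `X2`, and `y2` are illustrative.
df_no_hc = select(df, Not(:high_cardinality_feature))
y2, X2 = unpack(df_no_hc, ==(:income); rng = 123);
## ...then rerun the evaluation loop above with X2 and y2 in place of X and y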