Skip to content

Commit 2630f08

Browse files
committed
✨ Fix high cardinality dataset
1 parent 8bd0dc9 commit 2630f08

20 files changed

+103580
-3342
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,8 @@ meh/*.ipynb
2828
/*.jl
2929
scratchpad/
3030
examples/test.jl
31+
catboost_info/**
32+
/catboost_info
33+
/catboost_info
34+
/docs/src/tutorials/adult_example/.CondaPkg
35+
/docs/src/tutorials/adult_example/catboost_info

docs/make.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ makedocs(
3535
],
3636
],
3737
"Extended Examples" => Any[
38-
"Standardization Impact"=>"tutorials/standardization/notebook.md",
39-
"Milk Quality Classification"=>"tutorials/classic_comparison/notebook.md",
40-
"Wine Quality Prediction"=>"tutorials/wine_example/notebook.md",
41-
"Entity Embeddings Tutorial"=>"tutorials/entity_embeddings/notebook.md",
38+
"Standardization Impact" => "tutorials/standardization/notebook.md",
39+
"Milk Quality Classification" => "tutorials/classic_comparison/notebook.md",
40+
"Adult Income Classification" => "tutorials/adult_example/notebook.md",
41+
"Entity Embeddings Tutorial" => "tutorials/entity_embeddings/notebook.md",
4242
],
4343
"Contributing" => "contributing.md",
4444
"About" => "about.md",

docs/src/generate.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
function generate(dir; execute = true, pluto = false)
22
quote
33
using Pkg
4-
Pkg.activate(temp = true)
4+
# Activate the specific tutorial directory instead of temp environment
5+
Pkg.activate($dir)
6+
Pkg.instantiate()
57
Pkg.add("Literate")
68
using Literate
79

docs/src/tutorials/wine_example/Manifest.toml renamed to docs/src/tutorials/adult_example/Manifest.toml

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
julia_version = "1.11.5"
44
manifest_format = "2.0"
5-
project_hash = "5a14cc2e68cb2e8e5e7b95aca3553ebdf3e9929e"
5+
project_hash = "2bc58e2f1d5ca6172834e6bda17b630aa9b5ac28"
66

77
[[deps.ADTypes]]
88
git-tree-sha1 = "be7ae030256b8ef14a441726c4c37766b90b93a3"
@@ -165,6 +165,12 @@ git-tree-sha1 = "aebf55e6d7795e02ca500a689d326ac979aaf89e"
165165
uuid = "9718e550-a3fa-408a-8086-8db961cd8217"
166166
version = "0.1.1"
167167

168+
[[deps.BenchmarkTools]]
169+
deps = ["Compat", "JSON", "Logging", "Printf", "Profile", "Statistics", "UUIDs"]
170+
git-tree-sha1 = "e38fbc49a620f5d0b660d7f543db1009fe0f8336"
171+
uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
172+
version = "1.6.0"
173+
168174
[[deps.BitBasis]]
169175
deps = ["LinearAlgebra", "StaticArrays"]
170176
git-tree-sha1 = "89dc08420d4f593ff30f02611d136b475a5eb43d"
@@ -766,6 +772,12 @@ git-tree-sha1 = "68c173f4f449de5b438ee67ed0c9c748dc31a2ec"
766772
uuid = "34004b35-14d8-5ef3-9330-4cdb6864b03a"
767773
version = "0.3.28"
768774

775+
[[deps.IOCapture]]
776+
deps = ["Logging", "Random"]
777+
git-tree-sha1 = "b6d6bfdd7ce25b0f9b2f6b3dd56b2673a66c8770"
778+
uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
779+
version = "0.2.5"
780+
769781
[[deps.InitialValues]]
770782
git-tree-sha1 = "4da0f88e9a39111c2fa3add390ab15f3a44f3ca3"
771783
uuid = "22cec73e-a1b8-11e9-2c92-598750a2cf9c"
@@ -1059,6 +1071,12 @@ weakdeps = ["ChainRulesCore", "SparseArrays", "Statistics"]
10591071
LinearMapsSparseArraysExt = "SparseArrays"
10601072
LinearMapsStatisticsExt = "Statistics"
10611073

1074+
[[deps.Literate]]
1075+
deps = ["Base64", "IOCapture", "JSON", "REPL"]
1076+
git-tree-sha1 = "da046be6d63304f7ba9c1bb04820fb306ba1ab12"
1077+
uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
1078+
version = "2.20.1"
1079+
10621080
[[deps.LogExpFunctions]]
10631081
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
10641082
git-tree-sha1 = "13ca9e2586b89836fd20cccf56e57e2b9ae7f38f"
@@ -1162,10 +1180,10 @@ uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
11621180
version = "0.17.9"
11631181

11641182
[[deps.MLJTransforms]]
1165-
deps = ["BitBasis", "CategoricalArrays", "Combinatorics", "Dates", "Distributions", "LinearAlgebra", "MLJModelInterface", "OrderedCollections", "Parameters", "ScientificTypes", "Statistics", "StatsBase", "TableOperations", "Tables"]
1183+
deps = ["BitBasis", "CategoricalArrays", "Combinatorics", "Dates", "Distributions", "LinearAlgebra", "MLJModelInterface", "OrderedCollections", "Parameters", "ScientificTypes", "ScientificTypesBase", "Statistics", "StatsBase", "TableOperations", "Tables"]
11661184
path = "/Users/essamwisam/Documents/GitHub/MLJTransforms"
11671185
uuid = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
1168-
version = "0.1.6"
1186+
version = "0.1.1"
11691187

11701188
[[deps.MLJTuning]]
11711189
deps = ["ComputationalResources", "Distributed", "Distributions", "LatinHypercubeSampling", "MLJBase", "ProgressMeter", "Random", "RecipesBase", "StatisticalMeasuresBase"]
@@ -1502,6 +1520,10 @@ deps = ["Unicode"]
15021520
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
15031521
version = "1.11.0"
15041522

1523+
[[deps.Profile]]
1524+
uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
1525+
version = "1.11.0"
1526+
15051527
[[deps.ProgressMeter]]
15061528
deps = ["Distributed", "Printf"]
15071529
git-tree-sha1 = "13c5103482a8ed1536a54c08d0e742ae3dca2d42"

docs/src/tutorials/wine_example/Project.toml renamed to docs/src/tutorials/adult_example/Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
[deps]
2+
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
23
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
34
CatBoost = "e2e10f9a-a85d-4fa9-b6b2-639a32100a12"
45
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
56
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
67
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
78
LIBSVM = "b1bec4e5-fd48-53fe-b0cb-9723c09d164b"
89
LightGBM = "7acf609c-83a4-11e9-1ffb-b912bcd3b04a"
10+
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
911
MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
1012
MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
1113
MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692"
31.2 KB
Loading
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Use the per-tutorial environment defined by `Project.toml` in this folder
2+
joinpath(@__DIR__, "..", "..", "generate.jl") |> include
3+
generate(@__DIR__, execute = true)
File renamed without changes.
Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
# # Adult Income Prediction: Comparing Categorical Encoders
2+
3+
# **Julia version** is assumed to be 1.11.* (the bundled Manifest was resolved with Julia 1.11.5)
4+
5+
# This demonstration is available as a Jupyter notebook or julia script (as well as the dataset)
6+
# [here](https://github.com/essamwisam/MLJTransforms.jl/tree/main/docs/src/tutorials/adult_example).
7+
#
8+
# This tutorial compares different categorical encoding approaches on adult income prediction.
9+
# We'll test OneHot, Frequency, and Cardinality Reduction encoders with CatBoost classification.
10+
#
11+
# **Why compare encoders?** Categorical variables with many levels (like occupation, education)
12+
# can create high-dimensional sparse features. Different encoding strategies handle this
13+
# challenge differently, affecting both model performance and training speed.
14+
#
15+
# **High Cardinality Challenge:** We've added a synthetic feature with 500 categories to
16+
# demonstrate how encoders handle extreme cardinality - a common real-world scenario with
17+
# features like customer IDs, product codes, or geographical subdivisions.
18+
19+
# packages are already activated by generate.jl
20+
21+
using MLJ, MLJTransforms, DataFrames, ScientificTypes
22+
using Random, CSV, StatsBase, Plots, BenchmarkTools
23+
24+
# Import scitypes from MLJ to avoid any package version skew
25+
using MLJ: OrderedFactor, Continuous, Multiclass
26+
27+
# ## Load and Prepare Data
28+
# Load the Adult Income dataset. This dataset contains demographic information
29+
# and the task is to predict whether a person makes over $50K per year.
30+
31+
# Load data with header and rename columns to the expected symbols
32+
df = CSV.read("./adult.csv", DataFrame; header = true)
33+
rename!(
34+
df,
35+
[
36+
:age,
37+
:workclass,
38+
:fnlwgt,
39+
:education,
40+
:education_num,
41+
:marital_status,
42+
:occupation,
43+
:relationship,
44+
:race,
45+
:sex,
46+
:capital_gain,
47+
:capital_loss,
48+
:hours_per_week,
49+
:native_country,
50+
:income,
51+
],
52+
)
53+
54+
first(df, 5)
55+
56+
57+
# Clean the data by removing leading/trailing spaces and converting income to binary:
58+
for col in [:workclass, :education, :marital_status, :occupation, :relationship,
59+
:race, :sex, :native_country, :income]
60+
df[!, col] = strip.(string.(df[!, col]))
61+
end
62+
63+
# Convert income to binary (0 for <=50K, 1 for >50K)
64+
df.income = ifelse.(df.income .== ">50K", 1, 0)
65+
66+
# Let's add a high-cardinality categorical feature to showcase encoder handling
67+
# Create a realistic frequency distribution: A1-A3 make up 90% of data, A4-A500 make up 10%
68+
Random.seed!(42)
69+
high_card_categories = ["A$i" for i in 1:500]
70+
71+
n_rows = nrow(df)
72+
n_frequent = Int(round(0.9 * n_rows)) # 90% for A1, A2, A3
73+
n_rare = n_rows - n_frequent # 10% for A4-A500
74+
75+
frequent_samples = rand(["A1", "A2", "A3"], n_frequent)
76+
77+
rare_categories = ["A$i" for i in 4:500]
78+
rare_samples = rand(rare_categories, n_rare)
79+
80+
# Combine and shuffle
81+
all_samples = vcat(frequent_samples, rare_samples)
82+
df.high_cardinality_feature = all_samples[randperm(n_rows)]
83+
84+
# Coerce categorical columns to appropriate scientific types.
85+
# Apply explicit type coercions using fully qualified names
86+
type_dict = Dict(
87+
:income => OrderedFactor,
88+
:age => Continuous,
89+
:fnlwgt => Continuous,
90+
:education_num => Continuous,
91+
:capital_gain => Continuous,
92+
:capital_loss => Continuous,
93+
:hours_per_week => Continuous,
94+
:workclass => Multiclass,
95+
:education => Multiclass,
96+
:marital_status => Multiclass,
97+
:occupation => Multiclass,
98+
:relationship => Multiclass,
99+
:race => Multiclass,
100+
:sex => Multiclass,
101+
:native_country => Multiclass,
102+
:high_cardinality_feature => Multiclass,
103+
)
104+
df = coerce(df, type_dict)
105+
106+
# Let's examine the cardinality of our categorical features:
107+
categorical_cols = [:workclass, :education, :marital_status, :occupation,
108+
:relationship, :race, :sex, :native_country, :high_cardinality_feature]
109+
println("Cardinality of categorical features:")
110+
for col in categorical_cols
111+
n_unique = length(unique(df[!, col]))
112+
println(" $col: $n_unique unique values")
113+
end
114+
115+
116+
117+
# ## Split Data
118+
# Separate features (X) from target (y), then split into train/test sets:
119+
120+
y, X = unpack(df, ==(:income); rng = 123);
121+
train, test = partition(eachindex(y), 0.8, shuffle = true, rng = 100);
122+
123+
# ## Setup Encoders and Model
124+
# Load the required models and create different encoding strategies:
125+
126+
OneHot = @load OneHotEncoder pkg = MLJModels verbosity = 0
127+
CatBoostClassifier = @load CatBoostClassifier pkg = CatBoost
128+
129+
130+
# **Encoding Strategies:**
131+
# 1. **OneHotEncoder**: Creates binary columns for each category
132+
# 2. **FrequencyEncoder**: Replaces categories with their frequency counts
133+
# In case of the one-hot-encoder, we worry when categories have high cardinality as that would lead to an explosion in the number of features.
134+
135+
card_reducer = MLJTransforms.CardinalityReducer(
136+
min_frequency = 0.15,
137+
ordered_factor = true,
138+
label_for_infrequent = Dict(
139+
AbstractString => "OtherItems",
140+
Char => 'O',
141+
),
142+
)
143+
onehot_model = OneHot(drop_last = true, ordered_factor = true)
144+
freq_model = MLJTransforms.FrequencyEncoder(normalize = false, ordered_factor = true)
145+
cat = CatBoostClassifier();
146+
147+
# Create three different pipelines to compare:
148+
pipelines = [
149+
("CardRed + OneHot + CAT", card_reducer |> onehot_model |> cat),
150+
("OneHot + CAT", onehot_model |> cat),
151+
("FreqEnc + CAT", freq_model |> cat),
152+
]
153+
154+
# ## Evaluate Pipelines with Proper Benchmarking
155+
# Train each pipeline and measure both performance (accuracy) and training time using @btime:
156+
157+
results = DataFrame(pipeline = String[], accuracy = Float64[], training_time = Float64[]);
158+
159+
# Prepare results DataFrame
160+
161+
for (name, pipe) in pipelines
162+
println("Training and benchmarking: $name")
163+
164+
## Train once to compute accuracy
165+
mach = machine(pipe, X, y)
166+
MLJ.fit!(mach, rows = train)
167+
predictions = MLJ.predict_mode(mach, rows = test)
168+
accuracy_value = MLJ.accuracy(predictions, y[test])
169+
170+
## Measure training time using @belapsed (returns Float64 seconds) with 5 samples
171+
## Create a fresh machine inside the benchmark to avoid state sharing
172+
training_time =
173+
@belapsed MLJ.fit!(machine($pipe, $X, $y), rows = $train, force = true) samples = 5
174+
175+
println(" Training time (min over 5 samples): $(training_time) s")
176+
println(" Accuracy: $(round(accuracy_value, digits=4))\n")
177+
178+
push!(results, (string(name), accuracy_value, training_time))
179+
end
180+
181+
182+
# Sort by accuracy (higher is better) and display results:
183+
sort!(results, :accuracy, rev = true)
184+
results
185+
186+
# ## Visualization
187+
# Create side-by-side bar charts to compare both training time and model performance:
188+
189+
n = nrow(results)
190+
191+
# Create a simple timing visualization (training times from @belapsed are already Float64 seconds, so they plot directly)
192+
# Sort by accuracy (higher is better)
193+
sort!(results, :accuracy, rev = true)
194+
results # show table
195+
196+
# -------------------------
197+
# Visualization (side-by-side)
198+
# -------------------------
199+
n = nrow(results)
200+
# training time plot (seconds)
201+
time_plot = bar(1:n, results.training_time;
202+
xticks = (1:n, results.pipeline),
203+
title = "Training Time (s)",
204+
xlabel = "Pipeline", ylabel = "Time (s)",
205+
xrotation = 8,
206+
legend = false,
207+
color = :lightblue,
208+
)
209+
210+
# accuracy plot
211+
accuracy_plot = bar(1:n, results.accuracy;
212+
xticks = (1:n, results.pipeline),
213+
title = "Classification Accuracy",
214+
xlabel = "Pipeline", ylabel = "Accuracy",
215+
xrotation = 8,
216+
legend = false,
217+
ylim = (0.0, 1.0),
218+
color = :lightcoral,
219+
)
220+
221+
222+
combined_plot = plot(time_plot, accuracy_plot; layout = (1, 2), size = (1200, 500))
223+
224+
# Save the plot
225+
savefig(combined_plot, "adult_encoding_comparison.png"); #hide
226+
227+
#md # ![Adult Encoding Comparison](adult_encoding_comparison.png)
228+
229+
# ## Conclusion
230+
#
231+
# **Key Findings from Results:**
232+
#
233+
# **Training Time Performance (dramatic differences!):**
234+
# - **FreqEnc + CAT**: 0.32 seconds - **fastest approach**
235+
# - **CardRed + OneHot + CAT**: 0.57 seconds - **10x faster than pure OneHot**
236+
# - **OneHot + CAT**: 5.85 seconds - **significantly slower due to high cardinality**
237+
#
238+
# **Accuracy:** In this example, we don't see a difference in accuracy but the savings in time are big.
239+
240+
# Note that we still observe a speed improvement with the cardinality reducer if we omit the high-cardinality feature we added, but it's much smaller, as the Adult dataset is not that high in cardinality.

0 commit comments

Comments
 (0)