⭐️ Add basic extension features for entity embedders

EssamWisam · EssamWisam · commit 060afd9f8565 · 2024-11-22T21:31:08.000-06:00
diff --git a/Project.toml b/Project.toml
@@ -19,9 +19,21 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
+[weakdeps]
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845"
+
+[extensions]
+EntityEmbeddingsExt = ["MLJFlux", "Optimisers", "MLJBase", "Flux"]
+
 [compat]
 CategoricalArrays = "0.10"
+Flux = "0.14.25"
+MLJFlux = "0.6.0"
 MLJModelInterface = "1.11"
+Optimisers = "0.3.4"
 ScientificTypes = "3.0"
 StatsBase = "0.34"
 TableOperations = "1.2"
@@ -30,11 +42,11 @@ julia = "1.6.7"
 
 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["Test", "DataFrames", "MLJBase", "Random", "StableRNGs", "StatsModels"]
diff --git a/ext/EntityEmbeddingsExt.jl b/ext/EntityEmbeddingsExt.jl
@@ -0,0 +1,127 @@
+module EntityEmbeddingsExt
+
+using MLJFlux
+using Tables
+using ScientificTypes
+using MLJModelInterface
+using TableOperations
+using Optimisers
+using Flux
+using MLJBase
+using MLJTransforms
+using MLJTransforms: EntityEmbedder
+const MMI = MLJModelInterface
+
+# activations
+"""
+    get_activation(func_symbol::Symbol)
+
+Return whatever is bound to `func_symbol` inside the `Flux` module, e.g.
+`:relu` yields `Flux.relu`.
+
+The lookup uses `isdefined` instead of `hasproperty`. `hasproperty` on a module
+goes through `propertynames`, which lists exported names only, so names that
+Flux merely inherits from `Base` (for example `:tanh` or `:identity`) were
+rejected even though `Flux.tanh` resolves fine.
+
+# Throws
+- `ErrorException` when `func_symbol` is not bound in `Flux`.
+"""
+function MLJTransforms.get_activation(func_symbol::Symbol)
+    isdefined(Flux, func_symbol) ||
+        error("Function $func_symbol not found in Flux.")
+    return getproperty(Flux, func_symbol)
+end
+
+"""
+    entity_embedder_fit(X, y, features = Symbol[]; kwargs...)
+
+Train an MLJFlux neural network on `(X, y)` and return a cache holding the
+fitted machine together with the embedding matrices extracted from it.
+
+# Arguments
+- `X`: a Tables.jl-compatible table.
+- `y`: the target; its element scitype decides between regression
+  (`Continuous`/`Count`) and classification (`Multiclass`/`OrderedFactor`).
+- `features`: column names that are either excluded or exclusively kept,
+  depending on `ignore`.
+
+# Keywords
+- `ignore = true`: when `true`, the columns in `features` are dropped and all
+  others kept; when `false`, only the columns in `features` are kept.
+- `hidden_layer_sizes = (5,)`: passed as `hidden` to `MLJFlux.MLP`.
+- `activation = :relu`: resolved with `MLJTransforms.get_activation` and passed
+  as `σ` to `MLJFlux.MLP`.
+- `epochs = 100`, `batch_size = 32`: forwarded to the MLJFlux model.
+- `learning_rate = 0.01`: passed to `Optimisers.Adam`.
+- `embedding_dims`: any `AbstractDict` from `Symbol` to a `Real`; converted to
+  `Dict{Symbol, Real}` before being forwarded to the MLJFlux model.
+- `verbosity = 0`: passed to `fit!`.
+- `kwargs...`: forwarded unchanged to the MLJFlux model constructor.
+
+# Returns
+A named tuple with fields `mapping_matrices`, `ordinal_mappings`, `task`
+(`:Regression` or `:Classification`) and `machine`.
+
+# Throws
+- `ErrorException` if the target's element scitype is none of the supported ones.
+"""
+function MLJTransforms.entity_embedder_fit(
+    X,
+    y,
+    features::AbstractVector{Symbol} = Symbol[];
+    ignore::Bool = true,
+    hidden_layer_sizes::Tuple{Vararg{Int}} = (5,),
+    activation::Symbol = :relu,
+    epochs = 100,
+    batch_size = 32,
+    learning_rate = 0.01,
+    embedding_dims::AbstractDict{Symbol, <:Real} = Dict{Symbol, Real}(),
+    verbosity::Int = 0,
+    kwargs...,
+)
+
+    # Figure out task from the element scitype of the target
+    y_scitype = elscitype(y)
+    is_regression = y_scitype <: Continuous || y_scitype <: Count
+    is_classification = y_scitype <: Multiclass || y_scitype <: OrderedFactor
+    (is_regression || is_classification) || error(
+        "Your target must be Continuous/Count for regression or Multiclass/OrderedFactor for classification",
+    )
+    task = is_regression ? :Regression : :Classification
+
+    # Handle ignore and given feat names
+    feat_names_org = Tables.schema(X).names
+    feat_names =
+        ignore ? setdiff(feat_names_org, features) : intersect(feat_names_org, features)
+
+    # Positions (within `feat_names`) of the categorical columns. `setdiff` and
+    # `intersect` return unique names, so the running index from `enumerate`
+    # equals what a `findfirst` search per name would give. Columns are read
+    # through `Tables.columns`, the accessor `Tables.getcolumn` is specified for.
+    cols = Tables.columns(X)
+    feat_inds_cat = Int[
+        ind for (ind, name) in enumerate(feat_names) if
+        elscitype(Tables.getcolumn(cols, name)) <: Finite
+    ]
+
+    # Select only the relevant columns in `X` based on `feat_names`
+    X = X |> TableOperations.select(feat_names...) |> Tables.columntable
+
+    # Setup builder
+    builder = MLJFlux.MLP(;
+        hidden = hidden_layer_sizes,
+        σ = MLJTransforms.get_activation(activation),
+    )
+
+    # Both MLJFlux models take the same keywords, so pick the type first and
+    # construct once.
+    model_type =
+        (task == :Classification) ? MLJFlux.NeuralNetworkClassifier :
+        MLJFlux.NeuralNetworkRegressor
+    clf = model_type(;
+        builder = builder,
+        optimiser = Optimisers.Adam(learning_rate),
+        batch_size = batch_size,
+        epochs = epochs,
+        # `Dict` is invariant in its value type: normalise so that e.g.
+        # `Dict(:a => 2)` is accepted as well as `Dict{Symbol, Real}`.
+        embedding_dims = convert(Dict{Symbol, Real}, embedding_dims),
+        kwargs...,
+    )
+
+    # Fit the model
+    mach = machine(clf, X, y)
+    fit!(mach, verbosity = verbosity)
+
+    # Get mappings
+    mapping_matrices = MLJFlux.get_embedding_matrices(
+        fitted_params(mach).chain,
+        feat_inds_cat,
+        feat_names,
+    )
+    # NOTE(review): relies on the third entry of MLJFlux's fit result being the
+    # ordinal mappings -- confirm against the MLJFlux version in [compat].
+    ordinal_mappings = mach.fitresult[3]
+    cache = (
+        mapping_matrices = mapping_matrices,
+        ordinal_mappings = ordinal_mappings,
+        task = task,
+        machine = mach,
+    )
+    return cache
+end
+
+
+"""
+    entity_embedder_transform(X, cache)
+
+Transform `X` with the fitted machine stored under `:machine` in `cache` and
+return the result of `MLJFlux.transform`.
+
+`cache` is expected to be the value returned by `entity_embedder_fit`. The
+replacement of categorical levels by their learned embedding vectors is
+presumably carried out inside MLJFlux; nothing beyond the delegation happens
+here.
+"""
+function MLJTransforms.entity_embedder_transform(X, cache)
+    return MLJFlux.transform(cache[:machine], X)
+end
+
+include("EntityEmbeddingsInterface.jl")
+
+
+end
diff --git a/newmeh/Project.toml b/newmeh/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845"
+MLJTransforms = "23777cdb-d90c-4eb0-a694-7c2b83d5c1d6"
+Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
diff --git a/src/MLJTransforms.jl b/src/MLJTransforms.jl
@@ -10,12 +10,13 @@ using LinearAlgebra
 
 # Other transformers
 using Combinatorics
-import Distributions
+using Distributions: Distributions
 using Parameters
 using Dates
 using OrderedCollections
 
 
+
 const MMI = MLJModelInterface
 
 # Functions of generic use across transformers
@@ -26,27 +27,27 @@ include("utils.jl")
 include("encoders/target_encoding/errors.jl")
 include("encoders/target_encoding/target_encoding.jl")
 include("encoders/target_encoding/interface_mlj.jl")
-export  TargetEncoder
+export TargetEncoder
 
 # Ordinal encoding
 include("encoders/ordinal_encoding/ordinal_encoding.jl")
 include("encoders/ordinal_encoding/interface_mlj.jl")
-export  OrdinalEncoder
+export OrdinalEncoder
 
 # Frequency encoding
 include("encoders/frequency_encoding/frequency_encoding.jl")
 include("encoders/frequency_encoding/interface_mlj.jl")
 export frequency_encoder_fit, frequency_encoder_transform, FrequencyEncoder
-export  FrequencyEncoder
+export FrequencyEncoder
 
 # Cardinality reduction
 include("transformers/cardinality_reducer/cardinality_reducer.jl")
 include("transformers/cardinality_reducer/interface_mlj.jl")
 export cardinality_reducer_fit, cardinality_reducer_transform, CardinalityReducer
-export  CardinalityReducer
+export CardinalityReducer
 include("encoders/missingness_encoding/missingness_encoding.jl")
 include("encoders/missingness_encoding/interface_mlj.jl")
-export  MissingnessEncoder
+export MissingnessEncoder
 
 # Contrast encoder
 include("encoders/contrast_encoder/contrast_encoder.jl")
@@ -69,3 +70,25 @@ export UnivariateDiscretizer,
     OneHotEncoder, ContinuousEncoder, FillImputer, UnivariateFillImputer,
     UnivariateTimeTypeToContinuous, InteractionTransformer
 end
+
+# For the extension
+function get_activation end
+function entity_embedder_fit end
+function entity_embedder_transform end
+
+mutable struct EntityEmbedder{AS <: AbstractVector{Symbol},
+    TV <: Tuple{Vararg{Int}},
+    I1 <: Integer, I2 <: Integer, AF <: AbstractFloat,
+    DSR <: Dict{Symbol, Real}, I3 <: Int} <: Unsupervised
+    features::AS
+    ignore::Bool
+    hidden_layer_sizes::TV
+    activation::Symbol
+    epochs::I1
+    batch_size::I2
+    learning_rate::AF
+    embedding_dims::DSR
+    verbosity::I3
+end
+
+function EntityEmbedder end