Commit 2c79259

fix bug, add support for serialization and add more tests
1 parent eed3af1 commit 2c79259

6 files changed: +221 additions, -50 deletions

Project.toml

Lines changed: 17 additions & 1 deletion
```diff
@@ -13,20 +13,36 @@ Aqua = "0.8"
 Distributions = "0.25"
 julia = "1.6"
 MLJBase = "1.1"
+MLJTuning = "0.8"
 MLJDecisionTreeInterface = "0.4"
+MLJScikitLearnInterface = "0.6"
 MLJModelInterface = "1.4"
 ScientificTypesBase = "3"
 StableRNGs = "1"
+StatisticalMeasures = "0.1"
 Tables = "1.2"
 Test = "1.6"

 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
 MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
+MLJScikitLearnInterface = "5ae90465-5518-4432-b9d2-8a1def2f0cab"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [targets]
-test = ["Aqua", "Distributions", "MLJBase", "MLJDecisionTreeInterface", "StableRNGs", "Test"]
+test = [
+    "Aqua",
+    "Distributions",
+    "MLJBase",
+    "MLJTuning",
+    "MLJDecisionTreeInterface",
+    "MLJScikitLearnInterface",
+    "StableRNGs",
+    "StatisticalMeasures",
+    "Test"
+]
```

README.md

Lines changed: 22 additions & 16 deletions
````diff
@@ -1,8 +1,8 @@
 # FeatureSelection.jl

-| Linux | Coverage |
-| :------------ | :------- |
-| [![Build Status](https://github.com/JuliaAI/FeatureSelection.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/FeatureSelection.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/FeatureSelection.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/FeatureSelection.jl?branch=dev) |
+| Linux | Coverage | Code Style
+| :------------ | :------- | :------------- |
+| [![Build Status](https://github.com/JuliaAI/FeatureSelection.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/FeatureSelection.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/FeatureSelection.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/FeatureSelection.jl?branch=dev) | [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle) |

 Repository housing feature selection algorithms for use with the machine learning toolbox
 [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/).
@@ -26,20 +26,20 @@ recursive feature elimination should return the first columns as important featu
 ```julia
 using MLJ, FeatureSelection
 using StableRNGs
-rng = StableRNG(123)
+rng = StableRNG(10)
 A = rand(rng, 50, 10)
 X = MLJ.table(A) # features
 y = @views(
     10 .* sin.(
         pi .* A[:, 1] .* A[:, 2]
-    ) + 20 .* (A[:, 3] .- 0.5).^ 2 .+ 10 .* A[:, 4] .+ 5 * A[:, 5]
+    ) .+ 20 .* (A[:, 3] .- 0.5).^ 2 .+ 10 .* A[:, 4] .+ 5 * A[:, 5]
 ) # target
 ```
 Now we that we have our data we can create our recursive feature elimination model and
 train it on our dataset
 ```julia
 RandomForestRegressor = @load RandomForestRegressor pkg=DecisionTree
-forest = RandomForestRegressor()
+forest = RandomForestRegressor(rng=rng)
 rfe = RecursiveFeatureElimination(
     model = forest, n_features=5, step=1
 ) # see doctring for description of defaults
@@ -48,24 +48,28 @@ fit!(mach)
 ```
 We can inspect the feature importances in two ways:
 ```julia
+# A variable with lower rank has more significance than a variable with higher rank.
+# A variable with Higher feature importance is better than a variable with lower
+# feature importance
 report(mach).ranking # returns [1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-feature_importances(mach) # returns dict of feature => rank pairs
+feature_importances(mach) # returns dict of feature => importance pairs
 ```
-We can view the important features used by our model by inspecting the `fitted_params` object.
+We can view the important features used by our model by inspecting the `fitted_params`
+object.
 ```julia
 p = fitted_params(mach)
 p.features_left == [:x1, :x2, :x3, :x4, :x5]
 ```
 We can also call the `predict` method on the fitted machine, to predict using a
 random forest regressor trained using only the important features, or call the `transform`
-method, to select just those features from some new table including all the original features.
-For more info, type `?RecursiveFeatureElimination` on a Julia REPL.
+method, to select just those features from some new table including all the original
+features. For more info, type `?RecursiveFeatureElimination` on a Julia REPL.

 Okay, let's say that we didn't know that our synthetic dataset depends on only five
-columns from our feature table. We could apply cross fold validation `CV(nfolds=5)` with
-our recursive feature elimination model to select the optimal value of
-`n_features` for our model. In this case we will use a simple Grid search with root mean
-square as the measure.
+columns from our feature table. We could apply cross fold validation
+`StratifiedCV(nfolds=5)` with our recursive feature elimination model to select the
+optimal value of `n_features` for our model. In this case we will use a simple Grid
+search with root mean square as the measure.
 ```julia
 rfe = RecursiveFeatureElimination(model = forest)
 tuning_rfe_model = TunedModel(
@@ -74,15 +78,17 @@ tuning_rfe_model = TunedModel(
     tuning = Grid(rng=rng),
     resampling = StratifiedCV(nfolds = 5),
     range = range(
-        rfe, :n_features, lower = 1, upper=10, unit=1
+        rfe, :n_features, values = 1:10
     )
 )
 self_tuning_rfe_mach = machine(tuning_rfe_model, X, y)
 fit!(self_tuning_rfe_mach)
 ```
-As before we can inspect the important features by inspecting the `fitted_params` object.
+As before we can inspect the important features by inspecting the object returned by
+`fitted_params` or `feature_importances` as shown below.
 ```julia
 fitted_params(self_tuning_rfe_mach).best_fitted_params.features_left == [:x1, :x2, :x3, :x4, :x5]
+feature_importances(self_tuning_rfe_mach) # returns dict of feature => importance pairs
 ```
 and call `predict` on the tuned model machine as shown below
 ```julia
````
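For illustration only (not part of this diff): assuming the usual MLJTuning accessors `best_model` and `best_history_entry` are available on the tuned machine, a reader following the README example could inspect the winning value of `n_features` and then predict as with any other machine:

```julia
# Hypothetical follow-up to the README example above (not in this commit); assumes
# the standard MLJTuning report/fitted_params fields `best_model` and
# `best_history_entry` exist for `self_tuning_rfe_mach`.
best_rfe = fitted_params(self_tuning_rfe_mach).best_model
best_rfe.n_features                      # n_features chosen by the grid search

entry = report(self_tuning_rfe_mach).best_history_entry
entry.measurement                        # measure (root mean square) for the best model

predict(self_tuning_rfe_mach, X)         # predict with the tuned machine, as usual
```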

src/models/rfe.jl

Lines changed: 56 additions & 24 deletions
```diff
@@ -202,34 +202,35 @@ function MMI.fit(selector::RFE, verbosity::Int, X, y, args...)
     )

     # Compute required number of features to select
-    n_features = selector.n_features # Remember to modify this estimate later
+    n_features_select = selector.n_features
     ## zero indicates that half of the features be selected.
-    if n_features == 0
-        n_features = div(nfeatures, 2)
-    elseif 0 < n_features < 1
-        n_features = round(Int, n_features * n_features)
+    if n_features_select == 0
+        n_features_select = div(nfeatures, 2)
+    elseif 0 < n_features_select < 1
+        n_features_select = round(Int, n_features_select * nfeatures)
     else
-        n_features = round(Int, n_features)
+        n_features_select = round(Int, n_features_select)
     end

     step = selector.step

     if 0 < step < 1
-        step = round(Int, max(1, step * n_features))
+        step = round(Int, max(1, step * n_features_select))
     else
         step = round(Int, step)
     end

     support = trues(nfeatures)
-    ranking = ones(nfeatures) # every feature has equal rank initially
-    indexes = axes(support, 1)
+    ranking = ones(Int, nfeatures) # every feature has equal rank initially
+    mask = trues(nfeatures) # for boolean indexing of ranking vector in while loop below

     # Elimination
-    features_left = copy(features)
-    while sum(support) > n_features
+    features_left = features
+    n_features_left = length(features_left)
+    while n_features_left > n_features_select
         # Rank the remaining features
         model = selector.model
-        verbosity > 0 && @info("Fitting estimator with $(sum(support)) features.")
+        verbosity > 0 && @info("Fitting estimator with $(n_features_left) features.")

         data = MMI.reformat(model, MMI.selectcols(X, features_left), args...)

@@ -249,24 +250,25 @@ function MMI.fit(selector::RFE, verbosity::Int, X, y, args...)
         ranks = sortperm(importances)

         # Eliminate the worse features
-        threshold = min(step, sum(support) - n_features)
-
-        support[indexes[ranks][1:threshold]] .= false
-        ranking[.!support] .+= 1
+        threshold = min(step, n_features_left - n_features_select)
+        @views(support[support][ranks[1:threshold]]) .= false
+        mask .= support .== false
+        @views(ranking[mask]) .+= 1

         # Remaining features
-        features_left = @view(features[support])
+        features_left = features[support]
+        n_features_left = length(features_left)
     end

     # Set final attributes
     data = MMI.reformat(selector.model, MMI.selectcols(X, features_left), args...)
-    verbosity > 0 && @info ("Fitting estimator with $(sum(support)) features.")
+    verbosity > 0 && @info ("Fitting estimator with $(n_features_left) features.")
     model_fitresult, _, model_report = MMI.fit(selector.model, verbosity - 1, data...)

     fitresult = (
         support = support,
         model_fitresult = model_fitresult,
-        features_left = copy(features_left),
+        features_left = features_left,
         features = features
     )
     report = (
@@ -280,7 +282,7 @@ end

 function MMI.fitted_params(model::RFE, fitresult)
     (
-        features_left = fitresult.features_left,
+        features_left = copy(fitresult.features_left),
         model_fitresult = MMI.fitted_params(model.model, fitresult.model_fitresult)
     )
 end
@@ -295,15 +297,45 @@ function MMI.transform(::RFE, fitresult, X)
     sch = Tables.schema(Tables.columns(X))
     if (length(fitresult.features) == length(sch.names) &&
         !all(e -> e in sch.names, fitresult.features))
-    throw(
-        ERR_FEATURES_SEEN
-    )
+        throw(
+            ERR_FEATURES_SEEN
+        )
     end
     return MMI.selectcols(X, fitresult.features_left)
 end

 function MMI.feature_importances(::RFE, fitresult, report)
-    return Pair.(fitresult.features, report.ranking)
+    return Pair.(fitresult.features, Iterators.reverse(report.ranking))
+end
+
+function MMI.save(model::RFE, fitresult)
+    support = fitresult.support
+    atomic_fitresult = fitresult.model_fitresult
+    features_left = fitresult.features_left
+    features = fitresult.features
+
+    atom = model.model
+    return (
+        support = copy(support),
+        model_fitresult = MMI.save(atom, atomic_fitresult),
+        features_left = copy(features_left),
+        features = copy(features)
+    )
+end
+
+function MMI.restore(model::RFE, serializable_fitresult)
+    support = serializable_fitresult.support
+    atomic_serializable_fitresult = serializable_fitresult.model_fitresult
+    features_left = serializable_fitresult.features_left
+    features = serializable_fitresult.features
+
+    atom = model.model
+    return (
+        support = support,
+        model_fitresult = MMI.restore(atom, atomic_serializable_fitresult),
+        features_left = features_left,
+        features = features
+    )
 end

 ## Traits definitions
```
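The new `MMI.save`/`MMI.restore` methods delegate to the wrapped (atomic) model, so a fitted recursive feature elimination result can be serialized even when the atomic model needs special handling. As a minimal sketch (not taken from this diff, and assuming `rfe`, `X` and `y` from the README example are in scope), the round trip at the MLJModelInterface level looks roughly like this:

```julia
# Minimal sketch, not from this commit. Assumes `rfe = RecursiveFeatureElimination(...)`
# and the table `X` / target `y` from the README example are already defined.
using MLJModelInterface
const MMI = MLJModelInterface

fitresult, cache, report = MMI.fit(rfe, 0, X, y)

serializable = MMI.save(rfe, fitresult)        # atomic fitresult converted via MMI.save(atom, ...)
restored     = MMI.restore(rfe, serializable)  # atomic fitresult rebuilt via MMI.restore(atom, ...)

MMI.transform(rfe, restored, X)                # selected columns, same as before saving
```

In an MLJ workflow these methods are not usually called directly; they are invoked behind the scenes when a fitted machine is saved and reloaded (for example with `MLJBase.save` and `machine`).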

test/models/dummy_test_models.jl

Lines changed: 46 additions & 4 deletions
```diff
@@ -1,6 +1,7 @@
 module DummyTestModels

 using MLJBase
+using Distributions

 ## THE CONSTANT DETERMINISTIC REGRESSOR (FOR TESTING)
 ##
@@ -17,9 +18,50 @@ end
 MLJBase.reformat(::DeterministicConstantRegressor, X) = (MLJBase.matrix(X),)
 MLJBase.reformat(::DeterministicConstantRegressor, X, y) = (MLJBase.matrix(X), y)
 MLJBase.selectrows(::DeterministicConstantRegressor, I, A) = (view(A, I, :),)
-MLJBase.selectrows(::DeterministicConstantRegressor, I, A, y) =
-    (view(A, I, :), y[I])
+function MLJBase.selectrows(::DeterministicConstantRegressor, I, A, y)
+    return (view(A, I, :), y[I])
+end
+
+function MLJBase.predict(::DeterministicConstantRegressor, fitresult, Xnew)
+    return fill(fitresult, nrows(Xnew))
+end
+
+## THE EphemeralClassifier (FOR TESTING)
+## Define a Deterministic Classifier with non-persistent `fitresult`, but which addresses
+## this by overloading `save`/`restore`:
+struct EphemeralClassifier <: MLJBase.Deterministic end
+thing = []
+
+function MLJBase.fit(::EphemeralClassifier, verbosity, X, y)
+    # if I serialize/deserialized `thing` then `id` below changes:
+    id = objectid(thing)
+    p = Distributions.fit(UnivariateFinite, y)
+    fitresult = (thing, id, p)
+    report = (features = MLJBase.schema(X).names,)
+    return fitresult, nothing, report
+end
+
+function MLJBase.predict(::EphemeralClassifier, fitresult, X)
+    thing, id, p = fitresult
+    id == objectid(thing) || throw(ErrorException("dead fitresult"))
+    return [mode(p) for _ in 1:MLJBase.nrows(X)]
+end
+
+function MLJBase.feature_importances(model::EphemeralClassifier, fitresult, report)
+    return [ftr => 1.0 for ftr in report.features]
+end
+
+MLJBase.target_scitype(::Type{<:EphemeralClassifier}) = AbstractVector{OrderedFactor{2}}
+MLJBase.reports_feature_importances(::Type{<:EphemeralClassifier}) = true
+
+function MLJBase.save(::EphemeralClassifier, fitresult)
+    thing, _, p = fitresult
+    return (thing, p)
+end
+function MLJBase.restore(::EphemeralClassifier, serialized_fitresult)
+    thing, p = serialized_fitresult
+    id = objectid(thing)
+    return (thing, id, p)
+end

-MLJBase.predict(::DeterministicConstantRegressor, fitresult, Xnew) =
-    fill(fitresult, nrows(Xnew))
 end
```
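To make clear what `EphemeralClassifier` is for: its `save` method strips the non-persistent `id` from the fitresult, and `restore` recomputes it, so a restored fitresult passes the `objectid` check in `predict`. A rough sketch of the round trip such a test model is meant to exercise (not taken from this diff, and assuming the module above has been `include`d) is:

```julia
# Rough sketch of a round-trip check, not from this commit. Assumes
# `DummyTestModels` (above) is available in scope.
using MLJBase

X = (x1 = rand(8), x2 = rand(8))
y = coerce(rand(["a", "b"], 8), OrderedFactor)   # matches the declared target_scitype

model = DummyTestModels.EphemeralClassifier()
fitresult, _, report = MLJBase.fit(model, 0, X, y)

serialized = MLJBase.save(model, fitresult)      # drops the ephemeral `id`
restored   = MLJBase.restore(model, serialized)  # recomputes `id = objectid(thing)`

MLJBase.predict(model, restored, X)              # objectid check passes, so no "dead fitresult" error
```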
