@@ -41,17 +41,14 @@ eval(:(const RFE{M} =
# Common keyword constructor for both model types
"""
- RecursiveFeatureElimination(model, n_features, step)
+ RecursiveFeatureElimination(model; n_features=0, step=1)

This model implements a recursive feature elimination algorithm for feature selection.
It recursively removes features, training a base model on the remaining features and
evaluating their importance until the desired number of features is selected.
-
- Construct an instance with default hyper-parameters using the syntax
- `rfe_model = RecursiveFeatureElimination(model=...)`. Provide keyword arguments to override
- hyper-parameter defaults.

# Training data
+
In MLJ or MLJBase, bind an instance `rfe_model` to data with
mach = machine(rfe_model, X, y)
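As a quick orientation for reviewers, here is a minimal sketch of the keyword form documented in this hunk; `rf` stands in for any already-constructed MLJ model that reports feature importances and is purely illustrative:

```julia
# Keyword constructor from the updated docstring;
# n_features=0 and step=1 are the documented defaults.
rfe_model = RecursiveFeatureElimination(rf; n_features=2, step=1)
```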
@@ -92,53 +89,62 @@ Train the machine using `fit!(mach, rows=...)`.
# Operations

- `transform(mach, X)`: transform the input table `X` into a new table containing only
- columns corresponding to features gotten from the RFE algorithm.
+ columns corresponding to features accepted by the RFE algorithm.

- `predict(mach, X)`: transform the input table `X` into a new table same as in
-
- - `transform(mach, X)` above and predict using the fitted base model on the
- transformed table.
+ `transform(mach, X)` above and predict using the fitted base model on the transformed
+ table.

# Fitted parameters
+
The fields of `fitted_params(mach)` are:
+
- `features_left`: names of features remaining after recursive feature elimination.

- `model_fitresult`: fitted parameters of the base model.

# Report
+
The fields of `report(mach)` are:
+
- `scores`: dictionary of scores for each feature in the training dataset.
- The model deems highly scored variables more significant.
+ The model deems highly scored variables more significant.

- `model_report`: report for the fitted base model.

# Examples
+
+ The following example assumes you have MLJDecisionTreeInterface in the active package
+ environment.
+
```
- using FeatureSelection, MLJ, StableRNGs
+ using MLJ

RandomForestRegressor = @load RandomForestRegressor pkg=DecisionTree

# Creates a dataset where the target only depends on the first 5 columns of the input table.
- A = rand(rng, 50, 10);
+ A = rand(50, 10);
y = 10 .* sin.(
pi .* A[:, 1] .* A[:, 2]
- ) + 20 .* (A[:, 3] .- 0.5).^ 2 .+ 10 .* A[:, 4] .+ 5 * A[:, 5]) ;
+ ) + 20 .* (A[:, 3] .- 0.5).^ 2 .+ 10 .* A[:, 4] .+ 5 * A[:, 5];
X = MLJ.table(A);

- # fit a rfe model
+ # fit a rfe model:
rf = RandomForestRegressor()
- selector = RecursiveFeatureElimination(model = rf )
+ selector = RecursiveFeatureElimination(rf, n_features=2)
mach = machine(selector, X, y)
fit!(mach)

# view the feature importances
feature_importances(mach)

- # predict using the base model
- Xnew = MLJ.table(rand(rng, 50, 10));
+ # predict using the base model trained on the reduced feature set:
+ Xnew = MLJ.table(rand(50, 10));
predict(mach, Xnew)

+ # transform data with all features to the reduced feature set:
+ transform(mach, Xnew)
```
"""
function RecursiveFeatureElimination(
@@ -173,7 +179,7 @@ function RecursiveFeatureElimination(
# This branch is hit just in case there are any models that support
# feature importance that aren't `<:Probabilistic` or `<:Deterministic`
# which is rare.
- throw(ERR_MODEL_TYPE)
+ throw(ERR_MODEL_TYPE)
end
message = MMI.clean!(selector)
isempty(message) || @warn(message)
@@ -214,22 +220,30 @@ abs_last(x::Pair{<:Any, <:Real}) = abs(last(x))
"""
score_features!(scores_dict, features, importances, n_features_to_score)

- Internal method that updates the `scores_dict` by increasing the score for each feature based on their
+ **Private method.**
+
+ Update the `scores_dict` by increasing the score for each feature based on their
importance and store the features in the `features` array.

# Arguments
- - `scores_dict::Dict{Symbol, Int}`: A dictionary where the keys are features and
+
+ - `scores_dict::Dict{Symbol, Int}`: A dictionary where the keys are features and
the values are their corresponding scores.
+
- `features::Vector{Symbol}`: An array to store the top features based on importance.
- - `importances::Vector{Pair(Symbol, <:Real)}}`: An array of tuples where each tuple
- contains a feature and its importance score.
+
+ - `importances::Vector{Pair{Symbol, <:Real}}`: An array of pairs where each pair
+ contains a feature and its importance score.
+
- `n_features_to_score::Int`: The number of top features to score and store.

# Notes
- Ensure that `n_features_to_score` is less than or equal to the minimum of the
+
+ Ensure that `n_features_to_score` is less than or equal to the minimum of the
lengths of `features` and `importances`.

# Example
+
```julia
scores_dict = Dict(:feature1 => 0, :feature2 => 0, :feature3 => 0)
features = [:x1, :x1, :x1]
@@ -244,7 +258,7 @@ features == [:feature1, :feature2, :x1]
function score_features!(scores_dict, features, importances, n_features_to_score)
for i in Base.OneTo(n_features_to_score)
ftr = first(importances[i])
- features[i] = ftr
+ features[i] = ftr
scores_dict[ftr] += 1
end
end
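A self-contained check of `score_features!` as defined above; the feature names and importance values here are invented purely for illustration:

```julia
scores = Dict(:age => 0, :height => 0, :weight => 0)
slots = [:tmp, :tmp, :tmp]
imps = [:weight => 0.8, :age => 0.3, :height => 0.1]  # already sorted by importance

score_features!(scores, slots, imps, 2)

scores  # Dict(:weight => 1, :age => 1, :height => 0) -- top two features scored
slots   # [:weight, :age, :tmp] -- first two slots overwritten
```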
@@ -273,7 +287,7 @@ function MMI.fit(selector::RFE, verbosity::Int, X, y, args...)
"n_features > number of features in training data, " *
"hence no feature will be eliminated."
)
- end
+ end
end

_step = selector.step
@@ -296,17 +310,17 @@ function MMI.fit(selector::RFE, verbosity::Int, X, y, args...)
verbosity > 0 && @info("Fitting estimator with $(n_features_to_keep) features.")
data = MMI.reformat(model, MMI.selectcols(X, features_left), args...)
fitresult, _, report = MMI.fit(model, verbosity - 1, data...)
- # Note that the MLJ feature importance API does not impose any restrictions on the
- # ordering of `feature => score` pairs in the `importances` vector.
+ # Note that the MLJ feature importance API does not impose any restrictions on the
+ # ordering of `feature => score` pairs in the `importances` vector.
# Therefore, the order of `feature => score` pairs in the `importances` vector
- # might differ from the order of features in the `features` vector, which is
+ # might differ from the order of features in the `features` vector, which is
# extracted from the feature matrix `X` above. Hence the need for a dictionary
# implementation.
importances = MMI.feature_importances(
selector.model,
fitresult,
report
- )
+ )

# Eliminate the worst features and increase the score of the remaining features
sort!(importances, by=abs_last, rev=true)
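To make the comment about ordering concrete, here is a small, invented illustration of the sorting step above, using `abs_last` as defined earlier in this file:

```julia
abs_last(x::Pair{<:Any, <:Real}) = abs(last(x))  # repeated here so the snippet runs standalone

importances = [:x1 => 0.2, :x3 => -0.7, :x2 => 0.9]   # arbitrary order, as the comment warns
sort!(importances, by=abs_last, rev=true)
importances == [:x2 => 0.9, :x3 => -0.7, :x1 => 0.2]  # ranked by absolute score, largest first
```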
MMI.load_path(::Type{<:RFE}) = "FeatureSelection.RecursiveFeatureElimination"
MMI.constructor(::Type{<:RFE}) = RecursiveFeatureElimination
MMI.package_name(::Type{<:RFE}) = "FeatureSelection"
+ MMI.is_wrapper(::Type{<:RFE}) = true

for trait in [
:supports_weights,