Skip to content

Commit 12098d8

Browse files
OkonSamuelablaom
andauthored
Prelim work (#3)
* add draft for RFE model * rename package * Add FeatureSelectore and some tests * fix current tests * complete RFE model and add tests * Update model docstring * fix code, Update readme and add more tests * Apply suggestions from code review Co-authored-by: Anthony Blaom, PhD <[email protected]> * rename n_features_to_select to n_features * update readme with * Apply suggestions from code review Co-authored-by: Anthony Blaom, PhD <[email protected]> * set max column limit to 92 in readme * add Aqua.jl tests and refactor code * update ci * Apply suggestions from code review Co-authored-by: Anthony Blaom, PhD <[email protected]> * fix bug, add support for serialization and add more tests * Update ci.yml * Update ci.yml * Update ci.yml * Update ci.yml * Update ci.yml --------- Co-authored-by: Anthony Blaom, PhD <[email protected]>
1 parent b37a299 commit 12098d8

File tree

13 files changed

+1029
-198
lines changed

13 files changed

+1029
-198
lines changed

.github/workflows/TagBot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@ jobs:
1212
- uses: JuliaRegistries/TagBot@v1
1313
with:
1414
token: ${{ secrets.GITHUB_TOKEN }}
15-
ssh: ${{ secrets.DOCUMENTER_KEY }}
15+
ssh: ${{ secrets.DOCUMENTER_KEY }}

.github/workflows/ci.yml

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,12 @@ jobs:
1313
test:
1414
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
1515
runs-on: ${{ matrix.os }}
16+
timeout-minutes: 60
1617
strategy:
1718
fail-fast: false
1819
matrix:
1920
version:
20-
- '1.0'
21+
- '1.6'
2122
- '1' # automatically expands to the latest stable 1.x release of Julia.
2223
os:
2324
- ubuntu-latest
@@ -29,6 +30,27 @@ jobs:
2930
with:
3031
version: ${{ matrix.version }}
3132
arch: ${{ matrix.arch }}
33+
- name: "Replace julia libstdcxx ubuntu + julia v1.6"
34+
shell: bash
35+
if: ${{ matrix.version == '1.6' && matrix.os == 'ubuntu-latest' }}
36+
# The following is needed for Julia <=1.8.3 on Linux OS
37+
# due to an old version of libstdcxx used by Julia
38+
# taken from https://github.com/hhaensel/ReplaceLibstdcxx.jl/blob/main/src/ReplaceLibstdcxx.jl
39+
run: |
40+
julia -e '
41+
libs = filter(x -> ! occursin("32", x), getindex.(split.(readlines(pipeline(`ldconfig -p`, `grep libstdc`)), r"\s*=>\s*"), 2))
42+
source_dir = dirname(libs[end])
43+
julia_lib_dir = joinpath(dirname(Sys.BINDIR), "lib", "julia")
44+
julia_lib_file = get(filter(endswith(r"libstdc\+\+.so\.\d+\.\d+\.\d+"), readdir(julia_lib_dir, join = true)), 1, nothing)
45+
julia_lib_version = match(r"so(\.\d+)\.", julia_lib_file).captures[1]
46+
source_lib = get(filter(endswith(r"libstdc\+\+.so\.\d+\.\d+\.\d+"), readdir(source_dir, join = true)), 1, nothing)
47+
julia_lib = joinpath(dirname(Sys.BINDIR), "lib", "julia", "libstdc++.so")
48+
for src in [julia_lib, julia_lib * julia_lib_version]
49+
islink(src) && rm(src, force = true)
50+
symlink(source_lib, src)
51+
@info read(`ls -al $src`, String)
52+
end
53+
'
3254
- uses: actions/cache@v1
3355
env:
3456
cache-name: cache-artifacts
@@ -65,19 +87,23 @@ jobs:
6587
end
6688
end
6789
event_name = "${{ github.event_name }}"
90+
ref = "${{ github.ref }}"
91+
ref_is_master = ref == "refs/heads/master"
92+
ref_is_dev = ref == "refs/heads/dev"
93+
ref_is_tag = startswith(ref, "refs/tags/")
6894
if event_name == "pull_request"
6995
base_ref = "${{ github.base_ref }}"
7096
head_ref = "${{ github.head_ref }}"
7197
base_repository = "${{ github.repository }}"
7298
head_repository = "${{ github.event.pull_request.head.repo.full_name }}"
73-
build_docs = (base_ref == "master") && (head_ref == "dev") && (base_repository == head_repository)
99+
is_not_fork = base_repository == head_repository
100+
build_docs = (base_ref == "master") && (head_ref == "dev") && (is_not_fork)
74101
elseif event_name == "push"
75-
ref = "${{ github.ref }}"
76-
build_docs = (ref == "refs/heads/master") || (startswith(ref, "refs/tags/"))
102+
build_docs = ref_is_master || ref_is_dev || ref_is_tag
77103
elseif event_name == "schedule"
78-
build_docs = ref == "refs/heads/master"
104+
build_docs = ref_is_master || ref_is_dev
79105
elseif event_name == "workflow_dispatch"
80-
build_docs = ref == "refs/heads/master"
106+
build_docs = ref_is_master || ref_is_dev
81107
else
82108
build_docs = false
83109
end

Project.toml

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,45 @@ authors = ["Anthony D. Blaom <[email protected]>"]
44
version = "0.1.0"
55

66
[deps]
7-
Example = "7876af07-990d-54b4-ab0e-23690620f79a"
87
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
98
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
9+
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
1010

1111
[compat]
12-
Example = "0.5"
13-
MLJModelInterface = "1"
14-
ScientificTypesBase = "1, 2, 3"
15-
julia = "1"
12+
Aqua = "0.8"
13+
Distributions = "0.25"
14+
julia = "1.6"
15+
MLJBase = "1.1"
16+
MLJTuning = "0.8"
17+
MLJDecisionTreeInterface = "0.4"
18+
MLJScikitLearnInterface = "0.6"
19+
MLJModelInterface = "1.4"
20+
ScientificTypesBase = "3"
21+
StableRNGs = "1"
22+
StatisticalMeasures = "0.1"
23+
Tables = "1.2"
24+
Test = "1.6"
1625

1726
[extras]
27+
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
1828
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
1929
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
30+
MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
31+
MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
32+
MLJScikitLearnInterface = "5ae90465-5518-4432-b9d2-8a1def2f0cab"
2033
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
34+
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
2135
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2236

2337
[targets]
24-
test = ["Distributions", "MLJBase", "StableRNGs", "Test"]
38+
test = [
39+
"Aqua",
40+
"Distributions",
41+
"MLJBase",
42+
"MLJTuning",
43+
"MLJDecisionTreeInterface",
44+
"MLJScikitLearnInterface",
45+
"StableRNGs",
46+
"StatisticalMeasures",
47+
"Test"
48+
]

README.md

Lines changed: 102 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,104 @@
11
# FeatureSelection.jl
22

3-
This repository is a template for creating repositories that contain
4-
glue code between (i) packages providing machine learning algorithms; and (ii)
5-
the machine learning toolbox
6-
[MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) - that is,
7-
for so-called *interface-only packages*.
8-
9-
## When to use this template
10-
11-
This template is intended for use when a package providing a machine
12-
learning model algorithm is not hosting the code that implements the
13-
MLJ model API, and a separate package for this purpose is to be
14-
created. This repo is itself a working implementation but should
15-
be used in conjunction with the more detailed [model implementation
16-
guidelines](https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/).
17-
18-
## How to use this template
19-
20-
1. Clone this repository or use it as a template if available from your organization.
21-
22-
2. Rename this repository, replacing the word "Example" with the name of the model-providing package.
23-
24-
1. Develop the contents of src/MLJExampleInterface.jl appropriately.
25-
26-
2. Rename src/MLJExampleInterface.jl appropriately.
27-
28-
3. Remove Example from Project.toml and instead add the model-providing package.
29-
30-
3. **GENERATE A NEW UUID in Project.toml** and change the Project.toml
31-
name and author appropriately.
32-
33-
1. You may want to remove the Distributions test dependency if you don't need it.
34-
35-
4. Replace every instance of "Example" in this README.md with the name
36-
of the model-providing package and adjust the organization name in
37-
the link.
38-
39-
5. Remove everything in this REAMDE.md except what is below the line
40-
you are currently reading &#128521;.
41-
42-
43-
# MLJ.jl <--> Example.jl
44-
45-
Repository implementing the [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) model interface for models provided by
46-
[Example.jl](https://github.com/JuliaLang/Example.jl).
47-
48-
| Linux | Coverage |
49-
| :------------ | :------- |
50-
| [![Build Status](https://github.com/JuliaAI/MLJExampleInterface.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/MLJExampleInterface.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/MLJExampleInterface.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/MLJExampleInterface.jl?branch=master) |
3+
| Linux | Coverage | Code Style
4+
| :------------ | :------- | :------------- |
5+
| [![Build Status](https://github.com/JuliaAI/FeatureSelection.jl/workflows/CI/badge.svg)](https://github.com/JuliaAI/FeatureSelection.jl/actions) | [![Coverage](https://codecov.io/gh/JuliaAI/FeatureSelection.jl/branch/master/graph/badge.svg)](https://codecov.io/github/JuliaAI/FeatureSelection.jl?branch=dev) | [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle) |
6+
7+
Repository housing feature selection algorithms for use with the machine learning toolbox
8+
[MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/).
9+
10+
`FeatureSelector` model builds on contributions originally residing at [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl/blob/v0.16.15/src/builtins/Transformers.jl#L189-L266)
11+
12+
# Installation
13+
On a running instance of Julia with at least version 1.6 run
14+
```julia
15+
import Pkg;
16+
Pkg.add("FeatureSelection")
17+
```
18+
19+
# Example Usage
20+
Lets build a supervised recursive feature eliminator with `RandomForestRegressor`
21+
from DecisionTree.jl as our base model.
22+
But first we need a dataset to train on. We shall create a synthetic dataset popularly
23+
known in the R community as the Friedman dataset #1. Notice how the target vector for this
24+
dataset depends on only the first five columns of feature table. So we expect that our
25+
recursive feature elimination should return the first five columns as important features.
26+
```julia
27+
using MLJ, FeatureSelection
28+
using StableRNGs
29+
rng = StableRNG(10)
30+
A = rand(rng, 50, 10)
31+
X = MLJ.table(A) # features
32+
y = @views(
33+
10 .* sin.(
34+
pi .* A[:, 1] .* A[:, 2]
35+
) .+ 20 .* (A[:, 3] .- 0.5).^ 2 .+ 10 .* A[:, 4] .+ 5 * A[:, 5]
36+
) # target
37+
```
38+
Now that we have our data we can create our recursive feature elimination model and
39+
train it on our dataset
40+
```julia
41+
RandomForestRegressor = @load RandomForestRegressor pkg=DecisionTree
42+
forest = RandomForestRegressor(rng=rng)
43+
rfe = RecursiveFeatureElimination(
44+
model = forest, n_features=5, step=1
45+
) # see docstring for description of defaults
46+
mach = machine(rfe, X, y)
47+
fit!(mach)
48+
```
49+
We can inspect the feature importances in two ways:
50+
```julia
51+
# A variable with lower rank has more significance than a variable with higher rank.
52+
# A variable with higher feature importance is better than a variable with lower
53+
# feature importance
54+
report(mach).ranking # returns [1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
55+
feature_importances(mach) # returns dict of feature => importance pairs
56+
```
57+
We can view the important features used by our model by inspecting the `fitted_params`
58+
object.
59+
```julia
60+
p = fitted_params(mach)
61+
p.features_left == [:x1, :x2, :x3, :x4, :x5]
62+
```
63+
We can also call the `predict` method on the fitted machine, to predict using a
64+
random forest regressor trained using only the important features, or call the `transform`
65+
method, to select just those features from some new table including all the original
66+
features. For more info, type `?RecursiveFeatureElimination` on a Julia REPL.
67+
68+
Okay, let's say that we didn't know that our synthetic dataset depends on only five
69+
columns from our feature table. We could apply cross fold validation
70+
`StratifiedCV(nfolds=5)` with our recursive feature elimination model to select the
71+
optimal value of `n_features` for our model. In this case we will use a simple Grid
72+
search with root mean square as the measure.
73+
```julia
74+
rfe = RecursiveFeatureElimination(model = forest)
75+
tuning_rfe_model = TunedModel(
76+
model = rfe,
77+
measure = rms,
78+
tuning = Grid(rng=rng),
79+
resampling = StratifiedCV(nfolds = 5),
80+
range = range(
81+
rfe, :n_features, values = 1:10
82+
)
83+
)
84+
self_tuning_rfe_mach = machine(tuning_rfe_model, X, y)
85+
fit!(self_tuning_rfe_mach)
86+
```
87+
As before we can inspect the important features by inspecting the object returned by
88+
`fitted_params` or `feature_importances` as shown below.
89+
```julia
90+
fitted_params(self_tuning_rfe_mach).best_fitted_params.features_left == [:x1, :x2, :x3, :x4, :x5]
91+
feature_importances(self_tuning_rfe_mach) # returns dict of feature => importance pairs
92+
```
93+
and call `predict` on the tuned model machine as shown below
94+
```julia
95+
Xnew = MLJ.table(rand(rng, 50, 10)) # create test data
96+
predict(self_tuning_rfe_mach, Xnew)
97+
```
98+
In this case, prediction is done using the best recursive feature elimination model gotten
99+
from the tuning process above.
100+
101+
For resampling methods different from cross-validation, and for other
102+
`TunedModel` options, such as parallelization, see the
103+
[Tuning Models](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/) section of the MLJ manual.
104+
[MLJ Documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/)

src/FeatureSelection.jl

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
module FeatureSelection

using MLJModelInterface, Tables, ScientificTypesBase

export FeatureSelector, RecursiveFeatureElimination

# Short alias used throughout the package for the MLJ model interface.
const MMI = MLJModelInterface

# Model implementations live in their own files under src/models/.
include("models/featureselector.jl")
include("models/rfe.jl")

# Register shared package-level metadata (name, UUID, URL, license) for every
# model type this package exposes, via broadcasting over the model tuple.
MMI.metadata_pkg.(
    (
        DeterministicRecursiveFeatureElimination,
        ProbabilisticRecursiveFeatureElimination,
        FeatureSelector
    ),
    package_name = "FeatureSelection",
    package_uuid = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6",
    package_url = "https://github.com/JuliaAI/FeatureSelection.jl",
    is_pure_julia = true,
    package_license = "MIT"
)

end # module

0 commit comments

Comments
 (0)