Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ Cargo.lock
*.tar.gz
.asv
.idea
benchmarks/
17 changes: 15 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@ keywords = ["machine-learning", "random-forest", "decision-tree", "ensemble", "t
name = "biosphere"

[dependencies]
ndarray = "0.17.1"
ndarray = "0.17.2"
rand = "0.9"
rayon = "1.11"
serde = { version = "1", features = ["derive"], optional = true }

[features]
serde = ["dep:serde"]

[profile.bench]
incremental = true
Expand All @@ -25,11 +29,11 @@ lto = "fat"

[dev-dependencies]
rstest = "0.26"
ndarray-rand = "0.16"
csv = "^1"
ndarray-csv = "^0.5"
criterion = "0.8"
assert_approx_eq = "1.1"
postcard = { version = "1", features = ["use-std"] }

[[bench]]
name = "bench_utils"
Expand All @@ -42,3 +46,12 @@ harness = false
[[bench]]
name = "bench_tree"
harness = false

[[bench]]
name = "bench_ops"
harness = false

[[bench]]
name = "bench_tree_serde"
harness = false
required-features = ["serde"]
31 changes: 30 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,35 @@ Random forests with a runtime of `O(n d log(n) + n_estimators d n max_depth)` in

`biosphere` is available as a rust crate and as a Python package.

## Serialize / deserialize a `DecisionTree`

Enable the `serde` feature and choose a serde format (here: `postcard`):

```toml
# Cargo.toml
biosphere = { version = "0.4.2", features = ["serde"] }
postcard = { version = "1", features = ["use-std"] }
```

```rust
use biosphere::DecisionTree;

let X = ndarray::array![[0.0], [1.0], [2.0], [3.0]];
let y = ndarray::array![0.0, 0.0, 1.0, 1.0];

let mut tree = DecisionTree::default();
tree.fit(&X.view(), &y.view());

// serialize the tree to bytes
let bytes = postcard::to_stdvec(&tree).unwrap();
// deserialize the tree from bytes
let restored: DecisionTree = postcard::from_bytes(&bytes).unwrap();

assert_eq!(tree.predict(&X.view()), restored.predict(&X.view()));
```

In this repo you can run: `cargo run --example decision_tree_serde --features serde`.

## Benchmarks

Ran on an M1 Pro with `n_jobs=4`. Wall-time to fit a Random Forest including OOB score with 400 trees to
Expand All @@ -16,4 +45,4 @@ features.
| model | 1000 | 2000 | 4000 | 8000 | 16000 | 32000 | 64000 | 128000 | 256000 | 512000 | 1024000 | 2048000 |
|:-------------|:-------|:-------|:-------|:-------|:--------|:--------|:--------|:---------|:---------|:---------|:----------|:----------|
| biosphere | 0.04s | 0.08s | 0.15s | 0.32s | 0.65s | 1.40s | 2.97s | 6.48s | 15.53s | 37.91s | 96.69s | 231.82s |
| scikit-learn | 0.28s | 0.34s | 0.46s | 0.69s | 1.23s | 2.47s | 4.99s | 10.49s | 22.11s | 51.04s | 118.95s | 271.03s |
| scikit-learn | 0.28s | 0.34s | 0.46s | 0.69s | 1.23s | 2.47s | 4.99s | 10.49s | 22.11s | 51.04s | 118.95s | 271.03s |
24 changes: 9 additions & 15 deletions benches/bench_forest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@ use biosphere::{RandomForest, RandomForestParameters};

#[cfg(test)]
use criterion::{criterion_group, criterion_main, Criterion};
use ndarray::{s, Array, Array1, Array2};
use ndarray_rand::rand_distr::{Bernoulli, Uniform};
use ndarray_rand::RandomExt;
use ndarray::{s, Array1, Array2};
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
Expand All @@ -15,25 +13,21 @@ pub fn data(n: usize, d: usize, rng: &mut impl Rng) -> (Array2<f64>, Array1<f64>

for i in 0..d {
if i % 2 == 0 {
X.slice_mut(s![.., i]).assign(&Array::random_using(
(n,),
Uniform::new(0., 1.).unwrap(),
rng,
));
X.slice_mut(s![.., i])
.assign(&Array1::from_shape_fn(n, |_| rng.random::<f64>()));
} else {
X.slice_mut(s![.., i]).assign(
&Array::random_using((n,), Bernoulli::new(0.3).unwrap(), rng).mapv(|x| {
if x {
X.slice_mut(s![.., i])
.assign(&Array1::from_shape_fn(n, |_| {
if rng.random_bool(0.3) {
1.0
} else {
0.0
}
}),
);
}));
}
}
let X = Array::random_using((n, d), Uniform::new(0., 1.).unwrap(), rng);
let y = Array::random_using(n, Uniform::new(0., 1.).unwrap(), rng);
let X = Array2::from_shape_fn((n, d), |_| rng.random::<f64>());
let y = Array1::from_shape_fn(n, |_| rng.random::<f64>());
let y = y + X.column(0) + X.column(1).map(|x| x - x * x);

(X, y)
Expand Down
167 changes: 167 additions & 0 deletions benches/bench_ops.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
use biosphere::{
DecisionTree, DecisionTreeParameters, MaxFeatures, RandomForest, RandomForestParameters,
};

#[cfg(test)]
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use ndarray::{s, Array1, Array2};
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
use std::hint::black_box;

/// Builds a synthetic regression dataset of `n` rows and `d` feature columns.
///
/// Even-indexed columns are filled with `rng.random::<f64>()` draws; odd-indexed
/// columns are indicator features that are `1.0` with probability 0.3 and `0.0`
/// otherwise. The target is random noise plus column 0 plus `x - x * x` applied
/// to column 1.
///
/// # Panics
///
/// Panics if `d < 2`, because columns 0 and 1 are both read to build the target.
// `X` follows the conventional upper-case name for a feature matrix.
#[allow(non_snake_case)]
fn data(n: usize, d: usize, rng: &mut impl Rng) -> (Array2<f64>, Array1<f64>) {
    let mut X = Array2::<f64>::zeros((n, d));

    // Columns are generated one after another so the RNG is consumed column by column.
    for j in 0..d {
        let is_continuous = j % 2 == 0;
        let column = if is_continuous {
            Array1::from_shape_fn(n, |_| rng.random::<f64>())
        } else {
            Array1::from_shape_fn(n, |_| if rng.random_bool(0.3) { 1.0 } else { 0.0 })
        };
        X.slice_mut(s![.., j]).assign(&column);
    }

    // Noise is drawn only after every feature column has been filled.
    let noise = Array1::from_shape_fn(n, |_| rng.random::<f64>());
    let linear_part = noise + X.column(0);
    let y = linear_part + X.column(1).map(|&v| v - v * v);

    (X, y)
}

/// Criterion benchmark entry point covering four operations on synthetic data
/// produced by `data`: `DecisionTree` fit, `DecisionTree` predict,
/// `RandomForest` fit and `RandomForest` predict. Each operation gets its own
/// benchmark group.
#[allow(non_snake_case)]
pub fn benchmark_ops(c: &mut Criterion) {
// A single seeded RNG is threaded through every `data` call below, so each
// dataset depends on all draws made before it. Reordering the groups or the
// parameter tuples changes the generated data.
let mut rng = StdRng::seed_from_u64(0);

// --------------------
// Decision tree: fit
// --------------------
let mut tree_fit = c.benchmark_group("tree_fit");
// Tuples are (n rows, d features, max_depth, max_features).
for &(n, d, max_depth, max_features) in &[
(10_000usize, 10usize, 8usize, 10usize),
(100_000usize, 10usize, 8usize, 10usize),
] {
let (X, y) = data(n, d, &mut rng);
let X_view = X.view();
let y_view = y.view();
// Declare n elements of work per iteration so Criterion can report throughput.
tree_fit.throughput(Throughput::Elements(n as u64));

let params = DecisionTreeParameters::default()
.with_max_depth(Some(max_depth))
.with_max_features(MaxFeatures::Value(max_features));

tree_fit.bench_with_input(
BenchmarkId::from_parameter(format!("n={}, d={}", n, d)),
&(X_view, y_view),
|b, (X, y)| {
b.iter(|| {
// A new tree is constructed inside the timed closure, so every
// iteration measures `params.clone()`, construction and a full fit.
let mut tree = DecisionTree::new(params.clone());
tree.fit(X, y);
// Hand the fitted tree to black_box so the fit is not optimized away.
black_box(tree)
})
},
);
}
tree_fit.finish();

// -----------------------
// Decision tree: predict
// -----------------------
let mut tree_predict = c.benchmark_group("tree_predict");
// Tuples are (n rows, d features, max_depth, max_features).
for &(n, d, max_depth, max_features) in &[
(10_000usize, 10usize, 8usize, 10usize),
(100_000usize, 10usize, 8usize, 10usize),
] {
let (X, y) = data(n, d, &mut rng);
let X_view = X.view();
let y_view = y.view();
tree_predict.throughput(Throughput::Elements(n as u64));

let params = DecisionTreeParameters::default()
.with_max_depth(Some(max_depth))
.with_max_features(MaxFeatures::Value(max_features));

// The tree is fitted once, outside the timed closure; only `predict` is
// measured, and it runs on the same rows the tree was fitted on.
let mut tree = DecisionTree::new(params);
tree.fit(&X_view, &y_view);

tree_predict.bench_with_input(
BenchmarkId::from_parameter(format!("n={}, d={}", n, d)),
&X_view,
|b, X| b.iter(|| black_box(tree.predict(X))),
);
}
tree_predict.finish();

// ---------------------
// Random forest: fit
// ---------------------
let mut forest_fit = c.benchmark_group("forest_fit");
// Request 10 samples per benchmark for this group.
forest_fit.sample_size(10);
// Tuples are (n rows, d features, n_estimators, max_depth).
for &(n, d, n_estimators, max_depth) in &[(100_000usize, 10usize, 100usize, 8usize)] {
let (X, y) = data(n, d, &mut rng);
let X_view = X.view();
let y_view = y.view();
forest_fit.throughput(Throughput::Elements(n as u64));

// The same dataset is fitted with n_jobs = 1 and n_jobs = 4.
// NOTE(review): n_jobs is presumably the degree of parallelism — confirm
// against RandomForestParameters.
for &n_jobs in &[1i32, 4i32] {
let params = RandomForestParameters::default()
.with_n_estimators(n_estimators)
.with_max_depth(Some(max_depth))
.with_n_jobs(Some(n_jobs));

forest_fit.bench_with_input(
BenchmarkId::new(format!("n_jobs={}", n_jobs), format!("n={}, d={}", n, d)),
&(X_view, y_view),
|b, (X, y)| {
b.iter(|| {
// As for the tree: construction and fit are both inside the timed closure.
let mut forest = RandomForest::new(params.clone());
forest.fit(X, y);
black_box(forest)
})
},
);
}
}
forest_fit.finish();

// ------------------------
// Random forest: predict
// ------------------------
let mut forest_predict = c.benchmark_group("forest_predict");
// Tuples are (n rows, d features, n_estimators, max_depth).
for &(n, d, n_estimators, max_depth) in &[(100_000usize, 10usize, 100usize, 8usize)] {
let (X, y) = data(n, d, &mut rng);
let X_view = X.view();
let y_view = y.view();
forest_predict.throughput(Throughput::Elements(n as u64));

let params = RandomForestParameters::default()
.with_n_estimators(n_estimators)
.with_max_depth(Some(max_depth))
.with_n_jobs(Some(4));

// Fit once up front; the timed closure only calls `predict` on the
// same rows the forest was fitted on.
let mut forest = RandomForest::new(params);
forest.fit(&X_view, &y_view);

forest_predict.bench_with_input(
BenchmarkId::from_parameter(format!("n={}, d={}", n, d)),
&X_view,
|b, X| b.iter(|| black_box(forest.predict(X))),
);
}
forest_predict.finish();
}

// Register `benchmark_ops` as the Criterion group `bench_ops`, using the
// default configuration with the sample size set to 10 per benchmark.
criterion_group!(
name = bench_ops;
config = Criterion::default().sample_size(10);
targets = benchmark_ops
);
// Generate the `main` function that runs the `bench_ops` group.
criterion_main!(bench_ops);
27 changes: 13 additions & 14 deletions benches/bench_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@ use biosphere::{DecisionTree, DecisionTreeParameters, MaxFeatures};

#[cfg(test)]
use criterion::{criterion_group, criterion_main, Criterion};
use ndarray::{s, Array, Array1, Array2};
use ndarray_rand::rand_distr::{Bernoulli, Uniform};
use ndarray_rand::RandomExt;
use ndarray::{s, Array1, Array2};
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
Expand All @@ -16,20 +14,21 @@ pub fn data(n: usize, d: usize, rng: &mut impl Rng) -> (Array2<f64>, Array1<f64>

for i in 0..d {
if i % 2 == 0 {
X.slice_mut(s![.., i]).assign(&Array::random_using(
(n,),
Uniform::new(0., 1.).unwrap(),
rng,
));
X.slice_mut(s![.., i])
.assign(&Array1::from_shape_fn(n, |_| rng.random::<f64>()));
} else {
X.slice_mut(s![.., i]).assign(
&Array::random_using((n,), Bernoulli::new(0.3).unwrap(), rng)
.mapv(|x| x as i64 as f64),
);
X.slice_mut(s![.., i])
.assign(&Array1::from_shape_fn(n, |_| {
if rng.random_bool(0.3) {
1.0
} else {
0.0
}
}));
}
}
let X = Array::random_using((n, d), Uniform::new(0., 1.).unwrap(), rng);
let y = Array::random_using(n, Uniform::new(0., 1.).unwrap(), rng);
let X = Array2::from_shape_fn((n, d), |_| rng.random::<f64>());
let y = Array1::from_shape_fn(n, |_| rng.random::<f64>());
let y = y + X.column(0) + X.column(1).map(|x| x - x * x);

(X, y)
Expand Down
Loading
Loading