diff --git a/Cargo.toml b/Cargo.toml index 5502be8f..eeb2a073 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,3 +19,4 @@ datasets = [] num = { version = "0.1.41", default-features = false } rand = "0.4.1" rulinalg = { git = "https://github.com/AtheMathmo/rulinalg", rev = "1ed8b937" } +kdtree = "0.5" diff --git a/src/learning/dbscan.rs b/src/learning/dbscan.rs index e7bbdee7..41d0cbab 100644 --- a/src/learning/dbscan.rs +++ b/src/learning/dbscan.rs @@ -36,12 +36,14 @@ //! let clustering = model.clusters().unwrap(); //! ``` -use learning::{LearningResult, UnSupModel}; use learning::error::{Error, ErrorKind}; +use learning::{LearningResult, UnSupModel}; -use linalg::{Matrix, Vector, BaseMatrix}; -use rulinalg::utils; +use kdtree::distance::squared_euclidean; +use kdtree::KdTree; +use linalg::{BaseMatrix, Matrix, Vector}; use rulinalg::matrix::Row; +use rulinalg::utils; /// DBSCAN Model /// @@ -80,17 +82,27 @@ impl UnSupModel, Vector>> for DBSCAN { fn train(&mut self, inputs: &Matrix) -> LearningResult<()> { self.init_params(inputs.rows()); let mut cluster = 0; + let mut neighbours = Vec::with_capacity(inputs.rows()); + let mut sub_neighbours = Vec::with_capacity(inputs.rows()); + + let kdt = Self::kdtree(inputs); for (idx, point) in inputs.row_iter().enumerate() { let visited = self._visited[idx]; + idx; + point; + visited; if !visited { self._visited[idx] = true; - let neighbours = self.region_query(point, inputs); + neighbours.clear(); + self.region_query(point, inputs, &mut neighbours, &kdt); + neighbours.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); + neighbours.dedup(); if neighbours.len() >= self.min_points { - self.expand_cluster(inputs, idx, neighbours, cluster); + self.expand_cluster(inputs, idx, &mut neighbours, &mut sub_neighbours, cluster, &kdt); cluster += 1; } } @@ -105,16 +117,14 @@ impl UnSupModel, Vector>> for DBSCAN { fn predict(&self, inputs: &Matrix) -> LearningResult>> { if self.predictive { - if let (&Some(ref cluster_data), &Some(ref clusters)) = (&self._cluster_data, - &self.clusters) { + if let (&Some(ref cluster_data), &Some(ref clusters)) = (&self._cluster_data, &self.clusters) { let mut classes = Vec::with_capacity(inputs.rows()); for input_point in inputs.row_iter() { let mut distances = Vec::with_capacity(cluster_data.rows()); for cluster_point in cluster_data.row_iter() { - let point_distance = - utils::vec_bin_op(input_point.raw_slice(), cluster_point.raw_slice(), |x, y| x - y); + let point_distance = utils::vec_bin_op(input_point.raw_slice(), cluster_point.raw_slice(), |x, y| x - y); distances.push(utils::dot(&point_distance, &point_distance).sqrt()); } @@ -131,8 +141,7 @@ impl UnSupModel, Vector>> for DBSCAN { Err(Error::new_untrained()) } } else { - Err(Error::new(ErrorKind::InvalidState, - "Model must be set to predictive. Use `self.set_predictive(true)`.")) + Err(Error::new(ErrorKind::InvalidState, "Model must be set to predictive. Use `self.set_predictive(true)`.")) } } } @@ -167,49 +176,46 @@ impl DBSCAN { self.clusters.as_ref() } - fn expand_cluster(&mut self, - inputs: &Matrix, - point_idx: usize, - neighbour_pts: Vec, - cluster: usize) { - debug_assert!(point_idx < inputs.rows(), - "Point index too large for inputs"); - debug_assert!(neighbour_pts.iter().all(|x| *x < inputs.rows()), - "Neighbour indices too large for inputs"); - + fn expand_cluster<'a>(&mut self, inputs: &Matrix, mut point_idx: usize, neighbours: &mut Vec, sub_neighbours: &mut Vec, cluster: usize, kdt: &KdTree) { + debug_assert!(point_idx < inputs.rows(), "Point index too large for inputs"); self.clusters.as_mut().map(|x| x.mut_data()[point_idx] = Some(cluster)); - for data_point_idx in &neighbour_pts { - let visited = self._visited[*data_point_idx]; + while let Some(data_point_idx) = neighbours.pop() { + debug_assert!(data_point_idx < inputs.rows(), "Data point index too large for inputs"); + debug_assert!(neighbours.iter().all(|x| *x < inputs.rows()), "Neighbour indices too large for inputs"); + + self.clusters.as_mut().map(|x| x.mut_data()[point_idx] = Some(cluster)); + let visited = self._visited[data_point_idx]; if !visited { - self._visited[*data_point_idx] = true; - let data_point_row = unsafe { inputs.row_unchecked(*data_point_idx) }; - let sub_neighbours = self.region_query(data_point_row, inputs); + self._visited[data_point_idx] = true; + let data_point_row = unsafe { inputs.row_unchecked(data_point_idx) }; + + sub_neighbours.clear(); + self.region_query(data_point_row, inputs, sub_neighbours, kdt); if sub_neighbours.len() >= self.min_points { - self.expand_cluster(inputs, *data_point_idx, sub_neighbours, cluster); + point_idx = data_point_idx; + neighbours.extend_from_slice(&sub_neighbours); + neighbours.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); + neighbours.dedup(); } } } } - - fn region_query(&self, point: Row, inputs: &Matrix) -> Vec { - debug_assert!(point.cols() == inputs.cols(), - "point must be of same dimension as inputs"); - - let mut in_neighbourhood = Vec::new(); - for (idx, data_point) in inputs.row_iter().enumerate() { - //TODO: Use `MatrixMetric` when rulinalg#154 is fixed. - let point_distance = utils::vec_bin_op(data_point.raw_slice(), point.raw_slice(), |x, y| x - y); - let dist = utils::dot(&point_distance, &point_distance).sqrt(); - - if dist < self.eps { - in_neighbourhood.push(idx); - } + fn region_query<'a>(&self, point: Row, inputs: &Matrix, neighbours: &mut Vec, kdt: &KdTree) { + debug_assert!(point.cols() == inputs.cols(), "point must be of same dimension as inputs"); + for (pnt, idx) in kdt.within(point.raw_slice(), self.eps.powi(2), &squared_euclidean).expect("KdTree error checking point") { + neighbours.push(*idx); } + } - in_neighbourhood + fn kdtree<'a>(inputs: &'a Matrix) -> KdTree { + let mut kdt = KdTree::new(inputs.cols()); + for (idx, row) in inputs.row_iter().enumerate() { + kdt.add(row.raw_slice(), idx); + } + kdt } fn init_params(&mut self, total_points: usize) { @@ -229,17 +235,19 @@ impl DBSCAN { #[cfg(test)] mod tests { use super::DBSCAN; - use linalg::{Matrix, BaseMatrix}; + use linalg::{BaseMatrix, Matrix}; #[test] fn test_region_query() { let model = DBSCAN::new(1.0, 3); let inputs = Matrix::new(3, 2, vec![1.0, 1.0, 1.1, 1.9, 3.0, 3.0]); + let kdt = DBSCAN::kdtree(&inputs); let m = matrix![1.0, 1.0]; let row = m.row(0); - let neighbours = model.region_query(row, &inputs); + let mut neighbours = vec![]; + model.region_query(row, &inputs, &mut neighbours, &kdt); assert!(neighbours.len() == 2); } @@ -249,10 +257,12 @@ mod tests { let model = DBSCAN::new(0.01, 3); let inputs = Matrix::new(3, 2, vec![1.0, 1.0, 1.1, 1.9, 1.1, 1.1]); + let kdt = DBSCAN::kdtree(&inputs); let m = matrix![1.0, 1.0]; let row = m.row(0); - let neighbours = model.region_query(row, &inputs); + let mut neighbours = vec![]; + model.region_query(row, &inputs, &mut neighbours, &kdt); assert!(neighbours.len() == 1); } diff --git a/src/lib.rs b/src/lib.rs index e117551b..58b1dd24 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -109,6 +109,7 @@ #[macro_use] extern crate rulinalg; +extern crate kdtree; extern crate num as libnum; extern crate rand; @@ -118,9 +119,9 @@ pub mod prelude; /// /// This module contains reexports of common tools from the rulinalg crate. pub mod linalg { - pub use rulinalg::matrix::{Axes, Matrix, MatrixSlice, MatrixSliceMut, BaseMatrix, BaseMatrixMut}; - pub use rulinalg::vector::Vector; + pub use rulinalg::matrix::{Axes, BaseMatrix, BaseMatrixMut, Matrix, MatrixSlice, MatrixSliceMut}; pub use rulinalg::norm; + pub use rulinalg::vector::Vector; } /// Module for data handling @@ -133,15 +134,15 @@ pub mod learning { pub mod dbscan; pub mod glm; pub mod gmm; + pub mod gp; + pub mod k_means; + pub mod knn; pub mod lin_reg; pub mod logistic_reg; - pub mod k_means; - pub mod nnet; - pub mod gp; - pub mod svm; pub mod naive_bayes; - pub mod knn; + pub mod nnet; pub mod pca; + pub mod svm; pub mod error; @@ -177,11 +178,7 @@ pub mod learning { type Targets; /// Compute the gradient for the model. - fn compute_grad(&self, - params: &[f64], - inputs: &Self::Inputs, - targets: &Self::Targets) - -> (f64, Vec); + fn compute_grad(&self, params: &[f64], inputs: &Self::Inputs, targets: &Self::Targets) -> (f64, Vec); } /// Trait for optimization algorithms. @@ -189,16 +186,11 @@ pub mod learning { /// Return the optimized parameter using gradient optimization. /// /// Takes in a set of starting parameters and related model data. - fn optimize(&self, - model: &M, - start: &[f64], - inputs: &M::Inputs, - targets: &M::Targets) - -> Vec; + fn optimize(&self, model: &M, start: &[f64], inputs: &M::Inputs, targets: &M::Targets) -> Vec; } - pub mod grad_desc; pub mod fmincg; + pub mod grad_desc; } /// Module for learning tools. diff --git a/tests/learning/dbscan.rs b/tests/learning/dbscan.rs index 5761e193..c59665aa 100644 --- a/tests/learning/dbscan.rs +++ b/tests/learning/dbscan.rs @@ -3,39 +3,60 @@ use rm::linalg::Matrix; use rm::learning::dbscan::DBSCAN; use rm::learning::UnSupModel; +extern crate rand; +use self::rand::{thread_rng, Rng}; + +#[test] +#[ignore] +fn test_does_not_overflow() { + let mut rng = thread_rng(); + let n = 100_000; + let d = 2; + let inputs = Matrix::new(n, d, rng.gen_iter::().take(n * d).collect::>()); + + let mut model = DBSCAN::new(0.5, 2); + model.train(&inputs).unwrap(); + + let clustering = dbg!(model.clusters().unwrap()); +} + #[test] fn test_basic_clusters() { - let inputs = Matrix::new(6, 2, vec![1.0, 2.0, - 1.1, 2.2, - 0.9, 1.9, - 1.0, 2.1, - -2.0, 3.0, - -2.2, 3.1]); + let inputs = Matrix::new(8, 2, vec![1.0, 2.0, 1.1, 2.2, 0.9, 1.9, 1.0, 2.1, -2.0, 3.0, -2.2, 3.1, -1.0, -2.0, -2.0, -1.0]); let mut model = DBSCAN::new(0.5, 2); model.train(&inputs).unwrap(); - let clustering = model.clusters().unwrap(); + let clustering = dbg!(model.clusters().unwrap()); assert!(clustering.data().iter().take(4).all(|x| *x == Some(0))); - assert!(clustering.data().iter().skip(4).all(|x| *x == Some(1))); + assert!(clustering.data().iter().skip(4).take(2).all(|x| *x == Some(1))); + assert!(clustering.data().iter().skip(6).all(|x| *x == None)); } +#[test] +fn test_border_points() { + let inputs = Matrix::new(5, 1, vec![1.55, 2.0, 2.1, 2.2, 2.65]); + + let mut model = DBSCAN::new(0.5, 3); + model.train(&inputs).unwrap(); + + let clustering = dbg!(model.clusters().unwrap()); + + assert!(clustering.data().iter().take(1).all(|x| *x == None)); + assert!(clustering.data().iter().skip(1).take(3).all(|x| *x == Some(0))); + assert!(clustering.data().iter().skip(4).all(|x| *x == None)); +} #[test] fn test_basic_prediction() { - let inputs = Matrix::new(6, 2, vec![1.0, 2.0, - 1.1, 2.2, - 0.9, 1.9, - 1.0, 2.1, - -2.0, 3.0, - -2.2, 3.1]); + let inputs = Matrix::new(6, 2, vec![1.0, 2.0, 1.1, 2.2, 0.9, 1.9, 1.0, 2.1, -2.0, 3.0, -2.2, 3.1]); let mut model = DBSCAN::new(0.5, 2); model.set_predictive(true); model.train(&inputs).unwrap(); - let new_points = Matrix::new(2,2, vec![1.0, 2.0, 4.0, 4.0]); + let new_points = Matrix::new(2, 2, vec![1.0, 2.0, 4.0, 4.0]); let classes = model.predict(&new_points).unwrap(); assert!(classes[0] == Some(0));