diff --git a/.travis.yml b/.travis.yml index 82e4582..8a6b6ff 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,13 @@ language: python python: -- "3.6" - "3.7" -- "3.8" before_install: - pip install pytest==5.4.3 - pip install pytest-cov==2.10.0 - pip install codecov==2.1.8 install: -- python setup.py install +- pip install . script: -- pytest --cov +- python setup.py test after_success: - codecov diff --git a/README.md b/README.md index 65f47c0..559bc84 100644 --- a/README.md +++ b/README.md @@ -80,17 +80,17 @@ Uses the `setup.py` generated by [PyScaffold](https://pypi.org/project/PyScaffol ## Test ----------------- -Uses the `setup.py` generated by [PyScaffold](https://pypi.org/project/PyScaffold/): +Uses [pytest](https://docs.pytest.org/) and [pytest-cov](https://pypi.org/project/pytest-cov/): - python setup.py test + pytest ## Dependencies -------------- Dependencies are minimal: - - Python (>= 3.6) - - [Scikit-Learn](https://scikit-learn.org/stable/) (>=0.23) - - [Pandas](https://pandas.pydata.org/) (>=1.0) +- [Python (>= 3.6)](https://www.python.org/downloads/release/python-360/) +- [Scikit-Learn (>=0.23)](https://scikit-learn.org/stable/) +- [Pandas (>=1.0)](https://pandas.pydata.org/) ## References diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..acc89a7 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +testpaths = + src + tests +addopts = --cov --cov-append diff --git a/setup.cfg b/setup.cfg index 8fb59b7..563816a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,6 +33,7 @@ setup_requires = pyscaffold>=3.2a0,<3.3a0 install_requires = pandas>=1.0 scikit-learn>=0.23 + black==21.5b2 # The usage of test_requires is discouraged, see `Dependency Management` docs tests_require = diff --git a/src/randomforests/Forest.py b/src/randomforests/Forest.py index 9e9dffa..c48c795 100644 --- a/src/randomforests/Forest.py +++ b/src/randomforests/Forest.py @@ -1,5 +1,6 @@ import numpy as np + class RandomForest: """ A Random Forest base class. @@ -29,12 +30,12 @@ class RandomForest: def __init__(self, n_trees=10, max_depth=2, min_size=1): self.max_depth = max_depth - self.min_size = min_size - self.n_trees = n_trees - self.cost = None - self.trees = None + self.min_size = min_size + self.n_trees = n_trees + self.cost = None + self.trees = None - def _subsample(self, dataset : np.ndarray) -> np.ndarray: + def _subsample(self, dataset: np.ndarray) -> np.ndarray: """ This function returns a bootstrapped version of the dataset which has the same number of rows.
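Aside (illustrative, not part of the patch): a minimal sketch, assuming only NumPy, of the bootstrap draw that `_subsample` performs in the hunk below. Row indices are sampled with replacement, so the resampled dataset keeps the original shape but may repeat rows.

```python
import numpy as np

# Bootstrap a toy dataset the same way _subsample does: draw as many
# row indices as there are rows, with replacement.
dataset = np.array([[0.1, 0], [0.5, 0], [0.7, 1], [0.9, 1]])
indices = np.random.choice(dataset.shape[0], size=dataset.shape[0], replace=True)
sample = dataset[indices, :]
assert sample.shape == dataset.shape  # same shape; individual rows may repeat
```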
@@ -50,20 +51,20 @@ def _subsample(self, dataset : np.ndarray) -> np.ndarray: number_of_rows = dataset.shape[0] sample_of_rows = number_of_rows - random_indices = np.random.choice(number_of_rows, - size=sample_of_rows, - replace=True) - return dataset[random_indices,:] + random_indices = np.random.choice( + number_of_rows, size=sample_of_rows, replace=True + ) + return dataset[random_indices, :] def set_params(self, **parameters): for parameter, value in parameters.items(): setattr(self, parameter, value) return self - def get_params(self, deep=True): - return {"max_depth" : self.max_depth, - "min_size" : self.min_size, - "cost" : self.cost, - "n_trees" : self.n_trees} - + def get_params(self, deep=True): + return { + "max_depth": self.max_depth, + "min_size": self.min_size, + "cost": self.cost, + "n_trees": self.n_trees, + } diff --git a/src/randomforests/ForestClassifier.py b/src/randomforests/ForestClassifier.py index aa3b67f..acae912 100644 --- a/src/randomforests/ForestClassifier.py +++ b/src/randomforests/ForestClassifier.py @@ -10,7 +10,8 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import accuracy_score -class RandomForestClassifier (BaseEstimator, ClassifierMixin, RandomForest): + +class RandomForestClassifier(BaseEstimator, ClassifierMixin, RandomForest): """ A random forest classification model that extends the abstract base class of random forest. @@ -33,7 +34,13 @@ class RandomForestClassifier (BaseEstimator, ClassifierMixin, RandomForest): The cost function """ - def __init__(self, n_trees : int = 10, max_depth : int =2, min_size : int =1, cost : str ='gini'): + def __init__( + self, + n_trees: int = 10, + max_depth: int = 2, + min_size: int = 1, + cost: str = "gini", + ): """ Constructor for random forest classifier. This mainly just initialize the attributes of the class by calling the base class constructor. @@ -41,18 +48,14 @@ def __init__(self, n_trees : int = 10, max_depth : int =2, min_size : int =1, co to make sure it either using 'gini', otherwise an error is thrown. """ - super().__init__(n_trees = n_trees, - max_depth = max_depth, - min_size = min_size) + super().__init__(n_trees=n_trees, max_depth=max_depth, min_size=min_size) - if cost == 'gini': - self.cost = "gini" + if cost == "gini": + self.cost = "gini" else: - raise NameError('Not valid cost function') - - + raise NameError("Not valid cost function") - def fit(self, X, y = None): + def fit(self, X, y=None): """ Fit the random forest to the training set train. @@ -64,15 +67,15 @@ def fit(self, X, y = None): """ n_features = round(sqrt(X.shape[1])) - dataset = _make_dataset(X,y) - self.trees = [self._bootstrap_tree(dataset = dataset, - n_features = n_features) - for i in range(self.n_trees)] + dataset = _make_dataset(X, y) + self.trees = [ + self._bootstrap_tree(dataset=dataset, n_features=n_features) + for i in range(self.n_trees) + ] return self - - def predict(self, x : pd.DataFrame) -> int: + def predict(self, x: pd.DataFrame) -> np.ndarray: """ Predict the class that this sample datapoint belongs to.
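Aside (illustrative, not part of the patch): the classifier's `predict` stacks one row of predictions per tree and takes the column-wise mode via `sp.stats.mode` in the hunk that follows. For binary labels a plain majority count, shown here with hypothetical votes, gives the same answer.

```python
import numpy as np

# Three trees voting on three datapoints (hypothetical values).
votes = np.vstack([[0, 1, 1], [0, 1, 0], [0, 1, 1]])
# Column-wise majority: a class wins a datapoint when more than half
# of the trees vote for it.
majority = (votes.sum(axis=0) > votes.shape[0] / 2).astype(int)
print(majority)  # [0 1 1]
```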
@@ -94,7 +97,6 @@ def predict(self, x : pd.DataFrame) -> int: return sp.stats.mode(preds)[0][0] - def score(self, X=None, y=None): """ Returns the accuracy of the model @@ -107,12 +109,14 @@ def score(self, X=None, y=None): """ - return accuracy_score(y,self.predict(X)) + return accuracy_score(y, self.predict(X)) - def _bootstrap_tree(self, dataset : np.ndarray, n_features : int) -> DecisionTreeClassifier: + def _bootstrap_tree( + self, dataset: np.ndarray, n_features: int + ) -> DecisionTreeClassifier: sample = self._subsample(dataset) - tree = DecisionTreeClassifier(max_depth = self.max_depth, - min_size = self.min_size, - n_features = n_features) - return tree.fit(sample[:,:-1],sample[:,-1]) \ No newline at end of file + tree = DecisionTreeClassifier( + max_depth=self.max_depth, min_size=self.min_size, n_features=n_features + ) + return tree.fit(sample[:, :-1], sample[:, -1]) diff --git a/src/randomforests/ForestRegressor.py b/src/randomforests/ForestRegressor.py index ab1094d..ec3a539 100644 --- a/src/randomforests/ForestRegressor.py +++ b/src/randomforests/ForestRegressor.py @@ -9,7 +9,8 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import mean_squared_error -class RandomForestRegressor (BaseEstimator, ClassifierMixin, RandomForest): + +class RandomForestRegressor(BaseEstimator, ClassifierMixin, RandomForest): """ A random forest regression model that extends the abstract base class of random forest. @@ -32,7 +33,13 @@ class RandomForestRegressor (BaseEstimator, ClassifierMixin, RandomForest): The cost function """ - def __init__(self, n_trees : int = 10, max_depth : int =2, min_size : int =1, cost : str = "mse"): + def __init__( + self, + n_trees: int = 10, + max_depth: int = 2, + min_size: int = 1, + cost: str = "mse", + ): """ Constructor for random forest regressor. This mainly just initialize the attributes of the class by calling the base class constructor. @@ -40,17 +47,14 @@ def __init__(self, n_trees : int = 10, max_depth : int =2, min_size : int =1, co to make sure it either using 'mse', otherwise an error is thrown. """ - super().__init__(n_trees = n_trees, - max_depth = max_depth, - min_size = min_size) + super().__init__(n_trees=n_trees, max_depth=max_depth, min_size=min_size) - if cost == 'mse': - self.cost = "mse" + if cost == "mse": + self.cost = "mse" else: - raise NameError('Not valid cost function') - + raise NameError("Not valid cost function") - def fit(self, X, y = None): + def fit(self, X, y=None): """ Fit the random forest to the training set train. 
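Aside (illustrative, not part of the patch): both forest classes size the per-tree feature subset with `round(sqrt(X.shape[1]))`, the usual random-forest heuristic, as the hunk below shows. A sketch with a hypothetical 9-feature matrix:

```python
from math import sqrt

import numpy as np

X = np.zeros((100, 9))  # hypothetical training matrix with 9 features
n_features = round(sqrt(X.shape[1]))
print(n_features)  # 3 -> each bootstrapped tree searches splits over 3 features
```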
@@ -70,14 +74,15 @@ def fit(self, X, y = None): """ n_features = round(sqrt(X.shape[1])) - dataset = _make_dataset(X,y) - self.trees = [self._bootstrap_tree(dataset = dataset, - n_features = n_features) - for i in range(self.n_trees)] + dataset = _make_dataset(X, y) + self.trees = [ + self._bootstrap_tree(dataset=dataset, n_features=n_features) + for i in range(self.n_trees) + ] return self - def predict(self, x : pd.DataFrame) -> int: + def predict(self, x: pd.DataFrame) -> np.ndarray: """ Predict the value for this sample datapoint @@ -94,10 +99,10 @@ rows = x.to_numpy() else: rows = x - + preds = np.vstack([tree.predict(rows) for tree in self.trees]) - return np.mean(preds,axis=0) + return np.mean(preds, axis=0) def score(self, X=None, y=None): """ @@ -114,12 +119,14 @@ def score(self, X=None, y=None): float """ - return mean_squared_error(y,self.predict(X)) + return mean_squared_error(y, self.predict(X)) - def _bootstrap_tree(self, dataset : np.ndarray, n_features : int) -> DecisionTreeRegressor: + def _bootstrap_tree( + self, dataset: np.ndarray, n_features: int + ) -> DecisionTreeRegressor: sample = self._subsample(dataset) - tree = DecisionTreeRegressor(max_depth = self.max_depth, - min_size = self.min_size, - n_features = n_features) - return tree.fit(sample[:,:-1],sample[:,-1]) \ No newline at end of file + tree = DecisionTreeRegressor( + max_depth=self.max_depth, min_size=self.min_size, n_features=n_features + ) + return tree.fit(sample[:, :-1], sample[:, -1]) diff --git a/src/randomforests/Tree.py b/src/randomforests/Tree.py index 61b25bf..eae66b1 100644 --- a/src/randomforests/Tree.py +++ b/src/randomforests/Tree.py @@ -5,280 +5,273 @@ from randomforests.utils import _make_dataset -class DecisionTree: - """ - A decision tree abstract base class. - - Classification and Regression Trees will be derived class that override - certain functions of this class, mainly the cost function and make leaf - function. They will also need a .fit, .predict and .score function to be - compatible with scikit-learn. - - - Attributes - ----------- - max_depth int: default=2 - The maximum depth of tree. - - min_size int: default=1 - The minimum number of datapoints in terminal nodes. - - n_features int: min_size=None - The number of features to be used in splitting. - - cost str: - The name of the cost function. - - root dict: - The root of the decision tree. - - """ - def __init__(self, max_depth : int= 2, min_size : int = 1, n_features : int = None): - - self.max_depth = max_depth - self.min_size = min_size - self.n_features = None - self.cost = None - - if n_features is not None: - self.n_features = n_features - - self.root = None - - def _set_features(self, X : np.ndarray) -> None: +class DecisionTree: """ - Sets the number of features we want use to search for the best split. - This isn't useful for decision trees specifically, but useful for - Random Forests where we don't use all features possible, but use - a random subset. + A decision tree abstract base class. + Classification and Regression Trees will be derived classes that override + certain functions of this class, mainly the cost function and make leaf + function. They will also need a .fit, .predict and .score function to be + compatible with scikit-learn.
- Parameters - ---------- - X np.ndarray The feature dataset - """ - if self.n_features is None: - self.n_features = X.shape[1] - else: - if (self.n_features > X.shape[1]): - raise AttributeError("n_features > X.shape[1]") + Attributes + ----------- + max_depth int: default=2 + The maximum depth of the tree. + min_size int: default=1 + The minimum number of datapoints in terminal nodes. - def _fit(self, X = None, y = None): - """ - Builds the decsision tree by recursively splitting tree until the - the maxmimum depth, max_depth, of the tree is acheived or the nodes - have the minmum number of training points per node, min_size, is - achieved. - - Note: n_features will be passed by the RandomForest as it is - usually ta subset of the total number of features. - However, if one is using the class as a stand alone decision - tree, then the n_features will automatically be - - Parameters - ---------- - X DataFrame of the features dataset. - - Y Series of the targetvariable - """ - - self._set_features(X) - - dataset = _make_dataset(X = X, y = y) - - # perform optimal split for the root - self.root = self._get_split(dataset) + n_features int: default=None + The number of features to be used in splitting. - # now recurisively split the roots dataset until the stopping - # criteria is met. - root = self._split(self.root, 1) + cost str: + The name of the cost function. + root dict: + The root of the decision tree. - def _test_split(self, dataset : np.ndarray, column : int, value : float) -> tuple: """ - This function splits the data set depending on the feature (index) and - the splitting value (value) - Parameters - ----------- - index : The column index of the feature. - value : The value to split the data. - dataset : The list of list representation of the dataframe + def __init__(self, max_depth: int = 2, min_size: int = 1, n_features: int = None): - Returns - --------- - Tupple of the left and right split datasets. + self.max_depth = max_depth + self.min_size = min_size + self.n_features = None + self.cost = None - left = dataset[dataset[:,column] < value] - right = dataset[dataset[:,column] >= value] - return left, right + if n_features is not None: + self.n_features = n_features + self.root = None - def _get_split(self, dataset : np.ndarray) -> dict: """ - Select the best splitting point and feature for a dataset - using a random subset of self.n_features number of features. - - Parameters - ----------- - dataset np.ndarray: - Training data. - - Returns - ------- - dict Dictionary of the best splitting feature of randomly chosen and - the best splitting value. - """ - - b_index, b_value, b_score, b_groups = 999, 999, 999, None - - # the features to test among the split - features = set() - - # randomily select features to consider - # TODO: push this to another function or into set_features?
- while len(features) < self.n_features: - index = randrange(self.n_features) - features.add(index) - - # loop through the number of features and values of the data - # to figure out which gives the best split according - # to the derived classes cost function value of the tested - # split - for column in features: - for row in dataset[:,column]: - groups = self._test_split(dataset, column, row) - gini = self._cost(groups) - if gini < b_score: - b_column = column - b_value = row - b_score = gini - b_groups = groups - - - return {'column':b_column, 'value':b_value, 'groups':b_groups} - - def _split(self, node : dict, depth : int) -> None: - """ - Recursive splitting function that creates child - splits for a node or make this node a leaf. - Note: Leaves are just a value, which is determined - in the derived class. - - Parameters - ----------- - node dictionary: - The current node in the tree. + def _set_features(self, X: np.ndarray) -> None: + """ + Sets the number of features we want to use to search for the best split. + This isn't useful for decision trees specifically, but useful for + Random Forests where we don't use all features possible, but use + a random subset. - depth int : - The depth of node curr. - Returns - --------] """ - left, right = node['groups'] - del(node['groups']) - - # check for a no split in left - if left.size == 0: - node['left'] = node['right'] = self._make_leaf(right[:,-1]) - return - # check for a no split in right - elif right.size == 0: - node['left'] = node['right'] = self._make_leaf(left[:,-1]) - return - #check for max depth - elif depth >= self.max_depth: - node['left'] = self._make_leaf(left[:,-1]) - node['right'] = self._make_leaf(right[:,-1]) - return - # else - else: - # process left child - if len(left) <= self.min_size: - node['left'] = self._make_leaf(left[:,-1]) + Parameters + ---------- + X np.ndarray The feature dataset + """ + if self.n_features is None: + self.n_features = X.shape[1] else: - node['left'] = self._get_split(left) - self._split(node['left'], depth+1) - - # process right child - if len(right) <= self.min_size: - node['right'] = self._make_leaf(right[:,-1]) - + if self.n_features > X.shape[1]: + raise AttributeError("n_features > X.shape[1]") + + def _fit(self, X=None, y=None): + """ + Builds the decision tree by recursively splitting the tree until + the maximum depth, max_depth, of the tree is achieved or the nodes + have the minimum number of training points per node, min_size, is + achieved. + + Note: n_features will be passed by the RandomForest as it is + usually a subset of the total number of features. + However, if one is using the class as a stand alone decision + tree, then the n_features will automatically be set to the number of features in X. + + Parameters + ---------- + X DataFrame of the features dataset. + + y Series of the target variable + """ + + self._set_features(X) + + dataset = _make_dataset(X=X, y=y) + + # perform optimal split for the root + self.root = self._get_split(dataset) + + # now recursively split the root's dataset until the stopping + # criteria is met. + root = self._split(self.root, 1) + + def _test_split(self, dataset: np.ndarray, column: int, value: float) -> tuple: + """ + This function splits the data set depending on the feature (index) and + the splitting value (value) + + Parameters + ----------- + index : The column index of the feature. + value : The value to split the data. + dataset : The list of list representation of the dataframe + + Returns + --------- + Tuple of the left and right split datasets.
+ + """ + left = dataset[dataset[:, column] < value] + right = dataset[dataset[:, column] >= value] + return left, right + + def _get_split(self, dataset: np.ndarray) -> dict: + """ + Select the best splitting point and feature for a dataset + using a random subset of self.n_features number of features. + + Parameters + ----------- + dataset np.ndarray: + Training data. + + Returns + ------- + dict Dictionary of the best splitting feature of randomly chosen and + the best splitting value. + """ + + b_index, b_value, b_score, b_groups = 999, 999, 999, None + + # the features to test among the split + features = set() + + # randomily select features to consider + # TODO: push this to another function or into set_features? + while len(features) < self.n_features: + index = randrange(self.n_features) + features.add(index) + + # loop through the number of features and values of the data + # to figure out which gives the best split according + # to the derived classes cost function value of the tested + # split + for column in features: + for row in dataset[:, column]: + groups = self._test_split(dataset, column, row) + gini = self._cost(groups) + if gini < b_score: + b_column = column + b_value = row + b_score = gini + b_groups = groups + + return {"column": b_column, "value": b_value, "groups": b_groups} + + def _split(self, node: dict, depth: int) -> None: + """ + Recursive splitting function that creates child + splits for a node or make this node a leaf. + Note: Leaves are just a value, which is determined + in the derived class. + + Parameters + ----------- + node dictionary: + The current node in the tree. + + depth int : + The depth of node curr. + + Returns + --------] + """ + left, right = node["groups"] + del node["groups"] + + # check for a no split in left + if left.size == 0: + node["left"] = node["right"] = self._make_leaf(right[:, -1]) + return + # check for a no split in right + elif right.size == 0: + node["left"] = node["right"] = self._make_leaf(left[:, -1]) + return + # check for max depth + elif depth >= self.max_depth: + node["left"] = self._make_leaf(left[:, -1]) + node["right"] = self._make_leaf(right[:, -1]) + return + # else else: - node['right'] = self._get_split(right) - self._split(node['right'], depth+1) - - - def _predict(self, row : np.ndarray, node : dict): - """ - Predicts the target value for one single row to by recursively - traversing tree and returns the termina leaf value corresponding - to this data point. - - Parameters - ----------- - row np.ndarray : - The data point to classify. - - node dict : - he current node in the tree. - - Returns - -------- - The leaf value of this data point. - """ - if row[node['column']] < node['value']: - if isinstance(node['left'], dict): - return self._predict(row, node['left']) - else: - return node['left'] - else: - if isinstance(node['right'], dict): - return self._predict(row, node['right']) - else: - return node['right'] - - - def predict(self, x : np.ndarray) -> int: - """ - Predict the class that this sample datapoint belongs to. - - Parameters - ---------- - x np.ndarray: - The datapoints to classify. - - Returns - -------- - The predicted class the data points belong to. 
- """ - if isinstance(x, np.ndarray) is False: - rows = x.to_numpy() - else: - rows = x - - predictor = partial(self._predict, **{"node":self.root}) - preds = np.apply_along_axis(predictor, axis=1, arr=rows) + # process left child + if len(left) <= self.min_size: + node["left"] = self._make_leaf(left[:, -1]) + + else: + node["left"] = self._get_split(left) + self._split(node["left"], depth + 1) + + # process right child + if len(right) <= self.min_size: + node["right"] = self._make_leaf(right[:, -1]) + + else: + node["right"] = self._get_split(right) + self._split(node["right"], depth + 1) + + def _predict(self, row: np.ndarray, node: dict): + """ + Predicts the target value for one single row to by recursively + traversing tree and returns the termina leaf value corresponding + to this data point. + + Parameters + ----------- + row np.ndarray : + The data point to classify. + + node dict : + he current node in the tree. + + Returns + -------- + The leaf value of this data point. + """ + if row[node["column"]] < node["value"]: + if isinstance(node["left"], dict): + return self._predict(row, node["left"]) + else: + return node["left"] + else: + if isinstance(node["right"], dict): + return self._predict(row, node["right"]) + else: + return node["right"] + + def predict(self, x: np.ndarray) -> int: + """ + Predict the class that this sample datapoint belongs to. + + Parameters + ---------- + x np.ndarray: + The datapoints to classify. + + Returns + -------- + The predicted class the data points belong to. + """ + if isinstance(x, np.ndarray) is False: + rows = x.to_numpy() + else: + rows = x - return preds + predictor = partial(self._predict, **{"node": self.root}) + preds = np.apply_along_axis(predictor, axis=1, arr=rows) - def set_params(self, **parameters): - for parameter, value in parameters.items(): - setattr(self, parameter, value) - return self + return preds + def set_params(self, **parameters): + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self - def get_params(self, deep=True): - return {"max_depth" : self.max_depth, - "min_size" : self.min_size, + def get_params(self, deep=True): + return { + "max_depth": self.max_depth, + "min_size": self.min_size, "n_features": self.n_features, - "cost" : self.cost} - - + "cost": self.cost, + } diff --git a/src/randomforests/TreeClassifier.py b/src/randomforests/TreeClassifier.py index d575ffd..d0b7a59 100644 --- a/src/randomforests/TreeClassifier.py +++ b/src/randomforests/TreeClassifier.py @@ -6,7 +6,8 @@ from randomforests.Tree import DecisionTree -class DecisionTreeClassifier (BaseEstimator, ClassifierMixin, DecisionTree ): + +class DecisionTreeClassifier(BaseEstimator, ClassifierMixin, DecisionTree): """ A decision tree classifier that extends the DecisionTree class. 
@@ -28,18 +29,21 @@ class DecisionTreeClassifier (BaseEstimator, ClassifierMixin, DecisionTree ): The cost function """ - def __init__(self, max_depth : int = 2, min_size : int = 1, n_features : int= None, cost : str ='gini'): + def __init__( + self, + max_depth: int = 2, + min_size: int = 1, + n_features: int = None, + cost: str = "gini", + ): - super().__init__(max_depth = max_depth, - min_size = min_size, - n_features = n_features) + super().__init__(max_depth=max_depth, min_size=min_size, n_features=n_features) - if cost == 'gini': - self.cost = "gini" + if cost == "gini": + self.cost = "gini" self._cost = self._cost_gini else: - raise NameError('Not valid cost function') - + raise NameError("Not valid cost function") def fit(self, X=None, y=None): """ @@ -81,10 +85,12 @@ def score(self, X=None, y=None): float """ - return accuracy_score(y,self.predict(X),) + return accuracy_score( + y, + self.predict(X), + ) - - def _cost_gini(self, groups : tuple) -> float: + def _cost_gini(self, groups: tuple) -> float: """ Get the cost of the spit of the dataframe. Groups will be the tuple containing the left and right @@ -105,17 +111,16 @@ def _cost_gini(self, groups : tuple) -> float: for group in groups: split_size = len(group) if split_size != 0: - cost += split_size * self._gini_index(group[:,-1]) - - return cost /size + cost += split_size * self._gini_index(group[:, -1]) + return cost / size - def _gini_index(self, y : np.ndarray) -> float: + def _gini_index(self, y: np.ndarray) -> float: """ Gini index for a single target vector. """ gini = 0.0 - y_t = y.reshape(len(y)) + y_t = y.reshape(len(y)) target_val_cts = dict(zip(*np.unique(y_t, return_counts=True))) size = len(y) @@ -126,7 +131,7 @@ def _gini_index(self, y : np.ndarray) -> float: return gini - def _make_leaf(self, y : np.ndarray) -> float : + def _make_leaf(self, y: np.ndarray) -> float: """ Makest the leaf of the tree by taking the value of the class that has the largest size. @@ -140,6 +145,6 @@ def _make_leaf(self, y : np.ndarray) -> float : The leaf value. """ - y_t = y.reshape(len(y)) + y_t = y.reshape(len(y)) - return sp.stats.mode(y_t)[0][0] \ No newline at end of file + return sp.stats.mode(y_t)[0][0] diff --git a/src/randomforests/TreeRegressor.py b/src/randomforests/TreeRegressor.py index b582d90..077b96b 100644 --- a/src/randomforests/TreeRegressor.py +++ b/src/randomforests/TreeRegressor.py @@ -5,7 +5,8 @@ from randomforests.Tree import DecisionTree -class DecisionTreeRegressor (BaseEstimator, ClassifierMixin, DecisionTree ): + +class DecisionTreeRegressor(BaseEstimator, ClassifierMixin, DecisionTree): """ A decision tree regressor that extends the DecisionTree class. 
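Aside (worked example, not part of the patch): the regressor's split cost in `_cost_mse` is the size-weighted variance of each side's targets divided by the total number of points; the first `mse_cost_tests` case below reduces to 0.125.

```python
import numpy as np

left = np.array([0.0, 0.0, 0.0, 0.0])
right = np.array([0.0, 0.0, 1.0, 1.0])
size = len(left) + len(right)
# Weight each side's variance by its size, then normalise by the total.
cost = (len(left) * np.var(left) + len(right) * np.var(right)) / size
print(cost)  # 0.125
```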
@@ -27,16 +28,13 @@ class DecisionTreeRegressor (BaseEstimator, ClassifierMixin, DecisionTree ): The root dictionary for the tree """ - def __init__(self, max_depth : int = 2, min_size : int = 1, n_features : int = None): + def __init__(self, max_depth: int = 2, min_size: int = 1, n_features: int = None): - super().__init__(max_depth = max_depth, - min_size = min_size, - n_features = n_features) + super().__init__(max_depth=max_depth, min_size=min_size, n_features=n_features) - self.cost = "mse" + self.cost = "mse" self._cost = self._cost_mse - def fit(self, X=None, y=None): """ Builds the regression decision tree by recursively splitting @@ -76,9 +74,9 @@ def score(self, X=None, y=None): float """ - return mean_squared_error(y,self.predict(X)) + return mean_squared_error(y, self.predict(X)) - def _cost_mse(self, groups : tuple) -> float: + def _cost_mse(self, groups: tuple) -> float: """ Get the cost of the spit of the dataframe. Groups will be the tuple containing the left and right splits. The cost is the mean square error, @@ -100,12 +98,12 @@ def _cost_mse(self, groups : tuple) -> float: for group in groups: split_size = len(group) if split_size > 0: - cost += split_size * np.var(group[:,-1]) + cost += split_size * np.var(group[:, -1]) else: continue return cost / size - def _make_leaf(self, y : np.ndarray) -> float : + def _make_leaf(self, y: np.ndarray) -> float: """ Makes the leaf of the tree by taking the mean of the target values @@ -121,5 +119,3 @@ def _make_leaf(self, y : np.ndarray) -> float : y_t = y.reshape(len(y)) return np.mean(y_t) - - diff --git a/src/randomforests/__init__.py b/src/randomforests/__init__.py index 3f3434d..2d60641 100644 --- a/src/randomforests/__init__.py +++ b/src/randomforests/__init__.py @@ -3,7 +3,9 @@ from randomforests.ForestClassifier import RandomForestClassifier from randomforests.ForestRegressor import RandomForestRegressor -__all__ = [DecisionTreeClassifier, - DecisionTreeRegressor, - RandomForestClassifier, - RandomForestRegressor] \ No newline at end of file +__all__ = [ + "DecisionTreeClassifier", + "DecisionTreeRegressor", + "RandomForestClassifier", + "RandomForestRegressor", +] diff --git a/src/randomforests/utils.py b/src/randomforests/utils.py index 60dd6ab..dc76dcf 100644 --- a/src/randomforests/utils.py +++ b/src/randomforests/utils.py @@ -1,41 +1,42 @@ import pandas as pd import numpy as np + def _make_dataset(X, y): - """ - This function converts a Pandas Dataframe X and - the Pandas Series y as a new numpy array - - - Parameters - --------- - X: The Pandas DataFrame of the dataset features - - y: The Pandas Series of the target values - - Returns - -------- - np.ndarray The datafarme X with y appended to the right - most column - """ - feats = X - target = y - - # convert the the dataframe/series to numpy array if - # not in numpy array format - if isinstance(X, np.ndarray) is False: - if isinstance(X, pd.core.frame.DataFrame) is True: - feats = X.to_numpy() - else: - raise TypeError("X needs to be NumPy array or Pandas Dataframe") - - if isinstance(y, np.ndarray) is False: - if isinstance(y, pd.core.series.Series) is True: - target = y.values - else: - raise TypeError("X needs to be NumPy array or Pandas Series") - - # append the column vector as the right most column - dataset = np.append(feats,target.reshape(len(target),1),axis=1) - - return dataset \ No newline at end of file + """ + This function converts a Pandas Dataframe X and + the Pandas Series y into a new numpy array + + + Parameters + --------- + X: The Pandas DataFrame of the dataset features + + y: The Pandas Series of the target values + + Returns + -------- + np.ndarray The dataframe X with y appended to the right + most column + """ + feats = X + target = y + + # convert the dataframe/series to numpy array if + # not in numpy array format + if isinstance(X, np.ndarray) is False: + if isinstance(X, pd.core.frame.DataFrame) is True: + feats = X.to_numpy() + else: + raise TypeError("X needs to be NumPy array or Pandas Dataframe") + + if isinstance(y, np.ndarray) is False: + if isinstance(y, pd.core.series.Series) is True: + target = y.values + else: + raise TypeError("y needs to be NumPy array or Pandas Series") + + # append the column vector as the right most column + dataset = np.append(feats, target.reshape(len(target), 1), axis=1) + + return dataset diff --git a/tests/integration/test_int_forest_clf.py b/tests/integration/test_int_forest_clf.py index 1514c66..0fcf8ee 100644 --- a/tests/integration/test_int_forest_clf.py +++ b/tests/integration/test_int_forest_clf.py @@ -9,42 +9,36 @@ def test_make_bootsrap(): """ Cant really do good test since it has random sample with replacement """ - X = np.array([[0.1], - [0.5], - [0.7], - [0.9]]) + X = np.array([[0.1], [0.5], [0.7], [0.9]]) y = np.array([0, 0, 1, 1]) - dataset = _make_dataset(X,y) + dataset = _make_dataset(X, y) forest = RandomForestClassifier() - tree = forest._bootstrap_tree(dataset=dataset, n_features=1) + tree = forest._bootstrap_tree(dataset=dataset, n_features=1) assert tree.n_features == 1 + def test_fit(): """ Cant really do good test since it has random sample with replacement """ - X = np.array([[0.1], - [0.5], - [0.7], - [0.9]]) + X = np.array([[0.1], [0.5], [0.7], [0.9]]) y = np.array([0, 0, 1, 1]) - forest = RandomForestClassifier() - model = forest.fit(X,y) + model = forest.fit(X, y) assert len(model.trees) == 10 -predict_tests = [(np.array([0, 0, 1, 1])), - (pd.Series([0, 0, 1, 1]))] +predict_tests = [(np.array([0, 0, 1, 1])), (pd.Series([0, 0, 1, 1]))] + -@pytest.mark.parametrize('y', predict_tests) +@pytest.mark.parametrize("y", predict_tests) def test_predict(y): """ Cant really do good test since it has random sample with replacement But check to make sure the shape is consistent and the predicted values are with in the training set range.
""" - X = np.array([[0.1], - [0.5], - [0.7], - [0.9]]) + X = np.array([[0.1], [0.5], [0.7], [0.9]]) forest = RandomForestClassifier() - model = forest.fit(X,y) + model = forest.fit(X, y) - correct_size = len(model.predict(X)) == 4 - correct_class = np.array_equal(np.unique(model.predict(X)), np.array([0,1])) + correct_size = len(model.predict(X)) == 4 + correct_class = np.array_equal(np.unique(model.predict(X)), np.array([0, 1])) assert correct_size and correct_class diff --git a/tests/integration/test_int_forest_reg.py b/tests/integration/test_int_forest_reg.py index 37d11c3..f4504a8 100644 --- a/tests/integration/test_int_forest_reg.py +++ b/tests/integration/test_int_forest_reg.py @@ -9,40 +9,36 @@ def test_make_bootsrap(): """ Cant really do good test since it has random sample with replacement """ - X = np.array([[0.1], - [0.5], - [0.7], - [0.9]]) + X = np.array([[0.1], [0.5], [0.7], [0.9]]) y = np.array([0.1, 0.5, 0.7, 0.9]) - dataset = _make_dataset(X,y) + dataset = _make_dataset(X, y) forest = RandomForestRegressor() - tree = forest._bootstrap_tree(dataset=dataset, n_features=1) + tree = forest._bootstrap_tree(dataset=dataset, n_features=1) assert tree.n_features == 1 + def test_fit(): """ Cant really do good test since it has random sample with replacement """ - X = np.array([[0.1], - [0.5], - [0.7], - [0.9]]) + X = np.array([[0.1], [0.5], [0.7], [0.9]]) y = np.array([0.1, 0.5, 0.7, 0.9]) forest = RandomForestRegressor() - model = forest.fit(X,y) + model = forest.fit(X, y) assert len(model.trees) == 10 -predict_tests = [(np.array([0.1, 0.5, 0.7, 0.9])), - (pd.Series([0.1, 0.5, 0.7, 0.9]))] -@pytest.mark.parametrize('y', predict_tests) +predict_tests = [(np.array([0.1, 0.5, 0.7, 0.9])), (pd.Series([0.1, 0.5, 0.7, 0.9]))] + + +@pytest.mark.parametrize("y", predict_tests) def test_predict(y): """ Cant really do good test since it has random sample with replacement @@ -51,20 +47,17 @@ def test_predict(y): with the training set target values. """ - X = np.array([[0., 0.1, 0.1], - [0., 0.5, 0.5], - [0., 0.7, 0.7], - [0., 0.9, 0.9]]) + X = np.array([[0.0, 0.1, 0.1], [0.0, 0.5, 0.5], [0.0, 0.7, 0.7], [0.0, 0.9, 0.9]]) y = np.array([0.1, 0.5, 0.7, 0.9]) forest = RandomForestRegressor() - model = forest.fit(X,y) + model = forest.fit(X, y) - preds = model.predict(X) - correct_size = len(preds) == 4 + preds = model.predict(X) + correct_size = len(preds) == 4 - bounded_max = np.max(preds) <= 0.9 + 1e-14 # ehhh - bounded_min = np.min(preds) >= 0.1 + bounded_max = np.max(preds) <= 0.9 + 1e-14 # ehhh + bounded_min = np.min(preds) >= 0.1 - assert correct_size and bounded_min and bounded_max \ No newline at end of file + assert correct_size and bounded_min and bounded_max diff --git a/tests/integration/test_int_tree_clf.py b/tests/integration/test_int_tree_clf.py index 12e7b87..7cda8c0 100644 --- a/tests/integration/test_int_tree_clf.py +++ b/tests/integration/test_int_tree_clf.py @@ -5,85 +5,126 @@ get_split_tests = [ - (1, - np.array([[0.1,0],[0.5,0],[0.7,1],[0.9,1]]), - {'column': 0, 'value': 0.7, 'groups': - (np.array([[0.1, 0. ],[0.5, 0. ]]), - np.array([[0.7, 1. ],[0.9, 1. 
]]))} - ), + ( + 1, + np.array([[0.1, 0], [0.5, 0], [0.7, 1], [0.9, 1]]), + { + "column": 0, + "value": 0.7, + "groups": ( + np.array([[0.1, 0.0], [0.5, 0.0]]), + np.array([[0.7, 1.0], [0.9, 1.0]]), + ), + }, + ), + ( + 2, + np.array([[0, 0.1, 0], [0, 0.5, 0], [0, 0.7, 1], [0, 0.9, 1]]), + { + "column": 1, + "value": 0.7, + "groups": ( + np.array([[0.0, 0.1, 0.0], [0.0, 0.5, 0.0]]), + np.array([[0.0, 0.7, 1.0], [0.0, 0.9, 1.0]]), + ), + }, + ), +] - (2, - np.array([[0,0.1,0],[0,0.5,0],[0,0.7,1],[0,0.9,1]]), - {'column': 1, 'value': 0.7, 'groups': - (np.array([[0. , 0.1, 0. ],[0. , 0.5, 0. ]]), - np.array([[0. , 0.7, 1. ],[0. , 0.9, 1. ]]))} - )] -@pytest.mark.parametrize('n_features, dataset, expected', get_split_tests) +@pytest.mark.parametrize("n_features, dataset, expected", get_split_tests) def test_get_split(n_features, dataset, expected): - tree = DecisionTreeClassifier(n_features = n_features) - result = tree._get_split(dataset) - column = result["column"] == expected["column"] - value = result["value"] == expected["value"] + tree = DecisionTreeClassifier(n_features=n_features) + result = tree._get_split(dataset) + column = result["column"] == expected["column"] + value = result["value"] == expected["value"] - left_grp = np.array_equal(result["groups"][0],expected["groups"][0]) - right_grp = np.array_equal(result["groups"][1],expected["groups"][1]) + left_grp = np.array_equal(result["groups"][0], expected["groups"][0]) + right_grp = np.array_equal(result["groups"][1], expected["groups"][1]) - assert (column and value and left_grp and right_grp) + assert column and value and left_grp and right_grp split_tests = [ - ({'column': 0, - 'value' : 0.0, - 'groups': (np.array([]), np.array([[0. , 0.1, 0 ], [0. , 0.5, 0]]))}, - 1, # depth - {'column': 0, 'value': 0.0, 'left': 0, 'right': 0}), - ({'column': 0, - 'value' : 0.0, - 'groups': (np.array([]), np.array([[0. , 0.1, 0 ], [0. , 0.5, 0]]))}, - 1, # depth - {'column': 0, 'value': 0.0, 'left': 0, 'right': 0}), - ({'column': 1, - 'value' : 9.0, - 'groups': (np.array([[0,1,1], [0. , 0.5, 1 ]]), - np.array([[0,1,0], [0. , 0.5, 0 ]]))}, - 2, # depth - {'column': 1, 'value': 9.0, 'left': 1, 'right': 0}), - ({'column': 1, - 'value': 0.7, - 'groups': (np.array([[0. , 0.1, 0 ]]), - np.array([[0. , 0.7, 1], [0. , 0.8, 0 ],[0. , 0.9, 1]]))}, - 2, # depth - {'column': 1, 'value': 0.7, 'left': 0, 'right': 1}), - ({'column': 1, - 'value': 0.7, - 'groups': (np.array([[0. , 0.7, 1], [0. , 0.8, 0 ],[0. , 0.9, 1]]), - np.array([[0. 
, 0.1, 0 ]]))}, - 2, # depth - {'column': 1, 'value': 0.7, 'left': 1, 'right': 0}) + ( + { + "column": 0, + "value": 0.0, + "groups": (np.array([]), np.array([[0.0, 0.1, 0], [0.0, 0.5, 0]])), + }, + 1, # depth + {"column": 0, "value": 0.0, "left": 0, "right": 0}, + ), + ( + { + "column": 0, + "value": 0.0, + "groups": (np.array([]), np.array([[0.0, 0.1, 0], [0.0, 0.5, 0]])), + }, + 1, # depth + {"column": 0, "value": 0.0, "left": 0, "right": 0}, + ), + ( + { + "column": 1, + "value": 9.0, + "groups": ( + np.array([[0, 1, 1], [0.0, 0.5, 1]]), + np.array([[0, 1, 0], [0.0, 0.5, 0]]), + ), + }, + 2, # depth + {"column": 1, "value": 9.0, "left": 1, "right": 0}, + ), + ( + { + "column": 1, + "value": 0.7, + "groups": ( + np.array([[0.0, 0.1, 0]]), + np.array([[0.0, 0.7, 1], [0.0, 0.8, 0], [0.0, 0.9, 1]]), + ), + }, + 2, # depth + {"column": 1, "value": 0.7, "left": 0, "right": 1}, + ), + ( + { + "column": 1, + "value": 0.7, + "groups": ( + np.array([[0.0, 0.7, 1], [0.0, 0.8, 0], [0.0, 0.9, 1]]), + np.array([[0.0, 0.1, 0]]), + ), + }, + 2, # depth + {"column": 1, "value": 0.7, "left": 1, "right": 0}, + ), ] -@pytest.mark.parametrize('test_node, depth, expected', split_tests) + +@pytest.mark.parametrize("test_node, depth, expected", split_tests) def test__split(test_node, depth, expected): - tree = DecisionTreeClassifier(max_depth = 2, min_size= 2, n_features = 2) + tree = DecisionTreeClassifier(max_depth=2, min_size=2, n_features=2) tree._split(test_node, depth) assert test_node == expected public_predict_test = [ - (np.array([[0. , 0.1], - [0. , 0.5], - [0. , 0.7], - [0. , 0.9]]), - np.array([0, 0, 1, 1])), # y - (pd.DataFrame({"c1":[0., 0., 0., 0.], - "c2":[0.1, 0.5, 0.7, 0.9]}), - pd.Series([0, 0, 1, 1])) + ( + np.array([[0.0, 0.1], [0.0, 0.5], [0.0, 0.7], [0.0, 0.9]]), + np.array([0, 0, 1, 1]), + ), # y + ( + pd.DataFrame({"c1": [0.0, 0.0, 0.0, 0.0], "c2": [0.1, 0.5, 0.7, 0.9]}), + pd.Series([0, 0, 1, 1]), + ), ] -@pytest.mark.parametrize('X, y', public_predict_test) + +@pytest.mark.parametrize("X, y", public_predict_test) def test_predict(X, y): - tree = DecisionTreeClassifier() - model = tree.fit(X, y) - assert np.array_equal(model.predict(X),y) + tree = DecisionTreeClassifier() + model = tree.fit(X, y) + assert np.array_equal(model.predict(X), y) diff --git a/tests/integration/test_int_tree_reg.py b/tests/integration/test_int_tree_reg.py index 66377f4..05bb3aa 100644 --- a/tests/integration/test_int_tree_reg.py +++ b/tests/integration/test_int_tree_reg.py @@ -3,50 +3,49 @@ import numpy as np from randomforests import DecisionTreeRegressor + def test__get_split(): - tree = DecisionTreeRegressor(n_features = 1) - result = tree._get_split(np.array([[0.1,0.1],[0.5,0.5],[0.7,0.7],[0.9,0.9]])) + tree = DecisionTreeRegressor(n_features=1) + result = tree._get_split(np.array([[0.1, 0.1], [0.5, 0.5], [0.7, 0.7], [0.9, 0.9]])) + + column = result["column"] == 0 + value = result["value"] == 0.5 - column = result["column"] == 0 - value = result["value"] == 0.5 + left_grp = np.array_equal(result["groups"][0], np.array([[0.1, 0.1]])) + right_grp = np.array_equal( + result["groups"][1], np.array([[0.5, 0.5], [0.7, 0.7], [0.9, 0.9]]) + ) - left_grp = np.array_equal(result["groups"][0],np.array([[0.1, 0.1]])) - right_grp = np.array_equal(result["groups"][1],np.array([[0.5, 0.5], - [0.7, 0.7], - [0.9, 0.9]])) + assert column and value and left_grp and right_grp - assert (column and value and left_grp and right_grp) public_predict_test = [ - (np.array([[0. , 0.1], - [0. , 0.5], - [0. , 0.7], - [0. 
, 0.9]]), # X - np.array([0.1, 0.5, 0.7, 0.9]),# y - np.array([0.1, 0.6, 0.6, 0.9])),# y_test - - (pd.DataFrame({"c1":[0., 0., 0., 0.], - "c2":[0.1, 0.5, 0.7, 0.9]}),# X - pd.Series([0.1, 0.5, 0.7, 0.9]),# y - np.array([0.1, 0.6, 0.6, 0.9])# y_test - )# y_test + ( + np.array([[0.0, 0.1], [0.0, 0.5], [0.0, 0.7], [0.0, 0.9]]), # X + np.array([0.1, 0.5, 0.7, 0.9]), # y + np.array([0.1, 0.6, 0.6, 0.9]), + ), # y_test + ( + pd.DataFrame({"c1": [0.0, 0.0, 0.0, 0.0], "c2": [0.1, 0.5, 0.7, 0.9]}), # X + pd.Series([0.1, 0.5, 0.7, 0.9]), # y + np.array([0.1, 0.6, 0.6, 0.9]), # y_test + ), # y_test ] -@pytest.mark.parametrize('X, y, y_test', public_predict_test) +@pytest.mark.parametrize("X, y, y_test", public_predict_test) def test_predict(X, y, y_test): - tree = DecisionTreeRegressor() - model = tree.fit(X, y) + tree = DecisionTreeRegressor() + model = tree.fit(X, y) assert np.array_equal(model.predict(X), y_test) def test_score(): - X = pd.DataFrame({"c1":[0., 0., 0., 0.], - "c2":[0.1, 0.5, 0.7, 0.9]}) + X = pd.DataFrame({"c1": [0.0, 0.0, 0.0, 0.0], "c2": [0.1, 0.5, 0.7, 0.9]}) y = pd.Series([0.1, 0.5, 0.7, 0.9]) - tree = DecisionTreeRegressor() - model = tree.fit(X, y) + tree = DecisionTreeRegressor() + model = tree.fit(X, y) - assert model.score(X,y) == pytest.approx(0.004999999) \ No newline at end of file + assert model.score(X, y) == pytest.approx(0.004999999) diff --git a/tests/unit/test_forest.py b/tests/unit/test_forest.py index f78507a..81eb0c2 100644 --- a/tests/unit/test_forest.py +++ b/tests/unit/test_forest.py @@ -2,14 +2,14 @@ from randomforests.Forest import RandomForest + def test_init(): forest = RandomForest() - assert forest.n_trees == 10 and \ - forest.min_size == 1 and \ - forest.max_depth == 2 + assert forest.n_trees == 10 and forest.min_size == 1 and forest.max_depth == 2 + def test_subsample(): - forest = RandomForest() - dataset = np.array([[0.1,0],[0.5,0],[0.7,1],[0.9,1]]) + forest = RandomForest() + dataset = np.array([[0.1, 0], [0.5, 0], [0.7, 1], [0.9, 1]]) resampled = forest._subsample(dataset) - assert resampled.shape == dataset.shape \ No newline at end of file + assert resampled.shape == dataset.shape diff --git a/tests/unit/test_tree.py b/tests/unit/test_tree.py index 9d74109..9bfd130 100644 --- a/tests/unit/test_tree.py +++ b/tests/unit/test_tree.py @@ -6,78 +6,77 @@ def test_init(): - tree = DecisionTree(3,2,1) + tree = DecisionTree(3, 2, 1) + + assert tree.max_depth == 3 and tree.min_size == 2 and tree.n_features == 1 - assert (tree.max_depth == 3 and - tree.min_size == 2 and - tree.n_features == 1) def test_set_features(): tree = DecisionTree() - X = pd.DataFrame({"x1":[0,1],"x2":[1,0]}) + X = pd.DataFrame({"x1": [0, 1], "x2": [1, 0]}) tree._set_features(X) assert tree.n_features == 2 + def test_set_features_error(): tree = DecisionTree(n_features=5) - X = pd.DataFrame({"x1":[0,1],"x2":[1,0]}) + X = pd.DataFrame({"x1": [0, 1], "x2": [1, 0]}) with pytest.raises(Exception): tree._set_features(X) -test_split_data =[ - ( np.array([[1, 4, 3, 0], - [2, 3, 3, 0], - [3, 2, 3, 0], - [4, 1, 3, 1]]), - 0, 3, - [np.array([[1, 4, 3, 0], - [2, 3, 3, 0]]), - np.array([[3, 2, 3, 0], - [4, 1, 3, 1]]) - ]), - ( np.array([[1, 4, 3, 0], - [2, 3, 3, 0], - [3, 2, 3, 0], - [4, 1, 3, 1]]), - 1, 2, - [np.array([[4, 1, 3, 1]]), - np.array([[1, 4, 3, 0], - [2, 3, 3, 0], - [3, 2, 3, 0]]) - ]) - ] - - - -@pytest.mark.parametrize('data, column, value, expected', test_split_data) +test_split_data = [ + ( + np.array([[1, 4, 3, 0], [2, 3, 3, 0], [3, 2, 3, 0], [4, 1, 3, 1]]), + 0, + 3, + [ + 
np.array([[1, 4, 3, 0], [2, 3, 3, 0]]), + np.array([[3, 2, 3, 0], [4, 1, 3, 1]]), + ], + ), + ( + np.array([[1, 4, 3, 0], [2, 3, 3, 0], [3, 2, 3, 0], [4, 1, 3, 1]]), + 1, + 2, + [ + np.array([[4, 1, 3, 1]]), + np.array([[1, 4, 3, 0], [2, 3, 3, 0], [3, 2, 3, 0]]), + ], + ), +] + + +@pytest.mark.parametrize("data, column, value, expected", test_split_data) def test_split_dataset(data, column, value, expected): - tree = DecisionTree(max_depth=5, min_size=2) - result = tree._test_split(dataset = data, - column = column, - value = value) - - assert( np.array_equal(result[0], expected[0]) & - np.array_equal(result[0], expected[0])) - - -predict_tests = [(np.array([1,2]), {"column":0, "value":2, "left":1}, 1), - (np.array([1,2]), {"column":0, "value":1, "right":0}, 0), - (np.array([1,2]), {"column":1, "value":3, "left":1}, 1), - (np.array([1,2]), - {"column":0, "value":1, "right": - {"column":1, "value":5, "left": 0}}, 0), - (np.array([1,2]), - {"column":0, "value":2, "left": - {"column":1, "value":1, "right": 1}}, 1) - ] - - -@pytest.mark.parametrize('row, node, expected', predict_tests) + tree = DecisionTree(max_depth=5, min_size=2) + result = tree._test_split(dataset=data, column=column, value=value) + + assert np.array_equal(result[0], expected[0]) & np.array_equal( + result[1], expected[1] + ) + + +predict_tests = [ + (np.array([1, 2]), {"column": 0, "value": 2, "left": 1}, 1), + (np.array([1, 2]), {"column": 0, "value": 1, "right": 0}, 0), + (np.array([1, 2]), {"column": 1, "value": 3, "left": 1}, 1), + ( + np.array([1, 2]), + {"column": 0, "value": 1, "right": {"column": 1, "value": 5, "left": 0}}, + 0, + ), + ( + np.array([1, 2]), + {"column": 0, "value": 2, "left": {"column": 1, "value": 1, "right": 1}}, + 1, + ), +] + + +@pytest.mark.parametrize("row, node, expected", predict_tests) def test_private_predict(row, node, expected): - tree = DecisionTree(max_depth=5, min_size=2) - result = tree._predict(row = row, node = node) - assert expected == result - - + tree = DecisionTree(max_depth=5, min_size=2) + result = tree._predict(row=row, node=node) + assert expected == result diff --git a/tests/unit/test_unit_forest_clf.py b/tests/unit/test_unit_forest_clf.py index ac3ea09..2350344 100644 --- a/tests/unit/test_unit_forest_clf.py +++ b/tests/unit/test_unit_forest_clf.py @@ -2,14 +2,16 @@ from randomforests import RandomForestClassifier - def test_default(): forest = RandomForestClassifier() - assert (forest.n_trees == 10 and - forest.max_depth == 2 and - forest.min_size == 1 and - forest.cost == 'gini') + assert ( + forest.n_trees == 10 + and forest.max_depth == 2 + and forest.min_size == 1 + and forest.cost == "gini" + ) + def test_cost_exception(): with pytest.raises(Exception): @@ -17,14 +19,19 @@ def test_cost_exception(): params_tests = [ - ({"max_depth":3, "min_size":5, "n_trees":53, "cost":'gini'}, - {"max_depth":3, "min_size":5, "n_trees":53, "cost":'gini'}) + ( + {"max_depth": 3, "min_size": 5, "n_trees": 53, "cost": "gini"}, + {"max_depth": 3, "min_size": 5, "n_trees": 53, "cost": "gini"}, + ) ] -@pytest.mark.parametrize('test_dict, expected_dict', params_tests) + +@pytest.mark.parametrize("test_dict, expected_dict", params_tests) def test_get_params(test_dict, expected_dict): - forest = RandomForestClassifier(max_depth = test_dict["max_depth"], - min_size = test_dict["min_size"], - n_trees = test_dict["n_trees"]) + forest = RandomForestClassifier( + max_depth=test_dict["max_depth"], + min_size=test_dict["min_size"], + n_trees=test_dict["n_trees"], + ) assert expected_dict ==
forest.get_params() diff --git a/tests/unit/test_unit_forest_reg.py b/tests/unit/test_unit_forest_reg.py index e478a0e..f90fed4 100644 --- a/tests/unit/test_unit_forest_reg.py +++ b/tests/unit/test_unit_forest_reg.py @@ -5,10 +5,13 @@ def test_default(): forest = RandomForestRegressor() - assert (forest.n_trees == 10 and - forest.max_depth == 2 and - forest.min_size == 1 and - forest.cost == 'mse') + assert ( + forest.n_trees == 10 + and forest.max_depth == 2 + and forest.min_size == 1 + and forest.cost == "mse" + ) + def test_cost_exception(): with pytest.raises(Exception): @@ -16,14 +19,19 @@ def test_cost_exception(): params_tests = [ - ({"max_depth":3, "min_size":5, "n_trees":53, "cost":'mse'}, - {"max_depth":3, "min_size":5, "n_trees":53, "cost":'mse'}) + ( + {"max_depth": 3, "min_size": 5, "n_trees": 53, "cost": "mse"}, + {"max_depth": 3, "min_size": 5, "n_trees": 53, "cost": "mse"}, + ) ] -@pytest.mark.parametrize('test_dict, expected_dict', params_tests) + +@pytest.mark.parametrize("test_dict, expected_dict", params_tests) def test_get_params(test_dict, expected_dict): - forest = RandomForestRegressor(max_depth = test_dict["max_depth"], - min_size = test_dict["min_size"], - n_trees = test_dict["n_trees"]) + forest = RandomForestRegressor( + max_depth=test_dict["max_depth"], + min_size=test_dict["min_size"], + n_trees=test_dict["n_trees"], + ) assert expected_dict == forest.get_params() diff --git a/tests/unit/test_unit_tree_clf.py b/tests/unit/test_unit_tree_clf.py index a5413e8..9d6c539 100644 --- a/tests/unit/test_unit_tree_clf.py +++ b/tests/unit/test_unit_tree_clf.py @@ -3,70 +3,91 @@ from randomforests import DecisionTreeClassifier -leaf_tests =[(np.array([[0.],[0.], [1.],[0.]]),0), - (np.array([[0.],[1.], [1.],[1.]]),1), - (np.array([[1.],[1.], [1.],[1.]]),1)] - +leaf_tests = [ + (np.array([[0.0], [0.0], [1.0], [0.0]]), 0), + (np.array([[0.0], [1.0], [1.0], [1.0]]), 1), + (np.array([[1.0], [1.0], [1.0], [1.0]]), 1), +] -@pytest.mark.parametrize('y, expected', leaf_tests) +@pytest.mark.parametrize("y, expected", leaf_tests) def test_make_leaf(y, expected): - tree = DecisionTreeClassifier(max_depth=5, min_size=2) - result = tree._make_leaf(y) - assert expected == pytest.approx(result) - + tree = DecisionTreeClassifier(max_depth=5, min_size=2) + result = tree._make_leaf(y) + assert expected == pytest.approx(result) + + +gini_index_tests = [ + (np.array([[0.0], [0.0], [0.0], [0.0]]), 0), + (np.array([[0.0], [0.0], [1.0], [1.0]]), 0.5), + (np.array([[0.0], [0.0], [0.0], [1.0]]), 0.375), + (np.array([[1.0], [1.0], [0.0], [1.0]]), 0.375), + (np.array([[1.0], [1.0], [0.0]]), 0.4444444), + (np.array([[1.0], [1.0], [1.0], [1.0]]), 0), +] -gini_index_tests =[ - (np.array([[0.],[0.], [0.],[0.]]),0), - (np.array([[0.],[0.], [1.],[1.]]),0.5), - (np.array([[0.],[0.], [0.],[1.]]),0.375), - (np.array([[1.],[1.], [0.],[1.]]),0.375), - (np.array([[1.],[1.], [0.]]), 0.4444444), - (np.array([[1.],[1.], [1.],[1.]]),0)] -@pytest.mark.parametrize('y, expected', gini_index_tests) +@pytest.mark.parametrize("y, expected", gini_index_tests) def test_gini_index(y, expected): - tree = DecisionTreeClassifier(max_depth=5, min_size=2) - result = tree._gini_index(y) - assert expected == pytest.approx(result) - - -gini_cost_tests =[ ((np.array([[0.],[0.], [0.],[0.]]), - np.array([[0.],[0.], [1.],[1.]])), - 0.25), - ((np.array([[0.],[1.], [0.],[0.]]), - np.array([[0.],[0.], [0.],[1.]])), - 0.375) + tree = DecisionTreeClassifier(max_depth=5, min_size=2) + result = tree._gini_index(y) + assert expected == 
pytest.approx(result) + + +gini_cost_tests = [ + ( + ( + np.array([[0.0], [0.0], [0.0], [0.0]]), + np.array([[0.0], [0.0], [1.0], [1.0]]), + ), + 0.25, + ), + ( + ( + np.array([[0.0], [1.0], [0.0], [0.0]]), + np.array([[0.0], [0.0], [0.0], [1.0]]), + ), + 0.375, + ), ] -@pytest.mark.parametrize('groups, expected', gini_cost_tests) -def test_cost_gini(groups, expected): - tree = DecisionTreeClassifier() - result = tree._cost_gini(groups) - assert expected == result +@pytest.mark.parametrize("groups, expected", gini_cost_tests) +def test_cost_gini(groups, expected): + tree = DecisionTreeClassifier() + result = tree._cost_gini(groups) + assert expected == result get_params_tests = [ - ({"max_depth":3, "min_size":5, "n_features":None, "cost":'gini'}, - {"max_depth":3, "min_size":5, "n_features":None, "cost":'gini'}) + ( + {"max_depth": 3, "min_size": 5, "n_features": None, "cost": "gini"}, + {"max_depth": 3, "min_size": 5, "n_features": None, "cost": "gini"}, + ) ] -@pytest.mark.parametrize('test_dict, expected_dict', get_params_tests) + +@pytest.mark.parametrize("test_dict, expected_dict", get_params_tests) def test_get_params(test_dict, expected_dict): - tree = DecisionTreeClassifier(max_depth = test_dict["max_depth"], - min_size = test_dict["min_size"], - n_features = test_dict["n_features"]) + tree = DecisionTreeClassifier( + max_depth=test_dict["max_depth"], + min_size=test_dict["min_size"], + n_features=test_dict["n_features"], + ) assert expected_dict == tree.get_params() + def test_default_getparams(): tree = DecisionTreeClassifier() - assert {"max_depth":2, "min_size":1, "n_features":None, "cost":'gini'} == tree.get_params() + assert { + "max_depth": 2, + "min_size": 1, + "n_features": None, + "cost": "gini", + } == tree.get_params() def test_cost_exception(): with pytest.raises(Exception): tree = DecisionTreeClassifier(cost="mse") - - diff --git a/tests/unit/test_unit_tree_reg.py b/tests/unit/test_unit_tree_reg.py index 19633ef..3946c06 100644 --- a/tests/unit/test_unit_tree_reg.py +++ b/tests/unit/test_unit_tree_reg.py @@ -3,12 +3,14 @@ from randomforests import DecisionTreeRegressor -leaf_tests =[(np.array([[0.],[0.], [1.],[0.]]),0.25), - (np.array([[0.],[1.], [1.],[1.]]),0.75), - (np.array([[1.],[1.], [1.],[1.]]),1.0)] +leaf_tests = [ + (np.array([[0.0], [0.0], [1.0], [0.0]]), 0.25), + (np.array([[0.0], [1.0], [1.0], [1.0]]), 0.75), + (np.array([[1.0], [1.0], [1.0], [1.0]]), 1.0), +] -@pytest.mark.parametrize('y, expected', leaf_tests) +@pytest.mark.parametrize("y, expected", leaf_tests) def test_make_leaf(y, expected): tree = DecisionTreeRegressor(max_depth=5, min_size=2) result = tree._make_leaf(y) @@ -16,38 +18,49 @@ def test_make_leaf(y, expected): get_params_tests = [ - ({"max_depth":3, "min_size":5, "n_features":None}, - {"max_depth":3, "min_size":5, "n_features":None}) + ( + {"max_depth": 3, "min_size": 5, "n_features": None}, + {"max_depth": 3, "min_size": 5, "n_features": None}, + ) ] -@pytest.mark.parametrize('test_dict, expected_dict', get_params_tests) +@pytest.mark.parametrize("test_dict, expected_dict", get_params_tests) def test_get_params(test_dict, expected_dict): - tree = DecisionTreeRegressor(max_depth = test_dict["max_depth"], - min_size = test_dict["min_size"], - n_features = test_dict["n_features"]) + tree = DecisionTreeRegressor( + max_depth=test_dict["max_depth"], + min_size=test_dict["min_size"], + n_features=test_dict["n_features"], + ) assert expected_dict == tree.get_params() def test_default_getparams(): tree = DecisionTreeRegressor() - assert 
{"max_depth":2, "min_size":1, "n_features":None} == tree.get_params() - - + assert {"max_depth": 2, "min_size": 1, "n_features": None} == tree.get_params() + + +mse_cost_tests = [ + ( + ( + np.array([[0.0], [0.0], [0.0], [0.0]]), + np.array([[0.0], [0.0], [1.0], [1.0]]), + ), + 0.125, + ), + ( + ( + np.array([[0.0], [1.0], [0.0], [0.0]]), + np.array([[0.0], [0.0], [0.0], [1.0]]), + ), + 0.1874999995, + ), +] -mse_cost_tests =[ ((np.array([[0.],[0.], [0.],[0.]]), - np.array([[0.],[0.], [1.],[1.]])), - 0.125), - ((np.array([[0.],[1.], [0.],[0.]]), - np.array([[0.],[0.], [0.],[1.]])), - 0.1874999995) - ] -@pytest.mark.parametrize('groups, expected', mse_cost_tests) +@pytest.mark.parametrize("groups, expected", mse_cost_tests) def test_cost_mse(groups, expected): tree = DecisionTreeRegressor() result = tree._cost_mse(groups) assert expected == pytest.approx(result) - - diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index e93d3ad..82c7722 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,33 +1,38 @@ import pytest -import pandas as pd +import pandas as pd import numpy as np from randomforests.utils import _make_dataset + def test_make_datatset(): - dataset = np.array([[2.771244718,1.784783929,0], - [1.728571309,1.169761413,0], - [3.678319846,2.81281357,1], - [3.961043357,2.61995032,1], - [2.999208922,2.209014212,0], - [7.497545867,3.162953546,0], - [9.00220326,3.339047188,1], - [7.444542326,0.476683375,1], - [10.12493903,3.234550982,0], - [6.642287351,3.319983761,1]]) + dataset = np.array( + [ + [2.771244718, 1.784783929, 0], + [1.728571309, 1.169761413, 0], + [3.678319846, 2.81281357, 1], + [3.961043357, 2.61995032, 1], + [2.999208922, 2.209014212, 0], + [7.497545867, 3.162953546, 0], + [9.00220326, 3.339047188, 1], + [7.444542326, 0.476683375, 1], + [10.12493903, 3.234550982, 0], + [6.642287351, 3.319983761, 1], + ] + ) + df = pd.DataFrame(data=dataset, columns=["col1", "col2", "tar"]) + X = df[["col1", "col2"]] + y = df["tar"] - df = pd.DataFrame(data=dataset,columns =['col1','col2','tar']) - X = df[["col1","col2"]] - y = df["tar"] + assert np.array_equal(dataset, _make_dataset(X=X, y=y)) - assert np.array_equal(dataset, _make_dataset(X=X, y=y)) +dataset_test = [(1, 2), (None, None), (np.array([[2.771244718, 1.784783929]]), None)] -dataset_test = [(1,2),(None,None), (np.array([[2.771244718,1.784783929]]),None)] -@pytest.mark.parametrize('X, y', dataset_test) -def test_make_dataset_errors(X,y): - with pytest.raises(Exception): - _make_dataset(X,y) +@pytest.mark.parametrize("X, y", dataset_test) +def test_make_dataset_errors(X, y): + with pytest.raises(Exception): + _make_dataset(X, y)