Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
language: python
python:
- "3.6"
- "3.7"
- "3.8"
before_install:
- pip install pytest==5.4.3
- pip install pytest-cov==2.10.0
- pip install codecov==2.1.8
install:
- python setup.py install
- pip install .
script:
- pytest --cov
- python setup.py test
after_success:
- codecov
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,17 +80,17 @@ Uses the `setup.py` generated by [PyScaffold](https://pypi.org/project/PyScaffol

## Test
-----------------
Uses the `setup.py` generated by [PyScaffold](https://pypi.org/project/PyScaffold/):
Uses [pytest](https://docs.pytest.org/) and [pytest-cov](https://pypi.org/project/pytest-cov/):

python setup.py test
pytest

## Dependencies
--------------
Dependencies are minimal:

- Python (>= 3.6)
- [Scikit-Learn](https://scikit-learn.org/stable/) (>=0.23)
- [Pandas](https://pandas.pydata.org/) (>=1.0)
- [Python (>= 3.6)](https://www.python.org/downloads/release/python-360/)
- [Scikit-Learn (>=0.23)](https://scikit-learn.org/stable/)
- [Pandas (>=1.0)](https://pandas.pydata.org/)


## References
Expand Down
5 changes: 5 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[pytest]
testpaths =
src
tests
addopts = --cov --cov-append
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ setup_requires = pyscaffold>=3.2a0,<3.3a0
install_requires =
pandas>=1.0
scikit-learn>=0.23
black==21.5b2

# The usage of tests_require is discouraged, see `Dependency Management` docs
tests_require =
Expand Down
31 changes: 16 additions & 15 deletions src/randomforests/Forest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np


class RandomForest:
"""
A Random Forest base class.
Expand Down Expand Up @@ -29,12 +30,12 @@ class RandomForest:

def __init__(self, n_trees=10, max_depth=2, min_size=1):
    """
    Store the hyper-parameters shared by the forest models.

    Parameters
    ----------
    n_trees : int
        Stored as ``self.n_trees``.
    max_depth : int
        Stored as ``self.max_depth``.
    min_size : int
        Stored as ``self.min_size``.
    """
    self.max_depth = max_depth
    self.min_size = min_size
    self.n_trees = n_trees
    # Placeholders only: nothing in this constructor computes a cost
    # function or builds any trees.
    self.cost = None
    self.trees = None

def _subsample(self, dataset : np.ndarray) -> np.ndarray:
def _subsample(self, dataset: np.ndarray) -> np.ndarray:
"""
This function returns a bootstrapped version of the dataset which
has the same number of rows.
Expand All @@ -50,20 +51,20 @@ def _subsample(self, dataset : np.ndarray) -> np.ndarray:

number_of_rows = dataset.shape[0]
sample_of_rows = number_of_rows
random_indices = np.random.choice(number_of_rows,
size=sample_of_rows,
replace=True)
return dataset[random_indices,:]
random_indices = np.random.choice(
number_of_rows, size=sample_of_rows, replace=True
)
return dataset[random_indices, :]

def set_params(self, **parameters):
    """
    Assign each keyword argument as an attribute of this object.

    Parameters
    ----------
    **parameters
        Attribute names mapped to the values to store. No check is made
        that a name is an existing attribute.

    Returns
    -------
    self
        The same object, so calls can be chained.
    """
    for name in parameters:
        setattr(self, name, parameters[name])
    return self


def get_params(self, deep=True):
    """
    Return the hyper-parameters of this object as a dictionary.

    Parameters
    ----------
    deep : bool
        Accepted but not used by this implementation.

    Returns
    -------
    dict
        The current ``max_depth``, ``min_size``, ``cost`` and ``n_trees``
        attribute values, keyed by those names.
    """
    return {
        "max_depth": self.max_depth,
        "min_size": self.min_size,
        "cost": self.cost,
        "n_trees": self.n_trees,
    }
52 changes: 28 additions & 24 deletions src/randomforests/ForestClassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score

class RandomForestClassifier (BaseEstimator, ClassifierMixin, RandomForest):

class RandomForestClassifier(BaseEstimator, ClassifierMixin, RandomForest):
"""
A random forest classification model that extends the abstract base class
of random forest.
Expand All @@ -33,26 +34,28 @@ class RandomForestClassifier (BaseEstimator, ClassifierMixin, RandomForest):
The cost function
"""

def __init__(
    self,
    n_trees: int = 10,
    max_depth: int = 2,
    min_size: int = 1,
    cost: str = "gini",
):
    """
    Constructor for the random forest classifier. It mainly initializes
    the attributes of the class by calling the base class constructor.
    It also checks the cost function string: only 'gini' is accepted,
    otherwise an error is raised.

    Parameters
    ----------
    n_trees : int
        Forwarded to the base class constructor.
    max_depth : int
        Forwarded to the base class constructor.
    min_size : int
        Forwarded to the base class constructor.
    cost : str
        Name of the cost function; must be ``"gini"``.

    Raises
    ------
    NameError
        If ``cost`` is anything other than ``"gini"``.
    """
    super().__init__(n_trees=n_trees, max_depth=max_depth, min_size=min_size)

    # Validation happens after the base constructor has run, so the base
    # attributes are already set when the error is raised.
    if cost != "gini":
        raise NameError("Not valid cost function")
    self.cost = "gini"

def fit(self, X, y = None):
def fit(self, X, y=None):
"""
Fit the random forest to the training set train.

Expand All @@ -64,15 +67,15 @@ def fit(self, X, y = None):
"""

n_features = round(sqrt(X.shape[1]))
dataset = _make_dataset(X,y)
self.trees = [self._bootstrap_tree(dataset = dataset,
n_features = n_features)
for i in range(self.n_trees)]
dataset = _make_dataset(X, y)
self.trees = [
self._bootstrap_tree(dataset=dataset, n_features=n_features)
for i in range(self.n_trees)
]

return self


def predict(self, x : pd.DataFrame) -> int:
def predict(self, x: pd.DataFrame) -> int:
"""
Predict the class that this sample datapoint belongs to.

Expand All @@ -94,7 +97,6 @@ def predict(self, x : pd.DataFrame) -> int:

return sp.stats.mode(preds)[0][0]


def score(self, X=None, y=None):
"""
Returns the accuracy of the model
Expand All @@ -107,12 +109,14 @@ def score(self, X=None, y=None):

"""

return accuracy_score(y,self.predict(X))
return accuracy_score(y, self.predict(X))

def _bootstrap_tree(
    self, dataset: np.ndarray, n_features: int
) -> DecisionTreeClassifier:
    """
    Fit a single decision tree classifier on a resampled copy of the dataset.

    Parameters
    ----------
    dataset : np.ndarray
        Two-dimensional array; it is resampled with ``self._subsample``
        and then split into all-but-last columns and the last column.
    n_features : int
        Passed to the tree constructor as ``n_features``.

    Returns
    -------
    DecisionTreeClassifier
        Whatever ``tree.fit`` returns (annotated as the fitted tree).
    """
    sample = self._subsample(dataset)
    tree = DecisionTreeClassifier(
        max_depth=self.max_depth, min_size=self.min_size, n_features=n_features
    )
    # NOTE(review): presumably the last column holds the target and the
    # preceding columns the features, as packed by _make_dataset -- confirm.
    return tree.fit(sample[:, :-1], sample[:, -1])
53 changes: 30 additions & 23 deletions src/randomforests/ForestRegressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import mean_squared_error

class RandomForestRegressor (BaseEstimator, ClassifierMixin, RandomForest):

class RandomForestRegressor(BaseEstimator, ClassifierMixin, RandomForest):
"""
A random forest regression model that extends the abstract base class
of random forest.
Expand All @@ -32,25 +33,28 @@ class RandomForestRegressor (BaseEstimator, ClassifierMixin, RandomForest):
The cost function
"""

def __init__(
    self,
    n_trees: int = 10,
    max_depth: int = 2,
    min_size: int = 1,
    cost: str = "mse",
):
    """
    Constructor for the random forest regressor. It mainly initializes
    the attributes of the class by calling the base class constructor.
    It also checks the cost function string: only 'mse' is accepted,
    otherwise an error is raised.

    Parameters
    ----------
    n_trees : int
        Forwarded to the base class constructor.
    max_depth : int
        Forwarded to the base class constructor.
    min_size : int
        Forwarded to the base class constructor.
    cost : str
        Name of the cost function; must be ``"mse"``.

    Raises
    ------
    NameError
        If ``cost`` is anything other than ``"mse"``.
    """
    super().__init__(n_trees=n_trees, max_depth=max_depth, min_size=min_size)

    # Validation happens after the base constructor has run, so the base
    # attributes are already set when the error is raised.
    if cost != "mse":
        raise NameError("Not valid cost function")
    self.cost = "mse"

def fit(self, X, y = None):
def fit(self, X, y=None):
"""
Fit the random forest to the training set train.

Expand All @@ -70,14 +74,15 @@ def fit(self, X, y = None):
"""

n_features = round(sqrt(X.shape[1]))
dataset = _make_dataset(X,y)
self.trees = [self._bootstrap_tree(dataset = dataset,
n_features = n_features)
for i in range(self.n_trees)]
dataset = _make_dataset(X, y)
self.trees = [
self._bootstrap_tree(dataset=dataset, n_features=n_features)
for i in range(self.n_trees)
]

return self

def predict(self, x : pd.DataFrame) -> int:
def predict(self, x: pd.DataFrame) -> int:
"""
Predict the value for this sample datapoint

Expand All @@ -94,10 +99,10 @@ def predict(self, x : pd.DataFrame) -> int:
rows = x.to_numpy()
else:
rows = x

preds = np.vstack([tree.predict(rows) for tree in self.trees])

return np.mean(preds,axis=0)
return np.mean(preds, axis=0)

def score(self, X=None, y=None):
"""
Expand All @@ -114,12 +119,14 @@ def score(self, X=None, y=None):
float
"""

return mean_squared_error(y,self.predict(X))
return mean_squared_error(y, self.predict(X))

def _bootstrap_tree(
    self, dataset: np.ndarray, n_features: int
) -> DecisionTreeRegressor:
    """
    Fit a single decision tree regressor on a resampled copy of the dataset.

    Parameters
    ----------
    dataset : np.ndarray
        Two-dimensional array; it is resampled with ``self._subsample``
        and then split into all-but-last columns and the last column.
    n_features : int
        Passed to the tree constructor as ``n_features``.

    Returns
    -------
    DecisionTreeRegressor
        Whatever ``tree.fit`` returns (annotated as the fitted tree).
    """
    sample = self._subsample(dataset)
    tree = DecisionTreeRegressor(
        max_depth=self.max_depth, min_size=self.min_size, n_features=n_features
    )
    # NOTE(review): presumably the last column holds the target and the
    # preceding columns the features, as packed by _make_dataset -- confirm.
    return tree.fit(sample[:, :-1], sample[:, -1])
Loading