135 changes: 79 additions & 56 deletions machine_learning/xgboost_classifier.py
@@ -1,77 +1,100 @@
# XGBoost Classifier Example
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


class SimpleXGBoost:
    """A simplified gradient-boosted, one-vs-rest classifier (XGBoost-style)."""

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.classes = None

    def _negative_gradient(self, y_true, y_pred):
        """Negative gradient of the logistic loss: y_true - sigmoid(y_pred)."""
        return y_true - self._sigmoid(y_pred)

    def _sigmoid(self, x):
        """Apply the sigmoid function."""
        return 1 / (1 + np.exp(-x))

    def _update_predictions(self, predictions, residuals):
        """Update the predictions using the residuals and learning rate."""
        return predictions + self.learning_rate * residuals

    def fit(self, x, y):
        """Fit the model using gradient boosting for multi-class classification."""
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # One-vs-all approach: one boosted ensemble per class
        self.trees = [[] for _ in range(n_classes)]

        # Convert y to one-hot encoding
        y_one_hot = np.eye(n_classes)[y]

        for class_idx in range(n_classes):
            predictions = np.zeros(x.shape[0])

            for _ in range(self.n_estimators):
                # Compute residuals (negative gradient)
                residuals = self._negative_gradient(
                    y_one_hot[:, class_idx], predictions
                )

                # Fit a weak learner to the residuals; the residuals are
                # continuous, so a regression tree is required here
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(x, residuals)

                # Update the predictions
                predictions = self._update_predictions(predictions, tree.predict(x))

                # Store the tree
                self.trees[class_idx].append(tree)

    def predict(self, x):
        """Make predictions for multi-class classification."""
        n_classes = len(self.classes)
        class_scores = np.zeros((x.shape[0], n_classes))

        for class_idx in range(n_classes):
            predictions = np.zeros(x.shape[0])
            for tree in self.trees[class_idx]:
                predictions += self.learning_rate * tree.predict(x)
            class_scores[:, class_idx] = predictions

        # Return the class with the highest score
        return self.classes[np.argmax(class_scores, axis=1)]
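

# A minimal usage sketch (added for illustration, not part of the original module):
# it fits SimpleXGBoost on a tiny hypothetical 1-D dataset and prints the predicted
# labels, which are drawn from the classes seen during fit.
def _demo_simple_xgboost() -> None:
    """Toy fit/predict round trip for SimpleXGBoost on made-up data."""
    toy_x = np.array([[0.0], [1.0], [2.0], [3.0]])
    toy_y = np.array([0, 0, 1, 1])
    model = SimpleXGBoost(n_estimators=5, learning_rate=0.5, max_depth=1)
    model.fit(toy_x, toy_y)
    print(model.predict(toy_x))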


def data_handling(data: dict) -> tuple:
    """
    Split dataset into features and target.

    >>> data_handling(
    ...     {'data': np.array([[5.1, 3.5, 1.4, 0.2]]), 'target': np.array([0])}
    ... )
    (array([[5.1, 3.5, 1.4, 0.2]]), array([0]))
    """
    return (data["data"], data["target"])


def main() -> None:
    """
    XGBoost Classifier Example using the Iris dataset.

    Url for the algorithm: https://xgboost.readthedocs.io/en/stable/
    """
    # Load Iris dataset
    iris = load_iris()
    data, target = data_handling({"data": iris.data, "target": iris.target})
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.25, random_state=1
    )

    xgboost_classifier = SimpleXGBoost(n_estimators=50, learning_rate=0.1, max_depth=3)
    xgboost_classifier.fit(x_train, y_train)
    predictions = xgboost_classifier.predict(x_test)

    # Accuracy printing
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")


if __name__ == "__main__":
101 changes: 62 additions & 39 deletions machine_learning/xgboost_regressor.py
@@ -1,62 +1,85 @@
# XGBoost Regressor Example
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


class SimpleXGBoostRegressor:
    """A simplified gradient-boosted regressor in the spirit of XGBoost."""

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.base_prediction = 0.0

    def _negative_gradient(self, y_true, y_pred):
        """Compute the negative gradient (residuals) for regression."""
        return y_true - y_pred

    def _update_predictions(self, predictions, residuals):
        """Update the predictions using the residuals and learning rate."""
        return predictions + self.learning_rate * residuals

    def fit(self, x, y):
        """Fit the model using gradient boosting."""
        # Initialize predictions as the average of the target
        self.base_prediction = np.mean(y)
        predictions = np.full(y.shape, self.base_prediction)

        for _ in range(self.n_estimators):
            # Compute residuals (negative gradient)
            residuals = self._negative_gradient(y, predictions)

            # Fit a weak learner (decision tree) to the residuals
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(x, residuals)

            # Update the predictions
            predictions = self._update_predictions(predictions, tree.predict(x))

            # Store the tree
            self.trees.append(tree)

    def predict(self, x):
        """Make predictions by summing the weak learners' outputs."""
        # Start from the same base value used during fitting
        predictions = np.full(x.shape[0], self.base_prediction)

        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(x)

        return predictions
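

# A minimal usage sketch (added for illustration, not part of the original module):
# it fits SimpleXGBoostRegressor on a tiny hypothetical linear target; with enough
# boosting rounds the training-set predictions approach the true targets.
def _demo_simple_xgboost_regressor() -> None:
    """Toy fit/predict round trip for SimpleXGBoostRegressor on made-up data."""
    toy_x = np.arange(10, dtype=float).reshape(-1, 1)
    toy_y = 2.0 * toy_x.ravel() + 1.0
    model = SimpleXGBoostRegressor(n_estimators=20, learning_rate=0.3, max_depth=2)
    model.fit(toy_x, toy_y)
    print(model.predict(toy_x))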


def data_handling(data: dict) -> tuple:
    """
    Split dataset into features and target.

    >>> data_handling(
    ...     {'data': np.array([[5.1, 3.5, 1.4, 0.2]]), 'target': np.array([0])}
    ... )
    (array([[5.1, 3.5, 1.4, 0.2]]), array([0]))
    """
    return (data["data"], data["target"])


def main() -> None:
    """
    XGBoost Regressor Example using the California Housing dataset.

    Url for the algorithm: https://xgboost.readthedocs.io/en/stable/
    The exact MAE and MSE depend on the train/test split and the boosting
    parameters.
    """
    # Load California Housing dataset
    housing = fetch_california_housing()
    data, target = data_handling({"data": housing.data, "target": housing.target})
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.25, random_state=1
    )

    xgboost_regressor = SimpleXGBoostRegressor(
        n_estimators=50, learning_rate=0.1, max_depth=3
    )
    xgboost_regressor.fit(x_train, y_train)
    predictions = xgboost_regressor.predict(x_test)

    # MAE and MSE printing
    print(f"Mean Absolute Error: {mean_absolute_error(y_test, predictions)}")
    print(f"Mean Squared Error: {mean_squared_error(y_test, predictions)}")


if __name__ == "__main__":