diff --git a/machine_learning/xgboost_classifier.py b/machine_learning/xgboost_classifier.py
index 1da933cf690f..a038da805cef 100644
--- a/machine_learning/xgboost_classifier.py
+++ b/machine_learning/xgboost_classifier.py
@@ -1,77 +1,100 @@
-# XGBoost Classifier Example
 import numpy as np
-from matplotlib import pyplot as plt
 from sklearn.datasets import load_iris
-from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.metrics import accuracy_score
 from sklearn.model_selection import train_test_split
-from xgboost import XGBClassifier
+from sklearn.tree import DecisionTreeRegressor
 
 
-def data_handling(data: dict) -> tuple:
-    # Split dataset into features and target
-    # data is features
-    """
-    >>> data_handling(({'data':'[5.1, 3.5, 1.4, 0.2]','target':([0])}))
-    ('[5.1, 3.5, 1.4, 0.2]', [0])
-    >>> data_handling(
-    ...     {'data': '[4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2]', 'target': ([0, 0])}
-    ... )
-    ('[4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2]', [0, 0])
-    """
-    return (data["data"], data["target"])
+class SimpleXGBoost:
+    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.trees = []
+        self.classes = None
+
+    def _negative_gradient(self, y_true, y_pred):
+        """Compute the negative gradient for multi-class classification."""
+        return y_true - self._sigmoid(y_pred)
+
+    def _sigmoid(self, x):
+        """Apply the sigmoid function."""
+        return 1 / (1 + np.exp(-x))
+
+    def _update_predictions(self, predictions, residuals):
+        """Update the predictions using the residuals and learning rate."""
+        return predictions + self.learning_rate * residuals
+
+    def fit(self, x, y):
+        """Fit the model using gradient boosting for multi-class classification."""
+        self.classes = np.unique(y)
+        n_classes = len(self.classes)
+
+        # One-vs-all approach: one boosted ensemble per class
+        self.trees = [[] for _ in range(n_classes)]
+
+        # Convert y to one-hot encoding
+        y_one_hot = np.eye(n_classes)[y]
+
+        for class_idx in range(n_classes):
+            predictions = np.zeros(x.shape[0])
+
+            for _ in range(self.n_estimators):
+                # Compute residuals (negative gradient)
+                residuals = self._negative_gradient(
+                    y_one_hot[:, class_idx], predictions
+                )
+
+                # Fit a weak learner (regression tree) to the continuous residuals
+                tree = DecisionTreeRegressor(max_depth=self.max_depth)
+                tree.fit(x, residuals)
+
+                # Update the predictions
+                predictions = self._update_predictions(predictions, tree.predict(x))
+                # Store the tree
+                self.trees[class_idx].append(tree)
-def xgboost(features: np.ndarray, target: np.ndarray) -> XGBClassifier:
+    def predict(self, x):
+        """Make predictions for multi-class classification."""
+        n_classes = len(self.classes)
+        class_scores = np.zeros((x.shape[0], n_classes))
+
+        for class_idx in range(n_classes):
+            predictions = np.zeros(x.shape[0])
+            for tree in self.trees[class_idx]:
+                predictions += self.learning_rate * tree.predict(x)
+            class_scores[:, class_idx] = predictions
+
+        # Return the class with the highest score
+        return self.classes[np.argmax(class_scores, axis=1)]
+
+
+def data_handling(data: dict) -> tuple:
     """
-    # THIS TEST IS BROKEN!! >>> xgboost(np.array([[5.1, 3.6, 1.4, 0.2]]), np.array([0]))
-    XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
-                  colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
-                  early_stopping_rounds=None, enable_categorical=False,
-                  eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
-                  importance_type=None, interaction_constraints='',
-                  learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
-                  max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
-                  missing=nan, monotone_constraints='()', n_estimators=100,
-                  n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
-                  reg_alpha=0, reg_lambda=1, ...)
+    Split dataset into features and target.
+
+    >>> data_handling({'data': np.array([[5.1, 3.5, 1.4, 0.2]]), 'target': np.array([0])})
+    (array([[5.1, 3.5, 1.4, 0.2]]), array([0]))
     """
-    classifier = XGBClassifier()
-    classifier.fit(features, target)
-    return classifier
+    return (data["data"], data["target"])
 
 
 def main() -> None:
     """
-    >>> main()
-
-    Url for the algorithm:
-    https://xgboost.readthedocs.io/en/stable/
-    Iris type dataset is used to demonstrate algorithm.
+    XGBoost Classifier Example using the Iris dataset.
     """
-    # Load Iris dataset
     iris = load_iris()
-    features, targets = data_handling(iris)
+    data, target = data_handling({"data": iris.data, "target": iris.target})
     x_train, x_test, y_train, y_test = train_test_split(
-        features, targets, test_size=0.25
-    )
-
-    names = iris["target_names"]
-
-    # Create an XGBoost Classifier from the training data
-    xgboost_classifier = xgboost(x_train, y_train)
-
-    # Display the confusion matrix of the classifier with both training and test sets
-    ConfusionMatrixDisplay.from_estimator(
-        xgboost_classifier,
-        x_test,
-        y_test,
-        display_labels=names,
-        cmap="Blues",
-        normalize="true",
+        data, target, test_size=0.25, random_state=1
     )
-    plt.title("Normalized Confusion Matrix - IRIS Dataset")
-    plt.show()
+    xgboost_classifier = SimpleXGBoost(n_estimators=50, learning_rate=0.1, max_depth=3)
+    xgboost_classifier.fit(x_train, y_train)
+    predictions = xgboost_classifier.predict(x_test)
+    # Accuracy printing
+    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
 
 
 if __name__ == "__main__":
diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py
index 52e041c55ea2..6cf87922057c 100644
--- a/machine_learning/xgboost_regressor.py
+++ b/machine_learning/xgboost_regressor.py
@@ -1,62 +1,85 @@
-# XGBoost Regressor Example
 import numpy as np
 from sklearn.datasets import fetch_california_housing
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 from sklearn.model_selection import train_test_split
-from xgboost import XGBRegressor
+from sklearn.tree import DecisionTreeRegressor
 
 
-def data_handling(data: dict) -> tuple:
-    # Split dataset into features and target. Data is features.
-    """
-    >>> data_handling((
-    ...  {'data':'[ 8.3252 41. 6.9841269 1.02380952 322. 2.55555556 37.88 -122.23 ]'
-    ...  ,'target':([4.526])}))
-    ('[ 8.3252 41. 6.9841269 1.02380952 322. 2.55555556 37.88 -122.23 ]', [4.526])
-    """
-    return (data["data"], data["target"])
+class SimpleXGBoostRegressor:
+    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.trees = []
+
+    def _negative_gradient(self, y_true, y_pred):
+        """Compute the negative gradient (residuals) for regression."""
+        return y_true - y_pred
+
+    def _update_predictions(self, predictions, residuals):
+        """Update the predictions using the residuals and learning rate."""
+        return predictions + self.learning_rate * residuals
+
+    def fit(self, x, y):
+        """Fit the model using gradient boosting."""
+        # Initialize predictions as the average of the target
+        self.initial_prediction = np.mean(y)  # baseline, reused in predict
+        predictions = np.full(y.shape, self.initial_prediction)
+
+        for _ in range(self.n_estimators):
+            # Compute residuals (negative gradient)
+            residuals = self._negative_gradient(y, predictions)
+
+            # Fit a weak learner (decision tree) to the residuals
+            tree = DecisionTreeRegressor(max_depth=self.max_depth)
+            tree.fit(x, residuals)
+
+            # Update the predictions
+            predictions = self._update_predictions(predictions, tree.predict(x))
+            # Store the tree
+            self.trees.append(tree)
-def xgboost(
-    features: np.ndarray, target: np.ndarray, test_features: np.ndarray
-) -> np.ndarray:
+    def predict(self, x):
+        """Make predictions by summing the weak learners' outputs."""
+        predictions = np.full(x.shape[0], self.initial_prediction)  # baseline from fit
+
+        for tree in self.trees:
+            predictions += self.learning_rate * tree.predict(x)
+
+        return predictions
+
+
+def data_handling(data: dict) -> tuple:
     """
-    >>> xgboost(np.array([[ 2.3571 , 52. , 6.00813008, 1.06775068,
-    ... 907. , 2.45799458, 40.58 , -124.26]]),np.array([1.114]),
-    ... np.array([[1.97840000e+00, 3.70000000e+01, 4.98858447e+00, 1.03881279e+00,
-    ... 1.14300000e+03, 2.60958904e+00, 3.67800000e+01, -1.19780000e+02]]))
-    array([[1.1139996]], dtype=float32)
+    Split dataset into features and target.
+
+    >>> data_handling({'data': np.array([[5.1, 3.5, 1.4, 0.2]]), 'target': np.array([0])})
+    (array([[5.1, 3.5, 1.4, 0.2]]), array([0]))
     """
-    xgb = XGBRegressor(
-        verbosity=0, random_state=42, tree_method="exact", base_score=0.5
-    )
-    xgb.fit(features, target)
-    # Predict target for test data
-    predictions = xgb.predict(test_features)
-    predictions = predictions.reshape(len(predictions), 1)
-    return predictions
+    return (data["data"], data["target"])
 
 
 def main() -> None:
     """
-    The URL for this algorithm
-    https://xgboost.readthedocs.io/en/stable/
-    California house price dataset is used to demonstrate the algorithm.
+    XGBoost Regressor Example using the California Housing dataset.
 
-    Expected error values:
-    Mean Absolute Error: 0.30957163379906033
-    Mean Square Error: 0.22611560196662744
+    Prints the mean absolute error and the mean squared error
+    measured on a 25% held-out test split.
     """
-    # Load California house price dataset
-    california = fetch_california_housing()
-    data, target = data_handling(california)
+    # Load California Housing dataset
+    housing = fetch_california_housing()
+    data, target = data_handling({"data": housing.data, "target": housing.target})
     x_train, x_test, y_train, y_test = train_test_split(
         data, target, test_size=0.25, random_state=1
     )
-    predictions = xgboost(x_train, y_train, x_test)
-    # Error printing
+    xgboost_regressor = SimpleXGBoostRegressor(
+        n_estimators=50, learning_rate=0.1, max_depth=3
+    )
+    xgboost_regressor.fit(x_train, y_train)
+    predictions = xgboost_regressor.predict(x_test)
+    # MAE and MSE printing
     print(f"Mean Absolute Error: {mean_absolute_error(y_test, predictions)}")
-    print(f"Mean Square Error: {mean_squared_error(y_test, predictions)}")
+    print(f"Mean Squared Error: {mean_squared_error(y_test, predictions)}")
 
 
 if __name__ == "__main__":