diff --git a/machine_learning/xgboost_classifier.py b/machine_learning/xgboost_classifier.py
index 1da933cf690f..a6c303e0a2df 100644
--- a/machine_learning/xgboost_classifier.py
+++ b/machine_learning/xgboost_classifier.py
@@ -1,81 +1,106 @@
-# XGBoost Classifier Example
 import numpy as np
 from matplotlib import pyplot as plt
 from sklearn.datasets import load_iris
-from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score
 from sklearn.model_selection import train_test_split
-from xgboost import XGBClassifier
+from sklearn.tree import DecisionTreeRegressor
+
+
+class SimpleXGBoost:
+    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.trees = []
+
+    def _negative_gradient(self, y_true, y_pred):
+        """Compute the residuals (y_true - y_pred), i.e. the negative gradient of the loss."""
+        return y_true - y_pred
+
+    def _update_predictions(self, predictions, residuals):
+        """Update the predictions using the residuals and learning rate."""
+        return predictions + self.learning_rate * residuals
+
+    def fit(self, X, y):
+        """Fit the model using gradient boosting."""
+        # Initialize predictions with the mean of the target (the base score)
+        self.base_prediction = np.mean(y)
+        predictions = np.full(y.shape, self.base_prediction)
+
+        for _ in range(self.n_estimators):
+            # Compute residuals (negative gradient)
+            residuals = self._negative_gradient(y, predictions)
+
+            # Fit a weak learner (decision tree) to the residuals
+            tree = DecisionTreeRegressor(max_depth=self.max_depth)
+            tree.fit(X, residuals)
+
+            # Update the predictions
+            predictions = self._update_predictions(predictions, tree.predict(X))
+
+            # Store the tree
+            self.trees.append(tree)
+
+    def predict(self, X):
+        """Predict by adding the scaled tree outputs to the stored base prediction."""
+        predictions = np.full(X.shape[0], self.base_prediction)
+
+        for tree in self.trees:
+            predictions += self.learning_rate * tree.predict(X)
+
+        # Threshold at 0.5 to obtain binary class labels (0 or 1)
+        return (predictions >= 0.5).astype(int)
 
 
 def data_handling(data: dict) -> tuple:
     # Split dataset into features and target
-    # data is features
-    """
-    >>> data_handling(({'data':'[5.1, 3.5, 1.4, 0.2]','target':([0])}))
-    ('[5.1, 3.5, 1.4, 0.2]', [0])
-    >>> data_handling(
-    ...     {'data': '[4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2]', 'target': ([0, 0])}
-    ... )
-    ('[4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2]', [0, 0])
-    """
     return (data["data"], data["target"])
 
 
-def xgboost(features: np.ndarray, target: np.ndarray) -> XGBClassifier:
-    """
-    # THIS TEST IS BROKEN!! >>> xgboost(np.array([[5.1, 3.6, 1.4, 0.2]]), np.array([0]))
-    XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
-                  colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
-                  early_stopping_rounds=None, enable_categorical=False,
-                  eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
-                  importance_type=None, interaction_constraints='',
-                  learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
-                  max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
-                  missing=nan, monotone_constraints='()', n_estimators=100,
-                  n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
-                  reg_alpha=0, reg_lambda=1, ...)
- """ - classifier = XGBClassifier() - classifier.fit(features, target) - return classifier - - def main() -> None: """ >>> main() - Url for the algorithm: - https://xgboost.readthedocs.io/en/stable/ + Implemented XGBoost Classifier without external libraries. Iris type dataset is used to demonstrate algorithm. """ # Load Iris dataset iris = load_iris() features, targets = data_handling(iris) + + # For simplicity, binary classification (setosa vs non-setosa) + targets = (targets == 0).astype(int) + + # Split data into train and test sets x_train, x_test, y_train, y_test = train_test_split( features, targets, test_size=0.25 ) - names = iris["target_names"] + # Create a custom XGBoost classifier + xgboost_classifier = SimpleXGBoost(n_estimators=50, learning_rate=0.1, max_depth=3) + + # Fit the model + xgboost_classifier.fit(x_train, y_train) - # Create an XGBoost Classifier from the training data - xgboost_classifier = xgboost(x_train, y_train) + # Make predictions + y_pred = xgboost_classifier.predict(x_test) - # Display the confusion matrix of the classifier with both training and test sets - ConfusionMatrixDisplay.from_estimator( - xgboost_classifier, - x_test, + # Print accuracy + accuracy = accuracy_score(y_test, y_pred) + print(f"Accuracy: {accuracy * 100:.2f}%") + + # Display confusion matrix + ConfusionMatrixDisplay.from_predictions( y_test, - display_labels=names, + y_pred, + display_labels=["Non-Setosa", "Setosa"], cmap="Blues", normalize="true", ) - plt.title("Normalized Confusion Matrix - IRIS Dataset") + plt.title("Normalized Confusion Matrix - IRIS Dataset (Manual XGBoost)") plt.show() if __name__ == "__main__": - import doctest - - doctest.testmod(verbose=True) main() diff --git a/machine_learning/xgboost_regressor.py b/machine_learning/xgboost_regressor.py index 52e041c55ea2..c9cc940749a5 100644 --- a/machine_learning/xgboost_regressor.py +++ b/machine_learning/xgboost_regressor.py @@ -3,7 +3,50 @@ from sklearn.datasets import fetch_california_housing from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split -from xgboost import XGBRegressor + + +class SimpleXGBoostRegressor: + def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3): + self.n_estimators = n_estimators + self.learning_rate = learning_rate + self.max_depth = max_depth + self.trees = [] + + def _negative_gradient(self, y_true, y_pred): + """Compute the negative gradient (residuals) for regression.""" + return y_true - y_pred + + def _update_predictions(self, predictions, residuals): + """Update the predictions using the residuals and learning rate.""" + return predictions + self.learning_rate * residuals + + def fit(self, X, y): + """Fit the model using gradient boosting.""" + # Initialize predictions as the average of the target + predictions = np.full(y.shape, np.mean(y)) + + for _ in range(self.n_estimators): + # Compute residuals (negative gradient) + residuals = self._negative_gradient(y, predictions) + + # Fit a weak learner (decision tree) to the residuals + tree = DecisionTreeRegressor(max_depth=self.max_depth) + tree.fit(X, residuals) + + # Update the predictions + predictions = self._update_predictions(predictions, tree.predict(X)) + + # Store the tree + self.trees.append(tree) + + def predict(self, X): + """Make predictions by summing the weak learners' outputs.""" + predictions = np.zeros(X.shape[0]) + + for tree in self.trees: + predictions += self.learning_rate * tree.predict(X) + + return predictions def 
 def data_handling(data: dict) -> tuple:
@@ -17,26 +62,6 @@ def data_handling(data: dict) -> tuple:
     return (data["data"], data["target"])
 
 
-def xgboost(
-    features: np.ndarray, target: np.ndarray, test_features: np.ndarray
-) -> np.ndarray:
-    """
-    >>> xgboost(np.array([[ 2.3571 , 52. , 6.00813008, 1.06775068,
-    ...    907. , 2.45799458, 40.58 , -124.26]]),np.array([1.114]),
-    ...    np.array([[1.97840000e+00, 3.70000000e+01, 4.98858447e+00, 1.03881279e+00,
-    ...    1.14300000e+03, 2.60958904e+00, 3.67800000e+01, -1.19780000e+02]]))
-    array([[1.1139996]], dtype=float32)
-    """
-    xgb = XGBRegressor(
-        verbosity=0, random_state=42, tree_method="exact", base_score=0.5
-    )
-    xgb.fit(features, target)
-    # Predict target for test data
-    predictions = xgb.predict(test_features)
-    predictions = predictions.reshape(len(predictions), 1)
-    return predictions
-
-
 def main() -> None:
     """
     The URL for this algorithm
@@ -53,7 +78,11 @@ def main() -> None:
     x_train, x_test, y_train, y_test = train_test_split(
         data, target, test_size=0.25, random_state=1
     )
-    predictions = xgboost(x_train, y_train, x_test)
+    xgboost_regressor = SimpleXGBoostRegressor(
+        n_estimators=50, learning_rate=0.1, max_depth=3
+    )
+    xgboost_regressor.fit(x_train, y_train)
+    predictions = xgboost_regressor.predict(x_test)
     # Error printing
     print(f"Mean Absolute Error: {mean_absolute_error(y_test, predictions)}")
     print(f"Mean Square Error: {mean_squared_error(y_test, predictions)}")