114 changes: 69 additions & 45 deletions machine_learning/xgboost_classifier.py
@@ -1,81 +1,105 @@
# XGBoost Classifier Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier


class SimpleXGBoost:
def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.max_depth = max_depth
        self.trees = []
        self.initial_prediction = 0.0

def _negative_gradient(self, y_true, y_pred):
"""Compute the negative gradient (residuals) for classification (log-odds)."""
return y_true - y_pred

def _update_predictions(self, predictions, residuals):
"""Update the predictions using the residuals and learning rate."""
return predictions + self.learning_rate * residuals
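
    # Boosting update rule implemented above: with learning rate eta,
    #     F_m(x) = F_{m-1}(x) + eta * h_m(x),
    # where each weak learner h_m is fit to the residuals y - F_{m-1}(x).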

    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
"""Fit the model using gradient boosting."""
        # Initialize predictions with the target mean and store it for predict()
        self.initial_prediction = np.mean(y)
        predictions = np.full(y.shape, self.initial_prediction)

for _ in range(self.n_estimators):
# Compute residuals (negative gradient)
residuals = self._negative_gradient(y, predictions)

# Fit a weak learner (decision tree) to the residuals
tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(x, residuals)

            # Update the predictions
            predictions = self._update_predictions(predictions, tree.predict(x))

# Store the tree
self.trees.append(tree)

    def predict(self, x: np.ndarray) -> np.ndarray:
"""Make predictions by summing the weak learners' outputs."""
        predictions = np.full(x.shape[0], self.initial_prediction)

for tree in self.trees:
            predictions += self.learning_rate * tree.predict(x)

# Convert the predictions to binary (0 or 1) for classification
return np.round(predictions).astype(int)
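

# A minimal usage sketch for SimpleXGBoost on hypothetical synthetic data;
# it assumes nothing beyond numpy and the class defined above.
def _demo_simple_xgboost() -> None:
    rng = np.random.default_rng(0)
    x_demo = rng.normal(size=(200, 4))
    y_demo = (x_demo[:, 0] > 0).astype(int)  # separable binary target
    model = SimpleXGBoost(n_estimators=25, learning_rate=0.1, max_depth=2)
    model.fit(x_demo, y_demo)
    train_accuracy = (model.predict(x_demo) == y_demo).mean()
    print(f"Demo training accuracy: {train_accuracy:.2f}")  # expected near 1.0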


def data_handling(data: dict) -> tuple:
    """
    Split the dataset dict into features and target ("data" holds the features).

>>> data_handling(({'data':'[5.1, 3.5, 1.4, 0.2]','target':([0])}))
('[5.1, 3.5, 1.4, 0.2]', [0])
>>> data_handling(
... {'data': '[4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2]', 'target': ([0, 0])}
... )
('[4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2]', [0, 0])
"""
return (data["data"], data["target"])


def xgboost(features: np.ndarray, target: np.ndarray) -> XGBClassifier:
"""
# THIS TEST IS BROKEN!! >>> xgboost(np.array([[5.1, 3.6, 1.4, 0.2]]), np.array([0]))
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)
"""
classifier = XGBClassifier()
classifier.fit(features, target)
return classifier


def main() -> None:
"""
    >>> main()  # doctest: +SKIP

    URL for the algorithm:
    https://xgboost.readthedocs.io/en/stable/
    A simplified gradient-boosting classifier, implemented without the xgboost
    library, is demonstrated on the Iris dataset.
"""

    # Load the Iris dataset
    iris = load_iris()
    features, targets = data_handling(iris)

    # For simplicity, binary classification (setosa vs. non-setosa)
    targets = (targets == 0).astype(int)

    # Split data into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.25
    )

    # Create a custom XGBoost-style classifier and fit it
    xgboost_classifier = SimpleXGBoost(n_estimators=50, learning_rate=0.1, max_depth=3)
    xgboost_classifier.fit(x_train, y_train)

    # Make predictions
    y_pred = xgboost_classifier.predict(x_test)

    # Print accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Display the normalized confusion matrix
    ConfusionMatrixDisplay.from_predictions(
        y_test,
        y_pred,
        display_labels=["Non-Setosa", "Setosa"],
        cmap="Blues",
        normalize="true",
    )
    plt.title("Normalized Confusion Matrix - IRIS Dataset (Manual XGBoost)")
    plt.show()


if __name__ == "__main__":
import doctest

doctest.testmod(verbose=True)
main()
71 changes: 49 additions & 22 deletions machine_learning/xgboost_regressor.py
@@ -3,7 +3,50 @@
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


class SimpleXGBoostRegressor:
def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
self.n_estimators = n_estimators
self.learning_rate = learning_rate
self.max_depth = max_depth
        self.trees = []
        self.initial_prediction = 0.0

def _negative_gradient(self, y_true, y_pred):
"""Compute the negative gradient (residuals) for regression."""
return y_true - y_pred

def _update_predictions(self, predictions, residuals):
"""Update the predictions using the residuals and learning rate."""
return predictions + self.learning_rate * residuals

    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
"""Fit the model using gradient boosting."""
        # Initialize predictions with the target mean and store it for predict()
        self.initial_prediction = np.mean(y)
        predictions = np.full(y.shape, self.initial_prediction)

for _ in range(self.n_estimators):
# Compute residuals (negative gradient)
residuals = self._negative_gradient(y, predictions)

# Fit a weak learner (decision tree) to the residuals
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(x, residuals)

            # Update the predictions
            predictions = self._update_predictions(predictions, tree.predict(x))

# Store the tree
self.trees.append(tree)

    def predict(self, x: np.ndarray) -> np.ndarray:
"""Make predictions by summing the weak learners' outputs."""
        predictions = np.full(x.shape[0], self.initial_prediction)

for tree in self.trees:
            predictions += self.learning_rate * tree.predict(x)

return predictions
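

# A minimal usage sketch for SimpleXGBoostRegressor on hypothetical synthetic
# data; the target is a simple, noise-free linear function of the inputs.
def _demo_simple_xgboost_regressor() -> None:
    rng = np.random.default_rng(0)
    x_demo = rng.uniform(-1.0, 1.0, size=(200, 3))
    y_demo = 2.0 * x_demo[:, 0] + x_demo[:, 1]
    model = SimpleXGBoostRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
    model.fit(x_demo, y_demo)
    train_mse = float(np.mean((model.predict(x_demo) - y_demo) ** 2))
    print(f"Demo training MSE: {train_mse:.4f}")  # expected to be small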


def data_handling(data: dict) -> tuple:
@@ -17,26 +60,6 @@
return (data["data"], data["target"])


def main() -> None:
"""
The URL for this algorithm
@@ -53,7 +76,11 @@
x_train, x_test, y_train, y_test = train_test_split(
data, target, test_size=0.25, random_state=1
)
xgboost_regressor = SimpleXGBoostRegressor(
n_estimators=50, learning_rate=0.1, max_depth=3
)
xgboost_regressor.fit(x_train, y_train)
predictions = xgboost_regressor.predict(x_test)
# Error printing
print(f"Mean Absolute Error: {mean_absolute_error(y_test, predictions)}")
print(f"Mean Square Error: {mean_squared_error(y_test, predictions)}")