135 changes: 79 additions & 56 deletions machine_learning/xgboost_classifier.py
@@ -1,77 +1,100 @@
# XGBoost Classifier Example
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


class SimpleXGBoost:
    """A simplified gradient-boosted, one-vs-rest classifier (XGBoost-style)."""

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.classes = None

    def _negative_gradient(self, y_true, y_pred):
        """Negative gradient of the logistic loss: y_true - sigmoid(y_pred)."""
        return y_true - self._sigmoid(y_pred)

    def _sigmoid(self, x):
        """Apply the sigmoid function."""
        return 1 / (1 + np.exp(-x))

    def _update_predictions(self, predictions, residuals):
        """Update the predictions using the residuals and learning rate."""
        return predictions + self.learning_rate * residuals

    def fit(self, x, y):
        """Fit the model using gradient boosting for multi-class classification."""
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # One-vs-all approach: one boosted ensemble per class
        self.trees = [[] for _ in range(n_classes)]

        # Convert y to one-hot encoding
        y_one_hot = np.eye(n_classes)[y]

        for class_idx in range(n_classes):
            predictions = np.zeros(x.shape[0])

            for _ in range(self.n_estimators):
                # Compute residuals (negative gradient)
                residuals = self._negative_gradient(
                    y_one_hot[:, class_idx], predictions
                )

                # Fit a weak learner to the residuals; the residuals are
                # continuous, so a regression tree is required here
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(x, residuals)

                # Update the predictions
                predictions = self._update_predictions(predictions, tree.predict(x))

                # Store the tree
                self.trees[class_idx].append(tree)

    def predict(self, x):
        """Make predictions for multi-class classification."""
        n_classes = len(self.classes)
        class_scores = np.zeros((x.shape[0], n_classes))

        for class_idx in range(n_classes):
            predictions = np.zeros(x.shape[0])
            for tree in self.trees[class_idx]:
                predictions += self.learning_rate * tree.predict(x)
            class_scores[:, class_idx] = predictions

        # Return the class with the highest score
        return self.classes[np.argmax(class_scores, axis=1)]
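

# A minimal usage sketch (added for illustration, not part of the original module):
# it fits SimpleXGBoost on a tiny hypothetical 1-D dataset and prints the predicted
# labels, which are drawn from the classes seen during fit.
def _demo_simple_xgboost() -> None:
    """Toy fit/predict round trip for SimpleXGBoost on made-up data."""
    toy_x = np.array([[0.0], [1.0], [2.0], [3.0]])
    toy_y = np.array([0, 0, 1, 1])
    model = SimpleXGBoost(n_estimators=5, learning_rate=0.5, max_depth=1)
    model.fit(toy_x, toy_y)
    print(model.predict(toy_x))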


def data_handling(data: dict) -> tuple:
    """
    Split dataset into features and target.

    >>> data_handling(
    ...     {'data': np.array([[5.1, 3.5, 1.4, 0.2]]), 'target': np.array([0])}
    ... )
    (array([[5.1, 3.5, 1.4, 0.2]]), array([0]))
    """
    return (data["data"], data["target"])


def main() -> None:
    """
    XGBoost Classifier Example using the Iris dataset.

    Url for the algorithm: https://xgboost.readthedocs.io/en/stable/
    """
    # Load Iris dataset
    iris = load_iris()
    data, target = data_handling({"data": iris.data, "target": iris.target})
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.25, random_state=1
    )

    xgboost_classifier = SimpleXGBoost(n_estimators=50, learning_rate=0.1, max_depth=3)
    xgboost_classifier.fit(x_train, y_train)
    predictions = xgboost_classifier.predict(x_test)

    # Accuracy printing
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")


if __name__ == "__main__":
101 changes: 62 additions & 39 deletions machine_learning/xgboost_regressor.py
@@ -1,62 +1,85 @@
# XGBoost Regressor Example
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


class SimpleXGBoostRegressor:
    """A simplified gradient-boosted regressor in the spirit of XGBoost."""

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.base_prediction = 0.0

    def _negative_gradient(self, y_true, y_pred):
        """Compute the negative gradient (residuals) for regression."""
        return y_true - y_pred

    def _update_predictions(self, predictions, residuals):
        """Update the predictions using the residuals and learning rate."""
        return predictions + self.learning_rate * residuals

    def fit(self, x, y):
        """Fit the model using gradient boosting."""
        # Initialize predictions as the average of the target
        self.base_prediction = np.mean(y)
        predictions = np.full(y.shape, self.base_prediction)

        for _ in range(self.n_estimators):
            # Compute residuals (negative gradient)
            residuals = self._negative_gradient(y, predictions)

            # Fit a weak learner (decision tree) to the residuals
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(x, residuals)

            # Update the predictions
            predictions = self._update_predictions(predictions, tree.predict(x))

            # Store the tree
            self.trees.append(tree)

    def predict(self, x):
        """Make predictions by summing the weak learners' outputs."""
        # Start from the same base value used during fitting
        predictions = np.full(x.shape[0], self.base_prediction)

        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(x)

        return predictions
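

# A minimal usage sketch (added for illustration, not part of the original module):
# it fits SimpleXGBoostRegressor on a tiny hypothetical linear target; with enough
# boosting rounds the training-set predictions approach the true targets.
def _demo_simple_xgboost_regressor() -> None:
    """Toy fit/predict round trip for SimpleXGBoostRegressor on made-up data."""
    toy_x = np.arange(10, dtype=float).reshape(-1, 1)
    toy_y = 2.0 * toy_x.ravel() + 1.0
    model = SimpleXGBoostRegressor(n_estimators=20, learning_rate=0.3, max_depth=2)
    model.fit(toy_x, toy_y)
    print(model.predict(toy_x))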


def data_handling(data: dict) -> tuple:
    """
    Split dataset into features and target.

    >>> data_handling(
    ...     {'data': np.array([[5.1, 3.5, 1.4, 0.2]]), 'target': np.array([0])}
    ... )
    (array([[5.1, 3.5, 1.4, 0.2]]), array([0]))
    """
    return (data["data"], data["target"])


def main() -> None:
    """
    XGBoost Regressor Example using the California Housing dataset.

    Url for the algorithm: https://xgboost.readthedocs.io/en/stable/
    The exact MAE and MSE depend on the train/test split and the boosting
    parameters.
    """
    # Load California Housing dataset
    housing = fetch_california_housing()
    data, target = data_handling({"data": housing.data, "target": housing.target})
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.25, random_state=1
    )

    xgboost_regressor = SimpleXGBoostRegressor(
        n_estimators=50, learning_rate=0.1, max_depth=3
    )
    xgboost_regressor.fit(x_train, y_train)
    predictions = xgboost_regressor.predict(x_test)

    # MAE and MSE printing
    print(f"Mean Absolute Error: {mean_absolute_error(y_test, predictions)}")
    print(f"Mean Squared Error: {mean_squared_error(y_test, predictions)}")


if __name__ == "__main__":