diff --git a/machine_learning/gradient_boosting.r b/machine_learning/gradient_boosting.r
new file mode 100644
index 00000000..a07d68bb
--- /dev/null
+++ b/machine_learning/gradient_boosting.r
@@ -0,0 +1,435 @@
+# gradient_boosting.r
+# Gradient Boosting Algorithm Implementation in R
+# A sequential ensemble learning method that builds models iteratively:
+# each new model corrects the errors made by the previous models.
+#
+# Algorithm details:
+# - Uses decision trees as weak learners
+# - Fits each tree to the residuals (errors) of the previous predictions
+# - Combines predictions using a weighted sum
+# - Learning rate controls the contribution of each tree
+# - Time complexity: roughly O(n_trees * max_depth * n_features * n_samples^2)
+#   for this exhaustive split search; optimized implementations reach
+#   O(n_trees * n_samples * log(n_samples))
+# - Space complexity: O(n_trees * tree_size)
+
+library(R6)
+
+#' Decision Tree Node
+#' Simple tree node used by the regression trees below
+DecisionTreeNode <- R6Class(
+  "DecisionTreeNode",
+
+  public = list(
+    feature = NULL,
+    threshold = NULL,
+    left = NULL,
+    right = NULL,
+    value = NULL,
+
+    initialize = function(feature = NULL, threshold = NULL,
+                          left = NULL, right = NULL, value = NULL) {
+      self$feature <- feature
+      self$threshold <- threshold
+      self$left <- left
+      self$right <- right
+      self$value <- value
+    },
+
+    is_leaf = function() {
+      return(!is.null(self$value))
+    }
+  )
+)
+
+#' Regression Tree for Gradient Boosting
+RegressionTree <- R6Class(
+  "RegressionTree",
+
+  public = list(
+    max_depth = NULL,
+    min_samples_split = NULL,
+    root = NULL,
+
+    initialize = function(max_depth = 3, min_samples_split = 2) {
+      self$max_depth <- max_depth
+      self$min_samples_split <- min_samples_split
+      self$root <- NULL
+    },
+
+    fit = function(X, y) {
+      # Build the decision tree
+      self$root <- private$build_tree(X, y, depth = 0)
+      invisible(self)
+    },
+
+    predict = function(X) {
+      # Predict values for input data
+      if (is.vector(X)) {
+        X <- matrix(X, nrow = 1)
+      }
+      apply(X, 1, function(row) private$predict_sample(row, self$root))
+    }
+  ),
+
+  private = list(
+    build_tree = function(X, y, depth) {
+      # Recursively build the decision tree
+      n_samples <- nrow(X)
+
+      # Stopping criteria
+      if (depth >= self$max_depth ||
+          n_samples < self$min_samples_split ||
+          length(unique(y)) == 1) {
+        return(DecisionTreeNode$new(value = mean(y)))
+      }
+
+      # Find the best split
+      best_split <- private$find_best_split(X, y)
+
+      if (is.null(best_split)) {
+        return(DecisionTreeNode$new(value = mean(y)))
+      }
+
+      # Split the data
+      left_idx <- X[, best_split$feature] <= best_split$threshold
+      right_idx <- !left_idx
+
+      # Build subtrees
+      left_subtree <- private$build_tree(
+        X[left_idx, , drop = FALSE],
+        y[left_idx],
+        depth + 1
+      )
+      right_subtree <- private$build_tree(
+        X[right_idx, , drop = FALSE],
+        y[right_idx],
+        depth + 1
+      )
+
+      return(DecisionTreeNode$new(
+        feature = best_split$feature,
+        threshold = best_split$threshold,
+        left = left_subtree,
+        right = right_subtree
+      ))
+    },
+
+    find_best_split = function(X, y) {
+      # Find the best feature and threshold to split on
+      best_sse <- Inf
+      best_split <- NULL
+      n_features <- ncol(X)
+      n_samples <- nrow(X)
+      min_samples_leaf <- max(1, floor(self$min_samples_split / 2))
+
+      for (feature in 1:n_features) {
+        # Sort feature values and corresponding targets
+        sorted_idx <- order(X[, feature])
+        sorted_x <- X[sorted_idx, feature]
+        sorted_y <- y[sorted_idx]
+
+        # Consider only unique values as potential thresholds
+        unique_vals <- unique(sorted_x)
+        if (length(unique_vals) <= 1) next
+
+        # Pre-compute cumulative statistics
+        cum_sum <- cumsum(sorted_y)
+        cum_sum_sq <- cumsum(sorted_y^2)
+
+        # Evaluate splits between consecutive unique values
+        for (i in 1:(length(unique_vals) - 1)) {
+          threshold <- (unique_vals[i] + unique_vals[i + 1]) / 2
+          n_left <- sum(sorted_x <= threshold)
+          n_right <- n_samples - n_left
+
+          # Skip if the split doesn't meet the minimum samples requirement
+          if (n_left < min_samples_leaf || n_right < min_samples_leaf) next
+
+          # Sum of squared errors per side, via the identity
+          # SSE = sum(y^2) - (sum(y))^2 / n and the cumulative sums
+          left_sum <- cum_sum[n_left]
+          left_sum_sq <- cum_sum_sq[n_left]
+          right_sum <- cum_sum[n_samples] - left_sum
+          right_sum_sq <- cum_sum_sq[n_samples] - left_sum_sq
+
+          left_sse <- left_sum_sq - (left_sum^2) / n_left
+          right_sse <- right_sum_sq - (right_sum^2) / n_right
+          sse <- left_sse + right_sse
+
+          if (sse < best_sse) {
+            best_sse <- sse
+            best_split <- list(feature = feature, threshold = threshold)
+          }
+        }
+      }
+
+      return(best_split)
+    },
+
+    predict_sample = function(x, node) {
+      # Predict a single sample by traversing the tree
+      if (node$is_leaf()) {
+        return(node$value)
+      }
+
+      if (x[node$feature] <= node$threshold) {
+        return(private$predict_sample(x, node$left))
+      } else {
+        return(private$predict_sample(x, node$right))
+      }
+    }
+  )
+)
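+
+# The boosting update implemented by GradientBoostingRegressor below, written
+# out for reference (the notation is only illustrative):
+#   F_0(x) = mean(y)                               -> initial_prediction
+#   r_i    = y_i - F_{m-1}(x_i)                    -> residuals (negative MSE gradient)
+#   F_m(x) = F_{m-1}(x) + learning_rate * h_m(x)   -> h_m is a RegressionTree fit to r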
+
+#' Gradient Boosting Regressor
+GradientBoostingRegressor <- R6Class(
+  "GradientBoostingRegressor",
+
+  public = list(
+    n_estimators = NULL,
+    learning_rate = NULL,
+    max_depth = NULL,
+    min_samples_split = NULL,
+    trees = NULL,
+    initial_prediction = NULL,
+    n_features = NULL,
+    # Reserved for a future early-stopping feature; not used by fit() yet
+    early_stopping_rounds = NULL,
+    best_iteration = NULL,
+
+    initialize = function(n_estimators = 100, learning_rate = 0.1,
+                          max_depth = 3, min_samples_split = 2,
+                          early_stopping_rounds = NULL) {
+      # Initialize gradient boosting parameters
+      if (n_estimators <= 0 || learning_rate <= 0 || max_depth <= 0) {
+        stop("Parameters must be positive")
+      }
+
+      self$n_estimators <- n_estimators
+      self$learning_rate <- learning_rate
+      self$max_depth <- max_depth
+      self$min_samples_split <- min_samples_split
+      self$early_stopping_rounds <- early_stopping_rounds
+      self$trees <- list()
+      self$initial_prediction <- NULL
+    },
+
+    fit = function(X, y, verbose = FALSE) {
+      # Train the gradient boosting model
+      # Input validation
+      if (!is.numeric(y)) {
+        stop("Target variable 'y' must be numeric")
+      }
+      if (is.vector(X)) {
+        X <- matrix(X, ncol = 1)
+      }
+      if (!is.matrix(X) || !is.numeric(X)) {
+        stop("Input 'X' must be a numeric matrix or vector")
+      }
+      if (length(y) != nrow(X)) {
+        stop("Number of samples in X and y must match")
+      }
+      if (any(is.na(X)) || any(is.na(y))) {
+        stop("Input contains missing values")
+      }
+
+      # Remember the number of features for get_feature_importance()
+      self$n_features <- ncol(X)
+
+      # Initialize with the mean of the target
+      self$initial_prediction <- mean(y)
+      predictions <- rep(self$initial_prediction, length(y))
+
+      # Build trees sequentially
+      for (i in 1:self$n_estimators) {
+        # Calculate residuals (negative gradient of the MSE loss)
+        residuals <- y - predictions
+
+        # Fit a tree to the residuals
+        tree <- RegressionTree$new(
+          max_depth = self$max_depth,
+          min_samples_split = self$min_samples_split
+        )
+        tree$fit(X, residuals)
+
+        # Update predictions
+        tree_predictions <- tree$predict(X)
+        predictions <- predictions + self$learning_rate * tree_predictions
+
+        # Store the tree
+        self$trees[[i]] <- tree
+
+        # Report training error
+        if (verbose && (i %% 10 == 0 || i == 1)) {
+          mse <- mean((y - predictions)^2)
+          rmse <- sqrt(mse)
+          cat(sprintf("Iteration %d/%d - RMSE: %.4f\n",
+                      i, self$n_estimators, rmse))
+        }
+      }
+
+      invisible(self)
+    },
+
+    predict = function(X) {
+      # Make predictions using the trained model
+      if (is.null(self$initial_prediction)) {
+        stop("Model has not been fitted yet")
+      }
+
+      if (is.vector(X)) {
+        X <- matrix(X, ncol = 1)
+      }
+
+      # Start with the initial prediction
+      predictions <- rep(self$initial_prediction, nrow(X))
+
+      # Add the contribution from each tree
+      for (tree in self$trees) {
+        predictions <- predictions + self$learning_rate * tree$predict(X)
+      }
+
+      return(predictions)
+    },
+
+    score = function(X, y) {
+      # Calculate the R-squared score
+      predictions <- self$predict(X)
+      ss_res <- sum((y - predictions)^2)
+      ss_tot <- sum((y - mean(y))^2)
+      r2 <- 1 - (ss_res / ss_tot)
+      return(r2)
+    },
+
+    get_feature_importance = function() {
+      # Relative feature importance based on depth-weighted split counts
+      # (splits closer to the root count more)
+      if (length(self$trees) == 0) {
+        stop("Model has not been fitted yet")
+      }
+
+      importance <- rep(0, self$n_features)
+      names(importance) <- paste0("Feature_", 1:self$n_features)
+
+      calculate_tree_importance <- function(node, depth = 0) {
+        if (is.null(node) || node$is_leaf()) {
+          return(NULL)
+        }
+
+        # Weight each split by 1 / (depth + 1): earlier splits matter more
+        feature_idx <- node$feature
+        importance[feature_idx] <<- importance[feature_idx] + 1 / (depth + 1)
+
+        calculate_tree_importance(node$left, depth + 1)
+        calculate_tree_importance(node$right, depth + 1)
+      }
+
+      # Accumulate importance over all trees
+      for (tree in self$trees) {
+        calculate_tree_importance(tree$root)
+      }
+
+      # Normalize importance scores so they sum to 1
+      if (sum(importance) > 0) {
+        importance <- importance / sum(importance)
+      }
+
+      # Sort and return as a named vector
+      importance <- sort(importance, decreasing = TRUE)
+      return(importance)
+    }
+  )
+)
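+
+# Minimal usage sketch (comments only; the variable names are placeholders --
+# see demonstrate_gradient_boosting() below for a full, runnable example):
+#   gb <- GradientBoostingRegressor$new(n_estimators = 50, learning_rate = 0.1)
+#   gb$fit(X_train, y_train)      # X_train: numeric matrix, y_train: numeric vector
+#   preds <- gb$predict(X_test)
+#   r2 <- gb$score(X_test, y_test)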
+
+# Demonstration and testing
+demonstrate_gradient_boosting <- function() {
+  cat("=== Gradient Boosting Algorithm Demo ===\n\n")
+
+  # Generate a synthetic dataset
+  set.seed(42)
+  n_samples <- 200
+
+  cat("Example 1: Non-linear regression problem\n")
+  cat("Generating synthetic data...\n\n")
+
+  # Create a non-linear relationship
+  X <- matrix(runif(n_samples, -3, 3), ncol = 1)
+  y <- sin(X[, 1]) + 0.3 * X[, 1]^2 + rnorm(n_samples, 0, 0.1)
+
+  # Split into train and test sets
+  train_idx <- sample(1:n_samples, size = 0.8 * n_samples)
+  test_idx <- setdiff(1:n_samples, train_idx)
+
+  X_train <- matrix(X[train_idx, ], ncol = 1)
+  y_train <- y[train_idx]
+  X_test <- matrix(X[test_idx, ], ncol = 1)
+  y_test <- y[test_idx]
+
+  # Train the model
+  cat("Training Gradient Boosting model...\n")
+  model <- GradientBoostingRegressor$new(
+    n_estimators = 50,
+    learning_rate = 0.1,
+    max_depth = 3
+  )
+  model$fit(X_train, y_train, verbose = TRUE)
+
+  # Make predictions
+  cat("\nEvaluating model...\n")
+  train_pred <- model$predict(X_train)
+  test_pred <- model$predict(X_test)
+
+  # Calculate metrics
+  train_rmse <- sqrt(mean((y_train - train_pred)^2))
+  test_rmse <- sqrt(mean((y_test - test_pred)^2))
+  train_r2 <- model$score(X_train, y_train)
+  test_r2 <- model$score(X_test, y_test)
+
+  cat(sprintf("\nResults:\n"))
+  cat(sprintf("Train RMSE: %.4f | Train R²: %.4f\n", train_rmse, train_r2))
+  cat(sprintf("Test RMSE: %.4f | Test R²: %.4f\n\n", test_rmse, test_r2))
+
+  # Example 2: Multi-feature problem
+  cat("Example 2: Multi-feature regression\n")
+  cat("Generating multi-dimensional data...\n")
+
+  X_multi <- matrix(rnorm(n_samples * 3), ncol = 3)
+  y_multi <- 2 * X_multi[, 1] - 3 * X_multi[, 2] +
+    0.5 * X_multi[, 3]^2 + rnorm(n_samples, 0, 0.5)
+
+  # Train-test split (reuses the indices from Example 1)
+  X_train_multi <- X_multi[train_idx, ]
+  y_train_multi <- y_multi[train_idx]
+  X_test_multi <- X_multi[test_idx, ]
+  y_test_multi <- y_multi[test_idx]
+
+  # Train the model
+  model2 <- GradientBoostingRegressor$new(
+    n_estimators = 50,
+    learning_rate = 0.1,
+    max_depth = 4
+  )
+  model2$fit(X_train_multi, y_train_multi)
+
+  # Evaluate
+  test_rmse2 <- sqrt(mean((y_test_multi - model2$predict(X_test_multi))^2))
+  test_r2_2 <- model2$score(X_test_multi, y_test_multi)
+
+  cat(sprintf("Test RMSE: %.4f | Test R²: %.4f\n\n", test_rmse2, test_r2_2))
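+
+  # Quick look at the depth-weighted feature-importance heuristic defined in
+  # get_feature_importance() for the multi-feature model
+  cat("Feature importance (normalized, depth-weighted split counts):\n")
+  print(round(model2$get_feature_importance(), 3))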
+
+  # Example 3: Hyperparameter comparison
+  cat("Example 3: Impact of learning rate\n")
+  learning_rates <- c(0.01, 0.1, 0.5)
+
+  for (lr in learning_rates) {
+    model_lr <- GradientBoostingRegressor$new(
+      n_estimators = 30,
+      learning_rate = lr,
+      max_depth = 3
+    )
+    model_lr$fit(X_train, y_train, verbose = FALSE)
+    test_rmse_lr <- sqrt(mean((y_test - model_lr$predict(X_test))^2))
+    cat(sprintf("Learning rate %.2f - Test RMSE: %.4f\n", lr, test_rmse_lr))
+  }
+
+  cat("\n=== Demo Complete ===\n")
+}
+
+# Run the demonstration when executed as a script
+if (!interactive()) {
+  demonstrate_gradient_boosting()
+}
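+
+# To run the demo from a shell (assumes Rscript is on the PATH and the R6
+# package is installed):
+#   Rscript machine_learning/gradient_boosting.r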