diff --git a/machine_learning/k_means.r b/machine_learning/k_means.r new file mode 100644 index 00000000..84be4d6a --- /dev/null +++ b/machine_learning/k_means.r @@ -0,0 +1,822 @@ +# k_means.r +# K-Means Clustering Algorithm Implementation in R +# A centroid-based clustering algorithm that partitions data into k clusters +# Each cluster is represented by its centroid (mean of all points in the cluster) +# +# Algorithm details: +# - Iteratively assigns points to nearest centroid and updates centroids +# - Minimizes within-cluster sum of squared distances (inertia) +# - Fast and efficient for large datasets +# - Works best with spherical, well-separated clusters +# - Time complexity: O(n * k * d * iterations) +# - Space complexity: O(n * d + k * d) + +library(R6) + +#' K-Means Clustering Algorithm +#' +#' Implementation of the K-Means clustering algorithm with multiple initialization methods +#' +#' @description +#' K-Means is a centroid-based clustering algorithm that partitions n observations into k clusters. +#' Each observation belongs to the cluster with the nearest centroid (cluster center). +#' The algorithm iteratively updates cluster assignments and centroids until convergence. +#' +#' @examples +#' # Create sample data +#' set.seed(42) +#' X <- rbind( +#' matrix(rnorm(100, mean = 0, sd = 1), ncol = 2), +#' matrix(rnorm(100, mean = 5, sd = 1), ncol = 2), +#' matrix(rnorm(100, mean = 10, sd = 1), ncol = 2) +#' ) +#' +#' # Fit k-means +#' kmeans_model <- KMeans$new(n_clusters = 3, init = "k-means++", max_iter = 300) +#' kmeans_model$fit(X) +#' labels <- kmeans_model$predict(X) +#' +#' # Get cluster centers +#' centers <- kmeans_model$get_centroids() +#' +#' # Plot results +#' plot(X, col = labels + 1, pch = 19, main = "K-Means Clustering") +#' points(centers, col = 1:3, pch = 8, cex = 2, lwd = 2) +#' legend("topright", legend = paste("Cluster", 1:3), col = 2:4, pch = 19) + +KMeans <- R6Class( + "KMeans", + + public = list( + #' @field n_clusters Number of clusters to form + n_clusters = NULL, + + #' @field init Initialization method: "random", "k-means++", or matrix of initial centroids + init = NULL, + + #' @field max_iter Maximum number of iterations + max_iter = NULL, + + #' @field tol Tolerance for convergence (change in inertia) + tol = NULL, + + #' @field random_state Random seed for reproducibility + random_state = NULL, + + #' @field n_init Number of times to run algorithm with different initializations + n_init = NULL, + + #' @field centroids Cluster centroids + centroids = NULL, + + #' @field labels Cluster labels for training data + labels = NULL, + + #' @field inertia Within-cluster sum of squared distances + inertia = NULL, + + #' @field n_iter Number of iterations performed + n_iter = NULL, + + #' @field cluster_sizes Number of points in each cluster + cluster_sizes = NULL, + + #' @field training_data Stored training data + training_data = NULL, + + #' @description + #' Initialize a new K-Means instance + #' + #' @param n_clusters Number of clusters (default: 3) + #' @param init Initialization method: "random", "k-means++" (default: "k-means++") + #' @param max_iter Maximum iterations (default: 300) + #' @param tol Tolerance for convergence (default: 1e-4) + #' @param random_state Random seed (default: NULL) + #' @param n_init Number of initializations to try (default: 10) + initialize = function(n_clusters = 3, init = "k-means++", max_iter = 300, + tol = 1e-4, random_state = NULL, n_init = 10) { + self$n_clusters <- n_clusters + self$init <- init + self$max_iter <- max_iter + self$tol <- tol + self$random_state <- random_state + self$n_init <- n_init + + if (!is.null(random_state)) { + set.seed(random_state) + } + }, + + #' @description + #' Compute Euclidean distance between two points + #' + #' @param x1 First point (vector) + #' @param x2 Second point (vector) + #' @return Euclidean distance + euclidean_distance = function(x1, x2) { + return(sqrt(sum((x1 - x2)^2))) + }, + + #' @description + #' Compute distances from all points to all centroids + #' + #' @param X Data matrix (n_samples x n_features) + #' @param centroids Centroid matrix (n_clusters x n_features) + #' @return Distance matrix (n_samples x n_clusters) + compute_distances = function(X, centroids) { + n_samples <- nrow(X) + n_clusters <- nrow(centroids) + # Vectorized computation of distances using dist() + distances <- as.matrix(dist(rbind(X, centroids)))[1:n_samples, (n_samples+1):(n_samples+n_clusters)] + return(distances) + }, + + #' @description + #' Initialize centroids randomly + #' + #' @param X Data matrix + #' @return Initial centroids + initialize_random = function(X) { + n_samples <- nrow(X) + indices <- sample(1:n_samples, self$n_clusters, replace = FALSE) + centroids <- X[indices, , drop = FALSE] + return(centroids) + }, + + #' @description + #' Initialize centroids using k-means++ algorithm + #' Selects centroids that are far apart, leading to better convergence + #' + #' @param X Data matrix + #' @return Initial centroids + initialize_kmeans_plusplus = function(X) { + n_samples <- nrow(X) + n_features <- ncol(X) + centroids <- matrix(0, nrow = self$n_clusters, ncol = n_features) + + # 1. Choose first centroid randomly + first_idx <- sample(1:n_samples, 1) + centroids[1, ] <- X[first_idx, ] + + # 2. Choose remaining centroids + for (k in 2:self$n_clusters) { + # Compute distances from each point to nearest existing centroid + min_distances <- rep(Inf, n_samples) + + for (i in 1:n_samples) { + for (j in 1:(k-1)) { + dist <- self$euclidean_distance(X[i, ], centroids[j, ]) + if (dist < min_distances[i]) { + min_distances[i] <- dist + } + } + } + + # Square the distances + squared_distances <- min_distances^2 + + # Choose next centroid with probability proportional to squared distance + probabilities <- squared_distances / sum(squared_distances) + next_idx <- sample(1:n_samples, 1, prob = probabilities) + centroids[k, ] <- X[next_idx, ] + } + + return(centroids) + }, + + #' @description + #' Initialize centroids based on selected method + #' + #' @param X Data matrix + #' @return Initial centroids + initialize_centroids = function(X) { + if (is.matrix(self$init)) { + # User-provided initial centroids + if (nrow(self$init) != self$n_clusters || ncol(self$init) != ncol(X)) { + stop("Initial centroids dimensions don't match") + } + return(self$init) + } else if (self$init == "random") { + return(self$initialize_random(X)) + } else if (self$init == "k-means++") { + return(self$initialize_kmeans_plusplus(X)) + } else { + stop("Unknown initialization method. Use 'random' or 'k-means++'") + } + }, + + #' @description + #' Assign points to nearest centroid + #' + #' @param X Data matrix + #' @param centroids Current centroids + #' @return List with labels and inertia + assign_clusters = function(X, centroids) { + n_samples <- nrow(X) + labels <- integer(n_samples) + inertia <- 0 + + # Compute distances to all centroids + distances <- self$compute_distances(X, centroids) + + # Assign to nearest centroid + for (i in 1:n_samples) { + nearest_cluster <- which.min(distances[i, ]) + labels[i] <- nearest_cluster + inertia <- inertia + distances[i, nearest_cluster]^2 + } + + return(list(labels = labels, inertia = inertia)) + }, + + #' @description + #' Update centroids as mean of assigned points + #' + #' @param X Data matrix + #' @param labels Current cluster labels + #' @return Updated centroids + update_centroids = function(X, labels) { + n_features <- ncol(X) + new_centroids <- matrix(0, nrow = self$n_clusters, ncol = n_features) + + for (k in 1:self$n_clusters) { + cluster_points <- X[labels == k, , drop = FALSE] + + if (nrow(cluster_points) == 0) { + # If cluster is empty, reinitialize randomly + warning(sprintf("Cluster %d is empty, reinitializing randomly", k)) + random_idx <- sample(1:nrow(X), 1) + new_centroids[k, ] <- X[random_idx, ] + } else { + # Compute mean of cluster points + new_centroids[k, ] <- colMeans(cluster_points) + } + } + + return(new_centroids) + }, + + #' @description + #' Perform one complete run of k-means + #' + #' @param X Data matrix + #' @return List with centroids, labels, inertia, and iterations + single_run = function(X) { + # Initialize centroids + centroids <- self$initialize_centroids(X) + prev_inertia <- Inf + + for (iter in 1:self$max_iter) { + # Assignment step + assignment <- self$assign_clusters(X, centroids) + labels <- assignment$labels + inertia <- assignment$inertia + + # Check convergence + if (is.finite(prev_inertia) && abs(prev_inertia - inertia) < self$tol) { + return(list( + centroids = centroids, + labels = labels, + inertia = inertia, + n_iter = iter + )) + } + + # Update step + centroids <- self$update_centroids(X, labels) + prev_inertia <- inertia + } + + # Max iterations reached + return(list( + centroids = centroids, + labels = labels, + inertia = inertia, + n_iter = self$max_iter + )) + }, + + #' @description + #' Fit the k-means model to the data + #' + #' @param X Data matrix (n_samples x n_features) + #' @return Self (for method chaining) + fit = function(X) { + # Validate input + if (!is.matrix(X)) { + X <- as.matrix(X) + } + + n_samples <- nrow(X) + + if (self$n_clusters > n_samples) { + stop("Number of clusters cannot exceed number of samples") + } + + # Store training data + self$training_data <- X + + # Run k-means multiple times and keep best result + best_inertia <- Inf + best_result <- NULL + + cat(sprintf("Running K-Means with %d initialization(s)...\n", self$n_init)) + + for (run in 1:self$n_init) { + result <- self$single_run(X) + + if (result$inertia < best_inertia) { + best_inertia <- result$inertia + best_result <- result + } + + if (self$n_init > 1) { + cat(sprintf(" Run %d/%d: inertia = %.4f\n", run, self$n_init, result$inertia)) + } + } + + # Store best result + self$centroids <- best_result$centroids + self$labels <- best_result$labels + self$inertia <- best_result$inertia + self$n_iter <- best_result$n_iter + + # Compute cluster sizes + self$cluster_sizes <- integer(self$n_clusters) + for (k in 1:self$n_clusters) { + self$cluster_sizes[k] <- sum(self$labels == k) + } + + cat(sprintf("\nBest result: inertia = %.4f, iterations = %d\n", + self$inertia, self$n_iter)) + + return(invisible(self)) + }, + + #' @description + #' Predict cluster labels for new data + #' + #' @param X Data matrix to predict (n_samples x n_features) + #' @return Vector of cluster labels + predict = function(X) { + if (is.null(self$centroids)) { + stop("Model has not been fitted yet. Call fit() first.") + } + + if (!is.matrix(X)) { + X <- as.matrix(X) + } + + n_samples <- nrow(X) + labels <- integer(n_samples) + + # Compute distances and assign to nearest centroid + distances <- self$compute_distances(X, self$centroids) + + for (i in 1:n_samples) { + labels[i] <- which.min(distances[i, ]) + } + + return(labels) + }, + + #' @description + #' Fit the model and return cluster labels + #' + #' @param X Data matrix (n_samples x n_features) + #' @return Vector of cluster labels + fit_predict = function(X) { + self$fit(X) + return(self$labels) + }, + + #' @description + #' Transform data to cluster-distance space + #' + #' @param X Data matrix + #' @return Matrix of distances to each cluster centroid + transform = function(X) { + if (is.null(self$centroids)) { + stop("Model has not been fitted yet. Call fit() first.") + } + + if (!is.matrix(X)) { + X <- as.matrix(X) + } + + return(self$compute_distances(X, self$centroids)) + }, + + #' @description + #' Fit and transform in one step + #' + #' @param X Data matrix + #' @return Matrix of distances to cluster centroids + fit_transform = function(X) { + self$fit(X) + return(self$transform(X)) + }, + + #' @description + #' Get the cluster centroids + #' + #' @return Matrix of centroids + get_centroids = function() { + if (is.null(self$centroids)) { + stop("Model has not been fitted yet. Call fit() first.") + } + return(self$centroids) + }, + + #' @description + #' Calculate silhouette score for clustering quality + #' + #' @return Vector of silhouette scores for each point + silhouette_score = function() { + if (is.null(self$labels) || is.null(self$training_data)) { + stop("Model has not been fitted yet. Call fit() first.") + } + + n_samples <- length(self$labels) + silhouette_scores <- numeric(n_samples) + X <- self$training_data + + for (i in 1:n_samples) { + cluster_i <- self$labels[i] + + # Calculate a(i): mean distance to points in same cluster + same_cluster_indices <- which(self$labels == cluster_i) + same_cluster_indices <- same_cluster_indices[same_cluster_indices != i] + + if (length(same_cluster_indices) == 0) { + a_i <- 0 + } else { + distances_same <- numeric(length(same_cluster_indices)) + for (j in seq_along(same_cluster_indices)) { + distances_same[j] <- self$euclidean_distance( + X[i, ], + X[same_cluster_indices[j], ] + ) + } + a_i <- mean(distances_same) + } + + # Calculate b(i): mean distance to points in nearest cluster + b_i <- Inf + for (k in 1:self$n_clusters) { + if (k == cluster_i) next + + other_cluster_indices <- which(self$labels == k) + if (length(other_cluster_indices) > 0) { + distances_other <- numeric(length(other_cluster_indices)) + for (j in seq_along(other_cluster_indices)) { + distances_other[j] <- self$euclidean_distance( + X[i, ], + X[other_cluster_indices[j], ] + ) + } + mean_dist <- mean(distances_other) + if (mean_dist < b_i) { + b_i <- mean_dist + } + } + } + + # Calculate silhouette score + if (a_i == 0 && b_i == 0) { + silhouette_scores[i] <- 0 + } else { + silhouette_scores[i] <- (b_i - a_i) / max(a_i, b_i) + } + } + + return(silhouette_scores) + }, + + #' @description + #' Calculate Davies-Bouldin Index for clustering quality + #' Lower values indicate better clustering + #' + #' @return Davies-Bouldin Index score + davies_bouldin_score = function() { + if (is.null(self$labels) || is.null(self$centroids)) { + stop("Model has not been fitted yet. Call fit() first.") + } + + X <- self$training_data + + # Calculate average distance within each cluster + avg_distances <- numeric(self$n_clusters) + for (k in 1:self$n_clusters) { + cluster_points <- X[self$labels == k, , drop = FALSE] + if (nrow(cluster_points) > 0) { + distances <- numeric(nrow(cluster_points)) + for (i in 1:nrow(cluster_points)) { + distances[i] <- self$euclidean_distance( + cluster_points[i, ], + self$centroids[k, ] + ) + } + avg_distances[k] <- mean(distances) + } + } + + # Calculate Davies-Bouldin Index + db_index <- 0 + for (i in 1:self$n_clusters) { + max_ratio <- 0 + for (j in 1:self$n_clusters) { + if (i != j) { + centroid_distance <- self$euclidean_distance( + self$centroids[i, ], + self$centroids[j, ] + ) + ratio <- (avg_distances[i] + avg_distances[j]) / centroid_distance + if (ratio > max_ratio) { + max_ratio <- ratio + } + } + } + db_index <- db_index + max_ratio + } + + db_index <- db_index / self$n_clusters + return(db_index) + }, + + #' @description + #' Calculate Calinski-Harabasz Index (Variance Ratio Criterion) + #' Higher values indicate better clustering + #' + #' @return Calinski-Harabasz Index score + calinski_harabasz_score = function() { + if (is.null(self$labels) || is.null(self$centroids)) { + stop("Model has not been fitted yet. Call fit() first.") + } + + X <- self$training_data + n_samples <- nrow(X) + n_features <- ncol(X) + + # Overall centroid + overall_centroid <- colMeans(X) + + # Between-cluster dispersion + between_dispersion <- 0 + for (k in 1:self$n_clusters) { + n_k <- self$cluster_sizes[k] + if (n_k > 0) { + dist_sq <- sum((self$centroids[k, ] - overall_centroid)^2) + between_dispersion <- between_dispersion + n_k * dist_sq + } + } + + # Within-cluster dispersion + within_dispersion <- self$inertia + + # Calinski-Harabasz Index + if (within_dispersion == 0) { + return(Inf) + } + + ch_score <- (between_dispersion / (self$n_clusters - 1)) / + (within_dispersion / (n_samples - self$n_clusters)) + + return(ch_score) + }, + + #' @description + #' Print summary of the model + print = function() { + cat("K-Means Clustering Model\n") + cat("========================\n") + cat(sprintf("Number of clusters: %d\n", self$n_clusters)) + cat(sprintf("Initialization method: %s\n", + ifelse(is.matrix(self$init), "custom", self$init))) + cat(sprintf("Maximum iterations: %d\n", self$max_iter)) + cat(sprintf("Tolerance: %.2e\n", self$tol)) + cat(sprintf("Number of initializations: %d\n", self$n_init)) + + if (!is.null(self$n_iter)) { + cat(sprintf("\nIterations performed: %d\n", self$n_iter)) + cat(sprintf("Final inertia: %.4f\n", self$inertia)) + + if (!is.null(self$cluster_sizes)) { + cat("\nCluster sizes:\n") + for (k in 1:self$n_clusters) { + cat(sprintf(" Cluster %d: %d points (%.1f%%)\n", + k, + self$cluster_sizes[k], + 100 * self$cluster_sizes[k] / length(self$labels))) + } + } + + if (!is.null(self$centroids)) { + cat("\nCentroid coordinates:\n") + print(self$centroids) + } + } else { + cat("\nModel not fitted yet.\n") + } + + invisible(self) + } + ) +) + +#' Helper function to create and fit k-means model +#' +#' @param X Data matrix (n_samples x n_features) +#' @param n_clusters Number of clusters +#' @param init Initialization method +#' @param max_iter Maximum iterations +#' @param tol Convergence tolerance +#' @param random_state Random seed +#' @param n_init Number of initializations +#' @return Fitted KMeans object +k_means <- function(X, n_clusters = 3, init = "k-means++", max_iter = 300, + tol = 1e-4, random_state = NULL, n_init = 10) { + model <- KMeans$new( + n_clusters = n_clusters, + init = init, + max_iter = max_iter, + tol = tol, + random_state = random_state, + n_init = n_init + ) + model$fit(X) + return(model) +} + +# Example usage and demonstrations +if (FALSE) { + # Example 1: Basic clustering with 2D data + cat("\n=== Example 1: Basic 2D Clustering ===\n") + set.seed(42) + + # Create 3 clusters of data + cluster1 <- matrix(rnorm(60, mean = 0, sd = 1), ncol = 2) + cluster2 <- matrix(rnorm(60, mean = 5, sd = 1), ncol = 2) + cluster3 <- matrix(rnorm(60, mean = c(2, 8), sd = 1), ncol = 2) + X <- rbind(cluster1, cluster2, cluster3) + + # Fit k-means with k-means++ initialization + kmeans_model <- KMeans$new(n_clusters = 3, init = "k-means++", + max_iter = 300, n_init = 10, random_state = 42) + kmeans_model$fit(X) + + # Get predictions + labels <- kmeans_model$predict(X) + + # Print model summary + kmeans_model$print() + + # Get centroids + centroids <- kmeans_model$get_centroids() + + # Calculate quality metrics + sil_scores <- kmeans_model$silhouette_score() + cat(sprintf("\nMean silhouette score: %.4f\n", mean(sil_scores))) + + db_score <- kmeans_model$davies_bouldin_score() + cat(sprintf("Davies-Bouldin Index: %.4f (lower is better)\n", db_score)) + + ch_score <- kmeans_model$calinski_harabasz_score() + cat(sprintf("Calinski-Harabasz Index: %.4f (higher is better)\n", ch_score)) + + # Plot results + plot(X, col = labels + 1, pch = 19, + main = "K-Means Clustering (k-means++)", + xlab = "Feature 1", ylab = "Feature 2") + points(centroids, col = 1:3, pch = 8, cex = 3, lwd = 3) + legend("topright", legend = c(paste("Cluster", 1:3), "Centroids"), + col = c(2:4, 1), pch = c(19, 19, 19, 8), pt.cex = c(1, 1, 1, 2)) + + # Example 2: Random initialization comparison + cat("\n=== Example 2: Random Initialization ===\n") + kmeans_random <- KMeans$new(n_clusters = 3, init = "random", + max_iter = 300, n_init = 10, random_state = 42) + kmeans_random$fit(X) + + cat(sprintf("\nK-means++ inertia: %.4f\n", kmeans_model$inertia)) + cat(sprintf("Random init inertia: %.4f\n", kmeans_random$inertia)) + cat(sprintf("Improvement: %.2f%%\n", + 100 * (kmeans_random$inertia - kmeans_model$inertia) / kmeans_random$inertia)) + + # Example 3: Iris dataset + cat("\n=== Example 3: Iris Dataset ===\n") + data(iris) + X_iris <- as.matrix(iris[, 1:4]) + + kmeans_iris <- KMeans$new(n_clusters = 3, init = "k-means++", + max_iter = 300, n_init = 10, random_state = 42) + kmeans_iris$fit(X_iris) + labels_iris <- kmeans_iris$predict(X_iris) + + kmeans_iris$print() + + # Calculate quality metrics + sil_scores_iris <- kmeans_iris$silhouette_score() + cat(sprintf("\nMean silhouette score: %.4f\n", mean(sil_scores_iris))) + + # Visualize on first two dimensions + plot(X_iris[, 1:2], col = labels_iris + 1, pch = 19, + main = "K-Means on Iris Dataset", + xlab = "Sepal Length", ylab = "Sepal Width") + centroids_iris <- kmeans_iris$get_centroids() + points(centroids_iris[, 1:2], col = 1:3, pch = 8, cex = 3, lwd = 3) + + # Compare with true species + true_labels <- as.numeric(iris$Species) + confusion_matrix <- table(Predicted = labels_iris, Actual = true_labels) + cat("\nConfusion Matrix:\n") + print(confusion_matrix) + + # Example 4: Elbow method for optimal k + cat("\n=== Example 4: Elbow Method ===\n") + inertias <- numeric(10) + silhouette_scores <- numeric(10) + + for (k in 1:10) { + if (k == 1) { + # Silhouette not defined for k=1 + model <- KMeans$new(n_clusters = k, max_iter = 100, + n_init = 3, random_state = 42) + model$fit(X) + inertias[k] <- model$inertia + silhouette_scores[k] <- NA + } else { + model <- KMeans$new(n_clusters = k, max_iter = 100, + n_init = 3, random_state = 42) + model$fit(X) + inertias[k] <- model$inertia + silhouette_scores[k] <- mean(model$silhouette_score()) + } + } + + # Plot elbow curve + par(mfrow = c(1, 2)) + + plot(1:10, inertias, type = "b", pch = 19, col = "blue", + main = "Elbow Method", + xlab = "Number of Clusters (k)", ylab = "Inertia") + grid() + + plot(2:10, silhouette_scores[2:10], type = "b", pch = 19, col = "red", + main = "Silhouette Score", + xlab = "Number of Clusters (k)", ylab = "Mean Silhouette Score") + grid() + + par(mfrow = c(1, 1)) + + # Example 5: Transform to cluster-distance space + cat("\n=== Example 5: Transform to Cluster-Distance Space ===\n") + distances <- kmeans_model$transform(X[1:5, ]) + cat("\nDistances from first 5 points to each centroid:\n") + print(distances) + + # Example 6: Custom initial centroids + cat("\n=== Example 6: Custom Initial Centroids ===\n") + custom_centroids <- matrix(c(0, 0, 5, 5, 2, 8), nrow = 3, byrow = TRUE) + kmeans_custom <- KMeans$new(n_clusters = 3, init = custom_centroids, + max_iter = 300, n_init = 1) + kmeans_custom$fit(X) + kmeans_custom$print() + + # Example 7: Quality metrics comparison + cat("\n=== Example 7: Clustering Quality Metrics ===\n") + + metrics_df <- data.frame( + k = 2:6, + inertia = numeric(5), + silhouette = numeric(5), + davies_bouldin = numeric(5), + calinski_harabasz = numeric(5) + ) + + for (i in 1:5) { + k <- i + 1 + model <- KMeans$new(n_clusters = k, max_iter = 100, + n_init = 5, random_state = 42) + model$fit(X) + + metrics_df$inertia[i] <- model$inertia + metrics_df$silhouette[i] <- mean(model$silhouette_score()) + metrics_df$davies_bouldin[i] <- model$davies_bouldin_score() + metrics_df$calinski_harabasz[i] <- model$calinski_harabasz_score() + } + + cat("\nClustering Quality Metrics:\n") + print(metrics_df) + + cat("\nOptimal k based on:\n") + cat(sprintf(" Silhouette Score: k = %d (%.4f)\n", + metrics_df$k[which.max(metrics_df$silhouette)], + max(metrics_df$silhouette))) + cat(sprintf(" Davies-Bouldin Index: k = %d (%.4f)\n", + metrics_df$k[which.min(metrics_df$davies_bouldin)], + min(metrics_df$davies_bouldin))) + cat(sprintf(" Calinski-Harabasz Index: k = %d (%.4f)\n", + metrics_df$k[which.max(metrics_df$calinski_harabasz)], + max(metrics_df$calinski_harabasz))) +}