Merge pull request #19 from nrennie/duplicate-columns

nrennie · web-flow · commit 37fb4b57d0ba · 2025-08-15T23:37:38.000+01:00
Add `duplicate_columns()` function
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: messy
 Title: Create Messy Data from Clean Data Frames
-Version: 0.1.0.9001
+Version: 0.1.0.9002
 Authors@R: c(
     person(given = "Nicola", family = "Rennie", role = c("aut", "cre", "cph"),
     email = "nrennie35@gmail.com", comment = c(ORCID = "0000-0003-4797-557X")))
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,6 +4,7 @@ export(add_special_chars)
 export(add_whitespace)
 export(change_case)
 export(change_separators)
+export(duplicate_columns)
 export(duplicate_rows)
 export(make_missing)
 export(messy)
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
 # messy (development version)
 
 * Add `change_separators()` function
+* Add `duplicate_columns()` function
 
 # messy 0.1.0
 
diff --git a/R/duplicate_columns.R b/R/duplicate_columns.R
@@ -0,0 +1,52 @@
+#' Duplicate columns and append the copies to the dataframe
+#'
+#' Each column is independently selected for duplication with probability
+#' `messiness`. Copies are added after the existing columns, under a name
+#' made of a column name, `name_sep`, and a random number.
+#'
+#' @param data input dataframe
+#' @param messiness Probability that each column is duplicated. Must be
+#' between 0 and 1. Default 0.1.
+#' @param random Whether duplicated column names should be randomly selected
+#' from other column names, or maintain the original. Default `TRUE`.
+#' @param name_sep Separator to use for adding numbers to end of names.
+#' Default `""`.
+#' @return A dataframe with duplicated columns added
+#' @author Jordi Rosell
+#' @export
+#' @examples
+#' duplicate_columns(mtcars, messiness = 0.1)
+duplicate_columns <- function(
+    data,
+    messiness = 0.1,
+    random = TRUE,
+    name_sep = "") {
+  # Check type, length and NA before the range test so that bad input gets
+  # the documented message instead of an error from `||` or `if`.
+  if (!is.numeric(messiness) || length(messiness) != 1 || is.na(messiness) ||
+    messiness < 0 || messiness > 1) {
+    stop("'messiness' must be between 0 and 1")
+  }
+  if (!is.logical(random) || length(random) != 1 || is.na(random)) {
+    stop("'random' must be either 'TRUE' or 'FALSE'")
+  }
+
+  original_names <- colnames(data)
+  n <- ncol(data)
+  # Indices 1..n always refer to the input columns because copies are
+  # appended after them.
+  for (i in seq_len(n)) {
+    if (stats::runif(1) < messiness) {
+      base_name <- if (random) sample(original_names, 1) else original_names[i]
+      new_col_name <- paste0(base_name, name_sep, sample(100 * n, 1))
+      # Assigning to a name that is already in use would overwrite that
+      # column instead of adding one, so redraw the suffix until it is free.
+      while (new_col_name %in% colnames(data)) {
+        new_col_name <- paste0(base_name, name_sep, sample(100 * n, 1))
+      }
+      data[[new_col_name]] <- data[[i]]
+    }
+  }
+
+  data
+}
diff --git a/man/duplicate_columns.Rd b/man/duplicate_columns.Rd