nrennie · nrennie · Aug 15, 2025 · Dec 5, 2024 · Aug 15, 2025 · Aug 15, 2025
diff --git a/.lintr b/.lintr
@@ -3,5 +3,6 @@ linters: linters_with_defaults(
   T_and_F_symbol_linter(),
   assignment_linter(),
   object_name_linter = NULL,
-  brace_linter = NULL
+  brace_linter = NULL,
+  return_linter = NULL
   )
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: messy
 Title: Create Messy Data from Clean Data Frames
-Version: 0.1.0.9000
+Version: 0.1.0.9001
 Authors@R: c(
     person(given = "Nicola", family = "Rennie", role = c("aut", "cre", "cph"),
     email = "nrennie35@gmail.com", comment = c(ORCID = "0000-0003-4797-557X")))

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(add_special_chars)
 export(add_whitespace)
 export(change_case)
+export(change_separators)
 export(duplicate_rows)
 export(make_missing)
 export(messy)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # messy (development version)
 
+* Add `change_separators()` function
+
 # messy 0.1.0
 
 * CRAN release

diff --git a/R/change_separators.R b/R/change_separators.R
@@ -0,0 +1,82 @@
+#' Change separators
+#'
+#' Randomly change the separators in character strings
+#' through random replacement
+#' @param data input dataframe
+#' @param cols set of columns to apply transformation to. If `NULL`
+#' will apply to all character columns. Default `NULL`.
+#' @param messiness Percentage of values to change. Must be a single
+#' number between 0 and 1. Default 0.1.
+#' @param sep_in A single value, or vector, or list of what is considered
+#' a separator in the input data. Default `c("-", "_", "  ", " ")`.
+#' @param sep_out A single value, or vector, or list of what the separators
+#' may be randomly replaced with. Default `c("-", "_", "  ", " ")`.
+#' @return a dataframe the same size as the input data.
+#' @export
+#' @examples
+#' change_separators(mtcars)
+change_separators <- function(data,
+                              cols = NULL,
+                              messiness = 0.1,
+                              sep_in = c("-", "_", "  ", " "),
+                              sep_out = c("-", "_", "  ", " ")) {
+  # Check type, length and NA before the range comparison so that every kind
+  # of invalid input produces the same clear message rather than an obscure
+  # comparison error.
+  if (!is.numeric(messiness) || length(messiness) != 1 ||
+    is.na(messiness) || messiness < 0 || messiness > 1) {
+    stop("'messiness' must be between 0 and 1")
+  }
+
+  # are cols present
+  if (!is.null(cols) && !all(cols %in% colnames(data))) {
+    stop("All elements of 'cols' must be a column name in 'data'")
+  }
+
+  # Single definition of the per-column transformation, shared by both
+  # column-selection branches below.
+  mess_up <- \(x) change_sep(
+    x,
+    messiness = messiness,
+    sep_in = sep_in,
+    sep_out = sep_out
+  )
+
+  if (is.null(cols)) {
+    # No columns requested: only character columns are transformed.
+    output <- data |>
+      dplyr::mutate(dplyr::across(dplyr::where(is.character), mess_up))
+  } else {
+    output <- data |>
+      dplyr::mutate(dplyr::across(dplyr::all_of(cols), mess_up))
+  }
+  return(output)
+}
+
+
+#' Function to change separators
+#'
+#' Finds every occurrence of a `sep_in` separator in `x` and, independently
+#' for each occurrence, replaces it with probability `messiness` by a
+#' separator drawn at random from `sep_out`.
+#'
+#' @param x Character vector
+#' @param messiness Percentage of values to change. Must be
+#' between 0 and 1. Default 0.1.
+#' @param sep_in A single value, or vector, or list of what is considered
+#' a separator in the input data. Default `c("-", "_", "  ", " ")`.
+#' @param sep_out A single value, or vector, or list of what the separators
+#' may be randomly replaced with. Default `c("-", "_", "  ", " ")`.
+#' @return Messy character vector
+#' @noRd
+change_sep <- function(x,
+                       messiness = 0.1,
+                       sep_in = c("-", "_", "  ", " "),
+                       sep_out = c("-", "_", "  ", " ")) {
+  # Flatten to plain character vectors so that lists are accepted, as
+  # documented, and so that sample() never sees a list or a lone number.
+  sep_in <- as.character(unlist(sep_in))
+  sep_out <- as.character(unlist(sep_out))
+
+  # Put the longest separators first in the alternation so that e.g. a
+  # double space is tried before a single space.
+  longest_first <- order(nchar(sep_in), decreasing = TRUE)
+  sep_in_escaped <- stringr::str_escape(sep_in[longest_first])
+  pattern <- paste0("(", paste(sep_in_escaped, collapse = "|"), ")")
+
+  # Vectorised over matches: the replacement function may be handed many
+  # matches at once and must return a vector of the same length, so draw
+  # one random number per match and only overwrite the selected ones.
+  replace_match <- function(match) {
+    change <- stats::runif(length(match)) < messiness
+    match[change] <- sample(sep_out, sum(change), replace = TRUE)
+    match
+  }
+  stringr::str_replace_all(x, pattern, replace_match)
+}
diff --git a/R/messy.R b/R/messy.R
@@ -10,19 +10,25 @@
 #' Default `NA`.
 #' @param case_type Whether the case should change based on
 #' the `"word"` or `"letter"`.
+#' @param sep_in A single value, or vector, or list of what is considered
+#' a separator in the input data. Default `c("-", "_", "  ", " ")`.
+#' @param sep_out A single value, or vector, or list of what the separators
+#' may be randomly replaced with. Default `c("-", "_", "  ", " ")`.
 #' @return a dataframe the same size as the input data.
 #' @export
 #' @examples
 #' messy(mtcars)
-
 messy <- function(data,
                   messiness = 0.1,
                   missing = NA,
-                  case_type = "word") {
+                  case_type = "word",
+                  sep_in = c("-", "_", "  ", " "),
+                  sep_out = c("-", "_", "  ", " ")) {
+  # Chain the individual transformations, passing the same `messiness` to
+  # each step; `sep_in` and `sep_out` are forwarded unchanged to
+  # change_separators(), which runs last.
   output <- data |>
     add_special_chars(messiness = messiness) |>
     add_whitespace(messiness = messiness) |>
     make_missing(messiness = messiness, missing = missing) |>
-    change_case(messiness = messiness, case_type = case_type)
+    change_case(messiness = messiness, case_type = case_type) |>
+    change_separators(messiness = messiness, sep_in = sep_in, sep_out = sep_out)
   return(output)
 }
diff --git a/man/change_separators.Rd b/man/change_separators.Rd
diff --git a/man/messy.Rd b/man/messy.Rd
diff --git a/messy.Rproj b/messy.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 2385395b-4ccc-4ad0-85e9-8f36dabcd033
 
 RestoreWorkspace: No
 SaveWorkspace: No

diff --git a/tests/testthat/test-change_separators.R b/tests/testthat/test-change_separators.R
@@ -0,0 +1,26 @@
+# Fixture: two character columns mixing " " and "_" separators, plus an
+# integer column that must never be modified.
+separator_test_df <- function() {
+  data.frame(
+    X1 = c("a b", "c_d", "e f", "g_h", "i j"),
+    X2 = c("k l", "m n", "o_p", "q r", "s_t"),
+    X3 = 1:5,
+    stringsAsFactors = FALSE
+  )
+}
+
+test_that("change_separators only modifies the columns given in cols", {
+  test_df <- separator_test_df()
+
+  # messiness = 1 with a single output separator makes the result
+  # deterministic, so the transformed column can be checked exactly.
+  result_X1 <- change_separators(
+    test_df,
+    cols = "X1",
+    messiness = 1,
+    sep_out = "-"
+  )
+  expect_identical(result_X1$X1, c("a-b", "c-d", "e-f", "g-h", "i-j"))
+  expect_identical(result_X1$X2, test_df$X2)
+  expect_identical(result_X1$X3, test_df$X3)
+})
+
+test_that("change_separators leaves data unchanged when messiness is 0", {
+  test_df <- separator_test_df()
+  expect_identical(change_separators(test_df, messiness = 0), test_df)
+})
+
+test_that("change_separators rejects messiness outside 0-1", {
+  test_df <- separator_test_df()
+  expect_error(change_separators(test_df, messiness = -0.01))
+  expect_error(change_separators(test_df, messiness = 1.5))
+})
+
+test_that("change_separators rejects unknown column names", {
+  test_df <- separator_test_df()
+  expect_error(change_separators(test_df, cols = "test_col3"))
+})
+
+test_that("change_separators keeps strings without separators unchanged", {
+  test_df_noseparators <- data.frame(X1 = c("ab", "cd", "ef"))
+  result_noseparators <- change_separators(test_df_noseparators, messiness = 1)
+  expect_identical(result_noseparators, test_df_noseparators)
+})