Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .lintr
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@ linters: linters_with_defaults(
T_and_F_symbol_linter(),
assignment_linter(),
object_name_linter = NULL,
brace_linter = NULL
brace_linter = NULL,
return_linter = NULL
)
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: messy
Title: Create Messy Data from Clean Data Frames
Version: 0.1.0.9000
Version: 0.1.0.9001
Authors@R: c(
person(given = "Nicola", family = "Rennie", role = c("aut", "cre", "cph"),
email = "nrennie35@gmail.com", comment = c(ORCID = "0000-0003-4797-557X")))
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
export(add_special_chars)
export(add_whitespace)
export(change_case)
export(change_separators)
export(duplicate_rows)
export(make_missing)
export(messy)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# messy (development version)

* Add `change_separators()` function

# messy 0.1.0

* CRAN release
Expand Down
82 changes: 82 additions & 0 deletions R/change_separators.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#' Change separators
#'
#' Randomly swap the separators found in character strings for
#' other separators.
#' @param data input dataframe
#' @param cols set of columns to apply transformation to. If `NULL`
#' will apply to all character columns. Default `NULL`.
#' @param messiness Percentage of values to change. Must be
#' between 0 and 1. Default 0.1.
#' @param sep_in A single value, or vector, or list of what is considered
#' a separator in the input data. Default `c("-", "_", " ", " ")`.
#' @param sep_out A single value, or vector, or list of what the separators
#' may be randomly replaced with. Default `c("-", "_", " ", " ")`.
#' @return a dataframe the same size as the input data.
#' @export
#' @examples
#' change_separators(mtcars)
change_separators <- function(data,
                              cols = NULL,
                              messiness = 0.1,
                              sep_in = c("-", "_", " ", " "),
                              sep_out = c("-", "_", " ", " ")) {
  # Guard clauses: reject bad arguments before touching the data
  if (messiness < 0 || messiness > 1) {
    stop("'messiness' must be between 0 and 1")
  }
  cols_are_valid <- is.null(cols) || all(cols %in% colnames(data))
  if (!cols_are_valid) {
    stop("All elements of 'cols' must be a column name in 'data'")
  }

  # Column-wise transformation shared by both selection modes
  swap_separators <- function(x) {
    change_sep(
      x,
      messiness = messiness,
      sep_in = sep_in,
      sep_out = sep_out
    )
  }

  if (is.null(cols)) {
    # No columns requested: transform every character column
    output <- dplyr::mutate(
      data,
      dplyr::across(dplyr::where(is.character), swap_separators)
    )
  } else {
    # Transform exactly the requested columns
    output <- dplyr::mutate(
      data,
      dplyr::across(dplyr::all_of(cols), swap_separators)
    )
  }
  output
}


#' Function to change separators
#'
#' Locates every occurrence of a separator from `sep_in` in `x` and,
#' independently for each occurrence, replaces it with probability
#' `messiness` by a separator drawn at random from `sep_out`.
#'
#' @param x Character vector
#' @param messiness Percentage of values to change. Must be
#' between 0 and 1. Default 0.1.
#' @param sep_in A single value, or vector, or list of what is considered
#' a separator in the input data. Default `c("-", "_", " ", " ")`.
#' @param sep_out A single value, or vector, or list of what the separators
#' may be randomly replaced with. Default `c("-", "_", " ", " ")`.
#' @return Messy character vector
#' @noRd
change_sep <- function(x,
                       messiness = 0.1,
                       sep_in = c("-", "_", " ", " "),
                       sep_out = c("-", "_", " ", " ")) {
  # Flatten so that list input (permitted by the documentation) behaves
  # exactly like the equivalent character vector.
  sep_in <- as.character(unlist(sep_in, use.names = FALSE))
  sep_out <- as.character(unlist(sep_out, use.names = FALSE))

  # Put the longest separators first in the alternation so that a
  # multi-character separator is tried before any shorter one.
  longest_first <- order(nchar(sep_in), decreasing = TRUE)
  sep_in_escaped <- stringr::str_escape(sep_in)[longest_first]
  pattern <- paste0("(", paste(sep_in_escaped, collapse = "|"), ")")

  # A function passed as `replacement` to stringr::str_replace_all() is
  # handed a character vector of matches and must return a character
  # vector of the same length, so the random decision is taken separately
  # for every match rather than once per call.
  replace_match <- function(match) {
    change <- !is.na(match) & stats::runif(length(match)) < messiness
    picks <- sample.int(length(sep_out), sum(change), replace = TRUE)
    match[change] <- sep_out[picks]
    match
  }
  stringr::str_replace_all(x, pattern, replace_match)
}
12 changes: 9 additions & 3 deletions R/messy.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,25 @@
#' Default `NA`.
#' @param case_type Whether the case should change based on
#' the `"word"` or `"letter"`.
#' @param sep_in A single value, or vector, or list of what is considered
#' a separator in the input data. Default `c("-", "_", " ", " ")`.
#' @param sep_out A single value, or vector, or list of what the separators
#' may be randomly replaced with. Default `c("-", "_", " ", " ")`.
#' @return a dataframe the same size as the input data.
#' @export
#' @examples
#' messy(mtcars)

messy <- function(data,
messiness = 0.1,
missing = NA,
case_type = "word") {
case_type = "word",
sep_in = c("-", "_", " ", " "),
sep_out = c("-", "_", " ", " ")) {
output <- data |>
add_special_chars(messiness = messiness) |>
add_whitespace(messiness = messiness) |>
make_missing(messiness = messiness, missing = missing) |>
change_case(messiness = messiness, case_type = case_type)
change_case(messiness = messiness, case_type = case_type) |>
change_separators(messiness = messiness, sep_in = sep_in, sep_out = sep_out)
return(output)
}
39 changes: 39 additions & 0 deletions man/change_separators.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 14 additions & 1 deletion man/messy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions messy.Rproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Version: 1.0
ProjectId: 2385395b-4ccc-4ad0-85e9-8f36dabcd033

RestoreWorkspace: No
SaveWorkspace: No
Expand Down
26 changes: 26 additions & 0 deletions tests/testthat/test-change_separators.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
test_that("change_separators works ", {
  # Test data: two character columns whose values each contain exactly one
  # separator, plus an integer column that must never be modified
  test_df <- data.frame(
    X1 = c("a b", "c_d", "e f", "g_h", "i j"),
    X2 = c("k l", "m n", "o_p", "q r", "s_t"),
    X3 = 1:5,
    stringsAsFactors = FALSE
  )

  # when cols argument is used, columns outside `cols` are left untouched
  result_X1 <- change_separators(test_df, cols = "X1", messiness = 1)
  expect_identical(result_X1$X2, test_df$X2)
  expect_identical(result_X1$X3, test_df$X3)

  # messiness = 1 with a single output separator is deterministic:
  # every separator in the selected column must be replaced
  result_plus <- change_separators(
    test_df,
    cols = "X1",
    messiness = 1,
    sep_out = "+"
  )
  expect_identical(result_plus$X1, c("a+b", "c+d", "e+f", "g+h", "i+j"))
  expect_identical(result_plus$X2, test_df$X2)

  # messiness = 0 never replaces anything
  expect_identical(change_separators(test_df, messiness = 0), test_df)

  # messiness must be a value between 0-1
  expect_error(change_separators(test_df, messiness = -0.01))
  expect_error(change_separators(test_df, messiness = 1.5))

  # invalid column names for cols
  expect_error(change_separators(test_df, cols = "test_col3"))

  # when strings remain unchanged
  test_df_noseparators <- data.frame(X1 = c("ab", "cd", "ef"))
  result_noseparators <- change_separators(test_df_noseparators, messiness = 1)
  expect_identical(result_noseparators, test_df_noseparators)
})