Skip to content

Commit e0a8b1b

Browse files
authored
Merge pull request #77 from awasyn/test-cleanup
add unit tests to cleanup.R
2 parents c7d2671 + 3e58486 commit e0a8b1b

File tree

4 files changed

+621
-27
lines changed

4 files changed

+621
-27
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -265,6 +265,7 @@ importFrom(stats,na.omit)
265265
importFrom(stringi,stri_extract_all_regex)
266266
importFrom(stringi,stri_replace_all_regex)
267267
importFrom(stringr,coll)
268+
importFrom(stringr,regex)
268269
importFrom(stringr,str_count)
269270
importFrom(stringr,str_detect)
270271
importFrom(stringr,str_glue)

R/cleanup.R

Lines changed: 17 additions & 26 deletions
Original file line number · Diff line number · Diff line change
@@ -164,9 +164,10 @@ removeEmptyRows <- function(prot, by_column = "DomArch") {
164164
prot <- prot %>%
165165
as_tibble() %>%
166166
# filter(grepl("\\*", {{by_column}})) %>% # Keep only rows with Query (*) for GenContext
167-
filter(!grepl("^-$", {{ by_column }})) %>% # remove "-"
168-
filter(!grepl("^NA$", {{ by_column }})) %>% # remove "NA"
169-
filter(!grepl("^$", {{ by_column }})) # remove empty rows
167+
filter(!grepl("^-$", .[[by_column]])) %>% # remove "-"
168+
filter(!grepl("^NA$", .[[by_column]])) %>% # remove "NA"
169+
filter(!grepl("^$", .[[by_column]])) %>% # remove empty rows
170+
filter(!grepl("^\\s*$", .[[by_column]])) # remove rows with only spaces
170171

171172
return(prot)
172173
}
@@ -191,7 +192,7 @@ removeEmptyRows <- function(prot, by_column = "DomArch") {
191192
#' @export
192193
#'
193194
#' @importFrom dplyr pull mutate
194-
#' @importFrom stringr str_replace_all
195+
#' @importFrom stringr str_replace_all regex
195196
#' @importFrom rlang .data :=
196197
#'
197198
#' @examples
@@ -201,29 +202,19 @@ removeEmptyRows <- function(prot, by_column = "DomArch") {
201202
condenseRepeatedDomains <- function(prot, by_column = "DomArch", excluded_prots = c()) {
202203
# If there are strings that condenseRepeatedDomains should not affect, the pattern to search
203204
# for must be changed to exclude a search for those desired strings
204-
205-
collapsed_prots <- paste0(excluded_prots, collapse = "\\s|")
206-
regex_exclude <- paste0("(?!", collapsed_prots, "\\s)")
207-
regex_identify_repeats <- paste0("(?i)", regex_exclude, "\\b([a-z0-9_-]+)\\b(?:\\s+\\1\\b)+")
208-
209-
# !! FUNS is soft-deprecated. FIX!!!
210-
prot <- prot %>%
211-
dplyr::mutate(!!by_column := stringr::str_replace_all(
212-
.data[[by_column]],
213-
c(
214-
"\\." = "_d_",
215-
" " = "_",
216-
"\\+" = " ",
217-
"-" = "__",
218-
regex_identify_repeats = "\\1(s)",
219-
"__" = "-",
220-
" " = "+",
221-
"_d_" = "."
222-
)
223-
))
205+
collapsed_prots <- paste0(excluded_prots, collapse = "|")
206+
regex_exclude <- if (length(excluded_prots)) paste0("(?!", collapsed_prots, "\\b)") else ""
207+
208+
# Allow + or space (or combinations) as delimiters
209+
regex_identify_repeats <- paste0("(?i)", regex_exclude, "\\b([A-Za-z0-9_-]+)\\b(?:[+\\s]+\\1\\b)+")
210+
211+
prot <-
212+
prot %>%
213+
mutate(
214+
!!by_column := str_replace_all(.data[[by_column]], regex(regex_identify_repeats), "\\1(s)")
215+
)
224216

225217
return(prot)
226-
227218
}
228219

229220

@@ -731,7 +722,7 @@ selectLongestDuplicate <- function(prot, column) {
731722
# grab all the longest rows
732723
unique_dups <- prot %>%
733724
filter(!.data$row.orig %in% remove_rows) %>%
734-
select(-.data$row.orig)
725+
select(-"row.orig")
735726

736727
return(unique_dups)
737728
}

R/fa2domain.R

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -185,7 +185,7 @@ createIPRScanDomainTable <- function(
185185
id_domain = stringr::str_glue("{AccNum}-{DB.ID}-{StartLoc}_{StopLoc}")
186186
) |>
187187
dplyr::ungroup() |>
188-
dplyr::relocate(.data$id_domain, .before = 1)
188+
dplyr::relocate('id_domain', .before = 1)
189189
return(df_iprscan_domains)
190190
}
191191

0 commit comments

Comments (0)