Merged
1 change: 1 addition & 0 deletions NAMESPACE
@@ -259,6 +259,7 @@ importFrom(stats,na.omit)
importFrom(stringi,stri_extract_all_regex)
importFrom(stringi,stri_replace_all_regex)
importFrom(stringr,coll)
+ importFrom(stringr,regex)
importFrom(stringr,str_count)
importFrom(stringr,str_detect)
importFrom(stringr,str_glue)
43 changes: 17 additions & 26 deletions R/cleanup.R
@@ -164,9 +164,10 @@ removeEmptyRows <- function(prot, by_column = "DomArch") {
prot <- prot %>%
as_tibble() %>%
# filter(grepl("\\*", {{by_column}})) %>% # Keep only rows with Query (*) for GenContext
filter(!grepl("^-$", {{ by_column }})) %>% # remove "-"
filter(!grepl("^NA$", {{ by_column }})) %>% # remove "NA"
filter(!grepl("^$", {{ by_column }})) # remove empty rows
filter(!grepl("^-$", .[[by_column]])) %>% # remove "-"
filter(!grepl("^NA$", .[[by_column]])) %>% # remove "NA"
filter(!grepl("^$", .[[by_column]])) %>% # remove empty rows
filter(!grepl("^\\s*$", .[[by_column]])) # remove rows with only spaces

return(prot)
}
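
Note on the hunk above: because `by_column` is a character argument, the embraced `{{ by_column }}` inside `grepl()` appears to compare against the literal column name rather than its values, so the old filters were effectively no-ops; indexing the piped data with `.[[by_column]]` tests the actual column, and a fourth filter now drops whitespace-only entries. A minimal sketch of the new behaviour, with hypothetical `DomArch` values and the magrittr pipe assumed:

```r
library(dplyr)

# Hypothetical input; only the first and last rows should survive
prot <- tibble(DomArch = c("PspA", "-", "NA", "", "   ", "Toast_rack"))

prot %>%
    filter(!grepl("^-$", .[["DomArch"]])) %>%    # drop "-" placeholders
    filter(!grepl("^NA$", .[["DomArch"]])) %>%   # drop literal "NA"
    filter(!grepl("^$", .[["DomArch"]])) %>%     # drop empty strings
    filter(!grepl("^\\s*$", .[["DomArch"]]))     # drop whitespace-only strings
#> a 2-row tibble: DomArch = "PspA", "Toast_rack"
```

The `.` placeholder only exists with `%>%`; the same sketch would not work as written with the native `|>` pipe.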
@@ -191,7 +192,7 @@ removeEmptyRows <- function(prot, by_column = "DomArch") {
#' @export
#'
#' @importFrom dplyr pull mutate
- #' @importFrom stringr str_replace_all
+ #' @importFrom stringr str_replace_all regex
#' @importFrom rlang .data :=
#'
#' @examples
@@ -201,29 +202,19 @@ removeEmptyRows <- function(prot, by_column = "DomArch") {
condenseRepeatedDomains <- function(prot, by_column = "DomArch", excluded_prots = c()) {
# If there are strings that condenseRepeatedDomains should not affect, the pattern to search
# for must be changed to exclude a search for those desired strings

- collapsed_prots <- paste0(excluded_prots, collapse = "\\s|")
- regex_exclude <- paste0("(?!", collapsed_prots, "\\s)")
- regex_identify_repeats <- paste0("(?i)", regex_exclude, "\\b([a-z0-9_-]+)\\b(?:\\s+\\1\\b)+")
-
- # !! FUNS is soft-deprecated. FIX!!!
- prot <- prot %>%
- dplyr::mutate(!!by_column := stringr::str_replace_all(
- .data[[by_column]],
- c(
- "\\." = "_d_",
- " " = "_",
- "\\+" = " ",
- "-" = "__",
- regex_identify_repeats = "\\1(s)",
- "__" = "-",
- " " = "+",
- "_d_" = "."
- )
- ))
+ collapsed_prots <- paste0(excluded_prots, collapse = "|")
+ regex_exclude <- if (length(excluded_prots)) paste0("(?!", collapsed_prots, "\\b)") else ""
+
+ # Allow + or space (or combinations) as delimiters
+ regex_identify_repeats <- paste0("(?i)", regex_exclude, "\\b([A-Za-z0-9_-]+)\\b(?:[+\\s]+\\1\\b)+")
+
+ prot <-
+ prot %>%
+ mutate(
+ !!by_column := str_replace_all(.data[[by_column]], regex(regex_identify_repeats), "\\1(s)")
+ )

return(prot)

}
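
A quick sketch of how the rebuilt pattern behaves (the domain names below are hypothetical). The exclusion list is turned into a negative lookahead checked at each candidate match, `[+\s]+` accepts `+`, spaces, or a mix as delimiters, and `(?i)` with the `\1` backreference collapses case-insensitive runs of the same domain into `name(s)`:

```r
library(stringr)

excluded_prots <- c("DUF123")  # hypothetical domain to leave untouched
collapsed_prots <- paste0(excluded_prots, collapse = "|")
regex_exclude <- if (length(excluded_prots)) paste0("(?!", collapsed_prots, "\\b)") else ""
regex_identify_repeats <- paste0(
    "(?i)", regex_exclude,
    "\\b([A-Za-z0-9_-]+)\\b(?:[+\\s]+\\1\\b)+"
)

str_replace_all("PspA+PspA+PspA+DUF123+DUF123",
                regex(regex_identify_repeats), "\\1(s)")
#> "PspA(s)+DUF123+DUF123"   repeats condensed, excluded domain left alone

str_replace_all("Toast_rack toast_rack SpoVG",
                regex(regex_identify_repeats), "\\1(s)")
#> "Toast_rack(s) SpoVG"     space-delimited, case-insensitive repeat
```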


@@ -731,7 +722,7 @@ selectLongestDuplicate <- function(prot, column) {
# grab all the longest rows
unique_dups <- prot %>%
filter(!.data$row.orig %in% remove_rows) %>%
- select(-.data$row.orig)
+ select(-"row.orig")

return(unique_dups)
}
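
The one-line `select()` change above tracks tidyselect's deprecation of the `.data` pronoun inside selection verbs: quote the column name (or wrap it in `all_of()`) instead, while keeping `.data[[...]]` for data-masking verbs such as `filter()` and `mutate()`. A small sketch with hypothetical columns:

```r
library(dplyr)

df <- tibble(row.orig = 1:3, AccNum = c("P1", "P2", "P3"))  # hypothetical data

df %>% select(-"row.orig")           # drop by quoted name, replacing -.data$row.orig
df %>% select(-all_of("row.orig"))   # equivalent, explicit character-vector selection
```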
2 changes: 1 addition & 1 deletion R/fa2domain.R
@@ -185,7 +185,7 @@ createIPRScanDomainTable <- function(
id_domain = stringr::str_glue("{AccNum}-{DB.ID}-{StartLoc}_{StopLoc}")
) |>
dplyr::ungroup() |>
- dplyr::relocate(.data$id_domain, .before = 1)
+ dplyr::relocate('id_domain', .before = 1)
return(df_iprscan_domains)
}
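
The `relocate()` edit is the same tidyselect fix in a different verb: a quoted column name (or `all_of()`) replaces the deprecated `.data$id_domain`. A tiny sketch with a made-up row:

```r
library(dplyr)

df <- tibble(AccNum = "P1", id_domain = "P1-PF00001-1_50")  # hypothetical values

df |> relocate("id_domain", .before = 1)          # string selection, no .data pronoun
df |> relocate(all_of("id_domain"), .before = 1)  # equivalent alternative
```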
