Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
1cca0f1
Add error catching for read_sift
willdrysdale Apr 24, 2023
4e93406
Merge pull request #1 from willdrysdale/main
willdrysdale May 26, 2023
3ab38f5
Update readme installation instructions
willdrysdale Nov 20, 2023
59fae8b
fixed bug in read_proc_sift
Heisenberg4322 Jul 9, 2024
909b338
Merge pull request #2 from wacl-york/hotfix/processed-sift-fix
Heisenberg4322 Jul 9, 2024
2890684
Create add_date_sift.R
Heisenberg4322 Jul 9, 2024
3354cd0
Merge pull request #3 from wacl-york/feature-sift_date
Heisenberg4322 Jul 9, 2024
5b9597e
Minor bug fix
Heisenberg4322 Jul 10, 2024
c1f1de1
Merge pull request #4 from wacl-york/hotfix-add_date_sift
Heisenberg4322 Jul 10, 2024
28560ff
Update add_date_sift.R
Heisenberg4322 Jul 10, 2024
a00bc89
Merge pull request #5 from wacl-york/hotfix-add_date_sift-2
Heisenberg4322 Jul 10, 2024
bc470c9
Create read_fullscan_sift.R
Heisenberg4322 Jul 10, 2024
17418ab
Merge pull request #6 from wacl-york/feature-read_fullscan_sift
Heisenberg4322 Jul 10, 2024
615cebb
documents update
Heisenberg4322 Jul 10, 2024
c30a3d6
Update add_date_sift.R
Heisenberg4322 Jul 10, 2024
c50c03f
Merge pull request #7 from wacl-york/hotfix-add_date_sift-3
Heisenberg4322 Jul 10, 2024
2e40311
description/namespace update for read_fullscan_sift
Heisenberg4322 Jul 10, 2024
52c3b03
add_date_sift update
Heisenberg4322 Jul 10, 2024
5f00980
Merge pull request #8 from wacl-york/hotfix-add_date_sift-4
Heisenberg4322 Jul 10, 2024
c69c7af
Update read_proc_sift.R
Heisenberg4322 Jul 11, 2024
69c4069
Merge pull request #9 from wacl-york/hotfix-read_proc_sift
Heisenberg4322 Jul 11, 2024
a1400f2
Update read_proc_sift.R
Heisenberg4322 Jul 11, 2024
33ff75f
Merge pull request #10 from wacl-york/hotfix-read_proc_sift
Heisenberg4322 Jul 11, 2024
a8baf99
Update read_proc_sift.R
Heisenberg4322 Jul 11, 2024
f94073c
Merge pull request #11 from wacl-york/hotfix-read_proc_sift
Heisenberg4322 Jul 11, 2024
767bc8b
Replaced the read_proc_sift function which was malfunctioning
Heisenberg4322 Aug 22, 2024
ba56650
Merge pull request #12 from wacl-york/update-read_proc_sift
Heisenberg4322 Aug 22, 2024
da5eec3
New function to process processed SIFT data output from read_proc_sift
Heisenberg4322 Aug 23, 2024
1bee722
Merge pull request #13 from wacl-york/feature-filter_proc_sift
Heisenberg4322 Aug 23, 2024
cd8c787
Update filter_proc_sift.R
Heisenberg4322 Aug 23, 2024
8bf6515
Merge pull request #14 from wacl-york/hotfix-filter_proc_sift
Heisenberg4322 Aug 23, 2024
b0c8eea
Update filter_proc_sift.R
Heisenberg4322 Aug 23, 2024
36689db
documentation update
Heisenberg4322 Aug 23, 2024
e406394
Merge pull request #15 from wacl-york/hotfix-filter_proc_sift2
Heisenberg4322 Aug 23, 2024
1d21ee8
added warning to catch and remove duplicate data
Heisenberg4322 Aug 27, 2024
10beb8c
Merge pull request #16 from wacl-york/hotfix-filter_proc_sift3
Heisenberg4322 Aug 27, 2024
c28d026
Feature - new function to validate fullscan SIFT data as quantitative
Heisenberg4322 Sep 18, 2024
46180ad
Merge pull request #17 from wacl-york/feature-validate_as_quantitativ…
Heisenberg4322 Sep 18, 2024
e6d7011
change of two column names for easier use with scripts
Heisenberg4322 Sep 18, 2024
724842d
change of two column names
Heisenberg4322 Sep 18, 2024
edec000
Merge pull request #18 from wacl-york/hotfix-validate_as_quanititativ…
Heisenberg4322 Sep 18, 2024
4a85434
Update validate_as_quantitative_sift.R
Heisenberg4322 Sep 18, 2024
946a131
Merge pull request #19 from wacl-york/hotfix-validate_as_quantitative…
Heisenberg4322 Sep 18, 2024
1a76c22
Update validate_as_quantitative_sift.R
Heisenberg4322 Sep 18, 2024
93310c9
Merge pull request #20 from wacl-york/hotfix-validate_as_quantitative…
Heisenberg4322 Sep 18, 2024
cbd364b
new functions for reading fullscan xml files
Heisenberg4322 Dec 19, 2024
8ff111b
Merge pull request #21 from wacl-york/feature-read_fullscan_xml_sift
Heisenberg4322 Dec 19, 2024
34fb607
function update
Heisenberg4322 Dec 19, 2024
ac98ec1
Merge pull request #22 from wacl-york/hotfix-add_date_sift
Heisenberg4322 Dec 19, 2024
41867b1
Update read_fullscan_xml_sift.R
Heisenberg4322 Jan 2, 2025
9daad10
Merge pull request #23 from wacl-york/hotfix-read_fullscan_xml_sift
Heisenberg4322 Jan 2, 2025
be74f05
Update filter_proc_sift.R
Heisenberg4322 Feb 17, 2025
eb613ce
Merge pull request #24 from wacl-york/hotfix-filter_proc_sift
Heisenberg4322 Feb 17, 2025
060f1ce
Update read_fullscan_xml_sift.R
Heisenberg4322 Feb 17, 2025
742401c
Update read_fullscan_xml_sift.R
Heisenberg4322 Apr 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,17 @@ License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
Imports:
Imports:
dplyr,
janitor,
lubridate,
magrittr,
purrr,
readr,
rlang,
stringr,
tibble,
tidyr
RoxygenNote: 7.2.1
tidyr,
utils,
xml2
RoxygenNote: 7.3.0
9 changes: 9 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(add_date_sift)
export(filter_proc_sift)
export(read_fullscan_sift)
export(read_fullscan_xml_sift)
export(read_many_fullscan_xml_sift)
export(read_many_proc_sift)
export(read_many_sift)
export(read_many_sift2)
export(read_proc_sift)
export(read_sift)
export(validate_as_quantitative_sift)
import(dplyr)
import(lubridate)
import(rlang)
importFrom(magrittr,"%>%")
68 changes: 68 additions & 0 deletions R/add_date_sift.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#' Add Date SIFT Function
#'
#' @description Adds a date column (dttm) to SIFT data by offsetting the scan
#'   start time (dttm) by the elapsed scan time (numeric). Optionally rounds
#'   the result to the nearest second and forces a time zone onto both the
#'   new date column and the start time column.
#'
#' @param data A data frame (from the other read_sift functions) containing
#'   the SIFT data to be processed.
#' @param time_units A character string giving the units of the elapsed-time
#'   column: "s" for seconds or "ms" for milliseconds. This argument is
#'   required - check the unstitched files to find which applies.
#' @param time_input A character string naming the elapsed-time column
#'   (e.g. "time_s" or "time_ms"). The column must be numeric.
#' @param start_time_input A character string naming the scan start time
#'   column. Defaults to "start_time".
#' @param force_tz A character string containing the time zone to force onto
#'   the date and start time columns, or `FALSE` (the default) to leave time
#'   zones untouched. For use in WACL, use "Europe/London".
#' @param output A character string naming the output date column. Defaults
#'   to "date".
#' @param round_date A logical value indicating whether to round the output
#'   to the nearest second. Defaults to TRUE.
#'
#' @return A data frame with the output column (default: `date`) added and
#'   moved to the first position, and time zones forced if requested.
#'
#' @import dplyr
#' @import lubridate
#' @import rlang
#'
#' @export
#'

add_date_sift <- function(data, time_units, time_input,
                          start_time_input = "start_time", force_tz = FALSE,
                          output = "date", round_date = TRUE) {

  # Validate arguments up front so failures are informative.
  if (!time_units %in% c("s", "ms")) {
    stop("Invalid value for 'time_units'. Please use 's' or 'ms'. Check unstitched files to find which to use.")
  }

  if (!(time_input %in% colnames(data))) {
    stop("The specified time_input column does not exist in the data.")
  }

  if (!(start_time_input %in% colnames(data))) {
    stop("The specified start_time_input column does not exist in the data.")
  }

  if (!is.numeric(data[[time_input]])) {
    stop("The time_input column must be numeric.")
  }

  # Normalise elapsed time to seconds: POSIXct arithmetic works in seconds.
  time_col <- if (time_units == "ms") data[[time_input]] / 1000 else data[[time_input]]

  data <- data %>%
    dplyr::mutate(
      !!rlang::sym(output) := .data[[start_time_input]] + time_col
    )

  if (round_date) {
    data <- data %>%
      dplyr::mutate(
        !!rlang::sym(output) := lubridate::round_date(.data[[output]], unit = "second")
      )
  }

  # isFALSE() is safer than the previous `force_tz != FALSE` comparison:
  # it cannot error or recycle when force_tz is NA or has length > 1, and
  # does not rely on implicit FALSE -> "FALSE" coercion against strings.
  if (!isFALSE(force_tz)) {
    data <- data %>%
      dplyr::mutate(
        !!rlang::sym(output) := lubridate::force_tz(.data[[output]], tzone = force_tz),
        !!rlang::sym(start_time_input) := lubridate::force_tz(.data[[start_time_input]], tzone = force_tz)
      )
  } else {
    warning("Timezone not specified. Set a timezone using the force_tz parameter. Use ?add_date_sift to see function documentation.")
  }

  # Put the new date column first for convenience.
  data <- data %>% dplyr::relocate(.data[[output]])

  return(data)
}
155 changes: 155 additions & 0 deletions R/filter_proc_sift.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#' Filter Processed SIFT-MS Data
#'
#' @description
#' This function filters processed SIFT-MS data outputted from the
#' read_proc_sift function. It selects one of six concentration tables,
#' optionally removes incomplete scans, parses compound identity strings
#' into separate columns, and converts all concentrations to a single unit.
#'
#' @param df A data frame outputted from the read_proc_sift function
#'   containing processed SIFT-MS data, which includes columns `table`,
#'   `time_s`, `compound_identity`, and `conc`.
#' @param table_type A character string indicating the type of concentration
#'   data to filter. Acceptable values: "analyte_conc", "conc_per_reagent",
#'   "conc_per_product", "raw_analyte_conc", "raw_conc_per_reagent", and
#'   "raw_conc_per_product".
#' @param remove_incomplete_scans A logical value. If TRUE (default), scans
#'   which are incomplete (any NA concentration within a scan) are removed.
#'   Incomplete scans can result when a scan is ended early.
#' @param conc_unit A character string specifying the desired concentration
#'   unit for conversion. Acceptable values are "ppm", "ppb", or "ppt".
#'
#' @return A data frame filtered to the requested table, with parsed
#'   compound/unit columns and concentrations converted to `conc_unit`.
#' @export

filter_proc_sift <- function(df, table_type, remove_incomplete_scans = TRUE, conc_unit) {

  # Validate the requested output unit before doing any work.
  valid_units <- c("ppm", "ppb", "ppt")
  if (!conc_unit %in% valid_units) {
    stop("Invalid conc_unit provided. Choose from 'ppm', 'ppb', or 'ppt'.")
  }

  # A scan is a (time_s, table) group; drop the whole group when any
  # concentration within it is missing.
  if (remove_incomplete_scans) {
    df <- df %>%
      group_by(time_s, table) %>%
      filter(!any(is.na(conc))) %>%
      ungroup()
  }

  # Split the colon-delimited table label into its three parts and extract
  # the run start timestamp (YYYYMMDD-HHMMSS) embedded in the table name.
  df <- df |>
    separate(col = table,
             into = c("table", "data_type", "table_type"),
             sep = ":") |>
    mutate(data_type = str_trim(data_type),
           table_type = str_trim(table_type),
           start_time = str_extract(table, "\\d{8}-\\d{6}") %>%
             ymd_hms(tz = "UTC", truncated = 3))

  # Map user-facing table_type keys to the labels used in the raw data.
  lookup <- list(
    analyte_conc = "Analyte concentrations",
    conc_per_reagent = "Analyte concentration per reagent ion",
    conc_per_product = "Analyte concentration per product ion",
    raw_analyte_conc = "Raw analyte concentrations",
    raw_conc_per_reagent = "Raw analyte concentration per reagent ion",
    raw_conc_per_product = "Raw analyte concentration per product ion"
  )

  if(!table_type %in% names(lookup)) {
    stop("Invalid table_type provided.")
  }

  filtered_df <- df[df$table_type == lookup[[table_type]], ]

  # Analyte tables: compound_identity looks like "name (unit)" with an
  # optional "(CAS)" - extract unit, CAS number and the bare compound name.
  if (table_type %in% c("analyte_conc", "raw_analyte_conc")) {
    filtered_df <- filtered_df %>%
      mutate(
        unit = str_extract(compound_identity, "(?<=\\()ppb|ppm|ppt(?=\\))"),
        cas_number = str_extract(compound_identity, "(?<=\\()\\d{1,7}-\\d{2}-\\d{1,2}(?=\\))"),
        compound = str_remove(compound_identity, "\\s*\\([^()]*\\)\\s*$") %>%
          str_remove_all("\\s*\\([^()]*\\)\\s*$") %>%
          str_trim()
      ) %>%
      select(start_time, table, table_type, time_s, compound_identity, compound, cas_number, conc, unit)
  }

  # Per-reagent tables: identity is "reagent_ion / compound ...". separate()
  # consumes compound_identity, so the original combined label is restored
  # afterwards from filtered_df (valid because the row count is unchanged).
  if (table_type %in% c("conc_per_reagent", "raw_conc_per_reagent")) {
    filtered_df <- filtered_df %>%
      separate(compound_identity, into = c("reagent_ion", "compound_identity_2"), sep = " / ") %>%
      mutate(
        reagent_ion = str_trim(reagent_ion, side = "right"),
        compound_identity_2 = str_trim(compound_identity_2, side = "left")
      ) %>%
      mutate(compound_identity = filtered_df$compound_identity) %>%
      mutate(
        unit = str_extract(compound_identity_2, "(?<=\\()ppb|ppm|ppt(?=\\))"),
        cas_number = str_extract(compound_identity_2, "(?<=\\()\\d{1,7}-\\d{2}-\\d{1,2}(?=\\))"),
        compound = str_remove(compound_identity_2, "\\s*\\([^()]*\\)\\s*$") %>%
          str_remove_all("\\s*\\([^()]*\\)\\s*$") %>%
          str_trim()
      ) %>%
      select(start_time, table, table_type, time_s, compound_identity, reagent_ion, compound, cas_number, conc, unit)
  }

  # Per-product tables: identity is "product [mass] / reagent / compound ..."
  # - additionally split the product ion from its bracketed mass.
  if (table_type %in% c("conc_per_product", "raw_conc_per_product")) {
    filtered_df <- filtered_df %>%
      separate(compound_identity, into = c("product_ion_2", "reagent_ion", "compound_identity_2"), sep = " / ") %>%
      mutate(compound_identity = filtered_df$compound_identity) %>%
      separate(product_ion_2, into = c("product_ion", "product_mass"), sep = "\\s+\\[", remove = FALSE) %>%
      mutate(
        product_mass = gsub("]", "", product_mass),
        product_mass = trimws(product_mass)
      ) %>%
      mutate(
        unit = str_extract(compound_identity_2, "(?<=\\()ppb|ppm|ppt(?=\\))"),
        cas_number = str_extract(compound_identity_2, "(?<=\\()\\d{1,7}-\\d{2}-\\d{1,2}(?=\\))"),
        compound = str_remove(compound_identity_2, "\\s*\\([^()]*\\)\\s*$") %>%
          str_remove_all("\\s*\\([^()]*\\)\\s*$") %>%
          str_trim()
      ) %>%
      select(start_time, table, table_type, time_s, compound_identity, reagent_ion, product_ion, product_mass, compound, cas_number, conc, unit)
  }

  # Unit conversion blocks. Each block converts the two non-target units to
  # the target in sequence; the order is safe because each if_else rewrites
  # `unit` as it converts, so a row is never converted twice.
  if (conc_unit == "ppb") {
    filtered_df <- filtered_df %>%
      mutate(
        conc = if_else(unit == "ppm", conc * 1000, conc),
        unit = if_else(unit == "ppm", "ppb", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppm\\)", "(ppb)"),
        conc = if_else(unit == "ppt", conc / 1000, conc),
        unit = if_else(unit == "ppt", "ppb", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppt\\)", "(ppb)")
      )
  }

  if (conc_unit == "ppm") {
    filtered_df <- filtered_df %>%
      mutate(
        conc = if_else(unit == "ppb", conc / 1000, conc),
        unit = if_else(unit == "ppb", "ppm", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppb\\)", "(ppm)"),
        conc = if_else(unit == "ppt", conc / 1000000, conc),
        unit = if_else(unit == "ppt", "ppm", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppt\\)", "(ppm)")
      )
  }

  if (conc_unit == "ppt") {
    filtered_df <- filtered_df %>%
      mutate(
        conc = if_else(unit == "ppb", conc * 1000, conc),
        unit = if_else(unit == "ppb", "ppt", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppb\\)", "(ppt)"),
        conc = if_else(unit == "ppm", conc * 1000000, conc),
        unit = if_else(unit == "ppm", "ppt", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppm\\)", "(ppt)")
      )
  }

  # Rows whose unit was not recognised (NA or unexpected) are left
  # unconverted - surface that to the user rather than failing silently.
  unique_units <- unique(filtered_df$unit)
  if (length(unique_units) > 1) {
    warning("Not all units are the same after conversion. Multiple units found: ", paste(unique_units, collapse = ", "))
  }

  return(filtered_df)
}
65 changes: 65 additions & 0 deletions R/read_fullscan_sift.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#' Read SIFT Full Scan Data
#'
#' @description Extracts the "Time vs Mass" table from a SIFT-MS .csv file.
#' Useful for fullscan SIFT-MS data, which is missing several tables usually
#' found in a SIM SIFT-MS file and therefore causes an error in read_sift.
#'
#' @param file The file path for the SIFT .csv file.
#' @param drop_prep Is the PREPARATION sub-table missing? TRUE/FALSE.
#' @param chatty \code{TRUE}/\code{FALSE}. Should the function communicate
#'   what it is doing? Useful for debugging. NOTE(review): currently not
#'   referenced in the body - retained for interface compatibility; confirm
#'   intended behaviour.
#' @param warn logical. If a section of the file cannot be read, should a
#'   warning or error be thrown? Default FALSE leads to an error being
#'   produced. For use with \code{read_many_sift()} to skip bad files.
#'   NOTE(review): currently not referenced in the body - confirm intended
#'   behaviour.
#'
#' @return A tibble containing the job start time alongside the processed
#'   time vs mass data.
#'
#' @export
#'

read_fullscan_sift <- function(file, drop_prep = FALSE, chatty = TRUE, warn = FALSE) {
  # Section headers that mark the start of each sub-table in the raw csv.
  flags <- c("Mass Vs Time", "Cycle vs Product", "Time vs Mass",
             "Detailed Compound Concentrations", "Analyte vs Time",
             "Summary,")

  # Read the whole file as lines, indexed so sections can be located by row.
  lines <- tibble::tibble(line = readLines(file)) %>%
    dplyr::mutate(start = dplyr::row_number())

  # Build a start/end row range for every section. A synthetic "start" row
  # at 0 captures the metadata header before the first flag; the last
  # section runs to the end of the file.
  start_ends <- lines %>%
    dplyr::filter(stringr::str_detect(line, paste(flags, collapse = "|"))) %>%
    rbind(tibble::tibble(line = "start", start = 0)) %>%
    dplyr::arrange(start) %>%
    dplyr::mutate(end = dplyr::lead(start)) %>%
    tidyr::replace_na(list(end = max(lines$start))) %>%
    dplyr::mutate(start = dplyr::if_else(line == "Summary,:", start + 1, start))

  # Read one section of the csv. The leading metadata section (start == 0)
  # has no header row, so it is read without column names.
  read_data <- function(start, end) {
    if (start == 0) {
      df <- suppressMessages(suppressWarnings(readr::read_csv(
        file, col_types = readr::cols(), na = c(":", " ", ""),
        skip = start, n_max = end - start, col_names = FALSE)))
    } else {
      df <- suppressMessages(suppressWarnings(readr::read_csv(
        file, col_types = readr::cols(), na = c(":", " ", ""),
        skip = start, n_max = end - (start + 2))))
    }
    return(df)
  }

  raw <- purrr::map2(.x = start_ends$start, .y = start_ends$end,
                     .f = ~read_data(start = .x, end = .y))

  # Offset into `raw` for the Time vs Mass table; shifted down by one when
  # the PREPARATION sub-table is absent.
  n <- 0

  # Metadata section: key/value pairs pivoted wide to pull the job start date.
  meta <- raw[[1]] %>%
    janitor::remove_empty(which = c("rows", "cols")) %>%
    tidyr::drop_na(X2) %>%
    dplyr::select(-X3) %>%
    tidyr::pivot_wider(names_from = "X1", values_from = "X2") %>%
    janitor::clean_names()
  start_time <- meta$job_start_date %>% lubridate::ymd_hms()

  # Time vs mass: reshape the per-ion columns (after the first 10) into long
  # format, one row per ion per timestamp.
  if (drop_prep) {
    n <- n - 1
  }
  time_vs_mass <- raw[[n + 6]] %>%
    janitor::remove_empty(which = c("rows", "cols")) %>%
    tidyr::pivot_longer(-(1:10), names_to = "ion") %>%
    janitor::clean_names()

  # Attach the job start time to every row (scalar recycled by cbind).
  cbind(start_time, time_vs_mass) %>% tibble::as_tibble()
}
Loading