Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
1cca0f1
Add error catching for read_sift
willdrysdale Apr 24, 2023
4e93406
Merge pull request #1 from willdrysdale/main
willdrysdale May 26, 2023
3ab38f5
Update readme installation instructions
willdrysdale Nov 20, 2023
59fae8b
fixed bug in read_proc_sift
Heisenberg4322 Jul 9, 2024
909b338
Merge pull request #2 from wacl-york/hotfix/processed-sift-fix
Heisenberg4322 Jul 9, 2024
2890684
Create add_date_sift.R
Heisenberg4322 Jul 9, 2024
3354cd0
Merge pull request #3 from wacl-york/feature-sift_date
Heisenberg4322 Jul 9, 2024
5b9597e
Minor bug fix
Heisenberg4322 Jul 10, 2024
c1f1de1
Merge pull request #4 from wacl-york/hotfix-add_date_sift
Heisenberg4322 Jul 10, 2024
28560ff
Update add_date_sift.R
Heisenberg4322 Jul 10, 2024
a00bc89
Merge pull request #5 from wacl-york/hotfix-add_date_sift-2
Heisenberg4322 Jul 10, 2024
bc470c9
Create read_fullscan_sift.R
Heisenberg4322 Jul 10, 2024
17418ab
Merge pull request #6 from wacl-york/feature-read_fullscan_sift
Heisenberg4322 Jul 10, 2024
615cebb
documents update
Heisenberg4322 Jul 10, 2024
c30a3d6
Update add_date_sift.R
Heisenberg4322 Jul 10, 2024
c50c03f
Merge pull request #7 from wacl-york/hotfix-add_date_sift-3
Heisenberg4322 Jul 10, 2024
2e40311
description/namespace update for read_fullscan_sift
Heisenberg4322 Jul 10, 2024
52c3b03
add_date_sift update
Heisenberg4322 Jul 10, 2024
5f00980
Merge pull request #8 from wacl-york/hotfix-add_date_sift-4
Heisenberg4322 Jul 10, 2024
c69c7af
Update read_proc_sift.R
Heisenberg4322 Jul 11, 2024
69c4069
Merge pull request #9 from wacl-york/hotfix-read_proc_sift
Heisenberg4322 Jul 11, 2024
a1400f2
Update read_proc_sift.R
Heisenberg4322 Jul 11, 2024
33ff75f
Merge pull request #10 from wacl-york/hotfix-read_proc_sift
Heisenberg4322 Jul 11, 2024
a8baf99
Update read_proc_sift.R
Heisenberg4322 Jul 11, 2024
f94073c
Merge pull request #11 from wacl-york/hotfix-read_proc_sift
Heisenberg4322 Jul 11, 2024
767bc8b
Replaced the read_proc_sift function which was malfunctioning
Heisenberg4322 Aug 22, 2024
ba56650
Merge pull request #12 from wacl-york/update-read_proc_sift
Heisenberg4322 Aug 22, 2024
da5eec3
New function to process processed SIFT data output from read_proc_sift
Heisenberg4322 Aug 23, 2024
1bee722
Merge pull request #13 from wacl-york/feature-filter_proc_sift
Heisenberg4322 Aug 23, 2024
cd8c787
Update filter_proc_sift.R
Heisenberg4322 Aug 23, 2024
8bf6515
Merge pull request #14 from wacl-york/hotfix-filter_proc_sift
Heisenberg4322 Aug 23, 2024
b0c8eea
Update filter_proc_sift.R
Heisenberg4322 Aug 23, 2024
36689db
documentation update
Heisenberg4322 Aug 23, 2024
e406394
Merge pull request #15 from wacl-york/hotfix-filter_proc_sift2
Heisenberg4322 Aug 23, 2024
1d21ee8
added warning to catch and remove duplicate data
Heisenberg4322 Aug 27, 2024
10beb8c
Merge pull request #16 from wacl-york/hotfix-filter_proc_sift3
Heisenberg4322 Aug 27, 2024
c28d026
Feature - new function to validate fullscan SIFT data as quantitative
Heisenberg4322 Sep 18, 2024
46180ad
Merge pull request #17 from wacl-york/feature-validate_as_quantitativ…
Heisenberg4322 Sep 18, 2024
e6d7011
change of two column names for easier use with scripts
Heisenberg4322 Sep 18, 2024
724842d
change of two column names
Heisenberg4322 Sep 18, 2024
edec000
Merge pull request #18 from wacl-york/hotfix-validate_as_quanititativ…
Heisenberg4322 Sep 18, 2024
4a85434
Update validate_as_quantitative_sift.R
Heisenberg4322 Sep 18, 2024
946a131
Merge pull request #19 from wacl-york/hotfix-validate_as_quantitative…
Heisenberg4322 Sep 18, 2024
1a76c22
Update validate_as_quantitative_sift.R
Heisenberg4322 Sep 18, 2024
93310c9
Merge pull request #20 from wacl-york/hotfix-validate_as_quantitative…
Heisenberg4322 Sep 18, 2024
cbd364b
new functions for reading fullscan xml files
Heisenberg4322 Dec 19, 2024
8ff111b
Merge pull request #21 from wacl-york/feature-read_fullscan_xml_sift
Heisenberg4322 Dec 19, 2024
34fb607
function update
Heisenberg4322 Dec 19, 2024
ac98ec1
Merge pull request #22 from wacl-york/hotfix-add_date_sift
Heisenberg4322 Dec 19, 2024
41867b1
Update read_fullscan_xml_sift.R
Heisenberg4322 Jan 2, 2025
9daad10
Merge pull request #23 from wacl-york/hotfix-read_fullscan_xml_sift
Heisenberg4322 Jan 2, 2025
be74f05
Update filter_proc_sift.R
Heisenberg4322 Feb 17, 2025
eb613ce
Merge pull request #24 from wacl-york/hotfix-filter_proc_sift
Heisenberg4322 Feb 17, 2025
060f1ce
Update read_fullscan_xml_sift.R
Heisenberg4322 Feb 17, 2025
742401c
Update read_fullscan_xml_sift.R
Heisenberg4322 Apr 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,17 @@ License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
Imports:
Imports:
dplyr,
janitor,
lubridate,
magrittr,
purrr,
readr,
rlang,
stringr,
tibble,
tidyr
RoxygenNote: 7.2.1
tidyr,
utils,
xml2
RoxygenNote: 7.3.0
9 changes: 9 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(add_date_sift)
export(filter_proc_sift)
export(read_fullscan_sift)
export(read_fullscan_xml_sift)
export(read_many_fullscan_xml_sift)
export(read_many_proc_sift)
export(read_many_sift)
export(read_many_sift2)
export(read_proc_sift)
export(read_sift)
export(validate_as_quantitative_sift)
import(dplyr)
import(lubridate)
import(rlang)
importFrom(magrittr,"%>%")
68 changes: 68 additions & 0 deletions R/add_date_sift.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#' Add Date SIFT Function
#'
#' @description Adds a date column (dttm) to SIFT data by offsetting the scan
#'   start time (dttm) by the elapsed scan time (numeric). Optionally rounds
#'   the result to the nearest second and forces a time zone onto both the
#'   new date column and the start time column.
#'
#' @param data A data frame (from the other read_sift functions) containing
#'   the SIFT data to be processed.
#' @param time_units A character string giving the units of the elapsed-time
#'   column: "s" for seconds or "ms" for milliseconds. This argument is
#'   required - check the unstitched files to find which applies.
#' @param time_input A character string naming the elapsed-time column
#'   (e.g. "time_s" or "time_ms"). The column must be numeric.
#' @param start_time_input A character string naming the scan start time
#'   column. Defaults to "start_time".
#' @param force_tz A character string containing the time zone to force onto
#'   the date and start time columns, or `FALSE` (the default) to leave time
#'   zones untouched. For use in WACL, use "Europe/London".
#' @param output A character string naming the output date column. Defaults
#'   to "date".
#' @param round_date A logical value indicating whether to round the output
#'   to the nearest second. Defaults to TRUE.
#'
#' @return A data frame with the output column (default: `date`) added and
#'   moved to the first position, and time zones forced if requested.
#'
#' @import dplyr
#' @import lubridate
#' @import rlang
#'
#' @export
#'

add_date_sift <- function(data, time_units, time_input,
                          start_time_input = "start_time", force_tz = FALSE,
                          output = "date", round_date = TRUE) {

  # Validate arguments up front so failures are informative.
  if (!time_units %in% c("s", "ms")) {
    stop("Invalid value for 'time_units'. Please use 's' or 'ms'. Check unstitched files to find which to use.")
  }

  if (!(time_input %in% colnames(data))) {
    stop("The specified time_input column does not exist in the data.")
  }

  if (!(start_time_input %in% colnames(data))) {
    stop("The specified start_time_input column does not exist in the data.")
  }

  if (!is.numeric(data[[time_input]])) {
    stop("The time_input column must be numeric.")
  }

  # Normalise elapsed time to seconds: POSIXct arithmetic works in seconds.
  time_col <- if (time_units == "ms") data[[time_input]] / 1000 else data[[time_input]]

  data <- data %>%
    dplyr::mutate(
      !!rlang::sym(output) := .data[[start_time_input]] + time_col
    )

  if (round_date) {
    data <- data %>%
      dplyr::mutate(
        !!rlang::sym(output) := lubridate::round_date(.data[[output]], unit = "second")
      )
  }

  # isFALSE() is safer than the previous `force_tz != FALSE` comparison:
  # it cannot error or recycle when force_tz is NA or has length > 1, and
  # does not rely on implicit FALSE -> "FALSE" coercion against strings.
  if (!isFALSE(force_tz)) {
    data <- data %>%
      dplyr::mutate(
        !!rlang::sym(output) := lubridate::force_tz(.data[[output]], tzone = force_tz),
        !!rlang::sym(start_time_input) := lubridate::force_tz(.data[[start_time_input]], tzone = force_tz)
      )
  } else {
    warning("Timezone not specified. Set a timezone using the force_tz parameter. Use ?add_date_sift to see function documentation.")
  }

  # Put the new date column first for convenience.
  data <- data %>% dplyr::relocate(.data[[output]])

  return(data)
}
155 changes: 155 additions & 0 deletions R/filter_proc_sift.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#' Filter Processed SIFT-MS Data
#'
#' @description
#' This function filters processed SIFT-MS data outputted from the
#' read_proc_sift function. It selects one of six concentration tables,
#' optionally removes incomplete scans, parses compound identity strings
#' into separate columns, and converts all concentrations to a single unit.
#'
#' @param df A data frame outputted from the read_proc_sift function
#'   containing processed SIFT-MS data, which includes columns `table`,
#'   `time_s`, `compound_identity`, and `conc`.
#' @param table_type A character string indicating the type of concentration
#'   data to filter. Acceptable values: "analyte_conc", "conc_per_reagent",
#'   "conc_per_product", "raw_analyte_conc", "raw_conc_per_reagent", and
#'   "raw_conc_per_product".
#' @param remove_incomplete_scans A logical value. If TRUE (default), scans
#'   which are incomplete (any NA concentration within a scan) are removed.
#'   Incomplete scans can result when a scan is ended early.
#' @param conc_unit A character string specifying the desired concentration
#'   unit for conversion. Acceptable values are "ppm", "ppb", or "ppt".
#'
#' @return A data frame filtered to the requested table, with parsed
#'   compound/unit columns and concentrations converted to `conc_unit`.
#' @export

filter_proc_sift <- function(df, table_type, remove_incomplete_scans = TRUE, conc_unit) {

  # Validate the requested output unit before doing any work.
  valid_units <- c("ppm", "ppb", "ppt")
  if (!conc_unit %in% valid_units) {
    stop("Invalid conc_unit provided. Choose from 'ppm', 'ppb', or 'ppt'.")
  }

  # A scan is a (time_s, table) group; drop the whole group when any
  # concentration within it is missing.
  if (remove_incomplete_scans) {
    df <- df %>%
      group_by(time_s, table) %>%
      filter(!any(is.na(conc))) %>%
      ungroup()
  }

  # Split the colon-delimited table label into its three parts and extract
  # the run start timestamp (YYYYMMDD-HHMMSS) embedded in the table name.
  df <- df |>
    separate(col = table,
             into = c("table", "data_type", "table_type"),
             sep = ":") |>
    mutate(data_type = str_trim(data_type),
           table_type = str_trim(table_type),
           start_time = str_extract(table, "\\d{8}-\\d{6}") %>%
             ymd_hms(tz = "UTC", truncated = 3))

  # Map user-facing table_type keys to the labels used in the raw data.
  lookup <- list(
    analyte_conc = "Analyte concentrations",
    conc_per_reagent = "Analyte concentration per reagent ion",
    conc_per_product = "Analyte concentration per product ion",
    raw_analyte_conc = "Raw analyte concentrations",
    raw_conc_per_reagent = "Raw analyte concentration per reagent ion",
    raw_conc_per_product = "Raw analyte concentration per product ion"
  )

  if(!table_type %in% names(lookup)) {
    stop("Invalid table_type provided.")
  }

  filtered_df <- df[df$table_type == lookup[[table_type]], ]

  # Analyte tables: compound_identity looks like "name (unit)" with an
  # optional "(CAS)" - extract unit, CAS number and the bare compound name.
  if (table_type %in% c("analyte_conc", "raw_analyte_conc")) {
    filtered_df <- filtered_df %>%
      mutate(
        unit = str_extract(compound_identity, "(?<=\\()ppb|ppm|ppt(?=\\))"),
        cas_number = str_extract(compound_identity, "(?<=\\()\\d{1,7}-\\d{2}-\\d{1,2}(?=\\))"),
        compound = str_remove(compound_identity, "\\s*\\([^()]*\\)\\s*$") %>%
          str_remove_all("\\s*\\([^()]*\\)\\s*$") %>%
          str_trim()
      ) %>%
      select(start_time, table, table_type, time_s, compound_identity, compound, cas_number, conc, unit)
  }

  # Per-reagent tables: identity is "reagent_ion / compound ...". separate()
  # consumes compound_identity, so the original combined label is restored
  # afterwards from filtered_df (valid because the row count is unchanged).
  if (table_type %in% c("conc_per_reagent", "raw_conc_per_reagent")) {
    filtered_df <- filtered_df %>%
      separate(compound_identity, into = c("reagent_ion", "compound_identity_2"), sep = " / ") %>%
      mutate(
        reagent_ion = str_trim(reagent_ion, side = "right"),
        compound_identity_2 = str_trim(compound_identity_2, side = "left")
      ) %>%
      mutate(compound_identity = filtered_df$compound_identity) %>%
      mutate(
        unit = str_extract(compound_identity_2, "(?<=\\()ppb|ppm|ppt(?=\\))"),
        cas_number = str_extract(compound_identity_2, "(?<=\\()\\d{1,7}-\\d{2}-\\d{1,2}(?=\\))"),
        compound = str_remove(compound_identity_2, "\\s*\\([^()]*\\)\\s*$") %>%
          str_remove_all("\\s*\\([^()]*\\)\\s*$") %>%
          str_trim()
      ) %>%
      select(start_time, table, table_type, time_s, compound_identity, reagent_ion, compound, cas_number, conc, unit)
  }

  # Per-product tables: identity is "product [mass] / reagent / compound ..."
  # - additionally split the product ion from its bracketed mass.
  if (table_type %in% c("conc_per_product", "raw_conc_per_product")) {
    filtered_df <- filtered_df %>%
      separate(compound_identity, into = c("product_ion_2", "reagent_ion", "compound_identity_2"), sep = " / ") %>%
      mutate(compound_identity = filtered_df$compound_identity) %>%
      separate(product_ion_2, into = c("product_ion", "product_mass"), sep = "\\s+\\[", remove = FALSE) %>%
      mutate(
        product_mass = gsub("]", "", product_mass),
        product_mass = trimws(product_mass)
      ) %>%
      mutate(
        unit = str_extract(compound_identity_2, "(?<=\\()ppb|ppm|ppt(?=\\))"),
        cas_number = str_extract(compound_identity_2, "(?<=\\()\\d{1,7}-\\d{2}-\\d{1,2}(?=\\))"),
        compound = str_remove(compound_identity_2, "\\s*\\([^()]*\\)\\s*$") %>%
          str_remove_all("\\s*\\([^()]*\\)\\s*$") %>%
          str_trim()
      ) %>%
      select(start_time, table, table_type, time_s, compound_identity, reagent_ion, product_ion, product_mass, compound, cas_number, conc, unit)
  }

  # Unit conversion blocks. Each block converts the two non-target units to
  # the target in sequence; the order is safe because each if_else rewrites
  # `unit` as it converts, so a row is never converted twice.
  if (conc_unit == "ppb") {
    filtered_df <- filtered_df %>%
      mutate(
        conc = if_else(unit == "ppm", conc * 1000, conc),
        unit = if_else(unit == "ppm", "ppb", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppm\\)", "(ppb)"),
        conc = if_else(unit == "ppt", conc / 1000, conc),
        unit = if_else(unit == "ppt", "ppb", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppt\\)", "(ppb)")
      )
  }

  if (conc_unit == "ppm") {
    filtered_df <- filtered_df %>%
      mutate(
        conc = if_else(unit == "ppb", conc / 1000, conc),
        unit = if_else(unit == "ppb", "ppm", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppb\\)", "(ppm)"),
        conc = if_else(unit == "ppt", conc / 1000000, conc),
        unit = if_else(unit == "ppt", "ppm", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppt\\)", "(ppm)")
      )
  }

  if (conc_unit == "ppt") {
    filtered_df <- filtered_df %>%
      mutate(
        conc = if_else(unit == "ppb", conc * 1000, conc),
        unit = if_else(unit == "ppb", "ppt", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppb\\)", "(ppt)"),
        conc = if_else(unit == "ppm", conc * 1000000, conc),
        unit = if_else(unit == "ppm", "ppt", unit),
        compound_identity = str_replace_all(compound_identity, "\\(ppm\\)", "(ppt)")
      )
  }

  # Rows whose unit was not recognised (NA or unexpected) are left
  # unconverted - surface that to the user rather than failing silently.
  unique_units <- unique(filtered_df$unit)
  if (length(unique_units) > 1) {
    warning("Not all units are the same after conversion. Multiple units found: ", paste(unique_units, collapse = ", "))
  }

  return(filtered_df)
}
65 changes: 65 additions & 0 deletions R/read_fullscan_sift.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#' Read SIFT Full Scan Data
#'
#' @description Extracts the "Time vs Mass" table from a SIFT-MS .csv file.
#' Useful for fullscan SIFT-MS data, which is missing several tables usually
#' found in a SIM SIFT-MS file and therefore causes an error in read_sift.
#'
#' @param file The file path for the SIFT .csv file.
#' @param drop_prep Is the PREPARATION sub-table missing? TRUE/FALSE.
#' @param chatty \code{TRUE}/\code{FALSE}. Should the function communicate
#'   what it is doing? Useful for debugging. NOTE(review): currently not
#'   referenced in the body - retained for interface compatibility; confirm
#'   intended behaviour.
#' @param warn logical. If a section of the file cannot be read, should a
#'   warning or error be thrown? Default FALSE leads to an error being
#'   produced. For use with \code{read_many_sift()} to skip bad files.
#'   NOTE(review): currently not referenced in the body - confirm intended
#'   behaviour.
#'
#' @return A tibble containing the job start time alongside the processed
#'   time vs mass data.
#'
#' @export
#'

read_fullscan_sift <- function(file, drop_prep = FALSE, chatty = TRUE, warn = FALSE) {
  # Section headers that mark the start of each sub-table in the raw csv.
  flags <- c("Mass Vs Time", "Cycle vs Product", "Time vs Mass",
             "Detailed Compound Concentrations", "Analyte vs Time",
             "Summary,")

  # Read the whole file as lines, indexed so sections can be located by row.
  lines <- tibble::tibble(line = readLines(file)) %>%
    dplyr::mutate(start = dplyr::row_number())

  # Build a start/end row range for every section. A synthetic "start" row
  # at 0 captures the metadata header before the first flag; the last
  # section runs to the end of the file.
  start_ends <- lines %>%
    dplyr::filter(stringr::str_detect(line, paste(flags, collapse = "|"))) %>%
    rbind(tibble::tibble(line = "start", start = 0)) %>%
    dplyr::arrange(start) %>%
    dplyr::mutate(end = dplyr::lead(start)) %>%
    tidyr::replace_na(list(end = max(lines$start))) %>%
    dplyr::mutate(start = dplyr::if_else(line == "Summary,:", start + 1, start))

  # Read one section of the csv. The leading metadata section (start == 0)
  # has no header row, so it is read without column names.
  read_data <- function(start, end) {
    if (start == 0) {
      df <- suppressMessages(suppressWarnings(readr::read_csv(
        file, col_types = readr::cols(), na = c(":", " ", ""),
        skip = start, n_max = end - start, col_names = FALSE)))
    } else {
      df <- suppressMessages(suppressWarnings(readr::read_csv(
        file, col_types = readr::cols(), na = c(":", " ", ""),
        skip = start, n_max = end - (start + 2))))
    }
    return(df)
  }

  raw <- purrr::map2(.x = start_ends$start, .y = start_ends$end,
                     .f = ~read_data(start = .x, end = .y))

  # Offset into `raw` for the Time vs Mass table; shifted down by one when
  # the PREPARATION sub-table is absent.
  n <- 0

  # Metadata section: key/value pairs pivoted wide to pull the job start date.
  meta <- raw[[1]] %>%
    janitor::remove_empty(which = c("rows", "cols")) %>%
    tidyr::drop_na(X2) %>%
    dplyr::select(-X3) %>%
    tidyr::pivot_wider(names_from = "X1", values_from = "X2") %>%
    janitor::clean_names()
  start_time <- meta$job_start_date %>% lubridate::ymd_hms()

  # Time vs mass: reshape the per-ion columns (after the first 10) into long
  # format, one row per ion per timestamp.
  if (drop_prep) {
    n <- n - 1
  }
  time_vs_mass <- raw[[n + 6]] %>%
    janitor::remove_empty(which = c("rows", "cols")) %>%
    tidyr::pivot_longer(-(1:10), names_to = "ion") %>%
    janitor::clean_names()

  # Attach the job start time to every row (scalar recycled by cbind).
  cbind(start_time, time_vs_mass) %>% tibble::as_tibble()
}
Loading