-
Notifications
You must be signed in to change notification settings - Fork 0
Implementing and Testing DIANN converter for MSstatsBIG. #9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: devel
Are you sure you want to change the base?
Changes from 1 commit
1dfd8f0
005ced9
d882091
6728b7c
a041a72
43e68b2
d3772c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| #' @keywords internal | ||
| reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, | ||
| quantificationColumn = "FragmentQuantCorrected") { | ||
| if (grepl("csv", input_file)) { | ||
| delim = "," | ||
| } else if (grepl("tsv|xls", input_file)) { | ||
| delim = "\t" | ||
| } else { | ||
| delim <- ";" | ||
| } | ||
tonywu1999 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| diann_chunk <- function(x, pos) cleanDIANNChunk(x, | ||
| output_path, | ||
| MBR, | ||
| quantificationColumn, | ||
| pos) | ||
| readr::read_delim_chunked(input_file, | ||
tonywu1999 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| readr::DataFrameCallback$new(diann_chunk), | ||
| delim = delim, | ||
| chunk_size = 1e6) | ||
| } | ||
|
|
||
| #' @keywords internal | ||
| cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { | ||
|
|
||
| # 1. Select required columns | ||
| base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', | ||
| 'Precursor.Charge', quantificationColumn, 'Q.Value', | ||
| 'Precursor.Mz', 'Fragment.Info', 'Run') | ||
|
|
||
| mbr_cols <- if (MBR) { | ||
| c('Lib.Q.Value', 'Lib.PG.Q.Value') | ||
| } else { | ||
| c('Global.Q.Value', 'Global.PG.Q.Value') | ||
| } | ||
|
Comment on lines
+118
to
+122
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here, we need to add a filter for filtering rows if their values in these columns is below a certain threshold: see reference
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
|
||
| req_cols <- intersect(c(base_cols, mbr_cols), colnames(input)) | ||
| input <- dplyr::select(input, all_of(req_cols)) | ||
|
|
||
| # 2. Split concatenated values (un-nest) | ||
| split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) | ||
tonywu1999 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (length(split_cols) > 0) { | ||
| input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";") | ||
tonywu1999 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| # 3. Process fragment information | ||
| input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) | ||
|
|
||
| input <- dplyr::mutate( | ||
| input, | ||
| FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), | ||
| ProductCharge = dplyr::if_else( | ||
| grepl("/", .data$Fragment.Info), | ||
| # Extract charge, default to 1 if parsing fails | ||
| as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), | ||
| 1L | ||
| ) | ||
| ) | ||
tonywu1999 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| # 4. Clean and filter data | ||
| input <- dplyr::filter( | ||
| input, | ||
| !grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]]) | ||
| ) | ||
|
|
||
| # 5. Rename columns to MSstats standard | ||
| input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) | ||
|
|
||
| # Standardize column names | ||
| old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', | ||
| 'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue', | ||
| 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') | ||
| new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', | ||
| 'PrecursorCharge', 'Intensity', 'DetectionQValue', | ||
| 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') | ||
|
|
||
| current_names <- colnames(input) | ||
| names_to_rename <- intersect(current_names, old_names) | ||
|
|
||
| # Create a named vector for renaming in the format c(new_name = old_name) | ||
| new_names_subset <- new_names[match(names_to_rename, old_names)] | ||
| rename_map <- setNames(names_to_rename, new_names_subset) | ||
| rename_map <- rename_map[!is.na(names(rename_map))] | ||
|
|
||
| input <- dplyr::rename(input, any_of(rename_map)) | ||
|
|
||
| # Final column selection for MSstats format | ||
| msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", | ||
| "FragmentIon", "ProductCharge", "Run", "Intensity") | ||
|
|
||
| #TODO: confirm with Tony -- are these three needed? | ||
|
|
||
| # Add annotation columns if they exist | ||
| if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") | ||
| if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate") | ||
|
Comment on lines
+256
to
+257
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Confirmed with Devon, you can remove this code and instead add an additional parameter for users to provide an annotation file (i.e. the table with Run, BioReplicate, and Condition)
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed |
||
|
|
||
| # Add IsotopeLabelType, assuming Light for DIANN | ||
| input$IsotopeLabelType <- "L" | ||
| msstats_cols <- c(msstats_cols, "IsotopeLabelType") | ||
|
|
||
| final_cols <- intersect(msstats_cols, colnames(input)) | ||
| input <- dplyr::select(input, all_of(final_cols)) | ||
|
|
||
| # Write to file | ||
| if (!is.null(pos)) { | ||
| if (pos == 1) { | ||
| readr::write_csv(input, file = output_path, append = FALSE) | ||
| } else { | ||
| readr::write_csv(input, file = output_path, append = TRUE) | ||
| } | ||
| } | ||
| NULL | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -155,6 +155,50 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, | |||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| #' Convert out-of-memory DIANN files to MSstats format. | ||||||||||||||||||||||||||||||||||||||||||||
| #' | ||||||||||||||||||||||||||||||||||||||||||||
| #' @inheritParams MSstatsPreprocessBig | ||||||||||||||||||||||||||||||||||||||||||||
| #' @param MBR True if analysis was done with match between runs. | ||||||||||||||||||||||||||||||||||||||||||||
| #' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. | ||||||||||||||||||||||||||||||||||||||||||||
| #' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. | ||||||||||||||||||||||||||||||||||||||||||||
coderabbitai[bot] marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||||||||||||||||||||||||||||||||||||||||
| #' | ||||||||||||||||||||||||||||||||||||||||||||
| #' @export | ||||||||||||||||||||||||||||||||||||||||||||
| #' | ||||||||||||||||||||||||||||||||||||||||||||
| #' @return either arrow object or sparklyr table that can be optionally collected | ||||||||||||||||||||||||||||||||||||||||||||
| #' into memory by using dplyr::collect function. | ||||||||||||||||||||||||||||||||||||||||||||
| #' | ||||||||||||||||||||||||||||||||||||||||||||
| bigDIANNtoMSstatsFormat <- function(input_file, | ||||||||||||||||||||||||||||||||||||||||||||
| output_file_name, | ||||||||||||||||||||||||||||||||||||||||||||
| backend, | ||||||||||||||||||||||||||||||||||||||||||||
| MBR = TRUE, | ||||||||||||||||||||||||||||||||||||||||||||
| quantificationColumn = "Fragment.Quant.Corrected", | ||||||||||||||||||||||||||||||||||||||||||||
| max_feature_count = 100, | ||||||||||||||||||||||||||||||||||||||||||||
| filter_unique_peptides = FALSE, | ||||||||||||||||||||||||||||||||||||||||||||
| aggregate_psms = FALSE, | ||||||||||||||||||||||||||||||||||||||||||||
| filter_few_obs = FALSE, | ||||||||||||||||||||||||||||||||||||||||||||
| remove_annotation = FALSE, | ||||||||||||||||||||||||||||||||||||||||||||
| calculateAnomalyScores=FALSE, | ||||||||||||||||||||||||||||||||||||||||||||
| anomalyModelFeatures=c(), | ||||||||||||||||||||||||||||||||||||||||||||
| connection = NULL) { | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| # Reduce and clean the DIANN report file in chunks | ||||||||||||||||||||||||||||||||||||||||||||
| reduceBigDIANN(input_file, | ||||||||||||||||||||||||||||||||||||||||||||
| paste0("reduce_output_", output_file_name), | ||||||||||||||||||||||||||||||||||||||||||||
| MBR, | ||||||||||||||||||||||||||||||||||||||||||||
| quantificationColumn) | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| # Preprocess the cleaned data (feature selection, etc.) | ||||||||||||||||||||||||||||||||||||||||||||
| msstats_data <- MSstatsPreprocessBig( | ||||||||||||||||||||||||||||||||||||||||||||
| paste0("reduce_output_", output_file_name), | ||||||||||||||||||||||||||||||||||||||||||||
| output_file_name, backend, max_feature_count, | ||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+184
to
+193
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: fd "converters.R" --type fRepository: Vitek-Lab/MSstatsBig Length of output: 111 🏁 Script executed: wc -l R/converters.RRepository: Vitek-Lab/MSstatsBig Length of output: 82 🏁 Script executed: cat -n R/converters.R | sed -n '180,195p'Repository: Vitek-Lab/MSstatsBig Length of output: 878 🏁 Script executed: cat -n R/converters.R | sed -n '170,200p'Repository: Vitek-Lab/MSstatsBig Length of output: 1649 🏁 Script executed: cat -n R/converters.R | sed -n '1,30p'Repository: Vitek-Lab/MSstatsBig Length of output: 2410 🏁 Script executed: rg "bigDIANNtoMSstatsFormat" --type r -A 3 -B 3Repository: Vitek-Lab/MSstatsBig Length of output: 1488 🏁 Script executed: cat -n tests/testthat/test-diann_converter.R | grep -A 50 "bigDIANNtoMSstatsFormat works with arrow"Repository: Vitek-Lab/MSstatsBig Length of output: 2424 🏁 Script executed: cat -n tests/testthat/test-diann_converter.RRepository: Vitek-Lab/MSstatsBig Length of output: 5551 🏁 Script executed: rg "output_file_name" R/converters.R -nRepository: Vitek-Lab/MSstatsBig Length of output: 958 🏁 Script executed: cat -n R/converters.R | sed -n '128,155p'Repository: Vitek-Lab/MSstatsBig Length of output: 1853 Fix unsafe path construction that breaks when output_file_name contains directories. The Note: The same pattern exists in 🐛 Proposed fix+ reduce_path <- file.path(
+ dirname(output_file_name),
+ paste0("reduce_output_", basename(output_file_name))
+ )
+ reduceBigDIANN(input_file, reduce_path, MBR, quantificationColumn)
- reduceBigDIANN(input_file,
- paste0("reduce_output_", output_file_name),
- MBR,
- quantificationColumn)
- msstats_data <- MSstatsPreprocessBig(
- paste0("reduce_output_", output_file_name),
+ msstats_data <- MSstatsPreprocessBig(
+ reduce_path,
output_file_name, backend, max_feature_count,📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||||||||||||||||||
| filter_unique_peptides, aggregate_psms, filter_few_obs, | ||||||||||||||||||||||||||||||||||||||||||||
| remove_annotation, calculateAnomalyScores, | ||||||||||||||||||||||||||||||||||||||||||||
| anomalyModelFeatures, connection) | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| return(msstats_data) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| #' Merge annotation to output of MSstatsPreprocessBig | ||||||||||||||||||||||||||||||||||||||||||||
| #' | ||||||||||||||||||||||||||||||||||||||||||||
| #' @param input output of MSstatsPreprocessBig | ||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -185,4 +229,3 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, | |||||||||||||||||||||||||||||||||||||||||||
| MSstatsAddAnnotationBig <- function(input, annotation) { | ||||||||||||||||||||||||||||||||||||||||||||
| dplyr::inner_join(input, annotation, by = "Run") | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| library(testthat) | ||
| library(mockery) | ||
|
|
||
| context("General converter functions") | ||
|
|
||
| test_that("MSstatsAddAnnotationBig adds annotation correctly", { | ||
| input_data <- data.frame( | ||
| Run = c("Run1", "Run2", "Run3"), | ||
| Intensity = c(100, 200, 300) | ||
| ) | ||
|
|
||
| annotation_data <- data.frame( | ||
| Run = c("Run1", "Run2", "Run3"), | ||
| Condition = c("A", "A", "B"), | ||
| BioReplicate = c(1, 2, 1) | ||
| ) | ||
|
|
||
| expected_output <- data.frame( | ||
| Run = c("Run1", "Run2", "Run3"), | ||
| Intensity = c(100, 200, 300), | ||
| Condition = c("A", "A", "B"), | ||
| BioReplicate = c(1, 2, 1) | ||
| ) | ||
|
|
||
| result <- MSstatsAddAnnotationBig(input_data, annotation_data) | ||
|
|
||
| expect_equal(result, expected_output) | ||
| }) | ||
|
|
||
| test_that("MSstatsPreprocessBig performs feature selection correctly", { | ||
| input_file <- tempfile(fileext = ".csv") | ||
| output_file <- "preprocess_output.csv" | ||
|
|
||
| # P1 has 3 features (frag1, frag2, frag3). frag3 has the highest avg intensity. | ||
| # P2 has 2 features (fragA, fragB). fragB has the highest avg intensity. | ||
| msstats_data <- rbind( | ||
| data.frame(ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, FragmentIon = rep(c("frag1", "frag2", "frag3"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 3), Run = rep(c("run1", "run2"), 3), Intensity = c(1000, 1100, 500, 550, 2000, 2100)), | ||
| data.frame(ProteinName = "P2", PeptideSequence = "PEPTIDE2", PrecursorCharge = 3, FragmentIon = rep(c("fragA", "fragB"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "B", BioReplicate = rep(1:2, 2), Run = rep(c("run1", "run2"), 2), Intensity = c(100, 150, 800, 850)) | ||
| ) | ||
| readr::write_csv(msstats_data, input_file) | ||
|
|
||
| processed <- MSstatsPreprocessBig(input_file, output_file, backend = "arrow", | ||
| max_feature_count = 1) | ||
| result <- dplyr::collect(processed) | ||
|
|
||
| # For P1, frag3 should be selected. For P2, fragB should be selected. | ||
| expect_equal(nrow(result), 4) | ||
|
|
||
| p1_result <- result[result$ProteinName == "P1", ] | ||
| expect_equal(nrow(p1_result), 2) | ||
| expect_true(all(p1_result$FragmentIon == "frag3")) | ||
|
|
||
| p2_result <- result[result$ProteinName == "P2", ] | ||
| expect_equal(nrow(p2_result), 2) | ||
| expect_true(all(p2_result$FragmentIon == "fragB")) | ||
|
|
||
| # Cleanup | ||
| file.remove(input_file) | ||
| if (file.exists(output_file)) file.remove(output_file) | ||
| }) | ||
|
|
||
| test_that("bigSpectronauttoMSstatsFormat works correctly", { | ||
| # Mock reduceBigSpectronaut as its source is not provided | ||
| mock_reduce <- mock(NULL) | ||
|
|
||
| stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) { | ||
| msstats_data <- data.frame( | ||
| ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, | ||
| FragmentIon = rep(c("frag1", "frag2"), each = 2), ProductCharge = 1, | ||
| IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 2), | ||
| Run = rep(c("run1", "run2"), 2), Intensity = c(1000, 1100, 2000, 2100) # frag2 is higher | ||
| ) | ||
| readr::write_csv(msstats_data, output_path) | ||
| }) | ||
|
|
||
| input_file <- "dummy_spectro_input.csv" | ||
| output_file <- "spectro_output.csv" | ||
|
|
||
| processed <- bigSpectronauttoMSstatsFormat( | ||
| input_file = input_file, | ||
| output_file_name = output_file, | ||
| backend = "arrow", | ||
| max_feature_count = 1 | ||
| ) | ||
| result <- dplyr::collect(processed) | ||
|
|
||
| # The mock reduce function creates a file with 2 features for P1. | ||
| # max_feature_count = 1 should select frag2. | ||
| expect_equal(nrow(result), 2) | ||
| expect_true(all(result$FragmentIon == "frag2")) | ||
|
|
||
| # Cleanup | ||
| if (file.exists(output_file)) file.remove(output_file) | ||
| if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file)) | ||
| }) |
Uh oh!
There was an error while loading. Please reload this page.