-
Notifications
You must be signed in to change notification settings - Fork 0
Implementing and Testing DIANN converter for MSstatsBIG. #9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: devel
Are you sure you want to change the base?
Changes from 2 commits
1dfd8f0
005ced9
d882091
6728b7c
a041a72
43e68b2
d3772c9
bfbcf7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,134 @@ | ||
| #' @keywords internal | ||
| reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, | ||
| quantificationColumn = "FragmentQuantCorrected", | ||
| global_qvalue_cutoff = 0.01, | ||
| qvalue_cutoff = 0.01, | ||
| pg_qvalue_cutoff = 0.01) { | ||
| if (grepl("csv", input_file)) { | ||
| delim = "," | ||
| } else if (grepl("tsv|xls", input_file)) { | ||
| delim = "\t" | ||
| } else { | ||
| delim <- ";" | ||
| } | ||
tonywu1999 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| diann_chunk <- function(x, pos) cleanDIANNChunk(x, | ||
| output_path, | ||
| MBR, | ||
| quantificationColumn, | ||
| pos, | ||
| global_qvalue_cutoff, | ||
| qvalue_cutoff, | ||
| pg_qvalue_cutoff) | ||
| readr::read_delim_chunked(input_file, | ||
tonywu1999 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| readr::DataFrameCallback$new(diann_chunk), | ||
| delim = delim, | ||
| chunk_size = 1e6) | ||
| } | ||
|
|
||
| #' @keywords internal | ||
| cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, | ||
| global_qvalue_cutoff = 0.01, | ||
| qvalue_cutoff = 0.01, | ||
| pg_qvalue_cutoff = 0.01) { | ||
|
|
||
| # 1. Select required columns | ||
| base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', | ||
| 'Precursor.Charge', quantificationColumn, 'Q.Value', | ||
| 'Precursor.Mz', 'Fragment.Info', 'Run') | ||
|
|
||
| mbr_cols <- if (MBR) { | ||
| c('Lib.Q.Value', 'Lib.PG.Q.Value') | ||
| } else { | ||
| c('Global.Q.Value', 'Global.PG.Q.Value') | ||
| } | ||
tonywu1999 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| req_cols <- intersect(c(base_cols, mbr_cols), colnames(input)) | ||
| input <- dplyr::select(input, all_of(req_cols)) | ||
|
|
||
| # 2. Split concatenated values (un-nest) | ||
| split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) | ||
tonywu1999 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (length(split_cols) > 0) { | ||
| input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";") | ||
tonywu1999 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| # 3. Process fragment information | ||
|
|
||
| #Convert Intensity to Numeric from Char strings | ||
| input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) | ||
coderabbitai[bot] marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| input <- dplyr::mutate( | ||
| input, | ||
| FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), | ||
|
|
||
| # Extract product charge | ||
| ProductCharge = dplyr::if_else( | ||
| grepl("/", .data$Fragment.Info), | ||
| # Extract charge (number right after "/" in string), default to 1 if parsing fails | ||
| as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), | ||
| 1L | ||
| ) | ||
|
Comment on lines
179
to
186
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: find . -name "clean_DIANN.R" -type fRepository: Vitek-Lab/MSstatsBig Length of output: 81 🏁 Script executed: sed -n '64,70p' ./R/clean_DIANN.RRepository: Vitek-Lab/MSstatsBig Length of output: 348 🏁 Script executed: python3 << 'EOF'
# Let's verify the behavior of stringr::str_extract and as.integer in R
# by examining what happens when the regex pattern doesn't match
# The behavior should be:
# stringr::str_extract("something/", "(?<=/)[0-9]+") -> NA (no digits after /)
# as.integer(NA) -> NA (not 1L)
# This is a R behavior question, let me create a test to understand it better
print("Testing R string extraction and type conversion behavior:")
print("Case 1: Fragment.Info with / but no digits after it")
print(" stringr::str_extract returns: NA (pattern doesn't match)")
print(" as.integer(NA) returns: NA (not 1L)")
print("")
print("Case 2: Fragment.Info with / and digits after it")
print(" stringr::str_extract returns: the digits")
print(" as.integer(digits) returns: integer value")
print("")
print("Case 3: Fragment.Info without /")
print(" Current code returns: 1L (via the else branch of if_else)")
print("")
print("So the bug is in Case 1 - when / is present but no digits follow")
print("The function would return NA instead of 1L")
EOFRepository: Vitek-Lab/MSstatsBig Length of output: 150 🏁 Script executed: grep -r "ProductCharge" ./R --include="*.R" -B 2 -A 2Repository: Vitek-Lab/MSstatsBig Length of output: 4903 🏁 Script executed: sed -n '50,80p' ./R/clean_DIANN.RRepository: Vitek-Lab/MSstatsBig Length of output: 1088 Ensure ProductCharge defaults to 1 when parsing fails. The current code returns 🔧 Suggested change- ProductCharge = dplyr::if_else(
- grepl("/", .data$Fragment.Info),
- # Extract charge (number right after "/" in string), default to 1 if parsing fails
- as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
- 1L
- )
+ ProductCharge = dplyr::if_else(
+ grepl("/", .data$Fragment.Info),
+ dplyr::coalesce(
+ as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
+ 1L
+ ),
+ 1L
+ )🤖 Prompt for AI Agents |
||
| ) | ||
|
|
||
| # 4. Clean and filter data | ||
| input <- dplyr::filter( | ||
| input, | ||
| !grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]]) | ||
| ) | ||
|
|
||
| # 5. Rename columns to MSstats standard | ||
| input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) | ||
|
|
||
| # Standardize column names | ||
| old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', | ||
| 'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue', | ||
| 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') | ||
| new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', | ||
| 'PrecursorCharge', 'Intensity', 'DetectionQValue', | ||
| 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') | ||
|
|
||
| current_names <- colnames(input) | ||
| names_to_rename <- intersect(current_names, old_names) | ||
|
|
||
| # Create a named vector for renaming in the format c(new_name = old_name) | ||
| new_names_subset <- new_names[match(names_to_rename, old_names)] | ||
| rename_map <- setNames(names_to_rename, new_names_subset) | ||
| rename_map <- rename_map[!is.na(names(rename_map))] | ||
|
|
||
| input <- dplyr::rename(input, any_of(rename_map)) | ||
| # Filter by Q-values | ||
| input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) | ||
|
|
||
| if (MBR) { | ||
| input <- dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) | ||
| } else { | ||
| input <- dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) | ||
| } | ||
|
|
||
| # Final column selection for MSstats format | ||
| msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", | ||
| "FragmentIon", "ProductCharge", "Run", "Intensity") | ||
|
|
||
| #TODO: confirm with Tony -- are these three needed? | ||
|
|
||
| # Add annotation columns if they exist | ||
| if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") | ||
| if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate") | ||
tonywu1999 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # Add IsotopeLabelType, assuming Light for DIANN | ||
| input$IsotopeLabelType <- "L" | ||
| msstats_cols <- c(msstats_cols, "IsotopeLabelType") | ||
|
|
||
| final_cols <- intersect(msstats_cols, colnames(input)) | ||
| input <- dplyr::select(input, all_of(final_cols)) | ||
|
|
||
| # Write to file | ||
| if (!is.null(pos)) { | ||
| if (pos == 1) { | ||
| readr::write_csv(input, file = output_path, append = FALSE) | ||
| } else { | ||
| readr::write_csv(input, file = output_path, append = TRUE) | ||
| } | ||
| } | ||
| NULL | ||
| } | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| library(testthat) | ||
| library(mockery) | ||
|
|
||
| context("General converter functions") | ||
|
|
||
| test_that("MSstatsAddAnnotationBig adds annotation correctly", { | ||
| input_data <- data.frame( | ||
| Run = c("Run1", "Run2", "Run3"), | ||
| Intensity = c(100, 200, 300) | ||
| ) | ||
|
|
||
| annotation_data <- data.frame( | ||
| Run = c("Run1", "Run2", "Run3"), | ||
| Condition = c("A", "A", "B"), | ||
| BioReplicate = c(1, 2, 1) | ||
| ) | ||
|
|
||
| expected_output <- data.frame( | ||
| Run = c("Run1", "Run2", "Run3"), | ||
| Intensity = c(100, 200, 300), | ||
| Condition = c("A", "A", "B"), | ||
| BioReplicate = c(1, 2, 1) | ||
| ) | ||
|
|
||
| result <- MSstatsAddAnnotationBig(input_data, annotation_data) | ||
|
|
||
| expect_equal(result, expected_output) | ||
| }) | ||
|
|
||
| test_that("MSstatsPreprocessBig performs feature selection correctly", { | ||
| input_file <- tempfile(fileext = ".csv") | ||
| output_file <- "preprocess_output.csv" | ||
|
|
||
| # P1 has 3 features (frag1, frag2, frag3). frag3 has the highest avg intensity. | ||
| # P2 has 2 features (fragA, fragB). fragB has the highest avg intensity. | ||
| msstats_data <- rbind( | ||
| data.frame(ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, FragmentIon = rep(c("frag1", "frag2", "frag3"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 3), Run = rep(c("run1", "run2"), 3), Intensity = c(1000, 1100, 500, 550, 2000, 2100)), | ||
| data.frame(ProteinName = "P2", PeptideSequence = "PEPTIDE2", PrecursorCharge = 3, FragmentIon = rep(c("fragA", "fragB"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "B", BioReplicate = rep(1:2, 2), Run = rep(c("run1", "run2"), 2), Intensity = c(100, 150, 800, 850)) | ||
| ) | ||
| readr::write_csv(msstats_data, input_file) | ||
|
|
||
| processed <- MSstatsPreprocessBig(input_file, output_file, backend = "arrow", | ||
| max_feature_count = 1) | ||
| result <- dplyr::collect(processed) | ||
|
|
||
| # For P1, frag3 should be selected. For P2, fragB should be selected. | ||
| expect_equal(nrow(result), 4) | ||
|
|
||
| p1_result <- result[result$ProteinName == "P1", ] | ||
| expect_equal(nrow(p1_result), 2) | ||
| expect_true(all(p1_result$FragmentIon == "frag3")) | ||
|
|
||
| p2_result <- result[result$ProteinName == "P2", ] | ||
| expect_equal(nrow(p2_result), 2) | ||
| expect_true(all(p2_result$FragmentIon == "fragB")) | ||
|
|
||
| # Cleanup | ||
| file.remove(input_file) | ||
| if (file.exists(output_file)) file.remove(output_file) | ||
| }) | ||
|
|
||
| test_that("bigSpectronauttoMSstatsFormat works correctly", { | ||
| # Mock reduceBigSpectronaut as its source is not provided | ||
| mock_reduce <- mock(NULL) | ||
|
|
||
| stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) { | ||
| msstats_data <- data.frame( | ||
| ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, | ||
| FragmentIon = rep(c("frag1", "frag2"), each = 2), ProductCharge = 1, | ||
| IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 2), | ||
| Run = rep(c("run1", "run2"), 2), Intensity = c(1000, 1100, 2000, 2100) # frag2 is higher | ||
| ) | ||
| readr::write_csv(msstats_data, output_path) | ||
| }) | ||
|
|
||
| input_file <- "dummy_spectro_input.csv" | ||
| output_file <- "spectro_output.csv" | ||
|
|
||
| processed <- bigSpectronauttoMSstatsFormat( | ||
| input_file = input_file, | ||
| output_file_name = output_file, | ||
| backend = "arrow", | ||
| max_feature_count = 1 | ||
| ) | ||
| result <- dplyr::collect(processed) | ||
|
|
||
| # The mock reduce function creates a file with 2 features for P1. | ||
| # max_feature_count = 1 should select frag2. | ||
| expect_equal(nrow(result), 2) | ||
| expect_true(all(result$FragmentIon == "frag2")) | ||
|
|
||
| # Cleanup | ||
| if (file.exists(output_file)) file.remove(output_file) | ||
| if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file)) | ||
| }) |
Uh oh!
There was an error while loading. Please reload this page.