Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Description: MSstats package provide tools for preprocessing, summarization and
processing larger than memory data sets.
License: Artistic-2.0
Encoding: UTF-8
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
Imports:
arrow,
DBI,
Expand All @@ -23,7 +23,9 @@ Imports:
readr,
sparklyr,
utils
Suggests:
Suggests:
testthat,
mockery,
knitr,
rmarkdown
VignetteBuilder: knitr
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(MSstatsAddAnnotationBig)
export(MSstatsPreprocessBig)
export(bigDIANNtoMSstatsFormat)
export(bigFragPipetoMSstatsFormat)
export(bigSpectronauttoMSstatsFormat)
importFrom(MSstats,dataProcess)
Expand Down
134 changes: 134 additions & 0 deletions R/clean_DIANN.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#' @keywords internal
reduceBigDIANN <- function(input_file, output_path, MBR = TRUE,
quantificationColumn = "FragmentQuantCorrected",
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01) {
if (grepl("csv", input_file)) {
delim = ","
} else if (grepl("tsv|xls", input_file)) {
delim = "\t"
} else {
delim <- ";"
}

diann_chunk <- function(x, pos) cleanDIANNChunk(x,
output_path,
MBR,
quantificationColumn,
pos,
global_qvalue_cutoff,
qvalue_cutoff,
pg_qvalue_cutoff)
readr::read_delim_chunked(input_file,
readr::DataFrameCallback$new(diann_chunk),
delim = delim,
chunk_size = 1e6)
}

#' @keywords internal
cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos,
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01) {

# 1. Select required columns
base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence',
'Precursor.Charge', quantificationColumn, 'Q.Value',
'Precursor.Mz', 'Fragment.Info', 'Run')

mbr_cols <- if (MBR) {
c('Lib.Q.Value', 'Lib.PG.Q.Value')
} else {
c('Global.Q.Value', 'Global.PG.Q.Value')
}

req_cols <- intersect(c(base_cols, mbr_cols), colnames(input))
input <- dplyr::select(input, all_of(req_cols))

# 2. Split concatenated values (un-nest)
split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input))
if (length(split_cols) > 0) {
input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";")
}

# 3. Process fragment information

#Convert Intensity to Numeric from Char strings
input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]])

input <- dplyr::mutate(
input,
FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info),

# Extract product charge
ProductCharge = dplyr::if_else(
grepl("/", .data$Fragment.Info),
# Extract charge (number right after "/" in string), default to 1 if parsing fails
as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
1L
)
Comment on lines 179 to 186
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

find . -name "clean_DIANN.R" -type f

Repository: Vitek-Lab/MSstatsBig

Length of output: 81


🏁 Script executed:

sed -n '64,70p' ./R/clean_DIANN.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 348


🏁 Script executed:

python3 << 'EOF'
# Let's verify the behavior of stringr::str_extract and as.integer in R
# by examining what happens when the regex pattern doesn't match

# The behavior should be:
# stringr::str_extract("something/", "(?<=/)[0-9]+") -> NA (no digits after /)
# as.integer(NA) -> NA (not 1L)

# This is a R behavior question, let me create a test to understand it better

print("Testing R string extraction and type conversion behavior:")
print("Case 1: Fragment.Info with / but no digits after it")
print("  stringr::str_extract returns: NA (pattern doesn't match)")
print("  as.integer(NA) returns: NA (not 1L)")
print("")
print("Case 2: Fragment.Info with / and digits after it")
print("  stringr::str_extract returns: the digits")
print("  as.integer(digits) returns: integer value")
print("")
print("Case 3: Fragment.Info without /")
print("  Current code returns: 1L (via the else branch of if_else)")
print("")
print("So the bug is in Case 1 - when / is present but no digits follow")
print("The function would return NA instead of 1L")

EOF

Repository: Vitek-Lab/MSstatsBig

Length of output: 150


🏁 Script executed:

grep -r "ProductCharge" ./R --include="*.R" -B 2 -A 2

Repository: Vitek-Lab/MSstatsBig

Length of output: 4903


🏁 Script executed:

sed -n '50,80p' ./R/clean_DIANN.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 1088


Ensure ProductCharge defaults to 1 when parsing fails.

The current code returns NA when a / is present in Fragment.Info but no digits follow (e.g., "y1/"). The regex pattern "(?<=/)[0-9]+" fails to match, causing stringr::str_extract() to return NA, and subsequently as.integer(NA) returns NA instead of the intended default of 1L. Use dplyr::coalesce() to handle this edge case:

🔧 Suggested change
-    ProductCharge = dplyr::if_else(
-      grepl("/", .data$Fragment.Info),
-      # Extract charge (number right after "/" in string), default to 1 if parsing fails
-      as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
-      1L
-    )
+    ProductCharge = dplyr::if_else(
+      grepl("/", .data$Fragment.Info),
+      dplyr::coalesce(
+        as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
+        1L
+      ),
+      1L
+    )
🤖 Prompt for AI Agents
In `@R/clean_DIANN.R` around lines 64 - 70, ProductCharge currently becomes NA
when Fragment.Info contains "/" but no digits follow because
as.integer(stringr::str_extract(...)) yields NA; modify the ProductCharge
computation (the dplyr::if_else branch that uses stringr::str_extract on
.data$Fragment.Info) to wrap the extracted value with dplyr::coalesce (or
equivalent) so that if str_extract returns NA it falls back to "1" before
coercion, ensuring the final value is 1L when parsing fails.

)

# 4. Clean and filter data
input <- dplyr::filter(
input,
!grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]])
)

# 5. Rename columns to MSstats standard
input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x))

# Standardize column names
old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence',
'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue',
'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge')
new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence',
'PrecursorCharge', 'Intensity', 'DetectionQValue',
'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge')

current_names <- colnames(input)
names_to_rename <- intersect(current_names, old_names)

# Create a named vector for renaming in the format c(new_name = old_name)
new_names_subset <- new_names[match(names_to_rename, old_names)]
rename_map <- setNames(names_to_rename, new_names_subset)
rename_map <- rename_map[!is.na(names(rename_map))]

input <- dplyr::rename(input, any_of(rename_map))
# Filter by Q-values
input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff)

if (MBR) {
input <- dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff)
} else {
input <- dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff)
}

# Final column selection for MSstats format
msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge",
"FragmentIon", "ProductCharge", "Run", "Intensity")

#TODO: confirm with Tony -- are these three needed?

# Add annotation columns if they exist
if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition")
if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate")

# Add IsotopeLabelType, assuming Light for DIANN
input$IsotopeLabelType <- "L"
msstats_cols <- c(msstats_cols, "IsotopeLabelType")

final_cols <- intersect(msstats_cols, colnames(input))
input <- dplyr::select(input, all_of(final_cols))

# Write to file
if (!is.null(pos)) {
if (pos == 1) {
readr::write_csv(input, file = output_path, append = FALSE)
} else {
readr::write_csv(input, file = output_path, append = TRUE)
}
}
NULL
}
45 changes: 44 additions & 1 deletion R/converters.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,50 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
}


#' Convert out-of-memory DIANN files to MSstats format.
#'
#' @inheritParams MSstatsPreprocessBig
#' @param MBR True if analysis was done with match between runs.
#' @param quantificationColumn Use 'Fragment.Quant.Corrected'(default) column for quantified intensities for DIANN 1.8.x.
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
#'
#' @export
#'
#' @return either arrow object or sparklyr table that can be optionally collected
#' into memory by using dplyr::collect function.
#'
bigDIANNtoMSstatsFormat <- function(input_file,
output_file_name,
backend,
MBR = TRUE,
quantificationColumn = "Fragment.Quant.Corrected",
max_feature_count = 100,
filter_unique_peptides = FALSE,
aggregate_psms = FALSE,
filter_few_obs = FALSE,
remove_annotation = FALSE,
calculateAnomalyScores=FALSE,
anomalyModelFeatures=c(),
connection = NULL) {

# Reduce and clean the DIANN report file in chunks
reduceBigDIANN(input_file,
paste0("reduce_output_", output_file_name),
MBR,
quantificationColumn)

# Preprocess the cleaned data (feature selection, etc.)
msstats_data <- MSstatsPreprocessBig(
paste0("reduce_output_", output_file_name),
output_file_name, backend, max_feature_count,
filter_unique_peptides, aggregate_psms, filter_few_obs,
remove_annotation, calculateAnomalyScores,
anomalyModelFeatures, connection)

return(msstats_data)
}


#' Merge annotation to output of MSstatsPreprocessBig
#'
#' @param input output of MSstatsPreprocessBig
Expand Down Expand Up @@ -185,4 +229,3 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
MSstatsAddAnnotationBig <- function(input, annotation) {
dplyr::inner_join(input, annotation, by = "Run")
}

65 changes: 65 additions & 0 deletions man/bigDIANNtoMSstatsFormat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

95 changes: 95 additions & 0 deletions tests/testthat/test-converters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
library(testthat)
library(mockery)

context("General converter functions")

test_that("MSstatsAddAnnotationBig adds annotation correctly", {
input_data <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Intensity = c(100, 200, 300)
)

annotation_data <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Condition = c("A", "A", "B"),
BioReplicate = c(1, 2, 1)
)

expected_output <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Intensity = c(100, 200, 300),
Condition = c("A", "A", "B"),
BioReplicate = c(1, 2, 1)
)

result <- MSstatsAddAnnotationBig(input_data, annotation_data)

expect_equal(result, expected_output)
})

test_that("MSstatsPreprocessBig performs feature selection correctly", {
input_file <- tempfile(fileext = ".csv")
output_file <- "preprocess_output.csv"

# P1 has 3 features (frag1, frag2, frag3). frag3 has the highest avg intensity.
# P2 has 2 features (fragA, fragB). fragB has the highest avg intensity.
msstats_data <- rbind(
data.frame(ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, FragmentIon = rep(c("frag1", "frag2", "frag3"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 3), Run = rep(c("run1", "run2"), 3), Intensity = c(1000, 1100, 500, 550, 2000, 2100)),
data.frame(ProteinName = "P2", PeptideSequence = "PEPTIDE2", PrecursorCharge = 3, FragmentIon = rep(c("fragA", "fragB"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "B", BioReplicate = rep(1:2, 2), Run = rep(c("run1", "run2"), 2), Intensity = c(100, 150, 800, 850))
)
readr::write_csv(msstats_data, input_file)

processed <- MSstatsPreprocessBig(input_file, output_file, backend = "arrow",
max_feature_count = 1)
result <- dplyr::collect(processed)

# For P1, frag3 should be selected. For P2, fragB should be selected.
expect_equal(nrow(result), 4)

p1_result <- result[result$ProteinName == "P1", ]
expect_equal(nrow(p1_result), 2)
expect_true(all(p1_result$FragmentIon == "frag3"))

p2_result <- result[result$ProteinName == "P2", ]
expect_equal(nrow(p2_result), 2)
expect_true(all(p2_result$FragmentIon == "fragB"))

# Cleanup
file.remove(input_file)
if (file.exists(output_file)) file.remove(output_file)
})

test_that("bigSpectronauttoMSstatsFormat works correctly", {
# Mock reduceBigSpectronaut as its source is not provided
mock_reduce <- mock(NULL)

stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) {
msstats_data <- data.frame(
ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2,
FragmentIon = rep(c("frag1", "frag2"), each = 2), ProductCharge = 1,
IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 2),
Run = rep(c("run1", "run2"), 2), Intensity = c(1000, 1100, 2000, 2100) # frag2 is higher
)
readr::write_csv(msstats_data, output_path)
})

input_file <- "dummy_spectro_input.csv"
output_file <- "spectro_output.csv"

processed <- bigSpectronauttoMSstatsFormat(
input_file = input_file,
output_file_name = output_file,
backend = "arrow",
max_feature_count = 1
)
result <- dplyr::collect(processed)

# The mock reduce function creates a file with 2 features for P1.
# max_feature_count = 1 should select frag2.
expect_equal(nrow(result), 2)
expect_true(all(result$FragmentIon == "frag2"))

# Cleanup
if (file.exists(output_file)) file.remove(output_file)
if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file))
})
Loading