Changes from 1 commit
6 changes: 4 additions & 2 deletions DESCRIPTION
@@ -13,7 +13,7 @@ Description: MSstats package provide tools for preprocessing, summarization and
processing larger than memory data sets.
License: Artistic-2.0
Encoding: UTF-8
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
Imports:
arrow,
DBI,
@@ -22,7 +22,9 @@ Imports:
MSstatsConvert,
readr,
sparklyr,
-    utils
+    utils,
+    testthat,
+    mockery
Suggests:
knitr,
rmarkdown
1 change: 1 addition & 0 deletions NAMESPACE
@@ -2,6 +2,7 @@

export(MSstatsAddAnnotationBig)
export(MSstatsPreprocessBig)
+export(bigDIANNtoMSstatsFormat)
export(bigFragPipetoMSstatsFormat)
export(bigSpectronauttoMSstatsFormat)
importFrom(MSstats,dataProcess)
113 changes: 113 additions & 0 deletions R/clean_DIANN.R
@@ -0,0 +1,113 @@
#' @keywords internal
reduceBigDIANN <- function(input_file, output_path, MBR = TRUE,
quantificationColumn = "Fragment.Quant.Corrected") {
# Anchor on the file extension so a "csv" elsewhere in the path does not match
if (grepl("\\.csv$", input_file)) {
delim <- ","
} else if (grepl("\\.(tsv|xls)$", input_file)) {
delim <- "\t"
} else {
delim <- ";"
}

diann_chunk <- function(x, pos) cleanDIANNChunk(x,
output_path,
MBR,
quantificationColumn,
pos)
readr::read_delim_chunked(input_file,
readr::DataFrameCallback$new(diann_chunk),
delim = delim,
chunk_size = 1e6)
}
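
# Illustrative direct call (hypothetical file names; this helper is internal
# and is normally invoked through bigDIANNtoMSstatsFormat):
#   reduceBigDIANN("diann_report.tsv", "reduce_output_msstats_input.csv",
#                  MBR = TRUE, quantificationColumn = "Fragment.Quant.Corrected")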

#' @keywords internal
cleanDIANNChunk <- function(input, output_path, MBR, quantificationColumn, pos) {

# 1. Select required columns
base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence',
'Precursor.Charge', quantificationColumn, 'Q.Value',
'Precursor.Mz', 'Fragment.Info', 'Run')

mbr_cols <- if (MBR) {
c('Lib.Q.Value', 'Lib.PG.Q.Value')
} else {
c('Global.Q.Value', 'Global.PG.Q.Value')
}
Comment on lines +118 to +122

Contributor: Here we need to add a filter on rows whose values in these columns fall below a certain threshold (see reference).

Author: Done
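
A minimal sketch of the kind of threshold filter being requested; the 0.01 cutoff and the keep-at-or-below direction are assumptions, so the committed fix may differ:

# Hypothetical Q-value filter applied over the MBR/global Q-value columns
qvalue_cutoff <- 0.01
input <- dplyr::filter(
input,
dplyr::if_all(dplyr::all_of(mbr_cols), ~ .x <= qvalue_cutoff)
)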


req_cols <- intersect(c(base_cols, mbr_cols), colnames(input))
input <- dplyr::select(input, all_of(req_cols))

# 2. Split concatenated values (un-nest)
split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input))
if (length(split_cols) > 0) {
input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";")
}
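
# For illustration: separate_rows() splits the paired ";"-delimited columns in
# parallel, so one row with quant "100;200" and info "y3/1;y5/2" becomes two
# rows, (100, y3/1) and (200, y5/2). Toy sketch (values are made up):
#   toy <- data.frame(Fragment.Quant.Corrected = "100;200",
#                     Fragment.Info = "y3/1;y5/2")
#   tidyr::separate_rows(toy, Fragment.Quant.Corrected, Fragment.Info, sep = ";")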

# 3. Process fragment information
input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]])

input <- dplyr::mutate(
input,
# Strip the trailing "/charge" annotation to recover the fragment ion name
# (mirrors the ProductCharge parsing below, which reads the charge after "/");
# the previous pattern '\\^\\.\\*' matched the literal text "^.*" and never fired
FragmentIon = sub("/.*$", "", .data$Fragment.Info),
ProductCharge = dplyr::if_else(
grepl("/", .data$Fragment.Info),
# Extract the charge after "/"; yields NA if the digits cannot be parsed
as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
# No "/" separator present: default to charge 1
1L
)
)
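
# Example of the parsing above, assuming Fragment.Info entries of the
# hypothetical form ion/charge (e.g. "y5/2"):
#   sub("/.*$", "", "y5/2")                        # "y5"  (FragmentIon)
#   stringr::str_extract("y5/2", "(?<=/)[0-9]+")   # "2"   (ProductCharge)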

# 4. Clean and filter data
input <- dplyr::filter(
input,
!grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]])
)

# 5. Rename columns to MSstats standard
input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x))

# Standardize column names
old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence',
'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue',
'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge')
new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence',
'PrecursorCharge', 'Intensity', 'DetectionQValue',
'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge')

current_names <- colnames(input)
names_to_rename <- intersect(current_names, old_names)

# Create a named vector for renaming in the format c(new_name = old_name)
new_names_subset <- new_names[match(names_to_rename, old_names)]
rename_map <- setNames(names_to_rename, new_names_subset)
rename_map <- rename_map[!is.na(names(rename_map))]

input <- dplyr::rename(input, any_of(rename_map))
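
# rename() relies on dplyr's named-vector convention c(new_name = old_name);
# wrapping the map in any_of() makes the rename skip, rather than error on,
# old names that are absent from this chunk. For example:
#   dplyr::rename(input, dplyr::any_of(c(ProteinName = "ProteinNames")))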

# Final column selection for MSstats format
msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge",
"FragmentIon", "ProductCharge", "Run", "Intensity")

#TODO: confirm with Tony -- are these three needed?

# Add annotation columns if they exist
if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition")
if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate")
Comment on lines +256 to +257

tonywu1999 (Contributor), Jan 8, 2026: Confirmed with Devon, you can remove this code and instead add an additional parameter for users to provide an annotation file (i.e. the table with Run, BioReplicate, and Condition).

Author: Removed
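
A sketch of the follow-up described above, reusing the exported MSstatsAddAnnotationBig helper; the annotation file name and the read step are assumptions:

# Hypothetical annotation merge: the user-supplied table carries
# Run, Condition, and BioReplicate and is joined onto the converted data by Run
annotation <- readr::read_csv("annotation.csv", show_col_types = FALSE)
msstats_data <- MSstatsAddAnnotationBig(msstats_data, annotation)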


# Add IsotopeLabelType, assuming Light for DIANN
input$IsotopeLabelType <- "L"
msstats_cols <- c(msstats_cols, "IsotopeLabelType")

final_cols <- intersect(msstats_cols, colnames(input))
input <- dplyr::select(input, all_of(final_cols))

# Write to file
if (!is.null(pos)) {
if (pos == 1) {
readr::write_csv(input, file = output_path, append = FALSE)
} else {
readr::write_csv(input, file = output_path, append = TRUE)
}
}
NULL
}
45 changes: 44 additions & 1 deletion R/converters.R
@@ -155,6 +155,50 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
}


#' Convert out-of-memory DIANN files to MSstats format.
#'
#' @inheritParams MSstatsPreprocessBig
#' @param MBR TRUE if the analysis was performed with match between runs.
#' @param quantificationColumn Column with quantified intensities. Use
#' 'Fragment.Quant.Corrected' (default) for DIANN 1.8.x and 'Fragment.Quant.Raw'
#' for DIANN 1.9.x.
#'
#' @export
#'
#' @return Either an arrow object or a sparklyr table, which can optionally be
#' collected into memory using the dplyr::collect function.
#'
bigDIANNtoMSstatsFormat <- function(input_file,
output_file_name,
backend,
MBR = TRUE,
quantificationColumn = "Fragment.Quant.Corrected",
max_feature_count = 100,
filter_unique_peptides = FALSE,
aggregate_psms = FALSE,
filter_few_obs = FALSE,
remove_annotation = FALSE,
calculateAnomalyScores = FALSE,
anomalyModelFeatures = c(),
connection = NULL) {

# Reduce and clean the DIANN report file in chunks
reduceBigDIANN(input_file,
paste0("reduce_output_", output_file_name),
MBR,
quantificationColumn)

# Preprocess the cleaned data (feature selection, etc.)
msstats_data <- MSstatsPreprocessBig(
paste0("reduce_output_", output_file_name),
output_file_name, backend, max_feature_count,
Comment on lines +184 to +193

⚠️ Potential issue | 🔴 Critical
🏁 Script executed:

fd "converters.R" --type f

Repository: Vitek-Lab/MSstatsBig

Length of output: 111


🏁 Script executed:

wc -l R/converters.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 82


🏁 Script executed:

cat -n R/converters.R | sed -n '180,195p'

Repository: Vitek-Lab/MSstatsBig

Length of output: 878


🏁 Script executed:

cat -n R/converters.R | sed -n '170,200p'

Repository: Vitek-Lab/MSstatsBig

Length of output: 1649


🏁 Script executed:

cat -n R/converters.R | sed -n '1,30p'

Repository: Vitek-Lab/MSstatsBig

Length of output: 2410


🏁 Script executed:

rg "bigDIANNtoMSstatsFormat" --type r -A 3 -B 3

Repository: Vitek-Lab/MSstatsBig

Length of output: 1488


🏁 Script executed:

cat -n tests/testthat/test-diann_converter.R | grep -A 50 "bigDIANNtoMSstatsFormat works with arrow"

Repository: Vitek-Lab/MSstatsBig

Length of output: 2424


🏁 Script executed:

cat -n tests/testthat/test-diann_converter.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 5551


🏁 Script executed:

rg "output_file_name" R/converters.R -n

Repository: Vitek-Lab/MSstatsBig

Length of output: 958


🏁 Script executed:

cat -n R/converters.R | sed -n '128,155p'

Repository: Vitek-Lab/MSstatsBig

Length of output: 1853


Fix unsafe path construction that breaks when output_file_name contains directories.

The paste0("reduce_output_", output_file_name) calls at lines 186 and 192 produce invalid paths if output_file_name includes directory separators (e.g., /tmp/out.csv becomes reduce_output_/tmp/out.csv). Use file.path() with dirname() and basename() to safely place the intermediate file alongside the output.

Note: The same pattern exists in bigSpectronauttoMSstatsFormat at lines 143 and 148 and should be fixed identically.

🐛 Proposed fix

-  reduceBigDIANN(input_file,
-                 paste0("reduce_output_", output_file_name),
-                 MBR,
-                 quantificationColumn)
+  reduce_path <- file.path(
+    dirname(output_file_name),
+    paste0("reduce_output_", basename(output_file_name))
+  )
+  reduceBigDIANN(input_file, reduce_path, MBR, quantificationColumn)

   msstats_data <- MSstatsPreprocessBig(
-    paste0("reduce_output_", output_file_name),
+    reduce_path,
     output_file_name, backend, max_feature_count,
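
For concreteness, how the two constructions behave on a hypothetical path:

output_file_name <- "/tmp/out.csv"
paste0("reduce_output_", output_file_name)
# "reduce_output_/tmp/out.csv" -- points into a directory that rarely exists
file.path(dirname(output_file_name), paste0("reduce_output_", basename(output_file_name)))
# "/tmp/reduce_output_out.csv" -- the intermediate file lands next to the output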

filter_unique_peptides, aggregate_psms, filter_few_obs,
remove_annotation, calculateAnomalyScores,
anomalyModelFeatures, connection)

return(msstats_data)
}
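
# A minimal usage sketch (hypothetical paths; dplyr::collect optionally
# materializes the result in memory):
#   converted <- bigDIANNtoMSstatsFormat("diann_report.tsv", "msstats_input.csv",
#                                        backend = "arrow")
#   in_memory <- dplyr::collect(converted)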


#' Merge annotation to output of MSstatsPreprocessBig
#'
#' @param input output of MSstatsPreprocessBig
@@ -185,4 +229,3 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
MSstatsAddAnnotationBig <- function(input, annotation) {
dplyr::inner_join(input, annotation, by = "Run")
}

65 changes: 65 additions & 0 deletions man/bigDIANNtoMSstatsFormat.Rd

(Generated Rd file; diff not rendered by default.)

95 changes: 95 additions & 0 deletions tests/testthat/test-converters.R
@@ -0,0 +1,95 @@
library(testthat)
library(mockery)

context("General converter functions")

test_that("MSstatsAddAnnotationBig adds annotation correctly", {
input_data <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Intensity = c(100, 200, 300)
)

annotation_data <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Condition = c("A", "A", "B"),
BioReplicate = c(1, 2, 1)
)

expected_output <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Intensity = c(100, 200, 300),
Condition = c("A", "A", "B"),
BioReplicate = c(1, 2, 1)
)

result <- MSstatsAddAnnotationBig(input_data, annotation_data)

expect_equal(result, expected_output)
})

test_that("MSstatsPreprocessBig performs feature selection correctly", {
input_file <- tempfile(fileext = ".csv")
output_file <- "preprocess_output.csv"

# P1 has 3 features (frag1, frag2, frag3). frag3 has the highest avg intensity.
# P2 has 2 features (fragA, fragB). fragB has the highest avg intensity.
msstats_data <- rbind(
data.frame(ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, FragmentIon = rep(c("frag1", "frag2", "frag3"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 3), Run = rep(c("run1", "run2"), 3), Intensity = c(1000, 1100, 500, 550, 2000, 2100)),
data.frame(ProteinName = "P2", PeptideSequence = "PEPTIDE2", PrecursorCharge = 3, FragmentIon = rep(c("fragA", "fragB"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "B", BioReplicate = rep(1:2, 2), Run = rep(c("run1", "run2"), 2), Intensity = c(100, 150, 800, 850))
)
readr::write_csv(msstats_data, input_file)

processed <- MSstatsPreprocessBig(input_file, output_file, backend = "arrow",
max_feature_count = 1)
result <- dplyr::collect(processed)

# For P1, frag3 should be selected. For P2, fragB should be selected.
expect_equal(nrow(result), 4)

p1_result <- result[result$ProteinName == "P1", ]
expect_equal(nrow(p1_result), 2)
expect_true(all(p1_result$FragmentIon == "frag3"))

p2_result <- result[result$ProteinName == "P2", ]
expect_equal(nrow(p2_result), 2)
expect_true(all(p2_result$FragmentIon == "fragB"))

# Cleanup
file.remove(input_file)
if (file.exists(output_file)) file.remove(output_file)
})

test_that("bigSpectronauttoMSstatsFormat works correctly", {
# Stub reduceBigSpectronaut so the test does not depend on a real Spectronaut file
stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) {
msstats_data <- data.frame(
ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2,
FragmentIon = rep(c("frag1", "frag2"), each = 2), ProductCharge = 1,
IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 2),
Run = rep(c("run1", "run2"), 2), Intensity = c(1000, 1100, 2000, 2100) # frag2 is higher
)
readr::write_csv(msstats_data, output_path)
})

input_file <- "dummy_spectro_input.csv"
output_file <- "spectro_output.csv"

processed <- bigSpectronauttoMSstatsFormat(
input_file = input_file,
output_file_name = output_file,
backend = "arrow",
max_feature_count = 1
)
result <- dplyr::collect(processed)

# The mock reduce function creates a file with 2 features for P1.
# max_feature_count = 1 should select frag2.
expect_equal(nrow(result), 2)
expect_true(all(result$FragmentIon == "frag2"))

# Cleanup
if (file.exists(output_file)) file.remove(output_file)
if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file))
})
Loading