Changes from 1 commit
6 changes: 4 additions & 2 deletions DESCRIPTION
@@ -13,7 +13,7 @@ Description: MSstats package provide tools for preprocessing, summarization and
processing larger than memory data sets.
License: Artistic-2.0
Encoding: UTF-8
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
Imports:
arrow,
DBI,
@@ -22,7 +22,9 @@ Imports:
MSstatsConvert,
readr,
sparklyr,
-    utils
+    utils,
+    testthat,
+    mockery
Suggests:
knitr,
rmarkdown
1 change: 1 addition & 0 deletions NAMESPACE
@@ -2,6 +2,7 @@

export(MSstatsAddAnnotationBig)
export(MSstatsPreprocessBig)
+export(bigDIANNtoMSstatsFormat)
export(bigFragPipetoMSstatsFormat)
export(bigSpectronauttoMSstatsFormat)
importFrom(MSstats,dataProcess)
113 changes: 113 additions & 0 deletions R/clean_DIANN.R
@@ -0,0 +1,113 @@
#' @keywords internal
reduceBigDIANN <- function(input_file, output_path, MBR = TRUE,
quantificationColumn = "Fragment.Quant.Corrected") {
# Anchor on the file extension so a "csv" elsewhere in the path does not match
if (grepl("\\.csv$", input_file)) {
delim <- ","
} else if (grepl("\\.(tsv|xls)$", input_file)) {
delim <- "\t"
} else {
delim <- ";"
}

diann_chunk <- function(x, pos) cleanDIANNChunk(x,
output_path,
MBR,
quantificationColumn,
pos)
readr::read_delim_chunked(input_file,
readr::DataFrameCallback$new(diann_chunk),
delim = delim,
chunk_size = 1e6)
}
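
# Illustrative direct call (hypothetical file names; this helper is internal
# and is normally invoked through bigDIANNtoMSstatsFormat):
#   reduceBigDIANN("diann_report.tsv", "reduce_output_msstats_input.csv",
#                  MBR = TRUE, quantificationColumn = "Fragment.Quant.Corrected")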

#' @keywords internal
cleanDIANNChunk <- function(input, output_path, MBR, quantificationColumn, pos) {

# 1. Select required columns
base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence',
'Precursor.Charge', quantificationColumn, 'Q.Value',
'Precursor.Mz', 'Fragment.Info', 'Run')

mbr_cols <- if (MBR) {
c('Lib.Q.Value', 'Lib.PG.Q.Value')
} else {
c('Global.Q.Value', 'Global.PG.Q.Value')
}
Comment on lines +118 to +122

Contributor: Here we need to add a filter on rows whose values in these columns fall below a certain threshold (see reference).

Author: Done
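
A minimal sketch of the kind of threshold filter being requested; the 0.01 cutoff and the keep-at-or-below direction are assumptions, so the committed fix may differ:

# Hypothetical Q-value filter applied over the MBR/global Q-value columns
qvalue_cutoff <- 0.01
input <- dplyr::filter(
input,
dplyr::if_all(dplyr::all_of(mbr_cols), ~ .x <= qvalue_cutoff)
)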


req_cols <- intersect(c(base_cols, mbr_cols), colnames(input))
input <- dplyr::select(input, all_of(req_cols))

# 2. Split concatenated values (un-nest)
split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input))
if (length(split_cols) > 0) {
input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";")
}
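
# For illustration: separate_rows() splits the paired ";"-delimited columns in
# parallel, so one row with quant "100;200" and info "y3/1;y5/2" becomes two
# rows, (100, y3/1) and (200, y5/2). Toy sketch (values are made up):
#   toy <- data.frame(Fragment.Quant.Corrected = "100;200",
#                     Fragment.Info = "y3/1;y5/2")
#   tidyr::separate_rows(toy, Fragment.Quant.Corrected, Fragment.Info, sep = ";")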

# 3. Process fragment information
input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]])

input <- dplyr::mutate(
input,
# Strip the trailing "/charge" annotation to recover the fragment ion name
# (mirrors the ProductCharge parsing below, which reads the charge after "/");
# the previous pattern '\\^\\.\\*' matched the literal text "^.*" and never fired
FragmentIon = sub("/.*$", "", .data$Fragment.Info),
ProductCharge = dplyr::if_else(
grepl("/", .data$Fragment.Info),
# Extract the charge after "/"; yields NA if the digits cannot be parsed
as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
# No "/" separator present: default to charge 1
1L
)
)
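
# Example of the parsing above, assuming Fragment.Info entries of the
# hypothetical form ion/charge (e.g. "y5/2"):
#   sub("/.*$", "", "y5/2")                        # "y5"  (FragmentIon)
#   stringr::str_extract("y5/2", "(?<=/)[0-9]+")   # "2"   (ProductCharge)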

# 4. Clean and filter data
input <- dplyr::filter(
input,
!grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]])
)

# 5. Rename columns to MSstats standard
input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x))

# Standardize column names
old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence',
'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue',
'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge')
new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence',
'PrecursorCharge', 'Intensity', 'DetectionQValue',
'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge')

current_names <- colnames(input)
names_to_rename <- intersect(current_names, old_names)

# Create a named vector for renaming in the format c(new_name = old_name)
new_names_subset <- new_names[match(names_to_rename, old_names)]
rename_map <- setNames(names_to_rename, new_names_subset)
rename_map <- rename_map[!is.na(names(rename_map))]

input <- dplyr::rename(input, any_of(rename_map))
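
# rename() relies on dplyr's named-vector convention c(new_name = old_name);
# wrapping the map in any_of() makes the rename skip, rather than error on,
# old names that are absent from this chunk. For example:
#   dplyr::rename(input, dplyr::any_of(c(ProteinName = "ProteinNames")))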

# Final column selection for MSstats format
msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge",
"FragmentIon", "ProductCharge", "Run", "Intensity")

#TODO: confirm with Tony -- are these three needed?

# Add annotation columns if they exist
if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition")
if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate")
Comment on lines +256 to +257

tonywu1999 (Contributor), Jan 8, 2026: Confirmed with Devon, you can remove this code and instead add an additional parameter for users to provide an annotation file (i.e. the table with Run, BioReplicate, and Condition).

Author: Removed
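
A sketch of the follow-up described above, reusing the exported MSstatsAddAnnotationBig helper; the annotation file name and the read step are assumptions:

# Hypothetical annotation merge: the user-supplied table carries
# Run, Condition, and BioReplicate and is joined onto the converted data by Run
annotation <- readr::read_csv("annotation.csv", show_col_types = FALSE)
msstats_data <- MSstatsAddAnnotationBig(msstats_data, annotation)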


# Add IsotopeLabelType, assuming Light for DIANN
input$IsotopeLabelType <- "L"
msstats_cols <- c(msstats_cols, "IsotopeLabelType")

final_cols <- intersect(msstats_cols, colnames(input))
input <- dplyr::select(input, all_of(final_cols))

# Write to file
if (!is.null(pos)) {
if (pos == 1) {
readr::write_csv(input, file = output_path, append = FALSE)
} else {
readr::write_csv(input, file = output_path, append = TRUE)
}
}
NULL
}
45 changes: 44 additions & 1 deletion R/converters.R
@@ -155,6 +155,50 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
}


#' Convert out-of-memory DIANN files to MSstats format.
#'
#' @inheritParams MSstatsPreprocessBig
#' @param MBR TRUE if the analysis was performed with match between runs.
#' @param quantificationColumn Column with quantified intensities. Use
#' 'Fragment.Quant.Corrected' (default) for DIANN 1.8.x and 'Fragment.Quant.Raw'
#' for DIANN 1.9.x.
#'
#' @export
#'
#' @return Either an arrow object or a sparklyr table, which can optionally be
#' collected into memory using the dplyr::collect function.
#'
bigDIANNtoMSstatsFormat <- function(input_file,
output_file_name,
backend,
MBR = TRUE,
quantificationColumn = "Fragment.Quant.Corrected",
max_feature_count = 100,
filter_unique_peptides = FALSE,
aggregate_psms = FALSE,
filter_few_obs = FALSE,
remove_annotation = FALSE,
calculateAnomalyScores = FALSE,
anomalyModelFeatures = c(),
connection = NULL) {

# Reduce and clean the DIANN report file in chunks
reduceBigDIANN(input_file,
paste0("reduce_output_", output_file_name),
MBR,
quantificationColumn)

# Preprocess the cleaned data (feature selection, etc.)
msstats_data <- MSstatsPreprocessBig(
paste0("reduce_output_", output_file_name),
output_file_name, backend, max_feature_count,
Comment on lines +184 to +193

⚠️ Potential issue | 🔴 Critical
🏁 Script executed:

fd "converters.R" --type f

Repository: Vitek-Lab/MSstatsBig

Length of output: 111


🏁 Script executed:

wc -l R/converters.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 82


🏁 Script executed:

cat -n R/converters.R | sed -n '180,195p'

Repository: Vitek-Lab/MSstatsBig

Length of output: 878


🏁 Script executed:

cat -n R/converters.R | sed -n '170,200p'

Repository: Vitek-Lab/MSstatsBig

Length of output: 1649


🏁 Script executed:

cat -n R/converters.R | sed -n '1,30p'

Repository: Vitek-Lab/MSstatsBig

Length of output: 2410


🏁 Script executed:

rg "bigDIANNtoMSstatsFormat" --type r -A 3 -B 3

Repository: Vitek-Lab/MSstatsBig

Length of output: 1488


🏁 Script executed:

cat -n tests/testthat/test-diann_converter.R | grep -A 50 "bigDIANNtoMSstatsFormat works with arrow"

Repository: Vitek-Lab/MSstatsBig

Length of output: 2424


🏁 Script executed:

cat -n tests/testthat/test-diann_converter.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 5551


🏁 Script executed:

rg "output_file_name" R/converters.R -n

Repository: Vitek-Lab/MSstatsBig

Length of output: 958


🏁 Script executed:

cat -n R/converters.R | sed -n '128,155p'

Repository: Vitek-Lab/MSstatsBig

Length of output: 1853


Fix unsafe path construction that breaks when output_file_name contains directories.

The paste0("reduce_output_", output_file_name) calls at lines 186 and 192 produce invalid paths if output_file_name includes directory separators (e.g., /tmp/out.csv becomes reduce_output_/tmp/out.csv). Use file.path() with dirname() and basename() to safely place the intermediate file alongside the output.

Note: The same pattern exists in bigSpectronauttoMSstatsFormat at lines 143 and 148 and should be fixed identically.

🐛 Proposed fix

-  reduceBigDIANN(input_file,
-                 paste0("reduce_output_", output_file_name),
-                 MBR,
-                 quantificationColumn)
+  reduce_path <- file.path(
+    dirname(output_file_name),
+    paste0("reduce_output_", basename(output_file_name))
+  )
+  reduceBigDIANN(input_file, reduce_path, MBR, quantificationColumn)

   msstats_data <- MSstatsPreprocessBig(
-    paste0("reduce_output_", output_file_name),
+    reduce_path,
     output_file_name, backend, max_feature_count,
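
For concreteness, how the two constructions behave on a hypothetical path:

output_file_name <- "/tmp/out.csv"
paste0("reduce_output_", output_file_name)
# "reduce_output_/tmp/out.csv" -- points into a directory that rarely exists
file.path(dirname(output_file_name), paste0("reduce_output_", basename(output_file_name)))
# "/tmp/reduce_output_out.csv" -- the intermediate file lands next to the output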

filter_unique_peptides, aggregate_psms, filter_few_obs,
remove_annotation, calculateAnomalyScores,
anomalyModelFeatures, connection)

return(msstats_data)
}
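
# A minimal usage sketch (hypothetical paths; dplyr::collect optionally
# materializes the result in memory):
#   converted <- bigDIANNtoMSstatsFormat("diann_report.tsv", "msstats_input.csv",
#                                        backend = "arrow")
#   in_memory <- dplyr::collect(converted)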


#' Merge annotation to output of MSstatsPreprocessBig
#'
#' @param input output of MSstatsPreprocessBig
@@ -185,4 +229,3 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
MSstatsAddAnnotationBig <- function(input, annotation) {
dplyr::inner_join(input, annotation, by = "Run")
}

65 changes: 65 additions & 0 deletions man/bigDIANNtoMSstatsFormat.Rd

(Generated Rd file; diff not rendered by default.)

95 changes: 95 additions & 0 deletions tests/testthat/test-converters.R
@@ -0,0 +1,95 @@
library(testthat)
library(mockery)

context("General converter functions")

test_that("MSstatsAddAnnotationBig adds annotation correctly", {
input_data <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Intensity = c(100, 200, 300)
)

annotation_data <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Condition = c("A", "A", "B"),
BioReplicate = c(1, 2, 1)
)

expected_output <- data.frame(
Run = c("Run1", "Run2", "Run3"),
Intensity = c(100, 200, 300),
Condition = c("A", "A", "B"),
BioReplicate = c(1, 2, 1)
)

result <- MSstatsAddAnnotationBig(input_data, annotation_data)

expect_equal(result, expected_output)
})

test_that("MSstatsPreprocessBig performs feature selection correctly", {
input_file <- tempfile(fileext = ".csv")
output_file <- "preprocess_output.csv"

# P1 has 3 features (frag1, frag2, frag3). frag3 has the highest avg intensity.
# P2 has 2 features (fragA, fragB). fragB has the highest avg intensity.
msstats_data <- rbind(
data.frame(ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, FragmentIon = rep(c("frag1", "frag2", "frag3"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 3), Run = rep(c("run1", "run2"), 3), Intensity = c(1000, 1100, 500, 550, 2000, 2100)),
data.frame(ProteinName = "P2", PeptideSequence = "PEPTIDE2", PrecursorCharge = 3, FragmentIon = rep(c("fragA", "fragB"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "B", BioReplicate = rep(1:2, 2), Run = rep(c("run1", "run2"), 2), Intensity = c(100, 150, 800, 850))
)
readr::write_csv(msstats_data, input_file)

processed <- MSstatsPreprocessBig(input_file, output_file, backend = "arrow",
max_feature_count = 1)
result <- dplyr::collect(processed)

# For P1, frag3 should be selected. For P2, fragB should be selected.
expect_equal(nrow(result), 4)

p1_result <- result[result$ProteinName == "P1", ]
expect_equal(nrow(p1_result), 2)
expect_true(all(p1_result$FragmentIon == "frag3"))

p2_result <- result[result$ProteinName == "P2", ]
expect_equal(nrow(p2_result), 2)
expect_true(all(p2_result$FragmentIon == "fragB"))

# Cleanup
file.remove(input_file)
if (file.exists(output_file)) file.remove(output_file)
})

test_that("bigSpectronauttoMSstatsFormat works correctly", {
# Stub reduceBigSpectronaut so the test does not depend on a real Spectronaut file
stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) {
msstats_data <- data.frame(
ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2,
FragmentIon = rep(c("frag1", "frag2"), each = 2), ProductCharge = 1,
IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 2),
Run = rep(c("run1", "run2"), 2), Intensity = c(1000, 1100, 2000, 2100) # frag2 is higher
)
readr::write_csv(msstats_data, output_path)
})

input_file <- "dummy_spectro_input.csv"
output_file <- "spectro_output.csv"

processed <- bigSpectronauttoMSstatsFormat(
input_file = input_file,
output_file_name = output_file,
backend = "arrow",
max_feature_count = 1
)
result <- dplyr::collect(processed)

# The mock reduce function creates a file with 2 features for P1.
# max_feature_count = 1 should select frag2.
expect_equal(nrow(result), 2)
expect_true(all(result$FragmentIon == "frag2"))

# Cleanup
if (file.exists(output_file)) file.remove(output_file)
if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file))
})
Loading