antonvsdata
diff --git a/‎DESCRIPTION‎
Lines changed: 6 additions & 5 deletions b/‎DESCRIPTION‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎R/batch_correction.R‎
Lines changed: 2 additions & 36 deletions b/‎R/batch_correction.R‎
Lines changed: 2 additions & 36 deletions
diff --git a/‎R/class_constructor.R‎
Lines changed: 74 additions & 25 deletions b/‎R/class_constructor.R‎
Lines changed: 74 additions & 25 deletions
diff --git a/‎R/logging.R‎
Lines changed: 11 additions & 8 deletions b/‎R/logging.R‎
Lines changed: 11 additions & 8 deletions
@@ -1,11 +1,12 @@
 Package: notame
 Type: Package
 Title: Workflow for non-targeted LC-MS metabolic profiling
-Version: 0.0.11
+Version: 0.1.0
 Authors@R: c(
     person("Anton", "Klåvus", email = "anton.klavus@iki.fi", role = c("aut", "cre")),
     person("Jussi", "Paananen", role = "aut"),
-    person("Oskari", "Timonen", role = "aut"))
+    person("Oskari", "Timonen", role = "aut"),
+    person("Atte", "Lihtamo", role = "aut"))
 Description: Automates common preprocessing steps in a LC-MS metabolomics workflow
     such as drift correction, quality control and common visualizations.
 License: MIT + file LICENSE
@@ -16,14 +17,15 @@ Depends:
     R (>= 3.5),
     Biobase,
     BiocGenerics,
+    futile.logger,
     ggplot2,
     magrittr
 Imports:
     dplyr,
     foreach,
-    futile.logger,
     grDevices,
     methods,
+    openxlsx,
     tibble,
     tidyr
 Suggests:
@@ -42,7 +44,6 @@ Suggests:
     lmerTest,
     missForest,
     mixOmics,
-    openxlsx,
     pcaMethods,
     PK,
     randomForest,
@@ -52,5 +53,5 @@ Suggests:
     RUVSeq,
     supraHex,
     testthat
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.1
 VignetteBuilder: knitr
@@ -8,6 +8,7 @@ export(align_batches)
 export(assess_quality)
 export(assign_cluster_id)
 export(citations)
+export(clean_stats_results)
 export(cluster_features)
 export(cohens_d)
 export(combined_data)
@@ -74,6 +75,7 @@ export(perform_logistic)
 export(perform_oneway_anova)
 export(perform_paired_t_test)
 export(perform_pairwise_t_test)
+export(perform_permanova)
 export(perform_repeatability)
 export(perform_t_test)
 export(plot_dendrogram)
 
@@ -1,50 +1,16 @@
 #' Batch correction
 #'
-#' "Basic" batch correction by median? from BatchCorrMetabolomics::doBC
+#' DEPRECATED: this function no longer performs batch correction, calling it always throws an error.
 #'
 #' @param object a MetaboSet object
 #' @param batch the column name for batch labels
 #' @param ref the column name for reference sample labels
 #' @param ref_label the label for reference samples
 #' @param ... other parameters pased to doBC
 #'
-#' @return a MetaboSet object with the corrected abundances
-#'
-#' @examples
-#' \dontrun{
-#' batch_corrected <- dobc(merged_sample, batch = "Batch", ref = "QC", ref_label = "QC")
-#' # Evaluate batch correction
-#' pca_bhattacharyya_dist(merged_sample, batch = "Batch")
-#' pca_bhattacharyya_dist(batch_corrected, batch = "Batch")
-#' }
 #' @export
 dobc <- function(object, batch, ref, ref_label, ...) {
-
-  if (!requireNamespace("BatchCorrMetabolomics", quietly = TRUE)) {
-    stop("Package \"BatchCorrMetabolomics\" needed for this function to work. Please install it from
-         https://github.com/rwehrens/BatchCorrMetabolomics.",
-         call. = FALSE)
-  }
-  add_citation("BatchCorrMetabolomics was used for batch correction:", citation("BatchCorrMetabolomics"))
-
-  ref_idx <- which(pData(object)[, ref] == ref_label)
-  seq_idx <- object$Injection_order
-  batch_idx <- pData(object)[, batch]
-
-  batch_corrected <- foreach::foreach(feature = featureNames(object), .combine = rbind) %dopar% {
-    tmp <- BatchCorrMetabolomics::doBC(Xvec = exprs(object)[feature, ],
-                                       ref.idx = ref_idx,
-                                       batch.idx = batch_idx,
-                                       seq.idx = seq_idx,
-                                       minBsamp = 1,
-                                       method = "lm",
-                                       correctionFormula = "X ~ B")
-    matrix(tmp, nrow = 1, dimnames = list(feature, names(tmp)))
-  }
-
-  exprs(object) <- batch_corrected
-
-  object
+  stop("This function is deprecated.")  # unconditional: the doBC-based implementation was removed, arguments are ignored
 }
 
 #' Remove Unwanted Variation
 
@@ -5,14 +5,14 @@ log_text_if <- function(text, logif) {
 }
 
 # Helper function for checking integrity of pheno data
-check_pheno_data <- function(x, id_prefix, log_messages = FALSE) {
+check_pheno_data <- function(x, id_prefix, id_column = NULL, log_messages = FALSE) {
   log_text_if("\nChecking sample information", log_messages)
 
   # Check that Injection order is included
   if (!"Injection_order" %in% colnames(x)) {
     stop('"Injection_order" not found for the samples')
   }
-  # No NAs allowd in Injection order
+  # No NAs allowed in Injection order
   if (any(is.na(x$Injection_order))) {
     stop("Missing values in Injection_order")
   }
@@ -31,7 +31,18 @@ check_pheno_data <- function(x, id_prefix, log_messages = FALSE) {
       warning("QC column not found and can not be generated. Please create one before constructing a MetaboSet object.")
     }
   }
-
+  # If id_column is provided and valid (present, unique, no NAs), copy it into a "Sample_ID" column
+  if (!is.null(id_column)) {
+    log_text_if("Checking provided sample ID column", log_messages)
+    if (!id_column %in% colnames(x)) {
+      log_text_if(paste0("ID column '", id_column, "' not found"), log_messages)
+    } else if (!any(duplicated(x[, id_column])) && !any(is.na(x[, id_column]))) {
+      x$Sample_ID <- x[, id_column]
+      log_text_if(paste0("Column 'Sample_ID' created from ", id_column), log_messages)
+    } else {
+      log_text_if("Provided sample ID column is not valid", log_messages)
+    }
+  }
   # If Sample_ID is not provided explicitly, it will be created
   if (!"Sample_ID" %in% colnames(x)) {
     x$Sample_ID <- paste0(id_prefix, x$Injection_order)
@@ -101,7 +112,7 @@ looks_numeric <- function(x) {
 
 # Check that all abundances look OK
 check_exprs <- function(exprs_, log_messages = FALSE) {
- log_text_if("Checking that feature abundances only contain numeric values", log_messages)
+  log_text_if("Checking that feature abundances only contain numeric values", log_messages)
   # Check that all rows are full of numbers
   non_numerics <- exprs_ %>%
     apply(1, function(x){!looks_numeric(x)})
@@ -114,16 +125,34 @@ check_exprs <- function(exprs_, log_messages = FALSE) {
   exprs_
 }
 
-check_feature_data <- function(feature_data, log_messages = FALSE) {
+check_feature_data <- function(feature_data, check_limits = TRUE, mz_limits = c(10, 2000), rt_limits = c(0, 20), log_messages = FALSE) {
+  log_text_if("\nChecking feature information", log_messages)
   log_text_if("Checking that feature IDs are unique and not stored as numbers", log_messages)
   fid <- feature_data$Feature_ID
   if (any(is.na(fid))) {
     stop("Missing values in Feature IDs")
   }
+  if (any(duplicated(fid))) {  # after the NA test, so repeated NAs are reported as missing, not as duplicates
+    stop("Feature_ID values are not unique")
+  }
   fid_num <- suppressWarnings(as.numeric(fid))
   if (any(!is.na(fid_num))) {
     stop("Numbers are not allowed as feature IDs")
   }
+  if (any(grepl("^[[:digit:]]", fid))) {  # grepl() coerces to character itself
+    stop("Feature IDs can not start with numbers")
+  }
+  if (check_limits) {
+    log_text_if("Checking that m/z and retention time values are reasonable", log_messages)
+    cols <- find_mz_rt_cols(feature_data)  # look the column names up once
+    mz <- feature_data[, cols$mz_col]
+    rt <- feature_data[, cols$rt_col]
+    # isTRUE() makes a missing m/z or retention time fail this check instead of handing NA to if()
+    if (!isTRUE(all(mz > mz_limits[1], mz < mz_limits[2], rt > rt_limits[1], rt < rt_limits[2]))) {
+      stop("Values in m/z or retention time columns are outside limits.")
+    }
+  }
+
   feature_data
 }
 
@@ -140,11 +169,14 @@ check_feature_data <- function(feature_data, log_messages = FALSE) {
 #'
 #' @param file path to the Excel file
 #' @param sheet the sheet number or name
+#' @param id_column character, column name for unique identification of samples
 #' @param corner_row integer, the bottom row of sample information, usually contains data file names and feature info column names. If set to NULL, will be detected automatically.
 #' @param corner_column integer or character, the corresponding column number or the column name (letter) in Excel. If set to NULL, will be detected automatically.
 #' @param id_prefix character, prefix for autogenerated sample IDs, see Details
 #' @param split_by character vector, in the case where all the modes are in the same Excel file, the column names of feature data used to separate the modes (usually Mode and Column)
 #' @param name in the case where the Excel file only contains one mode, the name of the mode, such as "Hilic_neg"
+#' @param mz_limits numeric vector of length two (lower, upper), all m/z values should be strictly between these
+#' @param rt_limits numeric vector of length two (lower, upper), all retention time values should be strictly between these
 #' @param skip_checks logical: skip checking data integrity. Not recommended, but sometimes useful when you
 #' just want to read the data in as is and fix errors later. NOTE: Sample_ID and QC columns will not be constructed.
 #' The data integrity checks need to be passed when contstructing MetaboSet objects.
@@ -162,16 +194,16 @@ check_feature_data <- function(feature_data, log_messages = FALSE) {
 #' The function will try to find columns for mass and retention time by looking at a few common alternatives,
 #' and throw an error if no matching column is found. Sample information needs to contain a row called "Injection_order",
 #' and the values need to be unique. In addition, a possible sample identifier row needs to be named "Sample_ID",
-#' and the values need to be unique, with an exception of QC samples: if there are any "QC" identifiers, they will
-#' be replaced with "QC_1", "QC_2" and so on. If a "Sample_ID" row is not found, it will be created using the \code{id_prefix}
-#' and injection order.
+#' or to be specified in \code{id_column}, and the values need to be unique, with an exception of QC samples:
+#' if there are any "QC" identifiers, they will be replaced with "QC_1", "QC_2" and so on.
+#' If a "Sample_ID" row is not found, it will be created using the \code{id_prefix} and injection order.
 #'
 #'
 #' @importFrom magrittr "%>%"
 #'
 #' @export
-read_from_excel <- function(file, sheet = 1, corner_row = NULL, corner_column = NULL,
-                            id_prefix = "ID_", split_by = NULL, name = NULL,
+read_from_excel <- function(file, sheet = 1, id_column = NULL, corner_row = NULL, corner_column = NULL,
+                            id_prefix = "ID_", split_by = NULL, name = NULL, mz_limits = c(10, 2000), rt_limits = c(0, 20),
                             skip_checks = FALSE) {
 
   if (!requireNamespace("openxlsx", quietly = TRUE)) {
@@ -232,10 +264,17 @@ read_from_excel <- function(file, sheet = 1, corner_row = NULL, corner_column =
 
   # If the file only contains one mode, add the mode name as Split column
   if (!is.null(name)) {
-    log_text(paste0("Assigning ", name, " as the value of the Split column for each feature"))
+    log_text(paste0("Assigning ", name,
+                    " as the value of the Split column for each feature"))
     feature_data$Split <- name
     split_by <- "Split"
   } else { # Multiple modes in the file, create Split column to separate modes
+    if (!all(split_by %in% colnames(feature_data))) {
+      stop(paste0("Couldn't find column(s): ",
+                  paste(split_by[!(split_by %in% colnames(feature_data))],
+                        collapse = ", ")
+      ))
+    }
     log_text(paste0("Creating Split column from ",
                paste0(split_by, collapse = ", ")))
     feature_data <- feature_data %>%
@@ -253,17 +292,25 @@ read_from_excel <- function(file, sheet = 1, corner_row = NULL, corner_column =
     best_classes() %>%
     dplyr::mutate_if(is.factor, as.character)
   rownames(feature_data) <- feature_data$Feature_ID
+  log_text("Replacing dots (.) in feature information column names with underscores (_)")
+  colnames(feature_data) <- gsub("[.]", "_", colnames(feature_data))
 
   # Extract LC-MS measurements as matrix
-  log_text(paste0("\nExtracting feature abundances from rows ", cr+1,  " to ", nrow(dada),
+  log_text(paste0("\nExtracting feature abundances from rows ", cr+1, " to ", nrow(dada),
              " and columns ", excel_columns[cc + 1], " to ", excel_columns[ncol(dada)]))
   exprs_ <- dada[(cr+1):nrow(dada), (cc+1):ncol(dada)]
 
   # Skip checks
   if (!skip_checks) {
-    pheno_data <- check_pheno_data(x = pheno_data, id_prefix = id_prefix, log_messages = TRUE)
+    pheno_data <- check_pheno_data(x = pheno_data, id_prefix = id_prefix,
+                                   id_column = id_column, log_messages = TRUE
+    )
     exprs_ <- check_exprs(exprs_, log_messages = TRUE)
-    feature_data <- check_feature_data(feature_data, log_messages = TRUE)
+    feature_data <- check_feature_data(feature_data,
+                                       mz_limits = mz_limits,
+                                       rt_limits = rt_limits,
+                                       log_messages = TRUE
+    )
   }
 
   rownames(exprs_) <- rownames(feature_data)
@@ -275,9 +322,9 @@ read_from_excel <- function(file, sheet = 1, corner_row = NULL, corner_column =
 # Helper function to search for mass and retention time column names
 find_mz_rt_cols <- function(feature_data) {
   # Find mass and retention time columns
-  mz_tags <- c("mass", "average mz", "average.mz", "molecularweight", "molecular weight")
+  mz_tags <- c("mass", "average mz", "average.mz", "molecularweight", "molecular weight", "average_mz")
   rt_tags <-  c("retention time", "retentiontime", "average rt[(]min[)]",
-                "average[.]rt[.]min[.]", "^rt$")
+                "average[_]rt[_]min[_]", "average[.]rt[.]min[.]", "^rt$")
 
   mz_col <- NULL
   for (tag in mz_tags) {
@@ -360,20 +407,22 @@ MetaboSet <- setClass("MetaboSet",
 
 setValidity("MetaboSet",
             function(object) {
-              if (!is.na(object@group_col) & !object@group_col %in% colnames(object@phenoData@data)) {
-                paste0("Column '", object@group_col, "' not found in pheno data")
-              } else if (!is.na(object@time_col) & !object@time_col %in% colnames(object@phenoData@data)) {
-                paste("Column", object@time_col, "not found in pheno data")
-              } else if (!is.na(object@subject_col) & !object@subject_col %in% colnames(object@phenoData@data)) {
-                paste("Column", object@subject_col, "not found in pheno data")
+              if (!is.na(group_col(object)) && !group_col(object) %in% colnames(pData(object))) {  # scalar test: short-circuit &&
+                return(paste0("Column '", group_col(object), "' not found in pheno data"))
+              } else if (!is.na(time_col(object)) && !time_col(object) %in% colnames(pData(object))) {
+                return(paste("Column", time_col(object), "not found in pheno data"))
+              } else if (!is.na(subject_col(object)) && !subject_col(object) %in% colnames(pData(object))) {
+                return(paste("Column", subject_col(object), "not found in pheno data"))
               } else if (!all(c("Injection_order", "Sample_ID", "QC") %in% colnames(pData(object)))) {
-                "Pheno data should contain columns Sample_ID, QC and Injection_order"
+                return("Pheno data should contain columns Sample_ID, QC and Injection_order")
+              } else if (any(is.na(pData(object)[, "QC"]))) {
+                return("QC column should not contain NAs")
               } else if (!"Flag" %in% colnames(fData(object))) {
-                "Flag column not found in fData"
+                return("Flag column not found in fData")
               } else {
                 x <- check_pheno_data(pData(object), id_prefix = "")
                 x <- check_exprs(exprs(object))
-                x <- check_feature_data(fData(object))
+                x <- check_feature_data(fData(object), check_limits = FALSE)  # m/z and RT limits are not enforced here
                 TRUE
               }
             })
 
@@ -4,23 +4,24 @@
 #' Initialize a log file with the current data and time.
 #' All major operations run after this will be logged to the specified file.
 #'
-#' @section Warning:
-#' This overwrites the current contents of the file
-#'
 #' @param log_file Path to the log file
 #'
 #' @examples
 #' file_name <- "~/log.txt"
 #' init_log(file_name)
 #' # Print the contents of the file
-#' scan(file_name, sep="\n", what = "chracter")
+#' scan(file_name, sep = "\n", what = "character")
 #'
 #' @seealso \code{\link{log_text}}, \code{\link{finish_log}}, \code{\link{log_state}}
 #'
 #' @export
 init_log <- function(log_file) {
   futile.logger::flog.appender(futile.logger::appender.tee(log_file), name = "notame")
-  log_text(paste0("Starting logging"))
+  log_text("Starting logging")
+  options(error = function() {  # pass errors to the log, then halt batch runs as R's default handler would
+    futile.logger::flog.error(geterrmessage(), name = "notame")
+    if (!interactive()) quit(save = "no", status = 1, runLast = FALSE)
+  })
 }
 
 #' Log text to the current log file
@@ -35,13 +36,13 @@ init_log <- function(log_file) {
 #' init_log(file_name)
 #' log_text("Hello World!")
 #' # Print the contents of the file
-#' scan(file_name, sep="\n", what = "chracter")
+#' scan(file_name, sep = "\n", what = "character")
 #'
 #' @seealso \code{\link{init_log}}, \code{\link{finish_log}}, \code{\link{log_state}}
 #'
 #' @export
 log_text <- function(text) {
-  futile.logger::flog.info(text)
+  futile.logger::flog.info(text, name = "notame")  # "notame" is the logger init_log() attaches the log file to
 }
 
 #' Finish a log
@@ -52,8 +53,10 @@ log_text <- function(text) {
 #'
 #' @export
 finish_log <- function() {
+  # Remove the error-logging handler installed by init_log (back to R's default, error = NULL)
+  options(error = NULL)
-  # Log end of session info
-  futile.logger::flog.info(paste("Finished analysis. ", date(), "\nSession info:\n", sep=""))
-  futile.logger::flog.info(capture.output(sessionInfo()))
-  futile.logger::flog.appender(futile.logger::appender.console(), name = "notame")
+  # Log end of session info to the "notame" logger, the one init_log and log_text use
+  futile.logger::flog.info(paste0("Finished analysis. ", date(), "\nSession info:\n"), name = "notame")
+  futile.logger::flog.info(capture.output(sessionInfo()), name = "notame")
+  invisible(futile.logger::flog.appender(futile.logger::appender.console(), name = "notame"))
 }