diff --git a/Microarray/Agilent_1-channel/Array_Annotations/Agilent_array_annotations.csv b/Microarray/Agilent_1-channel/Array_Annotations/Agilent_array_annotations.csv
new file mode 100644
index 00000000..072e0c5c
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Array_Annotations/Agilent_array_annotations.csv
@@ -0,0 +1,2 @@
+array_design,annot_type,annot_filename,download_link,download_date
+AGILENT SurePrint G3 GE 8x60k v3,agilent,072363_D_AA_20240521.txt,https://earray.chem.agilent.com/earray/array/displayViewArrayDesign.do?eArrayAction=view&arraydesignid=ADID40392,2024-11-15
diff --git a/Microarray/Agilent_1-channel/Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md b/Microarray/Agilent_1-channel/Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md
new file mode 100644
index 00000000..17a151fd
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md
@@ -0,0 +1,1515 @@
+# GeneLab bioinformatics processing pipeline for Agilent 1-channel microarray data
+
+> **This page holds an overview and instructions for how GeneLab processes Agilent 1-channel microarray datasets\*. Exact processing commands and GL-DPPD-7112 version used for specific datasets are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
+>
+
+---
+
+**Date:** March XX, 2025
+**Revision:** -A
+**Document Number:** GL-DPPD-7112-A
+
+**Submitted by:**
+Crystal Han (GeneLab Data Processing Team)
+
+**Approved by:**
+Samrawit Gebre (OSDR Project Manager)
+Lauren Sanders (OSDR Project Scientist)
+Amanda Saravia-Butler (GeneLab Science Lead)
+Barbara Novak (GeneLab Data Processing Lead)
+
+---
+
+## Updates from previous version
+
+Updated [Ensembl Reference Files](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) to the following releases:
+- Animals: Ensembl release 112
+- Plants: Ensembl plants release 59
+- Bacteria: Ensembl bacteria release 59
+
+Software Updates:
+
+| Program | Previous Version | New Version |
+|:--------|:-----------------|:---------------|
+|R|4.1.3|4.4.2|
+|DT|0.26|0.33|
+|dplyr|1.0.10|1.1.4|
+|stringr|1.5.0|1.5.1|
+|R.utils|2.12.2|2.12.3|
+|limma|3.50.3|3.62.2|
+|glue|1.6.2|1.8.0|
+|ggplot2|3.4.0|3.5.1|
+|biomaRt|2.50.0|2.62.0|
+|matrixStats|0.63.0|1.5.0|
+|dp_tools|1.3.4|1.3.5|
+|Quarto|1.2.313|1.6.40|
+
+Custom Annotations
+
+- Added the ability to use custom gene annotations when annotations are not available via Biomart or, for *Arabidopsis thaliana*, via the Ensembl Plants FTP; see [Step 7](#7-probe-annotations)
+
+---
+
+# Table of contents
+
+- [Software used](#software-used)
+- [General processing overview with example commands](#general-processing-overview-with-example-commands)
+ - [1. Create Sample RunSheet](#1-create-sample-runsheet)
+ - [2. Load Data](#2-load-data)
+ - [2a. Load Libraries and Define Input Parameters](#2a-load-libraries-and-define-input-parameters)
+ - [2b. Define Custom Functions](#2b-define-custom-functions)
+ - [2c. Load Metadata and Raw Data](#2c-load-metadata-and-raw-data)
+ - [2d. Load Annotation Metadata](#2d-load-annotation-metadata)
+ - [3. Raw Data Quality Assessment](#3-raw-data-quality-assessment)
+ - [3a. Density Plot](#3a-density-plot)
+ - [3b. Pseudo Image Plots](#3b-pseudo-image-plots)
+ - [3c. MA Plots](#3c-ma-plots)
+ - [3d. Foreground-Background Plots](#3d-foreground-background-plots)
+ - [3e. Boxplots](#3e-boxplots)
+ - [4. Background Correction](#4-background-correction)
+ - [5. Between Array Normalization](#5-between-array-normalization)
+ - [6. Normalized Data Quality Assessment](#6-normalized-data-quality-assessment)
+ - [6a. Density Plot](#6a-density-plot)
+ - [6b. Pseudo Image Plots](#6b-pseudo-image-plots)
+ - [6c. MA Plots](#6c-ma-plots)
+ - [6d. Boxplots](#6d-boxplots)
+ - [7. Probe Annotations](#7-probe-annotations)
+ - [7a. Get Probe Annotations](#7a-get-probe-annotations)
+ - [7b. Summarize Gene Mapping](#7b-summarize-gene-mapping)
+ - [7c. Generate Annotated Raw and Normalized Expression Tables](#7c-generate-annotated-raw-and-normalized-expression-tables)
+ - [8. Perform Probe Differential Expression (DE)](#8-perform-probe-differential-expression-de)
+ - [8a. Generate Design Matrix](#8a-generate-design-matrix)
+ - [8b. Perform Individual Probe Level DE](#8b-perform-individual-probe-level-de)
+ - [8c. Add Annotation and Stats Columns and Format DE Table](#8c-add-annotation-and-stats-columns-and-format-de-table)
+
+---
+
+# Software used
+
+|Program|Version|Relevant Links|
+|:------|:------:|:-------------|
+|R|4.4.2|[https://www.r-project.org/](https://www.r-project.org/)|
+|DT|0.33|[https://github.com/rstudio/DT](https://github.com/rstudio/DT)|
+|dplyr|1.1.4|[https://dplyr.tidyverse.org](https://dplyr.tidyverse.org)|
+|stringr|1.5.1|[https://stringr.tidyverse.org](https://stringr.tidyverse.org)|
+|R.utils|2.12.3|[https://github.com/HenrikBengtsson/R.utils](https://github.com/HenrikBengtsson/R.utils)|
+|limma|3.62.2|[https://bioconductor.org/packages/3.20/bioc/html/limma.html](https://bioconductor.org/packages/3.20/bioc/html/limma.html)|
+|glue|1.8.0|[https://glue.tidyverse.org](https://glue.tidyverse.org)|
+|ggplot2|3.5.1|[https://ggplot2.tidyverse.org](https://ggplot2.tidyverse.org)|
+|biomaRt|2.62.0|[https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html)|
+|matrixStats|1.5.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)|
+|statmod|1.5.0|[https://github.com/cran/statmod](https://github.com/cran/statmod)|
+|dp_tools|1.3.5|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
+|singularity|3.9|[https://sylabs.io](https://sylabs.io)|
+|Quarto|1.6.40|[https://quarto.org](https://quarto.org)|
+
+---
+
+# General processing overview with example commands
+
+> Exact processing commands and output files listed in **bold** below are included with each Microarray processed dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).
+
+---
+
+## 1. Create Sample RunSheet
+
+> Notes:
+> - Rather than running the commands below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAgilent1ch/examples/runsheet/README.md).
+>
+> - These command line tools are part of the [dp_tools](https://github.com/J-81/dp_tools) program.
+
+```bash
+### Download the *ISA.zip file from the GeneLab Repository ###
+
+dpt-get-isa-archive \
+ --accession OSD-###
+
+### Parse the metadata from the *ISA.zip file to create a sample runsheet ###
+
+dpt-isa-to-runsheet --accession OSD-### \
+ --config-type microarray \
+ --config-version Latest \
+ --isa-archive *ISA.zip
+```
+
+**Parameter Definitions:**
+
+- `--accession OSD-###` - OSD accession ID (replace ### with the OSD number being processed), used to retrieve the URLs for the ISA archive and raw expression files hosted on the GeneLab Repository
+- `--config-type` - Instructs the script to extract the metadata required for `microarray` processing from the ISA archive
+- `--config-version` - Specifies the `dp_tools` configuration version to use; a value of `Latest` will use the most recent version
+- `--isa-archive` - Specifies the *ISA.zip file for the respective GLDS dataset, downloaded in the `dpt-get-isa-archive` command
+
+
+**Input Data:**
+
+- No input files are required, but the OSD accession ID must be specified; it is used to download the respective ISA archive
+
+**Output Data:**
+
+- *ISA.zip (compressed ISA directory containing Investigation, Study, and Assay (ISA) metadata files for the respective OSD dataset, used to define sample groups - the *ISA.zip file is located in the [OSD repository](https://osdr.nasa.gov/bio/repo/search?q=&data_source=cgene,alsda&data_type=study) under 'Study Files' -> 'metadata')
+
+- **{OSD-Accession-ID}_microarray_v{version}_runsheet.csv** (table containing metadata required for processing, version denotes the dp_tools schema used to specify the metadata to extract from the ISA archive)
+
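+
+For orientation, the sketch below shows the runsheet columns consumed in later steps (the filename and factor are hypothetical; the full column set is defined by the [file specification](../Workflow_Documentation/NF_MAAgilent1ch/examples/runsheet/README.md)):
+
+```R
+# Hypothetical peek at a runsheet; values are illustrative, not from a real dataset
+df_rs <- read.csv("OSD-000_microarray_v0_runsheet.csv", check.names = FALSE)
+colnames(df_rs)
+#> [1] "Sample Name"  "organism"  "biomart_attribute"  "Factor Value[Spaceflight]"
+#> [5] "Array Data File Name"  "Array Data File Path"  ...
+```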
+
+
+---
+
+## 2. Load Data
+
+> Note: Steps 2 - 8 are done in R
+
+
+
+### 2a. Load Libraries and Define Input Parameters
+
+```R
+### Install R packages if not already installed ###
+
+install.packages("tidyverse")
+install.packages("R.utils")
+install.packages("glue")
+install.packages("matrixStats")
+install.packages("statmod")
+if (!require("BiocManager", quietly = TRUE))
+ install.packages("BiocManager")
+BiocManager::install(version = "3.20")
+BiocManager::install("limma")
+BiocManager::install("biomaRt")
+
+
+### Import libraries ###
+
+library(tidyverse)
+library(dplyr)
+library(stringr)
+library(R.utils)
+library(glue)
+library(matrixStats)
+library(limma)
+library(biomaRt)
+library(statmod)
+
+
+# Define path to runsheet
+runsheet <- "/path/to/runsheet/{OSD-Accession-ID}_microarray_v{version}_runsheet.csv"
+
+## Set up output structure
+
+# Output Constants
+DIR_RAW_DATA <- "00-RawData"
+DIR_NORMALIZED_EXPRESSION <- "01-limma_NormExp"
+DIR_DGE <- "02-limma_DGE"
+
+dir.create(DIR_RAW_DATA)
+dir.create(DIR_NORMALIZED_EXPRESSION)
+dir.create(DIR_DGE)
+```
+
+
+
+### 2b. Define Custom Functions
+
+#### all_true()
+
+ wraps the base R all() function; overrides its default behavior for an empty input vector by raising an error instead of returning TRUE
+
+ ```R
+ all_true <- function(i_vector) {
+ if ( length(i_vector) == 0 ) {
+ stop(paste("Input vector is length zero"))
+ }
+ all(i_vector)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `i_vector=` - a vector of logical values
+
+ **Returns:** a logical of length 1; `TRUE` if all values are true, `FALSE` otherwise; stops and returns an error if input vector is empty
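+
+ A quick illustration of the empty-vector guard (illustrative calls, not pipeline code):
+
+ ```R
+ all_true(c(TRUE, TRUE))   # TRUE
+ all_true(c(TRUE, FALSE))  # FALSE
+ all_true(logical(0))      # Error: Input vector is length zero
+ ```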
+
+
+#### runsheet_paths_are_URIs()
+
+ tests if paths provided in runsheet dataframe are URIs
+
+ ```R
+ runsheet_paths_are_URIs <- function(df_runsheet) {
+ all_true(stringr::str_starts(df_runsheet$`Array Data File Path`, "https"))
+ }
+ ```
+
+ **Custom Functions Used:**
+ - [all_true()](#all_true)
+
+ **Function Parameter Definitions:**
+ - `df_runsheet=` - a dataframe containing the sample runsheet information
+
+ **Returns:** a logical of length 1; `TRUE` if all values in the `Array Data File Path` of the runsheet start with "https", `FALSE` otherwise; stops and returns an error if input vector is empty
+
+
+#### download_files_from_runsheet()
+
+ downloads the raw data files
+
+ ```R
+ download_files_from_runsheet <- function(df_runsheet) {
+ urls <- df_runsheet$`Array Data File Path`
+ destinationFiles <- df_runsheet$`Array Data File Name`
+
+ mapply(function(url, destinationFile) {
+ print(paste0("Downloading from '", url, "' TO '", destinationFile, "'"))
+ if ( file.exists(destinationFile ) ) {
+ warning(paste( "Using Existing File:", destinationFile ))
+ } else {
+ download.file(url, destinationFile)
+ }
+ }, urls, destinationFiles)
+
+ destinationFiles # Return these paths
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `df_runsheet=` - a dataframe containing the sample runsheet information
+
+ **Returns:** a list of filenames that were downloaded; same as the `Array Data File Name` in the sample runsheet
+
+
+#### fetch_organism_specific_annotation_table()
+
+ determines the organism specific annotation file to use based on the provided organism name
+
+ ```R
+ fetch_organism_specific_annotation_table <- function(organism, annotation_table_link) {
+ # Uses the latest GeneLab annotations table to find the organism specific annotation file path and ensembl version
+ # Raises an exception if the organism does not have an associated annotation file or ensembl version yet
+
+ all_organism_table <- read.csv(annotation_table_link)
+
+ annotation_table <- all_organism_table %>% dplyr::filter(species == organism)
+
+ # Guard clause: Ensure annotation_table populated
+ # Else: raise exception for unsupported organism
+ if (nrow(annotation_table) == 0 || annotation_table$genelab_annots_link == "" || is.na(annotation_table$ensemblVersion)) {
+ stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: {annotation_table_link}. Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row and a version number in the 'ensemblVersion' column."))
+ }
+
+ return(annotation_table)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `organism=` - a string containing the name of the organism (as found in the species column of the GeneLab annotation table)
+ - `annotation_table_link=` - a string specifying the URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv)
+
+ **Returns:** a dataframe containing all rows in the GeneLab annotations file that match the specified organism
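+
+ A usage sketch (assumes `annotation_table_link` is defined as in [Step 2d](#2d-load-annotation-metadata); 'Mus musculus' is one of the supported species):
+
+ ```R
+ annotation_table <- fetch_organism_specific_annotation_table("Mus musculus", annotation_table_link)
+ annotation_table$genelab_annots_link # organism-specific GeneLab annotation file URL
+ annotation_table$ensemblVersion      # matching Ensembl release
+ ```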
+
+
+#### agilent_image_plot()
+
+ plots pseudo images of each array
+
+ ```R
+ agilent_image_plot <- function(eListRaw, transform_func = identity) {
+ # Adapted from this discussion: https://support.bioconductor.org/p/15523/
+ copy_raw_data <- eListRaw
+ copy_raw_data$genes$Block <- 1 # Agilent arrays only have one block
+ names(copy_raw_data$genes)[2] <- "Column"
+ copy_raw_data$printer <- limma::getLayout(copy_raw_data$genes)
+
+ r <- copy_raw_data$genes$Row
+ c <- copy_raw_data$genes$Column
+ nr <- max(r)
+ nc <- max(c)
+ y <- rep(NA,nr*nc)
+ i <- (r-1)*nc+c
+ for ( array_i in seq(colnames(copy_raw_data$E)) ) {
+ y[i] <- transform_func(copy_raw_data$E[,array_i])
+ limma::imageplot(y,copy_raw_data$printer, main = rownames(copy_raw_data$targets)[array_i])
+ }
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `eListRaw=` - R object containing expression matrix to be plotted
+ - `transform_func=identity` - function used to transform expression matrix before plotting
+
+ **Returns:** pseudo images of each array
+
+
+#### boxplot_expression_safe_margin()
+
+ plots boxplot of expression data for each array
+
+ ```R
+ boxplot_expression_safe_margin <- function(data, transform_func = identity, xlab = "Log2 Intensity") {
+ # Basic box plot
+ df_data <- as.data.frame(transform_func(data$E))
+ ggplot2::ggplot(stack(df_data), ggplot2::aes(x=values, y=ind)) +
+ ggplot2::geom_boxplot() +
+ ggplot2::scale_y_discrete(limits=rev) +
+ ggplot2::labs(y= "Sample Name", x = xlab)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `data=` - R object containing expression matrix to be plotted
+ - `transform_func=identity` - function used to transform expression matrix before plotting
+ - `xlab="Log2 Intensity"` - string containing x-axis label for the plot
+
+ **Returns:** boxplot of expression data for each array
+
+
+#### shortened_organism_name()
+
+ shortens organism names, for example 'Homo sapiens' to 'hsapiens'
+
+ ```R
+ shortened_organism_name <- function(long_name) {
+  #' Convert organism names like 'Homo sapiens' into 'hsapiens'
+ tokens <- long_name %>% stringr::str_split(" ", simplify = TRUE)
+ genus_name <- tokens[1]
+
+ species_name <- tokens[2]
+
+ short_name <- stringr::str_to_lower(paste0(substr(genus_name, start = 1, stop = 1), species_name))
+
+ return(short_name)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `long_name=` - a string containing the long name of the organism
+
+ **Returns:** a string containing the short name of the organism
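+
+ For example (illustrative calls):
+
+ ```R
+ shortened_organism_name("Mus musculus")         # "mmusculus"
+ shortened_organism_name("Arabidopsis thaliana") # "athaliana"
+ ```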
+
+
+#### get_biomart_attribute()
+
+ retrieves resolved biomart attribute source from runsheet dataframe
+
+ ```R
+ get_biomart_attribute <- function(df_rs) {
+ # check if runsheet has 'biomart_attribute' column
+ if ( !is.null(df_rs$`biomart_attribute`) ) {
+ print("Using attribute name sourced from runsheet")
+ # Format according to biomart needs
+ formatted_value <- unique(df_rs$`biomart_attribute`) %>%
+ stringr::str_replace_all(" ","_") %>% # Replace all spaces with underscore
+ stringr::str_to_lower() # Lower casing only
+ return(formatted_value)
+ } else {
+ stop("ERROR: Could not find 'biomart_attribute' in runsheet")
+ }
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `df_rs=` - a dataframe containing the sample runsheet information
+
+ **Returns:** a string containing the formatted value from the `biomart_attribute` column of the runsheet, with all spaces converted to underscores and uppercase converted to lowercase; if no `biomart_attribute` exists in the runsheet, stop and return an error
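+
+ For example, using the array design listed in [Agilent_array_annotations.csv](../Array_Annotations/Agilent_array_annotations.csv) (a one-row stand-in for the runsheet dataframe):
+
+ ```R
+ get_biomart_attribute(data.frame(biomart_attribute = "AGILENT SurePrint G3 GE 8x60k v3"))
+ # Returns: "agilent_sureprint_g3_ge_8x60k_v3"
+ ```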
+
+
+#### get_ensembl_genomes_mappings_from_ftp()
+
+ obtains the mapping table directly from the Ensembl FTP site; useful when the biomart live service no longer hosts the desired version
+
+ ```R
+ get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_portal, ensembl_genomes_version, biomart_attribute) {
+ request_url <- glue::glue("https://ftp.ebi.ac.uk/ensemblgenomes/pub/{ensembl_genomes_portal}/release-{ensembl_genomes_version}/mysql/{ensembl_genomes_portal}_mart_{ensembl_genomes_version}/{organism}_eg_gene__efg_{biomart_attribute}__dm.txt.gz")
+
+ print(glue::glue("Mappings file URL: {request_url}"))
+
+ # Create a temporary file name
+ temp_file <- tempfile(fileext = ".gz")
+
+ # Download the gzipped table file using the download.file function
+ download.file(url = request_url, destfile = temp_file, method = "libcurl") # Use 'libcurl' to support ftps
+
+ # Uncompress the file
+ uncompressed_temp_file <- tempfile()
+ gzcon <- gzfile(temp_file, "rt")
+ content <- readLines(gzcon)
+ writeLines(content, uncompressed_temp_file)
+ close(gzcon)
+
+
+ # Load the data into a dataframe
+ mapping <- read.table(uncompressed_temp_file, # Read the uncompressed file
+                       # Assign column names: MAPID, ensembl_gene_id, and the biomart attribute (probe IDs)
+ col.names = c("MAPID", "ensembl_gene_id", biomart_attribute),
+ header = FALSE, # No header in original table
+ sep = "\t") # Tab separated
+
+ # Clean up temporary files
+ unlink(temp_file)
+ unlink(uncompressed_temp_file)
+
+ return(mapping)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `organism=` - a string containing the name of the organism (formatted using `shortened_organism_name()`)
+ - `ensembl_genomes_portal=` - a string containing the name of the genomes portal, for example 'plants'
+ - `ensembl_genomes_version=` - a string containing the version of Ensembl to use
+ - `biomart_attribute=` - a string containing the biomart attribute (formatted using `get_biomart_attribute()`)
+
+ **Returns:** a dataframe containing the mapping between Ensembl ID and probe ID, as obtained via FTP
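+
+ A usage sketch mirroring the *Arabidopsis thaliana* branch in [Step 7a](#7a-get-probe-annotations) (the attribute name here is hypothetical):
+
+ ```R
+ df_mapping <- get_ensembl_genomes_mappings_from_ftp(
+   organism = "athaliana",
+   ensembl_genomes_portal = "plants",
+   ensembl_genomes_version = "59",
+   biomart_attribute = "agilent_sureprint_g3_ge_8x60k" # hypothetical attribute name
+ )
+ head(df_mapping) # columns: MAPID, ensembl_gene_id, <biomart_attribute>
+ ```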
+
+
+#### list_to_unique_piped_string()
+
+ converts character vector into string denoting unique elements separated by '|' characters
+
+ ```R
+ list_to_unique_piped_string <- function(str_list) {
+ #! Convert vector of multi-mapped genes to string separated by '|' characters
+  #! e.g. c("GO1","GO2","GO2","GO3") -> "GO1|GO2|GO3"
+ return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|"))
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `str_list=` - vector of character elements
+
+ **Returns:** a string containing the unique elements from `str_list` concatenated together, separated by '|' characters
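+
+ For example (illustrative call):
+
+ ```R
+ list_to_unique_piped_string(c("GO1", "GO2", "GO2", "GO3")) # "GO1|GO2|GO3"
+ ```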
+
+
+#### runsheet_to_design_matrix()
+
+ loads the GeneLab runsheet into a list of dataframes
+
+ ```R
+ runsheet_to_design_matrix <- function(runsheet_path) {
+ # Pull all factors for each sample in the study from the runsheet created in Step 1
+ df = read.csv(runsheet_path)
+ # get only Factor Value columns
+ factors = as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)])
+ colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_")
+
+ # Load metadata from runsheet csv file
+ compare_csv = data.frame(sample_id = df[,c("Sample.Name")], factors)
+
+ # Create data frame containing all samples and respective factors
+ study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
+ colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
+ rownames(study) <- compare_csv[,1]
+
+ # Format groups and indicate the group that each sample belongs to
+ if (dim(study)[2] >= 2){
+ group<-apply(study,1,paste,collapse = " & ") # concatenate multiple factors into one condition per sample
+ } else{
+ group<-study[,1]
+ }
+  group_names <- paste0("(", group, ")") # human readable group names
+  group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group))) # group naming compatible with R models; this maintains the default behaviour of make.names except that 'X' is never prepended to group names
+ names(group) <- group_names
+
+ # Format contrasts table, defining pairwise comparisons for all groups
+ contrast.names <- combn(levels(factor(names(group))),2) # generate matrix of pairwise group combinations for comparison
+ contrasts <- apply(contrast.names, MARGIN=2, function(col) sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", stringr::str_sub(col, 2, -2)))))
+  contrast.names <- c(paste(contrast.names[1,],contrast.names[2,],sep = "v"),paste(contrast.names[2,],contrast.names[1,],sep = "v")) # format combinations for output table file names
+ contrasts <- cbind(contrasts,contrasts[c(2,1),])
+ colnames(contrasts) <- contrast.names
+ sampleTable <- data.frame(condition=factor(group))
+ rownames(sampleTable) <- df[,c("Sample.Name")]
+
+ condition <- sampleTable[,'condition']
+ names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names))
+
+ design <- model.matrix(~ 0 + condition)
+ design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample.Name")], group = group_names) ), contrasts = contrasts )
+ return(design_data)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `runsheet_path=` - a string containing the path to the runsheet generated in [Step 1](#1-create-sample-runsheet)
+
+ **Returns:** a list of R objects containing the sample information and metadata
+ - `design_data$matrix` - a design (or model) matrix describing the conditions in the dataset
+ - `design_data$mapping` - a dataframe mapping the human-readable group names to the names of the conditions modified for use in R
+ - `design_data$groups` - a dataframe of group names and contrasts for each sample
+ - `design_data$contrasts` - a matrix of all pairwise comparisons of the groups
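+
+ An illustrative inspection of the returned list (assumes `runsheet` points at the [Step 1](#1-create-sample-runsheet) output):
+
+ ```R
+ design_data <- runsheet_to_design_matrix(runsheet)
+ head(design_data$matrix) # one indicator column per condition
+ design_data$contrasts    # every pairwise comparison, in both directions
+ ```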
+
+
+#### lm_fit_pairwise()
+
+ performs all pairwise comparisons using limma::lmFit()
+
+ ```R
+ lm_fit_pairwise <- function(norm_data, design) {
+ # Approach based on limma manual section 17.4 (version 3.52.4)
+ fit <- limma::lmFit(norm_data, design)
+
+ # Create Contrast Model
+ fit.groups <- colnames(fit$design)[which(fit$assign == 1)]
+ combos <- combn(fit.groups,2)
+ contrasts<-c(paste(combos[1,],combos[2,],sep = "-"),paste(combos[2,],combos[1,],sep = "-")) # format combinations for limma:makeContrasts
+ cont.matrix <- limma::makeContrasts(contrasts=contrasts,levels=design)
+ contrast.fit <- limma::contrasts.fit(fit, cont.matrix)
+
+ contrast.fit <- limma::eBayes(contrast.fit,trend=TRUE,robust=TRUE)
+ return(contrast.fit)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `norm_data=` - an R object containing log-ratios or log-expression values for a series of arrays, with rows corresponding to genes and columns to samples
+ - `design=` - the design matrix of the microarray experiment, with rows corresponding to samples and columns to coefficients to be estimated
+
+ **Returns:** an R object of class `MArrayLM`
+
+
+#### reformat_names()
+
+ reformats column names for consistency across DE analyses tables within GeneLab
+
+ ```R
+ reformat_names <- function(colname, group_name_mapping) {
+ new_colname <- colname %>%
+ stringr::str_replace(pattern = "^P.value.adj.condition", replacement = "Adj.p.value_") %>%
+ stringr::str_replace(pattern = "^P.value.condition", replacement = "P.value_") %>%
+ stringr::str_replace(pattern = "^Coef.condition", replacement = "Log2fc_") %>% # This is the Log2FC as per: https://rdrr.io/bioc/limma/man/writefit.html
+ stringr::str_replace(pattern = "^t.condition", replacement = "T.stat_") %>%
+ stringr::str_replace(pattern = "^Genes\\.", replacement = "") %>%
+ stringr::str_replace(pattern = ".condition", replacement = "v")
+
+ # remap to group names before make.names was applied
+ unique_group_name_mapping <- unique(group_name_mapping) %>% arrange(-nchar(safe_name))
+ for ( i in seq(nrow(unique_group_name_mapping)) ) {
+ safe_name <- unique_group_name_mapping[i,]$safe_name
+ original_name <- unique_group_name_mapping[i,]$original_name
+ new_colname <- new_colname %>% stringr::str_replace(pattern = stringr::fixed(safe_name), replacement = original_name)
+ }
+
+ return(new_colname)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `colname=` - a character vector containing the column names to reformat
+ - `group_name_mapping=` - a dataframe mapping the original human-readable group names to the R modified safe names
+
+ **Returns:** a character vector containing the formatted column names
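+
+ A worked example with hypothetical safe and original group names:
+
+ ```R
+ mapping <- data.frame(
+   safe_name = c("FLT", "GC"), # hypothetical R-safe condition names
+   original_name = c("(Flight)", "(Ground Control)")
+ )
+ reformat_names("Coef.conditionFLT.conditionGC", mapping)
+ # Returns: "Log2fc_(Flight)v(Ground Control)"
+ ```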
+
+
+#### generate_prefixed_column_order()
+
+ creates a vector of column names by combining each subject with the given prefixes; used to generate both contrast and group column names
+
+ ```R
+ generate_prefixed_column_order <- function(subjects, prefixes) {
+ # Track order of columns
+ final_order = c()
+
+ # For each contrast
+ for (subject in subjects) {
+ # Generate column names for each prefix and append to final_order
+ for (prefix in prefixes) {
+ final_order <- append(final_order, glue::glue("{prefix}{subject}"))
+ }
+ }
+ return(final_order)
+ }
+ ```
+
+ **Function Parameter Definitions:**
+ - `subjects` - a character vector containing subject strings to add prefixes to
+ - `prefixes` - a character vector of prefixes to add to the beginning of each subject string
+
+ **Returns:** a character vector with all possible combinations of prefix + subject
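+
+ For example (illustrative call):
+
+ ```R
+ generate_prefixed_column_order(subjects = c("A", "B"), prefixes = c("Group.Mean_", "Group.Stdev_"))
+ # Returns: "Group.Mean_A" "Group.Stdev_A" "Group.Mean_B" "Group.Stdev_B"
+ ```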
+
+
+
+
+### 2c. Load Metadata and Raw Data
+
+```R
+# fileEncoding = 'UTF-8-BOM' prevents a byte-order mark from corrupting the first column name
+df_rs <- read.csv(runsheet, check.names = FALSE, fileEncoding = 'UTF-8-BOM')
+
+if ( runsheet_paths_are_URIs(df_rs) ) {
+  print("Determined Raw Data Locations are URIs")
+ local_paths <- download_files_from_runsheet(df_rs)
+} else {
+  print("Determined Raw Data Locations are local paths")
+ local_paths <- df_rs$`Array Data File Path`
+}
+
+# uncompress files if needed
+if ( all_true(stringr::str_ends(local_paths, ".gz")) ) {
+ print("Determined these files are gzip compressed... uncompressing now")
+ # This does the uncompression
+ lapply(local_paths, R.utils::gunzip, remove = FALSE, overwrite = TRUE)
+ # This removes the .gz extension to get the uncompressed filenames
+ local_paths <- vapply(local_paths,
+ stringr::str_replace, # Run this function against each item in 'local_paths'
+                       FUN.VALUE = character(1), # Expect a character vector as the return value
+ USE.NAMES = FALSE, # Don't use the input to assign names for the returned list
+ pattern = ".gz$", # first argument for applied function
+ replacement = "" # second argument for applied function
+ )
+}
+
+df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths` = local_paths, check.names = FALSE)
+
+# Load raw data into R object
+raw_data <- limma::read.maimages(df_local_paths$`Local Paths`,
+ source = "agilent", # Specify platform
+ green.only = TRUE, # Specify one-channel design
+ names = df_local_paths$`Sample Name` # Map column names as Sample Names (instead of default filenames)
+ )
+
+# Handle raw data which lacks certain replaceable column data
+
+## This likely arises because Agilent Feature Extraction (the process that generates the raw data files on OSDR)
+## gives users some flexibility in which probe columns to output
+
+## Missing ProbeUID "Unique integer for each unique probe in a design"
+### Source: https://www.agilent.com/cs/library/usermanuals/public/GEN-MAN-G4460-90057.pdf Page 178
+### Remedy: Assign unique integers for each probe
+
+if ( !("ProbeUID" %in% colnames(raw_data$genes)) ) {
+ # Assign unique integers for each probe
+ print("Assigning `ProbeUID` as original files did not include them")
+ raw_data$genes$ProbeUID <- seq_len(nrow(raw_data$genes))
+}
+
+# Summarize raw data
+print(paste0("Number of Arrays: ", dim(raw_data)[2]))
+print(paste0("Number of Probes: ", dim(raw_data)[1]))
+```
+
+**Custom Functions Used:**
+
+- [all_true()](#all_true)
+- [runsheet_paths_are_URIs()](#runsheet_paths_are_URIs)
+- [download_files_from_runsheet()](#download_files_from_runsheet)
+
+**Input Data:**
+
+- `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet))
+
+**Output Data:**
+
+- `df_rs` (R dataframe containing information from the runsheet)
+- `raw_data` (R object containing raw microarray data)
+
+ > Note: The raw data R object will be used to generate quality assessment (QA) plots in the next step.
+
+
+
+### 2d. Load Annotation Metadata
+
+```R
+# If using custom annotation, local_annotation_dir is path to directory containing annotation file and annotation_config_path is path/url to config file
+local_annotation_dir <- NULL
+annotation_config_path <- NULL
+
+annotation_table_link <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv"
+
+annotation_table <- fetch_organism_specific_annotation_table(unique(df_rs$organism), annotation_table_link)
+
+annotation_file_path <- annotation_table$genelab_annots_link
+ensembl_version <- as.character(annotation_table$ensemblVersion)
+```
+
+**Custom Functions Used:**
+
+- [fetch_organism_specific_annotation_table()](#fetch_organism_specific_annotation_table)
+
+**Input Data:**
+
+- `local_annotation_dir` (Path to local annotation directory if using custom annotations, see [Step 7a](#7a-get-probe-annotations))
+
+ > Note: If not using custom annotations, leave `local_annotation_dir` as `NULL`.
+
+- `annotation_config_path` (URL or path to annotation config file if using custom annotations, see [Step 7a](#7a-get-probe-annotations))
+
+ > Note: If not using custom annotations, leave `annotation_config_path` as `NULL`.
+
+- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet))
+- `annotation_table_link` (URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv))
+
+**Output Data:**
+
+- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`)
+- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`)
+
+
+
+---
+
+## 3. Raw Data Quality Assessment
+
+
+
+### 3a. Density Plot
+
+```R
+# Plot settings
+par(
+ xpd = TRUE # Ensure legend can extend past plot area
+)
+
+number_of_sets = ceiling(dim(raw_data)[2] / 30) # Set of 30 samples, used to scale plot
+
+limma::plotDensities(raw_data,
+ log = TRUE,
+ legend = FALSE)
+legend("topright", legend = colnames(raw_data),
+ lty = 1, # Solid line
+ col = 1:ncol(raw_data), # Ensure legend color is in sync with plot
+ ncol = number_of_sets, # Set number of columns by number of sets
+ cex = max(0.5, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1, minimum of 0.5
+ )
+```
+
+**Input Data:**
+
+- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above)
+
+**Output Data:**
+
+- Plot containing the density of raw intensities for each array (raw intensity values with background intensity values subtracted; lack of overlap indicates a need for normalization)
+
+
+
+### 3b. Pseudo Image Plots
+
+```R
+agilent_image_plot(raw_data, transform_func = function(expression_matrix) log2(expression_matrix + 1))
+```
+
+**Custom Functions Used:**
+
+- [agilent_image_plot()](#agilent_image_plot)
+
+**Input Data:**
+
+- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above)
+
+**Output Data:**
+
+- Pseudo images of each array before background correction and normalization
+
+
+
+### 3c. MA Plots
+
+```R
+for ( array_i in seq(colnames(raw_data$E)) ) {
+ sample_name <- rownames(raw_data$targets)[array_i]
+ limma::plotMA(raw_data,array=array_i,xlab="Average log-expression",ylab="Expression log-ratio (this sample vs. others)", main = sample_name, status=raw_data$genes$ControlType)
+}
+```
+
+**Input Data:**
+
+- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above)
+
+**Output Data:**
+
+- M (log ratio of the subject array vs a pseudo-reference, the mean of all other arrays) vs. A (average log expression) plot for each array before background correction and normalization (negative and positive control probes are in green and red, respectively)
+
+
+
+### 3d. Foreground-Background Plots
+
+```R
+for ( array_i in seq(colnames(raw_data$E)) ) {
+ sample_name <- rownames(raw_data$targets)[array_i]
+ limma::plotFB(raw_data, array = array_i, xlab = "log2 Background", ylab = "log2 Foreground", main = sample_name)
+}
+```
+
+**Input Data:**
+
+- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above)
+
+**Output Data:**
+
+- Foreground vs. background expression plot for each array before background correction and normalization
+
+
+
+### 3e. Boxplots
+
+```R
+boxplot_expression_safe_margin(raw_data, transform_func = log2)
+```
+
+**Custom Functions Used:**
+
+- [boxplot_expression_safe_margin()](#boxplot_expression_safe_margin)
+
+**Input Data:**
+
+- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above)
+
+**Output Data:**
+
+- Boxplot of raw expression data for each array before background correction and normalization
+
+
+
+---
+
+## 4. Background Correction
+
+```R
+background_corrected_data <- limma::backgroundCorrect(raw_data, method = "normexp")
+```
+
+**Input Data:**
+
+- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above)
+
+**Output Data:**
+
+- `background_corrected_data` (R object containing background-corrected microarray data)
+
+ >
+  > Note: Background correction was performed using the `normexp` method, as recommended by [Ritchie, M.E., et al.](http://bioinformatics.oxfordjournals.org/content/23/20/2700), which performs background correction and quantile normalization using the control probes: the `normexp.fit.control` function estimates the parameters of the normal+exponential (normexp) convolution model with the help of the negative control probes, and the `normexp.signal` function then performs the background correction.
+
+
+
+---
+
+## 5. Between Array Normalization
+
+```R
+# Normalize background-corrected data using the quantile method
+norm_data <- limma::normalizeBetweenArrays(background_corrected_data, method = "quantile")
+
+# Summarize background-corrected and normalized data
+print(paste0("Number of Arrays: ", dim(norm_data)[2]))
+print(paste0("Number of Probes: ", dim(norm_data)[1]))
+```
+
+**Input Data:**
+
+- `background_corrected_data` (R object containing background-corrected microarray data created in [Step 4](#4-background-correction) above)
+
+**Output Data:**
+
+- `norm_data` (R object containing background-corrected and normalized microarray data)
+
+ >
+  > Note: Normalization was performed using the `quantile` method, which forces the entire empirical distribution of all arrays to be identical.
+
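+
+As a quick sanity check (a minimal sketch, not part of the pipeline), the sorted values of every array should coincide after quantile normalization, up to ties:
+
+```R
+sorted_values <- apply(norm_data$E, 2, sort) # sort each array's values
+stopifnot(max(abs(sorted_values - sorted_values[, 1])) < 1e-8) # every array shares one distribution
+```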
+
+
+---
+
+## 6. Normalized Data Quality Assessment
+
+
+
+### 6a. Density Plot
+
+```R
+# Plot settings
+par(
+ xpd = TRUE # Ensure legend can extend past plot area
+)
+
+number_of_sets = ceiling(dim(norm_data)[2] / 30) # Set of 30 samples, used to scale plot
+
+limma::plotDensities(norm_data,
+ log = TRUE,
+ legend = FALSE)
+legend("topright", legend = colnames(norm_data),
+ lty = 1, # Solid line
+ col = 1:ncol(norm_data), # Ensure legend color is in sync with plot
+ ncol = number_of_sets, # Set number of columns by number of sets
+ cex = max(0.5, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1, minimum of 0.5
+ )
+```
+
+**Input Data:**
+
+- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above)
+
+**Output Data:**
+
+- Plot containing the density of background-corrected and normalized intensities for each array (near complete overlap is expected after normalization)
+
+
+
+### 6b. Pseudo Image Plots
+
+```R
+agilent_image_plot(norm_data,
+ transform_func = function(expression_matrix) log2(2**expression_matrix + 1) # Compute as log2 of normalized expression after adding a +1 offset to prevent negative values in the pseudoimage
+ )
+```
+
+**Custom Functions Used:**
+
+- [agilent_image_plot()](#agilent_image_plot)
+
+**Input Data:**
+
+- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above)
+
+**Output Data:**
+
+- Pseudo images of each array after background correction and normalization
+
+
+
+### 6c. MA Plots
+
+```R
+for ( array_i in seq(colnames(norm_data$E)) ) {
+ sample_name <- rownames(norm_data$targets)[array_i]
+ limma::plotMA(norm_data,array=array_i,xlab="Average log-expression",ylab="Expression log-ratio (this sample vs. others)", main = sample_name, status=norm_data$genes$ControlType)
+}
+```
+
+**Input Data:**
+
+- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above)
+
+**Output Data:**
+
+- M (log ratio of the subject array vs a pseudo-reference, the mean of all other arrays) vs. A (average log expression) plot for each array after background correction and normalization (negative and positive control probes are in green and red, respectively)
+
+
+
+### 6d. Boxplots
+
+```R
+boxplot_expression_safe_margin(norm_data)
+```
+
+**Custom Functions Used:**
+
+- [boxplot_expression_safe_margin()](#boxplot_expression_safe_margin)
+
+**Input Data:**
+
+- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above)
+
+**Output Data:**
+
+- Boxplot of expression data for each array after background correction and normalization
+
+
+
+---
+
+## 7. Probe Annotations
+
+
+
+### 7a. Get Probe Annotations
+
+```R
+organism <- shortened_organism_name(unique(df_rs$organism))
+annot_key <- ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL')
+
+if (organism %in% c("athaliana")) {
+ ENSEMBL_VERSION = ensembl_version
+ ensembl_genomes_portal = "plants"
+ print(glue::glue("Using ensembl genomes ftp to get specific version of probe id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ENSEMBL_VERSION}"))
+ expected_attribute_name <- get_biomart_attribute(df_rs)
+ df_mapping <- get_ensembl_genomes_mappings_from_ftp(
+ organism = organism,
+ ensembl_genomes_portal = ensembl_genomes_portal,
+ ensembl_genomes_version = ENSEMBL_VERSION,
+ biomart_attribute = expected_attribute_name
+ )
+
+ # TAIR from the mapping tables tend to be in the format 'AT1G01010.1' but the raw data has 'AT1G01010'
+ # So here we remove the '.NNN' from the mapping table where .NNN is any number
+ df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "")
+
+ use_custom_annot <- FALSE
+} else {
+  # Use biomart from the main Ensembl website, whose archives keep each release available on the live service
+ # locate dataset
+ expected_dataset_name <- shortened_organism_name(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
+ print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
+
+ expected_attribute_name <- get_biomart_attribute(df_rs)
+ print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
+
+ # Specify Ensembl version used in current GeneLab reference annotations
+ ENSEMBL_VERSION <- ensembl_version
+
+ print(glue::glue("Using Ensembl biomart to get specific version of mapping table. Ensembl version: {ENSEMBL_VERSION}"))
+
+ # Check if organism/array design is supported in biomart
+ use_custom_annot <- TRUE
+
+ ensembl <- biomaRt::useEnsembl(biomart = "genes", version = ENSEMBL_VERSION)
+ ensembl_datasets <- biomaRt::listDatasets(ensembl)
+ if (expected_dataset_name %in% ensembl_datasets$dataset) {
+ ensembl <- biomaRt::useEnsembl(biomart = "genes", dataset = expected_dataset_name, version = ENSEMBL_VERSION)
+ ensembl_attributes <- biomaRt::listAttributes(ensembl)
+ if (expected_attribute_name %in% ensembl_attributes$name) {
+ use_custom_annot <- FALSE
+ }
+ }
+
+ if (use_custom_annot) {
+ unloadNamespace("biomaRt")
+ } else {
+ print(ensembl)
+
+ probe_ids <- unique(norm_data$genes$ProbeName)
+
+ # Create probe map
+ # Run Biomart Queries in chunks to prevent request timeouts
+    # Note: If timeouts occur (possibly due to heavier load on biomart), reduce the chunk size
+    CHUNK_SIZE <- 1500
+ probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
+ df_mapping <- data.frame()
+ for (i in seq_along(probe_id_chunks)) {
+ probe_id_chunk <- probe_id_chunks[[i]]
+      print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probe IDs in query: {length(probe_id_chunk)}"))
+ chunk_results <- biomaRt::getBM(
+ attributes = c(
+ expected_attribute_name,
+ "ensembl_gene_id"
+ ),
+ filters = expected_attribute_name,
+ values = probe_id_chunk,
+ mart = ensembl)
+
+ if (nrow(chunk_results) > 0) {
+ df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
+ }
+
+      Sys.sleep(10) # Brief pause between requests to avoid overloading the biomart service
+ }
+ }
+}
+
+# At this point, we have df_mapping from either the biomart live service or the ensembl genomes ftp archive depending on the organism
+# If no df_mapping obtained (e.g., not supported in biomart), use custom annotations; otherwise, merge in-house annotations to df_mapping
+
+if (use_custom_annot) {
+ expected_attribute_name <- 'ProbeName'
+
+ annot_type <- 'NO_CUSTOM_ANNOT'
+ if (!is.null(local_annotation_dir) && !is.null(annotation_config_path)) {
+ config_df <- read.csv(annotation_config_path, row.names=1)
+ if (unique(df_rs$`biomart_attribute`) %in% row.names(config_df)) {
+ annot_config <- config_df[unique(df_rs$`biomart_attribute`), ]
+ annot_type <- annot_config$annot_type[[1]]
+ } else {
+ warning(paste0("No entry for '", unique(df_rs$`biomart_attribute`), "' in provided config file: ", annotation_config_path))
+ }
+ } else {
+ warning("Need to provide both local_annotation_dir and annotation_config_path to use custom annotation.")
+ }
+
+ if (annot_type == 'agilent') {
+ unique_probe_ids <- read.delim(
+ file.path(local_annotation_dir, annot_config$annot_filename[[1]]),
+ header = TRUE, na.strings = c('NA', '')
+ )[c('ProbeID', 'EnsemblID', 'GeneSymbol', 'GeneName', 'RefSeqAccession', 'EntrezGeneID', 'GO')]
+
+ stopifnot(nrow(unique_probe_ids) == length(unique(unique_probe_ids$ProbeID)))
+
+ # Clean columns
+ unique_probe_ids$GO <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$GO, 'GO:\\d{7}'), ~paste0(unique(.), collapse = '|')) %>% stringr::str_replace('^$', NA_character_)
+
+ names(unique_probe_ids) <- c('ProbeName', 'ENSEMBL', 'SYMBOL', 'GENENAME', 'REFSEQ', 'ENTREZID', 'GOSLIM_IDS')
+
+ unique_probe_ids$STRING_id <- NA_character_
+
+ gene_col <- 'ENSEMBL'
+ if (sum(!is.na(unique_probe_ids$ENTREZID)) > sum(!is.na(unique_probe_ids$ENSEMBL))) {
+ gene_col <- 'ENTREZID'
+ }
+ if (sum(!is.na(unique_probe_ids$SYMBOL)) > max(sum(!is.na(unique_probe_ids$ENTREZID)), sum(!is.na(unique_probe_ids$ENSEMBL)))) {
+ gene_col <- 'SYMBOL'
+ }
+
+ unique_probe_ids <- unique_probe_ids %>%
+ dplyr::mutate(
+ count_gene_mappings = 1 + stringr::str_count(get(gene_col), stringr::fixed("|")),
+ gene_mapping_source = gene_col
+ )
+ } else if (annot_type == 'custom') {
+ unique_probe_ids <- read.csv(
+ file.path(local_annotation_dir, annot_config$annot_filename[[1]]),
+ header = TRUE, na.strings = c('NA', '')
+ )
+ } else {
+ annot_cols <- c('ProbeName', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS', 'STRING_id', 'count_gene_mappings', 'gene_mapping_source')
+ unique_probe_ids <- setNames(data.frame(matrix(NA_character_, nrow = 1, ncol = length(annot_cols))), annot_cols)
+ }
+} else {
+ annot <- read.table(
+ as.character(annotation_file_path),
+ sep = "\t",
+ header = TRUE,
+ quote = "",
+ comment.char = ""
+ )
+
+ unique_probe_ids <- df_mapping %>%
+ dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probe ids treated as character type
+ dplyr::group_by(!!sym(expected_attribute_name)) %>%
+ dplyr::summarise(
+ ENSEMBL = list_to_unique_piped_string(ensembl_gene_id)
+ ) %>%
+ # Count number of ensembl IDS mapped
+ dplyr::mutate(
+ count_gene_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")),
+ gene_mapping_source = annot_key
+ ) %>%
+ dplyr::left_join(annot, by = c("ENSEMBL" = annot_key))
+}
+
+norm_data$genes <- norm_data$genes %>%
+ dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
+ dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>%
+ dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) )
+```
+
+**Custom Functions Used:**
+
+- [shortened_organism_name()](#shortened_organism_name)
+- [get_biomart_attribute()](#get_biomart_attribute)
+- [get_ensembl_genomes_mappings_from_ftp()](#get_ensembl_genomes_mappings_from_ftp)
+- [list_to_unique_piped_string()](#list_to_unique_piped_string)
+
+**Input Data:**
+
+- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet))
+- `df_rs$biomart_attribute` (array design biomart identifier specified in the runsheet created in [Step 1](#1-create-sample-runsheet))
+- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2d](#2d-load-annotation-metadata))
+- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2d](#2d-load-annotation-metadata))
+- `annot_key` (keytype to join annotation table and microarray probes, dependent on organism, e.g. *Mus musculus* uses 'ENSEMBL')
+- `local_annotation_dir` (path to local annotation directory if using custom annotations, output from [Step 2d](#2d-load-annotation-metadata))
+- `annotation_config_path` (URL or path to annotation config file if using custom annotations, output from [Step 2d](#2d-load-annotation-metadata))
+
+ > Note: See [Agilent_array_annotations.csv](../Array_Annotations/Agilent_array_annotations.csv) for the latest config file used at GeneLab. This file can also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAgilent1ch/examples/annotations/README.md).
+
+- `norm_data$genes` (Manufacturer's probe metadata, including probe IDs and sequence position gene annotations associated with the `norm_data` R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization))
+
+**Output Data:**
+
+- `unique_probe_ids` (R object containing probe ID to gene annotation mappings)
+- `norm_data$genes` (Probe metadata, updated to include gene annotations specified by [Biomart](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html) or custom annotations)
+
+
+
+### 7b. Summarize Gene Mapping
+
+```R
+# Pie Chart with Percentages
+slices <- c(
+  'Control probes' = nrow(norm_data$genes %>% dplyr::filter(ControlType != 0) %>% dplyr::distinct(ProbeName)),
+  'Unique Mapping' = nrow(norm_data$genes %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_gene_mappings == 1) %>% dplyr::distinct(ProbeName)),
+  'Multi Mapping' = nrow(norm_data$genes %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_gene_mappings > 1) %>% dplyr::distinct(ProbeName)),
+  'No Mapping' = nrow(norm_data$genes %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_gene_mappings == 0) %>% dplyr::distinct(ProbeName))
+)
+pct <- round(slices/sum(slices)*100)
+chart_names <- glue::glue("{names(slices)} ({slices})") # add counts to labels
+chart_names <- paste(chart_names, pct) # add percents to labels
+chart_names <- paste(chart_names,"%",sep="") # add % to labels
+pie(slices,labels = chart_names, col=rainbow(length(slices)),
+    main=glue::glue("Mapping to Primary Keytype\n {nrow(norm_data$genes %>% dplyr::distinct(ProbeName))} Total Unique Probes")
+    )
+
+original_mapping_count = nrow(norm_data$genes %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(ProbeName != SystematicName) %>% dplyr::distinct(ProbeName))
+print(glue::glue("Original Manufacturer Reported Mapping Count: {original_mapping_count}"))
+print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}"))
+```
+
+**Input Data:**
+
+- `norm_data$genes` (Probe metadata, updated to include gene annotations specified by [Biomart](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html) or custom annotations, output from [Step 7a](#7a-get-probe-annotations) above)
+
+**Output Data:**
+
+- A pie chart denoting the gene mapping rates for each unique probe ID
+- A printout denoting the count of unique mappings for both the original manufacturer mapping and the gene mapping
+
+
+
+### 7c. Generate Annotated Raw and Normalized Expression Tables
+
+```R
+## Reorder columns before saving to file
+ANNOTATIONS_COLUMN_ORDER = c(
+ annot_key,
+ "SYMBOL",
+ "GENENAME",
+ "REFSEQ",
+ "ENTREZID",
+ "STRING_id",
+ "GOSLIM_IDS"
+)
+
+PROBE_INFO_COLUMN_ORDER = c(
+ "ProbeUID",
+ "ProbeName",
+ "count_gene_mappings",
+ "gene_mapping_source"
+)
+SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name`
+
+## Generate raw intensity matrix that includes annotations
+raw_data_matrix <- background_corrected_data$genes %>%
+ dplyr::select(ProbeUID, ProbeName) %>%
+ dplyr::bind_cols(background_corrected_data$E) %>%
+ dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
+ dplyr::mutate( count_gene_mappings = ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>%
+ dplyr::mutate( gene_mapping_source = unique(unique_probe_ids$gene_mapping_source) )
+
+## Perform reordering
+FINAL_COLUMN_ORDER <- c(
+ ANNOTATIONS_COLUMN_ORDER,
+ PROBE_INFO_COLUMN_ORDER,
+ SAMPLE_COLUMN_ORDER
+ )
+
+raw_data_matrix <- raw_data_matrix %>%
+ dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
+
+write.csv(raw_data_matrix, file.path(DIR_RAW_DATA, "raw_intensities_GLmicroarray.csv"), row.names = FALSE)
+
+## Generate normalized expression matrix that includes annotations
+norm_data_matrix <- norm_data$genes %>%
+ dplyr::select(ProbeUID, ProbeName) %>%
+ dplyr::bind_cols(norm_data$E) %>%
+ dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
+ dplyr::mutate( count_gene_mappings = ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>%
+ dplyr::mutate( gene_mapping_source = unique(unique_probe_ids$gene_mapping_source) )
+
+norm_data_matrix <- norm_data_matrix %>%
+ dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
+
+write.csv(norm_data_matrix, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_GLmicroarray.csv"), row.names = FALSE)
+```
+
+**Input Data:**
+
+- `df_rs` (R dataframe containing information from the runsheet, output from [Step 2c](#2c-load-metadata-and-raw-data))
+- `annot_key` (keytype to join annotation table and microarray probes, dependent on organism, e.g. *Mus musculus* uses 'ENSEMBL', defined in [Step 7a](#7a-get-probe-annotations))
+- `background_corrected_data` (R object containing background-corrected microarray data, output from [Step 4](#4-background-correction))
+- `norm_data` (R object containing background-corrected and normalized microarray data, output from [Step 5](#5-between-array-normalization))
+- `unique_probe_ids` (R object containing probe ID to gene annotation mappings, output from [Step 7a](#7a-get-probe-annotations))
+
+**Output Data:**
+
+- **raw_intensities_GLmicroarray.csv** (table containing the background-corrected, unnormalized intensity values for each sample, including gene annotations)
+- **normalized_expression_GLmicroarray.csv** (table containing the background-corrected, normalized intensity values for each sample, including gene annotations)
+
+## 8. Perform Probe Differential Expression (DE)
+
+> Note: Run differential expression analysis only if there are at least 2 replicates per factor group.
+
+
+
+### 8a. Generate Design Matrix
+
+```R
+# Loading metadata from runsheet csv file
+design_data <- runsheet_to_design_matrix(runsheet)
+design <- design_data$matrix
+
+# Write SampleTable.csv and contrasts.csv file
+write.csv(design_data$groups, file.path(DIR_DGE, "SampleTable_GLmicroarray.csv"), row.names = FALSE)
+write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv"))
+```
+
+**Custom Functions Used:**
+
+- [runsheet_to_design_matrix()](#runsheet_to_design_matrix)
+
+**Input Data:**
+
+- `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet))
+
+**Output Data:**
+
+- `design_data` (a list of R objects containing the sample information and metadata
+ - `design_data$matrix` - a design (or model) matrix describing the conditions in the dataset
+ - `design_data$mapping` - a dataframe mapping the human-readable group names to the names of the conditions modified for use in R
+ - `design_data$groups` - a dataframe of group names and contrasts for each sample
+ - `design_data$contrasts` - a matrix of all pairwise comparisons of the groups)
+- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to)
+- **SampleTable_GLmicroarray.csv** (table containing samples and their respective groups)
+- **contrasts_GLmicroarray.csv** (table containing all pairwise comparisons)
+
+
+
+### 8b. Perform Individual Probe Level DE
+
+```R
+# Calculate results
+res <- lm_fit_pairwise(norm_data, design)
+
+# Print DE table, without filtering
+limma::write.fit(res, adjust = 'BH',
+ file = "INTERIM.csv",
+ row.names = FALSE,
+ quote = TRUE,
+ sep = ",")
+```
+
+**Custom Functions Used:**
+
+- [lm_fit_pairwise()](#lm_fit_pairwise)
+
+**Input Data:**
+
+- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization))
+- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, created in [Step 8a](#8a-generate-design-matrix) above)
+
+**Output Data:**
+
+- INTERIM.csv (Statistical values from individual probe level DE analysis, including:
+ - Log2fc between all pairwise comparisons
+ - T statistic for all pairwise comparison tests
+ - P value for all pairwise comparison tests)
+
+
+
+### 8c. Add Annotation and Stats Columns and Format DE Table
+
+```R
+## Reformat Table for consistency across DE analyses tables within GeneLab ##
+
+# Read in DE table
+df_interim <- read.csv("INTERIM.csv")
+
+print("Remove extra columns from final table")
+
+# These columns contain manufacturer-supplied data mapped to the probe ID and can be re-linked as needed
+colnames_to_remove = c(
+ "Genes.Row",
+ "Genes.Col",
+ "Genes.Start",
+ "Genes.Sequence",
+ "Genes.ControlType",
+ "Genes.GeneName",
+ "Genes.SystematicName",
+ "Genes.Description",
+ "AveExpr" # Replaced by 'All.mean' column
+)
+
+df_interim <- df_interim %>% dplyr::select(-any_of(colnames_to_remove))
+
+df_interim <- df_interim %>% dplyr::rename_with(reformat_names, .cols = matches('\\.condition|^Genes\\.'), group_name_mapping = design_data$mapping)
+
+
+# Concatenate expression values for each sample
+df_interim <- df_interim %>% dplyr::bind_cols(norm_data$E)
+
+
+## Add Group Wise Statistics ##
+
+# Group mean and standard deviations for normalized expression values are computed and added to the table
+
+unique_groups <- unique(design_data$groups$group)
+for ( i in seq_along(unique_groups) ) {
+ current_group <- unique_groups[i]
+  current_samples <- design_data$groups %>%
+ dplyr::group_by(group) %>%
+ dplyr::summarize(
+ samples = sort(unique(sample))
+ ) %>%
+ dplyr::filter(
+ group == current_group
+ ) %>%
+ dplyr::pull()
+
+ print(glue::glue("Computing mean and standard deviation for Group {i} of {length(unique_groups)}"))
+ print(glue::glue("Group: {current_group}"))
+ print(glue::glue("Samples in Group: '{toString(current_samples)}'"))
+
+ df_interim <- df_interim %>%
+ dplyr::mutate(
+ "Group.Mean_{current_group}" := rowMeans(dplyr::select(., all_of(current_samples))),
+ "Group.Stdev_{current_group}" := matrixStats::rowSds(as.matrix(dplyr::select(., all_of(current_samples)))),
+ ) %>%
+ dplyr::ungroup() %>%
+ as.data.frame()
+}
+
+df_interim <- df_interim %>%
+ dplyr::mutate(
+ "All.mean" := rowMeans(dplyr::select(., all_of(SAMPLE_COLUMN_ORDER))),
+ "All.stdev" := matrixStats::rowSds(as.matrix(dplyr::select(., all_of(SAMPLE_COLUMN_ORDER)))),
+ ) %>%
+ dplyr::ungroup() %>%
+ as.data.frame()
+
+STAT_COLUMNS_ORDER <- generate_prefixed_column_order(
+ subjects = colnames(design_data$contrasts),
+ prefixes = c(
+ "Log2fc_",
+ "T.stat_",
+ "P.value_",
+ "Adj.p.value_"
+ )
+ )
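+
+# Illustrative note: with hypothetical contrasts "FLTvGC" and "GCvFLT", the
+# vector above contains one column name per statistic and contrast, e.g.
+# "Log2fc_FLTvGC", "T.stat_FLTvGC", "P.value_FLTvGC", "Adj.p.value_FLTvGC"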
+ALL_SAMPLE_STATS_COLUMNS_ORDER <- c(
+ "All.mean",
+ "All.stdev",
+ "F",
+ "F.p.value"
+)
+
+GROUP_MEAN_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order(
+ subjects = unique(design_data$groups$group),
+ prefixes = c(
+ "Group.Mean_",
+ "Group.Stdev_"
+ )
+)
+
+FINAL_COLUMN_ORDER <- c(
+ ANNOTATIONS_COLUMN_ORDER,
+ PROBE_INFO_COLUMN_ORDER,
+ SAMPLE_COLUMN_ORDER,
+ STAT_COLUMNS_ORDER,
+ ALL_SAMPLE_STATS_COLUMNS_ORDER,
+ GROUP_MEAN_STDEV_COLUMNS_ORDER
+ )
+
+## Assert final column order includes all columns from original table
+if (!setequal(FINAL_COLUMN_ORDER, colnames(df_interim))) {
+ write.csv(FINAL_COLUMN_ORDER, "FINAL_COLUMN_ORDER.csv")
+ NOT_IN_DF_INTERIM <- paste(setdiff(FINAL_COLUMN_ORDER, colnames(df_interim)), collapse = ":::")
+ NOT_IN_FINAL_COLUMN_ORDER <- paste(setdiff(colnames(df_interim), FINAL_COLUMN_ORDER), collapse = ":::")
+ stop(glue::glue("Column reordering attempt resulted in different sets of columns than original. Names unique to 'df_interim': {NOT_IN_FINAL_COLUMN_ORDER}. Names unique to 'FINAL_COLUMN_ORDER': {NOT_IN_DF_INTERIM}."))
+}
+
+## Perform reordering
+df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
+
+# Save to file
+write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.csv"), row.names = FALSE)
+```
+
+**Custom Functions Used:**
+
+- [reformat_names()](#reformat_names)
+- [generate_prefixed_column_order()](#generate_prefixed_column_order)
+
+**Input Data:**
+
+- `design_data` (a list of R objects containing the sample information and metadata, output from [Step 8a](#8a-generate-design-matrix) above)
+- INTERIM.csv (Statistical values from individual probe level DE analysis, output from [Step 8b](#8b-perform-individual-probe-level-de) above)
+- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization))
+
+**Output Data:**
+
+- **differential_expression_GLmicroarray.csv** (table containing normalized expression for each sample, group statistics, Limma probe DE results for each pairwise comparison, and gene annotations)
+
+
+> All steps of the Microarray pipeline are performed in an R-based Quarto document, which is rendered as an html file (**NF_MAAgilent1ch_v\*_GLmicroarray.html**) and published in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/) for the respective dataset.
diff --git a/Microarray/Agilent_1-channel/README.md b/Microarray/Agilent_1-channel/README.md
index 79e11ec4..23fbb213 100644
--- a/Microarray/Agilent_1-channel/README.md
+++ b/Microarray/Agilent_1-channel/README.md
@@ -1,7 +1,7 @@
# GeneLab bioinformatics processing pipeline for Agilent 1-channel microarray data
-> **The document [`GL-DPPD-7112.md`](Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md) holds an overview and example commands for how GeneLab processes Agilent 1-channel microarray datasets. See the [Repository Links](#repository-links) descriptions below for more information. Processed data output files and processing code is provided for each GLDS dataset along with the processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
+> **The document [`GL-DPPD-7112-A.md`](Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md) holds an overview and example commands for how GeneLab processes Agilent 1-channel microarray datasets. See the [Repository Links](#repository-links) descriptions below for more information. Processed data output files and processing code are provided for each GLDS dataset along with the processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
---
@@ -20,6 +20,10 @@
- Contains instructions for installing and running the GeneLab MAAgilent1ch workflow
+* [**Array_Annotations**](Array_Annotations)
+
+  - Contains the custom annotations table used in the GeneLab NF_MAAgilent1ch workflow
+
---
**Developed by:**
Jonathan Oribello
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/CHANGELOG.md b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/CHANGELOG.md
index d4beb45b..4d7c4001 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/CHANGELOG.md
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [1.0.5](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_1.0.5/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch) - 2025-03-03
+
+### Added
+
+- Support for custom annotations, see [specification](examples/annotations/README.md)
+- Option to skip differential expression analysis (`--skipDE`) ([#104](https://github.com/nasa/GeneLab_Data_Processing/issues/104))
+
## [1.0.4](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_1.0.4/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch) - 2024-10-02
### Added
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/README.md b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/README.md
index 83298da5..9844cd84 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/README.md
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/README.md
@@ -4,7 +4,7 @@
### Implementation Tools
-The current GeneLab Agilent 1 Channel Microarray consensus processing pipeline (NF_MAAgilent1ch), [GL-DPPD-7112](../../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) to run all tools in containers. This workflow (NF_MAAgilent1ch) is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in Nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow.
+The current GeneLab Agilent 1 Channel Microarray consensus processing pipeline (NF_MAAgilent1ch), [GL-DPPD-7112-A](../../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) to run all tools in containers. This workflow (NF_MAAgilent1ch) is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in Nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow.
### Workflow & Subworkflows
@@ -14,7 +14,7 @@ The current GeneLab Agilent 1 Channel Microarray consensus processing pipeline (
---
The NF_MAAgilent1ch workflow is composed of three subworkflows as shown in the image above.
-Below is a description of each subworkflow and the additional output files generated that are not already indicated in the [GL-DPPD-7112 pipeline document](../../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md):
+Below is a description of each subworkflow and the additional output files generated that are not already indicated in the [GL-DPPD-7112-A pipeline document](../../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md):
1. **Analysis Staging Subworkflow**
@@ -25,7 +25,7 @@ Below is a description of each subworkflow and the additional output files gener
2. **Agilent 1 Channel Microarray Processing Subworkflow**
- Description:
- - This subworkflow uses the staged raw data and metadata parameters from the Analysis Staging Subworkflow to generate processed data using the [GL-DPPD-7112 pipeline](../../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md).
+ - This subworkflow uses the staged raw data and metadata parameters from the Analysis Staging Subworkflow to generate processed data using the [GL-DPPD-7112-A pipeline](../../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md).
1. **V&V Pipeline Subworkflow**
@@ -93,9 +93,9 @@ We recommend installing Singularity on a system wide level as per the associated
All files required for utilizing the NF_MAAgilent1ch GeneLab workflow for processing Agilent 1 Channel Microarray data are in the [workflow_code](workflow_code) directory. To get a copy of latest NF_MAAgilent1ch version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands:
```bash
-wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MAAgilent1ch_1.0.4/NF_MAAgilent1ch_1.0.4.zip
+wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MAAgilent1ch_1.0.5/NF_MAAgilent1ch_1.0.5.zip
-unzip NF_MAAgilent1ch_1.0.4.zip
+unzip NF_MAAgilent1ch_1.0.5.zip
```
@@ -104,7 +104,7 @@ unzip NF_MAAgilent1ch_1.0.4.zip
### 3. Run the Workflow
-While in the location containing the `NF_MAAgilent1ch_1.0.4` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAgilent1ch workflow:
+While in the location containing the `NF_MAAgilent1ch_1.0.5` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAgilent1ch workflow:
> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --ensemblVersion) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.
@@ -112,7 +112,7 @@ While in the location containing the `NF_MAAgilent1ch_1.0.4` directory that was
#### 3a. Approach 1: Run the workflow on a GeneLab Agilent 1 Channel Microarray dataset
```bash
-nextflow run NF_MAAgilent1ch_1.0.4/main.nf \
+nextflow run NF_MAAgilent1ch_1.0.5/main.nf \
-profile singularity \
--osdAccession OSD-548 \
--gldsAccession GLDS-548
@@ -125,7 +125,7 @@ nextflow run NF_MAAgilent1ch_1.0.4/main.nf \
> Note: Specifications for creating a runsheet manually are described [here](examples/runsheet/README.md).
```bash
-nextflow run NF_MAAgilent1ch_1.0.4/main.nf \
+nextflow run NF_MAAgilent1ch_1.0.5/main.nf \
-profile singularity \
--runsheetPath
```
@@ -134,7 +134,7 @@ nextflow run NF_MAAgilent1ch_1.0.4/main.nf \
**Required Parameters For All Approaches:**
-* `NF_MAAgilent1ch_1.0.4/main.nf` - Instructs Nextflow to run the NF_MAAgilent1ch workflow
+* `NF_MAAgilent1ch_1.0.5/main.nf` - Instructs Nextflow to run the NF_MAAgilent1ch workflow
* `-profile` - Specifies the configuration profile(s) to load, `singularity` instructs Nextflow to setup and use singularity for all software called in the workflow
@@ -166,7 +166,7 @@ nextflow run NF_MAAgilent1ch_1.0.4/main.nf \
All parameters listed above and additional optional arguments for the NF_MAAgilent1ch workflow, including debug related options that may not be immediately useful for most users, can be viewed by running the following command:
```bash
-nextflow run NF_MAAgilent1ch_1.0.4/main.nf --help
+nextflow run NF_MAAgilent1ch_1.0.5/main.nf --help
```
See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all nextflow workflows.
@@ -180,11 +180,11 @@ See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nex
All R code steps and output are rendered within a Quarto document yielding the following:
- Output:
- - NF_MAAgilent1ch_1.0.4.html (html report containing executed code and output including QA plots)
+ - NF_MAAgilent1ch_1.0.5.html (html report containing executed code and output including QA plots)
The outputs from the Analysis Staging and V&V Pipeline Subworkflows are described below:
-> Note: The outputs from the Agilent 1 Channel Microarray Processing Subworkflow are documented in the [GL-DPPD-7112.md](../../../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md) processing protocol.
+> Note: The outputs from the Agilent 1 Channel Microarray Processing Subworkflow are documented in the [GL-DPPD-7112-A.md](../../../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md) processing protocol.
**Analysis Staging Subworkflow**
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/examples/annotations/README.md b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/examples/annotations/README.md
new file mode 100644
index 00000000..10a81ae2
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/examples/annotations/README.md
@@ -0,0 +1,20 @@
+# Custom Annotations Specification
+
+## Description
+
+* If using custom gene annotations when processing Agilent 1-channel datasets through GeneLab's Agilent 1-channel processing pipeline, a csv config file must be provided as specified below.
+* See [Agilent_array_annotations.csv](../Array_Annotations/Agilent_array_annotations.csv) for the latest config file used at GeneLab.
+
+
+## Example
+
+- [config.csv](config.csv)
+
+
+## Required columns
+
+| Column Name | Type | Description | Example |
+|:------------|:-----|:------------|:--------|
+| array_design | string | A bioMart attribute identifier denoting the microarray probe/probeset attribute used for annotation mapping. | AGILENT SurePrint G3 GE 8x60k v3 |
+| annot_type | string | Used to determine how the custom annotations are parsed before merging to the data. Currently, only the following are supported: <br> - `agilent`: Annotations file is expected to be in the AA (All Annotations) format from [Agilent](https://earray.chem.agilent.com/earray/) <br> - `custom`: Annotations file is merged as is and is expected to have the following columns: `ProbeName`, `ENTREZID`, `SYMBOL`, `GENENAME`, `ENSEMBL`, `REFSEQ`, `GOSLIM_IDS`, `STRING_id`, `count_gene_mappings`, `gene_mapping_source` | agilent |
+| annot_filename | string | Name of the custom annotations file. | 072363_D_AA_20240521.txt |
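+
+
+## Notes
+
+The workflow reads this config as a table keyed by `array_design` and selects the row matching the dataset's array design. A minimal sketch of that lookup, adapted from `Agile1CMP.qmd` (`annotation_config_path` and `local_annotation_dir` are the workflow parameters holding the config path and the annotation file directory):
+
+```R
+# Look up the annotation entry for an array design (illustrative sketch)
+config_df <- read.csv(annotation_config_path, row.names = 1)     # rows keyed by array_design
+annot_config <- config_df["AGILENT SurePrint G3 GE 8x60k v3", ]  # example array design from above
+annot_type <- annot_config$annot_type[[1]]                       # 'agilent' or 'custom'
+annot_file <- file.path(local_annotation_dir, annot_config$annot_filename[[1]])
+```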
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/examples/annotations/config.csv b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/examples/annotations/config.csv
new file mode 100644
index 00000000..3ff24708
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/examples/annotations/config.csv
@@ -0,0 +1,2 @@
+array_design,annot_type,annot_filename
+AGILENT SurePrint G3 GE 8x60k v3,agilent,072363_D_AA_20240521.txt
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/Agile1CMP.qmd b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/Agile1CMP.qmd
index 73e555d8..6a1166ed 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/Agile1CMP.qmd
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/Agile1CMP.qmd
@@ -1,6 +1,6 @@
---
title: "Agilent 1 Channel Processing"
-subtitle: "Workflow Version: NF_MAAgilent1ch_1.0.4"
+subtitle: "`r paste0('Workflow Version: NF_MAAgilent1ch_', params$workflow_version)`"
date: now
title-block-banner: true
format:
@@ -14,13 +14,16 @@ format:
number-sections: true
params:
+ workflow_version: NULL
id: NULL # str, used to name output files
runsheet: NULL # str, path to runsheet
biomart_attribute: NULL # str, used as a fallback value if 'Array Design REF' column is not found in the runsheet
- annotation_file_path: NULL # str, Annotation file from 'genelab_annots_link' column of https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv
- organism: NULL # str, Used to determine primary keytype
+ annotation_file_path: NULL # str, Annotation file from 'genelab_annots_link' column of GeneLab Annotations file
+ ensembl_version: NULL # str, Used to determine ensembl version
+ local_annotation_dir: NULL
+ annotation_config_path: NULL
DEBUG_limit_biomart_query: NULL # int, If supplied, only the first n probeIDs are queried
-
+ run_DE: 'true'
---
## Validate Parameters
@@ -34,6 +37,10 @@ if (is.null(params$runsheet)) {
runsheet = params$runsheet #
+# If using custom annotation, local_annotation_dir is path to directory containing annotation file and annotation_config_path is path/url to config file
+local_annotation_dir <- params$local_annotation_dir #
+annotation_config_path <- params$annotation_config_path #
+
message(params)
## Set up output structure
@@ -56,32 +63,12 @@ print("Loading Runsheet...") # NON_DPPD
# fileEncoding removes strange characters from the column names
df_rs <- read.csv(runsheet, check.names = FALSE, fileEncoding = 'UTF-8-BOM')
-## Determines the organism specific annotation file to use based on the organism in the runsheet
-fetch_organism_specific_annotation_file_path <- function(organism) {
- # Uses the GeneLab GL-DPPD-7110_annotations.csv file to find the organism specific annotation file path
- # Raises an exception if the organism does not have an associated annotation file yet
-
-
- all_organism_table <- read.csv("https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv")
-
- annotation_file_path <- all_organism_table %>% dplyr::filter(species == organism) %>% dplyr::pull(genelab_annots_link)
-
- # Guard clause: Ensure annotation_file_path populated
- # Else: raise exception for unsupported organism
- if (length(annotation_file_path) == 0) {
- stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv. Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row"))
- }
-
- return(annotation_file_path)
-}
-annotation_file_path <- fetch_organism_specific_annotation_file_path(unique(df_rs$organism))
-
# NON_DPPD:START
print("Here is the embedded runsheet")
DT::datatable(df_rs)
# NON_DPPD:END
print("Loading Raw Data...") # NON_DPPD
-allTrue <- function(i_vector) {
+all_true <- function(i_vector) {
if ( length(i_vector) == 0 ) {
stop(paste("Input vector is length zero"))
}
@@ -89,13 +76,13 @@ allTrue <- function(i_vector) {
}
# Define paths to raw data files
-runsheetPathsAreURIs <- function(df_runsheet) {
- allTrue(stringr::str_starts(df_runsheet$`Array Data File Path`, "https"))
+runsheet_paths_are_URIs <- function(df_runsheet) {
+ all_true(stringr::str_starts(df_runsheet$`Array Data File Path`, "https"))
}
# Download raw data files
-downloadFilesFromRunsheet <- function(df_runsheet) {
+download_files_from_runsheet <- function(df_runsheet) {
urls <- df_runsheet$`Array Data File Path`
destinationFiles <- df_runsheet$`Array Data File Name`
@@ -111,9 +98,9 @@ downloadFilesFromRunsheet <- function(df_runsheet) {
destinationFiles # Return these paths
}
-if ( runsheetPathsAreURIs(df_rs) ) {
+if ( runsheet_paths_are_URIs(df_rs) ) {
print("Determined Raw Data Locations are URIS")
- local_paths <- downloadFilesFromRunsheet(df_rs)
+ local_paths <- download_files_from_runsheet(df_rs)
} else {
print("Or Determined Raw Data Locations are local paths")
local_paths <- df_rs$`Array Data File Path`
@@ -121,7 +108,7 @@ if ( runsheetPathsAreURIs(df_rs) ) {
# uncompress files if needed
-if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) {
+if ( all_true(stringr::str_ends(local_paths, ".gz")) ) {
print("Determined these files are gzip compressed... uncompressing now")
# This does the uncompression
lapply(local_paths, R.utils::gunzip, remove = FALSE, overwrite = TRUE)
@@ -174,9 +161,12 @@ message(paste0("Number of Probes: ", dim(raw_data)[1])) # NON_DPPD
DT::datatable(raw_data$targets, caption = "Sample to File Mapping")
DT::datatable(head(raw_data$genes, n = 20), caption = "First 20 rows of raw data file embedded probes to genes table")
# NON_DPPD:END
+
+annotation_file_path <- params$annotation_file_path
+ensembl_version <- params$ensembl_version
```
-## QA For Raw Data
+## Raw Data Quality Assessment
### Density Plot
@@ -213,7 +203,7 @@ legend("topright", legend = colnames(raw_data),
#| layout-ncol: 2
#| column: screen-inset-right # Allow images to flow all the way to the right
#| fig-align: left
-agilentImagePlot <- function(eListRaw, transform_func = identity) {
+agilent_image_plot <- function(eListRaw, transform_func = identity) {
# Adapted from this discussion: https://support.bioconductor.org/p/15523/
copy_raw_data <- eListRaw
copy_raw_data$genes$Block <- 1 # Agilent arrays only have one block
@@ -232,7 +222,7 @@ agilentImagePlot <- function(eListRaw, transform_func = identity) {
}
}
-agilentImagePlot(raw_data, transform_func = function(expression_matrix) log2(expression_matrix + 1))
+agilent_image_plot(raw_data, transform_func = function(expression_matrix) log2(expression_matrix + 1))
```
### MA Plots
@@ -272,7 +262,7 @@ for ( array_i in seq(colnames(raw_data$E)) ) {
#| fig-align: left
#| fig-width: 14
#| fig-height: !expr max(3, ncol(raw_data) * 0.2)
-boxplotExpressionSafeMargin <- function(data, transform_func = identity, xlab = "Log2 Intensity") {
+boxplot_expression_safe_margin <- function(data, transform_func = identity, xlab = "Log2 Intensity") {
# NON_DPPD:START
#' plot boxplots of expression values
#'
@@ -287,7 +277,7 @@ boxplotExpressionSafeMargin <- function(data, transform_func = identity, xlab =
ggplot2::labs(y= "Sample Name", x = xlab)
}
-boxplotExpressionSafeMargin(raw_data, transform_func = log2)
+boxplot_expression_safe_margin(raw_data, transform_func = log2)
```
## Background Correction
@@ -353,7 +343,7 @@ legend("topright", legend = colnames(norm_data),
#| layout-ncol: 2
#| column: screen-inset-right # Allow images to flow all the way to the right
#| fig-align: left
-agilentImagePlot(norm_data,
+agilent_image_plot(norm_data,
transform_func = function(expression_matrix) log2(2**expression_matrix + 1) # Compute as log2 of normalized expression after adding a +1 offset to prevent negative values in the pseudoimage
)
```
@@ -380,20 +370,17 @@ for ( array_i in seq(colnames(norm_data$E)) ) {
#| fig-align: left
#| fig-width: 14
#| fig-height: !expr max(3, ncol(norm_data) * 0.2)
-boxplotExpressionSafeMargin(norm_data)
+boxplot_expression_safe_margin(norm_data)
```
+## Probe Annotations
-## Perform Probe Differential Expression
-
-### Probe Differential Expression (DE)
-
-#### Add Probe Annotations
+### Get Probe Annotations
``` {r retrieve-probe-annotations}
#| message: false
-shortenedOrganismName <- function(long_name) {
+shortened_organism_name <- function(long_name) {
#' Convert organism names like 'Homo Sapiens' into 'hsapiens'
tokens <- long_name %>% stringr::str_split(" ", simplify = TRUE)
genus_name <- tokens[1]
@@ -405,7 +392,7 @@ shortenedOrganismName <- function(long_name) {
return(short_name)
}
-getBioMartAttribute <- function(df_rs) {
+get_biomart_attribute <- function(df_rs) {
#' Returns resolved biomart attribute source from runsheet
# NON_DPPD:START
#' this either comes from the runsheet or as a fall back, the parameters injected during render
@@ -460,122 +447,205 @@ get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_port
return(mapping)
}
+# Convert list of multi-mapped genes to string
+list_to_unique_piped_string <- function(str_list) {
+ #! convert lists into strings denoting unique elements separated by '|' characters
+  #! e.g. c("GO1","GO2","GO2","GO3") -> "GO1|GO2|GO3"
+ return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|"))
+}
+
-organism <- shortenedOrganismName(unique(df_rs$organism))
+organism <- shortened_organism_name(unique(df_rs$organism))
+annot_key <- ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL')
if (organism %in% c("athaliana")) {
- ensembl_genomes_version = "54"
+ ENSEMBL_VERSION = ensembl_version
ensembl_genomes_portal = "plants"
- print(glue::glue("Using ensembl genomes ftp to get specific version of probe id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ensembl_genomes_version}"))
- expected_attribute_name <- getBioMartAttribute(df_rs)
+ print(glue::glue("Using ensembl genomes ftp to get specific version of probe id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ENSEMBL_VERSION}"))
+ expected_attribute_name <- get_biomart_attribute(df_rs)
df_mapping <- get_ensembl_genomes_mappings_from_ftp(
organism = organism,
ensembl_genomes_portal = ensembl_genomes_portal,
- ensembl_genomes_version = ensembl_genomes_version,
+ ensembl_genomes_version = ENSEMBL_VERSION,
biomart_attribute = expected_attribute_name
)
# TAIR from the mapping tables tend to be in the format 'AT1G01010.1' but the raw data has 'AT1G01010'
# So here we remove the '.NNN' from the mapping table where .NNN is any number
df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "")
+
+ use_custom_annot <- FALSE
} else {
# Use biomart from main Ensembl website which archives keep each release on the live service
# locate dataset
- expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
+ expected_dataset_name <- shortened_organism_name(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
message(paste0("Expected dataset name: '", expected_dataset_name, "'")) # NON_DPPD
+ expected_attribute_name <- get_biomart_attribute(df_rs)
+ print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
+ message(paste0("Expected attribute name: '", expected_attribute_name, "'")) # NON_DPPD
# Specify Ensembl version used in current GeneLab reference annotations
- ENSEMBL_VERSION <- '107'
+ ENSEMBL_VERSION <- ensembl_version
print(paste0("Searching for Ensembl Version: ", ENSEMBL_VERSION)) # NON_DPPD
print(glue::glue("Using Ensembl biomart to get specific version of mapping table. Ensembl version: {ENSEMBL_VERSION}"))
- ensembl <- biomaRt::useEnsembl(biomart = "genes",
- dataset = expected_dataset_name,
- version = ENSEMBL_VERSION)
- print(ensembl)
+ # Check if organism/array design is supported in biomart
+ use_custom_annot <- TRUE
- expected_attribute_name <- getBioMartAttribute(df_rs)
- print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
- message(paste0("Expected attribute name: '", expected_attribute_name, "'")) # NON_DPPD
+ ensembl <- biomaRt::useEnsembl(biomart = "genes", version = ENSEMBL_VERSION)
+ ensembl_datasets <- biomaRt::listDatasets(ensembl)
+ if (expected_dataset_name %in% ensembl_datasets$dataset) {
+ ensembl <- biomaRt::useEnsembl(biomart = "genes", dataset = expected_dataset_name, version = ENSEMBL_VERSION)
+ ensembl_attributes <- biomaRt::listAttributes(ensembl)
+ if (expected_attribute_name %in% ensembl_attributes$name) {
+ use_custom_annot <- FALSE
+ }
+ }
+
+ if (use_custom_annot) {
+ unloadNamespace("biomaRt")
+ } else {
+ print(ensembl)
- probe_ids <- unique(norm_data$genes$ProbeName)
+ probe_ids <- unique(norm_data$genes$ProbeName)
- # DEBUG:START
- if ( is.integer(params$DEBUG_limit_biomart_query) ) {
- warning(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
- message(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
- probe_ids <- probe_ids[1:params$DEBUG_limit_biomart_query]
- }
- # DEBUG:END
-
- # Create probe map
- # Run Biomart Queries in chunks to prevent request timeouts
- # Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size
- CHUNK_SIZE= 1500
- probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
- df_mapping <- data.frame()
- for (i in seq_along(probe_id_chunks)) {
- probe_id_chunk <- probe_id_chunks[[i]]
- print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})"))
- message(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})")) # NON_DPPD
- chunk_results <- biomaRt::getBM(
- attributes = c(
- expected_attribute_name,
- "ensembl_gene_id"
- ),
- filters = expected_attribute_name,
- values = probe_id_chunk,
- mart = ensembl)
-
- if (nrow(chunk_results) > 0) {
- df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
+ # DEBUG:START
+ if ( is.integer(params$DEBUG_limit_biomart_query) ) {
+ warning(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
+ message(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
+ probe_ids <- probe_ids[1:params$DEBUG_limit_biomart_query]
+ }
+ # DEBUG:END
+
+ # Create probe map
+ # Run Biomart Queries in chunks to prevent request timeouts
+    # Note: If timeout is occurring (possibly due to heavier load on biomart), reduce chunk size
+    CHUNK_SIZE <- 1500
+ probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
+ df_mapping <- data.frame()
+ for (i in seq_along(probe_id_chunks)) {
+ probe_id_chunk <- probe_id_chunks[[i]]
+      print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probe IDs in query ({length(probe_id_chunk)})"))
+      message(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probe IDs in query ({length(probe_id_chunk)})")) # NON_DPPD
+ chunk_results <- biomaRt::getBM(
+ attributes = c(
+ expected_attribute_name,
+ "ensembl_gene_id"
+ ),
+ filters = expected_attribute_name,
+ values = probe_id_chunk,
+ mart = ensembl)
+
+ if (nrow(chunk_results) > 0) {
+ df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
+ }
+
+ Sys.sleep(10) # Slight break between requests to prevent back-to-back requests
}
-
- Sys.sleep(10) # Slight break between requests to prevent back-to-back requests
}
}
# At this point, we have df_mapping from either the biomart live service or the ensembl genomes ftp archive depending on the organism
-```
+# If no df_mapping obtained (e.g., not supported in biomart), use custom annotations; otherwise, merge in-house annotations to df_mapping
-``` {r reformat-merge-probe-annotations}
+if (use_custom_annot) {
+ expected_attribute_name <- 'ProbeName'
-# Convert list of multi-mapped genes to string
-listToUniquePipedString <- function(str_list) {
- #! convert lists into strings denoting unique elements separated by '|' characters
- #! e.g. c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3"
- return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|"))
-}
+ annot_type <- 'NO_CUSTOM_ANNOT'
+ if (!is.null(local_annotation_dir) && !is.null(annotation_config_path)) {
+ config_df <- read.csv(annotation_config_path, row.names=1)
+ if (unique(df_rs$`biomart_attribute`) %in% row.names(config_df)) {
+ annot_config <- config_df[unique(df_rs$`biomart_attribute`), ]
+ annot_type <- annot_config$annot_type[[1]]
+ } else {
+ warning(paste0("No entry for '", unique(df_rs$`biomart_attribute`), "' in provided config file: ", annotation_config_path))
+ }
+ } else {
+ warning("Need to provide both local_annotation_dir and annotation_config_path to use custom annotation.")
+ }
+
+ if (annot_type == 'agilent') {
+ unique_probe_ids <- read.delim(
+ file.path(local_annotation_dir, annot_config$annot_filename[[1]]),
+ header = TRUE, na.strings = c('NA', '')
+ )[c('ProbeID', 'EnsemblID', 'GeneSymbol', 'GeneName', 'RefSeqAccession', 'EntrezGeneID', 'GO')]
+
+ stopifnot(nrow(unique_probe_ids) == length(unique(unique_probe_ids$ProbeID)))
+
+ # Clean columns
+ unique_probe_ids$GO <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$GO, 'GO:\\d{7}'), ~paste0(unique(.), collapse = '|')) %>% stringr::str_replace('^$', NA_character_)
+
+ names(unique_probe_ids) <- c('ProbeName', 'ENSEMBL', 'SYMBOL', 'GENENAME', 'REFSEQ', 'ENTREZID', 'GOSLIM_IDS')
+
+ unique_probe_ids$STRING_id <- NA_character_
+
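+    # Choose the primary gene identifier for mapping statistics: default to
+    # ENSEMBL, but fall back to ENTREZID or SYMBOL when that column carries
+    # more non-NA probe-to-gene mappings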
+ gene_col <- 'ENSEMBL'
+ if (sum(!is.na(unique_probe_ids$ENTREZID)) > sum(!is.na(unique_probe_ids$ENSEMBL))) {
+ gene_col <- 'ENTREZID'
+ }
+ if (sum(!is.na(unique_probe_ids$SYMBOL)) > max(sum(!is.na(unique_probe_ids$ENTREZID)), sum(!is.na(unique_probe_ids$ENSEMBL)))) {
+ gene_col <- 'SYMBOL'
+ }
-unique_probe_ids <- df_mapping %>%
- # note: '!!sym(VAR)' syntax allows usage of variable 'VAR' in dplyr functions due to NSE. ref: https://dplyr.tidyverse.org/articles/programming.html # NON_DPPD
- dplyr::group_by(!!sym(expected_attribute_name)) %>%
- dplyr::summarise(
- ENSEMBL = listToUniquePipedString(ensembl_gene_id)
+ unique_probe_ids <- unique_probe_ids %>%
+ dplyr::mutate(
+ count_gene_mappings = 1 + stringr::str_count(get(gene_col), stringr::fixed("|")),
+ gene_mapping_source = gene_col
+ )
+ } else if (annot_type == 'custom') {
+ unique_probe_ids <- read.csv(
+ file.path(local_annotation_dir, annot_config$annot_filename[[1]]),
+ header = TRUE, na.strings = c('NA', '')
+ )
+ } else {
+ annot_cols <- c('ProbeName', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS', 'STRING_id', 'count_gene_mappings', 'gene_mapping_source')
+ unique_probe_ids <- setNames(data.frame(matrix(NA_character_, nrow = 1, ncol = length(annot_cols))), annot_cols)
+ }
+} else {
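+  # Read in the GeneLab reference annotation table for the organism
+  # (tab-delimited; path from the 'genelab_annots_link' column of the
+  # GeneLab annotations config)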
+ annot <- read.table(
+ as.character(annotation_file_path),
+ sep = "\t",
+ header = TRUE,
+ quote = "",
+ comment.char = ""
+ )
+
+ unique_probe_ids <- df_mapping %>%
+ # note: '!!sym(VAR)' syntax allows usage of variable 'VAR' in dplyr functions due to NSE. ref: https://dplyr.tidyverse.org/articles/programming.html # NON_DPPD
+ dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probe ids treated as character type
+ dplyr::group_by(!!sym(expected_attribute_name)) %>%
+ dplyr::summarise(
+ ENSEMBL = list_to_unique_piped_string(ensembl_gene_id)
+ ) %>%
+ # Count number of ensembl IDS mapped
+ dplyr::mutate(
+ count_gene_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")),
+ gene_mapping_source = annot_key
) %>%
- # Count number of ensembl IDS mapped
- dplyr::mutate(
- count_ENSEMBL_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|"))
- )
+ dplyr::left_join(annot, by = c("ENSEMBL" = annot_key))
+}
+```
+``` {r reformat-merge-probe-annotations}
norm_data$genes <- norm_data$genes %>%
dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
- dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) )
+ dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>%
+ dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) )
```
-### Summarize Biomart Mapping vs. Manufacturer Mapping
+### Summarize Gene Mapping
``` {r summarize-remapping-vs-original-mapping}
#| message = FALSE
# Pie Chart with Percentages
slices <- c(
'Control probes' = nrow(norm_data$gene %>% dplyr::filter(ControlType != 0) %>% dplyr::distinct(ProbeName)),
- 'Unique Mapping' = nrow(norm_data$gene %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_ENSEMBL_mappings == 1) %>% dplyr::distinct(ProbeName)),
- 'Multi Mapping' = nrow(norm_data$gene %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_ENSEMBL_mappings > 1) %>% dplyr::distinct(ProbeName)),
- 'No Mapping' = nrow(norm_data$gene %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_ENSEMBL_mappings == 0) %>% dplyr::distinct(ProbeName))
+ 'Unique Mapping' = nrow(norm_data$gene %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_gene_mappings == 1) %>% dplyr::distinct(ProbeName)),
+ 'Multi Mapping' = nrow(norm_data$gene %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_gene_mappings > 1) %>% dplyr::distinct(ProbeName)),
+ 'No Mapping' = nrow(norm_data$gene %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_gene_mappings == 0) %>% dplyr::distinct(ProbeName))
)
pct <- round(slices/sum(slices)*100)
chart_names <- names(slices)
@@ -583,20 +653,81 @@ chart_names <- glue::glue("{names(slices)} ({slices})") # add count to labels
chart_names <- paste(chart_names, pct) # add percents to labels
chart_names <- paste(chart_names,"%",sep="") # ad % to labels
pie(slices,labels = chart_names, col=rainbow(length(slices)),
- main=glue::glue("Biomart Mapping to Ensembl Primary Keytype\n {nrow(norm_data$gene %>% dplyr::distinct(ProbeName))} Total Unique Probes")
+ main=glue::glue("Mapping to Primary Keytype\n {nrow(norm_data$gene %>% dplyr::distinct(ProbeName))} Total Unique Probes")
)
original_mapping_rate = nrow(norm_data$gene %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(ProbeName != SystematicName) %>% dplyr::distinct(ProbeName))
print(glue::glue("Original Manufacturer Reported Mapping Count: {original_mapping_rate}"))
-print(glue::glue("Biomart Unique Mapping Count: {slices[['Unique Mapping']]}"))
+print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}"))
message(glue::glue("Original Manufacturer Reported Mapping Rate: {original_mapping_rate}")) # NON_DPPD
-message(glue::glue("Biomart Unique Mapping Rate: {slices[['Unique Mapping']]}")) # NON_DPPD
+message(glue::glue("Unique Mapping Rate: {slices[['Unique Mapping']]}")) # NON_DPPD
+```
+
+### Generate Annotated Raw and Normalized Expression Tables
+
+```{r save-tables}
+## Reorder columns before saving to file
+ANNOTATIONS_COLUMN_ORDER = c(
+ annot_key,
+ "SYMBOL",
+ "GENENAME",
+ "REFSEQ",
+ "ENTREZID",
+ "STRING_id",
+ "GOSLIM_IDS"
+)
+
+PROBE_INFO_COLUMN_ORDER = c(
+ "ProbeUID",
+ "ProbeName",
+ "count_gene_mappings",
+ "gene_mapping_source"
+)
+SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name`
+
+## Generate raw intensity matrix that includes annotations
+raw_data_matrix <- background_corrected_data$genes %>%
+ dplyr::select(ProbeUID, ProbeName) %>%
+ dplyr::bind_cols(background_corrected_data$E) %>%
+ dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
+ dplyr::mutate( count_gene_mappings = ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>%
+ dplyr::mutate( gene_mapping_source = unique(unique_probe_ids$gene_mapping_source) )
+
+## Perform reordering
+FINAL_COLUMN_ORDER <- c(
+ ANNOTATIONS_COLUMN_ORDER,
+ PROBE_INFO_COLUMN_ORDER,
+ SAMPLE_COLUMN_ORDER
+ )
+
+raw_data_matrix <- raw_data_matrix %>%
+ dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
+
+write.csv(raw_data_matrix, file.path(DIR_RAW_DATA, "raw_intensities_GLmicroarray.csv"), row.names = FALSE)
+
+## Generate normalized expression matrix that includes annotations
+norm_data_matrix <- norm_data$genes %>%
+ dplyr::select(ProbeUID, ProbeName) %>%
+ dplyr::bind_cols(norm_data$E) %>%
+ dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
+ dplyr::mutate( count_gene_mappings = ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>%
+ dplyr::mutate( gene_mapping_source = unique(unique_probe_ids$gene_mapping_source) )
+
+norm_data_matrix <- norm_data_matrix %>%
+ dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
+
+write.csv(norm_data_matrix, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_GLmicroarray.csv"), row.names = FALSE)
```
+## Perform Probe Differential Expression (DE)
+
### Generate Design Matrix
``` {r generate-design-matrix}
-runsheetToDesignMatrix <- function(runsheet_path) {
+#| include: !expr params$run_DE
+#| eval: !expr params$run_DE
+
+runsheet_to_design_matrix <- function(runsheet_path) {
df = read.csv(runsheet_path)
# get only Factor Value columns
factors = as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)])
@@ -639,7 +770,7 @@ runsheetToDesignMatrix <- function(runsheet_path) {
# Loading metadata from runsheet csv file
-design_data <- runsheetToDesignMatrix(runsheet)
+design_data <- runsheet_to_design_matrix(runsheet)
design <- design_data$matrix
# Write SampleTable.csv and contrasts.csv file
@@ -650,7 +781,10 @@ write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv"
### Perform Individual Probe Level DE
``` {r perform-probe-differential-expression}
-lmFitPairwise <- function(norm_data, design) {
+#| include: !expr params$run_DE
+#| eval: !expr params$run_DE
+
+lm_fit_pairwise <- function(norm_data, design) {
#' Perform all pairwise comparisons
#' Approach based on limma manual section 17.4 (version 3.52.4)
@@ -669,7 +803,7 @@ lmFitPairwise <- function(norm_data, design) {
}
# Calculate results
-res <- lmFitPairwise(norm_data, design)
+res <- lm_fit_pairwise(norm_data, design)
DT::datatable(limma::topTable(res)) # NON_DPPD
# Print DE table, without filtering
@@ -680,16 +814,34 @@ limma::write.fit(res, adjust = 'BH',
sep = ",")
```
-### Add Additional Columns and Format DE Table
+### Add Annotation and Stats Columns and Format DE Table
-``` {r add-additional-columns-and-format-de-table}
-#| warning: false
+``` {r save-de-table}
#| message: false
+#| include: !expr params$run_DE
+#| eval: !expr params$run_DE
## Reformat Table for consistency across DE analyses tables within GeneLab ##
# Read in DE table
df_interim <- read.csv("INTERIM.csv")
+print("Remove extra columns from final table")
+
+# These columns contain probe metadata mapped to the probe ID by the original manufacturer and can be re-linked as needed
+colnames_to_remove <- c(
+ "Genes.Row",
+ "Genes.Col",
+ "Genes.Start",
+ "Genes.Sequence",
+ "Genes.ControlType",
+ "Genes.GeneName",
+ "Genes.SystematicName",
+ "Genes.Description",
+ "AveExpr" # Replaced by 'All.mean' column
+)
+
+df_interim <- df_interim %>% dplyr::select(-any_of(colnames_to_remove))
+
# Reformat column names
reformat_names <- function(colname, group_name_mapping) {
# NON_DPPD:START
@@ -708,10 +860,7 @@ reformat_names <- function(colname, group_name_mapping) {
stringr::str_replace(pattern = "^P.value.condition", replacement = "P.value_") %>%
stringr::str_replace(pattern = "^Coef.condition", replacement = "Log2fc_") %>% # This is the Log2FC as per: https://rdrr.io/bioc/limma/man/writefit.html
stringr::str_replace(pattern = "^t.condition", replacement = "T.stat_") %>%
- stringr::str_replace(pattern = stringr::fixed("Genes.ProbeName"), replacement = "ProbeName") %>%
- stringr::str_replace(pattern = stringr::fixed("Genes.count_ENSEMBL_mappings"), replacement = "count_ENSEMBL_mappings") %>%
- stringr::str_replace(pattern = stringr::fixed("Genes.ProbeUID"), replacement = "ProbeUID") %>%
- stringr::str_replace(pattern = stringr::fixed("Genes.ENSEMBL"), replacement = "ENSEMBL") %>%
+ stringr::str_replace(pattern = "^Genes\\.", replacement = "") %>%
stringr::str_replace(pattern = ".condition", replacement = "v")
# remap to group names before make.names was applied
@@ -771,85 +920,14 @@ for ( i in seq_along(unique_groups) ) {
## Compute all sample mean and standard deviation
message(glue::glue("Computing mean and standard deviation for all samples"))
# NON_DPPD:END
-all_samples <- design_data$group %>% dplyr::pull(sample)
df_interim <- df_interim %>%
dplyr::mutate(
- "All.mean" := rowMeans(dplyr::select(., all_of(all_samples))),
- "All.stdev" := matrixStats::rowSds(as.matrix(dplyr::select(., all_of(all_samples)))),
+ "All.mean" := rowMeans(dplyr::select(., all_of(SAMPLE_COLUMN_ORDER))),
+ "All.stdev" := matrixStats::rowSds(as.matrix(dplyr::select(., all_of(SAMPLE_COLUMN_ORDER)))),
) %>%
dplyr::ungroup() %>%
as.data.frame()
-print("Remove extra columns from final table")
-
-# These columns are data mapped to column PROBEID as per the original Manufacturer and can be linked as needed
-colnames_to_remove = c(
- "Genes.Row",
- "Genes.Col",
- "Genes.Start",
- "Genes.Sequence",
- "Genes.ControlType",
- "Genes.ProbeName",
- "Genes.GeneName",
- "Genes.SystematicName",
- "Genes.Description",
- "AveExpr" # Replaced by 'All.mean' column
- # "Genes.count_ENSEMBL_mappings", Keep this
-)
-
-df_interim <- df_interim %>% dplyr::select(-any_of(colnames_to_remove))
-
-## Concatenate annotations for genes (for uniquely mapped probes) ##
-### Read in annotation table for the appropriate organism ###
-annot <- read.table(
- annotation_file_path,
- sep = "\t",
- header = TRUE,
- quote = "",
- comment.char = "",
- )
-
-# Join annotation table and uniquely mapped data
-
-# Determine appropriate keytype
-map_primary_keytypes <- c(
- 'Caenorhabditis elegans' = 'ENSEMBL',
- 'Danio rerio' = 'ENSEMBL',
- 'Drosophila melanogaster' = 'ENSEMBL',
- 'Rattus norvegicus' = 'ENSEMBL',
- 'Saccharomyces cerevisiae' = 'ENSEMBL',
- 'Homo sapiens' = 'ENSEMBL',
- 'Mus musculus' = 'ENSEMBL',
- 'Arabidopsis thaliana' = 'TAIR'
-)
-
-df_interim <- merge(
- annot,
- df_interim,
- by.x = map_primary_keytypes[[unique(df_rs$organism)]],
- by.y = "ENSEMBL",
- # ensure all original dge rows are kept.
- # If unmatched in the annotation database, then fill missing with NAN
- all.y = TRUE
- )
-
-## Reorder columns before saving to file
-ANNOTATIONS_COLUMN_ORDER = c(
- map_primary_keytypes[[unique(df_rs$organism)]],
- "SYMBOL",
- "GENENAME",
- "REFSEQ",
- "ENTREZID",
- "STRING_id",
- "GOSLIM_IDS"
-)
-
-PROBE_INFO_COLUMN_ORDER = c(
- "ProbeUID",
- "ProbeName",
- "count_ENSEMBL_mappings"
-)
-SAMPLE_COLUMN_ORDER <- all_samples
generate_prefixed_column_order <- function(subjects, prefixes) {
#' Return a vector of columns based on subject and given prefixes
#' Used for both contrasts and groups column name generation
@@ -882,26 +960,21 @@ ALL_SAMPLE_STATS_COLUMNS_ORDER <- c(
"F.p.value"
)
-GROUP_MEAN_COLUMNS_ORDER <- generate_prefixed_column_order(
- subjects = unique(design_data$groups$group),
- prefixes = c(
- "Group.Mean_"
- )
- )
-GROUP_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order(
+GROUP_MEAN_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order(
subjects = unique(design_data$groups$group),
prefixes = c(
+ "Group.Mean_",
"Group.Stdev_"
- )
)
+)
+
FINAL_COLUMN_ORDER <- c(
ANNOTATIONS_COLUMN_ORDER,
PROBE_INFO_COLUMN_ORDER,
SAMPLE_COLUMN_ORDER,
STAT_COLUMNS_ORDER,
ALL_SAMPLE_STATS_COLUMNS_ORDER,
- GROUP_MEAN_COLUMNS_ORDER,
- GROUP_STDEV_COLUMNS_ORDER
+ GROUP_MEAN_STDEV_COLUMNS_ORDER
)
## Assert final column order includes all columns from original table
@@ -917,74 +990,6 @@ df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
# Save to file
write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.csv"), row.names = FALSE)
-
-### Generate and export PCA table for GeneLab visualization plots
-## Only use positive expression values, negative values can make up a small portion ( < 0.5% ) of normalized expression values and cannot be log transformed
-exp_raw <- log2(norm_data$E) # negatives get converted to NA
-exp_raw <- na.omit(norm_data$E)
-PCA_raw <- prcomp(t(exp_raw), scale = FALSE)
-write.csv(PCA_raw$x,
- file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")
- )
-
-## Generate raw intensity matrix that includes annotations
-raw_data_matrix <- background_corrected_data$genes %>%
- dplyr::select(ProbeUID, ProbeName) %>%
- dplyr::bind_cols(background_corrected_data$E) %>%
- dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
- dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) )
-
-raw_data_matrix_annotated <- merge(
- annot,
- raw_data_matrix,
- by.x = map_primary_keytypes[[unique(df_rs$organism)]],
- by.y = "ENSEMBL",
- # ensure all original dge rows are kept.
- # If unmatched in the annotation database, then fill missing with NAN
- all.y = TRUE
- )
-
-## Perform reordering
-FINAL_COLUMN_ORDER <- c(
- ANNOTATIONS_COLUMN_ORDER,
- PROBE_INFO_COLUMN_ORDER,
- SAMPLE_COLUMN_ORDER
- )
-
-raw_data_matrix_annotated <- raw_data_matrix_annotated %>%
- dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
-
-write.csv(raw_data_matrix_annotated, file.path(DIR_RAW_DATA, "raw_intensities_GLmicroarray.csv"), row.names = FALSE)
-
-## Generate normalized expression matrix that includes annotations
-norm_data_matrix <- norm_data$genes %>%
- dplyr::select(ProbeUID, ProbeName) %>%
- dplyr::bind_cols(norm_data$E) %>%
- dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
- dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) )
-
-norm_data_matrix_annotated <- merge(
- annot,
- norm_data_matrix,
- by.x = map_primary_keytypes[[unique(df_rs$organism)]],
- by.y = "ENSEMBL",
- # ensure all original dge rows are kept.
- # If unmatched in the annotation database, then fill missing with NAN
- all.y = TRUE
- )
-
-
-## Perform reordering
-FINAL_COLUMN_ORDER <- c(
- ANNOTATIONS_COLUMN_ORDER,
- PROBE_INFO_COLUMN_ORDER,
- SAMPLE_COLUMN_ORDER
- )
-
-norm_data_matrix_annotated <- norm_data_matrix_annotated %>%
- dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
-
-write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_GLmicroarray.csv"), row.names = FALSE)
```
## Version Reporting
@@ -1053,11 +1058,11 @@ get_versions <- function() {
## Note Libraries that were NOT used during processing
versions_buffer <- get_versions()
-if (organism %in% c("athaliana")) {
+if (organism %in% c("athaliana") || use_custom_annot) {
versions_buffer <- glue::glue_collapse(c(
versions_buffer,
glue::glue("- name: biomaRt"),
- glue::glue(" version: (Not used for plant datasets)"),
+ glue::glue(" version: (Not used for this dataset)"),
glue::glue(" homepage: https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html"),
glue::glue(" workflow task: PROCESS_AGILE1CH")
), sep = "\n")
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/checks.py b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/checks.py
index 6e1d3167..707300f9 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/checks.py
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/checks.py
@@ -242,8 +242,8 @@ def utils_common_constraints_on_dataframe(
col_constraints = col_constraints.copy()
# limit to only columns of interest
- query_df = df[col_set]
- for (colname, colseries) in query_df.iteritems():
+ query_df = df[list(col_set)]
+ for (colname, colseries) in query_df.items():
# check non null constraint
if col_constraints.pop("nonNull", False) and nonNull(colseries) == False:
issues["Failed non null constraint"].append(colname)
@@ -321,7 +321,7 @@ def check_dge_table_sample_columns_constraints(
) -> FlagEntry:
MINIMUM_COUNT = 0
# data specific preprocess
- df_dge = pd.read_csv(dge_table)[samples]
+ df_dge = pd.read_csv(dge_table)[list(samples)]
schema = pa.DataFrameSchema({
sample: pa.Column(float)
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/config.yaml b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/config.yaml
index f581d824..de550d19 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/config.yaml
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/config.yaml
@@ -78,6 +78,7 @@ Staging:
- ISA Field Name:
- Label
- Parameter Value[label]
+ - Parameter Value[Label]
ISA Table Source: Sample
Runsheet Column Name: Label
Processing Usage: >-
@@ -259,16 +260,6 @@ data assets:
resource categories: *DGEAnalysisData
- viz PCA table:
- processed location:
- - *DGEDataDir
- - "visualization_PCA_table_GLmicroarray.csv"
-
- tags:
- - processed
-
- resource categories: *neverPublished
-
data asset sets:
# These assets are not generated in the workflow, but are generated after the workflow
PUTATIVE: []
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/protocol.py b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/protocol.py
index 6c0561bd..b3df9a71 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/protocol.py
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel/protocol.py
@@ -415,33 +415,6 @@ def validate(
""")
)
- with vp.component_start(
- name="Viz Tables",
- description="Extended from the dge tables",
- ):
- with vp.payload(
- payloads=[
- {
- "samples": lambda: set(dataset.samples),
- "pca_table": lambda: dataset.data_assets[
- "viz PCA table"
- ].path,
- }
- ]
- ):
- vp.add(
- bulkRNASeq.checks.check_viz_pca_table_index_and_columns_exist,
- full_description=textwrap.dedent(f"""
- - Check: Ensure all samples (row-indices) present and columns PC1, PC2 and PC3 are present
- - Reason:
- - PCA table should include all samples and PC1, PC2, PC3 (for 3D PCA viz)
- - Potential Source of Problems:
- - Bug in processing script
- - Flag Condition:
- - Green: All samples and all columns present
- - Halt: At least one sample or column is missing
- """)
- )
with vp.component_start(
name="Processing Report",
description="",
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/__init__.py b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/__init__.py
new file mode 100644
index 00000000..5faa427c
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/__init__.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+# Import for access at the module level
+from . import checks
+from . import protocol
+from . import schemas
+
+# Set config path
+config = Path(__file__).parent / "config.yaml"
\ No newline at end of file
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/checks.py b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/checks.py
new file mode 100644
index 00000000..707300f9
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/checks.py
@@ -0,0 +1,618 @@
+import string
+from pathlib import Path
+import logging
+import enum
+from typing import Union
+import itertools
+from statistics import mean
+
+from loguru import logger as log
+import pandas as pd
+import pandera as pa
+
+from dp_tools.core.check_model import FlagCode, FlagEntry, FlagEntryWithOutliers
+from dp_tools.core.entity_model import Dataset
+
+class GroupFormatting(enum.Enum):
+ r_make_names = enum.auto()
+ ampersand_join = enum.auto()
+
+def check_contrasts_table_headers(contrasts_table: Path, runsheet: Path) -> FlagEntry:
+ # data specific preprocess
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ df_contrasts = pd.read_csv(contrasts_table, index_col=0)
+
+ # check logic
+ differences = set(expected_comparisons).symmetric_difference(
+ set(df_contrasts.columns)
+ )
+ if not differences:
+ code = FlagCode.GREEN
+ message = f"Contrasts header includes expected comparisons as determined runsheet Factor Value Columns: {sorted(set(expected_comparisons))}"
+ else:
+ code = FlagCode.HALT
+ message = f"Contrasts header does not match expected comparisons as determined runsheet Factor Value Columns: {sorted(differences)}"
+ return {"code": code, "message": message}
+
+def check_metadata_attributes_exist(
+ dataset: Dataset, expected_attrs: list[str]
+) -> FlagEntry:
+ missing_metadata_fields = list(set(expected_attrs) - set(dataset.metadata))
+
+ # check if any missing_metadata_fields are present
+ # check logic
+ if not missing_metadata_fields:
+ code = FlagCode.GREEN
+ message = f"All expected metadata keys found: Expected {expected_attrs}, Found {sorted(set(dataset.metadata))}"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing dataset metadata (source from Runsheet): {sorted(missing_metadata_fields)}"
+ return {"code": code, "message": message}
+
+def check_raw_intensities_table(
+ raw_intensities: Path, samples: list[str]
+) -> FlagEntry:
+ schema = pa.DataFrameSchema(
+ columns = {sample: pa.Column(float, pa.Check.ge(0)) for sample in samples}
+ )
+
+ log.trace(schema)
+
+ df = pd.read_csv(raw_intensities)
+
+ try:
+ schema.validate(df, lazy=True)
+ error_message = None
+ except pa.errors.SchemaErrors as err:
+ log.trace(err)
+ error_message = err.schema_errors
+ if error_message is None:
+ code = FlagCode.GREEN
+ message = (
+ f"Table conforms to schema: {repr(schema)}"
+ )
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"{error_message}"
+ )
+ return {"code": code, "message": message}
+
+def check_normalized_expression_table(
+ normalized_expression: Path, samples: list[str]
+) -> FlagEntry:
+ schema = pa.DataFrameSchema(
+ columns = {sample: pa.Column(float) for sample in samples}
+ )
+
+ df = pd.read_csv(normalized_expression)
+
+ try:
+ schema.validate(df, lazy=True)
+ error_message = None
+ except pa.errors.SchemaErrors as err:
+ log.trace(err)
+ error_message = err.schema_errors
+ if error_message is None:
+ code = FlagCode.GREEN
+ message = (
+ f"Table conforms to schema: {repr(schema)}"
+ )
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"{error_message}"
+ )
+ return {"code": code, "message": message}
+
+def check_viz_table_columns_constraints(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ # data specific preprocess
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ viz_pairwise_columns_constraints = (
+ (
+ {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons},
+ {"nonNull": False},
+ ),
+ (
+ {f"Sig.1_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [False, True], "nonNull": False},
+ ),
+ (
+ {f"Sig.05_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [False, True], "nonNull": False},
+ ),
+ (
+ {f"Log2_P.value_{comp}" for comp in expected_comparisons},
+ {"nonNegative": False, "nonNull": False},
+ ),
+ (
+ {f"Updown_{comp}" for comp in expected_comparisons},
+ {"allowedValues": [1, -1], "nonNull": True},
+ ),
+ )
+
+ df_viz = pd.read_csv(dge_table)
+
+ # issue trackers
+ # here: {prefix+constraint: [failed_columns]}
+ issues: dict[str, list[str]] = dict()
+
+ issues = utils_common_constraints_on_dataframe(
+ df_viz, viz_pairwise_columns_constraints
+ )
+
+ # check logic
+    if not any(issues.values()):
+ code = FlagCode.GREEN
+ message = (
+ f"All values in columns met constraint: {viz_pairwise_columns_constraints}"
+ )
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Issues found {issues} that"
+ f"fail the contraint: {viz_pairwise_columns_constraints}."
+ )
+ return {"code": code, "message": message}
+
+def utils_runsheet_to_expected_groups(
+ runsheet: Path,
+ formatting: GroupFormatting = GroupFormatting.ampersand_join,
+    limit_to_samples: Union[list, None] = None,
+ map_to_lists: bool = False,
+) -> Union[dict[str, str], dict[str, list[str]]]:
+ df_rs = (
+ pd.read_csv(runsheet, index_col="Sample Name", dtype=str)
+        .filter(regex=r"^Factor Value\[.*\]")
+ .sort_index()
+ ) # using only Factor Value columns
+
+ if limit_to_samples:
+ df_rs = df_rs.filter(items=limit_to_samples, axis="rows")
+
+ match formatting:
+ case GroupFormatting.r_make_names:
+ expected_conditions_based_on_runsheet = (
+            df_rs.apply(lambda x: "...".join(x), axis="columns")  # join factors with '...'
+            .apply(r_style_make_names)  # reformat entire group in the R style
+            .to_dict()
+        )
+ case GroupFormatting.ampersand_join:
+ expected_conditions_based_on_runsheet = df_rs.apply(
+ lambda x: f"({' & '.join(x)})", axis="columns"
+ ).to_dict()
+ case _:
+ raise ValueError(
+ f"Formatting method invalid, must be one of the following: {list(GroupFormatting)}"
+ )
+
+ # convert from {sample: group} dict
+ # to {group: [samples]} dict
+ if map_to_lists:
+ unique_groups = set(expected_conditions_based_on_runsheet.values())
+ reformatted_dict: dict[str, list[str]] = dict()
+ for query_group in unique_groups:
+ reformatted_dict[query_group] = [
+ sample
+ for sample, group in expected_conditions_based_on_runsheet.items()
+ if group == query_group
+ ]
+ expected_conditions_based_on_runsheet: dict[str, list[str]] = reformatted_dict
+
+ return expected_conditions_based_on_runsheet
+
+## Dataframe and Series specific helper functions
+def nonNull(df: pd.DataFrame) -> bool:
+ # negation since it checks if any are null
+ return ~df.isnull().any(axis=None)
+
+
+def nonNegative(df: pd.DataFrame) -> bool:
+ """This ignores null values, use nonNull to validate that condition"""
+ return ((df >= 0) | (df.isnull())).all(axis=None)
+
+
+def onlyAllowedValues(df: pd.DataFrame, allowed_values: list) -> bool:
+ """This ignores null values, use nonNull to validate that condition"""
+ return ((df.isin(allowed_values)) | (df.isnull())).all(axis=None)
+
+
+def utils_common_constraints_on_dataframe(
+ df: pd.DataFrame, constraints: tuple[tuple[set, dict], ...]
+) -> dict:
+
+    issues: dict[str, list[str]] = {
+        "Failed non null constraint": list(),
+        "Failed non negative constraint": list(),
+        "Failed allowed values constraint": list(),
+    }
+
+ for (col_set, col_constraints) in constraints:
+ # this will avoid overriding the original constraints dictionary
+ # which is likely used in the check message
+ col_constraints = col_constraints.copy()
+
+ # limit to only columns of interest
+ query_df = df[list(col_set)]
+ for (colname, colseries) in query_df.items():
+ # check non null constraint
+        if col_constraints.pop("nonNull", False) and not nonNull(colseries):
+ issues["Failed non null constraint"].append(colname)
+ # check non negative constraint
+            if (
+                col_constraints.pop("nonNegative", False)
+                and not nonNegative(colseries)
+            ):
+ issues["Failed non negative constraint"].append(colname)
+ # check allowed values constraint
+ if allowedValues := col_constraints.pop("allowedValues", False):
+                if not onlyAllowedValues(colseries, allowedValues):
+                    issues["Failed allowed values constraint"].append(colname)
+
+ # raise exception if there are unhandled constraint keys
+ if col_constraints:
+ raise ValueError(f"Unhandled constraint types: {col_constraints}")
+
+ return issues
+
+def check_dge_table_log2fc_within_reason(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ """ Note: This function assumes the normalized expression values are log2 transformed
+ """
+ LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD = 1 # Percent
+ PERCENT_ROWS_WITHIN_TOLERANCE = 99.9 # Percent
+
+ # data specific preprocess
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ df_dge = pd.read_csv(dge_table)
+
+ # Track error messages
+ error_list: list[tuple[str,float]] = list()
+ for comparison in expected_comparisons:
+ query_column = f"Log2fc_{comparison}"
+        group1_mean_col = (
+            "Group.Mean_" + comparison.split(")v(")[0] + ")"
+        )  # Split on ')v(' and restore the parens to avoid splitting on a 'v' inside factor names
+        group2_mean_col = "Group.Mean_" + "(" + comparison.split(")v(")[1]
+        log.trace(df_dge[group1_mean_col].describe())
+        log.trace(df_dge[group2_mean_col].describe())
+ computed_log2fc = df_dge[group1_mean_col] - df_dge[group2_mean_col]
+ abs_percent_difference = abs(
+ ((computed_log2fc - df_dge[query_column]) / df_dge[query_column]) * 100
+ )
+ percent_within_tolerance = (
+ mean(
+ abs_percent_difference
+ < LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD
+ )
+ * 100
+ )
+
+ if percent_within_tolerance < PERCENT_ROWS_WITHIN_TOLERANCE: # add current query column to error list
+ error_list.append((query_column,percent_within_tolerance))
+
+ # inplace sort error list for deterministic order
+ error_list.sort()
+    if not error_list:
+        code = FlagCode.GREEN
+        message = f"All log2fc values recomputed and consistent (within {LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%)"
+    else:
+        code = FlagCode.HALT
+        message = f"At least one recomputed log2fc value is not consistent (within {LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%). These columns were flagged: {error_list}"
+
+ return {"code": code, "message": message}
+
+def check_dge_table_sample_columns_constraints(
+ dge_table: Path, samples: set[str], **_
+) -> FlagEntry:
+ MINIMUM_COUNT = 0
+ # data specific preprocess
+ df_dge = pd.read_csv(dge_table)[list(samples)]
+
+ schema = pa.DataFrameSchema({
+ sample: pa.Column(float)
+ for sample in samples
+ })
+
+ try:
+ schema.validate(df_dge, lazy=True)
+ error_cases = None
+ error_data = None
+ except pa.errors.SchemaErrors as err:
+ error_cases = err.failure_cases
+ error_data = err.data
+    if error_cases is None and error_data is None:
+ code = FlagCode.GREEN
+ message = (
+ f"All values in columns: {samples} met constraints: {repr(schema)}"
+ )
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"{error_cases}:::{error_data}"
+ )
+ return {"code": code, "message": message}
+
+def check_dge_table_group_statistical_columns_constraints(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+
+ resolved_constraints = (
+ ({f"Log2fc_{comp}" for comp in expected_comparisons}, {"nonNull": True}),
+ ({f"T.stat_{comp}" for comp in expected_comparisons}, {"nonNull": True}),
+ # can be removed from analysis before p-value and adj-p-value assessed
+ # ref: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
+ (
+ {f"P.value_{comp}" for comp in expected_comparisons},
+ {"nonNegative": True, "nonNull": False},
+ ),
+ (
+ {f"Adj.p.value_{comp}" for comp in expected_comparisons},
+ {"nonNegative": True, "nonNull": False},
+ ),
+ )
+
+ df_dge = pd.read_csv(dge_table)
+
+ # issue trackers
+ # here: {prefix+constraint: [failed_columns]}
+ issues: dict[str, list[str]] = dict()
+
+ issues = utils_common_constraints_on_dataframe(df_dge, resolved_constraints)
+
+ # check logic
+    if not any(issues.values()):
+ code = FlagCode.GREEN
+ message = f"All values in columns met constraint: {resolved_constraints}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Issues found {issues} that" f"fail the contraint: {resolved_constraints}."
+ )
+ return {"code": code, "message": message}
+
+def enclose_in_parens(s: str) -> str:
+    """Encloses a string in parentheses.
+    Used to format expected experimental group names in the same style as the processing script outputs.
+
+    Args:
+        s (str): A string to convert
+
+    Returns:
+        str: The string enclosed in parentheses
+    """
+    return f"({s})"
+
+def r_style_make_names(s: str) -> str:
+ """Recreates R's make.names function for individual strings.
+ This function is often used to create syntactically valid names in R which are then saved in R outputs.
+ Source: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/make.names
+
+ Args:
+ s (str): A string to convert
+
+ Returns:
+ str: A string converted in the same way as R's make.names function
+ """
+    EXTRA_WHITELIST_CHARACTERS = "_ΩπϴλθijkuΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψω_µ"  # Note: there are two "μ"-like characters: one is the Greek letter mu, the other is the micro sign
+ VALID_CHARACTERS = string.ascii_letters + string.digits + "." + EXTRA_WHITELIST_CHARACTERS
+ REPLACEMENT_CHAR = "."
+ new_string_chars = list()
+ for char in s:
+ if char in VALID_CHARACTERS:
+ new_string_chars.append(char)
+ else:
+ new_string_chars.append(REPLACEMENT_CHAR)
+ return "".join(new_string_chars)
+
+def check_if_valid_extensions(file, valid_ext):
+    """Placeholder stub: not yet implemented."""
+    pass
+
+
+def check_factor():
+    """Placeholder stub: not yet implemented."""
+    pass
+
+def check_if_extensions_valid(file, valid_ext):
+    """ Check that the raw data file extension is among the allowed extensions.
+    Not yet implemented.
+
+    :param file: Input raw data file
+    :type file: Path
+    :param valid_ext: Extensions that are allowed for the raw data files
+    :type valid_ext: list[str]
+    :return: A required fields-only flag entry dictionary
+    :rtype: FlagEntry
+    """
+    pass  # Does nothing (... is equivalent)
+
+def check_factor_values_in_runsheet():
+    """ Check that factor values in the runsheet are populated.
+    Not yet implemented.
+
+    :return: A required fields-only flag entry dictionary
+    :rtype: FlagEntry
+    """
+    pass  # Does nothing (... is equivalent)
+
+def check_sample_table_against_runsheet(
+ runsheet: Path, sampleTable: Path, all_samples_required: bool
+) -> FlagEntry:
+ """Check the sample table includes all samples as denoted in the runsheet.
+
+ Args:
+ runsheet (Path): csv file used for processing, the index denotes all samples
+ sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table)
+ all_samples_required (bool): denotes if all samples must be shared or if a subset of samples from the runsheet is okay.
+
+ Returns:
+ FlagEntry: A check result
+ """
+ # data specific preprocess
+ df_rs = pd.read_csv(runsheet, index_col="Sample Name").sort_index()
+ df_sample = pd.read_csv(sampleTable, index_col="sample").sort_index()
+
+ extra_samples: dict[str, set[str]] = {
+ "unique_to_runsheet": set(df_rs.index) - set(df_sample.index),
+ "unique_to_sampleTable": set(df_sample.index) - set(df_rs.index),
+ }
+
+ # check logic
+ if any(
+ [
+ (extra_samples["unique_to_runsheet"] and all_samples_required),
+ (extra_samples["unique_to_sampleTable"]),
+ ]
+ ):
+ code = FlagCode.HALT
+ message = f"Samples mismatched: {[f'{entry}:{v}' for entry, v in extra_samples.items() if v]}"
+ else:
+ code = FlagCode.GREEN
+ message = f"All samples accounted for based on runsheet (All samples required?: {all_samples_required})"
+ return {"code": code, "message": message}
+
+def check_dge_table_fixed_statistical_columns_constraints(
+ dge_table: Path, **_
+) -> FlagEntry:
+ # data specific preprocess
+ fixed_stats_columns = (
+ ({"All.mean", "All.stdev"}, {"nonNull": True, "nonNegative": True}),
+ ({"F.p.value"}, {"nonNull": False, "nonNegative": True}),
+ )
+
+ df_dge = pd.read_csv(dge_table)
+
+ # issue trackers
+ # here: {prefix+constraint: [failed_columns]}
+ issues: dict[str, list[str]] = dict()
+
+ issues = utils_common_constraints_on_dataframe(df_dge, fixed_stats_columns)
+
+ # check logic
+    if not any(issues.values()):
+ code = FlagCode.GREEN
+ message = f"All values in columns met constraint: {fixed_stats_columns}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Issues found {issues} that" f"fail the constraint: {fixed_stats_columns}."
+ )
+ return {"code": code, "message": message}
+
+def check_dge_table_comparison_statistical_columns_exist(
+ dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+ # data specific preprocess
+ COMPARISON_PREFIXES = ["Log2fc_", "T.stat_", "P.value_", "Adj.p.value_"]
+ expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+ expected_comparisons = [
+ "v".join(paired_groups)
+ for paired_groups in itertools.permutations(expected_groups, 2)
+ ]
+ expected_columns = {
+ "".join(comb)
+ for comb in itertools.product(COMPARISON_PREFIXES, expected_comparisons)
+ }
+ df_dge_columns = set(pd.read_csv(dge_table).columns)
+ missing_cols = expected_columns - df_dge_columns
+
+ # check logic
+ if not missing_cols:
+ code = FlagCode.GREEN
+ message = f"All comparison summary statistic columns (Prefixes: {COMPARISON_PREFIXES}) present. {sorted(list(expected_columns))}"
+ else:
+ code = FlagCode.HALT
+ message = f"Missing these comparison summary statistic columns (Prefixes: {COMPARISON_PREFIXES}): {sorted(list(missing_cols))}"
+ return {"code": code, "message": message}
+
+def check_dge_table_fixed_statistical_columns_exist(dge_table: Path, **_) -> FlagEntry:
+ # data specific preprocess
+ fixed_stats_columns = {
+ "All.mean": {"nonNull": True, "nonNegative": True},
+ "All.stdev": {"nonNull": True, "nonNegative": True},
+ "F.p.value": {"nonNull": False, "nonNegative": True},
+ }
+ expected_columns = set(fixed_stats_columns)
+ df_dge_columns = set(pd.read_csv(dge_table).columns)
+ missing_cols = expected_columns - df_dge_columns
+
+ # check logic
+ if not missing_cols:
+ code = FlagCode.GREEN
+ message = f"All dataset summary stat columns present. {sorted(list(expected_columns))}"
+ else:
+ code = FlagCode.HALT
+ message = (
+ f"Missing these dataset summary stat columns: {sorted(list(missing_cols))}"
+ )
+ return {"code": code, "message": message}
+
+def check_sample_table_for_correct_group_assignments(
+ runsheet: Path, sampleTable: Path
+) -> FlagEntry:
+ """Check the sample table is assigned to the correct experimental group.
+ An experimental group is defined by the Factor Value columns found in the runsheet.
+
+ Args:
+ runsheet (Path): csv file used for processing, includes metadata used for experimental group designation
+ sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table)
+
+ Returns:
+ FlagEntry: A check result
+ """
+ df_sample = pd.read_csv(sampleTable, index_col=0).sort_index()
+ # data specific preprocess
+ df_rs = (
+ pd.read_csv(runsheet, index_col="Sample Name", dtype=str) # Ensure no factor value columns are misinterpreted as numeric
+        .filter(regex=r"^Factor Value\[.*\]")
+ .loc[df_sample.index] # ensure only sampleTable groups are checked
+ .sort_index()
+ ) # using only Factor Value columns
+
+ # TODO: refactor with utils_runsheet_to_expected_groups
+    expected_conditions_based_on_runsheet = df_rs.apply(
+        lambda x: " & ".join(x), axis="columns"  # join factors with ' & '
+    ).apply(
+        enclose_in_parens
+    )  # enclose the joined group name in parentheses
+
+ mismatched_rows = expected_conditions_based_on_runsheet != df_sample["group"]
+
+ # check logic
+ if not any(mismatched_rows):
+ code = FlagCode.GREEN
+ message = f"Conditions are formatted and assigned correctly based on runsheet for all {len(df_sample)} samples in sample table: {list(df_sample.index)}"
+ else:
+ code = FlagCode.HALT
+ mismatch_description = (
+ df_sample[mismatched_rows]["group"]
+ + " <--SAMPLETABLE : RUNSHEET--> "
+ + expected_conditions_based_on_runsheet[mismatched_rows]
+ ).to_dict()
+ message = f"Mismatch in expected conditions based on runsheet for these rows: {mismatch_description}"
+ return {"code": code, "message": message}
\ No newline at end of file
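
For orientation, here is a minimal, self-contained sketch of the `({column set}, {constraint flags})` convention consumed by `utils_common_constraints_on_dataframe` above. Toy data only; the real checks run inside the dp_tools plugin, and the column names below are illustrative:

```python
import pandas as pd

# Toy DGE-style table with one comparison
df = pd.DataFrame({
    "Log2fc_(A)v(B)": [1.2, -0.5, 0.0],
    "P.value_(A)v(B)": [0.01, 0.2, None],  # nulls are allowed for p-values
})

# Constraint convention: ({set of columns}, {constraint flags})
constraints = (
    ({"Log2fc_(A)v(B)"}, {"nonNull": True}),
    ({"P.value_(A)v(B)"}, {"nonNegative": True, "nonNull": False}),
)

issues = {"Failed non null constraint": [], "Failed non negative constraint": []}
for col_set, flags in constraints:
    for colname, col in df[list(col_set)].items():
        if flags.get("nonNull") and col.isnull().any():
            issues["Failed non null constraint"].append(colname)
        if flags.get("nonNegative") and not ((col >= 0) | col.isnull()).all():
            issues["Failed non negative constraint"].append(colname)

print(issues)  # both lists stay empty for this toy table
```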
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/config.yaml b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/config.yaml
new file mode 100644
index 00000000..8c3d11de
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/config.yaml
@@ -0,0 +1,238 @@
+# TOP LEVEL
+NAME: "microarray"
+VERSION: "0"
+
+# anchors for reuse
+_anchors:
+ RawDataDir: &RawDataDir "00-RawData"
+ NormExpDir: &NormExpDir "01-limma_NormExp"
+ DGEDataDir: &DGEDataDir "02-limma_DGE"
+ neverPublished: &neverPublished
+ subcategory: null
+ subdirectory: null
+ publish to repo: false
+ include subdirectory in table: false
+ table order: -1
+
+# configuration for microarray data
+Staging:
+ General:
+ Required Metadata:
+ From ISA:
+ - ISA Field Name: Study Assay Measurement Type
+ ISA Table Source: Investigation
+ Investigation Subtable: STUDY ASSAYS
+ Runsheet Column Name: Study Assay Measurement
+ Processing Usage: >-
+ Mapping to the appropriate processing pipeline for the assay.
+ Example: transcription profiling
+
+ - ISA Field Name: Study Assay Technology Type
+ ISA Table Source: Investigation
+ Investigation Subtable: STUDY ASSAYS
+ Runsheet Column Name: Study Assay Technology Type
+ Processing Usage: >-
+ Mapping to the appropriate processing pipeline for the assay.
+ Example: DNA microarray
+
+ - ISA Field Name: Study Assay Technology Platform
+ ISA Table Source: Investigation
+ Investigation Subtable: STUDY ASSAYS
+ Runsheet Column Name: Study Assay Technology Platform
+ Processing Usage: >-
+ Mapping to the appropriate processing pipeline for the assay.
+ Example: Affymetrix
+
+ - ISA Field Name:
+ - Characteristics[Organism]
+ - Characteristics[organism]
+ ISA Table Source: Sample
+ Runsheet Column Name: organism
+ Processing Usage: >-
+ Mapping to the appropriate alignment reference and annotation databases.
+ Example: Arabidopsis thaliana
+
+ - ISA Field Name:
+ - Array Design REF
+ ISA Table Source: Assay
+ Runsheet Column Name: biomart_attribute
+ Processing Usage: >-
+          Used to identify the biomart attribute name for biomart mapping purposes.
+ Example: agilent_wholegenome_4x44k_v1
+
+ - ISA Field Name: Source Name
+ ISA Table Source: Sample
+ Runsheet Column Name: Source Name
+ Processing Usage: >-
+          The source entity that the sample is derived from.
+ Example: GSM502538 1
+
+ - ISA Field Name: Sample Name
+ ISA Table Source: Assay
+ Runsheet Column Name: sample_name
+ Runsheet Index: true
+ Processing Usage: >-
+ Sample name is used as a unique sample identifier during processing
+ Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538
+
+ - ISA Field Name:
+ - Label
+ - Parameter Value[label]
+ - Parameter Value[Label]
+ ISA Table Source: Sample
+ Runsheet Column Name: Label
+ Processing Usage: >-
+ Used to determine if the dataset is one or two channel.
+ Example: Cy3
+
+ - ISA Field Name: Array Data File
+ ISA Table Source: Assay
+ Runsheet Column Name: Array Data File Name
+ Processing Usage: >-
+ Name of the raw data file(s). In some cases, this file is a
+ dataset-wise zip folder containing all raw data files. When this occurs,
+ the ISA Field Named 'Comment: Array Data File Name' designates the
+ raw data file mapping between contents of the dataset-wise zip.
+ Note: Actual locations are denoted in 'array_data_file_path'
+ Example: Atha_Col-0_HypocotylCC_WT_GC_Rep2_GSM2152575_raw.CEL.gz
+
+ - ISA Field Name: Array Data File
+ ISA Table Source: Assay
+ Runsheet Column Name: Array Data File Path
+ GLDS URL Mapping: true
+ Processing Usage: >-
+ Actual locations of the raw data files. Can be either a local path or a uri.
+ Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-205/...'
+
+ - ISA Field Name: Comment[Array Data File Name]
+ ISA Table Source: Assay
+ Runsheet Column Name: Comment[Array Data File Name]
+ Value If Not Found: "N/A"
+        Processing Usage: >-
+          Maps each raw data file name to the contents of the dataset-wise zip
+          when the 'Array Data File' entry is such a zip.
+          Example: Atha_Col-0_HypocotylCC_WT_GC_Rep2_GSM2152575_raw.CEL.gz
+
+ - ISA Field Name: Hybridization Assay Name
+ ISA Table Source: Assay
+ Runsheet Column Name: Hybridization Assay Name
+ Processing Usage: >-
+ Used to map samples to specific hybridization
+ (e.g. two channels where each channel is a different sample)
+ Also used to determine if separate channel files exist for two channel studies.
+ Example: GSM502538
+
+ - ISA Field Name: Factor Value[{factor_name}]
+ ISA Table Source: [Assay, Sample]
+ Runsheet Column Name: Factor Value[{factor_name}]
+ Matches Multiple Columns: true
+ Match Regex: "Factor Value\\[.*\\]"
+ Append Column Following: "Unit"
+ Processing Usage: >-
+ Factor values in a study. Used to assign experimental groups for each sample.
+ Note: On the runsheet, a subsequent 'Unit' Column value will be
+ suffix-concatenated if it exists.
+ Example: Basal Control
+
+ - ISA Field Name: Unit
+ ISA Table Source: [Assay, Sample]
+ Runsheet Column Name: null
+ Matches Multiple Columns: true
+ Autoload: false # handled by factor value loading above
+ Processing Usage: >-
+ Unit to be suffix-concatenated onto prior Factor value columns.
+ Example: day
+
+ From User:
+ - Runsheet Column Name: GLDS
+ Processing Usage: >-
+ The GLDS accession number
+ Example: GLDS-205
+
+ - Runsheet Column Name: array_data_file_path
+ Processing Usage: >-
+ The location of the array data file. Can be either a url or local path.
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI
+ may be used to retrieve urls given the array data filename (sourced from ISA archive).
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1
+
+ - Runsheet Column Name: is_array_data_file_compressed
+ Processing Usage: >-
+ Denotes if raw data files are compressed.
+ Note: This is determined by '.gz' extension during
+ runsheet generation.
+ Example: 'TRUE'
+
+ - Runsheet Column Name: version
+ Processing Usage: >-
+ The ISA archive version number.
+ Note: Retrieved from the GeneLab
+ Example: 2
+
+ISA Meta:
+ Valid Study Assay Technology And Measurement Types:
+ - measurement: "transcription profiling"
+ technology: "DNA microarray"
+ - measurement: "transcription profiling"
+ technology: "microarray"
+
+ # this is prepended to all file names in the curation assay table
+ Global file prefix: "{datasystem}_array_"
+
+ # configuration related to updating investigation file
+ # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file
+  # LEADCAP_organism should be the studied organism's scientific name with a leading cap
+ Post Processing Add Study Protocol:
+ GeneLab Agilent 1 Channel data processing protocol::{LEADCAP_organism} V1
+
+data assets:
+ runsheet:
+ processed location:
+ - "Metadata"
+ - "{dataset}_microarray_v0_runsheet.csv"
+
+ tags:
+ - raw
+
+ resource categories: *neverPublished
+
+ raw intensities table:
+ processed location:
+ - *RawDataDir
+ - "raw_intensities_GLmicroarray.csv"
+
+ tags:
+ - processed
+
+ resource categories:
+ subcategory: Raw Intensities Table
+ subdirectory: ""
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 1
+
+ normalized expression table:
+ processed location:
+ - *NormExpDir
+ - "normalized_expression_GLmicroarray.csv"
+
+ tags:
+ - processed
+
+ resource categories:
+ subcategory: Normalized Expression Table
+ subdirectory: ""
+ publish to repo: true
+ include subdirectory in table: false
+ table order: 2
+
+data asset sets:
+ # These assets are not generated in the workflow, but are generated after the workflow
+ PUTATIVE: []
+ glds metadata:
+ - "runsheet"
+ processed:
+ - "viz PCA table"
+ - "DE annotated extended for viz table"
+ - "DE contrasts table"
+ - "DE annotated table"
+ - "sample table"
\ No newline at end of file
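
A quick sketch of how the `_anchors` block above behaves at load time: PyYAML expands each `*neverPublished` alias into the full anchored mapping. The snippet below trims the anchor to two keys for brevity:

```python
import yaml

# Toy config mirroring the '_anchors' pattern above: the alias
# '*neverPublished' expands to the anchored mapping when loaded.
snippet = """
_anchors:
  neverPublished: &neverPublished
    publish to repo: false
    table order: -1
runsheet:
  resource categories: *neverPublished
"""
config = yaml.safe_load(snippet)
print(config["runsheet"]["resource categories"])
# -> {'publish to repo': False, 'table order': -1}
```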
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/protocol.py b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/protocol.py
new file mode 100644
index 00000000..7ee08a29
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/protocol.py
@@ -0,0 +1,309 @@
+from pathlib import Path
+import re
+from typing import Union
+import yaml
+import logging
+import textwrap
+
+from dp_tools.core.entity_model import Dataset
+
+log = logging.getLogger(__name__)
+
+from dp_tools.core.check_model import ValidationProtocol, FlagCode
+# Import order is intentional: shared-name functions overload in favor of the microarray versions
+from dp_tools import bulkRNASeq
+
+from . import checks
+
+CONFIG = {
+ "Metadata-check_metadata_attributes_exist": {
+ "expected_attrs": ["biomart_attribute", "organism"]
+ },
+}
+
+
+def validate(
+ dataset: Dataset,
+ config_path: Path = None,
+ run_args: dict = None,
+ report_args: dict = None,
+ protocol_args: dict = None,
+ defer_run: bool = False,
+) -> Union[ValidationProtocol, ValidationProtocol.Report]:
+
+ if config_path is not None:
+ with open(config_path, "r") as f:
+ config = yaml.safe_load(f)
+ else:
+ config = CONFIG
+
+ if run_args is None:
+ run_args = dict()
+
+ if report_args is None:
+ report_args = dict()
+
+ if protocol_args is None:
+ protocol_args = dict()
+ # init validation protocol
+ vp = ValidationProtocol(**protocol_args)
+ with vp.component_start(
+ name=dataset.name,
+ description="Validate microarray processed data",
+ ):
+ with vp.component_start(
+ name="Metadata", description="Metadata file validation"
+ ):
+ with vp.payload(payloads=[{"dataset": dataset}]):
+ vp.add(
+ checks.check_metadata_attributes_exist,
+ config=config["Metadata-check_metadata_attributes_exist"],
+ full_description=textwrap.dedent(f"""
+ - Check: Runsheet includes required metadata columns, {config["Metadata-check_metadata_attributes_exist"]["expected_attrs"]}
+ - Reason:
+ - 'Organism' column is used to map the appropriate annotation table
+ - 'biomart_attribute' column is used to map the appropriate biomart attribute for initial Ensembl based gene mapping
+ - Potential Source of Problems:
+ - Runsheet does not include these columns. If automatically generated using 'dp_tools', report this to the maintainer of 'dp_tools'. If manually generated, ensure {config["Metadata-check_metadata_attributes_exist"]["expected_attrs"]} columns are populated.
+ - Flag Condition:
+ - Green: Columns {config["Metadata-check_metadata_attributes_exist"]["expected_attrs"]} exist
+ - Halt: Columns {config["Metadata-check_metadata_attributes_exist"]["expected_attrs"]} do not exist
+ """)
+ )
+ vp.add_manual(
+ description = "Manually validate runsheet against ISA assay table",
+ start_instructions = "Open runsheet (in Metadata folder). Open OSD webpage, navigate to microarray assay table",
+ pass_fail_questions = [
+ "Does the runsheet open?",
+ "Is the number of samples in parity?",
+ "Do the file extensions in the 'Array Data File Name' column end in '.txt' or '.txt.gz'?",
+ "Do all factor values exist in both tables. Does the runsheet have units included as appropriate?",
+ ]
+ )
+ with vp.component_start(
+ name="Raw Intensities",
+ description="",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "raw_intensities": lambda: dataset.data_assets["raw intensities table"].path,
+ "samples": dataset.samples
+ }
+ ]
+ ):
+ vp.add(
+ checks.check_raw_intensities_table,
+ full_description=textwrap.dedent(f"""
+ - Check: Ensure raw intensities table has all samples and values within [0,+inf)
+ - Reason:
+ - Part of processing output
+ - Potential Source of Problems:
+ - Bug in processing script or malformed raw data files
+ - Flag Condition:
+ - Green: All conditions met
+ - Halt: At least one condition failed
+ """)
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "organism": lambda: dataset.metadata["organism"],
+ "samples": lambda: set(dataset.samples),
+ "dge_table": lambda: dataset.data_assets[
+ "raw intensities table"
+ ].path,
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ }
+ ]
+ ):
+ vp.add(
+ bulkRNASeq.checks.check_dge_table_annotation_columns_exist,
+ full_description=textwrap.dedent(f"""
+                        - Check: Ensure ['SYMBOL','GENENAME','REFSEQ','ENTREZID','STRING_id','GOSLIMS_IDS','ENSEMBL'] columns exist (note: 'ENSEMBL' is replaced by 'TAIR' for Arabidopsis)
+ - Reason:
+ - These columns should be populated during annotation for all single gene mapping probes
+ - Potential Source of Problems:
+ - Bug in processing script during annotation step
+ - Flag Condition:
+ - Green: All columns present
+ - Halt: At least one column is missing
+ """)
+ )
+ with vp.component_start(
+ name="Normalized expression",
+ description="",
+ ):
+ with vp.payload(
+ payloads=[
+ {
+ "normalized_expression": lambda: dataset.data_assets["normalized expression table"].path,
+ "samples": dataset.samples
+ }
+ ]
+ ):
+ vp.add(
+ checks.check_normalized_expression_table,
+ full_description=textwrap.dedent(f"""
+ - Check: Ensure normalized expression table has all samples and values within (-inf,+inf)
+ - Reason:
+ - Part of processing output. Note: Values are log2 transformed
+ - Potential Source of Problems:
+ - Bug in processing script
+ - Flag Condition:
+ - Green: All conditions met
+ - Halt: At least one condition failed
+ """)
+ )
+ with vp.payload(
+ payloads=[
+ {
+ "organism": lambda: dataset.metadata["organism"],
+ "samples": lambda: set(dataset.samples),
+ "dge_table": lambda: dataset.data_assets[
+ "normalized expression table"
+ ].path,
+ "runsheet": lambda: dataset.data_assets["runsheet"].path,
+ }
+ ]
+ ):
+ vp.add(
+ bulkRNASeq.checks.check_dge_table_annotation_columns_exist,
+ full_description=textwrap.dedent(f"""
+                        - Check: Ensure ['SYMBOL','GENENAME','REFSEQ','ENTREZID','STRING_id','GOSLIMS_IDS','ENSEMBL'] columns exist (note: 'ENSEMBL' is replaced by 'TAIR' for Arabidopsis)
+ - Reason:
+ - These columns should be populated during annotation for all single gene mapping probes
+ - Potential Source of Problems:
+ - Bug in processing script during annotation step
+ - Flag Condition:
+ - Green: All columns present
+ - Halt: At least one column is missing
+ """)
+ )
+ with vp.component_start(
+ name="Processing Report",
+ description="",
+ ):
+ vp.add_manual(
+ description = "Loading report",
+ start_instructions = "Load html report into web browser",
+ pass_fail_questions = [
+ "Does the report render without issue?",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 2 Load Metadata and Raw Data",
+ start_instructions = "Navigate to section 2",
+ pass_fail_questions = [
+ "Does the content of the table match the runsheet?",
+ "Do the number of entries in the embedded runsheet match the number of samples/runsheet-rows?",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 3 QA For Raw Data - Density Plots",
+ start_instructions = "Navigate to section 3 - Density Plots",
+ pass_fail_questions = [
+ "Is every sample included in the legend?",
+ "Are axes and axe titles clear?",
+ ],
+ pass_flag_questions = [
+ "Are all lines have similar one or two peak shape, each line not likely overlapping?",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 3 QA For Raw Data - Pseudo Image Plots",
+ start_instructions = "Navigate to section 3 - Pseudo Image Plots",
+ pass_fail_questions = [
+ "Does every sample have its own plot?",
+ "Is each plot white-blue gradient?",
+ ],
+ pass_flag_questions = [
+ "Are there clear physical streaks, bright (i.e. white) circles or other features?",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 3 QA For Raw Data - MA Plots",
+ start_instructions = "Navigate to section 3 - MA Plots",
+ pass_fail_questions = [
+ "Does every sample have its own plot?",
+ ],
+ pass_flag_questions = [
+ "Are positive control points distributed from left end to right end of MA cloud",
+ "Are negative control points concentrated at left end of MA cloud?",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 3 QA For Raw Data - Foreground Background Plots",
+ start_instructions = "Navigate to section 3 - Foreground Background Plots",
+ pass_fail_questions = [
+ "Does every sample have its own plot?",
+ ],
+ pass_flag_questions = [
+ "Are the vast majority (i.e. 99.9%) of points are above the blue diagonal line",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 3 QA For Raw Data - Boxplots",
+ start_instructions = "Navigate to section 3 - Boxplots",
+ pass_fail_questions = [
+ "Does every sample have its own plot?",
+ ],
+ pass_flag_questions = [
+ "Do the boxplots have overlapping distributions?",
+ ]
+ )
+ ###################################################
+ ### Normalized data plots
+ ###################################################
+ vp.add_manual(
+ description = "Section 6 Normalized Data Quality Assessment - Density Plots",
+ start_instructions = "Navigate to section 6 - Density Plots",
+ pass_fail_questions = [
+ "Is every sample included in the legend?",
+ "Are axes and axe titles clear?",
+ ],
+ pass_flag_questions = [
+ "Are all lines nearly fully overlapping?",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 6 Normalized Data Quality Assessment - Pseudo Image Plots",
+ start_instructions = "Navigate to section 6 - Pseudo Image Plots",
+ pass_fail_questions = [
+ "Does every sample have its own plot?",
+ ],
+ pass_flag_questions = [
+ "Is each plot white-blue gradient?",
+ "Are there clear physical streaks, bright (i.e. white) circles or other features?",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 6 Normalized Data Quality Assessment - MA Plots",
+ start_instructions = "Navigate to section 6 - MA Plots",
+ pass_fail_questions = [
+ "Does every sample have its own plot?",
+ ],
+ pass_flag_questions = [
+ "Are positive control points distributed from left end to right end of MA 'cloud'",
+ "Are negative control points concentrated at left end of MA 'cloud'?",
+ ]
+ )
+ vp.add_manual(
+ description = "Section 6 Normalized Data Quality Assessment - Boxplots",
+ start_instructions = "Navigate to section 6 - Boxplots",
+ pass_fail_questions = [
+ "Does every sample have its own plot?",
+ ],
+ pass_flag_questions = [
+ "Do the boxplots have largely the same distributions? (i.e. are the boxes all nearly horizontally aligned)",
+ ]
+ )
+ # return protocol object without running or generating a report
+ if defer_run:
+ return vp
+
+ vp.run(**run_args)
+
+ # return report
+ return vp.report(**report_args, combine_with_flags=dataset.loaded_assets_dicts)
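
For readers new to dp_tools validation protocols, here is a minimal sketch of the check contract that `vp.add(...)` consumes: each check returns a dict with a `code` and a `message`, and a HALT-level code stops the run. The `FlagCode` values below are illustrative stand-ins for the library's enum, not its actual definitions:

```python
from enum import Enum
from pathlib import Path

# Stand-in for dp_tools.core.check_model.FlagCode (values assumed for illustration)
class FlagCode(Enum):
    GREEN = 20
    HALT = 80

def check_file_exists(path: Path) -> dict:
    """Every check above follows this contract: return a dict with a
    'code' (GREEN passes, HALT stops the protocol) and a 'message'."""
    if path.exists():
        return {"code": FlagCode.GREEN, "message": f"Found {path}"}
    return {"code": FlagCode.HALT, "message": f"Missing {path}"}

print(check_file_exists(Path("runsheet.csv")))
```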
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/schemas.py b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/schemas.py
new file mode 100644
index 00000000..9db06de0
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/bin/dp_tools__agilent_1_channel_skipDE/schemas.py
@@ -0,0 +1,29 @@
+""" Schemas for validation
+Uses Schema to allow usage of validation functions
+"""
+import pandera as pa
+
+check_single_value = pa.Check(
+ lambda x: len(x.unique()) == 1,
+ title="Check that all values in the column are identical",
+ description="Useful for columns that contain dataset level metadata like organism and paired_end.",
+ error="Dataset level columns do NOT contain one unique value"
+ )
+
+runsheet = pa.DataFrameSchema(
+ columns={
+ "Original Sample Name": pa.Column(str),
+ "Study Assay Measurement": pa.Column(str),
+ "Study Assay Technology Type": pa.Column(str),
+ "Study Assay Technology Platform": pa.Column(str),
+ "Source Name": pa.Column(str),
+ "Label": pa.Column(str),
+ "Hybridization Assay Name": pa.Column(str),
+ "Array Data File Name": pa.Column(str),
+ "Array Data File Path": pa.Column(str),
+ "Original Sample Name": pa.Column(str),
+ "organism": pa.Column(str, check_single_value),
+ },
+ # define checks at the DataFrameSchema-level
+ # checks=check_read2_path_populated_if_paired_end
+ )
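
A usage sketch for schemas in this style: validate a runsheet-shaped DataFrame and collect all failures lazily. The toy schema below uses two columns; the real `runsheet` schema above carries the full column set:

```python
import pandas as pd
import pandera as pa

# Dataset-level columns must hold a single unique value, as above
check_single_value = pa.Check(
    lambda s: s.nunique() == 1,
    error="Dataset level columns do NOT contain one unique value",
)
toy_schema = pa.DataFrameSchema({
    "Sample Name": pa.Column(str),
    "organism": pa.Column(str, check_single_value),
})

df = pd.DataFrame({
    "Sample Name": ["sample_1", "sample_2"],
    "organism": ["Mus musculus", "Mus musculus"],
})
toy_schema.validate(df, lazy=True)  # raises pa.errors.SchemaErrors on failure
print("runsheet-style table passed validation")
```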
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/config/default.config b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/config/default.config
index d4217d6a..c636a6ea 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/config/default.config
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/config/default.config
@@ -12,6 +12,7 @@ params {
Parameters that CAN be overwritten
*/
runsheetPath = false
+ referenceStorePath = './References' // Path to custom references
biomart_attribute = false // Must be supplied if runsheet 'Array design REF' column doesn't indicate it
outputDir = "." // the location for the output from the pipeline (also includes raw data and metadata)
publish_dir_mode = "link" // method for creating publish directory. Default here for hardlink
@@ -22,11 +23,13 @@ params {
*/
// For now, this particular is good for all organisms listed on the file.
annotation_file_path = "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv"
+ annotation_config_path = "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/Microarray/Agilent_1-channel/Array_Annotations/Agilent_array_annotations.csv"
/*
DEBUG related parameters, not likely useful in production
*/
skipVV = false // if true, VV will not be performed
+ skipDE = false // if true, DE will not be performed
limit_biomart_query = false // if set to a value, that value is the maximum number of biomart probe IDs to query
max_flag_code = 80 // Maximum flag value allowed, exceeding this value during V&V will cause the workflow to halt
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/config/software/by_docker_image.config b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/config/software/by_docker_image.config
index b9ceb8a9..1a8e1480 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/config/software/by_docker_image.config
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/config/software/by_docker_image.config
@@ -1,9 +1,9 @@
// Config that specifies packaged conda yml files for each process
process {
withName: 'AGILE1CH' {
- container = "quay.io/j_81/gl_images:NF_AffyMP-A_1.0.0-RC7"
+ container = "quay.io/nasa_genelab/gl-microarray:1.0.0"
}
withName: 'RUNSHEET_FROM_GLDS|VV_AGILE1CH|GENERATE_MD5SUMS|UPDATE_ISA_TABLES|GENERATE_SOFTWARE_TABLE' {
- container = "quay.io/j_81/dp_tools:1.3.1"
+ container = "quay.io/nasa_genelab/dp_tools:1.3.5"
}
}
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/main.nf b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/main.nf
index d2c4fcd5..ddea4730 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/main.nf
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/main.nf
@@ -10,6 +10,7 @@ include { VV_AGILE1CH } from './modules/VV_AGILE1CH.nf'
include { AGILE1CH } from './modules/AGILE1CH.nf'
include { RUNSHEET_FROM_GLDS } from './modules/RUNSHEET_FROM_GLDS.nf'
include { GENERATE_SOFTWARE_TABLE } from './modules/GENERATE_SOFTWARE_TABLE'
+include { DUMP_META } from './modules/DUMP_META'
/**************************************************
* HELP MENU **************************************
@@ -33,6 +34,7 @@ if (params.help) {
println(" the GLDS accession id to process through the NF_MAAgilent1ch.")
println(" --runsheetPath Use a local runsheet instead one automatically generated from a GLDS ISA archive.")
println(" --skipVV Skip automated V&V. Default: false")
+ println(" --skipDE Skip DE. Default: false")
println(" --outputDir Directory to save staged raw files and processed files. Default: ")
exit 0
}
@@ -76,15 +78,16 @@ workflow {
channel.fromPath( "${ projectDir }/bin/Agile1CMP.qmd" ),
ch_runsheet,
PARSE_ANNOTATION_TABLE.out.annotations_db_url,
- ch_meta | map { it.organism },
- params.limit_biomart_query
+ PARSE_ANNOTATION_TABLE.out.reference_version_and_source,
+ params.limit_biomart_query,
+ params.skipDE
)
VV_AGILE1CH(
ch_runsheet,
AGILE1CH.out.de,
params.skipVV,
- "${ projectDir }/bin/dp_tools__agilent_1_channel" // dp_tools plugin
+ "${ projectDir }/bin/${ params.skipDE ? 'dp_tools__agilent_1_channel_skipDE' : 'dp_tools__agilent_1_channel' }" // dp_tools plugin
)
// Software Version Capturing
@@ -100,9 +103,13 @@ workflow {
GENERATE_SOFTWARE_TABLE(
ch_software_versions | unique | collectFile(newLine: true, sort: true, cache: false),
- ch_runsheet | splitCsv(header: true, quote: '"') | first | map{ row -> row['Array Data File Name'] }
+ ch_runsheet | splitCsv(header: true, quote: '"') | first | map{ row -> row['Array Data File Name'] },
+ params.skipDE
)
+ // export meta for post processing usage
+ ch_meta | DUMP_META
+
emit:
meta = ch_meta
runsheet = ch_runsheet
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/AGILE1CH.nf b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/AGILE1CH.nf
index cdd1780b..eec53231 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/AGILE1CH.nf
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/AGILE1CH.nf
@@ -8,19 +8,13 @@ process AGILE1CH {
path(qmd) // quarto qmd file to render
path(runsheet_csv) // runsheet to supply as parameter
path(annotation_file_path) // runsheet to supply as parameter
- val(organism) // runsheet to supply as parameter
+ tuple val(ensemblVersion), val(ensemblSource)
val(limit_biomart_query) // DEBUG option, limits biomart queries to the number specified if not set to false
+ val(skipDE) // whether to skip DE
output:
path("NF_MAAgilent1ch_v${workflow.manifest.version}_GLmicroarray.html"), emit: report
- tuple path("02-limma_DGE/contrasts_GLmicroarray.csv"),
- path("02-limma_DGE/SampleTable_GLmicroarray.csv"),
- path("02-limma_DGE/differential_expression_GLmicroarray.csv"),
- path("02-limma_DGE/visualization_PCA_table_GLmicroarray.csv"),
- path("01-limma_NormExp/normalized_expression_GLmicroarray.csv"),
- path("00-RawData/raw_intensities_GLmicroarray.csv"), emit: de_all_files
-
tuple path("02-limma_DGE"),
path("01-limma_NormExp"),
path("00-RawData"), emit: de
@@ -29,14 +23,19 @@ process AGILE1CH {
script:
def limit_biomart_query_parameter = limit_biomart_query ? "-P DEBUG_limit_biomart_query:${limit_biomart_query}" : ''
+ def run_DE = skipDE ? "-P run_DE:'false'" : ''
"""
export HOME=\$PWD;
quarto render \$PWD/${qmd} \
+ -P 'workflow_version:${workflow.manifest.version}' \
-P 'runsheet:${runsheet_csv}' \
-P 'annotation_file_path:${annotation_file_path}' \
- -P 'organism:${organism}' \
- ${limit_biomart_query_parameter}
+ -P 'ensembl_version:${ensemblVersion}' \
+ -P 'local_annotation_dir:${params.referenceStorePath}' \
+ -P 'annotation_config_path:${params.annotation_config_path}' \
+ ${limit_biomart_query_parameter} \
+ ${run_DE}
# Rename report
mv Agile1CMP.html NF_MAAgilent1ch_v${workflow.manifest.version}_GLmicroarray.html
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/DUMP_META/main.nf b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/DUMP_META/main.nf
new file mode 100644
index 00000000..3be63b64
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/DUMP_META/main.nf
@@ -0,0 +1,16 @@
+process DUMP_META {
+ publishDir "${ params.outputDir }/${ params.gldsAccession }/GeneLab",
+ mode: params.publish_dir_mode
+
+ input:
+ val(meta)
+
+ output:
+ path("meta.sh")
+
+ script:
+ """
+ # Write the meta file
+ reformat_meta.sh '${ meta }' > meta.sh
+ """
+}
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/DUMP_META/resources/usr/bin/reformat_meta.sh b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/DUMP_META/resources/usr/bin/reformat_meta.sh
new file mode 100644
index 00000000..69855303
--- /dev/null
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/DUMP_META/resources/usr/bin/reformat_meta.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+input=$1
+
+# Remove leading and trailing brackets
+input=$(echo "$input" | sed 's/^\[//; s/\]$//')
+
+# Convert the input to KEY="VALUE" format with sanitized keys
+interim_output=$(echo "$input" | sed -E 's/:/=/g' | tr ',' '\n')
+
+while IFS='=' read -r key value; do
+    # Remove leading and trailing quotes from the value, if present
+ value=$(echo "$value" | sed 's/^"\(.*\)"$/\1/')
+
+    # Wrap the value in single quotes
+ value="'$value'"
+
+ # Remove leading spaces
+ key=$(echo "$key" | sed 's/^ *//g')
+
+ # Sanitize the key to follow valid Bash variable name format
+ # Convert upper case to lower case, replace spaces with underscores, and remove square brackets
+ sanitized_key=$(echo "$key" | tr '[:upper:] ' '[:lower:]_' | sed 's/[][]//g')
+
+
+ # Append the key-value pair to the formatted output
+ formatted_output+="$sanitized_key=$value\n"
+done <<< "$interim_output"
+
+echo -e "$formatted_output"
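
The same transformation sketched in Python, for readers who prefer it to the sed/tr pipeline. It assumes `meta` renders like a Groovy map (e.g. `[organism:Mus musculus, ...]`) and, like the shell version, does not handle commas inside values:

```python
import re

def reformat_meta(meta: str) -> str:
    """Behavioral mirror of reformat_meta.sh: turn a Groovy-map-style string
    into sourceable key='value' lines with sanitized, lowercased keys."""
    meta = meta.strip().removeprefix("[").removesuffix("]")
    lines = []
    for pair in meta.split(","):
        key, _, value = pair.partition(":")
        # lowercase, spaces -> underscores, strip square brackets
        key = re.sub(r"[][]", "", key.strip().lower().replace(" ", "_"))
        lines.append(f"{key}='{value.strip()}'")
    return "\n".join(lines)

print(reformat_meta("[organism:Mus musculus, biomart_attribute:AGILENT SurePrint G3 GE 8x60k v3]"))
# organism='Mus musculus'
# biomart_attribute='AGILENT SurePrint G3 GE 8x60k v3'
```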
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/GENERATE_SOFTWARE_TABLE/main.nf b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/GENERATE_SOFTWARE_TABLE/main.nf
index 6939155d..811d463b 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/GENERATE_SOFTWARE_TABLE/main.nf
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/GENERATE_SOFTWARE_TABLE/main.nf
@@ -6,12 +6,13 @@ process GENERATE_SOFTWARE_TABLE {
input:
path("software_versions.yaml")
val(filename)
+ val(skipDE)
output:
path("software_versions_GLmicroarray.md")
script:
"""
- SoftwareYamlToMarkdownTable.py software_versions.yaml \"$filename\"
+ SoftwareYamlToMarkdownTable.py software_versions.yaml \"$filename\" $skipDE
"""
}
\ No newline at end of file
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/GENERATE_SOFTWARE_TABLE/resources/usr/bin/SoftwareYamlToMarkdownTable.py b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/GENERATE_SOFTWARE_TABLE/resources/usr/bin/SoftwareYamlToMarkdownTable.py
index 40516f5b..ebbea763 100755
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/GENERATE_SOFTWARE_TABLE/resources/usr/bin/SoftwareYamlToMarkdownTable.py
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/GENERATE_SOFTWARE_TABLE/resources/usr/bin/SoftwareYamlToMarkdownTable.py
@@ -42,7 +42,8 @@
@click.command()
@click.argument("input_yaml", type=click.Path(exists=True))
@click.argument("filename")
-def yamlToMarkdown(input_yaml: Path, filename: str):
+@click.argument("skip_de", type=click.BOOL)
+def yamlToMarkdown(input_yaml: Path, filename: str, skip_de: bool):
""" Using a software versions """
with open(input_yaml, "r") as f:
data = yaml.safe_load(f)
@@ -54,6 +55,10 @@ def yamlToMarkdown(input_yaml: Path, filename: str):
if not filename.endswith('.gz'):
AGILENT_SOFTWARE_DPPD.remove('r.utils')
+ if skip_de:
+ AGILENT_SOFTWARE_DPPD.remove('matrixstats')
+ AGILENT_SOFTWARE_DPPD.remove('statmod')
+
# Filter to direct software used (i.e. exclude dependencies of the software)
df = df.loc[df["name"].str.lower().isin(AGILENT_SOFTWARE_DPPD)]
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/main.nf b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/main.nf
index b899e81b..d63e95af 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/main.nf
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/main.nf
@@ -5,13 +5,14 @@ process GENERATE_PROTOCOL {
input:
path("software_versions_GLmicroarray.md")
- val(organism)
+ path("meta.sh")
+ val(skipDE)
output:
path("PROTOCOL_GLmicroarray.txt")
script:
"""
- generate_protocol.sh $workflow.manifest.version \"$organism\"
+ generate_protocol.sh $workflow.manifest.version $skipDE
"""
}
\ No newline at end of file
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh
index b52a0ad1..6b8db2f7 100755
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh
@@ -25,8 +25,8 @@ done < <(sed -n '/|/p' "$software_versions_file" | sed 's/^ *|//;s/|$//')
# Print the extracted versions
env | grep "_VERSION"
-# Get organism
-organism=$2
+# Load metadata variables (e.g. organism, biomart_attribute) exported by DUMP_META
+source meta.sh
# List of organisms
organism_list=("Homo sapiens" "Mus musculus" "Rattus norvegicus" "Drosophila melanogaster" "Caenorhabditis elegans" "Danio rerio" "Saccharomyces cerevisiae")
@@ -34,6 +34,8 @@ organism_list=("Homo sapiens" "Mus musculus" "Rattus norvegicus" "Drosophila mel
# Check the value of 'organism' variable and set 'GENE_MAPPING_STEP' accordingly
if [[ $organism == "Arabidopsis thaliana" ]]; then
GENE_MAPPING_STEP="Ensembl gene ID mappings were retrieved for each probe using the Plants Ensembl database ftp server (plants.ensembl.org, release 54)."
+elif [[ $biomart_attribute == "AGILENT SurePrint G3 GE 8x60k v3" ]]; then
+ GENE_MAPPING_STEP="Gene annotations were retrieved for each probe from Agilent (https://earray.chem.agilent.com/earray/array/displayViewArrayDesign.do?eArrayAction=view&arraydesignid=ADID40392, created May 2024, accessed November 2024)."
elif [[ " ${organism_list[*]} " == *"${organism//\"/}"* ]]; then
GENE_MAPPING_STEP="Ensembl gene ID mappings were retrieved for each probe using biomaRt (version ${biomaRt_VERSION}), Ensembl database (ensembl.org, release 107)."
else
@@ -61,8 +63,22 @@ else
GENE_ANNOTATION_DB="TBD"
fi
+# Check if DGE was performed
+if $2; then
+ DE_STEP=""
+else
+ DE_STEP="Differential expression analysis was performed in R (version ${R_VERSION}) using limma (version ${limma_VERSION}); all groups were compared pairwise for each probe to generate a moderated t-statistic and associated p- and adjusted p-value."
+fi
+
+# Gene annotations
+if [[ $biomart_attribute == "AGILENT SurePrint G3 GE 8x60k v3" ]]; then
+ ANNOT_STEP="" # Already covered in GENE_MAPPING_STEP
+else
+ ANNOT_STEP="Gene annotations were assigned using the custom annotation tables generated in-house as detailed in GL-DPPD-7110 ([https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110.md]), with STRINGdb (version 2.8.4), PANTHER.db (version 1.0.11), and ${GENE_ANNOTATION_DB} (version 3.15.0)."
+fi
+
# Read the template file
-template="Data were processed as described in GL-DPPD-7112 ([https://github.com/nasa/GeneLab_Data_Processing/blob/master/Microarray/Agilent_1-channel/Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md]), using NF_MAAgilent1ch version $1 ([https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_$1/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch]). In short, a RunSheet containing raw data file location and processing metadata from the study's *ISA.zip file was generated using dp_tools (version ${dp_tools_VERSION}). The raw array data files were loaded into R (version ${R_VERSION}) using limma (version ${limma_VERSION}). Raw data quality assurance density, pseudo image, MA, and foreground-background plots were generated using limma (version ${limma_VERSION}), and boxplots were generated using ggplot2 (version ${ggplot2_VERSION}). The raw intensity data was background corrected and normalized across arrays via the limma (version ${limma_VERSION}) quantile method. Normalized data quality assurance density, pseudo image, and MA plots were generated using limma (version ${limma_VERSION}), and boxplots were generated using ggplot2 (version ${ggplot2_VERSION}). ${GENE_MAPPING_STEP} Differential expression analysis was performed in R (version ${R_VERSION}) using limma (version ${limma_VERSION}); all groups were compared pairwise for each probe to generate a moderated t-statistic and associated p- and adjusted p-value. Gene annotations were assigned using the custom annotation tables generated in-house as detailed in GL-DPPD-7110 ([https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110.md]), with STRINGdb (version 2.8.4), PANTHER.db (version 1.0.11), and ${GENE_ANNOTATION_DB} (version 3.15.0)."
+template="Data were processed as described in GL-DPPD-7112-A ([https://github.com/nasa/GeneLab_Data_Processing/blob/master/Microarray/Agilent_1-channel/Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md]), using NF_MAAgilent1ch version $1 ([https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_$1/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch]). In short, a RunSheet containing raw data file location and processing metadata from the study's *ISA.zip file was generated using dp_tools (version ${dp_tools_VERSION}). The raw array data files were loaded into R (version ${R_VERSION}) using limma (version ${limma_VERSION}). Raw data quality assurance density, pseudo image, MA, and foreground-background plots were generated using limma (version ${limma_VERSION}), and boxplots were generated using ggplot2 (version ${ggplot2_VERSION}). The raw intensity data was background corrected and normalized across arrays via the limma (version ${limma_VERSION}) quantile method. Normalized data quality assurance density, pseudo image, and MA plots were generated using limma (version ${limma_VERSION}), and boxplots were generated using ggplot2 (version ${ggplot2_VERSION}). ${GENE_MAPPING_STEP} ${DE_STEP} ${ANNOT_STEP}"
# Output the filled template
echo "$template" > PROTOCOL_GLmicroarray.txt
\ No newline at end of file
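The protocol generator above is driven by two positional arguments plus a set of environment variables. Below is a minimal sketch of exercising it outside the workflow, assuming the script is saved as `generate_protocol.sh` (hypothetical name) and that the staged `GeneLab/meta.sh` exports `organism` and `biomart_attribute`, as the post_processing.nf wiring further down suggests:

```bash
# Hypothetical stand-in for GeneLab/meta.sh; variable names are assumed
# from their usage in the protocol script above. The ${...}_VERSION
# variables and organism_list are assumed to be defined within the
# script itself or a sourced helper.
export organism="Mus musculus"
export biomart_attribute="agilent_wholegenome_4x44k_v2"

# $1 = NF_MAAgilent1ch version, $2 = skipDE flag ("true" omits the DE sentence)
bash generate_protocol.sh 1.0.5 false
cat PROTOCOL_GLmicroarray.txt
```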
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/VV_AGILE1CH.nf b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/VV_AGILE1CH.nf
index e1eeaee2..e43ec0f2 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/VV_AGILE1CH.nf
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/modules/VV_AGILE1CH.nf
@@ -6,13 +6,13 @@ process VV_AGILE1CH {
saveAs: { "VV_Logs/VV_log_${ task.process.replace(":","-") }_GLmicroarray.tsv.MANUAL_CHECKS_PENDING" }
// V&V'ed data publishing
publishDir "${ params.outputDir }/${ params.gldsAccession }",
- pattern: '00-RawData',
+ pattern: '00-RawData/**',
mode: params.publish_dir_mode
publishDir "${ params.outputDir }/${ params.gldsAccession }",
- pattern: '01-limma_NormExp',
+ pattern: '01-limma_NormExp/**',
mode: params.publish_dir_mode
publishDir "${ params.outputDir }/${ params.gldsAccession }",
- pattern: '02-limma_DGE',
+ pattern: '02-limma_DGE/**',
mode: params.publish_dir_mode
publishDir "${ params.outputDir }/${ params.gldsAccession }",
pattern: 'Metadata/**',
@@ -22,18 +22,15 @@ process VV_AGILE1CH {
input:
path("VV_INPUT/Metadata/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC
- tuple path("VV_INPUT/02-limma_DGE"),
- path("VV_INPUT/01-limma_NormExp"),
- path("VV_INPUT/00-RawData")
- // "While files from processing are staged, we instead want to use the files located in the publishDir for QC
+        path("VV_INPUT/*") // While files from processing are staged, we instead want to use the files located in the publishDir for QC
val(skipVV) // Skips running V&V but will still publish the files
- path("dp_tools__agilent_1_channel")
+ path(dp_tools_path)
output:
path("Metadata/*_runsheet.csv"), emit: VVed_runsheet
- tuple path("02-limma_DGE"),
- path("01-limma_NormExp"),
- path("00-RawData"), emit: VVed_de_data
+ path("02-limma_DGE/*"), emit: VVed_DGE, optional: true
+ path("01-limma_NormExp/*"), emit: VVed_NormExp
+ path("00-RawData/*"), emit: VVed_rawData
path("VV_report_GLmicroarray.tsv.MANUAL_CHECKS_PENDING"), optional: params.skipVV, emit: log
path("versions.yml"), emit: versions
@@ -45,7 +42,7 @@ process VV_AGILE1CH {
# Run V&V unless user requests to skip V&V
if ${ !skipVV} ; then
- dpt validation run dp_tools__agilent_1_channel . Metadata/*_runsheet.csv
+ dpt validation run ${ dp_tools_path } . Metadata/*_runsheet.csv
mv VV_report.tsv.MANUAL_CHECKS_PENDING VV_report_GLmicroarray.tsv.MANUAL_CHECKS_PENDING
fi
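Changing the publish patterns from bare directory names to `*/**` globs publishes the files inside each directory rather than the directory link itself, which also allows the `02-limma_DGE` output to be declared optional when DGE is skipped. A rough bash-globstar analogy of the two patterns (Nextflow uses Java-style glob matching, so treat this as illustrative only):

```bash
shopt -s globstar nullglob
mkdir -p 00-RawData/Fastq && touch 00-RawData/Fastq/sample1.txt  # placeholder layout

printf '%s\n' 00-RawData      # matches only the top-level directory entry
printf '%s\n' 00-RawData/**   # matches the directory plus every nested path
```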
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/nextflow.config b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/nextflow.config
index d0d705ee..efa5fdf4 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/nextflow.config
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/nextflow.config
@@ -41,11 +41,11 @@ profiles {
manifest {
homePage = 'https://github.com/nasa/GeneLab_Data_Processing/tree/master/Microarray/Agilent_1-channel'
- description = 'Agilent 1 Channel Microarray Workflow for Document GL-DPPD-7112'
+ description = 'Agilent 1 Channel Microarray Workflow for Document GL-DPPD-7112-A'
mainScript = 'main.nf'
defaultBranch = 'main'
- nextflowVersion = '>=23.10.1'
- version = '1.0.4'
+ nextflowVersion = '>=24.10.5'
+ version = '1.0.5'
}
def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
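Because the manifest now requires Nextflow >=24.10.5, launches from older installations fail the version check up front. One way to pin a compliant release at invocation time is Nextflow's built-in `NXF_VER` selector (the accession and output path below are placeholders):

```bash
# The nextflow launcher fetches and runs the release named in NXF_VER.
export NXF_VER=24.10.5
nextflow run main.nf --gldsAccession OSD-000 --outputDir ./results
```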
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/post_processing.nf b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/post_processing.nf
index 21203fc4..64d791fa 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/post_processing.nf
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/post_processing.nf
@@ -51,18 +51,20 @@ workflow {
ch_processed_directory = Channel.fromPath("${ params.outputDir }/${ params.gldsAccession }", checkIfExists: true)
ch_runsheet = Channel.fromPath("${ params.outputDir }/${ params.gldsAccession }/Metadata/*_runsheet.csv", checkIfExists: true)
ch_software_versions = Channel.fromPath("${ params.outputDir }/${ params.gldsAccession }/GeneLab/software_versions_GLmicroarray.md", checkIfExists: true)
+ ch_processing_meta = Channel.fromPath("${ params.outputDir }/${ params.gldsAccession }/GeneLab/meta.sh", checkIfExists: true)
GENERATE_MD5SUMS(
ch_processed_directory,
ch_runsheet,
- "${ projectDir }/bin/dp_tools__agilent_1_channel" // dp_tools plugin
+ "${ projectDir }/bin/${ params.skipDE ? 'dp_tools__agilent_1_channel_skipDE' : 'dp_tools__agilent_1_channel' }" // dp_tools plugin
)
UPDATE_ISA_TABLES(
ch_processed_directory,
ch_runsheet,
- "${ projectDir }/bin/dp_tools__agilent_1_channel" // dp_tools plugin
+ "${ projectDir }/bin/${ params.skipDE ? 'dp_tools__agilent_1_channel_skipDE' : 'dp_tools__agilent_1_channel' }" // dp_tools plugin
)
GENERATE_PROTOCOL(
ch_software_versions,
- ch_runsheet | splitCsv(header: true, quote: '"') | first | map{ row -> row['organism'] }
+ ch_processing_meta,
+ params.skipDE
)
}
\ No newline at end of file
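Both post-processing steps now resolve the dp_tools plugin from the same `params.skipDE` ternary, and GENERATE_PROTOCOL consumes the staged `meta.sh` plus the skipDE flag instead of re-parsing the runsheet for the organism. A bash rendering of the plugin selection (the plugin directory names are taken from the diff; everything else is illustrative):

```bash
skipDE=true   # mirrors params.skipDE

if [[ "$skipDE" == "true" ]]; then
    dp_tools_plugin="bin/dp_tools__agilent_1_channel_skipDE"
else
    dp_tools_plugin="bin/dp_tools__agilent_1_channel"
fi

# Same invocation shape as in VV_AGILE1CH above.
dpt validation run "$dp_tools_plugin" . Metadata/*_runsheet.csv
```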
diff --git a/Microarray/Agilent_1-channel/Workflow_Documentation/README.md b/Microarray/Agilent_1-channel/Workflow_Documentation/README.md
index 1b90bac6..a20ca913 100644
--- a/Microarray/Agilent_1-channel/Workflow_Documentation/README.md
+++ b/Microarray/Agilent_1-channel/Workflow_Documentation/README.md
@@ -6,7 +6,7 @@
|Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version|
|:---------------|:---------------------------------------------------------|:---------------|
-|*[GL-DPPD-7112.md](../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md)|[NF_MAAgilent1ch_1.0.4](NF_MAAgilent1ch)|23.10.1|
+|*[GL-DPPD-7112-A.md](../Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112-A.md)|[NF_MAAgilent1ch_1.0.5](NF_MAAgilent1ch)|24.10.5|
*Current GeneLab Pipeline/Workflow Implementation