From 08cefec1e96b84d7cf004bf2831866d59e153ad2 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sat, 30 Nov 2024 20:26:47 -0800 Subject: [PATCH 01/25] NF_MAAffymetrix: update qmd structure --- .../workflow_code/bin/Affymetrix.qmd | 186 +++++++++--------- .../modules/PROCESS_AFFYMETRIX.nf | 1 + 2 files changed, 95 insertions(+), 92 deletions(-) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index c5f3bec0..8ab701c3 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -1,6 +1,6 @@ --- title: "Affymetrix Processing" -subtitle: "Workflow Version: NF_MAAffymetrix_1.0.5" +subtitle: "`r paste0('Workflow Version: NF_MAAffymetrix_', params$workflow_version)`" date: now title-block-banner: true format: @@ -14,6 +14,7 @@ format: number-sections: true params: + workflow_version: NULL id: NULL # str, used to name output files runsheet: NULL # str, path to runsheet biomart_attribute: NULL # str, used as a fallback value if 'Array Design REF' column is not found in the runsheet @@ -479,11 +480,9 @@ DT::datatable(head(raw_data$genes, n = 20), caption = "First 20 rows of raw data # NON_DPPD:END ``` -## Perform Probeset Differential Expression and Annotation +## Probeset Annotations -### Probeset Differential Expression (DE) - -#### Add Probeset Annotations +### Get Probeset Annotations ``` {r retrieve-probeset-annotations} #| message: false @@ -755,9 +754,92 @@ print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) message(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) # NON_DPPD ``` +### Add Annotation Columns and Format Tables + +```{r save-tables} +## Reorder columns before saving to file +ANNOTATIONS_COLUMN_ORDER = c( + annot_key, + 
"SYMBOL", + "GENENAME", + "REFSEQ", + "ENTREZID", + "STRING_id", + "GOSLIM_IDS" +) + +SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name` + +probeset_expression_matrix.biomart_mapped <- probeset_expression_matrix.biomart_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) + +## Output column subset file with just normalized probeset level expression values +write.csv( + probeset_expression_matrix.biomart_mapped[c( + ANNOTATIONS_COLUMN_ORDER, + "ProbesetID", + primary_key_count, + SAMPLE_COLUMN_ORDER) + ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE) + +## Determine column order for probe level tables + +PROBE_INFO_COLUMN_ORDER = c( + "ProbesetID", + "ProbeID", + primary_key_count +) + +FINAL_COLUMN_ORDER <- c( + ANNOTATIONS_COLUMN_ORDER, + PROBE_INFO_COLUMN_ORDER, + SAMPLE_COLUMN_ORDER + ) + +## Generate raw intensity matrix that includes annotations + +background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key + dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing + dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid + dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID + dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID + dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings + dplyr::mutate( !!primary_key_count := ifelse(is.na(get(primary_key)), 0, get(primary_key_count)) ) %>% # Convert NA mapping to 0 + dplyr::rename( !!annot_key := ENSEMBL ) + +## Perform reordering +background_corrected_data_annotated <- background_corrected_data_annotated %>% + 
dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) + +write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE) + +## Generate normalized expression matrix that includes annotations +norm_data_matrix_annotated <- oligo::exprs(norm_data) %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key + dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing + dplyr::right_join(oligo::getProbeInfo(norm_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid + dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID + dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID + dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% + dplyr::mutate( !!primary_key_count := ifelse(is.na(get(primary_key)), 0, get(primary_key_count)) ) %>% # Convert NA mapping to 0 + dplyr::rename( !!annot_key := ENSEMBL ) + +norm_data_matrix_annotated <- norm_data_matrix_annotated %>% + dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) + +write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE) +``` + +## Perform Probeset Differential Expression (DE) + ### Generate Design Matrix ``` {r generate-design-matrix} +#| include: !expr params$run_DE +#| eval: !expr params$run_DE + runsheetToDesignMatrix <- function(runsheet_path) { df <- read.csv(runsheet, check.names = FALSE) %>% dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character # get only Factor Value columns @@ -804,11 +886,9 @@ runsheetToDesignMatrix <- function(runsheet_path) { design_data <- runsheetToDesignMatrix(runsheet) design <- design_data$matrix -if (params$run_DE) 
{ - # Write SampleTable.csv and contrasts.csv file - write.csv(design_data$groups, file.path(DIR_DGE, "SampleTable_GLmicroarray.csv"), row.names = FALSE) - write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv")) -} +# Write SampleTable.csv and contrasts.csv file +write.csv(design_data$groups, file.path(DIR_DGE, "SampleTable_GLmicroarray.csv"), row.names = FALSE) +write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv")) ``` ### Perform Individual Probeset Level DE @@ -845,92 +925,14 @@ limma::write.fit(res, adjust = 'BH', row.names = FALSE, quote = TRUE, sep = ",") + +### Generate and export PCA table for GeneLab visualization plots +PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed +write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")) ``` ### Add Additional Columns and Format DE Table -```{r save-tables} -## Reorder columns before saving to file -ANNOTATIONS_COLUMN_ORDER = c( - annot_key, - "SYMBOL", - "GENENAME", - "REFSEQ", - "ENTREZID", - "STRING_id", - "GOSLIM_IDS" -) - -SAMPLE_COLUMN_ORDER <- design_data$group %>% dplyr::pull(sample) - -probeset_expression_matrix.biomart_mapped <- probeset_expression_matrix.biomart_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) - -## Output column subset file with just normalized probeset level expression values -write.csv( - probeset_expression_matrix.biomart_mapped[c( - ANNOTATIONS_COLUMN_ORDER, - "ProbesetID", - primary_key_count, - SAMPLE_COLUMN_ORDER) - ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE) - -if (params$run_DE) { - ### Generate and export PCA table for GeneLab visualization plots - PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed - write.csv(PCA_raw$x, file.path(DIR_DGE, 
"visualization_PCA_table_GLmicroarray.csv")) -} - -## Determine column order for probe level tables - -PROBE_INFO_COLUMN_ORDER = c( - "ProbesetID", - "ProbeID", - primary_key_count -) - -FINAL_COLUMN_ORDER <- c( - ANNOTATIONS_COLUMN_ORDER, - PROBE_INFO_COLUMN_ORDER, - SAMPLE_COLUMN_ORDER - ) - -## Generate raw intensity matrix that includes annotations - -background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key - dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing - dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid - dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID - dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID - dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings - dplyr::mutate( !!primary_key_count := ifelse(is.na(get(primary_key)), 0, get(primary_key_count)) ) %>% # Convert NA mapping to 0 - dplyr::rename( !!annot_key := ENSEMBL ) - -## Perform reordering -background_corrected_data_annotated <- background_corrected_data_annotated %>% - dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) - -write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE) - -## Generate normalized expression matrix that includes annotations -norm_data_matrix_annotated <- oligo::exprs(norm_data) %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key - dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing - 
dplyr::right_join(oligo::getProbeInfo(norm_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid - dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID - dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID - dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% - dplyr::mutate( !!primary_key_count := ifelse(is.na(get(primary_key)), 0, get(primary_key_count)) ) %>% # Convert NA mapping to 0 - dplyr::rename( !!annot_key := ENSEMBL ) - -norm_data_matrix_annotated <- norm_data_matrix_annotated %>% - dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) - -write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE) -``` - ``` {r save-de-table} #| message: false #| include: !expr params$run_DE diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PROCESS_AFFYMETRIX.nf b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PROCESS_AFFYMETRIX.nf index 6f6e3301..e8947df1 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PROCESS_AFFYMETRIX.nf +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PROCESS_AFFYMETRIX.nf @@ -28,6 +28,7 @@ process PROCESS_AFFYMETRIX { export HOME=\$PWD; quarto render \$PWD/${qmd} \ + -P 'workflow_version:${workflow.manifest.version}' \ -P 'runsheet:${runsheet_csv}' \ -P 'annotation_file_path:${annotation_file_path}' \ -P 'ensembl_version:${ensemblVersion}' \ From 501ec0f8b367bdb1114e5709f59a279dd0b9fb3d Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 5 Jan 2025 13:18:33 -0800 Subject: [PATCH 02/25] NF_MAAffymetrix: #113 update handling custom annotations --- .../workflow_code/bin/Affymetrix.qmd | 304 ++++++++++-------- .../resources/usr/bin/generate_protocol.sh | 4 +- 2 files changed, 169 insertions(+), 
139 deletions(-) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index 8ab701c3..c319d3c0 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -45,7 +45,10 @@ if (is.null(params$runsheet)) { stop("PARAMETERIZATION ERROR: Must supply runsheet path") } -runsheet = params$runsheet # +runsheet <- params$runsheet # + +# If using custom annotation, local_annotation_dir is path to directory containing annotation file and config +local_annotation_dir <- params$local_annotation_dir # message(params) @@ -66,7 +69,7 @@ dir.create(DIR_DGE) original_par <- par() options(preferRaster=TRUE) # use Raster when possible to avoid antialiasing artifacts in images -options(timeout=1000) +options(timeout=1000) # ensure enough time for data downloads ``` ## Load Metadata and Raw Data @@ -197,6 +200,9 @@ message(paste0("Number of Probes: ", dim(raw_data)[1])) # NON_DPPD DT::datatable(raw_data$targets, caption = "Sample to File Mapping") DT::datatable(head(raw_data$genes, n = 20), caption = "First 20 rows of raw data file embedded probes to genes table") # NON_DPPD:END + +annotation_file_path <- params$annotation_file_path +ensembl_version <- params$ensembl_version ``` ## QA For Raw Data @@ -463,7 +469,7 @@ par(original_par) # Call RMA but skip normalize and background correction since those have already been applied probeset_level_data <- oligo::rma(norm_data, normalize=FALSE, - background=FALSE, + background=FALSE ) # Summarize background-corrected and normalized data @@ -553,54 +559,35 @@ get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_port return(mapping) } +# Convert list of multi-mapped genes to string +listToUniquePipedString <- function(str_list) { 
+ #! convert lists into strings denoting unique elements separated by '|' characters + #! e.g. c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3" + return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|")) +} -organism <- shortenedOrganismName(unique(df_rs$organism)) - -if (organism %in% c('ecoli', 'paeruginosa')) { - expected_attribute_name <- 'ProbesetID' - - annot_file <- c( - 'ecoli' = 'E_coli_2.na36.annot.csv', - 'paeruginosa' = 'Pae_G1a.na36.annot.csv' - ) - - df_mapping <- read.csv( - file.path(params$local_annotation_dir, annot_file[[organism]]), - skip = 13, header = TRUE, na.strings = c('NA', '---') - )[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq.Transcript.ID', 'RefSeq.Protein.ID', 'Gene.Ontology.Biological.Process', 'Gene.Ontology.Cellular.Component', 'Gene.Ontology.Molecular.Function')] - - # Clean columns - df_mapping$Gene.Symbol <- purrr::map_chr(stringr::str_split(df_mapping$Gene.Symbol, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) - df_mapping$Gene.Title <- purrr::map_chr(stringr::str_split(df_mapping$Gene.Title, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) - df_mapping$Entrez.Gene <- purrr::map_chr(stringr::str_split(df_mapping$Entrez.Gene, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) - df_mapping$Ensembl <- purrr::map_chr(stringr::str_split(df_mapping$Ensembl, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) - - df_mapping$RefSeq <- paste(df_mapping$RefSeq.Transcript.ID, df_mapping$RefSeq.Protein.ID) - df_mapping$RefSeq <- purrr::map_chr(stringr::str_extract_all(df_mapping$RefSeq, '[A-Z]+_[\\d.]+'), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^$', NA_character_) - - df_mapping$GO <- 
paste(df_mapping$Gene.Ontology.Biological.Process, df_mapping$Gene.Ontology.Cellular.Component, df_mapping$Gene.Ontology.Molecular.Function) - df_mapping$GO <- purrr::map_chr(stringr::str_extract_all(df_mapping$GO, '\\d{7}'), ~paste0('GO:', unique(.), collapse = "|")) %>% stringr::str_replace('^GO:$', NA_character_) - df_mapping <- df_mapping[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq', 'GO')] - names(df_mapping) <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS') +organism <- shortenedOrganismName(unique(df_rs$organism)) +annot_key <- ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL') - df_mapping$STRING_id <- NA_character_ -} else if (organism %in% c("athaliana")) { - ensembl_genomes_version = params$ensembl_version +if (organism %in% c("athaliana")) { + ENSEMBL_VERSION = ensembl_version ensembl_genomes_portal = "plants" - print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ensembl_genomes_version}")) + print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. 
Ensembl genomes portal: {ensembl_genomes_portal}, version: {ENSEMBL_VERSION}")) expected_attribute_name <- getBioMartAttribute(df_rs) df_mapping <- retry_with_delay( get_ensembl_genomes_mappings_from_ftp, organism = organism, ensembl_genomes_portal = ensembl_genomes_portal, - ensembl_genomes_version = ensembl_genomes_version, + ensembl_genomes_version = ENSEMBL_VERSION, biomart_attribute = expected_attribute_name ) # TAIR from the mapping tables tend to be in the format 'AT1G01010.1' but the raw data has 'AT1G01010' # So here we remove the '.NNN' from the mapping table where .NNN is any number df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "") + + use_custom_annot <- FALSE } else { # Use biomart from main Ensembl website which archives keep each release on the live service # locate dataset @@ -610,94 +597,140 @@ if (organism %in% c('ecoli', 'paeruginosa')) { # Specify Ensembl version used in current GeneLab reference annotations - ENSEMBL_VERSION <- params$ensembl_version + ENSEMBL_VERSION <- ensembl_version print(paste0("Searching for Ensembl Version: ", ENSEMBL_VERSION)) # NON_DPPD print(glue::glue("Using Ensembl biomart to get specific version of mapping table. 
Ensembl version: {ENSEMBL_VERSION}")) - ensembl <- biomaRt::useEnsembl(biomart = "genes", - dataset = expected_dataset_name, - version = ENSEMBL_VERSION) - print(ensembl) + # Check if organism in supported in biomart + ensembl <- biomaRt::useEnsembl(biomart = "genes") + ensembl_datasets <- biomaRt::listDatasets(ensembl) + use_custom_annot <- !expected_dataset_name %in% ensembl_datasets$dataset - expected_attribute_name <- getBioMartAttribute(df_rs) - print(paste0("Expected attribute name: '", expected_attribute_name, "'")) - message(paste0("Expected attribute name: '", expected_attribute_name, "'")) # NON_DPPD + if (use_custom_annot) { + unloadNamespace("biomaRt") + } else { - # Some probe_ids for affy_hta_2_0 may end in .hg.1 instead of .hg (how it is in biomaRt), leading to 0 results returned - if (expected_attribute_name == 'affy_hta_2_0') { - rownames(probeset_level_data) <- stringr::str_replace(rownames(probeset_level_data), '\\.hg\\.1$', '.hg') - } + ensembl <- biomaRt::useEnsembl(biomart = "genes", + dataset = expected_dataset_name, + version = ENSEMBL_VERSION) + print(ensembl) - probe_ids <- rownames(probeset_level_data) + expected_attribute_name <- getBioMartAttribute(df_rs) + print(paste0("Expected attribute name: '", expected_attribute_name, "'")) + message(paste0("Expected attribute name: '", expected_attribute_name, "'")) # NON_DPPD - # DEBUG:START - if ( is.integer(params$DEBUG_limit_biomart_query) ) { - warning(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries")) - message(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries")) - probe_ids <- probe_ids[1:params$DEBUG_limit_biomart_query] - } - # DEBUG:END - - # Create probe map - # Run Biomart Queries in chunks to prevent request timeouts - # Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size - CHUNK_SIZE= 1500 - probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE)) - 
df_mapping <- data.frame() - for (i in seq_along(probe_id_chunks)) { - probe_id_chunk <- probe_id_chunks[[i]] - print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})")) - message(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})")) # NON_DPPD - chunk_results <- biomaRt::getBM( - attributes = c( - expected_attribute_name, - "ensembl_gene_id" - ), - filters = expected_attribute_name, - values = probe_id_chunk, - mart = ensembl) - - if (nrow(chunk_results) > 0) { - df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results) + # Some probe_ids for affy_hta_2_0 may end in .hg.1 instead of .hg (how it is in biomaRt), leading to 0 results returned + if (expected_attribute_name == 'affy_hta_2_0') { + rownames(probeset_level_data) <- stringr::str_replace(rownames(probeset_level_data), '\\.hg\\.1$', '.hg') } - - Sys.sleep(10) # Slight break between requests to prevent back-to-back requests + + probe_ids <- rownames(probeset_level_data) + + # DEBUG:START + if ( is.integer(params$DEBUG_limit_biomart_query) ) { + warning(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries")) + message(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries")) + probe_ids <- probe_ids[1:params$DEBUG_limit_biomart_query] + } + # DEBUG:END + + # Create probe map + # Run Biomart Queries in chunks to prevent request timeouts + # Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size + CHUNK_SIZE= 1500 + probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE)) + df_mapping <- data.frame() + for (i in seq_along(probe_id_chunks)) { + probe_id_chunk <- probe_id_chunks[[i]] + print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. 
Total probes IDS in query ({length(probe_id_chunk)})")) + message(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})")) # NON_DPPD + chunk_results <- biomaRt::getBM( + attributes = c( + expected_attribute_name, + "ensembl_gene_id" + ), + filters = expected_attribute_name, + values = probe_id_chunk, + mart = ensembl) + + if (nrow(chunk_results) > 0) { + df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results) + } + + Sys.sleep(10) # Slight break between requests to prevent back-to-back requests + } + } } # At this point, we have df_mapping from either the biomart live service or the ensembl genomes ftp archive depending on the organism -``` +# If no df_mapping obtained (e.g., organism not supported in biomart), use custom annotations; otherwise, merge in-house annotations to df_mapping -``` {r reformat-merge-probe-annotations} -# Convert list of multi-mapped genes to string -listToUniquePipedString <- function(str_list) { - #! convert lists into strings denoting unique elements separated by '|' characters - #! e.g. 
c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3" - return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|")) -} +if (use_custom_annot) { + expected_attribute_name <- 'ProbesetID' -annot_key = ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL') + annot_type <- 'NO_CUSTOM_ANNOT' + if (!is.null(local_annotation_dir) && file.exists(file.path(local_annotation_dir, 'config.csv'))) { + config_df <- read.csv(file.path(local_annotation_dir, 'config.csv'), row.names=1) + if (df_rs$`biomart_attribute` %in% row.names(config_df)) { + annot_config <- config_df[df_rs$`biomart_attribute`, ] + annot_type <- annot_config$annot_type[[1]] + } else { + warning(paste0("No entry for '", df_rs$`biomart_attribute`, "' in provided config.csv")) + } + } else { + warning(paste0("No 'config.csv' file found in path (--referenceStorePath): ", local_annotation_dir)) + } -if (organism %in% c('ecoli')) { - unique_probe_ids <- df_mapping %>% - dplyr::mutate( - count_ENTREZID_mappings = 1 + stringr::str_count(ENTREZID, stringr::fixed("|")) - ) + if (annot_type == '3prime-IVT') { + unique_probe_ids <- read.csv( + file.path(local_annotation_dir, annot_config$annot_filename[[1]]), + skip = 13, header = TRUE, na.strings = c('NA', '---') + )[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq.Transcript.ID', 'RefSeq.Protein.ID', 'Gene.Ontology.Biological.Process', 'Gene.Ontology.Cellular.Component', 'Gene.Ontology.Molecular.Function')] + + # Clean columns + unique_probe_ids$Gene.Symbol <- purrr::map_chr(stringr::str_split(unique_probe_ids$Gene.Symbol, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) + unique_probe_ids$Gene.Title <- purrr::map_chr(stringr::str_split(unique_probe_ids$Gene.Title, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) + unique_probe_ids$Entrez.Gene <- 
purrr::map_chr(stringr::str_split(unique_probe_ids$Entrez.Gene, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) + unique_probe_ids$Ensembl <- purrr::map_chr(stringr::str_split(unique_probe_ids$Ensembl, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) + + unique_probe_ids$RefSeq <- paste(unique_probe_ids$RefSeq.Transcript.ID, unique_probe_ids$RefSeq.Protein.ID) + unique_probe_ids$RefSeq <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$RefSeq, '[A-Z]+_[\\d.]+'), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^$', NA_character_) + + unique_probe_ids$GO <- paste(unique_probe_ids$Gene.Ontology.Biological.Process, unique_probe_ids$Gene.Ontology.Cellular.Component, unique_probe_ids$Gene.Ontology.Molecular.Function) + unique_probe_ids$GO <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$GO, '\\d{7}'), ~paste0('GO:', unique(.), collapse = "|")) %>% stringr::str_replace('^GO:$', NA_character_) + + unique_probe_ids <- unique_probe_ids[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq', 'GO')] + names(unique_probe_ids) <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS') + + unique_probe_ids$STRING_id <- NA_character_ + + gene_col <- 'ENSEMBL' + if (sum(!is.na(unique_probe_ids$ENTREZID)) > sum(!is.na(unique_probe_ids$ENSEMBL))) { + gene_col <- 'ENTREZID' + } + if (sum(!is.na(unique_probe_ids$SYMBOL)) > sum(!is.na(unique_probe_ids$ENTREZID))) { + gene_col <- 'SYMBOL' + } - primary_key <- 'ENTREZID' - primary_key_count <- 'count_ENTREZID_mappings' -} else if (organism %in% c('paeruginosa')) { - unique_probe_ids <- df_mapping %>% + unique_probe_ids <- unique_probe_ids %>% dplyr::mutate( - count_SYMBOL_mappings = 1 + stringr::str_count(SYMBOL, stringr::fixed("|")) + count_gene_mappings = 1 + stringr::str_count(get(gene_col), stringr::fixed("|")), + 
gene_mapping_source = gene_col ) - - primary_key <- 'SYMBOL' - primary_key_count <- 'count_SYMBOL_mappings' + } else if (annot_type == 'custom') { + unique_probe_ids <- read.csv( + file.path(local_annotation_dir, annot_config$annot_filename[[1]]), + header = TRUE, na.strings = c('NA', '') + ) + } else { + annot_cols <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS', 'STRING_id', 'count_gene_mappings', 'gene_mapping_source') + unique_probe_ids <- setNames(data.frame(matrix(NA_character_, nrow = 1, ncol = length(annot_cols))), annot_cols) + } } else { annot <- read.table( - as.character(params$annotation_file_path), + as.character(annotation_file_path), sep = "\t", header = TRUE, quote = "", @@ -713,33 +746,33 @@ if (organism %in% c('ecoli')) { ) %>% # Count number of ensembl IDS mapped dplyr::mutate( - count_ENSEMBL_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")) + count_gene_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")), + gene_mapping_source = annot_key ) %>% dplyr::left_join(annot, by = c("ENSEMBL" = annot_key)) - - - primary_key <- 'ENSEMBL' - primary_key_count <- 'count_ENSEMBL_mappings' } +``` +``` {r reformat-merge-probe-annotations} probeset_expression_matrix <- oligo::exprs(probeset_level_data) -probeset_expression_matrix.biomart_mapped <- probeset_expression_matrix %>% +probeset_expression_matrix.gene_mapped <- probeset_expression_matrix %>% as.data.frame() %>% tibble::rownames_to_column(var = "ProbesetID") %>% # Ensure rownames (probeset IDs) can be used as join key dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% - dplyr::mutate( !!primary_key_count := ifelse(is.na(get(primary_key)), 0, get(primary_key_count)) ) + dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% + dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) ``` -### Summarize Probeset Mapping +### 
Summarize Gene Mapping ``` {r summarize-remapping-vs-original-mapping} #| message: false # Pie Chart with Percentages slices <- c( - 'Unique Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(get(primary_key_count) == 1) %>% dplyr::distinct(ProbesetID)), - 'Multi Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(get(primary_key_count) > 1) %>% dplyr::distinct(ProbesetID)), - 'No Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(get(primary_key_count) == 0) %>% dplyr::distinct(ProbesetID)) + 'Unique Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings == 1) %>% dplyr::distinct(ProbesetID)), + 'Multi Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings > 1) %>% dplyr::distinct(ProbesetID)), + 'No Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings == 0) %>% dplyr::distinct(ProbesetID)) ) pct <- round(slices/sum(slices)*100) chart_names <- names(slices) @@ -747,14 +780,14 @@ chart_names <- glue::glue("{names(slices)} ({slices})") # add count to labels chart_names <- paste(chart_names, pct) # add percents to labels chart_names <- paste(chart_names,"%",sep="") # ad % to labels pie(slices,labels = chart_names, col=rainbow(length(slices)), - main=glue::glue("Mapping to Primary Keytype\n {nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::distinct(ProbesetID))} Total Unique Probesets") + main=glue::glue("Mapping to Primary Keytype\n {nrow(probeset_expression_matrix.gene_mapped %>% dplyr::distinct(ProbesetID))} Total Unique Probesets") ) print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) message(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) # NON_DPPD ``` -### Add Annotation Columns and Format Tables +### Save Annotated Tables ```{r save-tables} ## Reorder columns before saving to file @@ -770,14 +803,15 @@ ANNOTATIONS_COLUMN_ORDER 
= c( SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name` -probeset_expression_matrix.biomart_mapped <- probeset_expression_matrix.biomart_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) +probeset_expression_matrix.gene_mapped <- probeset_expression_matrix.gene_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) ## Output column subset file with just normalized probeset level expression values write.csv( - probeset_expression_matrix.biomart_mapped[c( + probeset_expression_matrix.gene_mapped[c( ANNOTATIONS_COLUMN_ORDER, "ProbesetID", - primary_key_count, + "count_gene_mappings", + "gene_mapping_source", SAMPLE_COLUMN_ORDER) ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE) @@ -786,14 +820,15 @@ write.csv( PROBE_INFO_COLUMN_ORDER = c( "ProbesetID", "ProbeID", - primary_key_count + "count_gene_mappings", + "gene_mapping_source" ) FINAL_COLUMN_ORDER <- c( ANNOTATIONS_COLUMN_ORDER, PROBE_INFO_COLUMN_ORDER, SAMPLE_COLUMN_ORDER - ) +) ## Generate raw intensity matrix that includes annotations @@ -805,7 +840,8 @@ background_corrected_data_annotated <- oligo::exprs(background_corrected_data) % dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings - dplyr::mutate( !!primary_key_count := ifelse(is.na(get(primary_key)), 0, get(primary_key_count)) ) %>% # Convert NA mapping to 0 + dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% # Convert NA mapping to 0 + dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) %>% dplyr::rename( !!annot_key := ENSEMBL ) ## Perform reordering @@ -823,7 +859,8 @@ norm_data_matrix_annotated <- oligo::exprs(norm_data) %>% dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename 
from getProbeInfo name to ProbesetID dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% - dplyr::mutate( !!primary_key_count := ifelse(is.na(get(primary_key)), 0, get(primary_key_count)) ) %>% # Convert NA mapping to 0 + dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% # Convert NA mapping to 0 + dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) %>% dplyr::rename( !!annot_key := ENSEMBL ) norm_data_matrix_annotated <- norm_data_matrix_annotated %>% @@ -931,7 +968,7 @@ PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expressi write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")) ``` -### Add Additional Columns and Format DE Table +### Save DE Table ``` {r save-de-table} #| message: false @@ -942,9 +979,9 @@ write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.cs # Read in DE table df_interim <- read.csv("INTERIM.csv") -# Bind columns from biomart mapped expression table +# Bind columns from gene mapped expression table df_interim <- df_interim %>% - dplyr::bind_cols(probeset_expression_matrix.biomart_mapped) + dplyr::bind_cols(probeset_expression_matrix.gene_mapped) # Reformat column names reformat_names <- function(colname, group_name_mapping) { @@ -1038,7 +1075,8 @@ df_interim <- df_interim %>% dplyr::select(-any_of(colnames_to_remove)) PROBE_INFO_COLUMN_ORDER = c( "ProbesetID", - primary_key_count + "count_gene_mappings", + "gene_mapping_source" ) generate_prefixed_column_order <- function(subjects, prefixes) { @@ -1173,19 +1211,11 @@ get_versions <- function() { ## Note Libraries that were NOT used during processing versions_buffer <- get_versions() -if (organism %in% c("athaliana")) { - versions_buffer <- glue::glue_collapse(c( - versions_buffer, - glue::glue("- name: biomaRt"), - 
glue::glue(" version: (Not used for plant datasets)"), - glue::glue(" homepage: https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html"), - glue::glue(" workflow task: PROCESS_AFFYMETRIX") - ), sep = "\n") -} else if (organism %in% c('ecoli', 'paeruginosa')) { +if (organism %in% c("athaliana") || use_custom_annot) { versions_buffer <- glue::glue_collapse(c( versions_buffer, glue::glue("- name: biomaRt"), - glue::glue(" version: (Not used for bacteria datasets)"), + glue::glue(" version: (Not used for this dataset)"), glue::glue(" homepage: https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html"), glue::glue(" workflow task: PROCESS_AFFYMETRIX") ), sep = "\n") diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh index 5b6f2357..bddffaa3 100755 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh @@ -34,9 +34,9 @@ organism_list=("Homo sapiens" "Mus musculus" "Rattus norvegicus" "Drosophila mel # Check the value of 'organism' variable and set 'GENE_MAPPING_STEP' accordingly if [[ $organism == "Arabidopsis thaliana" ]]; then GENE_MAPPING_STEP="Ensembl gene ID mappings were retrieved for each probeset using the Plants Ensembl database ftp server (plants.ensembl.org, release 54)." 
-elif [[ $organism == "Escherichia coli" ]]; then +elif [[ $biomart_attribute == "AFFY E coli Genome 2 0" ]]; then GENE_MAPPING_STEP="Gene annotations were retrieved for each probeset from ThermoFisher (https://www.thermofisher.com/order/catalog/product/sec/assets?url=TFS-Assets/LSG/Support-Files/E_coli_2-na36-annot-csv.zip, created March 2016, accessed June 2024)." -elif [[ $organism == "Pseudomonas aeruginosa" ]]; then +elif [[ $biomart_attribute == "AFFY GeneChip P. aeruginosa Genome" ]]; then GENE_MAPPING_STEP="Gene annotations were retrieved for each probeset from ThermoFisher (https://www.thermofisher.com/order/catalog/product/sec/assets?url=TFS-Assets/LSG/Support-Files/Pae_G1a-na36-annot-csv.zip, created March 2016, accessed June 2024)." elif [[ " ${organism_list[*]} " == *"${organism//\"/}"* ]]; then GENE_MAPPING_STEP="Ensembl gene ID mappings were retrieved for each probeset using biomaRt (version ${biomaRt_VERSION}), Ensembl database (ensembl.org, release 107)." From 475a337883d0258ddcbb38509ed064e6c3d4f483 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 5 Jan 2025 15:56:37 -0800 Subject: [PATCH 03/25] NF_MAAffymetrix: update pipeline documentation --- .../GL-DPPD-7114.md | 648 +++++++++++------- .../examples/annotations/README.md | 20 + .../examples/annotations/config.csv | 3 + .../OSD-213_microarray_v0_runsheet.csv | 7 + .../runsheet/OSD-3_microarray_v0_runsheet.csv | 19 + .../examples/runsheet/README.md | 23 + 6 files changed, 455 insertions(+), 265 deletions(-) create mode 100644 Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md create mode 100644 Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/config.csv create mode 100644 Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/OSD-213_microarray_v0_runsheet.csv create mode 100644 Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/OSD-3_microarray_v0_runsheet.csv create 
mode 100644 Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/README.md diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md index 0c7ce195..c2cdc0bb 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md @@ -2,7 +2,7 @@ > **This page holds an overview and instructions for how GeneLab processes Affymetrix microarray datasets. Exact processing commands and GL-DPPD-7114 version used for specific GeneLab datasets (GLDS) are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo).** > -> \* The pipeline detailed below is currently used for animal and Arabidopsis Thaliana studies only, it will be updated soon for processing microbe microarray data and other plant data. +> \* The pipeline detailed below currently supports gene annotations for Arabidopsis Thaliana via Ensembl FTP, all animals available in Biomart, and custom annotations (see [Step 8a](#8a-get-probeset-annotations)). --- @@ -26,7 +26,9 @@ Lauren Sanders (acting GeneLab Project Scientist) - [Software used](#software-used) - [General processing overview with example commands](#general-processing-overview-with-example-commands) - [1. Create Sample RunSheet](#1-create-sample-runsheet) - - [2. Load Metadata and Raw Data](#2-load-metadata-and-raw-data) + - [2. Load Data](#2-load-data) + - [2a. Load Metadata and Raw Data](#2a-load-metadata-and-raw-data) + - [2b. Load Annotation Metadata](#2b-load-annotation-metadata) - [3. Raw Data Quality Assessment](#3-raw-data-quality-assessment) - [3a. Density Plot](#3a-density-plot) - [3b. Pseudo Image Plots](#3b-pseudo-image-plots) @@ -40,12 +42,14 @@ Lauren Sanders (acting GeneLab Project Scientist) - [6c. MA Plots](#6c-ma-plots) - [6d. Boxplots](#6d-boxplots) - [7. 
Probeset Summarization](#7-probeset-summarization) - - [8. Perform Probeset Differential Expression (DE)](#8-perform-probeset-differential-expression-de) - - [8a. Add Probeset Annotations](#8a-add-probeset-annotations) - - [8b. Summarize Biomart Mapping](#8b-summarize-biomart-mapping) - - [8c. Generate Design Matrix](#8c-generate-design-matrix) - - [8d. Perform Individual Probeset Level DE](#8d-perform-individual-probeset-level-de) - - [8e. Add Additional Columns and Format DE Table](#8e-add-additional-columns-and-format-de-table) + - [8. Probeset Annotations](#8-probeset-annotations) + - [8a. Get Probeset Annotations](#8a-get-probeset-annotations) + - [8b. Summarize Gene Mapping](#8b-summarize-gene-mapping) + - [8c. Save Annotated Tables](#8c-save-annotated-tables) + - [9. Perform Probeset Differential Expression (DE)](#9-perform-probeset-differential-expression-de) + - [9a. Generate Design Matrix](#9a-generate-design-matrix) + - [9b. Perform Individual Probeset Level DE](#9b-perform-individual-probeset-level-de) + - [9c. Save DE Table](#9c-save-de-table) --- @@ -80,7 +84,7 @@ Lauren Sanders (acting GeneLab Project Scientist) ## 1. Create Sample RunSheet > Notes: -> - Rather than running the commands below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/README.md). +> - Rather than running the commands below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/README.md). > > - These command line tools are part of the [dp_tools](https://github.com/J-81/dp_tools) program. @@ -120,9 +124,13 @@ dpt-isa-to-runsheet --accession OSD-### \ --- -## 2. Load Metadata and Raw Data +## 2. Load Data -> Note: Steps 2 - 8 are done in R +> Note: Steps 2 - 9 are done in R + +
+ +### 2a. Load Metadata and Raw Data ```R ### Install R packages if not already installed ### @@ -142,11 +150,14 @@ BiocManager::install("oligo") ## Note: Only dplyr is explicitly loaded. Other library functions are called with explicit namespace (e.g. LIBRARYNAME::FUNCTION) library(dplyr) # Ensure infix operator is available, methods should still reference dplyr namespace otherwise - +options(dplyr.summarise.inform = FALSE) # Don't print out '`summarise()` has grouped output by 'group'. You can override using the `.groups` argument.' # Define path to runsheet runsheet <- "/path/to/runsheet/{OSD-Accession-ID}_microarray_v{version}_runsheet.csv" +# If using custom annotation, define path to directory containing annotation file and config +local_annotation_dir <- NULL # + ## Set up output structure # Output Constants @@ -164,6 +175,8 @@ dir.create(DIR_DGE) original_par <- par() options(preferRaster=TRUE) # use Raster when possible to avoid antialiasing artifacts in images +options(timeout=1000) # ensure enough time for data downloads + # Utility function to improve robustness of function calls # Used to remedy intermittent internet issues during runtime retry_with_delay <- function(func, ...) { @@ -196,25 +209,6 @@ retry_with_delay <- function(func, ...) 
{ df_rs <- read.csv(runsheet, check.names = FALSE) %>% dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character -## Determines the organism specific annotation file to use based on the organism in the runsheet -fetch_organism_specific_annotation_file_path <- function(organism) { - # Uses the GeneLab GL-DPPD-7110_annotations.csv file to find the organism specific annotation file path - # Raises an exception if the organism does not have an associated annotation file yet - - - all_organism_table <- read.csv("https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv") - - annotation_file_path <- all_organism_table %>% dplyr::filter(species == organism) %>% dplyr::pull(genelab_annots_link) - - # Guard clause: Ensure annotation_file_path populated - # Else: raise exception for unsupported organism - if (length(annotation_file_path) == 0) { - stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv. 
Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row")) - } - - return(annotation_file_path) -} -annotation_file_path <- retry_with_delay(fetch_organism_specific_annotation_file_path, unique(df_rs$organism)) allTrue <- function(i_vector) { if ( length(i_vector) == 0 ) { @@ -287,9 +281,12 @@ print(paste0("Number of Arrays: ", dim(raw_data)[2])) print(paste0("Number of Probes: ", dim(raw_data)[1])) ``` -**Input Data:** +**Parameter Definitions:** - `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet)) +- `local_annotation_dir` (Path to local annotation directory if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) + + > Note: If not using custom annotations, leave `local_annotation_dir` as `NULL`. **Output Data:** @@ -300,6 +297,46 @@ print(paste0("Number of Probes: ", dim(raw_data)[1]))
+### 2b. Load Annotation Metadata + +```R +## Determines the organism specific annotation file to use based on the organism in the runsheet +fetch_organism_specific_annotation_table <- function(organism) { + # Uses the latest GeneLab annotations table to find the organism specific annotation file path and ensembl version + # Raises an exception if the organism does not have an associated annotation file or ensembl version yet + + annotation_table_link <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv" + all_organism_table <- read.csv(annotation_table_link) + + annotation_table <- all_organism_table %>% dplyr::filter(species == organism) + + # Guard clause: Ensure annotation_table populated + # Else: raise exception for unsupported organism + if (nrow(annotation_table) == 0 || annotation_table$genelab_annots_link == "" || is.na(annotation_table$ensemblVersion)) { + stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: {annotation_table_link}. 
Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row and a version number in the 'ensemblVersion' column.")) + } + + return(annotation_table) +} + +annotation_table <- retry_with_delay(fetch_organism_specific_annotation_table, unique(df_rs$organism)) + +annotation_file_path <- annotation_table$genelab_annots_link +ensembl_version <- as.character(annotation_table$ensemblVersion) +``` + +**Parameter Definitions:** + +- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) +- `annotation_table_link` (URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv)) + +**Output Data:** + +- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`) +- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`) + +
+ --- ## 3. Raw Data Quality Assessment @@ -315,6 +352,11 @@ par( ) number_of_sets = ceiling(dim(raw_data)[2] / 30) # Set of 30 samples, used to scale plot +scale_factor = 0.2 # Default scale factor + +if (max(nchar(colnames(raw_data@assayData$exprs))) > 35 && number_of_sets > 1) { # Scale more if sample names are long; both operands are scalars, so use short-circuit && + scale_factor = if (number_of_sets == 2) 0.4 else 0.25 # Scalar choice, so plain if/else instead of a vectorised if_else() +} oligo::hist(raw_data, transfo=log2, # Log2 transform raw intensity values @@ -325,7 +367,7 @@ legend("topright", legend = colnames(raw_data@assayData$exprs), lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types col = oligo::darkColors(n = ncol(raw_data)), # Ensure legend color is in sync with plot ncol = number_of_sets, # Set number of columns by number of sets - cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35% + cex = max(0.35, 1 + scale_factor - (number_of_sets*scale_factor)) # Reduce for each column beyond 1 with minimum of 35% ) # Reset par @@ -334,7 +376,7 @@ par(original_par) **Input Data:** -- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) **Output Data:** @@ -355,7 +397,7 @@ for ( i in seq_along(1:ncol(raw_data))) { **Input Data:** -- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) **Output Data:** @@ -368,8 +410,8 @@ for ( i in seq_along(1:ncol(raw_data))) { ```R if (inherits(raw_data, "GeneFeatureSet")) { print("Raw data is a GeneFeatureSet, using exprs() to access expression values and adding 0.0001 to avoid log(0)") -} else if (inherits(raw_data, "ExpressionSet")) { - print("Raw data is an ExpressionSet. 
Using default approach for this class for MA Plot") +} else if (inherits(raw_data, "ExpressionSet") || inherits(raw_data, "ExpressionFeatureSet") || inherits(raw_data, "HTAFeatureSet")) { + print(paste0("Raw data is ", class(raw_data), ". Using default approach for this class for MA Plot")) } if (inherits(raw_data, "GeneFeatureSet")) { @@ -379,26 +421,20 @@ if (inherits(raw_data, "GeneFeatureSet")) { ylim=c(-2, 4), main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string ) -} else if (inherits(raw_data, "ExpressionSet")) { - MA_plot <- oligo::MAplot( - raw_data, - ylim=c(-2, 4), - main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string - ) -} else if (inherits(raw_data, "ExpressionFeatureSet")) { +} else if (inherits(raw_data, "ExpressionSet") || inherits(raw_data, "ExpressionFeatureSet") || inherits(raw_data, "HTAFeatureSet")) { MA_plot <- oligo::MAplot( raw_data, ylim=c(-2, 4), main="" # This function uses 'main' as a suffix to the sample name. 
Here we want just the sample name, thus here main is an empty string ) } else { - stop(glue::glue("No strategy for MA plots for {raw_data}")) + stop(glue::glue("No strategy for MA plots for {class(raw_data)}")) } ``` **Input Data:** -- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) **Output Data:** @@ -433,7 +469,7 @@ par(original_par) **Input Data:** -- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) **Output Data:** @@ -451,7 +487,7 @@ background_corrected_data <- raw_data %>% oligo::backgroundCorrect(method="rma") **Input Data:** -- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) **Output Data:** @@ -470,7 +506,7 @@ background_corrected_data <- raw_data %>% oligo::backgroundCorrect(method="rma") # Normalize background-corrected data using the quantile method norm_data <- oligo::normalize(background_corrected_data, method = "quantile", - target = "core" # Use oligo default: probes with probeset id mapping + target = "core" # Use oligo default: core metaprobeset mappings ) # Summarize background-corrected and normalized data @@ -516,7 +552,7 @@ legend("topright", legend = colnames(norm_data@assayData$exprs), lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types col = oligo::darkColors(n = ncol(norm_data)), # Ensure legend color is in sync with plot ncol = number_of_sets, # Set number of columns by number of sets - cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35% + cex = max(0.35, 1 + scale_factor - (number_of_sets*scale_factor)) # Reduce for each column beyond 1 with 
minimum of 35% ) # Reset par @@ -637,11 +673,11 @@ print(paste0("Number of Probesets: ", dim(unique(oligo::getProbeInfo(probeset_le --- -## 8. Perform Probeset Differential Expression (DE) +## 8. Probeset Annotations
-### 8a. Add Probeset Annotations +### 8a. Get Probeset Annotations ```R shortenedOrganismName <- function(long_name) { @@ -707,25 +743,35 @@ get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_port return(mapping) } +# Convert list of multi-mapped genes to string +listToUniquePipedString <- function(str_list) { + #! convert lists into strings denoting unique elements separated by '|' characters + #! e.g. c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3" + return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|")) +} + organism <- shortenedOrganismName(unique(df_rs$organism)) +annot_key <- ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL') if (organism %in% c("athaliana")) { - ensembl_genomes_version = "54" + ENSEMBL_VERSION = ensembl_version ensembl_genomes_portal = "plants" - print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ensembl_genomes_version}")) + print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. 
Ensembl genomes portal: {ensembl_genomes_portal}, version: {ENSEMBL_VERSION}")) expected_attribute_name <- getBioMartAttribute(df_rs) df_mapping <- retry_with_delay( get_ensembl_genomes_mappings_from_ftp, organism = organism, ensembl_genomes_portal = ensembl_genomes_portal, - ensembl_genomes_version = ensembl_genomes_version, + ensembl_genomes_version = ENSEMBL_VERSION, biomart_attribute = expected_attribute_name ) # TAIR from the mapping tables tend to be in the format 'AT1G01010.1' but the raw data has 'AT1G01010' # So here we remove the '.NNN' from the mapping table where .NNN is any number df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "") + + use_custom_annot <- FALSE } else { # Use biomart from main Ensembl website which archives keep each release on the live service # locate dataset @@ -734,92 +780,189 @@ if (organism %in% c("athaliana")) { # Specify Ensembl version used in current GeneLab reference annotations - ENSEMBL_VERSION <- '107' + ENSEMBL_VERSION <- ensembl_version print(glue::glue("Using Ensembl biomart to get specific version of mapping table. 
Ensembl version: {ENSEMBL_VERSION}")) - ensembl <- biomaRt::useEnsembl(biomart = "genes", - dataset = expected_dataset_name, - version = ENSEMBL_VERSION) - print(ensembl) + # Check if organism is supported in biomart + ensembl <- biomaRt::useEnsembl(biomart = "genes") + ensembl_datasets <- biomaRt::listDatasets(ensembl) + use_custom_annot <- !expected_dataset_name %in% ensembl_datasets$dataset + + if (use_custom_annot) { + unloadNamespace("biomaRt") + } else { + + ensembl <- biomaRt::useEnsembl(biomart = "genes", + dataset = expected_dataset_name, + version = ENSEMBL_VERSION) + print(ensembl) + + expected_attribute_name <- getBioMartAttribute(df_rs) + print(paste0("Expected attribute name: '", expected_attribute_name, "'")) + + # Some probe_ids for affy_hta_2_0 may end in .hg.1 instead of .hg (how it is in biomaRt), leading to 0 results returned + if (expected_attribute_name == 'affy_hta_2_0') { + rownames(probeset_level_data) <- stringr::str_replace(rownames(probeset_level_data), '\\.hg\\.1$', '.hg') + } + + probe_ids <- rownames(probeset_level_data) + + # Create probe map + # Run Biomart Queries in chunks to prevent request timeouts + # Note: If timeout is occurring (possibly due to larger load on biomart), reduce chunk size + CHUNK_SIZE= 1500 + probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE)) + df_mapping <- data.frame() + for (i in seq_along(probe_id_chunks)) { + probe_id_chunk <- probe_id_chunks[[i]] + print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. 
Total probes IDS in query ({length(probe_id_chunk)})")) + chunk_results <- biomaRt::getBM( + attributes = c( + expected_attribute_name, + "ensembl_gene_id" + ), + filters = expected_attribute_name, + values = probe_id_chunk, + mart = ensembl) + + if (nrow(chunk_results) > 0) { + df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results) + } + + Sys.sleep(10) # Slight break between requests to prevent back-to-back requests + } - expected_attribute_name <- getBioMartAttribute(df_rs) - print(paste0("Expected attribute name: '", expected_attribute_name, "'")) - - probe_ids <- rownames(probeset_level_data) - - # Create probe map - # Run Biomart Queries in chunks to prevent request timeouts - # Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size - CHUNK_SIZE= 1500 - probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE)) - df_mapping <- data.frame() - for (i in seq_along(probe_id_chunks)) { - probe_id_chunk <- probe_id_chunks[[i]] - print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})")) - chunk_results <- biomaRt::getBM( - attributes = c( - expected_attribute_name, - "ensembl_gene_id" - ), - filters = expected_attribute_name, - values = probe_id_chunk, - mart = ensembl) - - df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results) - Sys.sleep(10) # Slight break between requests to prevent back-to-back requests } } # At this point, we have df_mapping from either the biomart live service or the ensembl genomes ftp archive depending on the organism +# If no df_mapping obtained (e.g., organism not supported in biomart), use custom annotations; otherwise, merge in-house annotations to df_mapping -listToUniquePipedString <- function(str_list) { - #! convert lists into strings denoting unique elements separated by '|' characters - #! e.g. 
c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3" - return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|")) -} +if (use_custom_annot) { + expected_attribute_name <- 'ProbesetID' + + annot_type <- 'NO_CUSTOM_ANNOT' + if (!is.null(local_annotation_dir) && file.exists(file.path(local_annotation_dir, 'config.csv'))) { + config_df <- read.csv(file.path(local_annotation_dir, 'config.csv'), row.names=1) + if (df_rs$`biomart_attribute`[[1]] %in% row.names(config_df)) { # df_rs has one row per sample, so this column is a vector; test its first value to keep the condition length one + annot_config <- config_df[df_rs$`biomart_attribute`[[1]], ] + annot_type <- annot_config$annot_type[[1]] + } else { + warning(paste0("No entry for '", df_rs$`biomart_attribute`[[1]], "' in provided config.csv")) + } + } else { + warning(paste0("No 'config.csv' file found in path (--referenceStorePath): ", local_annotation_dir)) + } + + if (annot_type == '3prime-IVT') { + unique_probe_ids <- read.csv( + file.path(local_annotation_dir, annot_config$annot_filename[[1]]), + skip = 13, header = TRUE, na.strings = c('NA', '---') + )[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq.Transcript.ID', 'RefSeq.Protein.ID', 'Gene.Ontology.Biological.Process', 'Gene.Ontology.Cellular.Component', 'Gene.Ontology.Molecular.Function')] + + # Clean columns: collapse ' /// '-separated values into unique '|'-separated strings, then turn the literal "NA" produced by missing entries back into NA (pattern is anchored so values that merely contain "NA" are kept) + unique_probe_ids$Gene.Symbol <- purrr::map_chr(stringr::str_split(unique_probe_ids$Gene.Symbol, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^NA$', NA_character_) + unique_probe_ids$Gene.Title <- purrr::map_chr(stringr::str_split(unique_probe_ids$Gene.Title, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^NA$', NA_character_) + unique_probe_ids$Entrez.Gene <- purrr::map_chr(stringr::str_split(unique_probe_ids$Entrez.Gene, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^NA$', NA_character_) + unique_probe_ids$Ensembl <- purrr::map_chr(stringr::str_split(unique_probe_ids$Ensembl, 
stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^NA$', NA_character_) + + unique_probe_ids$RefSeq <- paste(unique_probe_ids$RefSeq.Transcript.ID, unique_probe_ids$RefSeq.Protein.ID) + unique_probe_ids$RefSeq <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$RefSeq, '[A-Z]+_[\\d.]+'), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^$', NA_character_) + + unique_probe_ids$GO <- paste(unique_probe_ids$Gene.Ontology.Biological.Process, unique_probe_ids$Gene.Ontology.Cellular.Component, unique_probe_ids$Gene.Ontology.Molecular.Function) + unique_probe_ids$GO <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$GO, '\\d{7}'), ~paste0('GO:', unique(.), collapse = "|")) %>% stringr::str_replace('^GO:$', NA_character_) + + unique_probe_ids <- unique_probe_ids[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq', 'GO')] + names(unique_probe_ids) <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS') + + unique_probe_ids$STRING_id <- NA_character_ + + gene_col <- 'ENSEMBL' + if (sum(!is.na(unique_probe_ids$ENTREZID)) > sum(!is.na(unique_probe_ids$ENSEMBL))) { + gene_col <- 'ENTREZID' + } + if (sum(!is.na(unique_probe_ids$SYMBOL)) > sum(!is.na(unique_probe_ids$ENTREZID))) { + gene_col <- 'SYMBOL' + } + + unique_probe_ids <- unique_probe_ids %>% + dplyr::mutate( + count_gene_mappings = 1 + stringr::str_count(get(gene_col), stringr::fixed("|")), + gene_mapping_source = gene_col + ) + } else if (annot_type == 'custom') { + unique_probe_ids <- read.csv( + file.path(local_annotation_dir, annot_config$annot_filename[[1]]), + header = TRUE, na.strings = c('NA', '') + ) + } else { + annot_cols <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS', 'STRING_id', 'count_gene_mappings', 'gene_mapping_source') + unique_probe_ids <- setNames(data.frame(matrix(NA_character_, nrow = 1, ncol = length(annot_cols))), annot_cols) + } 
+} else { + annot <- read.table( + as.character(annotation_file_path), + sep = "\t", + header = TRUE, + quote = "", + comment.char = "" + ) -unique_probe_ids <- df_mapping %>% - dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type - dplyr::group_by(!!sym(expected_attribute_name)) %>% - dplyr::summarise( - ENSEMBL = listToUniquePipedString(ensembl_gene_id) + unique_probe_ids <- df_mapping %>% + dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type + dplyr::group_by(!!sym(expected_attribute_name)) %>% + dplyr::summarise( + ENSEMBL = listToUniquePipedString(ensembl_gene_id) + ) %>% + # Count number of ensembl IDS mapped + dplyr::mutate( + count_gene_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")), + gene_mapping_source = annot_key ) %>% - # Count number of ensembl IDS mapped - dplyr::mutate( - count_ENSEMBL_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")) - ) + dplyr::left_join(annot, by = c("ENSEMBL" = annot_key)) +} probeset_expression_matrix <- oligo::exprs(probeset_level_data) -probeset_expression_matrix.biomart_mapped <- probeset_expression_matrix %>% +probeset_expression_matrix.gene_mapped <- probeset_expression_matrix %>% as.data.frame() %>% tibble::rownames_to_column(var = "ProbesetID") %>% # Ensure rownames (probeset IDs) can be used as join key dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% - dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) + dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% + dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) ``` -**Input Data:** +**Parameter Definitions:** - `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- 
`df_rs$'biomart_attribute'` (array design biomart identifier specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- ENSEMBL_VERSION (reference organism Ensembl version indicated in the `ensemblVersion` column of the [GL-DPPD-7110_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv) GeneLab Annotations file) +- `df_rs$biomart_attribute` (array design biomart identifier specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) +- `annotation_file_path` (reference organism annotation file URL indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) +- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) +- `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. Mus musculus uses 'ENSEMBL') +- `local_annotation_dir` (Path to local annotation directory if using custom annotations, defined in [Step 2a](#2a-load-metadata-and-raw-data)) + + > Note: See [here](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md) for details on what to include in this directory. 
+ +**Input Data:** + - `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization)) **Output Data:** -- `probeset_expression_matrix.biomart_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html)) +- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations) +- `unique_probe_ids` (R object containing probeset ID to gene annotation mappings)
-### 8b. Summarize Biomart Mapping +### 8b. Summarize Gene Mapping ```R # Pie Chart with Percentages slices <- c( - 'Unique Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings == 1) %>% dplyr::distinct(ProbesetID)), - 'Multi Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings > 1) %>% dplyr::distinct(ProbesetID)), - 'No Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings == 0) %>% dplyr::distinct(ProbesetID)) + 'Unique Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings == 1) %>% dplyr::distinct(ProbesetID)), + 'Multi Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings > 1) %>% dplyr::distinct(ProbesetID)), + 'No Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings == 0) %>% dplyr::distinct(ProbesetID)) ) pct <- round(slices/sum(slices)*100) chart_names <- names(slices) @@ -827,24 +970,130 @@ chart_names <- glue::glue("{names(slices)} ({slices})") # add count to labels chart_names <- paste(chart_names, pct) # add percents to labels chart_names <- paste(chart_names,"%",sep="") # ad % to labels pie(slices,labels = chart_names, col=rainbow(length(slices)), - main=glue::glue("Biomart Mapping to Ensembl Primary Keytype\n {nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::distinct(ProbesetID))} Total Unique Probesets") + main=glue::glue("Mapping to Primary Keytype\n {nrow(probeset_expression_matrix.gene_mapped %>% dplyr::distinct(ProbesetID))} Total Unique Probesets") ) -print(glue::glue("Biomart Unique Mapping Count: {slices[['Unique Mapping']]}")) +print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) ``` **Input Data:** -- `probeset_expression_matrix.biomart_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined 
with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html), output from [Step 8a](#8a-add-probeset-annotations) above) +- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) **Output Data:** -- A pie chart denoting the biomart mapping rates for each unique probeset ID -- A printout denoting the count of unique mappings for biomart mapping +- A pie chart denoting the gene mapping rates for each unique probeset ID +- A printout denoting the count of unique mappings for gene mapping
-### 8c. Generate Design Matrix +### 8c. Save Annotated Tables + +```R +## Reorder columns before saving to file +ANNOTATIONS_COLUMN_ORDER = c( + annot_key, + "SYMBOL", + "GENENAME", + "REFSEQ", + "ENTREZID", + "STRING_id", + "GOSLIM_IDS" +) + +SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name` + +probeset_expression_matrix.gene_mapped <- probeset_expression_matrix.gene_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) + +## Output column subset file with just normalized probeset level expression values +write.csv( + probeset_expression_matrix.gene_mapped[c( + ANNOTATIONS_COLUMN_ORDER, + "ProbesetID", + "count_gene_mappings", + "gene_mapping_source", + SAMPLE_COLUMN_ORDER) + ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE) + +## Determine column order for probe level tables + +PROBE_INFO_COLUMN_ORDER = c( + "ProbesetID", + "ProbeID", + "count_gene_mappings", + "gene_mapping_source" +) + +FINAL_COLUMN_ORDER <- c( + ANNOTATIONS_COLUMN_ORDER, + PROBE_INFO_COLUMN_ORDER, + SAMPLE_COLUMN_ORDER +) + +## Generate raw intensity matrix that includes annotations + +background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key + dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing + dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid + dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID + dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID + dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings + dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% # Convert NA 
mapping to 0 + dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) %>% + dplyr::rename( !!annot_key := ENSEMBL ) + +## Perform reordering +background_corrected_data_annotated <- background_corrected_data_annotated %>% + dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) + +write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE) + +## Generate normalized expression matrix that includes annotations +norm_data_matrix_annotated <- oligo::exprs(norm_data) %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key + dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing + dplyr::right_join(oligo::getProbeInfo(norm_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid + dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID + dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID + dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% + dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% # Convert NA mapping to 0 + dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) %>% + dplyr::rename( !!annot_key := ENSEMBL ) + +norm_data_matrix_annotated <- norm_data_matrix_annotated %>% + dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) + +write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE) +``` + +**Parameter Definitions:** + +- `df_rs[['Sample Name']]` (sample names specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) +- `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. 
Mus musculus uses 'ENSEMBL', defined in [Step 8a](#8a-get-probeset-annotations)) + +**Input Data:** + +- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) +- `background_corrected_data` (R object containing background-corrected microarray data created in [Step 4](#4-background-correction)) +- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization)) +- `unique_probe_ids` (R object containing probeset ID to gene annotation mappings, output from [Step 8a](#8a-get-probeset-annotations)) + +**Output Data:** + +- **normalized_expression_probeset_GLmicroarray.csv** (table containing the background corrected, normalized probeset expression values for each sample including gene annotations. The ProbesetID is the unique index column.) +- **raw_intensities_probe_GLmicroarray.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.) +- **normalized_intensities_probe_GLmicroarray.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.) + +## 9. Perform Probeset Differential Expression (DE) + +> Note: Run differential expression analysis only if there is at least 1 replicate per factor group. + +
+ +### 9a. Generate Design Matrix ```R # Pull all factors for each sample in the study from the runsheet created in Step 1 @@ -911,7 +1160,7 @@ write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv"
-### 8d. Perform Individual Probeset Level DE +### 9b. Perform Individual Probeset Level DE ```R lmFitPairwise <- function(norm_data, design) { @@ -941,12 +1190,16 @@ limma::write.fit(res, adjust = 'BH', row.names = FALSE, quote = TRUE, sep = ",") + +### Generate and export PCA table for GeneLab visualization plots +PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed +write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")) ``` **Input Data:** - `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization)) -- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, created in [Step 8c](#8c-generate-design-matrix) above) +- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, created in [Step 9a](#9a-generate-design-matrix) above) - `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization)) **Output Data:** @@ -956,10 +1209,11 @@ limma::write.fit(res, adjust = 'BH', - T statistic for all pairwise comparison tests - P value for all pairwise comparison tests - Adjusted P value for all pairwise comparison tests) +- visualization_PCA_table_GLmicroarray.csv (file used to generate GeneLab PCA plots)
-### 8e. Add Additional Columns and Format DE Table +### 9c. Save DE Table ```R ## Reformat Table for consistency across DE analyses tables within GeneLab ## @@ -967,9 +1221,9 @@ limma::write.fit(res, adjust = 'BH', # Read in DE table df_interim <- read.csv("INTERIM.csv") -# Bind columns from biomart mapped expression table +# Bind columns from gene mapped expression table df_interim <- df_interim %>% - dplyr::bind_cols(probeset_expression_matrix.biomart_mapped) + dplyr::bind_cols(probeset_expression_matrix.gene_mapped) # Reformat column names reformat_names <- function(colname, group_name_mapping) { @@ -981,7 +1235,7 @@ reformat_names <- function(colname, group_name_mapping) { stringr::str_replace(pattern = ".condition", replacement = "v") # remap to group names before make.names was applied - unique_group_name_mapping <- unique(group_name_mapping) + unique_group_name_mapping <- unique(group_name_mapping) %>% arrange(-nchar(safe_name)) for ( i in seq(nrow(unique_group_name_mapping)) ) { safe_name <- unique_group_name_mapping[i,]$safe_name original_name <- unique_group_name_mapping[i,]$original_name @@ -991,7 +1245,7 @@ reformat_names <- function(colname, group_name_mapping) { return(new_colname) } -df_interim <- df_interim %>% dplyr::rename_with( reformat_names, group_name_mapping = design_data$mapping ) +df_interim <- df_interim %>% dplyr::rename_with(reformat_names, .cols = matches('\\.condition'), group_name_mapping = design_data$mapping) ## Add Group Wise Statistics ## @@ -1024,11 +1278,10 @@ for ( i in seq_along(unique_groups) ) { as.data.frame() } -all_samples <- design_data$group %>% dplyr::pull(sample) df_interim <- df_interim %>% dplyr::mutate( - "All.mean" := rowMeans(dplyr::select(., all_of(all_samples))), - "All.stdev" := matrixStats::rowSds(as.matrix(dplyr::select(., all_of(all_samples)))), + "All.mean" := rowMeans(dplyr::select(., all_of(SAMPLE_COLUMN_ORDER))), + "All.stdev" := matrixStats::rowSds(as.matrix(dplyr::select(., 
all_of(SAMPLE_COLUMN_ORDER)))), ) %>% dplyr::ungroup() %>% as.data.frame() @@ -1042,56 +1295,12 @@ colnames_to_remove = c( df_interim <- df_interim %>% dplyr::select(-any_of(colnames_to_remove)) -## Concatenate annotations for genes (for uniquely mapped probes) ## -### Read in annotation table for the appropriate organism ### -annot <- read.table( - annotation_file_path, - sep = "\t", - header = TRUE, - quote = "", - comment.char = "", - ) - -# Join annotation table and uniquely mapped data - -# Determine appropriate keytype as found in annotation tables -map_primary_keytypes <- c( - 'Caenorhabditis elegans' = 'ENSEMBL', - 'Danio rerio' = 'ENSEMBL', - 'Drosophila melanogaster' = 'ENSEMBL', - 'Rattus norvegicus' = 'ENSEMBL', - 'Saccharomyces cerevisiae' = 'ENSEMBL', - 'Homo sapiens' = 'ENSEMBL', - 'Mus musculus' = 'ENSEMBL', - 'Arabidopsis thaliana' = 'TAIR' -) - -df_interim <- merge( - annot, - df_interim, - by.x = map_primary_keytypes[[unique(df_rs$organism)]], - by.y = "ENSEMBL", - # ensure all original dge rows are kept. 
- # If unmatched in the annotation database, then fill missing with NAN - all.y = TRUE - ) - -## Reorder columns before saving to file -ANNOTATIONS_COLUMN_ORDER = c( - map_primary_keytypes[[unique(df_rs$organism)]], - "SYMBOL", - "GENENAME", - "REFSEQ", - "ENTREZID", - "STRING_id", - "GOSLIM_IDS" -) - PROBE_INFO_COLUMN_ORDER = c( "ProbesetID", - "count_ENSEMBL_mappings" + "count_gene_mappings", + "gene_mapping_source" ) -SAMPLE_COLUMN_ORDER <- all_samples + generate_prefixed_column_order <- function(subjects, prefixes) { #' Return a vector of columns based on subject and given prefixes #' Used for both contrasts and groups column name generation @@ -1159,106 +1368,15 @@ df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) # Save to file write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.csv"), row.names = FALSE) - -## Output column subset file with just normalized probeset level expression values -write.csv( - df_interim[c( - ANNOTATIONS_COLUMN_ORDER, - "ProbesetID", - "count_ENSEMBL_mappings", - all_samples) - ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE) - -### Generate and export PCA table for GeneLab visualization plots -PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed -write.csv(PCA_raw$x, - file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv") - ) - -## Generate raw intensity matrix that includes annotations - -background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key - dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing - dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based 
on fid - dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID - dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID - dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings - dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table - dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0 - dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) - -## Determine column order for probe level tables - -PROBE_INFO_COLUMN_ORDER = c( - "ProbesetID", - "ProbeID", - "count_ENSEMBL_mappings" -) - -FINAL_COLUMN_ORDER <- c( - ANNOTATIONS_COLUMN_ORDER, - PROBE_INFO_COLUMN_ORDER, - SAMPLE_COLUMN_ORDER - ) - -## Generate raw intensity matrix that includes annotations - -background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key - dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing - dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid - dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID - dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID - dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings - dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table - 
dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0 - dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) - -## Perform reordering -background_corrected_data_annotated <- background_corrected_data_annotated %>% - dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) - -write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE) - -## Generate normalized expression matrix that includes annotations -norm_data_matrix_annotated <- oligo::exprs(norm_data) %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key - dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing - dplyr::right_join(oligo::getProbeInfo(norm_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid - dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID - dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID - dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% - dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table - dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0 - dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) - - - -norm_data_matrix_annotated <- norm_data_matrix_annotated %>% - dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) - -write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE) - ``` **Input Data:** -- INTERIM.csv (Statistical values from individual 
probeset level DE analysis, output from [Step 8d](#8d-perform-individual-probeset-level-de) above) -- `annotation_file_path` (Annotation file url from 'genelab_annots_link' column of [GL-DPPD-7110_annotations.csv](https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv) corresponding to the subject organism) -- `primary_keytype` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. mus musculus uses 'ENSEMBL') -- `background_corrected_data` (R object containing background-corrected microarray data) -- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization)) +- INTERIM.csv (Statistical values from individual probeset level DE analysis, output from [Step 9b](#9b-perform-individual-probeset-level-de) above) +- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) **Output Data:** - **differential_expression_GLmicroarray.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probeset DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.) -- **normalized_expression_probeset_GLmicroarray.csv** (table containing the background corrected, normalized probeset expression values for each sample. The ProbesetID is the unique index column.) 
-- visualization_PCA_table_GLmicroarray.csv (file used to generate GeneLab PCA plots) -- **raw_intensities_probe_GLmicroarray.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.) -- **normalized_intensities_probe_GLmicroarray.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.) > All steps of the Microarray pipeline are performed using R markdown and the completed R markdown is rendered (via Quarto) as an html file (**NF_MAAffymetrix_v\*_GLmicroarray.html**) and published in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/) for the respective dataset. \ No newline at end of file diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md new file mode 100644 index 00000000..33e59b9f --- /dev/null +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md @@ -0,0 +1,20 @@ +# Custom Annotations Specification + +## Description + +* If using custom gene annotations when processing Affymetrix datasets through GeneLab's Affymetrix processing pipeline, a csv file named `config.csv` must be provided as specified below. +* Both the `config.csv` and custom annotations files must be placed in the directory specified by `local_annotation_dir` in the pipeline. + + +## Example + +- [config.csv](config.csv) + + +## Required columns + +| Column Name | Type | Description | Example | +|:------------|:-----|:------------|:--------| +| array_design | string | A bioMart attribute identifier denoting the microarray probe/probeset attribute used for annotation mapping. 
| AFFY E coli Genome 2 0 | +| annot_type | string | Used to determine how the custom annotations are parsed before merging to the data. Currently, only the below are supported:
  • `3prime-IVT`: Annotations file is expected to be in the format of the annotation files for 3' IVT expression analysis arrays provided by [Thermo Fisher](https://www.thermofisher.com/us/en/home/life-science/microarray-analysis/microarray-data-analysis/genechip-array-annotation-files.html)
  • `custom`: Annotations file is merged as is and is expected to have the following columns: `ProbesetID`, `ENTREZID`, `SYMBOL`, `GENENAME`, `ENSEMBL`, `REFSEQ`, `GOSLIM_IDS`, `STRING_id`, `count_gene_mappings`, `gene_mapping_source`
| 3prime-IVT | +| annot_filename | string | Name of the custom annotations file. | E_coli_2.na36.annot.csv | diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/config.csv b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/config.csv new file mode 100644 index 00000000..5c8dff73 --- /dev/null +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/config.csv @@ -0,0 +1,3 @@ +array_design,annot_type,annot_filename +AFFY E coli Genome 2 0,3prime-IVT,E_coli_2.na36.annot.csv +AFFY GeneChip P. aeruginosa Genome,3prime-IVT,Pae_G1a.na36.annot.csv diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/OSD-213_microarray_v0_runsheet.csv b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/OSD-213_microarray_v0_runsheet.csv new file mode 100644 index 00000000..f309c854 --- /dev/null +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/OSD-213_microarray_v0_runsheet.csv @@ -0,0 +1,7 @@ +Sample Name,Study Assay Measurement,Study Assay Technology Type,Study Assay Technology Platform,organism,biomart_attribute,Source Name,Label,Array Data File Name,Array Data File Path,Comment[Array Data File Name],Factor Value[Spaceflight],Factor Value[Altered Gravity],Original Sample Name +Atha_Col-0_clsCC_FLT_1G_Rep1,transcription profiling,DNA microarray,Affymetrix,Arabidopsis thaliana,AFFY ATH1 121501,Culture cells_1,biotin,GLDS-213_microarray_FC_front.CEL.gz,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-213/download?source=datamanager&file=GLDS-213_microarray_FC_front.CEL.gz,GLDS-213_microarray_FC_front.CEL.gz,Space Flight,1G by centrifugation,Atha_Col-0_clsCC_FLT_1G_Rep1 +Atha_Col-0_clsCC_FLT_1G_Rep2,transcription profiling,DNA microarray,Affymetrix,Arabidopsis thaliana,AFFY ATH1 121501,Culture 
cells_2,biotin,GLDS-213_microarray_FC_rear.CEL.gz,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-213/download?source=datamanager&file=GLDS-213_microarray_FC_rear.CEL.gz,GLDS-213_microarray_FC_rear.CEL.gz,Space Flight,1G by centrifugation,Atha_Col-0_clsCC_FLT_1G_Rep2 +Atha_Col-0_clsCC_FLT_uG_Rep1,transcription profiling,DNA microarray,Affymetrix,Arabidopsis thaliana,AFFY ATH1 121501,Culture cells_3,biotin,GLDS-213_microarray_FS_front.CEL.gz,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-213/download?source=datamanager&file=GLDS-213_microarray_FS_front.CEL.gz,GLDS-213_microarray_FS_front.CEL.gz,Space Flight,uG,Atha_Col-0_clsCC_FLT_uG_Rep1 +Atha_Col-0_clsCC_FLT_uG_Rep2,transcription profiling,DNA microarray,Affymetrix,Arabidopsis thaliana,AFFY ATH1 121501,Culture cells_4,biotin,GLDS-213_microarray_FS_rear.CEL.gz,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-213/download?source=datamanager&file=GLDS-213_microarray_FS_rear.CEL.gz,GLDS-213_microarray_FS_rear.CEL.gz,Space Flight,uG,Atha_Col-0_clsCC_FLT_uG_Rep2 +Atha_Col-0_clsCC_GC_1G_Rep1,transcription profiling,DNA microarray,Affymetrix,Arabidopsis thaliana,AFFY ATH1 121501,Culture cells_5,biotin,GLDS-213_microarray_GS_front.CEL.gz,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-213/download?source=datamanager&file=GLDS-213_microarray_GS_front.CEL.gz,GLDS-213_microarray_GS_front.CEL.gz,Ground Control,1G on Earth,Atha_Col-0_clsCC_GC_1G_Rep1 +Atha_Col-0_clsCC_GC_1G_Rep2,transcription profiling,DNA microarray,Affymetrix,Arabidopsis thaliana,AFFY ATH1 121501,Culture cells_6,biotin,GLDS-213_microarray_GS_rear.CEL.gz,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-213/download?source=datamanager&file=GLDS-213_microarray_GS_rear.CEL.gz,GLDS-213_microarray_GS_rear.CEL.gz,Ground Control,1G on Earth,Atha_Col-0_clsCC_GC_1G_Rep2 diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/OSD-3_microarray_v0_runsheet.csv 
b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/OSD-3_microarray_v0_runsheet.csv new file mode 100644 index 00000000..fdc974de --- /dev/null +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/OSD-3_microarray_v0_runsheet.csv @@ -0,0 +1,19 @@ +Sample Name,Study Assay Measurement,Study Assay Technology Type,Study Assay Technology Platform,organism,biomart_attribute,Source Name,Label,Array Data File Name,Array Data File Path,Comment[Array Data File Name],Factor Value[Developmental Stage],Factor Value[Spaceflight],Original Sample Name +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep1,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588931 1,biotin,GLDS-3_microarray_GSM588948.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588948.CEL,GLDS-3_microarray_GSM588948.CEL,third instar larva stage,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep1 +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep2,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588932 1,biotin,GLDS-3_microarray_GSM588947.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588947.CEL,GLDS-3_microarray_GSM588947.CEL,third instar larva stage,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep2 +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep3,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588933 1,biotin,GLDS-3_microarray_GSM588946.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588946.CEL,GLDS-3_microarray_GSM588946.CEL,third instar larva stage,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep3 +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep4,transcription 
profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588934 1,biotin,GLDS-3_microarray_GSM588945.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588945.CEL,GLDS-3_microarray_GSM588945.CEL,third instar larva stage,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep4 +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep5,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588935 1,biotin,GLDS-3_microarray_GSM588944.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588944.CEL,GLDS-3_microarray_GSM588944.CEL,third instar larva stage,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep5 +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep6,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588936 1,biotin,GLDS-3_microarray_GSM588943.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588943.CEL,GLDS-3_microarray_GSM588943.CEL,third instar larva stage,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep6 +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_Adult_Rep1,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588937 1,biotin,GLDS-3_microarray_GSM588942.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588942.CEL,GLDS-3_microarray_GSM588942.CEL,adult,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_Adult_Rep1 +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_Adult_Rep2,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588938 
1,biotin,GLDS-3_microarray_GSM588941.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588941.CEL,GLDS-3_microarray_GSM588941.CEL,adult,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_Adult_Rep2 +Dmel_Hml-GAL4-UAS-GFP_wo_FLT_Adult_Rep3,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588939 1,biotin,GLDS-3_microarray_GSM588940.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588940.CEL,GLDS-3_microarray_GSM588940.CEL,adult,Space Flight,Dmel_Hml-GAL4-UAS-GFP_wo_FLT_Adult_Rep3 +Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep1,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588940 1,biotin,GLDS-3_microarray_GSM588939.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588939.CEL,GLDS-3_microarray_GSM588939.CEL,third instar larva stage,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep1 +Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep2,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588941 1,biotin,GLDS-3_microarray_GSM588938.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588938.CEL,GLDS-3_microarray_GSM588938.CEL,third instar larva stage,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep2 +Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep3,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588942 1,biotin,GLDS-3_microarray_GSM588937.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588937.CEL,GLDS-3_microarray_GSM588937.CEL,third instar larva stage,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep3 
+Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep4,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588943 1,biotin,GLDS-3_microarray_GSM588936.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588936.CEL,GLDS-3_microarray_GSM588936.CEL,third instar larva stage,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep4 +Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep5,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588944 1,biotin,GLDS-3_microarray_GSM588935.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588935.CEL,GLDS-3_microarray_GSM588935.CEL,third instar larva stage,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep5 +Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep6,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588945 1,biotin,GLDS-3_microarray_GSM588934.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588934.CEL,GLDS-3_microarray_GSM588934.CEL,third instar larva stage,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_3rd-Instar-Larva_Rep6 +Dmel_Hml-GAL4-UAS-GFP_wo_GC_Adult_Rep1,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588946 1,biotin,GLDS-3_microarray_GSM588933.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588933.CEL,GLDS-3_microarray_GSM588933.CEL,adult,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_Adult_Rep1 +Dmel_Hml-GAL4-UAS-GFP_wo_GC_Adult_Rep2,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588947 
1,biotin,GLDS-3_microarray_GSM588932.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588932.CEL,GLDS-3_microarray_GSM588932.CEL,adult,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_Adult_Rep2 +Dmel_Hml-GAL4-UAS-GFP_wo_GC_Adult_Rep3,transcription profiling,DNA microarray,Affymetrix,Drosophila melanogaster,AFFY Drosophila 2,GSM588948 1,biotin,GLDS-3_microarray_GSM588931.CEL,https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-3/download?source=datamanager&file=GLDS-3_microarray_GSM588931.CEL,GLDS-3_microarray_GSM588931.CEL,adult,Ground Control,Dmel_Hml-GAL4-UAS-GFP_wo_GC_Adult_Rep3 diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/README.md b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/README.md new file mode 100644 index 00000000..c4927a42 --- /dev/null +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/README.md @@ -0,0 +1,23 @@ +# Runsheet Specification + +## Description + +* The Runsheet is a csv file that contains the metadata required for processing Affymetrix datasets through GeneLab's Affymetrix processing pipeline. + + +## Examples + +1. [Runsheet for GLDS-3](OSD-3_microarray_v0_runsheet.csv) +2. [Runsheet for GLDS-213](OSD-213_microarray_v0_runsheet.csv) + + +## Required columns + +| Column Name | Type | Description | Example | +|:------------|:-----|:------------|:--------| +| Sample Name | string | Sample Name, added as a prefix to sample-specific processed data output files. Should not include spaces or weird characters. | Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep1 | +| biomart_attribute | string | A bioMart attribute identifier denoting the microarray probe/probeset attribute used for annotation mapping. | AFFY Drosophila 2 | +| organism | string | Species name used to map to the appropriate gene annotations file. 
| Drosophila melanogaster | +| Array Data File Path | string (url or local path) | Location of the raw data file for the sample. | /my/data/sample_1.CEL | +| Factor Value[] | string | A set of one or more columns specifying the experimental group the sample belongs to. In the simplest form, a column named 'Factor Value[group]' is sufficient. | Space Flight | +| Original Sample Name | string | Used to map the sample name that will be used for processing to the original sample name. This is often identical except in cases where the original name includes spaces or weird characters. | Dmel_Hml-GAL4-UAS-GFP_wo_FLT_3rd-Instar-Larva_Rep1 | From 6249c40b3e795ee5abdfdddfcb7d73fd98a50fe4 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Thu, 9 Jan 2025 16:58:13 -0800 Subject: [PATCH 04/25] NF_MAAffymetrix: move updated doc to new pipeline version --- .../GL-DPPD-7114-A.md | 1382 +++++++++++++++++ .../GL-DPPD-7114.md | 648 ++++---- 2 files changed, 1647 insertions(+), 383 deletions(-) create mode 100644 Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md new file mode 100644 index 00000000..c2cdc0bb --- /dev/null +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -0,0 +1,1382 @@ +# GeneLab bioinformatics processing pipeline for Affymetrix microarray data + +> **This page holds an overview and instructions for how GeneLab processes Affymetrix microarray datasets. 
Exact processing commands and GL-DPPD-7114 version used for specific GeneLab datasets (GLDS) are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo).** +> +> \* The pipeline detailed below currently supports gene annotations for Arabidopsis thaliana via Ensembl FTP, all animals available in Biomart, and custom annotations (see [Step 8a](#8a-get-probeset-annotations)). + +--- + +**Date:** March 31, 2023 +**Revision:** - +**Document Number:** GL-DPPD-7114 + +**Submitted by:** +Jonathan Oribello (GeneLab Data Processing Team) + +**Approved by:** +Sylvain Costes (GeneLab Project Manager) +Samrawit Gebre (GeneLab Deputy Project Manager) +Amanda Saravia-Butler (GeneLab Data Processing Lead) +Lauren Sanders (acting GeneLab Project Scientist) + +--- + +# Table of contents + +- [Software used](#software-used) +- [General processing overview with example commands](#general-processing-overview-with-example-commands) + - [1. Create Sample RunSheet](#1-create-sample-runsheet) + - [2. Load Data](#2-load-data) + - [2a. Load Metadata and Raw Data](#2a-load-metadata-and-raw-data) + - [2b. Load Annotation Metadata](#2b-load-annotation-metadata) + - [3. Raw Data Quality Assessment](#3-raw-data-quality-assessment) + - [3a. Density Plot](#3a-density-plot) + - [3b. Pseudo Image Plots](#3b-pseudo-image-plots) + - [3c. MA Plots](#3c-ma-plots) + - [3d. Boxplots](#3d-boxplots) + - [4. Background Correction](#4-background-correction) + - [5. Between Array Normalization](#5-between-array-normalization) + - [6. Normalized Data Quality Assessment](#6-normalized-data-quality-assessment) + - [6a. Density Plot](#6a-density-plot) + - [6b. Pseudo Image Plots](#6b-pseudo-image-plots) + - [6c. MA Plots](#6c-ma-plots) + - [6d. Boxplots](#6d-boxplots) + - [7. Probeset Summarization](#7-probeset-summarization) + - [8. Probeset Annotations](#8-probeset-annotations) + - [8a. Get Probeset Annotations](#8a-get-probeset-annotations) + - [8b. 
Summarize Gene Mapping](#8b-summarize-gene-mapping) + - [8c. Save Annotated Tables](#8c-save-annotated-tables) + - [9. Perform Probeset Differential Expression (DE)](#9-perform-probeset-differential-expression-de) + - [9a. Generate Design Matrix](#9a-generate-design-matrix) + - [9b. Perform Individual Probeset Level DE](#9b-perform-individual-probeset-level-de) + - [9c. Save DE Table](#9c-save-de-table) + +--- + +# Software used + +|Program|Version|Relevant Links| +|:------|:------:|:-------------| +|R|4.1.3|[https://www.r-project.org/](https://www.r-project.org/)| +|DT|0.26|[https://github.com/rstudio/DT](https://github.com/rstudio/DT)| +|dplyr|1.0.10|[https://dplyr.tidyverse.org](https://dplyr.tidyverse.org)| +|tibble|3.1.8|[https://tibble.tidyverse.org](https://tibble.tidyverse.org)| +|stringr|1.5.0|[https://stringr.tidyverse.org](https://stringr.tidyverse.org)| +|R.utils|2.12.2|[https://github.com/HenrikBengtsson/R.utils](https://github.com/HenrikBengtsson/R.utils)| +|oligo|1.58.0|[https://bioconductor.org/packages/3.14/bioc/html/oligo.html](https://bioconductor.org/packages/3.14/bioc/html/oligo.html)| +|limma|3.50.3|[https://bioconductor.org/packages/3.14/bioc/html/limma.html](https://bioconductor.org/packages/3.14/bioc/html/limma.html)| +|glue|1.6.2|[https://glue.tidyverse.org](https://glue.tidyverse.org)| +|biomaRt|2.50.0|[https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html)| +|matrixStats|0.63.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)| +|statmod|1.5.0|[https://github.com/cran/statmod](https://github.com/cran/statmod)| +|dp_tools|1.3.4|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| +|singularity|3.9|[https://sylabs.io](https://sylabs.io)| +|Quarto|1.2.313|[https://quarto.org](https://quarto.org)| + +--- + +# General processing overview with 
example commands + +> Exact processing commands and output files listed in **bold** below are included with each Microarray processed dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/). + +--- + +## 1. Create Sample RunSheet + +> Notes: +> - Rather than running the commands below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/README.md). +> +> - These command line tools are part of the [dp_tools](https://github.com/J-81/dp_tools) program. + +```bash +### Download the *ISA.zip file from the GeneLab Repository ### + +dpt-get-isa-archive \ + --accession OSD-### + +### Parse the metadata from the *ISA.zip file to create a sample runsheet ### + +dpt-isa-to-runsheet --accession OSD-### \ + --config-type microarray \ + --config-version Latest \ + --isa-archive *ISA.zip +``` + +**Parameter Definitions:** + +- `--accession OSD-###` - OSD accession ID (replace ### with the OSD number being processed), used to retrieve the urls for the ISA archive and raw expression files hosted on the GeneLab Repository +- `--config-type` - Instructs the script to extract the metadata required for `microarray` processing from the ISA archive +- `--config-version` - Specifies the `dp-tools` configuration version to use, a value of `Latest` will specify the most recent version +- `--isa-archive` - Specifies the *ISA.zip file for the respective GLDS dataset, downloaded in the `dpt-get-isa-archive` command + + +**Input Data:** + +- No input data required but the OSD accession ID needs to be indicated, which is used to download the respective ISA archive + +**Output Data:** + +- *ISA.zip (compressed ISA directory containing Investigation, Study, and Assay (ISA) metadata files for the respective OSD dataset, used to define sample groups - the *ISA.zip file is located in the [OSD 
repository](https://osdr.nasa.gov/bio/repo/search?q=&data_source=cgene,alsda&data_type=study) under 'Study Files' -> 'metadata') + +- **{OSD-Accession-ID}_microarray_v{version}_runsheet.csv** (table containing metadata required for processing, version denotes the dp_tools schema used to specify the metadata to extract from the ISA archive) + +
+ +--- + +## 2. Load Data + +> Note: Steps 2 - 9 are done in R + +
+ +### 2a. Load Metadata and Raw Data + +```R +### Install R packages if not already installed ### + +install.packages("tidyverse") +install.packages("R.utils") +install.packages("glue") +install.packages("matrixStats") +install.packages("statmod") +if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") +BiocManager::install(version = "3.14") +BiocManager::install("limma") +BiocManager::install("biomaRt") +BiocManager::install("oligo") + + +## Note: Only dplyr is explicitly loaded. Other library functions are called with explicit namespace (e.g. LIBRARYNAME::FUNCTION) +library(dplyr) # Ensure infix operator is available, methods should still reference dplyr namespace otherwise +options(dplyr.summarise.inform = FALSE) # Don't print out '`summarise()` has grouped output by 'group'. You can override using the `.groups` argument.' + +# Define path to runsheet +runsheet <- "/path/to/runsheet/{OSD-Accession-ID}_microarray_v{version}_runsheet.csv" + +# If using custom annotation, define path to directory containing annotation file and config +local_annotation_dir <- NULL # Leave as NULL unless using custom annotations + +## Set up output structure + +# Output Constants +DIR_RAW_DATA <- "00-RawData" +DIR_NORMALIZED_EXPRESSION <- "01-oligo_NormExp" +DIR_DGE <- "02-limma_DGE" + +dir.create(DIR_RAW_DATA) +dir.create(DIR_NORMALIZED_EXPRESSION) +dir.create(DIR_DGE) + +## Save original par settings +## Par may be temporarily changed for plotting purposes and reset once the plotting is done + +original_par <- par() +options(preferRaster=TRUE) # use Raster when possible to avoid antialiasing artifacts in images + +options(timeout=1000) # ensure enough time for data downloads + +# Utility function to improve robustness of function calls +# Used to remedy intermittent internet issues during runtime +retry_with_delay <- function(func, ...) 
{ + max_attempts = 5 + initial_delay = 10 + delay_increase = 30 + attempt <- 1 + current_delay <- initial_delay + while (attempt <= max_attempts) { + result <- tryCatch( + expr = func(...), + error = function(e) e + ) + + if (!inherits(result, "error")) { + return(result) + } else { + if (attempt < max_attempts) { + message(paste("Retry attempt", attempt, "failed for function with name <", deparse(substitute(func)) ,">. Retrying in", current_delay, "second(s)...")) + Sys.sleep(current_delay) + current_delay <- current_delay + delay_increase + } else { + stop(paste("Max retry attempts reached. Last error:", result$message)) + } + } + + attempt <- attempt + 1 + } +} + +df_rs <- read.csv(runsheet, check.names = FALSE) %>% + dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character + +allTrue <- function(i_vector) { + if ( length(i_vector) == 0 ) { + stop(paste("Input vector is length zero")) + } + all(i_vector) +} + +# Check whether all raw data file paths in the runsheet are URLs (i.e. start with "https") +runsheetPathsAreURIs <- function(df_runsheet) { + allTrue(stringr::str_starts(df_runsheet$`Array Data File Path`, "https")) +} + + +# Download raw data files +downloadFilesFromRunsheet <- function(df_runsheet) { + urls <- df_runsheet$`Array Data File Path` + destinationFiles <- df_runsheet$`Array Data File Name` + + mapply(function(url, destinationFile) { + print(paste0("Downloading from '", url, "' TO '", destinationFile, "'")) + if ( file.exists(destinationFile ) ) { + warning(paste( "Using Existing File:", destinationFile )) + } else { + download.file(url, destinationFile) + } + }, urls, destinationFiles) + + destinationFiles # Return these paths +} + +if ( runsheetPathsAreURIs(df_rs) ) { + print("Determined Raw Data Locations are URIS") + local_paths <- retry_with_delay(downloadFilesFromRunsheet, df_rs) +} else { + print("Or Determined Raw Data Locations are local paths") + local_paths <- df_rs$`Array Data File Path` +} + + +# 
uncompress files if needed +if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) { + print("Determined these files are gzip compressed... uncompressing now") + # This does the uncompression + lapply(local_paths, R.utils::gunzip, remove = FALSE, overwrite = TRUE) + # This removes the .gz extension to get the uncompressed filenames + local_paths <- vapply(local_paths, + stringr::str_replace, # Run this function against each item in 'local_paths' + FUN.VALUE = character(1), # Expect a length-1 character value from each call + USE.NAMES = FALSE, # Don't use the input to assign names for the returned vector + pattern = ".gz$", # first argument for applied function + replacement = "" # second argument for applied function + ) +} + +df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths` = local_paths, check.names = FALSE) + + +# Load raw data into R object +# Retry with delay here to accommodate oligo's automatic loading of annotation packages and occasional internet related failures to load +raw_data <- retry_with_delay( + oligo::read.celfiles, + df_local_paths$`Local Paths`, + sampleNames = df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames) + ) + + +# Summarize raw data +print(paste0("Number of Arrays: ", dim(raw_data)[2])) +print(paste0("Number of Probes: ", dim(raw_data)[1])) +``` + +**Parameter Definitions:** + +- `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet)) +- `local_annotation_dir` (Path to local annotation directory if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) + + > Note: If not using custom annotations, leave `local_annotation_dir` as `NULL`. + +**Output Data:** + +- `df_rs` (R dataframe containing information from the runsheet) +- `raw_data` (R object containing raw microarray data) + + > Note: The raw data R object will be used to generate quality assessment (QA) plots in the next step. + +
+ +### 2b. Load Annotation Metadata + +```R +## Determines the organism specific annotation file to use based on the organism in the runsheet +fetch_organism_specific_annotation_table <- function(organism) { + # Uses the latest GeneLab annotations table to find the organism specific annotation file path and ensembl version + # Raises an exception if the organism does not have an associated annotation file or ensembl version yet + + annotation_table_link <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv" + all_organism_table <- read.csv(annotation_table_link) + + annotation_table <- all_organism_table %>% dplyr::filter(species == organism) + + # Guard clause: Ensure annotation_table populated + # Else: raise exception for unsupported organism + if (nrow(annotation_table) == 0 || annotation_table$genelab_annots_link == "" || is.na(annotation_table$ensemblVersion)) { + stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: {annotation_table_link}. 
Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row and a version number in the 'ensemblVersion' column.")) + } + + return(annotation_table) +} + +annotation_table <- retry_with_delay(fetch_organism_specific_annotation_table, unique(df_rs$organism)) + +annotation_file_path <- annotation_table$genelab_annots_link +ensembl_version <- as.character(annotation_table$ensemblVersion) +``` + +**Parameter Definitions:** + +- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) +- `annotation_table_link` (URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv)) + +**Output Data:** + +- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`) +- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`) + +
+ +--- + +## 3. Raw Data Quality Assessment + +
+ +### 3a. Density Plot + +```R +# Plot settings +par( + xpd = TRUE # Ensure legend can extend past plot area +) + +number_of_sets = ceiling(dim(raw_data)[2] / 30) # Set of 30 samples, used to scale plot +scale_factor = 0.2 # Default scale factor + +if (max(nchar(colnames(raw_data@assayData$exprs))) > 35 & number_of_sets > 1) { # Scale more if sample names are long + scale_factor = if_else(number_of_sets == 2, 0.4, 0.25) +} + +oligo::hist(raw_data, + transfo=log2, # Log2 transform raw intensity values + which=c("all"), + nsample=10000, # Number of probes to plot + main = "Density of raw intensities for multiple arrays") +legend("topright", legend = colnames(raw_data@assayData$exprs), + lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types + col = oligo::darkColors(n = ncol(raw_data)), # Ensure legend color is in sync with plot + ncol = number_of_sets, # Set number of columns by number of sets + cex = max(0.35, 1 + scale_factor - (number_of_sets*scale_factor)) # Reduce for each column beyond 1 with minimum of 35% + ) + +# Reset par +par(original_par) +``` + +**Input Data:** + +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) + +**Output Data:** + +- Plot containing the density of raw intensities for each array (lack of overlap indicates a need for normalization) + +
+ +### 3b. Pseudo Image Plots + +```R +for ( i in seq_along(1:ncol(raw_data))) { + oligo::image(raw_data[,i], + transfo = log2, + main = colnames(raw_data)[i] + ) +} +``` + +**Input Data:** + +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) + +**Output Data:** + +- Pseudo images of each array before background correction and normalization + +
+ +### 3c. MA Plots + +```R +if (inherits(raw_data, "GeneFeatureSet")) { + print("Raw data is a GeneFeatureSet, using exprs() to access expression values and adding 0.0001 to avoid log(0)") +} else if (inherits(raw_data, "ExpressionSet") || inherits(raw_data, "ExpressionFeatureSet") || inherits(raw_data, "HTAFeatureSet")) { + print(paste0("Raw data is ", class(raw_data), ". Using default approach for this class for MA Plot")) +} + +if (inherits(raw_data, "GeneFeatureSet")) { + MA_plot <- oligo::MAplot( + oligo::exprs(raw_data) + 0.0001, # Namespaced call, since only dplyr is attached; the small offset avoids log2(0) + transfo=log2, + ylim=c(-2, 4), + main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string + ) +} else if (inherits(raw_data, "ExpressionSet") || inherits(raw_data, "ExpressionFeatureSet") || inherits(raw_data, "HTAFeatureSet")) { + MA_plot <- oligo::MAplot( + raw_data, + ylim=c(-2, 4), + main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string + ) +} else { + stop(glue::glue("No strategy for MA plots for {class(raw_data)}")) +} +``` + +**Input Data:** + +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) + +**Output Data:** + +- M (log ratio of the subject array vs a pseudo-reference, the median of all other arrays) vs. A (average log expression) plot for each array before background correction and normalization + +
+ + +### 3d. Boxplots + +```R +max_samplename_length <- max(nchar(colnames(raw_data))) +dynamic_lefthand_margin <- max(max_samplename_length * 0.7, 10) +par( + mar = c(8, dynamic_lefthand_margin, 8, 2) + 0.1, # mar is the margin around the plot. c(bottom, left, top, right) + xpd = TRUE + ) +boxplot <- oligo::boxplot(raw_data[, rev(colnames(raw_data))], # Here we reverse column order to ensure descending order for samples in horizontal boxplot + transfo=log2, # Log2 transform raw intensity values + which=c("all"), + nsample=10000, # Number of probes to plot + las = 1, # las specifies the orientation of the axis labels. 1 = always horizontal + ylab="", + xlab="log2 Intensity", + main = "Boxplot of raw intensities \nfor perfect match and mismatch probes", + horizontal = TRUE + ) +title(ylab = "Sample Name", mgp = c(dynamic_lefthand_margin-2, 1, 0)) +# Reset par +par(original_par) +``` + +**Input Data:** + +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) + +**Output Data:** + +- Boxplot of raw expression data for each array before background correction and normalization + +
+ +--- + +## 4. Background Correction + +```R +background_corrected_data <- raw_data %>% oligo::backgroundCorrect(method="rma") +``` + +**Input Data:** + +- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) + +**Output Data:** + +- `background_corrected_data` (R object containing background-corrected microarray data) + + > + > Note: Background correction was performed using the oligo `rma` method, specifically "Convolution Background Correction" + +
+ +--- + +## 5. Between Array Normalization + +```R +# Normalize background-corrected data using the quantile method +norm_data <- oligo::normalize(background_corrected_data, + method = "quantile", + target = "core" # Use oligo default: core metaprobeset mappings + ) + +# Summarize background-corrected and normalized data +print(paste0("Number of Arrays: ", dim(norm_data)[2])) +print(paste0("Number of Probes: ", dim(norm_data)[1])) +``` + +**Input Data:** + +- `background_corrected_data` (R object containing background-corrected microarray data created in [Step 4](#4-background-correction) above) + +**Output Data:** + +- `norm_data` (R object containing background-corrected and normalized microarray data) + + > + > Note: Normalization was performed using the `quantile` method, which forces the entire empirical distribution of all arrays to be identical + +
+ +--- + +## 6. Normalized Data Quality Assessment + +
+ +### 6a. Density Plot + +```R +# Plot settings +par( + xpd = TRUE # Ensure legend can extend past plot area +) + +number_of_sets = ceiling(dim(norm_data)[2] / 30) # Set of 30 samples, used to scale plot + +oligo::hist(norm_data, + transfo=log2, # Log2 transform normalized intensity values + which=c("all"), + nsample=10000, # Number of probes to plot + main = "Density of normalized intensities for multiple arrays") +legend("topright", legend = colnames(norm_data@assayData$exprs), + lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types + col = oligo::darkColors(n = ncol(norm_data)), # Ensure legend color is in sync with plot + ncol = number_of_sets, # Set number of columns by number of sets + cex = max(0.35, 1 + scale_factor - (number_of_sets*scale_factor)) # Reduce for each column beyond 1 with minimum of 35% (scale_factor is defined in Step 3a) + ) + +# Reset par +par(original_par) +``` + +**Input Data:** + +- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above) + +**Output Data:** + +- Plot containing the density of background-corrected and normalized intensities for each array (near complete overlap is expected after normalization) + +
+ +### 6b. Pseudo Image Plots + +```R +for ( i in seq_along(1:ncol(norm_data))) { + oligo::image(norm_data[,i], + transfo = log2, + main = colnames(norm_data)[i] + ) +} +``` + +**Input Data:** + +- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above) + +**Output Data:** + +- Pseudo images of each array after background correction and normalization + +
+ +### 6c. MA Plots + +```R +MA_plot <- oligo::MAplot( + norm_data, + ylim=c(-2, 4), + main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string +) +``` + +**Input Data:** + +- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above) + +**Output Data:** + +- M (log ratio of the subject array vs a pseudo-reference, the median of all other arrays) vs. A (average log expression) plot for each array after background correction and normalization + +
+ +### 6d. Boxplots + +```R +max_samplename_length <- max(nchar(colnames(norm_data))) +dynamic_lefthand_margin <- max(max_samplename_length * 0.7, 10) +par( + mar = c(8, dynamic_lefthand_margin, 8, 2) + 0.1, # mar is the margin around the plot. c(bottom, left, top, right) + xpd = TRUE + ) +boxplot <- oligo::boxplot(norm_data[, rev(colnames(norm_data))], # Here we reverse column order to ensure descending order for samples in horizontal boxplot + transfo=log2, # Log2 transform normalized intensity values + which=c("all"), + nsample=10000, # Number of probes to plot + las = 1, # las specifies the orientation of the axis labels. 1 = always horizontal + ylab="", + xlab="log2 Intensity", + main = "Boxplot of normalized intensities \nfor perfect match and mismatch probes", + horizontal = TRUE + ) +title(ylab = "Sample Name", mgp = c(dynamic_lefthand_margin-2, 1, 0)) +# Reset par +par(original_par) +``` + +**Input Data:** + +- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above) + +**Output Data:** + +- Boxplot of expression data for each array after background correction and normalization + +
+ +--- + +## 7. Probeset Summarization + +```R +probeset_level_data <- oligo::rma(norm_data, + normalize=FALSE, + background=FALSE + ) + +# Summarize background-corrected and normalized data +print("Summarized Probeset Level Data Below") +print(paste0("Number of Arrays: ", dim(probeset_level_data)[2])) +print(paste0("Total Number of Probes Assigned To A Probeset: ", dim(oligo::getProbeInfo(probeset_level_data, target="core")['man_fsetid'])[1])) # man_fsetid means 'Manufacturer Probeset ID'. Ref: https://support.bioconductor.org/p/57191/ +print(paste0("Number of Probesets: ", dim(unique(oligo::getProbeInfo(probeset_level_data, target="core")['man_fsetid']))[1])) # man_fsetid means 'Manufacturer Probeset ID'. Ref: https://support.bioconductor.org/p/57191/ +``` + +**Input Data:** + +- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization) above) + +**Output Data:** + +- `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probe level data) + +
+ +--- + +## 8. Probeset Annotations + +
+
+### 8a. Get Probeset Annotations
+
+```R
+shortenedOrganismName <- function(long_name) {
+  #' Convert organism names like 'Homo Sapiens' into 'hsapiens'
+  #' Only the first two space-separated tokens (genus, species) of long_name are used
+  tokens <- long_name %>% stringr::str_split(" ", simplify = TRUE)
+  genus_name <- tokens[1]
+
+  species_name <- tokens[2]
+
+  short_name <- stringr::str_to_lower(paste0(substr(genus_name, start = 1, stop = 1), species_name))
+
+  return(short_name)
+}
+
+getBioMartAttribute <- function(df_rs) {
+  #' Returns resolved biomart attribute source from runsheet
+  #' df_rs: runsheet data frame; stops with an error if it has no 'biomart_attribute' column
+
+  # check if runsheet has 'biomart_attribute' column
+  if ( !is.null(df_rs$`biomart_attribute`) ) {
+    print("Using attribute name sourced from runsheet")
+    # Format according to biomart needs
+    formatted_value <- unique(df_rs$`biomart_attribute`) %>%
+      stringr::str_replace_all(" ","_") %>% # Replace all spaces with underscore
+      stringr::str_to_lower() # Lower casing only
+    return(formatted_value)
+  } else {
+    stop("ERROR: Could not find 'biomart_attribute' in runsheet")
+  }
+}
+
+get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_portal, ensembl_genomes_version, biomart_attribute) {
+  #' Obtain mapping table directly from ftp. Useful when biomart live service no longer exists for desired version
+  #' Returns a data frame with columns MAPID, ensembl_gene_id and a column named after biomart_attribute
+
+  request_url <- glue::glue("https://ftp.ebi.ac.uk/ensemblgenomes/pub/{ensembl_genomes_portal}/release-{ensembl_genomes_version}/mysql/{ensembl_genomes_portal}_mart_{ensembl_genomes_version}/{organism}_eg_gene__efg_{biomart_attribute}__dm.txt.gz")
+
+  print(glue::glue("Mappings file URL: {request_url}"))
+
+  # Create temporary file names; on.exit removes both files even if the download or parsing fails
+  temp_file <- tempfile(fileext = ".gz")
+  uncompressed_temp_file <- tempfile()
+  on.exit(unlink(c(temp_file, uncompressed_temp_file)), add = TRUE)
+
+  # Download the gzipped table file using the download.file function
+  download.file(url = request_url, destfile = temp_file, method = "libcurl") # Use 'libcurl' to support ftps
+
+  # Uncompress the file; the connection is closed even if reading fails
+  gz_connection <- gzfile(temp_file, "rt")
+  content <- tryCatch(readLines(gz_connection), finally = close(gz_connection))
+  writeLines(content, uncompressed_temp_file)
+
+  # Load the data into a dataframe
+  mapping <- read.table(uncompressed_temp_file, # Read the uncompressed file
+                        # Add column names as follows: MAPID, TAIR, PROBESETID
+                        col.names = c("MAPID", "ensembl_gene_id", biomart_attribute),
+                        header = FALSE, # No header in original table
+                        sep = "\t") # Tab separated
+
+  return(mapping)
+}
+
+# Convert list of multi-mapped genes to string
+listToUniquePipedString <- function(str_list) {
+  #! convert lists into strings denoting unique elements separated by '|' characters
+  #! e.g. c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3"
+  return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|"))
+}
+
+
+organism <- shortenedOrganismName(unique(df_rs$organism))
+annot_key <- ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL')
+
+if (organism %in% c("athaliana")) {
+  ENSEMBL_VERSION <- ensembl_version
+  ensembl_genomes_portal <- "plants"
+  print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ENSEMBL_VERSION}"))
+  expected_attribute_name <- getBioMartAttribute(df_rs)
+  df_mapping <- retry_with_delay(
+    get_ensembl_genomes_mappings_from_ftp,
+    organism = organism,
+    ensembl_genomes_portal = ensembl_genomes_portal,
+    ensembl_genomes_version = ENSEMBL_VERSION,
+    biomart_attribute = expected_attribute_name
+  )
+
+  # TAIR from the mapping tables tend to be in the format 'AT1G01010.1' but the raw data has 'AT1G01010'
+  # So here we remove the '.NNN' from the mapping table where .NNN is any number
+  df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "")
+
+  use_custom_annot <- FALSE
+} else {
+  # Use biomart from main Ensembl website which archives keep each release on the live service
+  # locate dataset
+  expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
+  print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
+
+
+  # Specify Ensembl version used in current GeneLab reference annotations
+  ENSEMBL_VERSION <- ensembl_version
+
+  print(glue::glue("Using Ensembl biomart to get specific version of mapping table. Ensembl version: {ENSEMBL_VERSION}"))
+
+  # Check if organism in supported in biomart
+  ensembl <- biomaRt::useEnsembl(biomart = "genes")
+  ensembl_datasets <- biomaRt::listDatasets(ensembl)
+  use_custom_annot <- !expected_dataset_name %in% ensembl_datasets$dataset
+
+  if (use_custom_annot) {
+    unloadNamespace("biomaRt")
+  } else {
+
+    ensembl <- biomaRt::useEnsembl(biomart = "genes",
+                                   dataset = expected_dataset_name,
+                                   version = ENSEMBL_VERSION)
+    print(ensembl)
+
+    expected_attribute_name <- getBioMartAttribute(df_rs)
+    print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
+
+    # Some probe_ids for affy_hta_2_0 may end in .hg.1 instead of .hg (how it is in biomaRt), leading to 0 results returned
+    if (expected_attribute_name == 'affy_hta_2_0') {
+      rownames(probeset_level_data) <- stringr::str_replace(rownames(probeset_level_data), '\\.hg\\.1$', '.hg')
+    }
+
+    probe_ids <- rownames(probeset_level_data)
+
+    # Create probe map
+    # Run Biomart Queries in chunks to prevent request timeouts
+    # Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size
+    CHUNK_SIZE <- 1500
+    probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
+    df_mapping <- data.frame()
+    for (i in seq_along(probe_id_chunks)) {
+      probe_id_chunk <- probe_id_chunks[[i]]
+      print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})"))
+      chunk_results <- biomaRt::getBM(
+        attributes = c(
+          expected_attribute_name,
+          "ensembl_gene_id"
+        ),
+        filters = expected_attribute_name,
+        values = probe_id_chunk,
+        mart = ensembl)
+
+      if (nrow(chunk_results) > 0) {
+        df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
+      }
+
+      Sys.sleep(10) # Slight break between requests to prevent back-to-back requests
+    }
+
+  }
+}
+
+# At this point, we have df_mapping from either the biomart live service or the ensembl genomes ftp archive depending on the organism
+# If no df_mapping obtained (e.g., organism not supported in biomart), use custom annotations; otherwise, merge in-house annotations to df_mapping
+
+if (use_custom_annot) {
+  expected_attribute_name <- 'ProbesetID'
+
+  # The runsheet repeats 'biomart_attribute' once per sample; collapse it to a single lookup key
+  # so that it can be used in a scalar `if` condition and to select exactly one config row
+  custom_annot_key <- unique(df_rs$`biomart_attribute`)[1]
+
+  annot_type <- 'NO_CUSTOM_ANNOT'
+  if (!is.null(local_annotation_dir) && file.exists(file.path(local_annotation_dir, 'config.csv'))) {
+    config_df <- read.csv(file.path(local_annotation_dir, 'config.csv'), row.names=1)
+    if (isTRUE(custom_annot_key %in% row.names(config_df))) {
+      annot_config <- config_df[custom_annot_key, ]
+      annot_type <- annot_config$annot_type[[1]]
+    } else {
+      warning(paste0("No entry for '", custom_annot_key, "' in provided config.csv"))
+    }
+  } else {
+    warning(paste0("No 'config.csv' file found in path (--referenceStorePath): ", local_annotation_dir))
+  }
+
+  if (annot_type == '3prime-IVT') {
+    unique_probe_ids <- read.csv(
+      file.path(local_annotation_dir, annot_config$annot_filename[[1]]),
+      skip = 13, header = TRUE, na.strings = c('NA', '---')
+    )[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq.Transcript.ID', 'RefSeq.Protein.ID', 'Gene.Ontology.Biological.Process', 'Gene.Ontology.Cellular.Component', 'Gene.Ontology.Molecular.Function')]
+
+    # Clean columns
+    collapseMultiValues <- function(values) {
+      #' Collapse ' /// '-separated multi-value fields into unique '|'-separated strings
+      #' Missing values are pasted as the literal string "NA", so convert them back to NA.
+      #' The pattern is anchored: with replacement = NA_character_ the whole string becomes NA on any match,
+      #' so an unanchored 'NA' would also blank out values that merely contain "NA" (e.g. "DNAJB1")
+      purrr::map_chr(stringr::str_split(values, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>%
+        stringr::str_replace('^NA$', NA_character_)
+    }
+    unique_probe_ids$Gene.Symbol <- collapseMultiValues(unique_probe_ids$Gene.Symbol)
+    unique_probe_ids$Gene.Title <- collapseMultiValues(unique_probe_ids$Gene.Title)
+    unique_probe_ids$Entrez.Gene <- collapseMultiValues(unique_probe_ids$Entrez.Gene)
+    unique_probe_ids$Ensembl <- collapseMultiValues(unique_probe_ids$Ensembl)
+
+    # Merge transcript and protein RefSeq accessions into one unique '|'-separated column
+    unique_probe_ids$RefSeq <- paste(unique_probe_ids$RefSeq.Transcript.ID, unique_probe_ids$RefSeq.Protein.ID)
+    unique_probe_ids$RefSeq <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$RefSeq, '[A-Z]+_[\\d.]+'), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^$', NA_character_)
+
+    # Merge the three GO columns, keeping only the 7-digit GO identifiers
+    unique_probe_ids$GO <- paste(unique_probe_ids$Gene.Ontology.Biological.Process, unique_probe_ids$Gene.Ontology.Cellular.Component, unique_probe_ids$Gene.Ontology.Molecular.Function)
+    unique_probe_ids$GO <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$GO, '\\d{7}'), ~paste0('GO:', unique(.), collapse = "|")) %>% stringr::str_replace('^GO:$', NA_character_)
+
+    unique_probe_ids <- unique_probe_ids[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq', 'GO')]
+    names(unique_probe_ids) <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS')
+
+    unique_probe_ids$STRING_id <- NA_character_
+
+    # Use the identifier column with the most mapped probesets as the gene mapping source
+    # which.max returns the first maximum, so ties resolve in the order ENSEMBL, ENTREZID, SYMBOL
+    mapped_counts <- c(
+      ENSEMBL = sum(!is.na(unique_probe_ids$ENSEMBL)),
+      ENTREZID = sum(!is.na(unique_probe_ids$ENTREZID)),
+      SYMBOL = sum(!is.na(unique_probe_ids$SYMBOL))
+    )
+    gene_col <- names(which.max(mapped_counts))
+
+    unique_probe_ids <- unique_probe_ids %>%
+      dplyr::mutate(
+        count_gene_mappings = 1 + stringr::str_count(get(gene_col), stringr::fixed("|")),
+        gene_mapping_source = gene_col
+      )
+  } else if (annot_type == 'custom') {
+    unique_probe_ids <- read.csv(
+      file.path(local_annotation_dir, annot_config$annot_filename[[1]]),
+      header = TRUE, na.strings = c('NA', '')
+    )
+  } else {
+    # No usable custom annotations: build a single all-NA row so the joins below still find the expected columns
+    annot_cols <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS', 'STRING_id', 'count_gene_mappings', 'gene_mapping_source')
+    unique_probe_ids <- setNames(data.frame(matrix(NA_character_, nrow = 1, ncol = length(annot_cols))), annot_cols)
+  }
+} else {
+  annot <- read.table(
+    as.character(annotation_file_path),
+    sep = "\t",
+    header = TRUE,
+    quote = "",
+    comment.char = ""
+  )
+
+  unique_probe_ids <- df_mapping %>%
+    dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type
+    dplyr::group_by(!!sym(expected_attribute_name)) %>%
+    dplyr::summarise(
+      ENSEMBL = listToUniquePipedString(ensembl_gene_id)
+    ) %>%
+    # Count number of ensembl IDS mapped
+    dplyr::mutate(
+      count_gene_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")),
+      gene_mapping_source = annot_key
+    ) %>%
+    dplyr::left_join(annot, by = c("ENSEMBL" = annot_key))
+}
+
+probeset_expression_matrix <- oligo::exprs(probeset_level_data)
+
+probeset_expression_matrix.gene_mapped <- probeset_expression_matrix %>%
+  as.data.frame() %>%
+  tibble::rownames_to_column(var = "ProbesetID") %>% # Ensure rownames (probeset IDs) can be used as join key
+  dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>%
+  dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% # Probesets without any mapping get a count of 0
+  dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) )
+```
+
+**Parameter Definitions:**
+
+- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet))
+- `df_rs$biomart_attribute` (array design biomart identifier specified in the runsheet created in [Step 1](#1-create-sample-runsheet))
+- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata))
+- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata))
+- `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. mus musculus uses 'ENSEMBL')
+- `local_annotation_dir` (Path to local annotation directory if using custom annotations, defined in [Step 2a](#2a-load-metadata-and-raw-data))
+
+  > Note: See [here](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md) for details on what to include in this directory.
+
+**Input Data:**
+
+- `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization))
+
+**Output Data:**
+
+- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations)
+- `unique_probe_ids` (R object containing probeset ID to gene annotation mappings)
+
+
+
+### 8b. Summarize Gene Mapping
+
+```R
+# Pie Chart with Percentages
+# Count distinct probesets by how many genes they map to (exactly one, more than one, none)
+slices <- c(
+  'Unique Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings == 1) %>% dplyr::distinct(ProbesetID)),
+  'Multi Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings > 1) %>% dplyr::distinct(ProbesetID)),
+  'No Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings == 0) %>% dplyr::distinct(ProbesetID))
+)
+pct <- round(slices/sum(slices)*100)
+chart_names <- names(slices) # Overwritten by the next line, which rebuilds the labels from names(slices)
+chart_names <- glue::glue("{names(slices)} ({slices})") # add count to labels
+chart_names <- paste(chart_names, pct) # add percents to labels
+chart_names <- paste(chart_names,"%",sep="") # add % to labels
+pie(slices,labels = chart_names, col=rainbow(length(slices)),
+    main=glue::glue("Mapping to Primary Keytype\n {nrow(probeset_expression_matrix.gene_mapped %>% dplyr::distinct(ProbesetID))} Total Unique Probesets")
+    )
+
+print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}"))
+```
+
+**Input Data:**
+
+- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above)
+
+**Output Data:**
+
+- A pie chart denoting the gene mapping rates for each unique probeset ID
+- A printout denoting the count of unique mappings for gene mapping
+
+
+
+### 8c. Save Annotated Tables
+
+```R
+## Reorder columns before saving to file
+ANNOTATIONS_COLUMN_ORDER <- c(
+  annot_key,
+  "SYMBOL",
+  "GENENAME",
+  "REFSEQ",
+  "ENTREZID",
+  "STRING_id",
+  "GOSLIM_IDS"
+)
+
+SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name`
+
+# Name the gene identifier column after the organism specific keytype (e.g. 'TAIR' for Arabidopsis)
+probeset_expression_matrix.gene_mapped <- probeset_expression_matrix.gene_mapped %>% dplyr::rename( !!annot_key := ENSEMBL )
+
+## Output column subset file with just normalized probeset level expression values
+write.csv(
+  probeset_expression_matrix.gene_mapped[c(
+    ANNOTATIONS_COLUMN_ORDER,
+    "ProbesetID",
+    "count_gene_mappings",
+    "gene_mapping_source",
+    SAMPLE_COLUMN_ORDER)
+  ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE)
+
+## Determine column order for probe level tables
+
+PROBE_INFO_COLUMN_ORDER <- c(
+  "ProbesetID",
+  "ProbeID",
+  "count_gene_mappings",
+  "gene_mapping_source"
+)
+
+FINAL_COLUMN_ORDER <- c(
+  ANNOTATIONS_COLUMN_ORDER,
+  PROBE_INFO_COLUMN_ORDER,
+  SAMPLE_COLUMN_ORDER
+)
+
+annotateProbeLevelData <- function(expression_set) {
+  #' Return the probe level intensity table of an expression set joined with probeset gene annotations
+  #' expression_set: object accepted by oligo::exprs() and oligo::getProbeInfo() (e.g. background_corrected_data, norm_data)
+  #' Relies on unique_probe_ids, expected_attribute_name, annot_key and FINAL_COLUMN_ORDER defined above / in Step 8a
+  #' Columns are returned in FINAL_COLUMN_ORDER, followed by any remaining columns
+  oligo::exprs(expression_set) %>%
+    as.data.frame() %>%
+    tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probe IDs) can be used as join key
+    dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing
+    dplyr::right_join(oligo::getProbeInfo(expression_set), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid
+    dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID
+    dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID
+    dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with gene annotation mappings
+    dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% # Convert NA mapping to 0
+    dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) %>%
+    dplyr::rename( !!annot_key := ENSEMBL ) %>%
+    dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) # Perform reordering
+}
+
+## Generate raw intensity matrix that includes annotations
+background_corrected_data_annotated <- annotateProbeLevelData(background_corrected_data)
+
+write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE)
+
+## Generate normalized expression matrix that includes annotations
+norm_data_matrix_annotated <- annotateProbeLevelData(norm_data)
+
+write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE)
+```
+
+**Parameter Definitions:**
+
+- `df_rs[['Sample Name']]` (sample names specified in the runsheet created in [Step 1](#1-create-sample-runsheet))
+- `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. mus musculus uses 'ENSEMBL', defined in [Step 8a](#8a-get-probeset-annotations))
+
+**Input Data:**
+
+- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above)
+- `background_corrected_data` (R object containing background-corrected microarray data created in [Step 4](#4-background-correction))
+- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization))
+- `unique_probe_ids` (R object containing probeset ID to gene annotation mappings, output from [Step 8a](#8a-get-probeset-annotations))
+
+**Output Data:**
+
+- **normalized_expression_probeset_GLmicroarray.csv** (table containing the background corrected, normalized probeset expression values for each sample. The ProbesetID is the unique index column.)
+- **raw_intensities_probe_GLmicroarray.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
+- **normalized_intensities_probe_GLmicroarray.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
+
+## 9. Perform Probeset Differential Expression (DE)
+
+> Note: Run differential expression analysis only if there is at least 1 replicate per factor group.
+
+
+
+### 9a. Generate Design Matrix
+
+```R
+# Pull all factors for each sample in the study from the runsheet created in Step 1
+runsheetToDesignMatrix <- function(runsheet_path) {
+    #' Build the limma design matrix and pairwise contrasts from a runsheet
+    #' runsheet_path: path to the runsheet csv; must contain a 'Sample Name' column and at least one 'Factor Value' column
+    #' Returns a list with:
+    #'   matrix    - design matrix from model.matrix(~ 0 + condition)
+    #'   mapping   - data frame mapping R-safe group names (safe_name) to human readable names (original_name)
+    #'   groups    - data frame of samples and their human readable group
+    #'   contrasts - matrix of all pairwise group comparisons, in both directions
+    df <- read.csv(runsheet_path, check.names = FALSE) %>%
+        dplyr::mutate(dplyr::across(dplyr::everything(), function(x) iconv(x, "latin1", "ASCII", sub=""))) # Convert all characters to ascii, when not possible, remove the character
+
+    # get only Factor Value columns
+    factors <- as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)])
+    if (ncol(factors) == 0) {
+        stop("ERROR: Could not find any 'Factor Value' columns in runsheet")
+    }
+    colnames(factors) <- paste("factor", seq_len(ncol(factors)), sep= "_")
+
+    # Load metadata from runsheet csv file
+    compare_csv <- data.frame(sample_id = df[,c("Sample Name")], factors)
+
+    # Create data frame containing all samples and respective factors
+    study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
+    colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
+    rownames(study) <- compare_csv[,1]
+
+    # Format groups and indicate the group that each sample belongs to
+    if (dim(study)[2] >= 2){
+        group<-apply(study,1,paste,collapse = " & ") # concatenate multiple factors into one condition per sample
+    } else{
+        group<-study[,1]
+    }
+    group_names <- paste0("(",group,")") # human readable group names
+    if (length(unique(group_names)) < 2) {
+        stop("ERROR: At least two distinct groups are required to define pairwise comparisons")
+    }
+    group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group))) # group naming compatible with R models, this maintains the default behaviour of make.names with the exception that 'X' is never prepended to group names
+    names(group) <- group_names
+
+    # Format contrasts table, defining pairwise comparisons for all groups
+    contrast.names <- combn(levels(factor(names(group))),2) # generate matrix of pairwise group combinations for comparison
+    # Strip the surrounding parentheses (first and last character) before converting to R-safe names
+    contrasts <- apply(contrast.names, MARGIN=2, function(col) sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", stringr::str_sub(col, 2, -2)))))
+    contrast.names <- c(paste(contrast.names[1,],contrast.names[2,],sep = "v"),paste(contrast.names[2,],contrast.names[1,],sep = "v")) # format combinations for output table files names
+    contrasts <- cbind(contrasts,contrasts[c(2,1),]) # append the reversed direction of every comparison
+    colnames(contrasts) <- contrast.names
+    sampleTable <- data.frame(condition=factor(group))
+    rownames(sampleTable) <- df[,c("Sample Name")]
+
+    condition <- sampleTable[,'condition']
+    names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names))
+
+    design <- model.matrix(~ 0 + condition)
+    design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample Name")], group = group_names) ), contrasts = contrasts )
+    return(design_data)
+}
+
+
+# Loading metadata from runsheet csv file
+design_data <- runsheetToDesignMatrix(runsheet)
+design <- design_data$matrix
+
+# Write SampleTable.csv and contrasts.csv file
+write.csv(design_data$groups, file.path(DIR_DGE, "SampleTable_GLmicroarray.csv"), row.names = FALSE)
+write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv"))
+```
+
+**Input Data:**
+
+- `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet))
+
+**Output Data:**
+
+- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to)
+- **SampleTable_GLmicroarray.csv** (table containing samples and their respective groups)
+- **contrasts_GLmicroarray.csv** (table containing all pairwise comparisons)
+
+
+
+### 9b. Perform Individual Probeset Level DE
+
+```R
+lmFitPairwise <- function(norm_data, design) {
+  #' Perform all pairwise comparisons
+
+  #' Approach based on limma manual section 17.4 (version 3.52.4)
+  #' norm_data: expression data passed to limma::lmFit (called below with probeset_level_data)
+  #' design: design matrix with one column per group
+  #' Returns the eBayes-moderated fit for every pairwise contrast, in both directions
+
+  fit <- limma::lmFit(norm_data, design)
+
+  # Create Contrast Model
+  fit.groups <- colnames(fit$design)[which(fit$assign == 1)]
+  combos <- combn(fit.groups,2)
+  contrasts<-c(paste(combos[1,],combos[2,],sep = "-"),paste(combos[2,],combos[1,],sep = "-")) # format combinations for limma:makeContrasts
+  cont.matrix <- limma::makeContrasts(contrasts=contrasts,levels=design)
+  contrast.fit <- limma::contrasts.fit(fit, cont.matrix)
+
+  contrast.fit <- limma::eBayes(contrast.fit,trend=TRUE,robust=TRUE)
+  return(contrast.fit)
+}
+
+# Calculate results
+res <- lmFitPairwise(probeset_level_data, design)
+
+# Print DE table, without filtering
+limma::write.fit(res, adjust = 'BH',
+                 file = "INTERIM.csv",
+                 row.names = FALSE,
+                 quote = TRUE,
+                 sep = ",")
+
+### Generate and export PCA table for GeneLab visualization plots
+# oligo::exprs is namespaced as in the other steps so this does not depend on the package being attached;
+# 'scale.' is spelled out in full rather than relying on partial argument matching
+PCA_raw <- prcomp(t(oligo::exprs(probeset_level_data)), scale. = FALSE) # Note: expression at the Probeset level is already log2 transformed
+write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv"))
+```
+
+**Input Data:**
+
+- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization))
+- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, created in [Step 9a](#9a-generate-design-matrix) above)
+- `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization))
+
+**Output Data:**
+
+- INTERIM.csv (Statistical values from individual probeset level DE analysis, including:
+  - Log2fc between all pairwise comparisons
+  - T statistic for all pairwise comparison tests
+  - P value for all pairwise comparison tests
+  - Adjusted P value for all pairwise comparison tests)
+- visualization_PCA_table_GLmicroarray.csv (file used to generate GeneLab PCA plots)
+
+
+
+### 9c. Save DE Table
+
+```R
+## Reformat Table for consistency across DE analyses tables within GeneLab ##
+
+# Read in DE table
+df_interim <- read.csv("INTERIM.csv")
+
+# Bind columns from gene mapped expression table
+df_interim <- df_interim %>%
+  dplyr::bind_cols(probeset_expression_matrix.gene_mapped)
+
+# Reformat column names
+reformat_names <- function(colname, group_name_mapping) {
+  #' Convert limma::write.fit column names into GeneLab statistic column names
+  #' colname: character vector of column names containing '.condition'
+  #' group_name_mapping: data frame with columns safe_name (R-safe group name) and original_name (human readable group name)
+  new_colname <- colname %>%
+    stringr::str_replace(pattern = "^P.value.adj.condition", replacement = "Adj.p.value_") %>%
+    stringr::str_replace(pattern = "^P.value.condition", replacement = "P.value_") %>%
+    stringr::str_replace(pattern = "^Coef.condition", replacement = "Log2fc_") %>% # This is the Log2FC as per: https://rdrr.io/bioc/limma/man/writefit.html
+    stringr::str_replace(pattern = "^t.condition", replacement = "T.stat_") %>%
+    stringr::str_replace(pattern = ".condition", replacement = "v")
+
+  # remap to group names before make.names was applied
+  # Longest safe names are replaced first so a name that is a substring of another cannot match inside it
+  unique_group_name_mapping <- unique(group_name_mapping) %>% dplyr::arrange(-nchar(safe_name))
+  for ( i in seq_len(nrow(unique_group_name_mapping)) ) {
+    safe_name <- unique_group_name_mapping[i,]$safe_name
+    original_name <- unique_group_name_mapping[i,]$original_name
+    new_colname <- new_colname %>% stringr::str_replace(pattern = stringr::fixed(safe_name), replacement = original_name)
+  }
+
+  return(new_colname)
+}
+
+df_interim <- df_interim %>% dplyr::rename_with(reformat_names, .cols = dplyr::matches('\\.condition'), group_name_mapping = design_data$mapping)
+
+
+## Add Group Wise Statistics ##
+
+# Group mean and standard deviations for normalized expression values are computed and added to the table
+
+unique_groups <- unique(design_data$groups$group)
+for ( i in seq_along(unique_groups) ) {
+  current_group <- unique_groups[i]
+  # Sorted, de-duplicated sample names that belong to the current group
+  current_samples <- design_data$groups %>%
+    dplyr::filter(
+      group == current_group
+    ) %>%
+    dplyr::pull(sample) %>%
+    unique() %>%
+    sort()
+
+  print(glue::glue("Computing mean and standard deviation for Group {i} of {length(unique_groups)}"))
+  print(glue::glue("Group: {current_group}"))
+  print(glue::glue("Samples in Group: '{toString(current_samples)}'"))
+
+  df_interim <- df_interim %>%
+    dplyr::mutate(
+      "Group.Mean_{current_group}" := rowMeans(dplyr::select(., dplyr::all_of(current_samples))),
+      "Group.Stdev_{current_group}" := matrixStats::rowSds(as.matrix(dplyr::select(., dplyr::all_of(current_samples)))),
+    ) %>%
+    dplyr::ungroup() %>%
+    as.data.frame()
+}
+
+df_interim <- df_interim %>%
+  dplyr::mutate(
+    "All.mean" := rowMeans(dplyr::select(., dplyr::all_of(SAMPLE_COLUMN_ORDER))),
+    "All.stdev" := matrixStats::rowSds(as.matrix(dplyr::select(., dplyr::all_of(SAMPLE_COLUMN_ORDER)))),
+  ) %>%
+  dplyr::ungroup() %>%
+  as.data.frame()
+
+print("Remove extra columns from final table")
+
+# These columns are data mapped to column PROBEID as per the original Manufacturer and can be linked as needed
+colnames_to_remove <- c(
+  "AveExpr" # Replaced by 'All.mean' column
+)
+
+df_interim <- df_interim %>% dplyr::select(-dplyr::any_of(colnames_to_remove))
+
+PROBE_INFO_COLUMN_ORDER <- c(
+  "ProbesetID",
+  "count_gene_mappings",
+  "gene_mapping_source"
+)
+
+generate_prefixed_column_order <- function(subjects, prefixes) {
+  #' Return a vector of columns based on subject and given prefixes
+  #' Used for both contrasts and groups column name generation
+  #' Order: all prefixes for the first subject, then all prefixes for the second subject, and so on
+
+  paste0(
+    rep(prefixes, times = length(subjects)),
+    rep(subjects, each = length(prefixes))
+  )
+}
+STAT_COLUMNS_ORDER <- generate_prefixed_column_order(
+  subjects = colnames(design_data$contrasts),
+  prefixes = c(
+    "Log2fc_",
+    "T.stat_",
+    "P.value_",
+    "Adj.p.value_"
+    )
+  )
+ALL_SAMPLE_STATS_COLUMNS_ORDER <- c(
+  "All.mean",
+  "All.stdev",
+  "F",
+  "F.p.value"
+)
+
+GROUP_MEAN_COLUMNS_ORDER <- generate_prefixed_column_order(
+  subjects = unique(design_data$groups$group),
+  prefixes = c(
+    "Group.Mean_"
+    )
+  )
+GROUP_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order(
+  subjects = unique(design_data$groups$group),
+  prefixes = c(
+    "Group.Stdev_"
+    )
+  )
+FINAL_COLUMN_ORDER <- c(
+  ANNOTATIONS_COLUMN_ORDER,
+  PROBE_INFO_COLUMN_ORDER,
+  SAMPLE_COLUMN_ORDER,
+  STAT_COLUMNS_ORDER,
+  ALL_SAMPLE_STATS_COLUMNS_ORDER,
+  GROUP_MEAN_COLUMNS_ORDER,
+  GROUP_STDEV_COLUMNS_ORDER
+  )
+
+## Assert final column order includes all columns from original table
+if (!setequal(FINAL_COLUMN_ORDER, colnames(df_interim))) {
+  write.csv(FINAL_COLUMN_ORDER, "FINAL_COLUMN_ORDER.csv")
+  NOT_IN_DF_INTERIM <- paste(setdiff(FINAL_COLUMN_ORDER, colnames(df_interim)), collapse = ":::")
+  NOT_IN_FINAL_COLUMN_ORDER <- paste(setdiff(colnames(df_interim), FINAL_COLUMN_ORDER), collapse = ":::")
+  stop(glue::glue("Column reordering attempt resulted in different sets of columns than original. Names unique to 'df_interim': {NOT_IN_FINAL_COLUMN_ORDER}. Names unique to 'FINAL_COLUMN_ORDER': {NOT_IN_DF_INTERIM}."))
+}
+
+## Perform reordering
+df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
+
+# Save to file
+write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.csv"), row.names = FALSE)
+```
+
+**Input Data:**
+
+- INTERIM.csv (Statistical values from individual probeset level DE analysis, output from [Step 9b](#9b-perform-individual-probeset-level-de) above)
+- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above)
+
+**Output Data:**
+
+- **differential_expression_GLmicroarray.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probeset DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.)
+
+> All steps of the Microarray pipeline are performed using R markdown and the completed R markdown is rendered (via Quarto) as an html file (**NF_MAAffymetrix_v\*_GLmicroarray.html**) and published in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/) for the respective dataset.
\ No newline at end of file
diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md
index c2cdc0bb..0c7ce195 100644
--- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md
+++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md
@@ -2,7 +2,7 @@
 > **This page holds an overview and instructions for how GeneLab processes Affymetrix microarray datasets. 
Exact processing commands and GL-DPPD-7114 version used for specific GeneLab datasets (GLDS) are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo).** > -> \* The pipeline detailed below currently supports gene annotations for Arabidopsis Thaliana via Ensembl FTP, all animals available in Biomart, and custom annotations (see [Step 8a](#8a-get-probeset-annotations)). +> \* The pipeline detailed below is currently used for animal and Arabidopsis Thaliana studies only, it will be updated soon for processing microbe microarray data and other plant data. --- @@ -26,9 +26,7 @@ Lauren Sanders (acting GeneLab Project Scientist) - [Software used](#software-used) - [General processing overview with example commands](#general-processing-overview-with-example-commands) - [1. Create Sample RunSheet](#1-create-sample-runsheet) - - [2. Load Data](#2-load-data) - - [2a. Load Metadata and Raw Data](#2a-load-metadata-and-raw-data) - - [2b. Load Annotation Metadata](#2b-load-annotation-metadata) + - [2. Load Metadata and Raw Data](#2-load-metadata-and-raw-data) - [3. Raw Data Quality Assessment](#3-raw-data-quality-assessment) - [3a. Density Plot](#3a-density-plot) - [3b. Pseudo Image Plots](#3b-pseudo-image-plots) @@ -42,14 +40,12 @@ Lauren Sanders (acting GeneLab Project Scientist) - [6c. MA Plots](#6c-ma-plots) - [6d. Boxplots](#6d-boxplots) - [7. Probeset Summarization](#7-probeset-summarization) - - [8. Probeset Annotations](#8-probeset-annotations) - - [8a. Get Probeset Annotations](#8a-get-probeset-annotations) - - [8b. Summarize Gene Mapping](#8b-summarize-gene-mapping) - - [8c. Save Annotated Tables](#8c-save-annotated-tables) - - [9. Perform Probeset Differential Expression (DE)](#9-perform-probeset-differential-expression-de) - - [9a. Generate Design Matrix](#9a-generate-design-matrix) - - [9b. Perform Individual Probeset Level DE](#9b-perform-individual-probeset-level-de) - - [9c. 
Save DE Table](#9c-save-de-table) + - [8. Perform Probeset Differential Expression (DE)](#8-perform-probeset-differential-expression-de) + - [8a. Add Probeset Annotations](#8a-add-probeset-annotations) + - [8b. Summarize Biomart Mapping](#8b-summarize-biomart-mapping) + - [8c. Generate Design Matrix](#8c-generate-design-matrix) + - [8d. Perform Individual Probeset Level DE](#8d-perform-individual-probeset-level-de) + - [8e. Add Additional Columns and Format DE Table](#8e-add-additional-columns-and-format-de-table) --- @@ -84,7 +80,7 @@ Lauren Sanders (acting GeneLab Project Scientist) ## 1. Create Sample RunSheet > Notes: -> - Rather than running the commands below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/runsheet/README.md). +> - Rather than running the commands below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/README.md). > > - These command line tools are part of the [dp_tools](https://github.com/J-81/dp_tools) program. @@ -124,13 +120,9 @@ dpt-isa-to-runsheet --accession OSD-### \ --- -## 2. Load Data +## 2. Load Metadata and Raw Data -> Note: Steps 2 - 9 are done in R - -
- -### 2a. Load Metadata and Raw Data +> Note: Steps 2 - 8 are done in R ```R ### Install R packages if not already installed ### @@ -150,14 +142,11 @@ BiocManager::install("oligo") ## Note: Only dplyr is explicitly loaded. Other library functions are called with explicit namespace (e.g. LIBRARYNAME::FUNCTION) library(dplyr) # Ensure infix operator is available, methods should still reference dplyr namespace otherwise -options(dplyr.summarise.inform = FALSE) # Don't print out '`summarise()` has grouped output by 'group'. You can override using the `.groups` argument.' + # Define path to runsheet runsheet <- "/path/to/runsheet/{OSD-Accession-ID}_microarray_v{version}_runsheet.csv" -# If using custom annotation, define path to directory containing annotation file and config -local_annotation_dir <- NULL # - ## Set up output structure # Output Constants @@ -175,8 +164,6 @@ dir.create(DIR_DGE) original_par <- par() options(preferRaster=TRUE) # use Raster when possible to avoid antialiasing artifacts in images -options(timeout=1000) # ensure enough time for data downloads - # Utility function to improve robustness of function calls # Used to remedy intermittent internet issues during runtime retry_with_delay <- function(func, ...) { @@ -209,6 +196,25 @@ retry_with_delay <- function(func, ...) 
{ df_rs <- read.csv(runsheet, check.names = FALSE) %>% dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character +## Determines the organism specific annotation file to use based on the organism in the runsheet +fetch_organism_specific_annotation_file_path <- function(organism) { + # Uses the GeneLab GL-DPPD-7110_annotations.csv file to find the organism specific annotation file path + # Raises an exception if the organism does not have an associated annotation file yet + + + all_organism_table <- read.csv("https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv") + + annotation_file_path <- all_organism_table %>% dplyr::filter(species == organism) %>% dplyr::pull(genelab_annots_link) + + # Guard clause: Ensure annotation_file_path populated + # Else: raise exception for unsupported organism + if (length(annotation_file_path) == 0) { + stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv. 
Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row")) + } + + return(annotation_file_path) +} +annotation_file_path <- retry_with_delay(fetch_organism_specific_annotation_file_path, unique(df_rs$organism)) allTrue <- function(i_vector) { if ( length(i_vector) == 0 ) { @@ -281,12 +287,9 @@ print(paste0("Number of Arrays: ", dim(raw_data)[2])) print(paste0("Number of Probes: ", dim(raw_data)[1])) ``` -**Parameter Definitions:** +**Input Data:** - `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet)) -- `local_annotation_dir` (Path to local annotation directory if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) - - > Note: If not using custom annotations, leave `local_annotation_dir` as `NULL`. **Output Data:** @@ -297,46 +300,6 @@ print(paste0("Number of Probes: ", dim(raw_data)[1]))
-### 2b. Load Annotation Metadata - -```R -## Determines the organism specific annotation file to use based on the organism in the runsheet -fetch_organism_specific_annotation_table <- function(organism) { - # Uses the latest GeneLab annotations table to find the organism specific annotation file path and ensembl version - # Raises an exception if the organism does not have an associated annotation file or ensembl version yet - - annotation_table_link <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv" - all_organism_table <- read.csv(annotation_table_link) - - annotation_table <- all_organism_table %>% dplyr::filter(species == organism) - - # Guard clause: Ensure annotation_table populated - # Else: raise exception for unsupported organism - if (nrow(annotation_table) == 0 || annotation_table$genelab_annots_link == "" || is.na(annotation_table$ensemblVersion)) { - stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: {annotation_table_link}. 
Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row and a version number in the 'ensemblVersion' column.")) - } - - return(annotation_table) -} - -annotation_table <- retry_with_delay(fetch_organism_specific_annotation_table, unique(df_rs$organism)) - -annotation_file_path <- annotation_table$genelab_annots_link -ensembl_version <- as.character(annotation_table$ensemblVersion) -``` - -**Parameter Definitions:** - -- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- `annotation_table_link` (URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv)) - -**Output Data:** - -- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`) -- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`) - -
- --- ## 3. Raw Data Quality Assessment @@ -352,11 +315,6 @@ par( ) number_of_sets = ceiling(dim(raw_data)[2] / 30) # Set of 30 samples, used to scale plot -scale_factor = 0.2 # Default scale factor - -if (max(nchar(colnames(raw_data@assayData$exprs))) > 35 & number_of_sets > 1) { # Scale more if sample names are long - scale_factor = if_else(number_of_sets == 2, 0.4, 0.25) -} oligo::hist(raw_data, transfo=log2, # Log2 transform raw intensity values @@ -367,7 +325,7 @@ legend("topright", legend = colnames(raw_data@assayData$exprs), lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types col = oligo::darkColors(n = ncol(raw_data)), # Ensure legend color is in sync with plot ncol = number_of_sets, # Set number of columns by number of sets - cex = max(0.35, 1 + scale_factor - (number_of_sets*scale_factor)) # Reduce for each column beyond 1 with minimum of 35% + cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35% ) # Reset par @@ -376,7 +334,7 @@ par(original_par) **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) **Output Data:** @@ -397,7 +355,7 @@ for ( i in seq_along(1:ncol(raw_data))) { **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) **Output Data:** @@ -410,8 +368,8 @@ for ( i in seq_along(1:ncol(raw_data))) { ```R if (inherits(raw_data, "GeneFeatureSet")) { print("Raw data is a GeneFeatureSet, using exprs() to access expression values and adding 0.0001 to avoid log(0)") -} else if (inherits(raw_data, "ExpressionSet") || inherits(raw_data, "ExpressionFeatureSet") || inherits(raw_data, "HTAFeatureSet")) { - print(paste0("Raw data is ", class(raw_data), ". 
Using default approach for this class for MA Plot")) +} else if (inherits(raw_data, "ExpressionSet")) { + print("Raw data is an ExpressionSet. Using default approach for this class for MA Plot") } if (inherits(raw_data, "GeneFeatureSet")) { @@ -421,20 +379,26 @@ if (inherits(raw_data, "GeneFeatureSet")) { ylim=c(-2, 4), main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string ) -} else if (inherits(raw_data, "ExpressionSet") || inherits(raw_data, "ExpressionFeatureSet") || inherits(raw_data, "HTAFeatureSet")) { +} else if (inherits(raw_data, "ExpressionSet")) { + MA_plot <- oligo::MAplot( + raw_data, + ylim=c(-2, 4), + main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string + ) +} else if (inherits(raw_data, "ExpressionFeatureSet")) { MA_plot <- oligo::MAplot( raw_data, ylim=c(-2, 4), main="" # This function uses 'main' as a suffix to the sample name. 
Here we want just the sample name, thus here main is an empty string ) } else { - stop(glue::glue("No strategy for MA plots for {class(raw_data)}")) + stop(glue::glue("No strategy for MA plots for {raw_data}")) } ``` **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) **Output Data:** @@ -469,7 +433,7 @@ par(original_par) **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) **Output Data:** @@ -487,7 +451,7 @@ background_corrected_data <- raw_data %>% oligo::backgroundCorrect(method="rma") **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2](#2-load-metadata-and-raw-data) above) **Output Data:** @@ -506,7 +470,7 @@ background_corrected_data <- raw_data %>% oligo::backgroundCorrect(method="rma") # Normalize background-corrected data using the quantile method norm_data <- oligo::normalize(background_corrected_data, method = "quantile", - target = "core" # Use oligo default: core metaprobeset mappings + target = "core" # Use oligo default: probes with probeset id mapping ) # Summarize background-corrected and normalized data @@ -552,7 +516,7 @@ legend("topright", legend = colnames(norm_data@assayData$exprs), lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types col = oligo::darkColors(n = ncol(norm_data)), # Ensure legend color is in sync with plot ncol = number_of_sets, # Set number of columns by number of sets - cex = max(0.35, 1 + scale_factor - (number_of_sets*scale_factor)) # Reduce for each column beyond 1 with minimum of 35% + cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with 
minimum of 35% ) # Reset par @@ -673,11 +637,11 @@ print(paste0("Number of Probesets: ", dim(unique(oligo::getProbeInfo(probeset_le --- -## 8. Probeset Annotations +## 8. Perform Probeset Differential Expression (DE)
-### 8a. Get Probeset Annotations +### 8a. Add Probeset Annotations ```R shortenedOrganismName <- function(long_name) { @@ -743,35 +707,25 @@ get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_port return(mapping) } -# Convert list of multi-mapped genes to string -listToUniquePipedString <- function(str_list) { - #! convert lists into strings denoting unique elements separated by '|' characters - #! e.g. c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3" - return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|")) -} - organism <- shortenedOrganismName(unique(df_rs$organism)) -annot_key <- ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL') if (organism %in% c("athaliana")) { - ENSEMBL_VERSION = ensembl_version + ensembl_genomes_version = "54" ensembl_genomes_portal = "plants" - print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ENSEMBL_VERSION}")) + print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. 
Ensembl genomes portal: {ensembl_genomes_portal}, version: {ensembl_genomes_version}")) expected_attribute_name <- getBioMartAttribute(df_rs) df_mapping <- retry_with_delay( get_ensembl_genomes_mappings_from_ftp, organism = organism, ensembl_genomes_portal = ensembl_genomes_portal, - ensembl_genomes_version = ENSEMBL_VERSION, + ensembl_genomes_version = ensembl_genomes_version, biomart_attribute = expected_attribute_name ) # TAIR from the mapping tables tend to be in the format 'AT1G01010.1' but the raw data has 'AT1G01010' # So here we remove the '.NNN' from the mapping table where .NNN is any number df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "") - - use_custom_annot <- FALSE } else { # Use biomart from main Ensembl website which archives keep each release on the live service # locate dataset @@ -780,189 +734,92 @@ if (organism %in% c("athaliana")) { # Specify Ensembl version used in current GeneLab reference annotations - ENSEMBL_VERSION <- ensembl_version + ENSEMBL_VERSION <- '107' print(glue::glue("Using Ensembl biomart to get specific version of mapping table. 
Ensembl version: {ENSEMBL_VERSION}")) - # Check if organism in supported in biomart - ensembl <- biomaRt::useEnsembl(biomart = "genes") - ensembl_datasets <- biomaRt::listDatasets(ensembl) - use_custom_annot <- !expected_dataset_name %in% ensembl_datasets$dataset - - if (use_custom_annot) { - unloadNamespace("biomaRt") - } else { - - ensembl <- biomaRt::useEnsembl(biomart = "genes", - dataset = expected_dataset_name, - version = ENSEMBL_VERSION) - print(ensembl) - - expected_attribute_name <- getBioMartAttribute(df_rs) - print(paste0("Expected attribute name: '", expected_attribute_name, "'")) - - # Some probe_ids for affy_hta_2_0 may end in .hg.1 instead of .hg (how it is in biomaRt), leading to 0 results returned - if (expected_attribute_name == 'affy_hta_2_0') { - rownames(probeset_level_data) <- stringr::str_replace(rownames(probeset_level_data), '\\.hg\\.1$', '.hg') - } - - probe_ids <- rownames(probeset_level_data) - - # Create probe map - # Run Biomart Queries in chunks to prevent request timeouts - # Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size - CHUNK_SIZE= 1500 - probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE)) - df_mapping <- data.frame() - for (i in seq_along(probe_id_chunks)) { - probe_id_chunk <- probe_id_chunks[[i]] - print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. 
Total probes IDS in query ({length(probe_id_chunk)})")) - chunk_results <- biomaRt::getBM( - attributes = c( - expected_attribute_name, - "ensembl_gene_id" - ), - filters = expected_attribute_name, - values = probe_id_chunk, - mart = ensembl) - - if (nrow(chunk_results) > 0) { - df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results) - } - - Sys.sleep(10) # Slight break between requests to prevent back-to-back requests - } + ensembl <- biomaRt::useEnsembl(biomart = "genes", + dataset = expected_dataset_name, + version = ENSEMBL_VERSION) + print(ensembl) + expected_attribute_name <- getBioMartAttribute(df_rs) + print(paste0("Expected attribute name: '", expected_attribute_name, "'")) + + probe_ids <- rownames(probeset_level_data) + + # Create probe map + # Run Biomart Queries in chunks to prevent request timeouts + # Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size + CHUNK_SIZE= 1500 + probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE)) + df_mapping <- data.frame() + for (i in seq_along(probe_id_chunks)) { + probe_id_chunk <- probe_id_chunks[[i]] + print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. 
Total probes IDS in query ({length(probe_id_chunk)})")) + chunk_results <- biomaRt::getBM( + attributes = c( + expected_attribute_name, + "ensembl_gene_id" + ), + filters = expected_attribute_name, + values = probe_id_chunk, + mart = ensembl) + + df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results) + Sys.sleep(10) # Slight break between requests to prevent back-to-back requests } } # At this point, we have df_mapping from either the biomart live service or the ensembl genomes ftp archive depending on the organism -# If no df_mapping obtained (e.g., organism not supported in biomart), use custom annotations; otherwise, merge in-house annotations to df_mapping -if (use_custom_annot) { - expected_attribute_name <- 'ProbesetID' - - annot_type <- 'NO_CUSTOM_ANNOT' - if (!is.null(local_annotation_dir) && file.exists(file.path(local_annotation_dir, 'config.csv'))) { - config_df <- read.csv(file.path(local_annotation_dir, 'config.csv'), row.names=1) - if (df_rs$`biomart_attribute` %in% row.names(config_df)) { - annot_config <- config_df[df_rs$`biomart_attribute`, ] - annot_type <- annot_config$annot_type[[1]] - } else { - warning(paste0("No entry for '", df_rs$`biomart_attribute`, "' in provided config.csv")) - } - } else { - warning(paste0("No 'config.csv' file found in path (--referenceStorePath): ", local_annotation_dir)) - } - - if (annot_type == '3prime-IVT') { - unique_probe_ids <- read.csv( - file.path(local_annotation_dir, annot_config$annot_filename[[1]]), - skip = 13, header = TRUE, na.strings = c('NA', '---') - )[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq.Transcript.ID', 'RefSeq.Protein.ID', 'Gene.Ontology.Biological.Process', 'Gene.Ontology.Cellular.Component', 'Gene.Ontology.Molecular.Function')] - - # Clean columns - unique_probe_ids$Gene.Symbol <- purrr::map_chr(stringr::str_split(unique_probe_ids$Gene.Symbol, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', 
NA_character_) - unique_probe_ids$Gene.Title <- purrr::map_chr(stringr::str_split(unique_probe_ids$Gene.Title, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) - unique_probe_ids$Entrez.Gene <- purrr::map_chr(stringr::str_split(unique_probe_ids$Entrez.Gene, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) - unique_probe_ids$Ensembl <- purrr::map_chr(stringr::str_split(unique_probe_ids$Ensembl, stringr::fixed(' /// ')), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('NA', NA_character_) - - unique_probe_ids$RefSeq <- paste(unique_probe_ids$RefSeq.Transcript.ID, unique_probe_ids$RefSeq.Protein.ID) - unique_probe_ids$RefSeq <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$RefSeq, '[A-Z]+_[\\d.]+'), ~paste0(unique(.), collapse = "|")) %>% stringr::str_replace('^$', NA_character_) - - unique_probe_ids$GO <- paste(unique_probe_ids$Gene.Ontology.Biological.Process, unique_probe_ids$Gene.Ontology.Cellular.Component, unique_probe_ids$Gene.Ontology.Molecular.Function) - unique_probe_ids$GO <- purrr::map_chr(stringr::str_extract_all(unique_probe_ids$GO, '\\d{7}'), ~paste0('GO:', unique(.), collapse = "|")) %>% stringr::str_replace('^GO:$', NA_character_) - - unique_probe_ids <- unique_probe_ids[c('Probe.Set.ID', 'Entrez.Gene', 'Gene.Symbol', 'Gene.Title', 'Ensembl', 'RefSeq', 'GO')] - names(unique_probe_ids) <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS') - - unique_probe_ids$STRING_id <- NA_character_ - - gene_col <- 'ENSEMBL' - if (sum(!is.na(unique_probe_ids$ENTREZID)) > sum(!is.na(unique_probe_ids$ENSEMBL))) { - gene_col <- 'ENTREZID' - } - if (sum(!is.na(unique_probe_ids$SYMBOL)) > sum(!is.na(unique_probe_ids$ENTREZID))) { - gene_col <- 'SYMBOL' - } - - unique_probe_ids <- unique_probe_ids %>% - dplyr::mutate( - count_gene_mappings = 1 + stringr::str_count(get(gene_col), 
stringr::fixed("|")), - gene_mapping_source = gene_col - ) - } else if (annot_type == 'custom') { - unique_probe_ids <- read.csv( - file.path(local_annotation_dir, annot_config$annot_filename[[1]]), - header = TRUE, na.strings = c('NA', '') - ) - } else { - annot_cols <- c('ProbesetID', 'ENTREZID', 'SYMBOL', 'GENENAME', 'ENSEMBL', 'REFSEQ', 'GOSLIM_IDS', 'STRING_id', 'count_gene_mappings', 'gene_mapping_source') - unique_probe_ids <- setNames(data.frame(matrix(NA_character_, nrow = 1, ncol = length(annot_cols))), annot_cols) - } -} else { - annot <- read.table( - as.character(annotation_file_path), - sep = "\t", - header = TRUE, - quote = "", - comment.char = "" - ) +listToUniquePipedString <- function(str_list) { + #! convert lists into strings denoting unique elements separated by '|' characters + #! e.g. c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3" + return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|")) +} - unique_probe_ids <- df_mapping %>% - dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type - dplyr::group_by(!!sym(expected_attribute_name)) %>% - dplyr::summarise( - ENSEMBL = listToUniquePipedString(ensembl_gene_id) - ) %>% - # Count number of ensembl IDS mapped - dplyr::mutate( - count_gene_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")), - gene_mapping_source = annot_key +unique_probe_ids <- df_mapping %>% + dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type + dplyr::group_by(!!sym(expected_attribute_name)) %>% + dplyr::summarise( + ENSEMBL = listToUniquePipedString(ensembl_gene_id) ) %>% - dplyr::left_join(annot, by = c("ENSEMBL" = annot_key)) -} + # Count number of ensembl IDS mapped + dplyr::mutate( + count_ENSEMBL_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|")) + ) probeset_expression_matrix <- 
oligo::exprs(probeset_level_data) -probeset_expression_matrix.gene_mapped <- probeset_expression_matrix %>% +probeset_expression_matrix.biomart_mapped <- probeset_expression_matrix %>% as.data.frame() %>% tibble::rownames_to_column(var = "ProbesetID") %>% # Ensure rownames (probeset IDs) can be used as join key dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% - dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% - dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) + dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) ``` -**Parameter Definitions:** - -- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- `df_rs$biomart_attribute` (array design biomart identifier specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) -- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) -- `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. mus musculus uses 'ENSEMBL') -- `local_annotation_dir` (Path to local annotation directory if using custom annotations, defined in [Step 2a](#2a-load-metadata-and-raw-data)) - - > Note: See [here](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md) for details on what to include in this directory. 
- **Input Data:** +- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) +- `df_rs$'biomart_attribute'` (array design biomart identifier specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) +- ENSEMBL_VERSION (reference organism Ensembl version indicated in the `ensemblVersion` column of the [GL-DPPD-7110_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv) GeneLab Annotations file) - `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization)) **Output Data:** -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations) -- `unique_probe_ids` (R object containing probeset ID to gene annotation mappings) +- `probeset_expression_matrix.biomart_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html))
-### 8b. Summarize Gene Mapping +### 8b. Summarize Biomart Mapping ```R # Pie Chart with Percentages slices <- c( - 'Unique Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings == 1) %>% dplyr::distinct(ProbesetID)), - 'Multi Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings > 1) %>% dplyr::distinct(ProbesetID)), - 'No Mapping' = nrow(probeset_expression_matrix.gene_mapped %>% dplyr::filter(count_gene_mappings == 0) %>% dplyr::distinct(ProbesetID)) + 'Unique Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings == 1) %>% dplyr::distinct(ProbesetID)), + 'Multi Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings > 1) %>% dplyr::distinct(ProbesetID)), + 'No Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings == 0) %>% dplyr::distinct(ProbesetID)) ) pct <- round(slices/sum(slices)*100) chart_names <- names(slices) @@ -970,130 +827,24 @@ chart_names <- glue::glue("{names(slices)} ({slices})") # add count to labels chart_names <- paste(chart_names, pct) # add percents to labels chart_names <- paste(chart_names,"%",sep="") # ad % to labels pie(slices,labels = chart_names, col=rainbow(length(slices)), - main=glue::glue("Mapping to Primary Keytype\n {nrow(probeset_expression_matrix.gene_mapped %>% dplyr::distinct(ProbesetID))} Total Unique Probesets") + main=glue::glue("Biomart Mapping to Ensembl Primary Keytype\n {nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::distinct(ProbesetID))} Total Unique Probesets") ) -print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) +print(glue::glue("Biomart Unique Mapping Count: {slices[['Unique Mapping']]}")) ``` **Input Data:** -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with 
gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) +- `probeset_expression_matrix.biomart_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html), output from [Step 8a](#8a-add-probeset-annotations) above) **Output Data:** -- A pie chart denoting the gene mapping rates for each unique probeset ID -- A printout denoting the count of unique mappings for gene mapping +- A pie chart denoting the biomart mapping rates for each unique probeset ID +- A printout denoting the count of unique mappings for biomart mapping
-### 8c. Save Annotated Tables - -```R -## Reorder columns before saving to file -ANNOTATIONS_COLUMN_ORDER = c( - annot_key, - "SYMBOL", - "GENENAME", - "REFSEQ", - "ENTREZID", - "STRING_id", - "GOSLIM_IDS" -) - -SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name` - -probeset_expression_matrix.gene_mapped <- probeset_expression_matrix.gene_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) - -## Output column subset file with just normalized probeset level expression values -write.csv( - probeset_expression_matrix.gene_mapped[c( - ANNOTATIONS_COLUMN_ORDER, - "ProbesetID", - "count_gene_mappings", - "gene_mapping_source", - SAMPLE_COLUMN_ORDER) - ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE) - -## Determine column order for probe level tables - -PROBE_INFO_COLUMN_ORDER = c( - "ProbesetID", - "ProbeID", - "count_gene_mappings", - "gene_mapping_source" -) - -FINAL_COLUMN_ORDER <- c( - ANNOTATIONS_COLUMN_ORDER, - PROBE_INFO_COLUMN_ORDER, - SAMPLE_COLUMN_ORDER -) - -## Generate raw intensity matrix that includes annotations - -background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key - dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing - dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid - dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID - dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID - dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings - dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% # Convert NA mapping to 0 - dplyr::mutate( 
gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) %>% - dplyr::rename( !!annot_key := ENSEMBL ) - -## Perform reordering -background_corrected_data_annotated <- background_corrected_data_annotated %>% - dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) - -write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE) - -## Generate normalized expression matrix that includes annotations -norm_data_matrix_annotated <- oligo::exprs(norm_data) %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key - dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing - dplyr::right_join(oligo::getProbeInfo(norm_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid - dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID - dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID - dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% - dplyr::mutate( count_gene_mappings := ifelse(is.na(count_gene_mappings), 0, count_gene_mappings) ) %>% # Convert NA mapping to 0 - dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) %>% - dplyr::rename( !!annot_key := ENSEMBL ) - -norm_data_matrix_annotated <- norm_data_matrix_annotated %>% - dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) - -write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE) -``` - -**Parameter Definitions:** - -- `df_rs[['Sample Name']]` (sample names specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. 
mus musculus uses 'ENSEMBL', defined in [Step 8a](#8a-get-probeset-annotations)) - -**Input Data:** - -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) -- `background_corrected_data` (R object containing background-corrected microarray data created in [Step 4](#4-background-correction)) -- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization)) -- `unique_probe_ids` (R object containing probeset ID to gene annotation mappings, output from [Step 8a](#8a-get-probeset-annotations)) - -**Output Data:** - -- **normalized_expression_probeset_GLmicroarray.csv** (table containing the background corrected, normalized probeset expression values for each sample. The ProbesetID is the unique index column.) -- **raw_intensities_probe_GLmicroarray.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.) -- **normalized_intensities_probe_GLmicroarray.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.) - -## 9. Perform Probeset Differential Expression (DE) - -> Note: Run differential expression analysis only if there is at least 1 replicate per factor group. - -
- -### 9a. Generate Design Matrix +### 8c. Generate Design Matrix ```R # Pull all factors for each sample in the study from the runsheet created in Step 1 @@ -1160,7 +911,7 @@ write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv"
-### 9b. Perform Individual Probeset Level DE +### 8d. Perform Individual Probeset Level DE ```R lmFitPairwise <- function(norm_data, design) { @@ -1190,16 +941,12 @@ limma::write.fit(res, adjust = 'BH', row.names = FALSE, quote = TRUE, sep = ",") - -### Generate and export PCA table for GeneLab visualization plots -PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed -write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")) ``` **Input Data:** - `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization)) -- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, created in [Step 9a](#9a-generate-design-matrix) above) +- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, created in [Step 8c](#8c-generate-design-matrix) above) - `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization)) **Output Data:** @@ -1209,11 +956,10 @@ write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.cs - T statistic for all pairwise comparison tests - P value for all pairwise comparison tests - Adjusted P value for all pairwise comparison tests) -- visualization_PCA_table_GLmicroarray.csv (file used to generate GeneLab PCA plots)
-### 9c. Save DE Table +### 8e. Add Additional Columns and Format DE Table ```R ## Reformat Table for consistency across DE analyses tables within GeneLab ## @@ -1221,9 +967,9 @@ write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.cs # Read in DE table df_interim <- read.csv("INTERIM.csv") -# Bind columns from gene mapped expression table +# Bind columns from biomart mapped expression table df_interim <- df_interim %>% - dplyr::bind_cols(probeset_expression_matrix.gene_mapped) + dplyr::bind_cols(probeset_expression_matrix.biomart_mapped) # Reformat column names reformat_names <- function(colname, group_name_mapping) { @@ -1235,7 +981,7 @@ reformat_names <- function(colname, group_name_mapping) { stringr::str_replace(pattern = ".condition", replacement = "v") # remap to group names before make.names was applied - unique_group_name_mapping <- unique(group_name_mapping) %>% arrange(-nchar(safe_name)) + unique_group_name_mapping <- unique(group_name_mapping) for ( i in seq(nrow(unique_group_name_mapping)) ) { safe_name <- unique_group_name_mapping[i,]$safe_name original_name <- unique_group_name_mapping[i,]$original_name @@ -1245,7 +991,7 @@ reformat_names <- function(colname, group_name_mapping) { return(new_colname) } -df_interim <- df_interim %>% dplyr::rename_with(reformat_names, .cols = matches('\\.condition'), group_name_mapping = design_data$mapping) +df_interim <- df_interim %>% dplyr::rename_with( reformat_names, group_name_mapping = design_data$mapping ) ## Add Group Wise Statistics ## @@ -1278,10 +1024,11 @@ for ( i in seq_along(unique_groups) ) { as.data.frame() } +all_samples <- design_data$group %>% dplyr::pull(sample) df_interim <- df_interim %>% dplyr::mutate( - "All.mean" := rowMeans(dplyr::select(., all_of(SAMPLE_COLUMN_ORDER))), - "All.stdev" := matrixStats::rowSds(as.matrix(dplyr::select(., all_of(SAMPLE_COLUMN_ORDER)))), + "All.mean" := rowMeans(dplyr::select(., all_of(all_samples))), + "All.stdev" := 
matrixStats::rowSds(as.matrix(dplyr::select(., all_of(all_samples)))), ) %>% dplyr::ungroup() %>% as.data.frame() @@ -1295,12 +1042,56 @@ colnames_to_remove = c( df_interim <- df_interim %>% dplyr::select(-any_of(colnames_to_remove)) +## Concatenate annotations for genes (for uniquely mapped probes) ## +### Read in annotation table for the appropriate organism ### +annot <- read.table( + annotation_file_path, + sep = "\t", + header = TRUE, + quote = "", + comment.char = "", + ) + +# Join annotation table and uniquely mapped data + +# Determine appropriate keytype as found in annotation tables +map_primary_keytypes <- c( + 'Caenorhabditis elegans' = 'ENSEMBL', + 'Danio rerio' = 'ENSEMBL', + 'Drosophila melanogaster' = 'ENSEMBL', + 'Rattus norvegicus' = 'ENSEMBL', + 'Saccharomyces cerevisiae' = 'ENSEMBL', + 'Homo sapiens' = 'ENSEMBL', + 'Mus musculus' = 'ENSEMBL', + 'Arabidopsis thaliana' = 'TAIR' +) + +df_interim <- merge( + annot, + df_interim, + by.x = map_primary_keytypes[[unique(df_rs$organism)]], + by.y = "ENSEMBL", + # ensure all original dge rows are kept. 
+ # If unmatched in the annotation database, then fill missing with NA + all.y = TRUE + ) + +## Reorder columns before saving to file +ANNOTATIONS_COLUMN_ORDER = c( + map_primary_keytypes[[unique(df_rs$organism)]], + "SYMBOL", + "GENENAME", + "REFSEQ", + "ENTREZID", + "STRING_id", + "GOSLIM_IDS" +) + PROBE_INFO_COLUMN_ORDER = c( "ProbesetID", - "count_gene_mappings", - "gene_mapping_source" + "count_ENSEMBL_mappings" ) - +SAMPLE_COLUMN_ORDER <- all_samples generate_prefixed_column_order <- function(subjects, prefixes) { #' Return a vector of columns based on subject and given prefixes #' Used for both contrasts and groups column name generation @@ -1368,15 +1159,106 @@ df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) # Save to file write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.csv"), row.names = FALSE) + +## Output column subset file with just normalized probeset level expression values +write.csv( + df_interim[c( + ANNOTATIONS_COLUMN_ORDER, + "ProbesetID", + "count_ENSEMBL_mappings", + all_samples) + ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE) + +### Generate and export PCA table for GeneLab visualization plots +PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed +write.csv(PCA_raw$x, + file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv") + ) + +## Generate raw intensity matrix that includes annotations + +background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key + dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing + dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based 
on fid + dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID + dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID + dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings + dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table + dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0 + dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) + +## Determine column order for probe level tables + +PROBE_INFO_COLUMN_ORDER = c( + "ProbesetID", + "ProbeID", + "count_ENSEMBL_mappings" +) + +FINAL_COLUMN_ORDER <- c( + ANNOTATIONS_COLUMN_ORDER, + PROBE_INFO_COLUMN_ORDER, + SAMPLE_COLUMN_ORDER + ) + +## Generate raw intensity matrix that includes annotations + +background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key + dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing + dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid + dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID + dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID + dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings + dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table + 
dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0 + dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) + +## Perform reordering +background_corrected_data_annotated <- background_corrected_data_annotated %>% + dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) + +write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE) + +## Generate normalized expression matrix that includes annotations +norm_data_matrix_annotated <- oligo::exprs(norm_data) %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probeset IDs) can be used as join key + dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing + dplyr::right_join(oligo::getProbeInfo(norm_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid + dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID + dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID + dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% + dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table + dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0 + dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) + + + +norm_data_matrix_annotated <- norm_data_matrix_annotated %>% + dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) + +write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE) + ``` **Input Data:** -- INTERIM.csv (Statistical values from individual 
probeset level DE analysis, output from [Step 9b](#9b-perform-individual-probeset-level-de) above) -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) +- INTERIM.csv (Statistical values from individual probeset level DE analysis, output from [Step 8d](#8d-perform-individual-probeset-level-de) above) +- `annotation_file_path` (Annotation file url from 'genelab_annots_link' column of [GL-DPPD-7110_annotations.csv](https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv) corresponding to the subject organism) +- `primary_keytype` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. mus musculus uses 'ENSEMBL') +- `background_corrected_data` (R object containing background-corrected microarray data) +- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization)) **Output Data:** - **differential_expression_GLmicroarray.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probeset DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.) +- **normalized_expression_probeset_GLmicroarray.csv** (table containing the background corrected, normalized probeset expression values for each sample. The ProbesetID is the unique index column.) 
+- visualization_PCA_table_GLmicroarray.csv (file used to generate GeneLab PCA plots) +- **raw_intensities_probe_GLmicroarray.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.) +- **normalized_intensities_probe_GLmicroarray.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.) > All steps of the Microarray pipeline are performed using R markdown and the completed R markdown is rendered (via Quarto) as an html file (**NF_MAAffymetrix_v\*_GLmicroarray.html**) and published in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/) for the respective dataset. \ No newline at end of file From 683e0c33988ab464fa86b87598b2d9213ce0e447 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 19 Jan 2025 13:37:22 -0800 Subject: [PATCH 05/25] NF_MAAffymetrix: track array annotations --- .../Array_Annotations/Affymetrix_array_annotations.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Microarray/Affymetrix/Array_Annotations/Affymetrix_array_annotations.csv diff --git a/Microarray/Affymetrix/Array_Annotations/Affymetrix_array_annotations.csv b/Microarray/Affymetrix/Array_Annotations/Affymetrix_array_annotations.csv new file mode 100644 index 00000000..03ea9b68 --- /dev/null +++ b/Microarray/Affymetrix/Array_Annotations/Affymetrix_array_annotations.csv @@ -0,0 +1,3 @@ +array_design,annot_type,annot_filename,download_link,download_date +AFFY E coli Genome 2 0,3prime-IVT,E_coli_2.na36.annot.csv,https://www.thermofisher.com/order/catalog/product/sec/assets?url=TFS-Assets/LSG/Support-Files/E_coli_2-na36-annot-csv.zip,2024-06-15 +AFFY GeneChip P. 
aeruginosa Genome,3prime-IVT,Pae_G1a.na36.annot.csv,https://www.thermofisher.com/order/catalog/product/sec/assets?url=TFS-Assets/LSG/Support-Files/Pae_G1a-na36-annot-csv.zip,2024-06-15 From 8e6a96a520707e02a582557b226c25451ba19d95 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 19 Jan 2025 19:31:29 -0800 Subject: [PATCH 06/25] NF_MAAffymetrix: #113 update custom annotations config --- .../GL-DPPD-7114-A.md | 22 ++++++++++++------- .../examples/annotations/README.md | 4 ++-- .../workflow_code/bin/Affymetrix.qmd | 18 ++++++++------- .../workflow_code/config/default.config | 1 + .../modules/PROCESS_AFFYMETRIX.nf | 1 + 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index c2cdc0bb..4fe07165 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -155,8 +155,9 @@ options(dplyr.summarise.inform = FALSE) # Don't print out '`summarise()` has gro # Define path to runsheet runsheet <- "/path/to/runsheet/{OSD-Accession-ID}_microarray_v{version}_runsheet.csv" -# If using custom annotation, define path to directory containing annotation file and config +# If using custom annotation, local_annotation_dir is path to directory containing annotation file and annotation_config_path is path/url to config file local_annotation_dir <- NULL # +annotation_config_path <- NULL # ## Set up output structure @@ -288,6 +289,10 @@ print(paste0("Number of Probes: ", dim(raw_data)[1])) > Note: If not using custom annotations, leave `local_annotation_dir` as `NULL`. +- `annotation_config_path` (URL or path to annotation config file if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) + + > Note: If not using custom annotations, leave `annotation_config_path` as `NULL`. 
+ **Output Data:** - `df_rs` (R dataframe containing information from the runsheet) @@ -843,16 +848,16 @@ if (use_custom_annot) { expected_attribute_name <- 'ProbesetID' annot_type <- 'NO_CUSTOM_ANNOT' - if (!is.null(local_annotation_dir) && file.exists(file.path(local_annotation_dir, 'config.csv'))) { - config_df <- read.csv(file.path(local_annotation_dir, 'config.csv'), row.names=1) - if (df_rs$`biomart_attribute` %in% row.names(config_df)) { - annot_config <- config_df[df_rs$`biomart_attribute`, ] + if (!is.null(local_annotation_dir) && !is.null(annotation_config_path)) { + config_df <- read.csv(annotation_config_path, row.names=1) + if (unique(df_rs$`biomart_attribute`) %in% row.names(config_df)) { + annot_config <- config_df[unique(df_rs$`biomart_attribute`), ] annot_type <- annot_config$annot_type[[1]] } else { - warning(paste0("No entry for '", df_rs$`biomart_attribute`, "' in provided config.csv")) + warning(paste0("No entry for '", unique(df_rs$`biomart_attribute`), "' in provided config file: ", annotation_config_path)) } } else { - warning(paste0("No 'config.csv' file found in path (--referenceStorePath): ", local_annotation_dir)) + warning("Need to provide both local_annotation_dir and annotation_config_path to use custom annotation.") } if (annot_type == '3prime-IVT') { @@ -941,8 +946,9 @@ probeset_expression_matrix.gene_mapped <- probeset_expression_matrix %>% - `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) - `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. 
mus musculus uses 'ENSEMBL') - `local_annotation_dir` (Path to local annotation directory if using custom annotations, defined in [Step 2a](#2a-load-metadata-and-raw-data)) +- `annotation_config_path` (URL or path to annotation config file if using custom annotations, defined in [Step 2a](#2a-load-metadata-and-raw-data)) - > Note: See [here](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md) for details on what to include in this directory. + > Note: See [Affymetrix_array_annotations.csv](../Array_Annotations/Affymetrix_array_annotations.csv) for the latest config file used at GeneLab. This file can also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md). **Input Data:** diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md index 33e59b9f..59f2dd4a 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md @@ -2,8 +2,8 @@ ## Description -* If using custom gene annotations when processing Affymetrix datasets through GeneLab's Affymetrix processing pipeline, a csv file named `config.csv` must be provided as specified below. -* Both the `config.csv` and custom annotations files must be placed in the directory specified by `local_annotation_dir` in the pipeline. +* If using custom gene annotations when processing Affymetrix datasets through GeneLab's Affymetrix processing pipeline, a csv config file must be provided as specified below. +* See [Affymetrix_array_annotations.csv](../../../../Array_Annotations/Affymetrix_array_annotations.csv) for the latest config file used at GeneLab. 
## Example diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index c319d3c0..53012fd7 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -21,6 +21,7 @@ params: annotation_file_path: NULL # str, Annotation file from 'genelab_annots_link' column of https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv ensembl_version: NULL # str, Used to determine ensembl version local_annotation_dir: NULL + annotation_config_path: NULL DEBUG_limit_biomart_query: NULL # int, If supplied, only the first n probeIDs are queried run_DE: 'true' @@ -47,8 +48,9 @@ if (is.null(params$runsheet)) { runsheet <- params$runsheet # -# If using custom annotation, local_annotation_dir is path to directory containing annotation file and config -local_annotation_dir <- params$local_annotation_dir # +# If using custom annotation, local_annotation_dir is path to directory containing annotation file and annotation_config_path is path/url to config file +local_annotation_dir <- params$local_annotation_dir # +annotation_config_path <- params$annotation_config_path # message(params) @@ -671,16 +673,16 @@ if (use_custom_annot) { expected_attribute_name <- 'ProbesetID' annot_type <- 'NO_CUSTOM_ANNOT' - if (!is.null(local_annotation_dir) && file.exists(file.path(local_annotation_dir, 'config.csv'))) { - config_df <- read.csv(file.path(local_annotation_dir, 'config.csv'), row.names=1) - if (df_rs$`biomart_attribute` %in% row.names(config_df)) { - annot_config <- config_df[df_rs$`biomart_attribute`, ] + if (!is.null(local_annotation_dir) && !is.null(annotation_config_path)) { + 
config_df <- read.csv(annotation_config_path, row.names=1) + if (unique(df_rs$`biomart_attribute`) %in% row.names(config_df)) { + annot_config <- config_df[unique(df_rs$`biomart_attribute`), ] annot_type <- annot_config$annot_type[[1]] } else { - warning(paste0("No entry for '", df_rs$`biomart_attribute`, "' in provided config.csv")) + warning(paste0("No entry for '", unique(df_rs$`biomart_attribute`), "' in provided config file: ", annotation_config_path)) } } else { - warning(paste0("No 'config.csv' file found in path (--referenceStorePath): ", local_annotation_dir)) + warning("Need to provide both local_annotation_dir and annotation_config_path to use custom annotation.") } if (annot_type == '3prime-IVT') { diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config index b63c8772..949e0f89 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config @@ -42,6 +42,7 @@ params { */ // For now, this particular is good for all organisms listed on the file. 
annotation_file_path = "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv" + annotation_config_path = "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/Microarray/Affymetrix/Array_Annotations/Affymetrix_array_annotations.csv" /* DEBUG related parameters, not likely useful in production diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PROCESS_AFFYMETRIX.nf b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PROCESS_AFFYMETRIX.nf index e8947df1..32d104a6 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PROCESS_AFFYMETRIX.nf +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PROCESS_AFFYMETRIX.nf @@ -33,6 +33,7 @@ process PROCESS_AFFYMETRIX { -P 'annotation_file_path:${annotation_file_path}' \ -P 'ensembl_version:${ensemblVersion}' \ -P 'local_annotation_dir:${params.referenceStorePath}' \ + -P 'annotation_config_path:${params.annotation_config_path}' \ ${limit_biomart_query_parameter} \ ${run_DE} From 4b490ba31b7a2896f24d88dd7c724b501c124e76 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 19 Jan 2025 19:37:48 -0800 Subject: [PATCH 07/25] NF_MAAffymetrix: use reference annotations GL-DPPD-7110-A --- .../NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd | 2 +- .../NF_MAAffymetrix/workflow_code/config/default.config | 2 +- .../workflow_code/modules/PARSE_ANNOTATION_TABLE.nf | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index 53012fd7..cc5d3a19 100644 --- 
a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -18,7 +18,7 @@ params: id: NULL # str, used to name output files runsheet: NULL # str, path to runsheet biomart_attribute: NULL # str, used as a fallback value if 'Array Design REF' column is not found in the runsheet - annotation_file_path: NULL # str, Annotation file from 'genelab_annots_link' column of https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv + annotation_file_path: NULL # str, Annotation file from 'genelab_annots_link' column of GeneLab Annotations file ensembl_version: NULL # str, Used to determine ensembl version local_annotation_dir: NULL annotation_config_path: NULL diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config index 949e0f89..70dba72d 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config @@ -41,7 +41,7 @@ params { Parameters that SHOULD NOT be overwritten */ // For now, this particular is good for all organisms listed on the file. 
- annotation_file_path = "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv" + annotation_file_path = "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv" annotation_config_path = "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/Microarray/Affymetrix/Array_Annotations/Affymetrix_array_annotations.csv" /* diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PARSE_ANNOTATION_TABLE.nf b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PARSE_ANNOTATION_TABLE.nf index a6a39281..74addb06 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PARSE_ANNOTATION_TABLE.nf +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/PARSE_ANNOTATION_TABLE.nf @@ -1,6 +1,6 @@ process PARSE_ANNOTATION_TABLE { // Extracts data from this kind of table: - // https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv + // https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv input: val(annotations_csv_url_string) @@ -22,7 +22,7 @@ process PARSE_ANNOTATION_TABLE { organism_key = organism_sci.capitalize().replace("_"," ") // fasta_url = organisms[organism_key][5] // gtf_url = organisms[organism_key][6] - annotations_db_url = organisms[organism_key][9] + annotations_db_url = organisms[organism_key][10] ensemblVersion = organisms[organism_key][3] ensemblSource = organisms[organism_key][4] From 
645b2580aba4fe0403c98d860fca5af4dfd3eda3 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 19 Jan 2025 20:32:25 -0800 Subject: [PATCH 08/25] NF_MAAffymetrix: update tool versions --- .../GL-DPPD-7114-A.md | 26 +++++++++---------- .../bin/dp_tools__affymetrix/checks.py | 6 ++--- .../bin/dp_tools__affymetrix_skipDE/checks.py | 6 ++--- .../config/software/by_docker_image.config | 4 +-- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index 4fe07165..9e96c9f1 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -57,21 +57,21 @@ Lauren Sanders (acting GeneLab Project Scientist) |Program|Version|Relevant Links| |:------|:------:|:-------------| -|R|4.1.3|[https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)| -|DT|0.26|[https://github.com/rstudio/DT](https://github.com/rstudio/DT)| -|dplyr|1.0.10|[https://dplyr.tidyverse.org](https://dplyr.tidyverse.org)| -|tibble|3.1.8|[https://tibble.tidyverse.org](https://tibble.tidyverse.org)| -|stringr|1.5.0|[https://stringr.tidyverse.org](https://stringr.tidyverse.org)| -|R.utils|2.12.2|[https://github.com/HenrikBengtsson/R.utils](https://github.com/HenrikBengtsson/R.utils)| -|oligo|1.58.0|[https://bioconductor.org/packages/3.14/bioc/html/oligo.html](https://bioconductor.org/packages/3.14/bioc/html/oligo.html)| -|limma|3.50.3|[https://bioconductor.org/packages/3.14/bioc/html/limma.html](https://bioconductor.org/packages/3.14/bioc/html/limma.html)| -|glue|1.6.2|[https://glue.tidyverse.org](https://glue.tidyverse.org)| -|biomaRt|2.50.0|[https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html)| 
-|matrixStats|0.63.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)| +|R|4.4.2|[https://www.r-project.org/](https://www.r-project.org/)| +|DT|0.33|[https://github.com/rstudio/DT](https://github.com/rstudio/DT)| +|dplyr|1.1.4|[https://dplyr.tidyverse.org](https://dplyr.tidyverse.org)| +|tibble|3.2.1|[https://tibble.tidyverse.org](https://tibble.tidyverse.org)| +|stringr|1.5.1|[https://stringr.tidyverse.org](https://stringr.tidyverse.org)| +|R.utils|2.12.3|[https://github.com/HenrikBengtsson/R.utils](https://github.com/HenrikBengtsson/R.utils)| +|oligo|1.70.0|[https://bioconductor.org/packages/3.20/bioc/html/oligo.html](https://bioconductor.org/packages/3.20/bioc/html/oligo.html)| +|limma|3.62.2|[https://bioconductor.org/packages/3.20/bioc/html/limma.html](https://bioconductor.org/packages/3.20/bioc/html/limma.html)| +|glue|1.8.0|[https://glue.tidyverse.org](https://glue.tidyverse.org)| +|biomaRt|2.62.0|[https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html)| +|matrixStats|1.5.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)| |statmod|1.5.0|[https://github.com/cran/statmod](https://github.com/cran/statmod)| -|dp_tools|1.3.4|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| +|dp_tools|1.3.5|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| |singularity|3.9|[https://sylabs.io](https://sylabs.io)| -|Quarto|1.2.313|[https://quarto.org](https://quarto.org)| +|Quarto|1.6.40|[https://quarto.org](https://quarto.org)| --- diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/checks.py b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/checks.py index fef5126e..aca43c6b 100644 --- 
a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/checks.py +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/checks.py @@ -319,8 +319,8 @@ def utils_common_constraints_on_dataframe( col_constraints = col_constraints.copy() # limit to only columns of interest - query_df = df[col_set] - for (colname, colseries) in query_df.iteritems(): + query_df = df[list(col_set)] + for (colname, colseries) in query_df.items(): # check non null constraint if col_constraints.pop("nonNull", False) and nonNull(colseries) == False: issues["Failed non null constraint"].append(colname) @@ -398,7 +398,7 @@ def check_dge_table_sample_columns_constraints( ) -> FlagEntry: MINIMUM_COUNT = 0 # data specific preprocess - df_dge = pd.read_csv(dge_table)[samples] + df_dge = pd.read_csv(dge_table)[list(samples)] schema = pa.DataFrameSchema({ sample: pa.Column(float) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix_skipDE/checks.py b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix_skipDE/checks.py index fef5126e..aca43c6b 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix_skipDE/checks.py +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix_skipDE/checks.py @@ -319,8 +319,8 @@ def utils_common_constraints_on_dataframe( col_constraints = col_constraints.copy() # limit to only columns of interest - query_df = df[col_set] - for (colname, colseries) in query_df.iteritems(): + query_df = df[list(col_set)] + for (colname, colseries) in query_df.items(): # check non null constraint if col_constraints.pop("nonNull", False) and nonNull(colseries) == False: issues["Failed non null constraint"].append(colname) @@ -398,7 +398,7 @@ def 
check_dge_table_sample_columns_constraints( ) -> FlagEntry: MINIMUM_COUNT = 0 # data specific preprocess - df_dge = pd.read_csv(dge_table)[samples] + df_dge = pd.read_csv(dge_table)[list(samples)] schema = pa.DataFrameSchema({ sample: pa.Column(float) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/software/by_docker_image.config b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/software/by_docker_image.config index 6744407f..df4735a4 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/software/by_docker_image.config +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/software/by_docker_image.config @@ -1,8 +1,8 @@ process { withName: 'PROCESS_AFFYMETRIX' { - container = "quay.io/j_81/gl_images:NF_AffyMP-A_1.0.0-RC7" + container = "quay.io/nasa_genelab/gl-microarray:1.0.0" } withName: 'RUNSHEET_FROM_GLDS|RUNSHEET_FROM_ISA|VV_AFFYMETRIX|GENERATE_MD5SUMS|UPDATE_ISA_TABLES|GENERATE_SOFTWARE_TABLE' { - container = "quay.io/j_81/dp_tools:1.3.4" + container = "quay.io/nasa_genelab/dp_tools:1.3.5" } } From 7b14d1d9b529c6d9d33db0a0a86f202d822b2bd6 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Tue, 21 Jan 2025 12:59:45 -0800 Subject: [PATCH 09/25] NF_MAAffymetrix: update pipeline version from GL-DPPD-7114 to GL-DPPD-7114-A --- .../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md | 4 ++-- Microarray/Affymetrix/README.md | 6 +++++- .../NF_MAAffymetrix/CHANGELOG.md | 6 ++++++ .../Workflow_Documentation/NF_MAAffymetrix/README.md | 10 +++++----- .../resources/usr/bin/generate_protocol.sh | 2 +- .../NF_MAAffymetrix/workflow_code/nextflow.config | 2 +- Microarray/Affymetrix/Workflow_Documentation/README.md | 4 ++-- 7 files changed, 22 insertions(+), 12 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index 
9e96c9f1..6d0eb8d9 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -1,6 +1,6 @@ # GeneLab bioinformatics processing pipeline for Affymetrix microarray data -> **This page holds an overview and instructions for how GeneLab processes Affymetrix microarray datasets. Exact processing commands and GL-DPPD-7114 version used for specific GeneLab datasets (GLDS) are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo).** +> **This page holds an overview and instructions for how GeneLab processes Affymetrix microarray datasets. Exact processing commands and GL-DPPD-7114-A version used for specific GeneLab datasets (GLDS) are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo).** > > \* The pipeline detailed below currently supports gene annotations for Arabidopsis Thaliana via Ensembl FTP, all animals available in Biomart, and custom annotations (see [Step 8a](#8a-get-probeset-annotations)). @@ -8,7 +8,7 @@ **Date:** March 31, 2023 **Revision:** - -**Document Number:** GL-DPPD-7114 +**Document Number:** GL-DPPD-7114-A **Submitted by:** Jonathan Oribello (GeneLab Data Processing Team) diff --git a/Microarray/Affymetrix/README.md b/Microarray/Affymetrix/README.md index 6743ea3a..5758b9f2 100644 --- a/Microarray/Affymetrix/README.md +++ b/Microarray/Affymetrix/README.md @@ -1,7 +1,7 @@ # GeneLab bioinformatics processing pipeline for Affymetrix microarray data -> **The document [`GL-DPPD-7114.md`](Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md) holds an overview and example commands for how GeneLab processes Affymetrix microarray datasets. See the [Repository Links](#repository-links) descriptions below for more information. 
Processed data output files and processing code is provided for each GLDS dataset along with the processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).** +> **The document [`GL-DPPD-7114-A.md`](Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md) holds an overview and example commands for how GeneLab processes Affymetrix microarray datasets. See the [Repository Links](#repository-links) descriptions below for more information. Processed data output files and processing code is provided for each GLDS dataset along with the processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).** --- @@ -20,6 +20,10 @@ - Contains instructions for installing and running the GeneLab NF_MAAffymetrix workflow +* [**Array_Annotations**](Array_Annotations) + + - Contains the custom annotations table used in the GeneLab NF_MAAffymetrix workflow + --- **Developed by:** Jonathan Oribello diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md index de658e22..5a747ced 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [TBD](#) - YYYY-MM-DD + +### Changed + +- Better support for custom annotations, see [specification](examples/annotations/README.md) ([#113](https://github.com/nasa/GeneLab_Data_Processing/issues/113)) + ## [1.0.5](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.5/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2024-08-30 ### Added diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/README.md b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/README.md index f21d727a..2354426e 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/README.md +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/README.md @@ -4,7 +4,7 @@ ### Implementation Tools -The current GeneLab Affymetrix Microarray consensus processing pipeline (NF_MAAffymetrix), [GL-DPPD-7114](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) to run all tools in containers. This workflow (NF_MAAffymetrix) is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in Nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. +The current GeneLab Affymetrix Microarray consensus processing pipeline (NF_MAAffymetrix), [GL-DPPD-7114-A](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) to run all tools in containers. This workflow (NF_MAAffymetrix) is run using the command line interface (CLI) of any unix-based system. 
While knowledge of creating workflows in Nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. ### Workflow & Subworkflows @@ -14,8 +14,8 @@ The current GeneLab Affymetrix Microarray consensus processing pipeline (NF_MAAf --- The NF_MAAffymetrix workflow is composed of three subworkflows as shown in the image above. -Below is a description of each subworkflow and the additional output files generated that are not already indicated in the [GL-DPPD-7114 pipeline -document](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md): +Below is a description of each subworkflow and the additional output files generated that are not already indicated in the [GL-DPPD-7114-A pipeline +document](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md): 1. **Analysis Staging Subworkflow** @@ -26,7 +26,7 @@ document](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md): 2. **Affymetrix Microarray Processing Subworkflow** - Description: - - This subworkflow uses the staged raw data and metadata parameters from the Analysis Staging Subworkflow to generate processed data using the [GL-DPPD-7114 pipeline](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md). + - This subworkflow uses the staged raw data and metadata parameters from the Analysis Staging Subworkflow to generate processed data using the [GL-DPPD-7114-A pipeline](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md). 1. **V&V Pipeline Subworkflow** @@ -200,7 +200,7 @@ All R code steps and output are rendered within a Quarto document yielding the f The outputs from the Analysis Staging and V&V Pipeline Subworkflows are described below: -> Note: The outputs from the Affymetrix Microarray Processing Subworkflow are documented in the [GL-DPPD-7114.md](../../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md) processing protocol. 
+> Note: The outputs from the Affymetrix Microarray Processing Subworkflow are documented in the [GL-DPPD-7114-A.md](../../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md) processing protocol. **Analysis Staging Subworkflow** diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh index bddffaa3..1748a3a2 100755 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/modules/POST_PROCESSING/GENERATE_PROTOCOL/resources/usr/bin/generate_protocol.sh @@ -80,7 +80,7 @@ else fi # Read the template file -template="Data were processed as described in GL-DPPD-7114 (https://github.com/nasa/GeneLab_Data_Processing/blob/master/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md) using NF_MAAffymetrix version $1 (https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAffymetrix_$1/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix). In short, a RunSheet containing raw data file location and processing metadata from the study's *ISA.zip file was generated using dp_tools (version ${dp_tools_VERSION}). The raw array data files were loaded into R (version ${R_VERSION}) using oligo (version ${oligo_VERSION}). Raw data quality assurance density plot, pseudo images, MA plots, and boxplots were generated using oligo (version ${oligo_VERSION}). The raw probe level intensity data was background corrected and normalized across arrays via the oligo (version ${oligo_VERSION}) quantile method. 
Normalized probe level data quality assurance density plot, pseudo images, MA plots, and boxplots were generated using oligo (version ${oligo_VERSION}). Normalized probe level data was summarized to the probeset level using the oligo (version ${oligo_VERSION}) RMA method. ${GENE_MAPPING_STEP} ${DE_STEP} ${ANNOT_STEP}" +template="Data were processed as described in GL-DPPD-7114-A (https://github.com/nasa/GeneLab_Data_Processing/blob/master/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md) using NF_MAAffymetrix version $1 (https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAffymetrix_$1/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix). In short, a RunSheet containing raw data file location and processing metadata from the study's *ISA.zip file was generated using dp_tools (version ${dp_tools_VERSION}). The raw array data files were loaded into R (version ${R_VERSION}) using oligo (version ${oligo_VERSION}). Raw data quality assurance density plot, pseudo images, MA plots, and boxplots were generated using oligo (version ${oligo_VERSION}). The raw probe level intensity data was background corrected and normalized across arrays via the oligo (version ${oligo_VERSION}) quantile method. Normalized probe level data quality assurance density plot, pseudo images, MA plots, and boxplots were generated using oligo (version ${oligo_VERSION}). Normalized probe level data was summarized to the probeset level using the oligo (version ${oligo_VERSION}) RMA method. 
${GENE_MAPPING_STEP} ${DE_STEP} ${ANNOT_STEP}" # Output the filled template echo "$template" > PROTOCOL_GLmicroarray.txt \ No newline at end of file diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/nextflow.config b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/nextflow.config index df8f884e..984eb788 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/nextflow.config +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/nextflow.config @@ -41,7 +41,7 @@ profiles { manifest { homePage = 'https://github.com/nasa/GeneLab_Data_Processing/tree/master/Microarray/Affymetrix' - description = 'Affymetrix Microarray Workflow for Document GL-DPPD-7114' + description = 'Affymetrix Microarray Workflow for Document GL-DPPD-7114-A' mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '>=23.10.1' diff --git a/Microarray/Affymetrix/Workflow_Documentation/README.md b/Microarray/Affymetrix/Workflow_Documentation/README.md index 6c28bc73..bfd83848 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/README.md +++ b/Microarray/Affymetrix/Workflow_Documentation/README.md @@ -1,14 +1,14 @@ # GeneLab RNAseq Workflow Information > ** For the processing pipeline for Affymetrix microarray data, -[`GL-DPPD-7114.md`](../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md), +[`GL-DPPD-7114-A.md`](../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md), GeneLab has wrapped each step of the pipeline into a workflow with validation and verification of output files built in after each step. The table below lists (and links to) each NF_MAAffymetrix version and the corresponding workflow subdirectory, the current NF_MAAffymetrix/workflow implementation is indicated. 
Each workflow subdirectory contains information about the workflow along with instructions for installation and usage.** ## NF_MAAffymetrix Version and Corresponding Workflow |Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version| |:---------------|:---------------------------------------------------------|:---------------| -|*[GL-DPPD-7114.md](../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md)|[1.0.4](NF_MAAffymetrix)|23.10.1| +|*[GL-DPPD-7114-A.md](../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md)|[1.0.5](NF_MAAffymetrix)|23.10.1| *Current GeneLab Pipeline/Workflow Implementation From a177711725f2d474fb2b1944db897e8a3bd43cc4 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Thu, 30 Jan 2025 21:40:47 -0800 Subject: [PATCH 10/25] NF_MAAffymetrix: update pipeline doc --- .../GL-DPPD-7114-A.md | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index 6d0eb8d9..e31af8f8 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -21,6 +21,37 @@ Lauren Sanders (acting GeneLab Project Scientist) --- +## Updates from previous version + +Updated [Ensembl Reference Files](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) to the following releases: +- Animals: Ensembl release 112 +- Plants: Ensembl plants release 59 +- Bacteria: Ensembl bacteria release 59 + +Software Updates: + +| Program | Previous Version | New Version | +|:--------|:-----------------|:---------------| +|R|4.1.3|4.4.2| +|DT|0.26|0.33| +|dplyr|1.0.10|1.1.4| +|tibble|3.1.8|3.2.1| +|stringr|1.5.0|1.5.1| +|R.utils|2.12.2|2.12.3| +|oligo|1.58.0|1.70.0| +|limma|3.50.3|3.62.2| +|glue|1.6.2|1.8.0| +|biomaRt|2.50.0|2.62.0| 
+|matrixStats|0.63.0|1.5.0| +|statmod|1.5.0|1.5.0| +|dp_tools|1.3.4|1.3.5| +|singularity|3.9|3.9| +|Quarto|1.2.313|1.6.40| + +Added better support for custom annotations, see [specification](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md). + +--- + # Table of contents - [Software used](#software-used) From 0cd2a5917c904bd54625408a236971a089360499 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Mon, 3 Feb 2025 18:27:51 -0800 Subject: [PATCH 11/25] NF_MAAffymetrix: update pipeline doc --- .../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index e31af8f8..14d3c353 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -48,7 +48,13 @@ Software Updates: |singularity|3.9|3.9| |Quarto|1.2.313|1.6.40| -Added better support for custom annotations, see [specification](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md). 
+MA Plots + +- Added support for plotting HTAFeatureSet data + +Custom Annotations + +- Added ability to use custom gene annotations when annotations are not available in Biomart or Ensembl FTP for Arabidopsis Thaliana --- From cfb2e946e7f3c6a7cde1c1531f7c67c1ac63682f Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Thu, 6 Feb 2025 18:10:50 -0800 Subject: [PATCH 12/25] Update GL-DPPD-7114-A.md --- .../GL-DPPD-7114-A.md | 90 +++++++++---------- 1 file changed, 42 insertions(+), 48 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index 14d3c353..3795d734 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -1,23 +1,23 @@ # GeneLab bioinformatics processing pipeline for Affymetrix microarray data -> **This page holds an overview and instructions for how GeneLab processes Affymetrix microarray datasets. Exact processing commands and GL-DPPD-7114-A version used for specific GeneLab datasets (GLDS) are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo).** +> **This page holds an overview and instructions for how GeneLab processes Affymetrix microarray datasets. Exact processing commands and GL-DPPD-7114 version used for specific GeneLab datasets (GLDS) are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo).** > -> \* The pipeline detailed below currently supports gene annotations for Arabidopsis Thaliana via Ensembl FTP, all animals available in Biomart, and custom annotations (see [Step 8a](#8a-get-probeset-annotations)). 
+> \* The pipeline detailed below is currently used for animal and *Arabidopsis thaliana* studies only, it will be updated soon for processing microbe microarray data and other plant data. --- -**Date:** March 31, 2023 -**Revision:** - +**Date:** February XX, 2025 +**Revision:** -A **Document Number:** GL-DPPD-7114-A **Submitted by:** -Jonathan Oribello (GeneLab Data Processing Team) +Crystal Han (GeneLab Data Processing Team) **Approved by:** -Sylvain Costes (GeneLab Project Manager) -Samrawit Gebre (GeneLab Deputy Project Manager) -Amanda Saravia-Butler (GeneLab Data Processing Lead) -Lauren Sanders (acting GeneLab Project Scientist) +Samrawit Gebre (OSDR Project Manager) +Lauren Sanders (OSDR Project Scientist) +Amanda Saravia-Butler (GeneLab Science Lead) +Barbara Novak (GeneLab Data Processing Lead) --- @@ -43,9 +43,7 @@ Software Updates: |glue|1.6.2|1.8.0| |biomaRt|2.50.0|2.62.0| |matrixStats|0.63.0|1.5.0| -|statmod|1.5.0|1.5.0| |dp_tools|1.3.4|1.3.5| -|singularity|3.9|3.9| |Quarto|1.2.313|1.6.40| MA Plots @@ -54,7 +52,7 @@ MA Plots Custom Annotations -- Added ability to use custom gene annotations when annotations are not available in Biomart or Ensembl FTP for Arabidopsis Thaliana +- Added ability to use custom gene annotations when annotations are not available in Biomart or Ensembl FTP for *Arabidopsis thaliana*, see [Step 8](#8-probeset-annotations) --- @@ -82,11 +80,11 @@ Custom Annotations - [8. Probeset Annotations](#8-probeset-annotations) - [8a. Get Probeset Annotations](#8a-get-probeset-annotations) - [8b. Summarize Gene Mapping](#8b-summarize-gene-mapping) - - [8c. Save Annotated Tables](#8c-save-annotated-tables) + - [8c. Generate Annotated Raw and Normalized Expression Tables](#8c-generate-annotated-raw-and-normalized-expression-tables) - [9. Perform Probeset Differential Expression (DE)](#9-perform-probeset-differential-expression-de) - [9a. Generate Design Matrix](#9a-generate-design-matrix) - [9b. 
Perform Individual Probeset Level DE](#9b-perform-individual-probeset-level-de) - - [9c. Save DE Table](#9c-save-de-table) + - [9c. Add Annotation and Stats Columns and Format DE Table](#9c-add-annotation-and-stats-columns-and-format-de-table) --- @@ -192,10 +190,6 @@ options(dplyr.summarise.inform = FALSE) # Don't print out '`summarise()` has gro # Define path to runsheet runsheet <- "/path/to/runsheet/{OSD-Accession-ID}_microarray_v{version}_runsheet.csv" -# If using custom annotation, local_annotation_dir is path to directory containing annotation file and annotation_config_path is path/url to config file -local_annotation_dir <- NULL # -annotation_config_path <- NULL # - ## Set up output structure # Output Constants @@ -319,16 +313,9 @@ print(paste0("Number of Arrays: ", dim(raw_data)[2])) print(paste0("Number of Probes: ", dim(raw_data)[1])) ``` -**Parameter Definitions:** +**Input Data:** - `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet)) -- `local_annotation_dir` (Path to local annotation directory if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) - - > Note: If not using custom annotations, leave `local_annotation_dir` as `NULL`. - -- `annotation_config_path` (URL or path to annotation config file if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) - - > Note: If not using custom annotations, leave `annotation_config_path` as `NULL`. **Output Data:** @@ -342,6 +329,10 @@ print(paste0("Number of Probes: ", dim(raw_data)[1])) ### 2b. 
Load Annotation Metadata ```R +# If using custom annotation, local_annotation_dir is path to directory containing annotation file and annotation_config_path is path/url to config file +local_annotation_dir <- NULL # +annotation_config_path <- NULL # + ## Determines the organism specific annotation file to use based on the organism in the runsheet fetch_organism_specific_annotation_table <- function(organism) { # Uses the latest GeneLab annotations table to find the organism specific annotation file path and ensembl version @@ -367,7 +358,15 @@ annotation_file_path <- annotation_table$genelab_annots_link ensembl_version <- as.character(annotation_table$ensemblVersion) ``` -**Parameter Definitions:** +**Input Data:** + +- `local_annotation_dir` (Path to local annotation directory if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) + + > Note: If not using custom annotations, leave `local_annotation_dir` as `NULL`. + +- `annotation_config_path` (URL or path to annotation config file if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) + + > Note: If not using custom annotations, leave `annotation_config_path` as `NULL`. 
- `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) - `annotation_table_link` (URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv)) @@ -975,26 +974,24 @@ probeset_expression_matrix.gene_mapped <- probeset_expression_matrix %>% dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) ``` -**Parameter Definitions:** +**Input Data:** - `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) - `df_rs$biomart_attribute` (array design biomart identifier specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) - `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) - `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) -- `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. mus musculus uses 'ENSEMBL') -- `local_annotation_dir` (Path to local annotation directory if using custom annotations, defined in [Step 2a](#2a-load-metadata-and-raw-data)) -- `annotation_config_path` (URL or path to annotation config file if using custom annotations, defined in [Step 2a](#2a-load-metadata-and-raw-data)) +- `annot_key` (keytype to join annotation table and microarray probes, dependent on organism, e.g. 
mus musculus uses 'ENSEMBL') +- `local_annotation_dir` (path to local annotation directory if using custom annotations, output from [Step 2b](#2b-load-annotation-metadata)) +- `annotation_config_path` (URL or path to annotation config file if using custom annotations, output from [Step 2b](#2b-load-annotation-metadata)) > Note: See [Affymetrix_array_annotations.csv](../Array_Annotations/Affymetrix_array_annotations.csv) for the latest config file used at GeneLab. This file can also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md). -**Input Data:** - - `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization)) **Output Data:** -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations) - `unique_probe_ids` (R object containing probeset ID to gene annotation mappings) +- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations)
@@ -1030,7 +1027,7 @@ print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}"))
-### 8c. Save Annotated Tables +### 8c. Generate Annotated Raw and Normalized Expression Tables ```R ## Reorder columns before saving to file @@ -1112,16 +1109,13 @@ norm_data_matrix_annotated <- norm_data_matrix_annotated %>% write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE) ``` -**Parameter Definitions:** - -- `df_rs[['Sample Name']]` (sample names specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- `annot_key` (Keytype to join annotation table and microarray probes, dependent on organism, e.g. mus musculus uses 'ENSEMBL', defined in [Step 8a](#8a-get-probeset-annotations)) - **Input Data:** +- `df_rs` (R dataframe containing information from the runsheet, output from [Step 2a](#2a-load-metadata-and-raw-data)) +- `annot_key` (keytype to join annotation table and microarray probes, dependent on organism, e.g. mus musculus uses 'ENSEMBL', defined in [Step 8a](#8a-get-probeset-annotations)) - `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) -- `background_corrected_data` (R object containing background-corrected microarray data created in [Step 4](#4-background-correction)) -- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization)) +- `background_corrected_data` (R object containing background-corrected microarray data, output from [Step 4](#4-background-correction)) +- `norm_data` (R object containing background-corrected and normalized microarray data, output from [Step 5](#5-between-array-normalization)) - `unique_probe_ids` (R object containing probeset ID to gene 
annotation mappings, output from [Step 8a](#8a-get-probeset-annotations)) **Output Data:** @@ -1132,7 +1126,7 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm ## 9. Perform Probeset Differential Expression (DE) -> Note: Run differential expression analysis only if there is at least 1 replicate per factor group. +> Note: Run differential expression analysis only if there is at least 2 replicates per factor group.
@@ -1193,7 +1187,7 @@ write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv" **Input Data:** -- `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet)) +- `runsheet` (path to runsheet, output from [Step 1](#1-create-sample-runsheet)) **Output Data:** @@ -1241,8 +1235,8 @@ write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.cs **Input Data:** -- `norm_data` (R object containing background-corrected and normalized microarray data created in [Step 5](#5-between-array-normalization)) -- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, created in [Step 9a](#9a-generate-design-matrix) above) +- `norm_data` (R object containing background-corrected and normalized microarray data, output from [Step 5](#5-between-array-normalization)) +- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, output from [Step 9a](#9a-generate-design-matrix) above) - `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization)) **Output Data:** @@ -1256,7 +1250,7 @@ write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.cs
-### 9c. Save DE Table +### 9c. Add Annotation and Stats Columns and Format DE Table ```R ## Reformat Table for consistency across DE analyses tables within GeneLab ## @@ -1422,4 +1416,4 @@ write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.c - **differential_expression_GLmicroarray.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probeset DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.) -> All steps of the Microarray pipeline are performed using R markdown and the completed R markdown is rendered (via Quarto) as an html file (**NF_MAAffymetrix_v\*_GLmicroarray.html**) and published in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/) for the respective dataset. \ No newline at end of file +> All steps of the Microarray pipeline are performed using R markdown and the completed R markdown is rendered (via Quarto) as an html file (**NF_MAAffymetrix_v\*_GLmicroarray.html**) and published in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/) for the respective dataset. 
From 7433f69a67fc6352383a435b14be2e03f3a2533b Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 9 Feb 2025 19:27:24 -0800 Subject: [PATCH 13/25] NF_MAAffymetrix: reorder DE table columns --- .../GL-DPPD-7114-A.md | 17 ++++++----------- .../workflow_code/bin/Affymetrix.qmd | 17 ++++++----------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index 3795d734..c18cccb0 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -1370,27 +1370,22 @@ ALL_SAMPLE_STATS_COLUMNS_ORDER <- c( "F.p.value" ) -GROUP_MEAN_COLUMNS_ORDER <- generate_prefixed_column_order( - subjects = unique(design_data$groups$group), - prefixes = c( - "Group.Mean_" - ) - ) -GROUP_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order( +GROUP_MEAN_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order( subjects = unique(design_data$groups$group), prefixes = c( + "Group.Mean_", "Group.Stdev_" - ) ) +) + FINAL_COLUMN_ORDER <- c( ANNOTATIONS_COLUMN_ORDER, PROBE_INFO_COLUMN_ORDER, SAMPLE_COLUMN_ORDER, STAT_COLUMNS_ORDER, ALL_SAMPLE_STATS_COLUMNS_ORDER, - GROUP_MEAN_COLUMNS_ORDER, - GROUP_STDEV_COLUMNS_ORDER - ) + GROUP_MEAN_STDEV_COLUMNS_ORDER +) ## Assert final column order includes all columns from original table if (!setequal(FINAL_COLUMN_ORDER, colnames(df_interim))) { diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index cc5d3a19..d9b42592 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -1113,27 +1113,22 @@ 
ALL_SAMPLE_STATS_COLUMNS_ORDER <- c( "F.p.value" ) -GROUP_MEAN_COLUMNS_ORDER <- generate_prefixed_column_order( - subjects = unique(design_data$groups$group), - prefixes = c( - "Group.Mean_" - ) - ) -GROUP_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order( +GROUP_MEAN_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order( subjects = unique(design_data$groups$group), prefixes = c( + "Group.Mean_", "Group.Stdev_" - ) ) +) + FINAL_COLUMN_ORDER <- c( ANNOTATIONS_COLUMN_ORDER, PROBE_INFO_COLUMN_ORDER, SAMPLE_COLUMN_ORDER, STAT_COLUMNS_ORDER, ALL_SAMPLE_STATS_COLUMNS_ORDER, - GROUP_MEAN_COLUMNS_ORDER, - GROUP_STDEV_COLUMNS_ORDER - ) + GROUP_MEAN_STDEV_COLUMNS_ORDER +) ## Assert final column order includes all columns from original table if (!setequal(FINAL_COLUMN_ORDER, colnames(df_interim))) { From 44ecb25bb006d5ad9d6951fc3e9aa69b88f0c064 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 9 Feb 2025 19:29:53 -0800 Subject: [PATCH 14/25] NF_MAAffymetrix: remove visualization_PCA_table_GLmicroarray.csv output --- .../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md | 5 ----- .../NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd | 4 ---- 2 files changed, 9 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index c18cccb0..0241cc2d 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -1227,10 +1227,6 @@ limma::write.fit(res, adjust = 'BH', row.names = FALSE, quote = TRUE, sep = ",") - -### Generate and export PCA table for GeneLab visualization plots -PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed -write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")) ``` **Input Data:** @@ -1246,7 +1242,6 @@ write.csv(PCA_raw$x, file.path(DIR_DGE, 
"visualization_PCA_table_GLmicroarray.cs - T statistic for all pairwise comparison tests - P value for all pairwise comparison tests - Adjusted P value for all pairwise comparison tests) -- visualization_PCA_table_GLmicroarray.csv (file used to generate GeneLab PCA plots)
diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index d9b42592..3e029fc0 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -964,10 +964,6 @@ limma::write.fit(res, adjust = 'BH', row.names = FALSE, quote = TRUE, sep = ",") - -### Generate and export PCA table for GeneLab visualization plots -PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed -write.csv(PCA_raw$x, file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")) ``` ### Save DE Table From bfa4ebaee72e042e8b612e1cbff1131aa9f73709 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 9 Feb 2025 19:32:58 -0800 Subject: [PATCH 15/25] NF_MAAffymetrix: update report headings --- .../NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index 3e029fc0..19f38c49 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -789,7 +789,7 @@ print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) message(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) # NON_DPPD ``` -### Save Annotated Tables +### Generate Annotated Raw and Normalized Expression Tables ```{r save-tables} ## Reorder columns before saving to file @@ -966,7 +966,7 @@ limma::write.fit(res, adjust = 'BH', sep = ",") ``` -### Save DE 
Table +### Add Annotation and Stats Columns and Format DE Table ``` {r save-de-table} #| message: false From a228c9475fa01b495059fdf1c7afa1dbd47531c8 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 9 Feb 2025 20:19:36 -0800 Subject: [PATCH 16/25] NF_MAAffymetrix: remove viz output from V&V --- .../bin/dp_tools__affymetrix/config.yaml | 10 ------- .../bin/dp_tools__affymetrix/protocol.py | 27 ------------------- 2 files changed, 37 deletions(-) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/config.yaml b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/config.yaml index 071eff4f..bb72fe74 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/config.yaml +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/config.yaml @@ -263,16 +263,6 @@ data assets: resource categories: *DGEAnalysisData - viz PCA table: - processed location: - - *DGEDataDir - - "visualization_PCA_table_GLmicroarray.csv" - - tags: - - processed - - resource categories: *neverPublished - data asset sets: # These assets are not generated in the workflow, but are generated after the workflow PUTATIVE: [] diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/protocol.py b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/protocol.py index e17c9a1d..d55f4f54 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/protocol.py +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/protocol.py @@ -365,33 +365,6 @@ def validate( """) ) - with vp.component_start( - name="Viz Tables", - description="Extended from the dge tables", - ): - with vp.payload( - payloads=[ - { 
- "samples": lambda: set(dataset.samples), - "pca_table": lambda: dataset.data_assets[ - "viz PCA table" - ].path, - } - ] - ): - vp.add( - bulkRNASeq.checks.check_viz_pca_table_index_and_columns_exist, - full_description=textwrap.dedent(f""" - - Check: Ensure all samples (row-indices) present and columns PC1, PC2 and PC3 are present - - Reason: - - PCA table should include all samples and PC1, PC2, PC3 (for 3D PCA viz) - - Potential Source of Problems: - - Bug in processing script - - Flag Condition: - - Green: All samples and all columns present - - Halt: At least one sample or column is missing - """) - ) with vp.component_start( name="Processing Report", description="", From d07ee405ae1d06f8ad7337ef1c40ee263dc33fba Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Sun, 9 Feb 2025 20:20:53 -0800 Subject: [PATCH 17/25] NF_MAAffymetrix: use original sample name in output files --- .../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md | 10 +++++----- .../NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index 0241cc2d..f5f361b3 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -296,7 +296,7 @@ if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) { ) } -df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths` = local_paths, check.names = FALSE) +df_local_paths <- data.frame(`Sample Name` = df_rs$`Original Sample Name`, `Local Paths` = local_paths, check.names = FALSE) # Load raw data into R object @@ -1041,7 +1041,7 @@ ANNOTATIONS_COLUMN_ORDER = c( "GOSLIM_IDS" ) -SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name` +SAMPLE_COLUMN_ORDER <- df_rs$`Original Sample Name` probeset_expression_matrix.gene_mapped <- 
probeset_expression_matrix.gene_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) @@ -1141,7 +1141,7 @@ runsheetToDesignMatrix <- function(runsheet_path) { colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_") # Load metadata from runsheet csv file - compare_csv = data.frame(sample_id = df[,c("Sample Name")], factors) + compare_csv = data.frame(sample_id = df[,c("Original Sample Name")], factors) # Create data frame containing all samples and respective factors study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]]) @@ -1165,13 +1165,13 @@ runsheetToDesignMatrix <- function(runsheet_path) { contrasts <- cbind(contrasts,contrasts[c(2,1),]) colnames(contrasts) <- contrast.names sampleTable <- data.frame(condition=factor(group)) - rownames(sampleTable) <- df[,c("Sample Name")] + rownames(sampleTable) <- df[,c("Original Sample Name")] condition <- sampleTable[,'condition'] names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names)) design <- model.matrix(~ 0 + condition) - design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample Name")], group = group_names) ), contrasts = contrasts ) + design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Original Sample Name")], group = group_names) ), contrasts = contrasts ) return(design_data) } diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index 19f38c49..80dd7fa0 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -175,7 +175,7 @@ if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) { ) } -df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample 
Name`, `Local Paths` = local_paths, check.names = FALSE) +df_local_paths <- data.frame(`Sample Name` = df_rs$`Original Sample Name`, `Local Paths` = local_paths, check.names = FALSE) # NON_DPPD:START print("Raw Data Loaded Successfully") DT::datatable(df_local_paths) @@ -803,7 +803,7 @@ ANNOTATIONS_COLUMN_ORDER = c( "GOSLIM_IDS" ) -SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name` +SAMPLE_COLUMN_ORDER <- df_rs$`Original Sample Name` probeset_expression_matrix.gene_mapped <- probeset_expression_matrix.gene_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) @@ -886,7 +886,7 @@ runsheetToDesignMatrix <- function(runsheet_path) { colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_") # Load metadata from runsheet csv file - compare_csv = data.frame(sample_id = df[,c("Sample Name")], factors) + compare_csv = data.frame(sample_id = df[,c("Original Sample Name")], factors) # Create data frame containing all samples and respective factors study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]]) @@ -910,13 +910,13 @@ runsheetToDesignMatrix <- function(runsheet_path) { contrasts <- cbind(contrasts,contrasts[c(2,1),]) colnames(contrasts) <- contrast.names sampleTable <- data.frame(condition=factor(group)) - rownames(sampleTable) <- df[,c("Sample Name")] + rownames(sampleTable) <- df[,c("Original Sample Name")] condition <- sampleTable[,'condition'] names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names)) design <- model.matrix(~ 0 + condition) - design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample Name")], group = group_names) ), contrasts = contrasts ) + design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Original Sample Name")], group = group_names) ), contrasts = contrasts ) return(design_data) } From f8bb3fe6bd12a4b5b1e0b8fa4d73e5d02794ef69 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Tue, 11 Feb 2025 
00:24:19 -0800 Subject: [PATCH 18/25] Revert "NF_MAAffymetrix: use original sample name in output files" This reverts commit d07ee405ae1d06f8ad7337ef1c40ee263dc33fba. --- .../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md | 10 +++++----- .../NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index f5f361b3..0241cc2d 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -296,7 +296,7 @@ if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) { ) } -df_local_paths <- data.frame(`Sample Name` = df_rs$`Original Sample Name`, `Local Paths` = local_paths, check.names = FALSE) +df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths` = local_paths, check.names = FALSE) # Load raw data into R object @@ -1041,7 +1041,7 @@ ANNOTATIONS_COLUMN_ORDER = c( "GOSLIM_IDS" ) -SAMPLE_COLUMN_ORDER <- df_rs$`Original Sample Name` +SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name` probeset_expression_matrix.gene_mapped <- probeset_expression_matrix.gene_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) @@ -1141,7 +1141,7 @@ runsheetToDesignMatrix <- function(runsheet_path) { colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_") # Load metadata from runsheet csv file - compare_csv = data.frame(sample_id = df[,c("Original Sample Name")], factors) + compare_csv = data.frame(sample_id = df[,c("Sample Name")], factors) # Create data frame containing all samples and respective factors study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]]) @@ -1165,13 +1165,13 @@ runsheetToDesignMatrix <- function(runsheet_path) { contrasts <- cbind(contrasts,contrasts[c(2,1),]) colnames(contrasts) <- contrast.names sampleTable <- 
data.frame(condition=factor(group)) - rownames(sampleTable) <- df[,c("Original Sample Name")] + rownames(sampleTable) <- df[,c("Sample Name")] condition <- sampleTable[,'condition'] names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names)) design <- model.matrix(~ 0 + condition) - design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Original Sample Name")], group = group_names) ), contrasts = contrasts ) + design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample Name")], group = group_names) ), contrasts = contrasts ) return(design_data) } diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index 80dd7fa0..19f38c49 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -175,7 +175,7 @@ if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) { ) } -df_local_paths <- data.frame(`Sample Name` = df_rs$`Original Sample Name`, `Local Paths` = local_paths, check.names = FALSE) +df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths` = local_paths, check.names = FALSE) # NON_DPPD:START print("Raw Data Loaded Successfully") DT::datatable(df_local_paths) @@ -803,7 +803,7 @@ ANNOTATIONS_COLUMN_ORDER = c( "GOSLIM_IDS" ) -SAMPLE_COLUMN_ORDER <- df_rs$`Original Sample Name` +SAMPLE_COLUMN_ORDER <- df_rs$`Sample Name` probeset_expression_matrix.gene_mapped <- probeset_expression_matrix.gene_mapped %>% dplyr::rename( !!annot_key := ENSEMBL ) @@ -886,7 +886,7 @@ runsheetToDesignMatrix <- function(runsheet_path) { colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_") # Load metadata from 
runsheet csv file - compare_csv = data.frame(sample_id = df[,c("Original Sample Name")], factors) + compare_csv = data.frame(sample_id = df[,c("Sample Name")], factors) # Create data frame containing all samples and respective factors study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]]) @@ -910,13 +910,13 @@ runsheetToDesignMatrix <- function(runsheet_path) { contrasts <- cbind(contrasts,contrasts[c(2,1),]) colnames(contrasts) <- contrast.names sampleTable <- data.frame(condition=factor(group)) - rownames(sampleTable) <- df[,c("Original Sample Name")] + rownames(sampleTable) <- df[,c("Sample Name")] condition <- sampleTable[,'condition'] names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names)) design <- model.matrix(~ 0 + condition) - design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Original Sample Name")], group = group_names) ), contrasts = contrasts ) + design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample Name")], group = group_names) ), contrasts = contrasts ) return(design_data) } From 25293197b1aa0c99c02ca5d027d2d8c984a8cb10 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Wed, 26 Feb 2025 20:38:49 -0800 Subject: [PATCH 19/25] NF_MAAffymetrix: minor updates to qmd --- .../GL-DPPD-7114-A.md | 23 ++++++++++++------- .../workflow_code/bin/Affymetrix.qmd | 23 ++++++++++++------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index 0241cc2d..ef4e4dea 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -819,16 +819,26 @@ if (organism %in% c("athaliana")) { expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) 
%>% stringr::str_c("_gene_ensembl") print(paste0("Expected dataset name: '", expected_dataset_name, "'")) + expected_attribute_name <- getBioMartAttribute(df_rs) + print(paste0("Expected attribute name: '", expected_attribute_name, "'")) # Specify Ensembl version used in current GeneLab reference annotations ENSEMBL_VERSION <- ensembl_version print(glue::glue("Using Ensembl biomart to get specific version of mapping table. Ensembl version: {ENSEMBL_VERSION}")) - # Check if organism in supported in biomart - ensembl <- biomaRt::useEnsembl(biomart = "genes") + # Check if organism/array design is supported in biomart + use_custom_annot <- TRUE + + ensembl <- biomaRt::useEnsembl(biomart = "genes", version = ENSEMBL_VERSION) ensembl_datasets <- biomaRt::listDatasets(ensembl) - use_custom_annot <- !expected_dataset_name %in% ensembl_datasets$dataset + if (expected_dataset_name %in% ensembl_datasets$dataset) { + ensembl <- biomaRt::useEnsembl(biomart = "genes", dataset = expected_dataset_name, version = ENSEMBL_VERSION) + ensembl_attributes <- biomaRt::listAttributes(ensembl) + if (expected_attribute_name %in% ensembl_attributes$name) { + use_custom_annot <- FALSE + } + } if (use_custom_annot) { unloadNamespace("biomaRt") @@ -839,9 +849,6 @@ if (organism %in% c("athaliana")) { version = ENSEMBL_VERSION) print(ensembl) - expected_attribute_name <- getBioMartAttribute(df_rs) - print(paste0("Expected attribute name: '", expected_attribute_name, "'")) - # Some probe_ids for affy_hta_2_0 may end in .hg.1 instead of .hg (how it is in biomaRt), leading to 0 results returned if (expected_attribute_name == 'affy_hta_2_0') { rownames(probeset_level_data) <- stringr::str_replace(rownames(probeset_level_data), '\\.hg\\.1$', '.hg') @@ -923,7 +930,7 @@ if (use_custom_annot) { if (sum(!is.na(unique_probe_ids$ENTREZID)) > sum(!is.na(unique_probe_ids$ENSEMBL))) { gene_col <- 'ENTREZID' } - if (sum(!is.na(unique_probe_ids$SYMBOL)) > sum(!is.na(unique_probe_ids$ENTREZID))) { + if 
(sum(!is.na(unique_probe_ids$SYMBOL)) > max(sum(!is.na(unique_probe_ids$ENTREZID)), sum(!is.na(unique_probe_ids$ENSEMBL)))) { gene_col <- 'SYMBOL' } @@ -1126,7 +1133,7 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm ## 9. Perform Probeset Differential Expression (DE) -> Note: Run differential expression analysis only if there is at least 2 replicates per factor group. +> Note: Run differential expression analysis only if there are at least 2 replicates per factor group.
diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index 19f38c49..bfa00ee7 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -597,6 +597,9 @@ if (organism %in% c("athaliana")) { print(paste0("Expected dataset name: '", expected_dataset_name, "'")) message(paste0("Expected dataset name: '", expected_dataset_name, "'")) # NON_DPPD + expected_attribute_name <- getBioMartAttribute(df_rs) + print(paste0("Expected attribute name: '", expected_attribute_name, "'")) + message(paste0("Expected attribute name: '", expected_attribute_name, "'")) # NON_DPPD # Specify Ensembl version used in current GeneLab reference annotations ENSEMBL_VERSION <- ensembl_version @@ -604,10 +607,18 @@ if (organism %in% c("athaliana")) { print(glue::glue("Using Ensembl biomart to get specific version of mapping table. 
Ensembl version: {ENSEMBL_VERSION}")) - # Check if organism in supported in biomart - ensembl <- biomaRt::useEnsembl(biomart = "genes") + # Check if organism/array design is supported in biomart + use_custom_annot <- TRUE + + ensembl <- biomaRt::useEnsembl(biomart = "genes", version = ENSEMBL_VERSION) ensembl_datasets <- biomaRt::listDatasets(ensembl) - use_custom_annot <- !expected_dataset_name %in% ensembl_datasets$dataset + if (expected_dataset_name %in% ensembl_datasets$dataset) { + ensembl <- biomaRt::useEnsembl(biomart = "genes", dataset = expected_dataset_name, version = ENSEMBL_VERSION) + ensembl_attributes <- biomaRt::listAttributes(ensembl) + if (expected_attribute_name %in% ensembl_attributes$name) { + use_custom_annot <- FALSE + } + } if (use_custom_annot) { unloadNamespace("biomaRt") @@ -618,10 +629,6 @@ if (organism %in% c("athaliana")) { version = ENSEMBL_VERSION) print(ensembl) - expected_attribute_name <- getBioMartAttribute(df_rs) - print(paste0("Expected attribute name: '", expected_attribute_name, "'")) - message(paste0("Expected attribute name: '", expected_attribute_name, "'")) # NON_DPPD - # Some probe_ids for affy_hta_2_0 may end in .hg.1 instead of .hg (how it is in biomaRt), leading to 0 results returned if (expected_attribute_name == 'affy_hta_2_0') { rownames(probeset_level_data) <- stringr::str_replace(rownames(probeset_level_data), '\\.hg\\.1$', '.hg') @@ -712,7 +719,7 @@ if (use_custom_annot) { if (sum(!is.na(unique_probe_ids$ENTREZID)) > sum(!is.na(unique_probe_ids$ENSEMBL))) { gene_col <- 'ENTREZID' } - if (sum(!is.na(unique_probe_ids$SYMBOL)) > sum(!is.na(unique_probe_ids$ENTREZID))) { + if (sum(!is.na(unique_probe_ids$SYMBOL)) > max(sum(!is.na(unique_probe_ids$ENTREZID)), sum(!is.na(unique_probe_ids$ENSEMBL)))) { gene_col <- 'SYMBOL' } From 3c8e0afd28408718e0b6dda828e89a3254f80325 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Mon, 3 Mar 2025 18:22:23 -0800 Subject: [PATCH 20/25] NF_MAAffymetrix: update accepted ISA field name for 
label --- .../workflow_code/bin/dp_tools__affymetrix/config.yaml | 5 ++++- .../bin/dp_tools__affymetrix_skipDE/config.yaml | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/config.yaml b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/config.yaml index bb72fe74..e3e593fa 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/config.yaml +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/config.yaml @@ -75,7 +75,10 @@ Staging: Sample name is used as a unique sample identifier during processing Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 - - ISA Field Name: Label + - ISA Field Name: + - Label + - Parameter Value[label] + - Parameter Value[Label] ISA Table Source: Sample Runsheet Column Name: Label Processing Usage: >- diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix_skipDE/config.yaml b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix_skipDE/config.yaml index e30bf80a..543a4fe6 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix_skipDE/config.yaml +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix_skipDE/config.yaml @@ -75,7 +75,10 @@ Staging: Sample name is used as a unique sample identifier during processing Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 - - ISA Field Name: Label + - ISA Field Name: + - Label + - Parameter Value[label] + - Parameter Value[Label] ISA Table Source: Sample Runsheet Column Name: Label Processing Usage: >- From 57afe18de5fb35713290f8476e0d9a545ebe8856 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Mon, 3 Mar 2025 
18:26:09 -0800 Subject: [PATCH 21/25] NF_MAAffymetrix: minor updates to workflow version 1.0.5 --- .../NF_MAAffymetrix/CHANGELOG.md | 10 ++-------- Microarray/Affymetrix/Workflow_Documentation/README.md | 2 +- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md index 5a747ced..5414e570 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md @@ -5,17 +5,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [TBD](#) - YYYY-MM-DD - -### Changed - -- Better support for custom annotations, see [specification](examples/annotations/README.md) ([#113](https://github.com/nasa/GeneLab_Data_Processing/issues/113)) - -## [1.0.5](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.5/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2024-08-30 +## [1.0.5](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.5/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2025-03-03 ### Added -- Add support for bacteria annotations using manufacturer annotations ([#113](https://github.com/nasa/GeneLab_Data_Processing/issues/113)) +- Support for custom annotations, see [specification](examples/annotations/README.md) ([#113](https://github.com/nasa/GeneLab_Data_Processing/issues/113)) - Add option to skip differential expression analysis (`--skipDE`) ([#104](https://github.com/nasa/GeneLab_Data_Processing/issues/104)) ### Changed diff --git a/Microarray/Affymetrix/Workflow_Documentation/README.md b/Microarray/Affymetrix/Workflow_Documentation/README.md index 
bfd83848..ae9dbdd6 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/README.md +++ b/Microarray/Affymetrix/Workflow_Documentation/README.md @@ -8,7 +8,7 @@ GeneLab has wrapped each step of the pipeline into a workflow with validation an |Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version| |:---------------|:---------------------------------------------------------|:---------------| -|*[GL-DPPD-7114-A.md](../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md)|[1.0.5](NF_MAAffymetrix)|23.10.1| +|*[GL-DPPD-7114-A.md](../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md)|[NF_MAAffymetrix_1.0.5](NF_MAAffymetrix)|23.10.1| *Current GeneLab Pipeline/Workflow Implementation From cca1601cd89e8fa99ff60d84ff34de02ddd2bac2 Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Mon, 3 Mar 2025 19:22:19 -0800 Subject: [PATCH 22/25] NF_MAAffymetrix: minor update to pipeline doc --- .../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index ef4e4dea..cd8e6316 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -479,7 +479,7 @@ if (inherits(raw_data, "GeneFeatureSet")) { **Output Data:** -- M (log ratio of the subject array vs a pseudo-reference, the median of all other arrays) vs. A (average log expression) plot for each array before background correction and normalization +- `MA_plot` (M (log ratio of the subject array vs a pseudo-reference, the median of all other arrays) vs. A (average log expression) plot for each array before background correction and normalization)
@@ -514,7 +514,7 @@ par(original_par) **Output Data:** -- Boxplot of raw expression data for each array before background correction and normalization +- `boxplot` (Boxplot of raw expression data for each array before background correction and normalization)
@@ -647,7 +647,7 @@ MA_plot <- oligo::MAplot( **Output Data:** -- M (log ratio of the subject array vs a pseudo-reference, the median of all other arrays) vs. A (average log expression) plot for each array after background correction and normalization +- `MA_plot` (M (log ratio of the subject array vs a pseudo-reference, the median of all other arrays) vs. A (average log expression) plot for each array after background correction and normalization)
@@ -681,7 +681,7 @@ par(original_par) **Output Data:** -- Boxplot of expression data for each array after background correction and normalization +- `boxplot` (Boxplot of expression data for each array after background correction and normalization)
From 7733b7c674e323c6231a9b0f5e12e93c6ea17d2a Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Mon, 24 Mar 2025 20:53:53 -0700 Subject: [PATCH 23/25] NF_MAAffymetrix: update custom functions in pipeline doc --- .../GL-DPPD-7114-A.md | 741 +++++++++++------- .../workflow_code/bin/Affymetrix.qmd | 52 +- 2 files changed, 496 insertions(+), 297 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index cd8e6316..8f78a433 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -62,8 +62,10 @@ Custom Annotations - [General processing overview with example commands](#general-processing-overview-with-example-commands) - [1. Create Sample RunSheet](#1-create-sample-runsheet) - [2. Load Data](#2-load-data) - - [2a. Load Metadata and Raw Data](#2a-load-metadata-and-raw-data) - - [2b. Load Annotation Metadata](#2b-load-annotation-metadata) + - [2a. Load Libraries and Define Input Parameters](#2a-load-libraries-and-define-input-parameters) + - [2b. Define Custom Functions](#2b-define-custom-functions) + - [2c. Load Metadata and Raw Data](#2c-load-metadata-and-raw-data) + - [2d. Load Annotation Metadata](#2d-load-annotation-metadata) - [3. Raw Data Quality Assessment](#3-raw-data-quality-assessment) - [3a. Density Plot](#3a-density-plot) - [3b. Pseudo Image Plots](#3b-pseudo-image-plots) @@ -165,7 +167,7 @@ dpt-isa-to-runsheet --accession OSD-### \
-### 2a. Load Metadata and Raw Data +### 2a. Load Libraries and Define Input Parameters ```R ### Install R packages if not already installed ### @@ -208,81 +210,427 @@ original_par <- par() options(preferRaster=TRUE) # use Raster when possible to avoid antialiasing artifacts in images options(timeout=1000) # ensure enough time for data downloads +``` -# Utility function to improve robustness of function calls -# Used to remedy intermittent internet issues during runtime -retry_with_delay <- function(func, ...) { - max_attempts = 5 - initial_delay = 10 - delay_increase = 30 - attempt <- 1 - current_delay <- initial_delay - while (attempt <= max_attempts) { - result <- tryCatch( - expr = func(...), - error = function(e) e - ) +
- if (!inherits(result, "error")) { - return(result) - } else { - if (attempt < max_attempts) { - message(paste("Retry attempt", attempt, "failed for function with name <", deparse(substitute(func)) ,">. Retrying in", current_delay, "second(s)...")) - Sys.sleep(current_delay) - current_delay <- current_delay + delay_increase +### 2b. Define Custom Functions + +#### retry_with_delay() +
+ utility function to improve robustness of function calls; used to remedy intermittent internet issues during runtime + + ```R + retry_with_delay <- function(func, ...) { + max_attempts = 5 + initial_delay = 10 + delay_increase = 30 + attempt <- 1 + current_delay <- initial_delay + while (attempt <= max_attempts) { + result <- tryCatch( + expr = func(...), + error = function(e) e + ) + + if (!inherits(result, "error")) { + return(result) } else { - stop(paste("Max retry attempts reached. Last error:", result$message)) + if (attempt < max_attempts) { + message(paste("Retry attempt", attempt, "failed for function with name <", deparse(substitute(func)) ,">. Retrying in", current_delay, "second(s)...")) + Sys.sleep(current_delay) + current_delay <- current_delay + delay_increase + } else { + stop(paste("Max retry attempts reached. Last error:", result$message)) + } } + + attempt <- attempt + 1 } + } + ``` + + **Function Parameter Definitions:** + - `func=` - specifies the function to wrap + - `...` - other arguments passed on to the `func` + + **Returns:** the output of the wrapped function +
- attempt <- attempt + 1 +#### all_true() +
+ wraps R base::all() function; overrides default behavior for empty input vector + + ```R + all_true <- function(i_vector) { + if ( length(i_vector) == 0 ) { + stop(paste("Input vector is length zero")) + } + all(i_vector) } -} + ``` -df_rs <- read.csv(runsheet, check.names = FALSE) %>% - dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character + **Function Parameter Definitions:** + - `i_vector=` - a vector of logical values -allTrue <- function(i_vector) { - if ( length(i_vector) == 0 ) { - stop(paste("Input vector is length zero")) + **Returns:** a logical of length 1; `TRUE` if all values are true, `FALSE` otherwise; stops and returns an error if input vector is empty +
+ +#### runsheet_paths_are_URIs() +
+ tests if paths provided in runsheet dataframe are URIs + + ```R + runsheet_paths_are_URIs <- function(df_runsheet) { + all_true(stringr::str_starts(df_runsheet$`Array Data File Path`, "https")) } - all(i_vector) -} + ``` -# Define paths to raw data files -runsheetPathsAreURIs <- function(df_runsheet) { - allTrue(stringr::str_starts(df_runsheet$`Array Data File Path`, "https")) -} + **Custom Functions Used:** + - [all_true()](#all_true) + + **Function Parameter Definitions:** + - `df_runsheet=` - a dataframe containing the sample runsheet information + + **Returns:** a logical of length 1; `TRUE` if all values in the `Array Data File Path` of the runsheet start with "https", `FALSE` otherwise; stops and returns an error if input vector is empty +
+ +#### download_files_from_runsheet() +
+ downloads the raw data files + + ```R + download_files_from_runsheet <- function(df_runsheet) { + urls <- df_runsheet$`Array Data File Path` # source location of each raw data file + destinationFiles <- df_runsheet$`Array Data File Name` # local file name each download is written to + + mapply(function(url, destinationFile) { # walk the URLs and destination names pairwise + print(paste0("Downloading from '", url, "' TO '", destinationFile, "'")) + if ( file.exists(destinationFile ) ) { # reuse a file that is already present instead of downloading it again + warning(paste( "Using Existing File:", destinationFile )) + } else { + download.file(url, destinationFile) + } + }, urls, destinationFiles) + + destinationFiles # Return these paths + } + ``` + + **Function Parameter Definitions:** + - `df_runsheet=` - a dataframe containing the sample runsheet information + + **Returns:** a list of filenames that were downloaded; same as the `Array Data File Name` in the sample runsheet +
+ +#### fetch_organism_specific_annotation_table() +
+ determines the organism specific annotation file to use based on the provided organism name + + ```R + fetch_organism_specific_annotation_table <- function(organism, annotation_table_link) { + # Reads the GeneLab annotations table from annotation_table_link to find the organism specific annotation file path and ensembl version + # Raises an exception if the organism does not have an associated annotation file or ensembl version yet + + all_organism_table <- read.csv(annotation_table_link) + + annotation_table <- all_organism_table %>% dplyr::filter(species == organism) # keep only rows whose 'species' equals the requested organism + + # Guard clause: Ensure annotation_table populated + # Else: raise exception for unsupported organism + if (nrow(annotation_table) == 0 || annotation_table$genelab_annots_link == "" || is.na(annotation_table$ensemblVersion)) { # NOTE(review): '||' needs length-1 operands (an error in R >= 4.3 otherwise) - assumes at most one row matches; confirm 'species' values are unique + stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: {annotation_table_link}. Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row and a version number in the 'ensemblVersion' column.")) + } + + return(annotation_table) + } + ``` + + **Function Parameter Definitions:** + - `organism=` - a string containing the name of the organism (as found in the species column of the GeneLab annotation table) + - `annotation_table_link=` - a string specifying the URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) + + **Returns:** a dataframe containing all rows in the GeneLab annotations file that match the specified organism +
+#### shortened_organism_name() +
+ shortens organism names, for example 'Homo Sapiens' to 'hsapiens' -# Download raw data files -downloadFilesFromRunsheet <- function(df_runsheet) { - urls <- df_runsheet$`Array Data File Path` - destinationFiles <- df_runsheet$`Array Data File Name` + ```R + shortened_organism_name <- function(long_name) { + tokens <- long_name %>% stringr::str_split(" ", simplify = TRUE) + genus_name <- tokens[1] - mapply(function(url, destinationFile) { - print(paste0("Downloading from '", url, "' TO '", destinationFile, "'")) - if ( file.exists(destinationFile ) ) { - warning(paste( "Using Existing File:", destinationFile )) + species_name <- tokens[2] + + short_name <- stringr::str_to_lower(paste0(substr(genus_name, start = 1, stop = 1), species_name)) + + return(short_name) + } + ``` + + **Function Parameter Definitions:** + - `long_name=` - a string containing the long name of the organism + + **Returns:** a string containing the short name of the organism +
+ +#### get_biomart_attribute() +
+ retrieves resolved biomart attribute source from runsheet dataframe + + ```R + get_biomart_attribute <- function(df_rs) { + # check if runsheet has 'biomart_attribute' column + if ( !is.null(df_rs$`biomart_attribute`) ) { + print("Using attribute name sourced from runsheet") + # Format according to biomart needs + formatted_value <- unique(df_rs$`biomart_attribute`) %>% + stringr::str_replace_all(" ","_") %>% # Replace all spaces with underscore + stringr::str_to_lower() # Lower casing only + return(formatted_value) } else { - download.file(url, destinationFile) + stop("ERROR: Could not find 'biomart_attribute' in runsheet") } - }, urls, destinationFiles) + } + ``` - destinationFiles # Return these paths -} + **Function Parameter Definitions:** + - `df_rs=` - a dataframe containing the sample runsheet information + + **Returns:** a string containing the formatted value from the `biomart_attribute` column of the runsheet, with all spaces converted to underscores and uppercase converted to lowercase; if no `biomart_attribute` exists in the runsheet, stop and return an error +
+ +#### get_ensembl_genomes_mappings_from_ftp() +
+ obtains mapping table directly from ftp; useful when biomart live service no longer exists for desired version + + ```R + get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_portal, ensembl_genomes_version, biomart_attribute) { + request_url <- glue::glue("https://ftp.ebi.ac.uk/ensemblgenomes/pub/{ensembl_genomes_portal}/release-{ensembl_genomes_version}/mysql/{ensembl_genomes_portal}_mart_{ensembl_genomes_version}/{organism}_eg_gene__efg_{biomart_attribute}__dm.txt.gz") + + print(glue::glue("Mappings file URL: {request_url}")) + + # Create a temporary file name + temp_file <- tempfile(fileext = ".gz") + + # Download the gzipped table file using the download.file function + download.file(url = request_url, destfile = temp_file, method = "libcurl") # The 'libcurl' method is used to fetch the https URL built above + + # Uncompress the file + uncompressed_temp_file <- tempfile() + gzcon <- gzfile(temp_file, "rt") # NOTE: this local name shadows base::gzcon() inside the function + content <- readLines(gzcon) + writeLines(content, uncompressed_temp_file) + close(gzcon) + + + # Load the data into a dataframe + mapping <- read.table(uncompressed_temp_file, # Read the uncompressed file + # Name the three columns: MAPID, ensembl_gene_id, and the value of biomart_attribute (the probeset ID column) + col.names = c("MAPID", "ensembl_gene_id", biomart_attribute), + header = FALSE, # No header in original table + sep = "\t") # Tab separated + + # Clean up temporary files + unlink(temp_file) + unlink(uncompressed_temp_file) + + return(mapping) + } + ``` + + **Function Parameter Definitions:** + - `organism=` - a string containing the name of the organism (formatted using `shortened_organism_name()`) + - `ensembl_genomes_portal=` - a string containing the name of the genomes portal, for example 'plants' + - `ensembl_genomes_version=` - a string containing the version of Ensembl to use + - `biomart_attribute=` - a string containing the biomart attribute (formatted using `get_biomart_attribute()`) + + **Returns:** a dataframe containing the mapping between Ensembl ID and probeset ID, as obtained via FTP +
+ +#### list_to_unique_piped_string() +
+ converts character vector into string denoting unique elements separated by '|' characters + + ```R + list_to_unique_piped_string <- function(str_list) { + #! Convert vector of multi-mapped genes to string separated by '|' characters + #! e.g. c("GO1","GO2","GO2","GO3") -> "GO1|GO2|GO3" + return(paste(unique(str_list), collapse = "|")) # join directly with '|' so that elements which themselves contain ", " (e.g. gene descriptions) are not split apart + } + ``` + + **Function Parameter Definitions:** + - `str_list=` - vector of character elements + + **Returns:** a string containing the unique elements from `str_list` concatenated together, separated by '|' characters +
+ +#### runsheet_to_design_matrix() +
+ loads the GeneLab runsheet and builds the design matrix, group mappings, and contrasts from it + + ```R + runsheet_to_design_matrix <- function(runsheet_path) { + # Pull all factors for each sample in the study from the runsheet at runsheet_path (created in Step 1) + df <- read.csv(runsheet_path, check.names = FALSE) %>% + dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character + factors = as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)]) # get only Factor Value columns + colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_") + + # Load metadata from runsheet csv file + compare_csv = data.frame(sample_id = df[,c("Sample Name")], factors) + + # Create data frame containing all samples and respective factors + study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]]) + colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]] + rownames(study) <- compare_csv[,1] + + # Format groups and indicate the group that each sample belongs to + if (dim(study)[2] >= 2){ + group<-apply(study,1,paste,collapse = " & ") # concatenate multiple factors into one condition per sample + } else{ + group<-study[,1] + } + group_names <- paste0("(",group,")") # human readable group names + group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group))) # group naming compatible with R models, this maintains the default behaviour of make.names with the exception that 'X' is never prepended to group names + names(group) <- group_names + + # Format contrasts table, defining pairwise comparisons for all groups + contrast.names <- combn(levels(factor(names(group))),2) # generate matrix of pairwise group combinations for comparison + contrasts <- apply(contrast.names, MARGIN=2, function(col) sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", stringr::str_sub(col, 2, -2))))) + contrast.names <- c(paste(contrast.names[1,],contrast.names[2,],sep = "v"),paste(contrast.names[2,],contrast.names[1,],sep 
= "v")) # format combinations for output table files names + contrasts <- cbind(contrasts,contrasts[c(2,1),]) + colnames(contrasts) <- contrast.names + sampleTable <- data.frame(condition=factor(group)) + rownames(sampleTable) <- df[,c("Sample Name")] + + condition <- sampleTable[,'condition'] + names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names)) + + design <- model.matrix(~ 0 + condition) + design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample Name")], group = group_names) ), contrasts = contrasts ) + return(design_data) + } + ``` + + **Function Parameter Definitions:** + - `runsheet_path=` - a string containing the path to the runsheet generated in [Step 1](#1-create-sample-runsheet) + + **Returns:** a list of R objects containing the sample information and metadata + - `design_data$matrix` - a design (or model) matrix describing the conditions in the dataset + - `design_data$mapping` - a dataframe mapping the human-readable group names to the names of the conditions modified for use in R + - `design_data$groups` - a dataframe listing each sample name together with its human-readable group name + - `design_data$contrasts` - a matrix of all pairwise comparisons of the groups +
+ +#### lm_fit_pairwise() +
+ performs all pairwise comparisons using limma::lmFit() + + ```R + lm_fit_pairwise <- function(norm_data, design) { + # Approach based on limma manual section 17.4 (version 3.52.4) + fit <- limma::lmFit(norm_data, design) + + # Create Contrast Model + fit.groups <- colnames(fit$design)[which(fit$assign == 1)] + combos <- combn(fit.groups,2) # every unordered pair of group columns + contrasts<-c(paste(combos[1,],combos[2,],sep = "-"),paste(combos[2,],combos[1,],sep = "-")) # format combinations for limma::makeContrasts; each pair is listed in both directions (A-B and B-A) + cont.matrix <- limma::makeContrasts(contrasts=contrasts,levels=design) + contrast.fit <- limma::contrasts.fit(fit, cont.matrix) + + contrast.fit <- limma::eBayes(contrast.fit,trend=TRUE,robust=TRUE) # run limma::eBayes on the contrast fit with the trend and robust options enabled + return(contrast.fit) + } + ``` + + **Function Parameter Definitions:** + - `norm_data=` - an R object containing log-ratios or log-expression values for a series of arrays, with rows corresponding to genes and columns to samples + - `design=` - the design matrix of the microarray experiment, with rows corresponding to samples and columns to coefficients to be estimated + + **Returns:** an R object of class `MArrayLM` +
+ +#### reformat_names() +
+ reformats column names for consistency across DE analyses tables within GeneLab + + ```R + reformat_names <- function(colname, group_name_mapping) { + new_colname <- colname %>% + stringr::str_replace(pattern = "^P.value.adj.condition", replacement = "Adj.p.value_") %>% + stringr::str_replace(pattern = "^P.value.condition", replacement = "P.value_") %>% + stringr::str_replace(pattern = "^Coef.condition", replacement = "Log2fc_") %>% # This is the Log2FC as per: https://rdrr.io/bioc/limma/man/writefit.html + stringr::str_replace(pattern = "^t.condition", replacement = "T.stat_") %>% + stringr::str_replace(pattern = ".condition", replacement = "v") + + # remap to group names before make.names was applied; longest safe names are replaced first so a shorter name cannot match inside a longer one + unique_group_name_mapping <- unique(group_name_mapping) %>% dplyr::arrange(-nchar(safe_name)) + for ( i in seq_len(nrow(unique_group_name_mapping)) ) { # seq_len() gives an empty loop for a zero-row mapping (seq() would yield c(1, 0)) + safe_name <- unique_group_name_mapping[i,]$safe_name + original_name <- unique_group_name_mapping[i,]$original_name + new_colname <- new_colname %>% stringr::str_replace(pattern = stringr::fixed(safe_name), replacement = original_name) + } + + return(new_colname) + } + ``` + + **Function Parameter Definitions:** + - `colname=` - a character vector containing the column names to reformat + - `group_name_mapping=` - a dataframe mapping the original human-readable group names to the R modified safe names + + **Returns:** a character vector containing the formatted column names +
+ +#### generate_prefixed_column_order() +
+ creates a vector of column names based on subject and given prefixes; used for both contrasts and groups column name generation + + ```R + generate_prefixed_column_order <- function(subjects, prefixes) { + # Track order of columns + final_order = c() + + # For each subject (a contrast or a group name) + for (subject in subjects) { + # Generate column names for each prefix and append to final_order; all prefixes of one subject are emitted before moving to the next subject + for (prefix in prefixes) { + final_order <- append(final_order, glue::glue("{prefix}{subject}")) + } + } + return(final_order) + } + ``` + + **Function Parameter Definitions:** + - `subjects` - a character vector containing subject strings to add prefixes to + - `prefixes` - a character vector of prefixes to add to the beginning of each subject string + + **Returns:** a character vector with all possible combinations of prefix + subject +
+ +
+ +### 2c. Load Metadata and Raw Data + +```R +df_rs <- read.csv(runsheet, check.names = FALSE) %>% + dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character -if ( runsheetPathsAreURIs(df_rs) ) { +if ( runsheet_paths_are_URIs(df_rs) ) { print("Determined Raw Data Locations are URIS") - local_paths <- retry_with_delay(downloadFilesFromRunsheet, df_rs) + local_paths <- retry_with_delay(download_files_from_runsheet, df_rs) } else { print("Or Determined Raw Data Locations are local paths") local_paths <- df_rs$`Array Data File Path` } - # uncompress files if needed -if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) { +if ( all_true(stringr::str_ends(local_paths, ".gz")) ) { print("Determined these files are gzip compressed... uncompressing now") # This does the uncompression lapply(local_paths, R.utils::gunzip, remove = FALSE, overwrite = TRUE) @@ -298,21 +646,26 @@ if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) { df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths` = local_paths, check.names = FALSE) - # Load raw data into R object # Retry with delay here to accomodate oligo's automatic loading of annotation packages and occasional internet related failures to load raw_data <- retry_with_delay( oligo::read.celfiles, - df_local_paths$`Local Paths`, - sampleNames = df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames) - ) - + df_local_paths$`Local Paths`, + sampleNames = df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames) + ) # Summarize raw data print(paste0("Number of Arrays: ", dim(raw_data)[2])) print(paste0("Number of Probes: ", dim(raw_data)[1])) ``` +**Custom Functions Used:** + +- [retry_with_delay()](#retry_with_delay) +- [all_true()](#all_true) +- [runsheet_paths_are_URIs()](#runsheet_paths_are_uris) +- 
[download_files_from_runsheet()](#download_files_from_runsheet) + **Input Data:** - `runsheet` (Path to runsheet, output from [Step 1](#1-create-sample-runsheet)) @@ -326,38 +679,26 @@ print(paste0("Number of Probes: ", dim(raw_data)[1]))
-### 2b. Load Annotation Metadata +### 2d. Load Annotation Metadata ```R # If using custom annotation, local_annotation_dir is path to directory containing annotation file and annotation_config_path is path/url to config file local_annotation_dir <- NULL # annotation_config_path <- NULL # -## Determines the organism specific annotation file to use based on the organism in the runsheet -fetch_organism_specific_annotation_table <- function(organism) { - # Uses the latest GeneLab annotations table to find the organism specific annotation file path and ensembl version - # Raises an exception if the organism does not have an associated annotation file or ensembl version yet - - annotation_table_link <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv" - all_organism_table <- read.csv(annotation_table_link) - - annotation_table <- all_organism_table %>% dplyr::filter(species == organism) - - # Guard clause: Ensure annotation_table populated - # Else: raise exception for unsupported organism - if (nrow(annotation_table) == 0 || annotation_table$genelab_annots_link == "" || is.na(annotation_table$ensemblVersion)) { - stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: {annotation_table_link}. 
Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row and a version number in the 'ensemblVersion' column.")) - } - - return(annotation_table) -} +annotation_table_link <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv" -annotation_table <- retry_with_delay(fetch_organism_specific_annotation_table, unique(df_rs$organism)) +annotation_table <- retry_with_delay(fetch_organism_specific_annotation_table, unique(df_rs$organism), annotation_table_link) annotation_file_path <- annotation_table$genelab_annots_link ensembl_version <- as.character(annotation_table$ensemblVersion) ``` +**Custom Functions Used:** + +- [retry_with_delay()](#retry_with_delay) +- [fetch_organism_specific_annotation_table()](#fetch_organism_specific_annotation_table) + **Input Data:** - `local_annotation_dir` (Path to local annotation directory if using custom annotations, see [Step 8a](#8a-get-probeset-annotations)) @@ -417,7 +758,7 @@ par(original_par) **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above) **Output Data:** @@ -438,7 +779,7 @@ for ( i in seq_along(1:ncol(raw_data))) { **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above) **Output Data:** @@ -475,7 +816,7 @@ if (inherits(raw_data, "GeneFeatureSet")) { **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above) **Output Data:** @@ -510,7 +851,7 @@ par(original_par) 
**Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above) **Output Data:** @@ -528,7 +869,7 @@ background_corrected_data <- raw_data %>% oligo::backgroundCorrect(method="rma") **Input Data:** -- `raw_data` (raw data R object created in [Step 2a](#2a-load-metadata-and-raw-data) above) +- `raw_data` (raw data R object created in [Step 2c](#2c-load-metadata-and-raw-data) above) **Output Data:** @@ -691,9 +1032,9 @@ par(original_par) ```R probeset_level_data <- oligo::rma(norm_data, - normalize=FALSE, - background=FALSE - ) + normalize=FALSE, + background=FALSE + ) # Summarize background-corrected and normalized data print("Summarized Probeset Level Data Below") @@ -721,87 +1062,16 @@ print(paste0("Number of Probesets: ", dim(unique(oligo::getProbeInfo(probeset_le ### 8a. Get Probeset Annotations ```R -shortenedOrganismName <- function(long_name) { - #' Convert organism names like 'Homo Sapiens' into 'hsapiens' - tokens <- long_name %>% stringr::str_split(" ", simplify = TRUE) - genus_name <- tokens[1] - - species_name <- tokens[2] - - short_name <- stringr::str_to_lower(paste0(substr(genus_name, start = 1, stop = 1), species_name)) - - return(short_name) -} - -getBioMartAttribute <- function(df_rs) { - #' Returns resolved biomart attribute source from runsheet - - # check if runsheet has 'biomart_attribute' column - if ( !is.null(df_rs$`biomart_attribute`) ) { - print("Using attribute name sourced from runsheet") - # Format according to biomart needs - formatted_value <- unique(df_rs$`biomart_attribute`) %>% - stringr::str_replace_all(" ","_") %>% # Replace all spaces with underscore - stringr::str_to_lower() # Lower casing only - return(formatted_value) - } else { - stop("ERROR: Could not find 'biomart_attribute' in runsheet") - } -} - -get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_portal, 
ensembl_genomes_version, biomart_attribute) { - #' Obtain mapping table directly from ftp. Useful when biomart live service no longer exists for desired version - - request_url <- glue::glue("https://ftp.ebi.ac.uk/ensemblgenomes/pub/{ensembl_genomes_portal}/release-{ensembl_genomes_version}/mysql/{ensembl_genomes_portal}_mart_{ensembl_genomes_version}/{organism}_eg_gene__efg_{biomart_attribute}__dm.txt.gz") - - print(glue::glue("Mappings file URL: {request_url}")) - - # Create a temporary file name - temp_file <- tempfile(fileext = ".gz") - - # Download the gzipped table file using the download.file function - download.file(url = request_url, destfile = temp_file, method = "libcurl") # Use 'libcurl' to support ftps - - # Uncompress the file - uncompressed_temp_file <- tempfile() - gzcon <- gzfile(temp_file, "rt") - content <- readLines(gzcon) - writeLines(content, uncompressed_temp_file) - close(gzcon) - - - # Load the data into a dataframe - mapping <- read.table(uncompressed_temp_file, # Read the uncompressed file - # Add column names as follows: MAPID, TAIR, PROBESETID - col.names = c("MAPID", "ensembl_gene_id", biomart_attribute), - header = FALSE, # No header in original table - sep = "\t") # Tab separated - - # Clean up temporary files - unlink(temp_file) - unlink(uncompressed_temp_file) - - return(mapping) -} - -# Convert list of multi-mapped genes to string -listToUniquePipedString <- function(str_list) { - #! convert lists into strings denoting unique elements separated by '|' characters - #! e.g. 
c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3" - return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|")) -} - - -organism <- shortenedOrganismName(unique(df_rs$organism)) +organism <- shortened_organism_name(unique(df_rs$organism)) annot_key <- ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL') if (organism %in% c("athaliana")) { ENSEMBL_VERSION = ensembl_version ensembl_genomes_portal = "plants" print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ENSEMBL_VERSION}")) - expected_attribute_name <- getBioMartAttribute(df_rs) + expected_attribute_name <- get_biomart_attribute(df_rs) df_mapping <- retry_with_delay( - get_ensembl_genomes_mappings_from_ftp, + get_ensembl_genomes_mappings_from_ftp, organism = organism, ensembl_genomes_portal = ensembl_genomes_portal, ensembl_genomes_version = ENSEMBL_VERSION, @@ -816,10 +1086,10 @@ if (organism %in% c("athaliana")) { } else { # Use biomart from main Ensembl website which archives keep each release on the live service # locate dataset - expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl") + expected_dataset_name <- shortened_organism_name(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl") print(paste0("Expected dataset name: '", expected_dataset_name, "'")) - expected_attribute_name <- getBioMartAttribute(df_rs) + expected_attribute_name <- get_biomart_attribute(df_rs) print(paste0("Expected attribute name: '", expected_attribute_name, "'")) # Specify Ensembl version used in current GeneLab reference annotations @@ -961,7 +1231,7 @@ if (use_custom_annot) { dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type dplyr::group_by(!!sym(expected_attribute_name)) %>% dplyr::summarise( - ENSEMBL = 
listToUniquePipedString(ensembl_gene_id) + ENSEMBL = list_to_unique_piped_string(ensembl_gene_id) ) %>% # Count number of ensembl IDS mapped dplyr::mutate( @@ -981,15 +1251,23 @@ probeset_expression_matrix.gene_mapped <- probeset_expression_matrix %>% dplyr::mutate( gene_mapping_source := unique(unique_probe_ids$gene_mapping_source) ) ``` +**Custom Functions Used:** + +- [retry_with_delay()](#retry_with_delay) +- [shortened_organism_name()](#shortened_organism_name) +- [get_biomart_attribute()](#get_biomart_attribute) +- [get_ensembl_genomes_mappings_from_ftp()](#get_ensembl_genomes_mappings_from_ftp) +- [list_to_unique_piped_string()](#list_to_unique_piped_string) + **Input Data:** - `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) - `df_rs$biomart_attribute` (array design biomart identifier specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) -- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2b](#2b-load-annotation-metadata)) +- `annotation_file_path` (reference organism annotation file url indicated in the 'genelab_annots_link' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2d](#2d-load-annotation-metadata)) +- `ensembl_version` (reference organism Ensembl version indicated in the 'ensemblVersion' column of the GeneLab Annotations file provided in `annotation_table_link`, output from [Step 2d](#2d-load-annotation-metadata)) - `annot_key` (keytype to join annotation table and microarray probes, dependent on organism, e.g. 
mus musculus uses 'ENSEMBL') -- `local_annotation_dir` (path to local annotation directory if using custom annotations, output from [Step 2b](#2b-load-annotation-metadata)) -- `annotation_config_path` (URL or path to annotation config file if using custom annotations, output from [Step 2b](#2b-load-annotation-metadata)) +- `local_annotation_dir` (path to local annotation directory if using custom annotations, output from [Step 2d](#2d-load-annotation-metadata)) +- `annotation_config_path` (URL or path to annotation config file if using custom annotations, output from [Step 2d](#2d-load-annotation-metadata)) > Note: See [Affymetrix_array_annotations.csv](../Array_Annotations/Affymetrix_array_annotations.csv) for the latest config file used at GeneLab. This file can also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md). @@ -1118,7 +1396,7 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm **Input Data:** -- `df_rs` (R dataframe containing information from the runsheet, output from [Step 2a](#2a-load-metadata-and-raw-data)) +- `df_rs` (R dataframe containing information from the runsheet, output from [Step 2c](#2c-load-metadata-and-raw-data)) - `annot_key` (keytype to join annotation table and microarray probes, dependent on organism, e.g. 
mus musculus uses 'ENSEMBL', defined in [Step 8a](#8a-get-probeset-annotations)) - `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) - `background_corrected_data` (R object containing background-corrected microarray data, output from [Step 4](#4-background-correction)) @@ -1140,51 +1418,8 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm ### 9a. Generate Design Matrix ```R -# Pull all factors for each sample in the study from the runsheet created in Step 1 -runsheetToDesignMatrix <- function(runsheet_path) { - df <- read.csv(runsheet, check.names = FALSE) %>% - dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character # get only Factor Value columns - factors = as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)]) - colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_") - - # Load metadata from runsheet csv file - compare_csv = data.frame(sample_id = df[,c("Sample Name")], factors) - - # Create data frame containing all samples and respective factors - study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]]) - colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]] - rownames(study) <- compare_csv[,1] - - # Format groups and indicate the group that each sample belongs to - if (dim(study)[2] >= 2){ - group<-apply(study,1,paste,collapse = " & ") # concatenate multiple factors into one condition per sample - } else{ - group<-study[,1] - } - group_names <- paste0("(",group,")",sep = "") # human readable group names - group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group))) # group naming compatible with R 
models, this maintains the default behaviour of make.names with the exception that 'X' is never prepended to group namesnames(group) <- group_names - names(group) <- group_names - - # Format contrasts table, defining pairwise comparisons for all groups - contrast.names <- combn(levels(factor(names(group))),2) # generate matrix of pairwise group combinations for comparison - contrasts <- apply(contrast.names, MARGIN=2, function(col) sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", stringr::str_sub(col, 2, -2))))) - contrast.names <- c(paste(contrast.names[1,],contrast.names[2,],sep = "v"),paste(contrast.names[2,],contrast.names[1,],sep = "v")) # format combinations for output table files names - contrasts <- cbind(contrasts,contrasts[c(2,1),]) - colnames(contrasts) <- contrast.names - sampleTable <- data.frame(condition=factor(group)) - rownames(sampleTable) <- df[,c("Sample Name")] - - condition <- sampleTable[,'condition'] - names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names)) - - design <- model.matrix(~ 0 + condition) - design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample Name")], group = group_names) ), contrasts = contrasts ) - return(design_data) -} - - # Loading metadata from runsheet csv file -design_data <- runsheetToDesignMatrix(runsheet) +design_data <- runsheet_to_design_matrix(runsheet) design <- design_data$matrix # Write SampleTable.csv and contrasts.csv file @@ -1192,12 +1427,21 @@ write.csv(design_data$groups, file.path(DIR_DGE, "SampleTable_GLmicroarray.csv") write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv")) ``` +**Custom Functions Used:** + +- [runsheet_to_design_matrix()](#runsheet_to_design_matrix) + **Input Data:** - `runsheet` (path to runsheet, output from [Step 1](#1-create-sample-runsheet)) **Output Data:** +- `design_data` (a list of R objects containing the sample information and metadata + 
- `design_data$matrix` - the limma study design matrix, indicating the group that each sample belongs to + - `design_data$mapping` - a dataframe of conditions and group names + - `design_data$groups` - a dataframe of samples and group names + - `design_data$contrasts` - a matrix of all pairwise comparisons of the groups) - `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to) - **SampleTable_GLmicroarray.csv** (table containing samples and their respective groups) - **contrasts_GLmicroarray.csv** (table containing all pairwise comparisons) @@ -1207,26 +1451,8 @@ write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv" ### 9b. Perform Individual Probeset Level DE ```R -lmFitPairwise <- function(norm_data, design) { - #' Perform all pairwise comparisons - - #' Approach based on limma manual section 17.4 (version 3.52.4) - - fit <- limma::lmFit(norm_data, design) - - # Create Contrast Model - fit.groups <- colnames(fit$design)[which(fit$assign == 1)] - combos <- combn(fit.groups,2) - contrasts<-c(paste(combos[1,],combos[2,],sep = "-"),paste(combos[2,],combos[1,],sep = "-")) # format combinations for limma:makeContrasts - cont.matrix <- limma::makeContrasts(contrasts=contrasts,levels=design) - contrast.fit <- limma::contrasts.fit(fit, cont.matrix) - - contrast.fit <- limma::eBayes(contrast.fit,trend=TRUE,robust=TRUE) - return(contrast.fit) -} - # Calculate results -res <- lmFitPairwise(probeset_level_data, design) +res <- lm_fit_pairwise(probeset_level_data, design) # Print DE table, without filtering limma::write.fit(res, adjust = 'BH', @@ -1236,15 +1462,18 @@ limma::write.fit(res, adjust = 'BH', sep = ",") ``` +**Custom Functions Used:** + +- [lm_fit_pairwise()](#lm_fit_pairwise) + **Input Data:** -- `norm_data` (R object containing background-corrected and normalized microarray data, output from [Step 5](#5-between-array-normalization)) -- `design` (R object containing the limma study 
design matrix, indicating the group that each sample belongs to, output from [Step 9a](#9a-generate-design-matrix) above) - `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization)) +- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to, output from [Step 9a](#9a-generate-design-matrix) above) **Output Data:** -- INTERIM.csv (Statistical values from individual probeset level DE analysis, including: +- INTERIM.csv (statistical values from individual probeset level DE analysis, including: - Log2fc between all pairwise comparisons - T statistic for all pairwise comparison tests - P value for all pairwise comparison tests @@ -1264,26 +1493,6 @@ df_interim <- read.csv("INTERIM.csv") df_interim <- df_interim %>% dplyr::bind_cols(probeset_expression_matrix.gene_mapped) -# Reformat column names -reformat_names <- function(colname, group_name_mapping) { - new_colname <- colname %>% - stringr::str_replace(pattern = "^P.value.adj.condition", replacement = "Adj.p.value_") %>% - stringr::str_replace(pattern = "^P.value.condition", replacement = "P.value_") %>% - stringr::str_replace(pattern = "^Coef.condition", replacement = "Log2fc_") %>% # This is the Log2FC as per: https://rdrr.io/bioc/limma/man/writefit.html - stringr::str_replace(pattern = "^t.condition", replacement = "T.stat_") %>% - stringr::str_replace(pattern = ".condition", replacement = "v") - - # remap to group names before make.names was applied - unique_group_name_mapping <- unique(group_name_mapping) %>% arrange(-nchar(safe_name)) - for ( i in seq(nrow(unique_group_name_mapping)) ) { - safe_name <- unique_group_name_mapping[i,]$safe_name - original_name <- unique_group_name_mapping[i,]$original_name - new_colname <- new_colname %>% stringr::str_replace(pattern = stringr::fixed(safe_name), replacement = original_name) - } - - 
return(new_colname) -} - df_interim <- df_interim %>% dplyr::rename_with(reformat_names, .cols = matches('\\.condition'), group_name_mapping = design_data$mapping) @@ -1340,22 +1549,6 @@ PROBE_INFO_COLUMN_ORDER = c( "gene_mapping_source" ) -generate_prefixed_column_order <- function(subjects, prefixes) { - #' Return a vector of columns based on subject and given prefixes - #' Used for both contrasts and groups column name generation - - # Track order of columns - final_order = c() - - # For each contrast - for (subject in subjects) { - # Generate column names for each prefix and append to final_order - for (prefix in prefixes) { - final_order <- append(final_order, glue::glue("{prefix}{subject}")) - } - } - return(final_order) -} STAT_COLUMNS_ORDER <- generate_prefixed_column_order( subjects = colnames(design_data$contrasts), prefixes = c( @@ -1404,9 +1597,15 @@ df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER)) write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.csv"), row.names = FALSE) ``` +**Custom Functions Used:** + +- [reformat_names()](#reformat_names) +- [generate_prefixed_column_order()](#generate_prefixed_column_order) + **Input Data:** -- INTERIM.csv (Statistical values from individual probeset level DE analysis, output from [Step 9b](#9b-perform-individual-probeset-level-de) above) +- `design_data` (a list of R objects containing the sample information and metadata, output from [Step 9a](#9a-generate-design-matrix) above) +- INTERIM.csv (statistical values from individual probeset level DE analysis, output from [Step 9b](#9b-perform-individual-probeset-level-de) above) - `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 
8a](#8a-get-probeset-annotations) above) **Output Data:** diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd index bfa00ee7..2f024992 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd @@ -120,7 +120,7 @@ DT::datatable(df_rs) print("Here are the expected comparison groups") # NON_DPPD:END print("Loading Raw Data...") # NON_DPPD -allTrue <- function(i_vector) { +all_true <- function(i_vector) { if ( length(i_vector) == 0 ) { stop(paste("Input vector is length zero")) } @@ -128,13 +128,13 @@ allTrue <- function(i_vector) { } # Define paths to raw data files -runsheetPathsAreURIs <- function(df_runsheet) { - allTrue(stringr::str_starts(df_runsheet$`Array Data File Path`, "https")) +runsheet_paths_are_URIs <- function(df_runsheet) { + all_true(stringr::str_starts(df_runsheet$`Array Data File Path`, "https")) } # Download raw data files -downloadFilesFromRunsheet <- function(df_runsheet) { +download_files_from_runsheet <- function(df_runsheet) { urls <- df_runsheet$`Array Data File Path` destinationFiles <- df_runsheet$`Array Data File Name` @@ -151,9 +151,9 @@ downloadFilesFromRunsheet <- function(df_runsheet) { } -if ( runsheetPathsAreURIs(df_rs) ) { +if ( runsheet_paths_are_URIs(df_rs) ) { print("Determined Raw Data Locations are URIS") - local_paths <- retry_with_delay(downloadFilesFromRunsheet, df_rs) + local_paths <- retry_with_delay(download_files_from_runsheet, df_rs) } else { print("Or Determined Raw Data Locations are local paths") local_paths <- df_rs$`Array Data File Path` @@ -161,7 +161,7 @@ if ( runsheetPathsAreURIs(df_rs) ) { # uncompress files if needed -if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) { +if ( 
all_true(stringr::str_ends(local_paths, ".gz")) ) { print("Determined these files are gzip compressed... uncompressing now") # This does the uncompression lapply(local_paths, R.utils::gunzip, remove = FALSE, overwrite = TRUE) @@ -186,9 +186,9 @@ DT::datatable(df_local_paths) # Retry with delay here to accomodate oligo's automatic loading of annotation packages and occasional internet related failures to load raw_data <- retry_with_delay( oligo::read.celfiles, - df_local_paths$`Local Paths`, - sampleNames = df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames) - ) + df_local_paths$`Local Paths`, + sampleNames = df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames) + ) print(str(raw_data)) @@ -470,9 +470,9 @@ par(original_par) #| message: false # Call RMA but skip normalize and background correction since those have already been applied probeset_level_data <- oligo::rma(norm_data, - normalize=FALSE, - background=FALSE - ) + normalize=FALSE, + background=FALSE + ) # Summarize background-corrected and normalized data print("Summarized Probeset Level Data Below") # NON_DPPD @@ -494,7 +494,7 @@ DT::datatable(head(raw_data$genes, n = 20), caption = "First 20 rows of raw data ``` {r retrieve-probeset-annotations} #| message: false -shortenedOrganismName <- function(long_name) { +shortened_organism_name <- function(long_name) { #' Convert organism names like 'Homo Sapiens' into 'hsapiens' tokens <- long_name %>% stringr::str_split(" ", simplify = TRUE) genus_name <- tokens[1] @@ -506,7 +506,7 @@ shortenedOrganismName <- function(long_name) { return(short_name) } -getBioMartAttribute <- function(df_rs) { +get_biomart_attribute <- function(df_rs) { #' Returns resolved biomart attribute source from runsheet # NON_DPPD:START #' this either comes from the runsheet or as a fall back, the parameters injected during render @@ -562,23 +562,23 @@ get_ensembl_genomes_mappings_from_ftp <- function(organism, 
ensembl_genomes_port } # Convert list of multi-mapped genes to string -listToUniquePipedString <- function(str_list) { +list_to_unique_piped_string <- function(str_list) { #! convert lists into strings denoting unique elements separated by '|' characters #! e.g. c("GO1","GO2","GO2","G03") -> "GO1|GO2|GO3" return(toString(unique(str_list)) %>% stringr::str_replace_all(pattern = stringr::fixed(", "), replacement = "|")) } -organism <- shortenedOrganismName(unique(df_rs$organism)) +organism <- shortened_organism_name(unique(df_rs$organism)) annot_key <- ifelse(organism %in% c("athaliana"), 'TAIR', 'ENSEMBL') if (organism %in% c("athaliana")) { ENSEMBL_VERSION = ensembl_version ensembl_genomes_portal = "plants" print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ENSEMBL_VERSION}")) - expected_attribute_name <- getBioMartAttribute(df_rs) + expected_attribute_name <- get_biomart_attribute(df_rs) df_mapping <- retry_with_delay( - get_ensembl_genomes_mappings_from_ftp, + get_ensembl_genomes_mappings_from_ftp, organism = organism, ensembl_genomes_portal = ensembl_genomes_portal, ensembl_genomes_version = ENSEMBL_VERSION, @@ -593,11 +593,11 @@ if (organism %in% c("athaliana")) { } else { # Use biomart from main Ensembl website which archives keep each release on the live service # locate dataset - expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl") + expected_dataset_name <- shortened_organism_name(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl") print(paste0("Expected dataset name: '", expected_dataset_name, "'")) message(paste0("Expected dataset name: '", expected_dataset_name, "'")) # NON_DPPD - expected_attribute_name <- getBioMartAttribute(df_rs) + expected_attribute_name <- get_biomart_attribute(df_rs) print(paste0("Expected attribute name: '", expected_attribute_name, "'")) message(paste0("Expected 
attribute name: '", expected_attribute_name, "'")) # NON_DPPD @@ -751,7 +751,7 @@ if (use_custom_annot) { dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type dplyr::group_by(!!sym(expected_attribute_name)) %>% dplyr::summarise( - ENSEMBL = listToUniquePipedString(ensembl_gene_id) + ENSEMBL = list_to_unique_piped_string(ensembl_gene_id) ) %>% # Count number of ensembl IDS mapped dplyr::mutate( @@ -886,7 +886,7 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm #| include: !expr params$run_DE #| eval: !expr params$run_DE -runsheetToDesignMatrix <- function(runsheet_path) { +runsheet_to_design_matrix <- function(runsheet_path) { df <- read.csv(runsheet, check.names = FALSE) %>% dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character # get only Factor Value columns factors = as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)]) @@ -929,7 +929,7 @@ runsheetToDesignMatrix <- function(runsheet_path) { # Loading metadata from runsheet csv file -design_data <- runsheetToDesignMatrix(runsheet) +design_data <- runsheet_to_design_matrix(runsheet) design <- design_data$matrix # Write SampleTable.csv and contrasts.csv file @@ -943,7 +943,7 @@ write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv" #| include: !expr params$run_DE #| eval: !expr params$run_DE -lmFitPairwise <- function(norm_data, design) { +lm_fit_pairwise <- function(norm_data, design) { #' Perform all pairwise comparisons #' Approach based on limma manual section 17.4 (version 3.52.4) @@ -962,7 +962,7 @@ lmFitPairwise <- function(norm_data, design) { } # Calculate results -res <- lmFitPairwise(probeset_level_data, design) +res <- lm_fit_pairwise(probeset_level_data, design) DT::datatable(limma::topTable(res)) # NON_DPPD # Print DE table, without filtering From 
0d794474c20680206dba90a8083ba76963991d3c Mon Sep 17 00:00:00 2001 From: cyouh95 Date: Tue, 25 Mar 2025 00:15:47 -0700 Subject: [PATCH 24/25] NF_MAAffymetrix: update nextflow version from 23.10.1 to 24.10.5 --- .../NF_MAAffymetrix/workflow_code/nextflow.config | 2 +- Microarray/Affymetrix/Workflow_Documentation/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/nextflow.config b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/nextflow.config index 984eb788..fcd6ebbe 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/nextflow.config +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/nextflow.config @@ -44,7 +44,7 @@ manifest { description = 'Affymetrix Microarray Workflow for Document GL-DPPD-7114-A' mainScript = 'main.nf' defaultBranch = 'main' - nextflowVersion = '>=23.10.1' + nextflowVersion = '>=24.10.5' version = '1.0.5' } diff --git a/Microarray/Affymetrix/Workflow_Documentation/README.md b/Microarray/Affymetrix/Workflow_Documentation/README.md index ae9dbdd6..ce2148b1 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/README.md +++ b/Microarray/Affymetrix/Workflow_Documentation/README.md @@ -8,7 +8,7 @@ GeneLab has wrapped each step of the pipeline into a workflow with validation an |Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version| |:---------------|:---------------------------------------------------------|:---------------| -|*[GL-DPPD-7114-A.md](../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md)|[NF_MAAffymetrix_1.0.5](NF_MAAffymetrix)|23.10.1| +|*[GL-DPPD-7114-A.md](../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md)|[NF_MAAffymetrix_1.0.5](NF_MAAffymetrix)|24.10.5| *Current GeneLab Pipeline/Workflow Implementation From 93d32ee5d8f9f97cee1ec69213dd05ef7baa33e2 Mon Sep 17 00:00:00 2001 From: Barbara Novak 
<19824106+bnovak32@users.noreply.github.com> Date: Wed, 28 May 2025 14:38:52 -0700 Subject: [PATCH 25/25] Update pipeline and annotation README docs - Update pipeline doc header - Add missing bioconductor/tidyverse software info - update bioconductor to correct version throughout document - fix dp_tools command to use plugin and point to workflow code for plugin example - add optional URL/date columns to annotation file readme - update links throughout both documents to point to absolute paths for anything outside of the Affymetrix folder --- .../GL-DPPD-7114-A.md | 55 ++++++++++--------- .../examples/annotations/README.md | 12 +++- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md index 8f78a433..d3e5393a 100644 --- a/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md +++ b/Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114-A.md @@ -6,7 +6,7 @@ --- -**Date:** February XX, 2025 +**Date:** May XX, 2025 **Revision:** -A **Document Number:** GL-DPPD-7114-A @@ -15,15 +15,16 @@ Crystal Han (GeneLab Data Processing Team) **Approved by:** Samrawit Gebre (OSDR Project Manager) -Lauren Sanders (OSDR Project Scientist) +Danielle Lopez (OSDR Deputy Project Manager) +Jonathan Galazka (OSDR Project Scientist) Amanda Saravia-Butler (GeneLab Science Lead) -Barbara Novak (GeneLab Data Processing Lead) +Barbara Novak (GeneLab Data Processing Lead) --- ## Updates from previous version -Updated [Ensembl Reference Files](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) to the following releases: +Updated [Ensembl Reference Files](https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) to the following releases: - 
Animals: Ensembl release 112 - Plants: Ensembl plants release 59 - Bacteria: Ensembl bacteria release 59 @@ -45,6 +46,8 @@ Software Updates: |matrixStats|0.63.0|1.5.0| |dp_tools|1.3.4|1.3.5| |Quarto|1.2.313|1.6.40| +|Bioconductor|3.14|3.20| +|tidyverse|1.3.1|2.0.0| MA Plots @@ -52,7 +55,7 @@ MA Plots Custom Annotations -- Added ability to use custom gene annotations when annotations are not available in Biomart or Ensembl FTP for *Arabidopsis thaliana*, see [Step 8](#8-probeset-annotations) +- Added ability to use custom gene annotations when annotations are not available in Biomart or Ensembl FTP, see [Step 8](#8-probeset-annotations) --- @@ -95,21 +98,21 @@ Custom Annotations |Program|Version|Relevant Links| |:------|:------:|:-------------| |R|4.4.2|[https://www.r-project.org/](https://www.r-project.org/)| +|Bioconductor|3.20|[https://bioconductor.org](https://bioconductor.org)| +|tidyverse|2.0.0|[https://www.tidyverse.org](https://www.tidyverse.org)| |DT|0.33|[https://github.com/rstudio/DT](https://github.com/rstudio/DT)| |dplyr|1.1.4|[https://dplyr.tidyverse.org](https://dplyr.tidyverse.org)| |tibble|3.2.1|[https://tibble.tidyverse.org](https://tibble.tidyverse.org)| |stringr|1.5.1|[https://stringr.tidyverse.org](https://stringr.tidyverse.org)| |R.utils|2.12.3|[https://github.com/HenrikBengtsson/R.utils](https://github.com/HenrikBengtsson/R.utils)| -|oligo|1.70.0|[https://bioconductor.org/packages/3.14/bioc/html/oligo.html](https://bioconductor.org/packages/3.14/bioc/html/oligo.html)| -|limma|3.62.2|[https://bioconductor.org/packages/3.14/bioc/html/limma.html](https://bioconductor.org/packages/3.14/bioc/html/limma.html)| +|oligo|1.70.0|[https://bioconductor.org/packages/3.20/bioc/html/oligo.html](https://bioconductor.org/packages/3.20/bioc/html/oligo.html)| +|limma|3.62.2|[https://bioconductor.org/packages/3.20/bioc/html/limma.html](https://bioconductor.org/packages/3.20/bioc/html/limma.html)| 
|glue|1.8.0|[https://glue.tidyverse.org](https://glue.tidyverse.org)| -|biomaRt|2.62.0|[https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html)| +|biomaRt|2.62.0|[https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html)| |matrixStats|1.5.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)| |statmod|1.5.0|[https://github.com/cran/statmod](https://github.com/cran/statmod)| -|dp_tools|1.3.5|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| -|singularity|3.9|[https://sylabs.io](https://sylabs.io)| +|dp_tools|1.3.5|[https://github.com/torres-alexis/dp_tools](https://github.com/torres-alexis/dp_tools)| |Quarto|1.6.40|[https://quarto.org](https://quarto.org)| - --- # General processing overview with example commands @@ -133,17 +136,15 @@ dpt-get-isa-archive \ ### Parse the metadata from the *ISA.zip file to create a sample runsheet ### -dpt-isa-to-runsheet --accession OSD-### \ - --config-type microarray \ - --config-version Latest \ - --isa-archive *ISA.zip +dpt-isa-to-runsheet --accession OSD-### \ + --plugin-dir /path/to/dp_tools__microarray_plugin \ + --isa-archive *ISA.zip ``` **Parameter Definitions:** - `--accession OSD-###` - OSD accession ID (replace ### with the OSD number being processed), used to retrieve the urls for the ISA archive and raw expression files hosted on the GeneLab Repository -- `--config-type` - Instructs the script to extract the metadata required for `microarray` processing from the ISA archive -- `--config-version` - Specifies the `dp-tools` configuration version to use, a value of `Latest` will specify the most recent version +- `--plugin-dir /path/to/dp_tools__microarray_plugin` - specifies the path to the plugin directory defining the dp-tools configuration for the desired assay type. 
A plugin for the Affymetrix microarray assay is provided in the [Workflow_Documentation](../Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/) folder. - `--isa-archive` - Specifies the *ISA.zip file for the respective GLDS dataset, downloaded in the `dpt-get-isa-archive` command @@ -179,7 +180,7 @@ install.packages("matrixStats") install.packages("statmod") if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") -BiocManager::install(version = "3.14") +BiocManager::install(version = "3.20") BiocManager::install("limma") BiocManager::install("biomaRt") BiocManager::install("oligo") @@ -273,7 +274,7 @@ options(timeout=1000) # ensure enough time for data downloads **Function Parameter Definitions:** - `i_vector=` - a vector of logical values - **Returns:** a logical of length 1; `TRUE` if all values are true, `FALSE` otherwise; stops and returns an error if input vector is empty + **Returns:** a logical vector of length 1; `TRUE` if all values are true, `FALSE` otherwise; stops and returns an error if input vector is empty #### runsheet_paths_are_URIs() @@ -292,7 +293,7 @@ options(timeout=1000) # ensure enough time for data downloads **Function Parameter Definitions:** - `df_runsheet=` - a dataframe containing the sample runsheet information - **Returns:** a logical of length 1; `TRUE` if all values in the `Array Data File Path` of the runsheet start with "https", `FALSE` otherwise; stops and returns an error if input vector is empty + **Returns:** a logical vector of length 1; `TRUE` if all values in the `Array Data File Path` of the runsheet start with "https", `FALSE` otherwise; stops and returns an error if input vector is empty #### download_files_from_runsheet() @@ -348,7 +349,7 @@ options(timeout=1000) # ensure enough time for data downloads **Function Parameter Definitions:** - `organism=` - a string containing the name of the organism (as found in the species column of the GeneLab annotation table) - - 
`annotation_table_link=` - a string specifying the URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) + - `annotation_table_link=` - a string specifying the URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) **Returns:** a dataframe containing all rows in the GeneLab annotations file that match the specified organism @@ -710,7 +711,7 @@ ensembl_version <- as.character(annotation_table$ensemblVersion) > Note: If not using custom annotations, leave `annotation_config_path` as `NULL`. - `df_rs$organism` (organism specified in the runsheet created in [Step 1](#1-create-sample-runsheet)) -- `annotation_table_link` (URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv)) +- `annotation_table_link` (URL or path to latest GeneLab Annotations file, see [GL-DPPD-7110-A_annotations.csv](https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv)) **Output Data:** @@ -1269,14 +1270,14 @@ probeset_expression_matrix.gene_mapped <- probeset_expression_matrix %>% - `local_annotation_dir` (path to local annotation directory if using custom annotations, output from [Step 2d](#2d-load-annotation-metadata)) - `annotation_config_path` (URL or path to annotation config file if using custom annotations, output from [Step 2d](#2d-load-annotation-metadata)) - > Note: See [Affymetrix_array_annotations.csv](../Array_Annotations/Affymetrix_array_annotations.csv) for the latest config file used at 
GeneLab. This file can also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md). + > Note: See [Affymetrix_array_annotations.csv](https://github.com/nasa/GeneLab_Data_Processing/tree/master/Microarray/Affymetrix/Array_Annotations/Affymetrix_array_annotations.csv) for the latest config file used at GeneLab. This file can also be created manually by following the [file specification](../Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md). - `probeset_level_data` (R object containing probeset level expression values after summarization of normalized probeset level data, output from [Step 7](#7-probeset-summarization)) **Output Data:** - `unique_probe_ids` (R object containing probeset ID to gene annotation mappings) -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations) +- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html) or custom annotations)
@@ -1303,7 +1304,7 @@ print(glue::glue("Unique Mapping Count: {slices[['Unique Mapping']]}")) **Input Data:** -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) +- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) **Output Data:** @@ -1398,7 +1399,7 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm - `df_rs` (R dataframe containing information from the runsheet, output from [Step 2c](#2c-load-metadata-and-raw-data)) - `annot_key` (keytype to join annotation table and microarray probes, dependent on organism, e.g. 
mus musculus uses 'ENSEMBL', defined in [Step 8a](#8a-get-probeset-annotations)) -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) +- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) - `background_corrected_data` (R object containing background-corrected microarray data, output from [Step 4](#4-background-correction)) - `norm_data` (R object containing background-corrected and normalized microarray data, output from [Step 5](#5-between-array-normalization)) - `unique_probe_ids` (R object containing probeset ID to gene annotation mappings, output from [Step 8a](#8a-get-probeset-annotations)) @@ -1606,7 +1607,7 @@ write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.c - `design_data` (a list of R objects containing the sample information and metadata, output from [Step 9a](#9a-generate-design-matrix) above) - INTERIM.csv (statistical values from individual probeset level DE analysis, output from [Step 9b](#9b-perform-individual-probeset-level-de) above) -- `probeset_expression_matrix.gene_mapped` (R object containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) +- `probeset_expression_matrix.gene_mapped` (R object 
containing probeset level expression values after summarization of normalized probeset level data combined with gene annotations specified by [Biomart](https://bioconductor.org/packages/3.20/bioc/html/biomaRt.html) or custom annotations, output from [Step 8a](#8a-get-probeset-annotations) above) **Output Data:** diff --git a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md index 59f2dd4a..62131ed3 100644 --- a/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md +++ b/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/examples/annotations/README.md @@ -3,7 +3,8 @@ ## Description * If using custom gene annotations when processing Affymetrix datasets through GeneLab's Affymetrix processing pipeline, a csv config file must be provided as specified below. -* See [Affymetrix_array_annotations.csv](../Array_Annotations/Affymetrix_array_annotations.csv) for the latest config file used at GeneLab. +* See [Affymetrix_array_annotations.csv](https://github.com/nasa/GeneLab_Data_Processing/tree/master/Microarray/Affymetrix/Array_Annotations/Affymetrix_array_annotations.csv) + for the latest config file used at GeneLab. ## Example @@ -18,3 +19,12 @@ | array_design | string | A bioMart attribute identifier denoting the microarray probe/probeset attribute used for annotation mapping. | AFFY E coli Genome 2 0 | | annot_type | string | Used to determine how the custom annotations are parsed before merging to the data. Currently, only the below are supported:
  • `3prime-IVT`: Annotations file is expected to be in the format of the 3' IVT expression analysis arrays annotations by [Thermo Fisher](https://www.thermofisher.com/us/en/home/life-science/microarray-analysis/microarray-data-analysis/genechip-array-annotation-files.html)
  • `custom`: Annotations file is merged as is, expected to have the following columns: `ProbesetID`, `ENTREZID`, `SYMBOL`, `GENENAME`, `ENSEMBL`, `REFSEQ`, `GOSLIM_IDS`, `STRING_id`, `count_gene_mappings`, `gene_mapping_source`
| 3prime-IVT | | annot_filename | string | Name of the custom annotations file. | E_coli_2.na36.annot.csv | + +## Optional columns +If the file was downloaded from a website, provide the download link used and date +downloaded in additional columns after the required columns for traceability. + +| Column Name | Type | Description | Example | +|:------------|:-----|:------------|:--------| +| download_link | string | The URL used to retrieve the annotation file. | https://www.thermofisher.com/order/catalog/product/sec/assets?url=TFS-Assets/LSG/Support-Files/E_coli_2-na36-annot-csv.zip | +| download_date | date string | The date the file was retrieved in YYYY-MM-DD format. | 2024-06-15 |