PNNL-CompBio · sgosline · Jan 30, 2026 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/02_normalize_batchcorrect_omics.R b/02_normalize_batchcorrect_omics.R
diff --git a/02_normalize_harmonize_proteomics.html b/02_normalize_harmonize_proteomics.html
diff --git a/02_run_normalize_omics.Rmd b/02_run_normalize_omics.Rmd
@@ -0,0 +1,150 @@
+---
+title: "run_normalize_omics"
+author: "JJ"
+date: "2025-11-04"
+output: html_document
+---
+
+# Get Helper Scripts
+```{r setup, include=FALSE}
+# Show chunk source in the rendered document by default.
+knitr::opts_chunk$set(echo = TRUE)
+
+# Attach synapser and log in. No credentials are passed explicitly here.
+library(synapser)
+synLogin()
+
+# Small client-like list wrapping the two Synapse calls; this object is what
+# gets handed to run_modality() as `syn` further down.
+syn <- list(
+  get   = synapser::synGet,
+  store = synapser::synStore
+)
+
+# Helper script: per the original note, cNF_helper_code.R defines `meta`
+# and `pcols`, both of which are used by the run_modality() calls below.
+source("cNF_helper_code.R")
+
+# Pipeline script; presumably provides run_modality() -- confirm.
+source("02_normalize_batchcorrect_omics.R")
+
+```
+
+
+# Run batch correction / normalization across phospho, global, and rna samples.
+
+```{r}
+# ---------------------------------------------------------------------------
+# run_modality() — quick reference for args & expected batch structure
+# ---------------------------------------------------------------------------
+# Args:
+#
+# - modality  : "global" | "phospho" | "rna" (case-insensitive).
+# - batches   : list of cohort specs; each has:
+#               syn_id, cohort, fname_aliquot_index, (optional) value_start_col
+#               (auto-detects; fallback = 5 for global/phospho, 3 for rna).
+# - meta      : data.frame joined by (aliquot, cohort). Uses Specimen/Patient/Tumor
+#               if present. For RNA, falls back to normalized Specimen matching.
+# - syn       : Synapse client-like list; this notebook passes
+#               list(get = synapser::synGet, store = synapser::synStore).
+# - drop_name_substrings : character vector; OR-regex to drop sample cols (NULL = keep all).
+# - out_dir   : output dir (auto-created).
+# - out_prefix: filename stem (used unless save_basename set).
+# - upload_parent_id : Synapse folder ID for uploads (NULL = no upload).
+# - pcols     : named colors for Patient in PCA (optional).
+# - write_outputs : TRUE = write CSV/PDF (+upload if parent set); FALSE = in-memory only.
+# - save_basename : override file stem (else out_prefix).
+# - do_batch_correct : TRUE = ComBat by cohort; FALSE = skip (adds *_noBatchCorrect).
+#
+# Per-modality normalization
+# - phospho : 0 -> NA -> drop >50% missing -> log2(x+0.01) -> per-sample modified z.
+# - global  : log2(x) -> per-sample modified z.
+# - rna     : drop >50% missing -> log2(TPM+1) -> per-sample modified z.
+#
+# Returns (list)
+# - se_batches : per-batch normalized SEs.
+# - se_combined: intersection-features combined SE.
+# - se_corrected: post-ComBat SE (NULL if do_batch_correct=FALSE).
+# - se_post   : SE used “post” (se_corrected or se_combined).
+# - did_combat: TRUE/FALSE.
+# - long_pre  : long table from se_combined (finite only).
+# - long_post : long table from se_post.
+# - pca_df_pre, pca_df_post : PCA inputs (complete-case features).
+# - plots     : pre_pca, pre_hist, pca, hist (ggplot).
+# - files     : if write_outputs=TRUE, $queued = written file paths.
+#
+# Notes
+# - ComBat drops samples with NA cohort; requires colData(se)$cohort.
+# - Global: splits multi-symbol Genes by ';'. RNA aliquot may be NA (Specimen fallback).
+# - Sample headers auto-detected if path-like (*.raw, *.mzML, paths).
+# ---------------------------------------------------------------------------
+
+
+
+# Sample-name substrings to exclude: these runs were the protocol-optimization
+# samples. Passed as `drop_name_substrings` to every run_modality() call.
+# NOTE(review): P_02 is dated 29Jan25 while the other five are 11Feb25 --
+# confirm this matches the actual file name and is not a typo.
+drop_subs <- c(
+  # "G" runs (presumably global proteomics -- confirm)
+  "cNF_organoid_DIA_G_02_11Feb25",
+  "cNF_organoid_DIA_G_05_11Feb25",
+  "cNF_organoid_DIA_G_06_11Feb25",
+  # "P" runs (presumably phospho -- confirm)
+  "cNF_organoid_DIA_P_02_29Jan25",
+  "cNF_organoid_DIA_P_05_11Feb25",
+  "cNF_organoid_DIA_P_06_11Feb25"
+)
+
+# ---- Phospho ----------------------------------------------------------------
+# One spec per cohort. Both declare column 5 as the first value column;
+# fname_aliquot_index differs between the cohorts (8 vs 9).
+phospho_batches <- list(
+  list(
+    syn_id              = "syn69963552",
+    cohort              = 1,
+    value_start_col     = 5,
+    fname_aliquot_index = 8
+  ),
+  list(
+    syn_id              = "syn69947351",
+    cohort              = 2,
+    value_start_col     = 5,
+    fname_aliquot_index = 9
+  )
+)
+
+# Normalize and batch-correct the phospho cohorts.
+# write_outputs = FALSE means results are kept in memory only (see the
+# reference notes above), so out_dir / save_basename / upload_parent_id are
+# presumably unused until it is switched to TRUE.
+phospho <- run_modality(
+  modality             = "Phospho",
+  batches              = phospho_batches,
+  meta                 = meta,
+  syn                  = syn,
+  drop_name_substrings = drop_subs,
+  out_dir              = "phospho_test",
+  out_prefix           = "phospho",
+  upload_parent_id     = "syn70078365",
+  pcols                = pcols,
+  write_outputs        = FALSE,
+  save_basename        = "phospho_batch12_corrected",
+  do_batch_correct     = TRUE
+)
+
+# ---- Global -----------------------------------------------------------------
+# One spec per cohort. Both declare column 5 as the first value column;
+# fname_aliquot_index differs between the cohorts (6 vs 7).
+global_batches <- list(
+  list(
+    syn_id              = "syn69947355",
+    cohort              = 1,
+    value_start_col     = 5,
+    fname_aliquot_index = 6
+  ),
+  list(
+    syn_id              = "syn69947352",
+    cohort              = 2,
+    value_start_col     = 5,
+    fname_aliquot_index = 7
+  )
+)
+
+# Normalize and batch-correct the global cohorts.
+# write_outputs = FALSE means results are kept in memory only (see the
+# reference notes above), so out_dir / save_basename / upload_parent_id are
+# presumably unused until it is switched to TRUE.
+global <- run_modality(
+  modality             = "Global",
+  batches              = global_batches,
+  meta                 = meta,
+  syn                  = syn,
+  drop_name_substrings = drop_subs,
+  out_dir              = "global_test",
+  out_prefix           = "global",
+  upload_parent_id     = "syn70078365",
+  pcols                = pcols,
+  write_outputs        = FALSE,
+  save_basename        = "global_batch12_corrected",
+  do_batch_correct     = TRUE
+)
+
+
+# ---- RNA --------------------------------------------------------------------
+# One spec per cohort.
+# NOTE(review): value_start_col = 5 and fname_aliquot_index = 6 / 7 are the
+# same values used for the global batches, whereas the reference notes above
+# give 3 as the RNA fallback start column -- confirm these were chosen for the
+# RNA tables and not carried over from the global specs.
+rna_batches <- list(
+  list(
+    syn_id              = "syn66352931",
+    cohort              = 1,
+    value_start_col     = 5,
+    fname_aliquot_index = 6
+  ),
+  list(
+    syn_id              = "syn70765053",
+    cohort              = 2,
+    value_start_col     = 5,
+    fname_aliquot_index = 7
+  )
+)
+
+# Normalize the RNA cohorts without batch correction.
+# write_outputs = FALSE means results are kept in memory only (see the
+# reference notes above), so out_dir / save_basename / upload_parent_id are
+# presumably unused until it is switched to TRUE.
+# NOTE(review): drop_subs holds DIA run names; presumably it matches nothing
+# in the RNA sample columns -- confirm.
+rna <- run_modality(
+  modality             = "rna",
+  batches              = rna_batches,
+  meta                 = meta,
+  syn                  = syn,
+  drop_name_substrings = drop_subs,
+  out_dir              = "rna_test",
+  out_prefix           = "rna",
+  upload_parent_id     = "syn71099587",
+  pcols                = pcols,
+  write_outputs        = FALSE,
+  save_basename        = "RNA_12_no_batch_correct",
+  # Deliberately FALSE for now: batch correction is not needed for RNA.
+  do_batch_correct     = FALSE
+)
+
+```