liulab-dfci
diff --git a/‎RIMA.snakefile‎
Lines changed: 125 additions & 0 deletions b/‎RIMA.snakefile‎
Lines changed: 125 additions & 0 deletions
diff --git a/‎config.yaml‎
Lines changed: 53 additions & 92 deletions b/‎config.yaml‎
Lines changed: 53 additions & 92 deletions
diff --git a/‎execution.yaml‎
Lines changed: 18 additions & 21 deletions b/‎execution.yaml‎
Lines changed: 18 additions & 21 deletions
diff --git a/‎metasheet.txt‎
Lines changed: 0 additions & 8 deletions b/‎metasheet.txt‎
Lines changed: 0 additions & 8 deletions
diff --git a/‎modules/.DS_Store‎
10 KB b/‎modules/.DS_Store‎
10 KB
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+#----------------main snakefile to run RIMA-------------------------#
+import os
+import sys
+import subprocess
+import pandas as pd
+import yaml
+
+from string import Template
+#from snakemake_files.scripts.utils import getTargetInfo
+
+def getRuns(config):
+    ret = {}
+    #LEN: Weird, but using pandas to handle the comments in the file
+    #KEY: need skipinitialspace to make it fault tolerant to spaces!
+    metadata = pd.read_csv(config['metasheet'], index_col=0, sep=',', comment='#', skipinitialspace=True)
+    f = metadata.to_csv().split() #make it resemble an actual file with lines
+    #SKIP the hdr
+    for l in f[1:]:
+        tmp = l.strip().split(",")
+        #print(tmp)
+        ret[tmp[0]] = tmp[1:]
+        #print(ret)
+        config['runs'] = ret
+    return config
+
+
+def addCondaPaths_Config(config):
+    """ADDS the python2 paths to config"""
+    conda_root = subprocess.check_output('conda info --root',shell=True).decode('utf-8').strip()
+    conda_path = os.path.join(conda_root, 'pkgs')
+    current_path = os.getcwd()
+    config['conda_root'] = conda_root
+    config['stat_root'] = "%s/envs/stat_perl_r" % conda_root
+    config['centrifuge_root']= "/%s/envs/centrifuge_env" % conda_root
+    config['rseqc_root']= "/%s/envs/rseqc_env" % conda_root
+
+
+def addExecPaths(config):
+    conda_root = subprocess.check_output('conda info --root',shell=True).decode('utf-8').strip()
+    conda_path = os.path.join(conda_root, 'pkgs')
+    current_path = os.getcwd()
+    #NEED the following when invoking python2 (to set proper PYTHONPATH)
+    if not "trust4_path" in config or not config["trust4_path"]:
+        config["trust4_path"] = os.path.join(current_path,'TRUST4')
+    return config
+
+def loadRef(config):
+    f = open(config['ref'])
+    ref_info = yaml.safe_load(f)
+    f.close()
+    #print(ref_info[config['assembly']])
+    for (k,v) in ref_info[config['assembly']].items():
+    #NO CLOBBERING what is user-defined!
+        if k not in config:
+            config[k] = v
+
+def load_config(config_file):
+    #load the main config file including parameters which are not change a lot
+    with open(config_file, 'r') as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            print(exc)
+
+def load_execution(execution_file):
+    #load the main config file including parameters which are not change a lot
+    with open(execution_file, 'r') as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            print(exc)
+
+#---------  CONFIG set up  ---------------
+# Both files are opened by relative name, i.e. from the current working directory.
+config = load_config('config.yaml')
+# 'execution' holds the per-module switches tested by all_targets and the includes below.
+execution = load_execution('execution.yaml')
+# Store the conda root and the per-tool environment directories in config.
+addCondaPaths_Config(config)
+# Copy the reference entries for config['assembly'] into config, keeping keys already set.
+loadRef(config)
+# Default config['trust4_path'] to <cwd>/TRUST4 when it is missing or empty.
+addExecPaths(config)
+
+
+
+
+#------------------------------------------------------------------------------
+# TARGETS
+#------------------------------------------------------------------------------
+def all_targets(wildcards):
+    ls = []
+    if execution["preprocess_individual"]:
+        ls.extend(preprocess_individual_targets(wildcards))
+    if execution["preprocess_cohort"]:
+        ls.extend(preprocess_cohort_targets(wildcards))
+    if execution["differential_expression_cohort"]:
+        ls.extend(diffexpr_targets(wildcards))
+    if execution["immune_infiltration_cohort"]:
+        ls.extend(immune_infiltration_targets(wildcards))
+    if execution["immune_repertoire_individual"]:
+        ls.extend(immune_repertoire_individual_targets(wildcards))
+    if execution["microbiome_individual"]:
+        ls.extend(microbiome_individual_targets(wildcards))
+    if execution["microbiome_cohort"]:
+        ls.extend(microbiome_cohort_targets(wildcards))    
+    return ls
+
+
+# Aggregate rule: its input is computed by all_targets, i.e. the targets of
+# every module enabled in execution.yaml. It is the first rule declared in
+# this file, ahead of the includes below.
+rule target:
+    input:
+        all_targets
+    message: "Compiling all outputs"
+
+
+# Include only the module snakefiles switched on in execution.yaml. The keys
+# tested here are the same ones tested in all_targets.
+# NOTE(review): each *_targets helper used by all_targets is presumably
+# defined in the matching module file -- confirm under ./modules/.
+if execution["preprocess_individual"]:
+    include: "./modules/preprocess/preprocess_individual.snakefile"
+if execution["preprocess_cohort"]:
+    include: "./modules/preprocess/preprocess_cohort.snakefile"
+if execution["differential_expression_cohort"]:
+    include: "./modules/differential_expression/differential_expression_cohort.snakefile"
+if execution["immune_infiltration_cohort"]:
+    include: "./modules/immune_infiltration/immune_infiltration_cohort.snakefile"
+if execution["immune_repertoire_individual"]:
+    include: "./modules/immune_repertoire/immune_repertoire_individual.snakefile"
+if execution["microbiome_individual"]:
+    include: "./modules/microbiome/microbiome_individual.snakefile"
+if execution["microbiome_cohort"]:
+    include: "./modules/microbiome/microbiome_cohort.snakefile"
@@ -1,100 +1,61 @@
----
-############################################################
-#                   Data information                       #
-############################################################
-
-ref: ref.yaml
-assembly: mm10 #hg38 or mm10
-
-cancer_type: AUTO #short name of cancer type
-metasheet: metasheet.txt
-designs: [Condition] #the column from metasheet which is used to do comparsion
-
-############################################################
-#                     Data Processing                      #
-############################################################
-
-### star 
-#Possible values are [ff-firststrand, ff-secondstrand, ff-unstranded, fr-firststrand, fr-secondstrand, fr-unstranded (default), transfrags]
-library_type: 'fr-firststrand'
-stranded: true
-threads: 8
-
-### rseqc parameters
-rseqc_ref: house_keeping  #rseqc ref model
-
-############################################################
-#             Differential Gene Expression                 #
-############################################################
-
-
-### batch_removal and PCA analysis
-#If you don't need to do batch removal, just setting [no] 
-batch_covariates: [no]
-
-neoantigen_callers: "NetMHC"
-neoantigen_epitope_lengths: "8,9,10,11"
-
-### deseq2
-batch: [no] #[Clinical Phenotype] or [no], Clinical phenotype could be the column from metasheet which accounts for batch effect
-comparison: between ##between or loop. between: compare any two phenotypes in a given condition column;loop: compare one phenotype and all the others 
-
-############################################################
-#                       level3                             #
-############################################################
-
-### microbiota
-centrifuge: true  #run centrifuge or pathseq
-
-### Trust4
-trust4_clinical_phenotype: [Condition]
-
-### arcasHLA
-##specifying group imformation annotated in HLA oncoplot
-hla_annot_group: [Responder, Gender]
+#########Fixed and user-defined parameters################
+metasheet: metasheet.csv  # Meta info 
+ref: ref.yaml             # Reference config 
+assembly: mm10
+cancer_type: GBM          #TCGA cancer type abbreviations
+rseqc_ref: house_keeping  #Option: 'house_keeping' or 'false'. 
+                          #By default, a subset of housekeeping genes is used by RSeQC to assess alignment quality.  
+                          #This reduces the amount of time needed to run RSeQC.  
+mate: [1,2]               #paired-end fastq format, we recommend naming paired-end reads with _1.fq.gz and _2.fq.gz
+
+
+#########Cohort level analysis parameters################
+design: Responder             # Condition on which to do comparison (as set up in metasheet.csv)
+Treatment: R              # Treatment group used in DESeq2, corresponding to positive log fold change
+Control: NR               # Control group used in DESeq2, corresponding to negative log fold change
+batch: syn_batch          # Options: 'false' or a column name from the metasheet.csv.  
+                          # If set to a column name in the metasheet.csv, that column will be used for batch effect analysis (limma).
+                          # It will also be used as a covariate for differential analysis (DESeq2) to account for batch effect.  
 
 ############################################################
 #                     list samples                         #
 ############################################################
+
 samples:
-  SRR8281228:
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281228_1.fastq.gz
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281228_2.fastq.gz
-  SRR8281222:
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281222_1.fastq.gz
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281222_2.fastq.gz
-  SRR8281224:
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281224_1.fastq.gz
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281224_2.fastq.gz
-  SRR8281225:
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281225_1.fastq.gz
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281225_2.fastq.gz
   SRR8281218:
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281218_1.fastq.gz
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281218_2.fastq.gz
+    - /mnt/RIMA/data/SRR8281218_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281218_2.fastq.gz
   SRR8281219:
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281219_1.fastq.gz
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281219_2.fastq.gz
-  SRR8281220:
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281220_1.fastq.gz
-    - /liulab/yang/Immunotherapy/toDIGEST/rawRNASeq/fastq/Zhao2019_PD1_Glioblastoma_RNASeq/SRR8281220_2.fastq.gz
-
-###########################################################
-#                      run settings                       #
-###########################################################
-runs:
-  run1:
-    - SRR8281228
-  run3:
-    - SRR8281222
-  run4:
-    - SRR8281224
-  run5:
-    - SRR8281225
-  run6:
-    - SRR8281218
-  run7:
-    - SRR8281219
-  run8:
-    - SRR8281220
+    - /mnt/RIMA/data/SRR8281219_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281219_2.fastq.gz
+  SRR8281226:
+    - /mnt/RIMA/data/SRR8281226_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281226_2.fastq.gz
+  SRR8281236:
+    - /mnt/RIMA/data/SRR8281236_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281236_2.fastq.gz
+  SRR8281230:
+    - /mnt/RIMA/data/SRR8281230_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281230_2.fastq.gz
+  SRR8281233:
+    - /mnt/RIMA/data/SRR8281233_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281233_2.fastq.gz
+  SRR8281244:
+    - /mnt/RIMA/data/SRR8281244_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281244_2.fastq.gz
+  SRR8281245:
+    - /mnt/RIMA/data/SRR8281245_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281245_2.fastq.gz
+  SRR8281243:
+    - /mnt/RIMA/data/SRR8281243_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281243_2.fastq.gz
+  SRR8281251:
+    - /mnt/RIMA/data/SRR8281251_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281251_2.fastq.gz
+  SRR8281238:
+    - /mnt/RIMA/data/SRR8281238_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281238_2.fastq.gz
+  SRR8281250:
+    - /mnt/RIMA/data/SRR8281250_1.fastq.gz
+    - /mnt/RIMA/data/SRR8281250_2.fastq.gz
 
@@ -1,23 +1,20 @@
-##level1
-star: true
-salmon: true
-rseqc: true
-##level2
-batch_removal: false
-deseq2: false
-gsea: false
-ssgsea: false
-##level3
-microbiota: false
-trust4: false
-arcasHLA: false
-pvacseq: false
-##level4
-immunedeconv: false
-mMCP: false
-##level5
-msisensor2: false
-##Report
-report: false
 
+##Note: The preprocess individual and cohort modules are required to produce the alignment and quality results.
+##Run the remaining modules only after these two modules have completed.
+preprocess_individual: true
+preprocess_cohort: true
+
+##Optional modules
+##Note: The below modules are specialized modules, each dealing with specific targets.
+##Make sure to run individual and cohort of each module to get all the results.
+
+##Individual runs
+immune_repertoire_individual: false
+microbiome_individual: false
+
+
+##Cohort runs
+differential_expression_cohort: true
+immune_infiltration_cohort: false
+microbiome_cohort: false