import csv
- from doctest import master
from itertools import groupby
import pandas as pd
from dfply import *
import os
import argparse

- input_parser = argparse.ArgumentParser(
-     description="Run RegTools stats script",
- )
- input_parser.add_argument(
-     '-t',
-     '--tag',
-     help="Variant tag parameter used to run RegTools.",
- )
- input_parser.add_argument(
-     '-i',
-     '--variants_file',
-     help="File containing variants to be considered as splicing relevant."
- )
- input_parser.add_argument(
-     '-d',
-     '--dir_names',
-     help="File containing directory names corresponding to each sample that is to be processed."
- )
- input_parser.add_argument(
-     '-v',
-     '--variant-grouping',
-     help="",
-     choices=['strict', 'exclude', 'include']
- )
-
- args = input_parser.parse_args()
-
- tag = args.tag
- splicing_variants_inputfile = args.variants_file
- samples_inputfile = args.dir_names
- variant_grouping_mode = args.variant_grouping
- os.chdir('/Users/kcotto/Desktop/CHOL/')
+ # input_parser = argparse.ArgumentParser(
+ #     description="Run RegTools stats script",
+ # )
+ # input_parser.add_argument(
+ #     '-t',
+ #     '--tag',
+ #     help="Variant tag parameter used to run RegTools.",
+ # )
+ # input_parser.add_argument(
+ #     '-i',
+ #     '--variants_file',
+ #     help="File containing variants to be considered as splicing relevant."
+ # )
+ # input_parser.add_argument(
+ #     '-d',
+ #     '--dir_names',
+ #     help="File containing directory names corresponding to each sample that is to be processed."
+ # )
+ # input_parser.add_argument(
+ #     '-v',
+ #     '--variant-grouping',
+ #     help="",
+ #     choices=['strict', 'exclude', 'include']
+ # )
+
+ # args = input_parser.parse_args()
+
+ # tag = args.tag
+ # splicing_variants_inputfile = args.variants_file
+ # samples_inputfile = args.dir_names
+ # variant_grouping_mode = args.variant_grouping
+
+ tag = 'default'
+ splicing_variants_inputfile = '/Users/kcotto/Desktop/MET_samples/MET_splicing_variants.bed'
+ samples_inputfile = '/Users/kcotto/Desktop/MET_samples/samples.txt'
+ variant_grouping_mode = 'strict'
+ os.chdir('/Users/kcotto/Desktop/MET_samples/')
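+ # Hedged sketch of the intended CLI once the argparse block above is restored
+ # (the script filename here is a placeholder, not from the source):
+ #   python regtools_stats.py -t default -i MET_splicing_variants.bed -d samples.txt -v strict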

# read in all splicing variants
all_splicing_variants = pd.read_csv(
-     splicing_variants_inputfile, delimiter='\t', header=0)
+     splicing_variants_inputfile, delimiter='\t', header=None)
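+ # header=None treats the first line of the BED file as data rather than column
+ # names, so downstream code addresses its columns by integer position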

# create key to match the regtools variant_info column, and key2, which is the same key with the sample name added

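# illustrative-only sketch of the key scheme described above (createkey's real
# body lies outside this hunk; these column names are assumptions):
#   key  = f"{row['chrom']}:{row['start']}-{row['end']}"
#   key2 = f"{key}_{row['sample']}"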
@@ -80,7 +84,7 @@ def createkey(row):
# read each sample's output file into a df and subset columns, split variants into multirows,
# and require that the variant is in all_splicing_variants
for sample in all_samples:
-     path = f'samples/{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
+     path = f'{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
    df = f'df_{sample}'
    print(f'Reading in {sample}')
    df = pd.read_csv(path, delimiter='\t', header=0)
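    # with the os.chdir above, each sample's output folder now sits directly
    # under the working directory, hence the dropped 'samples/' prefix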
@@ -130,10 +134,24 @@ def createkey(row):
    all_splicing_variants['key2'])]
# print(samples_w_variant_df.info(verbose=True))

+ def add_zeros_variant(row):
+     norm_scores = row[1]
+     if norm_scores == 0:
+         norm_scores = [0]
+     samples_wout_variant = row[2]
+     samples_w_variant = row[3]
+     num_of_zeros_toadd = num_of_samples - samples_wout_variant - samples_w_variant
+     zeros = np.repeat(0, num_of_zeros_toadd).tolist()
+     norm_scores = norm_scores + zeros
+     norm_scores.sort(reverse=True)
+     new_norm_score_value = (',').join(map(str, norm_scores))
+     return new_norm_score_value
+
+ # tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros_variant(row), axis=1)
+
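+ # Worked example with assumed values: given num_of_samples = 5, norm_scores
+ # [2.0, 1.5], 1 sample without and 2 with the variant, 5 - 1 - 2 = 2 zeros are
+ # appended and the sorted join returns '2.0,1.5,0,0'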
# start performing the calculations for this subset of data
print('Calculating normalized scores for samples with variants of interest')
- # variant_grouping_mode = 'strict'
- if variant_grouping_mode == 'group':
+ if variant_grouping_mode == 'include':
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by(X.key) >>
                            summarize(score_tmp=X.score.sum()) >>
@@ -198,7 +216,7 @@ def createkey(row):
    all_splicing_variants['key2'])]
del(master_df)

- # mode = 'strict'  # others include 'exclude' and 'group'
+ # mode = 'strict'  # others include 'include' and 'exclude'
# if mode == 'strict':
samples_wout_variant_df = (samples_wout_variant_df >>
                           group_by(X.key) >>
@@ -214,7 +232,7 @@ def createkey(row):
samples_wout_variant_df = pd.merge(samples_wout_variant_df, tmp_df, on='info')
samples_wout_variant_df['samples_wout_variant_count'] = samples_wout_variant_df['norm_score_y'].astype(
    str).str.count(',') + 1
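# a string of N comma-separated scores contains N - 1 commas, so counting
# commas and adding 1 recovers the sample count, e.g. '1.0,0.5,0.2' -> 3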
- if variant_grouping_mode == 'group' or variant_grouping_mode == 'exclude':
+ if variant_grouping_mode == 'include' or variant_grouping_mode == 'exclude':
    samples_wout_variant_df = samples_wout_variant_df[~samples_wout_variant_df['junction'].isin(
        samples_w_variant_df['junction'])]
    tmp_df = samples_wout_variant_df.groupby(
@@ -226,6 +244,7 @@ def createkey(row):
                                summarize(total_score_non=X.score.sum()) >>
                                outer_join(samples_wout_variant_df, by='info')
                                )
+     print(samples_wout_variant_df.info())
    samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                       'info', 'genes', 'norm_score_x_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
else:
@@ -252,7 +271,7 @@ def createkey(row):
    tmp_df = master_df[['info', 'norm_scores_non', 'samples_wout_variant_count', 'samples_w_variant_count']]
    tmp_df = tmp_df.fillna(0)

-     def add_zeros(row):
+     def add_zeros_nonvariant(row):
        norm_scores = row[1]
        if norm_scores == 0:
            norm_scores = [0]
@@ -265,7 +284,7 @@ def add_zeros(row):
        new_norm_score_value = (',').join(map(str, norm_scores))
        return new_norm_score_value

-     tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros(row), axis=1)
+     tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros_nonvariant(row), axis=1)
    master_df = pd.merge(master_df, tmp_df, how='left', on='info')
    del(tmp_df)

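    # the fillna(0) above is what makes the norm_scores == 0 check inside
    # add_zeros_nonvariant meaningful: rows with no recorded scores arrive as 0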
@@ -285,6 +304,8 @@ def get_sd(row):

master_df['sd_norm_score_non'] = master_df.apply(lambda row: get_sd(row), axis=1)

+ print('getting p-values for associations')
+
def get_min(row):
    values = row[12]
    values = [float(i) for i in values]
@@ -329,7 +350,5 @@ def get_pvalue_min(row):
master_df = master_df.applymap(lambda x: x[0] if isinstance(x, list) else x)
master_df = master_df.fillna(0)

- master_df.to_csv(f'junction_pvalues_{tag}_out.tsv', sep='\t', index=False)
- print(master_df.info())
- # master_df = master_df[['samples', 'variant_info_x', ']]
- # why are variant_samples > 1 missing?
+ master_df.to_csv(f'junction_pvalues_{tag}_{variant_grouping_mode}.tsv', sep='\t', index=False)
+ print(master_df.info())
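+ # embedding both the tag and the grouping mode in the output name keeps runs
+ # with different --variant-grouping settings from overwriting one another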