Merge pull request #130 from griffithlab/improved_compare_junctions

kcotto · web-flow · commit be2d7d31b51d · 2020-01-07T13:50:45.000-06:00
Improved compare junctions
diff --git a/Dockerfile b/Dockerfile
@@ -1,3 +1,10 @@
+################################################################################
+##################### Add Container Labels #####################################
+LABEL "Regtools_License"="MIT"
+LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
+                     to help interpret mutations in a regulatory and splicing\
+                     context."
+
 ################################################################################
 ##################### Set Inital Image to work from ############################
 
@@ -26,13 +33,23 @@ RUN apt-get update -y && apt-get install -y \
   build-essential \
   cmake \
   python3
-
+  
 ################################################################################
-##################### Add Container Labels #####################################
-LABEL "Regtools_License"="MIT"
-LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
-                     to help interpret mutations in a regulatory and splicing\
-                     context."
+####################### Install R ##############################################
+
+# change working dir
+WORKDIR /usr/local/bin
+
+# install R
+RUN wget https://cran.r-project.org/src/base/R-3/R-${r_version}.tar.gz
+RUN tar -zxvf R-${r_version}.tar.gz
+WORKDIR /usr/local/bin/R-${r_version}
+RUN ./configure --prefix=/usr/local/ --with-x=no
+RUN make
+RUN make install
+
+# install R packages
+RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos = "http://cran.us.r-project.org")'
 
 ################################################################################
 ##################### Install Regtools #########################################
@@ -53,20 +70,3 @@ RUN cd /regtools/build && make
 
 # add regtools executable to path
 ENV PATH="/regtools/build:${PATH}"
-
-################################################################################
-####################### Install R ##############################################
-
-# change working dir
-WORKDIR /usr/local/bin
-
-# install R
-RUN wget https://cran.r-project.org/src/base/R-3/R-${r_version}.tar.gz
-RUN tar -zxvf R-${r_version}.tar.gz
-WORKDIR /usr/local/bin/R-${r_version}
-RUN ./configure --prefix=/usr/local/ --with-x=no
-RUN make
-RUN make install
-
-# install R packages
-RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos = "http://cran.us.r-project.org")'
diff --git a/scripts/compare_junctions_hist_v2.R b/scripts/compare_junctions_hist_v2.R
@@ -151,14 +151,43 @@ regtools_data = subset(regtools_data, select=columns_to_keep)
 
 # zeroes need to be added in for some samples
 a <- function(x, y, z){
-  toAdd <- y - length(x) - str_count(z, ',') - 1
+  toAdd <- y - length(x) - 1  # pad x with zeros up to length y - 1; z is no longer used in this function
   # browser()
   toAdd <- rep(0.0000000, toAdd)
   x <- c(x, toAdd)
   return(x)
 }
 x <- mapply(a, regtools_data$norm_scores_non, length(all_samples), regtools_data$samples)
+
+
+# if (typeof(x) == 'list') {
+#   x <- matrix(pad(unlist(x), ncols),nrow = rows, byrow = TRUE, ncol = cols)
+#   x <- t(x)
+#   }
+# browser()
+
+get_num_zeros_to_rm <- function(z){
+  # Result is the number of ',' characters found in z (via str_count).
+  str_count(z, ',')
+}
+
+num_zeroes_to_rm <- mapply(get_num_zeros_to_rm, regtools_data$samples)
+
+x = split(x, rep(1:ncol(x), each = nrow(x)))
 regtools_data$norm_scores_non = x
+regtools_data$zeroes_to_rm = num_zeroes_to_rm
+
+rm_zeroes <- function(x,y){  # sort x in decreasing order and drop its last y entries
+  new_length <- max(length(x) - y, 0)  # clamp: never ask for a negative number of elements
+  x <- sort(x,decreasing = TRUE)
+  x <- x[seq_len(new_length)]  # seq_len(0) selects nothing, whereas 1:0 is c(1, 0) and would keep one element
+  return(x)
+}
+
+if (any(num_zeroes_to_rm > 0)) {  # run only when at least one samples string contains a comma; FALSE on empty input
+x <- mapply(rm_zeroes, regtools_data$norm_scores_non, regtools_data$zeroes_to_rm, SIMPLIFY = FALSE)  # SIMPLIFY = FALSE: always a list with one vector per row, never a matrix
+regtools_data$norm_scores_non = x
+}
 print("test7")
 
 ################ calculate p-values ############################################
@@ -187,8 +216,8 @@ a <- function(x){
 }
 
 regtools_data$p_value <- apply(regtools_data, 1, a)
-print("test8")
-
+print("Number of rows in data.table")
+print(length(regtools_data$samples))
 
 paste_commas <- function(v){
    return(paste(v,collapse = ","))
@@ -203,7 +232,7 @@ regtools_data = subset(regtools_data, select=columns_to_keep)
 colnames(regtools_data) <- c('variant_samples', 'variant_info', 'genes', 'junction_samples', "chrom", "start", "end", 'strand', 'anchor', 'variant_junction_info',
                              'names', 'mean_norm_score_variant', 'sd_norm_score_variant', 'norm_scores_variant',
                              'total_score_variant', 'mean_norm_score_non', 'sd_norm_score_non', 'norm_scores_non',
-                             'total_score_non', 'p_value')
+                             'total_score_non', 'pvalue')
 regtools_data$sd_norm_score_variant[is.na(regtools_data$sd_norm_score_variant)] = 0
 regtools_data$mean_norm_score_non[is.na(regtools_data$mean_norm_score_non)] = 0
 regtools_data$sd_norm_score_non[is.na(regtools_data$sd_norm_score_non)] = 0
@@ -213,6 +242,6 @@ all_splicing_variants <- as.data.table(all_splicing_variants)
 regtools_data = regtools_data %>% distinct()
 
 
-write.table(regtools_data, file=paste(input_file, "_out_test.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
+write.table(regtools_data, file=paste(input_file, "_out.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
 
 })
diff --git a/scripts/stats_wrapper.py b/scripts/stats_wrapper.py
@@ -17,36 +17,41 @@
 tag = args.tag
 cwd = os.getcwd()
 
-lines_per_file = 1000
+lines_per_file = 25000
 smallfile = None
-num_small_file = 0
 with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
-    num_small_file +=1
+    header = bigfile.readline()
     for lineno, line in enumerate(bigfile):
         if lineno % lines_per_file == 0:
             if smallfile:
                 smallfile.close()
             small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
             smallfile = open(small_filename, "w")
+            smallfile.write(header)
         smallfile.write(line)
     if smallfile:
-        num_small_file += 1
         smallfile.close()
 #get chunks
 files = glob.glob('small_file_*')
 files.sort()
+number_of_in_files = len(files)
 for file in files:
-    subprocess.run(f'Rscript --vanilla ~/Git/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=False)
+    subprocess.run(['Rscript', '--vanilla', '/home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R', str(tag), file], check=True)  # argv list, no shell: tag and file reach Rscript verbatim; check=True raises on a non-zero exit
 output_files = glob.glob("*_out.tsv")
-output_files.sort()  # glob lacks reliable ordering, so impose your own if output order matters
-with open(f'junction_pvalues_{tag}.tsv', 'wb') as outfile:
-    for i, fname in enumerate(output_files):
-        with open(fname, 'rb') as infile:
-            if i != 0:
-                infile.readline()  # Throw away header on all but first file
-            # Block copy rest of file from input to output without parsing
-            shutil.copyfileobj(infile, outfile)
-            print(fname + " has been imported.")
+output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
+number_of_out_files = len(output_files)
+if number_of_in_files == number_of_out_files:
+    with open(f'compare_junctions/hist/junction_pvalues_{tag}.tsv', 'wb') as outfile:
+        for i, fname in enumerate(output_files):
+            with open(fname, 'rb') as infile:
+                if i != 0:
+                    infile.readline()  # Throw away header on all but first file
+                # Block copy rest of file from input to output without parsing
+                shutil.copyfileobj(infile, outfile)
+                print(fname + " has been imported.")
+else:
+    print("Number of output files doesn't match the number of input files that should have been processed")
+files = glob.glob('small_file_*')
 for file in files:
-    os.remove(file)
+     os.remove(file)
 
diff --git a/scripts/vep_aws_workflow.py b/scripts/vep_aws_workflow.py