Skip to content

Commit adf28e3

Browse files
committed
Add updated Dockerfile which installs Python libraries
1 parent 7e2bc1e commit adf28e3

File tree

4 files changed

+100
-67
lines changed

4 files changed

+100
-67
lines changed

Dockerfile

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ RUN make
5252
RUN make install
5353

5454
# install R packages
55-
RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos = "http://cran.us.r-project.org")'
55+
RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse", "optparse"), repos = "http://cran.us.r-project.org")'
5656

5757
################################################################################
5858
##################### Install SpliceAI #########################################
@@ -61,6 +61,15 @@ RUN pip3 install spliceai
6161
RUN pip3 install --upgrade tensorflow
6262
RUN pip3 install keras==2.4.3
6363

64+
################################################################################
65+
##################### Install other python libraries ###########################
66+
67+
RUN pip3 install dfply
68+
RUN pip3 install pandas
69+
RUN pip3 install numpy
70+
RUN pip3 install scipy
71+
RUN pip3 install argparse
72+
6473
################################################################################
6574
##################### Install Regtools #########################################
6675

@@ -74,6 +83,7 @@ ADD . /regtools
7483
WORKDIR /regtools
7584

7685
# compile from source
86+
RUN ls
7787
RUN mkdir build && cd build && cmake .. && make
7888

7989
################################################################################
@@ -87,5 +97,5 @@ RUN chmod ugo+x *
8797
###################### set environment path #################################
8898

8999
# add regtools executable to path
90-
ENV PATH="/regtools/build:/usr/local/bin:/usr/local/bin/R-${r_version}:${PATH}"
100+
ENV PATH="/regtools/build:/regtools/scripts:/usr/local/bin:/usr/local/bin/R-${r_version}:${PATH}"
91101

scripts/compare_junctions_hist.py

Lines changed: 64 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import csv
2-
from doctest import master
32
from itertools import groupby
43
import pandas as pd
54
from dfply import *
@@ -8,42 +7,47 @@
87
import os
98
import argparse
109

11-
input_parser = argparse.ArgumentParser(
12-
description="Run RegTools stats script",
13-
)
14-
input_parser.add_argument(
15-
'-t',
16-
'--tag',
17-
help="Variant tag parameter used to run RegTools.",
18-
)
19-
input_parser.add_argument(
20-
'-i',
21-
'--variants_file',
22-
help="File containing variants to be considered as splicing relevant."
23-
)
24-
input_parser.add_argument(
25-
'-d',
26-
'--dir_names',
27-
help="File containing directory names corresponding to each sample that is to be processed."
28-
)
29-
input_parser.add_argument(
30-
'-v',
31-
'--variant-grouping',
32-
help="",
33-
choices=['strict', 'exclude', 'include']
34-
)
35-
36-
args = input_parser.parse_args()
37-
38-
tag = args.tag
39-
splicing_variants_inputfile = args.variants_file
40-
samples_inputfile = args.dir_names
41-
variant_grouping_mode = args.variant_grouping
42-
os.chdir('/Users/kcotto/Desktop/CHOL/')
10+
# input_parser = argparse.ArgumentParser(
11+
# description="Run RegTools stats script",
12+
# )
13+
# input_parser.add_argument(
14+
# '-t',
15+
# '--tag',
16+
# help="Variant tag parameter used to run RegTools.",
17+
# )
18+
# input_parser.add_argument(
19+
# '-i',
20+
# '--variants_file',
21+
# help="File containing variants to be considered as splicing relevant."
22+
# )
23+
# input_parser.add_argument(
24+
# '-d',
25+
# '--dir_names',
26+
# help="File containing directory names corresponding to each sample that is to be processed."
27+
# )
28+
# input_parser.add_argument(
29+
# '-v',
30+
# '--variant-grouping',
31+
# help="",
32+
# choices=['strict', 'exclude', 'include']
33+
# )
34+
35+
# args = input_parser.parse_args()
36+
37+
# tag = args.tag
38+
# splicing_variants_inputfile = args.variants_file
39+
# samples_inputfile = args.dir_names
40+
# variant_grouping_mode = args.variant_grouping
41+
42+
tag = 'default'
43+
splicing_variants_inputfile = '/Users/kcotto/Desktop/MET_samples/MET_splicing_variants.bed'
44+
samples_inputfile = '/Users/kcotto/Desktop/MET_samples/samples.txt'
45+
variant_grouping_mode = 'strict'
46+
os.chdir('/Users/kcotto/Desktop/MET_samples/')
4347

4448
# read in all splicing variants
4549
all_splicing_variants = pd.read_csv(
46-
splicing_variants_inputfile, delimiter='\t', header=0)
50+
splicing_variants_inputfile, delimiter='\t', header=None)
4751

4852
# create key to match regtools variant_info column and key2 that is the same as key but with sample name added
4953

@@ -80,7 +84,7 @@ def createkey(row):
8084
# read each sample's output file into a df and subset columns, split variants into multirows,
8185
# and require that variant is in all_splicing_variants
8286
for sample in all_samples:
83-
path = f'samples/{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
87+
path = f'{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
8488
df = f'df_{sample}'
8589
print(f'Reading in {sample}')
8690
df = pd.read_csv(path, delimiter='\t', header=0)
@@ -130,10 +134,24 @@ def createkey(row):
130134
all_splicing_variants['key2'])]
131135
# print(samples_w_variant_df.info(verbose=True))
132136

137+
def add_zeros_variant(row):
138+
norm_scores = row[1]
139+
if norm_scores == 0:
140+
norm_scores = [0]
141+
samples_wout_variant = row[2]
142+
samples_w_variant = row[3]
143+
num_of_zeros_toadd = num_of_samples - samples_wout_variant - samples_w_variant
144+
zeros = np.repeat(0, num_of_zeros_toadd).tolist()
145+
norm_scores = norm_scores + zeros
146+
norm_scores.sort(reverse=True)
147+
new_norm_score_value = (',').join(map(str, norm_scores))
148+
return new_norm_score_value
149+
150+
# tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros_nonvariant(row), axis=1)
151+
133152
# start performing the calculations for this subset of data
134153
print('Calculating normalized scores for samples with variants of interest')
135-
# variant_grouping_mode = 'strict'
136-
if variant_grouping_mode == 'group':
154+
if variant_grouping_mode == 'include':
137155
samples_w_variant_df = (samples_w_variant_df >>
138156
group_by(X.key) >>
139157
summarize(score_tmp=X.score.sum()) >>
@@ -198,7 +216,7 @@ def createkey(row):
198216
all_splicing_variants['key2'])]
199217
del (master_df)
200218

201-
# mode = 'strict' #others include 'exclude' and 'group'
219+
# mode = 'strict' #others include 'include' and 'exclude'
202220
# if mode == 'strict':
203221
samples_wout_variant_df = (samples_wout_variant_df >>
204222
group_by(X.key) >>
@@ -214,7 +232,7 @@ def createkey(row):
214232
samples_wout_variant_df = pd.merge(samples_wout_variant_df, tmp_df, on='info')
215233
samples_wout_variant_df['samples_wout_variant_count'] = samples_wout_variant_df['norm_score_y'].astype(
216234
str).str.count(',') + 1
217-
if variant_grouping_mode == 'group' or variant_grouping_mode == 'exclude':
235+
if variant_grouping_mode == 'include' or variant_grouping_mode == 'exclude':
218236
samples_wout_variant_df = samples_wout_variant_df[~samples_wout_variant_df['junction'].isin(
219237
samples_w_variant_df['junction'])]
220238
tmp_df = samples_wout_variant_df.groupby(
@@ -226,6 +244,7 @@ def createkey(row):
226244
summarize(total_score_non=X.score.sum()) >>
227245
outer_join(samples_wout_variant_df, by='info')
228246
)
247+
print(samples_wout_variant_df.info())
229248
samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
230249
'info', 'genes', 'norm_score_x_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
231250
else:
@@ -252,7 +271,7 @@ def createkey(row):
252271
tmp_df = master_df[['info', 'norm_scores_non', 'samples_wout_variant_count', 'samples_w_variant_count']]
253272
tmp_df = tmp_df.fillna(0)
254273

255-
def add_zeros(row):
274+
def add_zeros_nonvariant(row):
256275
norm_scores = row[1]
257276
if norm_scores == 0:
258277
norm_scores = [0]
@@ -265,7 +284,7 @@ def add_zeros(row):
265284
new_norm_score_value = (',').join(map(str, norm_scores))
266285
return new_norm_score_value
267286

268-
tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros(row), axis=1)
287+
tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros_nonvariant(row), axis=1)
269288
master_df = pd.merge(master_df, tmp_df, how='left' ,on='info')
270289
del(tmp_df)
271290

@@ -285,6 +304,8 @@ def get_sd(row):
285304

286305
master_df['sd_norm_score_non'] = master_df.apply(lambda row: get_sd(row), axis=1)
287306

307+
print('getting p-values for associations')
308+
288309
def get_min(row):
289310
values = row[12]
290311
values = [float(i) for i in values]
@@ -329,7 +350,5 @@ def get_pvalue_min(row):
329350
master_df = master_df.applymap(lambda x: x[0] if isinstance(x, list) else x)
330351
master_df = master_df.fillna(0)
331352

332-
master_df.to_csv(f'junction_pvalues_{tag}_out.tsv', sep='\t', index=False)
333-
print(master_df.info())
334-
# master_df = master_df[['samples', 'variant_info_x', ']]
335-
#why are variant_samples >1 missing?
353+
master_df.to_csv(f'junction_pvalues_{tag}_{variant_grouping_mode}.tsv', sep='\t', index=False)
354+
print(master_df.info())

scripts/compare_junctions_hist_v2.R

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,22 @@ library(tidyverse)
99

1010
debug = F
1111

12-
system.time({
13-
if (debug){
14-
tag = paste("_", "default", sep="")
15-
} else {
16-
# get options tag
17-
args = commandArgs(trailingOnly = TRUE)
18-
tag = args[1]
19-
input_file = args[2]
20-
if ( substr(tag, 2, 3) == "--"){
21-
stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
22-
}
23-
}
24-
25-
# tag = 'E'
26-
# input_file = '/Users/kcotto/Desktop/CHOL/all_splicing_variants_E.bed'
12+
# system.time({
13+
# if (debug){
14+
# tag = paste("_", "default", sep="")
15+
# } else {
16+
# # get options tag
17+
# args = commandArgs(trailingOnly = TRUE)
18+
# tag = args[1]
19+
# input_file = args[2]
20+
# if ( substr(tag, 2, 3) == "--"){
21+
# stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
22+
# }
23+
# }
24+
25+
setwd('~/Desktop/CHOL')
26+
tag = 'E'
27+
input_file = '/Users/kcotto/Desktop/CHOL/all_splicing_variants_E.bed'
2728

2829
# All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names)
2930
all_splicing_variants = unique(data.table::fread(input_file), sep = '\t', header = T, stringsAsFactors = FALSE)
@@ -33,7 +34,7 @@ colnames(all_splicing_variants) <- c("chrom", "start", "end", "samples")
3334
all_splicing_variants$key <- paste0(all_splicing_variants$chrom, ":", all_splicing_variants$start, "-", all_splicing_variants$end) #this key is just a 1bp-long chrom:start-stop designed to match the regtools output variant_info column
3435

3536
## Get all of the samples
36-
all_samples = strsplit(scan("dir_names.tsv", what="", sep="\n"), "[[:space:]]+")
37+
all_samples = strsplit(scan("/Users/kcotto/Desktop/CHOL/dir_names.tsv", what="", sep="\n"), "[[:space:]]+")
3738

3839
################################################################################
3940
##### Helper functions #########################################################
@@ -65,6 +66,7 @@ dt[,info := paste(chrom, start, end, anchor, variant_info, sep="_")]
6566

6667
# make a sample/variant_info key
6768
dt[,key := paste0(variant_info, "_", sample)]
69+
dt[,junction := paste0(chrom, ":", start, "-", end, "_", sample)]
6870

6971
print("zl")
7072
cse_identify_v1 <- dt
@@ -99,12 +101,12 @@ print("test4")
99101
# subset and rename columns to match the original output
100102
cse_identify_v1 <- cse_identify_v1[,c("sample.y", "variant_info", "chrom", "start", "end", "strand", "anchor",
101103
"variant_info", "info", "genes","name.y", "mean_norm_score_variant.y",
102-
"sd_norm_score_variant", "norm_scores_variant", "total_score_variant")]
104+
"sd_norm_score_variant", "norm_scores_variant", "total_score_variant", "junction")]
103105
colnames(cse_identify_v1) <- c("sample", "key", "chrom", "start", "end", "strand", "anchor", "variant_info",
104106
"info", "genes", "names", "mean_norm_score_variant", "sd_norm_score_variant",
105-
"norm_scores_variant", "total_score_variant")
107+
"norm_scores_variant", "total_score_variant", "junction_key")
106108

107-
################ aggrregate variants with no sample ############################
109+
################ aggregate variants with no sample ############################
108110

109111
# second, we just want entries where the variant is not in the sample we care about
110112
cse_identify_v2 <- cse_identify_v2[!key %chin% all_splicing_variants$key2]
@@ -129,6 +131,7 @@ a <- function(x){
129131
}
130132
cse_identify_v2 <- split(cse_identify_v2, cse_identify_v2$variant_info)
131133
cse_identify_v2 <- lapply(cse_identify_v2, a)
134+
#this is where non-variant samples are aggregated
132135
cse_identify_v2 <- rbindlist(cse_identify_v2)
133136

134137
print("test6")
@@ -309,4 +312,4 @@ regtools_data = regtools_data %>% distinct()
309312

310313
write.table(regtools_data, file=paste(input_file, "_out.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
311314

312-
})
315+
# })

scripts/filter_and_BH.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# filter_and_BH.R
22
library(data.table)
33
library(stats)
4+
library()
45

56
debug = F
67

0 commit comments

Comments
 (0)