Skip to content

Commit be2d7d3

Browse files
authored
Merge pull request #130 from griffithlab/improved_compare_junctions
Improved compare junctions
2 parents 17bb43d + 1e1e27c commit be2d7d3

File tree

4 files changed

+77
-43
lines changed

4 files changed

+77
-43
lines changed

Dockerfile

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
################################################################################
2+
##################### Add Container Labels #####################################
3+
LABEL "Regtools_License"="MIT"
4+
LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
5+
to help interpret mutations in a regulatory and splicing\
6+
context."
7+
18
################################################################################
29
##################### Set Inital Image to work from ############################
310

@@ -26,13 +33,23 @@ RUN apt-get update -y && apt-get install -y \
2633
build-essential \
2734
cmake \
2835
python3
29-
36+
3037
################################################################################
31-
##################### Add Container Labels #####################################
32-
LABEL "Regtools_License"="MIT"
33-
LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
34-
to help interpret mutations in a regulatory and splicing\
35-
context."
38+
####################### Install R ##############################################
39+
40+
# change working dir
41+
WORKDIR /usr/local/bin
42+
43+
# install R
44+
RUN wget https://cran.r-project.org/src/base/R-3/R-${r_version}.tar.gz
45+
RUN tar -zxvf R-${r_version}.tar.gz
46+
WORKDIR /usr/local/bin/R-${r_version}
47+
RUN ./configure --prefix=/usr/local/ --with-x=no
48+
RUN make
49+
RUN make install
50+
51+
# install R packages
52+
RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos = "http://cran.us.r-project.org")'
3653

3754
################################################################################
3855
##################### Install Regtools #########################################
@@ -53,20 +70,3 @@ RUN cd /regtools/build && make
5370

5471
# add regtools executable to path
5572
ENV PATH="/regtools/build:${PATH}"
56-
57-
################################################################################
58-
####################### Install R ##############################################
59-
60-
# change working dir
61-
WORKDIR /usr/local/bin
62-
63-
# install R
64-
RUN wget https://cran.r-project.org/src/base/R-3/R-${r_version}.tar.gz
65-
RUN tar -zxvf R-${r_version}.tar.gz
66-
WORKDIR /usr/local/bin/R-${r_version}
67-
RUN ./configure --prefix=/usr/local/ --with-x=no
68-
RUN make
69-
RUN make install
70-
71-
# install R packages
72-
RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos = "http://cran.us.r-project.org")'

scripts/compare_junctions_hist_v2.R

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -151,14 +151,43 @@ regtools_data = subset(regtools_data, select=columns_to_keep)
151151

152152
# zeroes need to be added in for some samples
153153
a <- function(x, y, z){
154-
toAdd <- y - length(x) - str_count(z, ',') - 1
154+
toAdd <- y - length(x) - 1
155155
# browser()
156156
toAdd <- rep(0.0000000, toAdd)
157157
x <- c(x, toAdd)
158158
return(x)
159159
}
160160
x <- mapply(a, regtools_data$norm_scores_non, length(all_samples), regtools_data$samples)
161+
162+
163+
# if (typeof(x) == 'list') {
164+
# x <- matrix(pad(unlist(x), ncols),nrow = rows, byrow = TRUE, ncol = cols)
165+
# x <- t(x)
166+
# }
167+
# browser()
168+
169+
get_num_zeros_to_rm <- function(z){
170+
num_zeroes_to_rm = str_count(z, ',')
171+
return(num_zeroes_to_rm)
172+
}
173+
174+
num_zeroes_to_rm <- mapply(get_num_zeros_to_rm, regtools_data$samples)
175+
176+
x = split(x, rep(1:ncol(x), each = nrow(x)))
161177
regtools_data$norm_scores_non = x
178+
regtools_data$zeroes_to_rm = num_zeroes_to_rm
179+
180+
rm_zeroes <- function(x,y){
181+
new_length <- length(x) - y
182+
x <- sort(x,decreasing = TRUE)
183+
x <- x[1:new_length]
184+
return(x)
185+
}
186+
187+
if (max(num_zeroes_to_rm > 0)) {
188+
x <- mapply(rm_zeroes, regtools_data$norm_scores_non, regtools_data$zeroes_to_rm)
189+
regtools_data$norm_scores_non = x
190+
}
162191
print("test7")
163192

164193
################ calculate p-values ############################################
@@ -187,8 +216,8 @@ a <- function(x){
187216
}
188217

189218
regtools_data$p_value <- apply(regtools_data, 1, a)
190-
print("test8")
191-
219+
print("Number of rows in data.table")
220+
print(length(regtools_data$samples))
192221

193222
paste_commas <- function(v){
194223
return(paste(v,collapse = ","))
@@ -203,7 +232,7 @@ regtools_data = subset(regtools_data, select=columns_to_keep)
203232
colnames(regtools_data) <- c('variant_samples', 'variant_info', 'genes', 'junction_samples', "chrom", "start", "end", 'strand', 'anchor', 'variant_junction_info',
204233
'names', 'mean_norm_score_variant', 'sd_norm_score_variant', 'norm_scores_variant',
205234
'total_score_variant', 'mean_norm_score_non', 'sd_norm_score_non', 'norm_scores_non',
206-
'total_score_non', 'p_value')
235+
'total_score_non', 'pvalue')
207236
regtools_data$sd_norm_score_variant[is.na(regtools_data$sd_norm_score_variant)] = 0
208237
regtools_data$mean_norm_score_non[is.na(regtools_data$mean_norm_score_non)] = 0
209238
regtools_data$sd_norm_score_non[is.na(regtools_data$sd_norm_score_non)] = 0
@@ -213,6 +242,6 @@ all_splicing_variants <- as.data.table(all_splicing_variants)
213242
regtools_data = regtools_data %>% distinct()
214243

215244

216-
write.table(regtools_data, file=paste(input_file, "_out_test.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
245+
write.table(regtools_data, file=paste(input_file, "_out.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
217246

218247
})

scripts/stats_wrapper.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,36 +17,41 @@
1717
tag = args.tag
1818
cwd = os.getcwd()
1919

20-
lines_per_file = 1000
20+
lines_per_file = 25000
2121
smallfile = None
22-
num_small_file = 0
2322
with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
24-
num_small_file +=1
23+
header = bigfile.readline()
2524
for lineno, line in enumerate(bigfile):
2625
if lineno % lines_per_file == 0:
2726
if smallfile:
2827
smallfile.close()
2928
small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
3029
smallfile = open(small_filename, "w")
30+
smallfile.write(header)
3131
smallfile.write(line)
3232
if smallfile:
33-
num_small_file += 1
3433
smallfile.close()
3534
#get chunks
3635
files = glob.glob('small_file_*')
3736
files.sort()
37+
number_of_in_files = len(files)
3838
for file in files:
39-
subprocess.run(f'Rscript --vanilla ~/Git/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=False)
39+
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
4040
output_files = glob.glob("*_out.tsv")
41-
output_files.sort() # glob lacks reliable ordering, so impose your own if output order matters
42-
with open(f'junction_pvalues_{tag}.tsv', 'wb') as outfile:
43-
for i, fname in enumerate(output_files):
44-
with open(fname, 'rb') as infile:
45-
if i != 0:
46-
infile.readline() # Throw away header on all but first file
47-
# Block copy rest of file from input to output without parsing
48-
shutil.copyfileobj(infile, outfile)
49-
print(fname + " has been imported.")
41+
output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
42+
number_of_out_files = len(output_files)
43+
if number_of_in_files == number_of_out_files:
44+
with open(f'compare_junctions/hist/junction_pvalues_{tag}.tsv', 'wb') as outfile:
45+
for i, fname in enumerate(output_files):
46+
with open(fname, 'rb') as infile:
47+
if i != 0:
48+
infile.readline() # Throw away header on all but first file
49+
# Block copy rest of file from input to output without parsing
50+
shutil.copyfileobj(infile, outfile)
51+
print(fname + " has been imported.")
52+
else:
53+
print("Number of output files doesn't match the number of input files that should have been processed")
54+
files = glob.glob('small_file_*')
5055
for file in files:
51-
os.remove(file)
56+
os.remove(file)
5257

scripts/vep_aws_workflow.py

Whitespace-only changes.

0 commit comments

Comments
 (0)