@@ -9,29 +9,36 @@ library(foreach)
9
9
library(doParallel )
10
10
registerDoParallel(cores = 32 )
11
11
12
- system.time({
13
- # get options tag
14
- # argc = length(commandArgs())
15
- tag = paste(" _" , " default" , sep = " " )
12
+ debug = F
16
13
17
- if ( substr(tag , 2 , 3 ) == " --" ){
18
- stop(" Please specify an option tag (e.g. \" default\" , \" i20e5\" )" )
14
+ system.time({
15
+ if (debug ){
16
+ tag = paste(" _" , " default" , sep = " " )
17
+ } else {
18
+ # get options tag
19
+ argc = length(commandArgs())
20
+ tag = paste(" _" , commandArgs(trailingOnly = F )[argc ], sep = " " )
21
+
22
+ if ( substr(tag , 2 , 3 ) == " --" ){
23
+ stop(" Please specify an option tag (e.g. \" default\" , \" i20e5\" )" )
24
+ }
19
25
}
20
26
21
27
# # All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names)
22
28
all_splicing_variants = unique(data.table :: fread(paste(" all_splicing_variants" , tag , " .bed" ,sep = " " ), sep = ' \t ' , header = T , stringsAsFactors = FALSE ))
23
29
colnames(all_splicing_variants ) <- c(" chrom" , " start" , " end" , " samples" )
24
30
25
31
all_splicing_variants = as.data.table(aggregate(samples ~ chrom + start + end , paste , data = all_splicing_variants ))
26
- all_splicing_variants $ key <- paste0(all_splicing_variants $ chrom , " :" , all_splicing_variants $ start , " -" , all_splicing_variants $ end )
32
+ all_splicing_variants $ key <- paste0(all_splicing_variants $ chrom , " :" , all_splicing_variants $ start , " -" , all_splicing_variants $ end ) # this key is just a 1bp-long chrom:start-stop designed to match the regtools output variant_info column
27
33
28
- # # Get all of the samples
34
+ # # Get all of the samples
29
35
all_samples = strsplit(scan(" dir_names.tsv" , what = " " , sep = " \n " ), " [[:space:]]+" )
30
36
31
37
# ###############################################################################
32
38
# #### Helper functions #########################################################
33
39
# ###############################################################################
34
40
41
+ # basically combines all the cse_identify_filtered_compare files for this cohort
35
42
get_sample_data <- function (sample ){
36
43
37
44
file_name = paste(" samples/" , sample , " /output/cse_identify_filtered_compare" , tag ," .tsv" , sep = " " )
@@ -63,8 +70,8 @@ get_sample_data <- function(sample){
63
70
# #### Combine cse_identify_filtered_compare.tsv files for each sample ##########
64
71
# ###############################################################################
65
72
66
- # this is regtools output per sample
67
- df <- rbindlist(lapply(all_samples , get_sample_data ))
73
+ # this is regtools compare output across samples (so it contains variant-junction lines even from samples without variant)
74
+ df <- rbindlist(lapply(all_samples , get_sample_data )) # eventually becomes all_cse_identify_data in function a
68
75
df $ info <- paste(df $ chrom , df $ start ,
69
76
df $ end , df $ anchor ,
70
77
df $ variant_info , sep = " _" )
@@ -74,15 +81,14 @@ df$info <- paste(df$chrom, df$start,
74
81
# ###############################################################################
75
82
76
83
a <- function (x , all_cse_identify_data ){
77
-
78
- # # Get data from samples with variant
79
- variant_samples <- unlist(x $ samples )
84
+ # # x comes from all_splicing_variants, so this just gives the samples with the variant
85
+ variant_samples <- unlist(x $ samples )
80
86
81
87
# #############################################################################
82
88
# #### Get junction and variant information for variant samples ###############
83
89
# #############################################################################
84
90
85
- # # Get the data from sample with the variant and the significant junctions
91
+ # # Get junctions from concatenated regtools output associated with each variant, only if the data came from a sample actually containing the variant
86
92
variant_junctions_data <- all_cse_identify_data [all_cse_identify_data $ variant_info == x $ key & all_cse_identify_data $ sample %in% variant_samples ,]
87
93
88
94
# # Make dataset with the sample and the junction/variant information
@@ -95,22 +101,22 @@ a <- function(x, all_cse_identify_data){
95
101
variant_junctions_data $ norm_score <- variant_junctions_data $ score / variant_junctions_data $ score.tmp
96
102
97
103
# Aggregate data across junction-samples
98
- if (nrow(variant_junctions_data )[[1 ]] > 0 ){
99
- variant_junctions_aggr = variant_junctions_data [, list (average_norm_score = mean(norm_score ),total_score = sum(score )),
100
- by = list (chrom ,start ,end ,name , strand ,anchor ,variant_info ,info )]
104
+ if (nrow(variant_junctions_data )[[1 ]] > 0 ){ # names column refers to names that regtools gave each junction in the variant output (only will be multiple if variant and junction was found in multiple samples)
105
+ variant_junctions_aggr = variant_junctions_data [, list (names = list ( name ), mean_norm_score_variant = mean(norm_score ),sd_norm_score_variant = sd( norm_score ), norm_scores_variant = list ( norm_score ), total_score_variant = sum(score )),
106
+ by = list (chrom ,start ,end ,strand ,anchor ,variant_info ,info )]
101
107
102
108
} else {
103
109
return (data.table())
104
110
}
105
111
106
112
# # Identify mutated samples
107
113
a <- function (x , tempData ){
108
- return (paste(as.character(tempData [info == x , " sample" ]$ sample ), collapse = " | " ) )
114
+ return (paste(as.character(tempData [info == x , " sample" ]$ sample ), collapse = " , " )) # for super-junction aggregated across samples with the variant, get list of samples with variant (and junction )
109
115
}
110
116
variant_junctions_aggr $ sample <- sapply(variant_junctions_aggr $ info , a , tempData = sample_data )
111
117
112
118
# #############################################################################
113
- # #### Get junction and variant information for non-variant samples#### ###########
119
+ # #### Get junction and variant information for non-variant samples ###########
114
120
# #############################################################################
115
121
116
122
# # Samples without variant
@@ -130,13 +136,22 @@ a <- function(x, all_cse_identify_data){
130
136
colnames(summed_non_variant_scores ) <- c(" sample" , " score.tmp" )
131
137
non_variant_junctions_data <- merge(non_variant_junctions_data , summed_non_variant_scores , by = " sample" )
132
138
non_variant_junctions_data $ norm_score <- non_variant_junctions_data $ score / non_variant_junctions_data $ score.tmp
133
-
134
-
135
139
140
+ # Aggregate data across junction-samples
141
+ if (nrow(variant_junctions_data )[[1 ]] > 0 ){
142
+ non_variant_junctions_aggr = non_variant_junctions_data [, list (mean_norm_score_non = mean(norm_score ),sd_norm_score_non = sd(norm_score ),norm_scores_non = sd(norm_score ),total_score_non = sum(score )),
143
+ by = list (chrom ,start ,end ,strand ,anchor ,variant_info ,info )]
144
+ } else {
145
+ return (data.table())
146
+ }
136
147
137
148
# Generate histogram of normalized junction scores
138
149
collapsed_norm_scores = non_variant_junctions_data [, list (norm_scores = list (norm_score )), by = list (chrom ,start ,end ,strand )]
139
150
151
+ # Add non-variant sample aggregate stats to output
152
+ variant_junctions_aggr = merge(x = variant_junctions_aggr , y = non_variant_junctions_aggr ,by =
153
+ c(" chrom" ," start" ," end" ," strand" ," anchor" ," variant_info" ," info" ),all.x = T )
154
+
140
155
# split out what p-values can be calculated on vs not
141
156
collapsed_norm_scores $ key <- paste(collapsed_norm_scores $ chrom , collapsed_norm_scores $ start , collapsed_norm_scores $ end , collapsed_norm_scores $ strand , sep = ' .' )
142
157
variant_junctions_aggr $ key <- paste(variant_junctions_aggr $ chrom , variant_junctions_aggr $ start , variant_junctions_aggr $ end , variant_junctions_aggr $ strand , sep = ' .' )
@@ -149,7 +164,7 @@ a <- function(x, all_cse_identify_data){
149
164
start = trimws(junction_data [[" start" ]])
150
165
end = trimws(junction_data [[" end" ]])
151
166
strand = trimws(junction_data [[" strand" ]])
152
- variant_norm_score = junction_data [[" average_norm_score " ]]
167
+ variant_norm_score = junction_data [[" mean_norm_score_variant " ]]
153
168
# ## pass this variable
154
169
i = which(collapsed_norm_scores $ chrom == chrom & collapsed_norm_scores $ start == start & collapsed_norm_scores $ end == end & collapsed_norm_scores $ strand == strand )
155
170
if (length(i )){
@@ -186,16 +201,33 @@ a <- function(x, all_cse_identify_data){
186
201
variant_junctions_aggr <- variant_junctions_aggr_norun_pvalues
187
202
}
188
203
189
- return (variant_junctions_aggr )
190
-
204
+ return (variant_junctions_aggr )
191
205
}
192
206
193
- regtools_data <- adply(all_splicing_variants , 1 , a , all_cse_identify_data = df , .parallel = TRUE )
207
+ # for each iteration (i.e. line in all_splicing_variants),
208
+ # return a dataframe of regtools info and stats, basically what you end up seeing in regtools_data
209
+ # so the samples column comes from all_splicing_variants, since adply uses rbind.fill, which fills in missing columns
210
+ regtools_data <- adply(all_splicing_variants , 1 , a , all_cse_identify_data = df , .parallel = ! debug )
194
211
195
- # regtools_data <- rbindlist(regtools_data, fill=TRUE)
212
+ paste_commas <- function (v ){
213
+ return (paste(v ,collapse = " ," ))
214
+ }
196
215
197
216
regtools_data <- regtools_data [order(anchor , sample , pvalue )]
198
- regtools_data $ samples <- paste(regtools_data $ samples )
217
+ regtools_data $ norm_scores_variant <- unlist(lapply(regtools_data $ norm_scores_variant ,paste_commas ))
218
+ regtools_data $ norm_scores_non <- unlist(lapply(regtools_data $ norm_scores_non ,paste_commas ))
219
+ regtools_data $ samples <- unlist(lapply(regtools_data $ samples ,paste_commas ))
220
+ regtools_data $ names <- unlist(lapply(regtools_data $ names ,paste_commas ))
221
+ regtools_data $ sd_norm_score_variant [is.na(regtools_data $ sd_norm_score_variant )] = 0
222
+ regtools_data $ mean_norm_score_non [is.na(regtools_data $ mean_norm_score_non )] = 0
223
+ regtools_data $ sd_norm_score_non [is.na(regtools_data $ sd_norm_score_non )] = 0
224
+ regtools_data $ total_score_variant [is.na(regtools_data $ total_score_variant )] = 0
225
+ regtools_data $ total_score_non [is.na(regtools_data $ total_score_non )] = 0
226
+
227
+ regtools_data = subset(regtools_data , select = ! (names(regtools_data ) %in% c(" key" )))
228
+ colnames(regtools_data )[colnames(regtools_data )== " sample" ] <- " variant_junction_samples"
229
+ colnames(regtools_data )[colnames(regtools_data )== " samples" ] <- " variant_samples"
230
+ colnames(regtools_data )[colnames(regtools_data )== " info" ] <- " variant_junction_info"
199
231
})
200
- write.table(regtools_data , file = paste(" compare_junctions/hist/" , " junction_pvalues" , tag , " .tsv" , sep = " " ), quote = FALSE , sep = ' \t ' , row.names = F )
201
232
233
+ write.table(regtools_data , file = paste(" compare_junctions/hist/" , " junction_pvalues" , tag , " .tsv" , sep = " " ), quote = FALSE , sep = ' \t ' , row.names = F )
0 commit comments