14
14
15
15
# read in all splicing variants
16
16
all_splicing_variants = pd .read_csv (splicing_variants_inputfile , delimiter = '\t ' , header = 0 )
17
- # print(all_splicing_variants.head(20))
18
17
19
18
# create `key` to match the regtools variant_info column, and `key2`, which is `key` with the sample name appended
20
19
def createkey (row ):
@@ -47,22 +46,15 @@ def createkey(row):
47
46
df = pd .read_csv (path , delimiter = '\t ' , header = 0 )
48
47
df ['sample' ] = sample
49
48
df = df [['sample' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' , 'score' , 'name' , 'genes' ]]
50
- # print(df.info(verbose=True))
51
49
df = df .dropna (subset = ['variant_info' ])
52
- # print(df.info(verbose=True))
53
50
df = df .set_index (['sample' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' , 'score' , 'name' , 'genes' ]).apply (lambda x : x .str .split (',' ).explode ()).reset_index ()
54
- # print(df.info(verbose=True))
55
- # print(df.head(20))
56
- # print(all_splicing_variants.head(20))
57
51
df = df .loc [df ['variant_info' ].isin (all_splicing_variants ['key' ])]
58
- # print(df.info(verbose=True))
59
52
dfs .append (df )
60
53
61
54
# concat all individual dfs into one df
62
55
print ("Concatenating each sample's df together" )
63
56
master_df = pd .concat (dfs , axis = 0 , ignore_index = True )
64
57
del dfs
65
- # print(master_df.info(verbose=True))
66
58
67
59
# create various keys
68
60
def createkey (row ):
@@ -76,47 +68,59 @@ def createkey(row):
76
68
master_df ['key' ] = master_df .apply (lambda row : createkey (row ), axis = 1 )
77
69
78
70
def createkey(row):
    """Build the junction key ``<row[1]>_<row[2]>_<row[3]>`` for one row.

    Called through ``master_df.apply(..., axis=1)`` to fill the ``junction``
    column, so ``row`` is normally a pandas Series; any positionally
    indexable sequence (list, tuple) is accepted as well.

    NOTE(review): positions 1-3 are presumably chrom/start/end given the
    column order built upstream -- confirm against the full file.

    Args:
        row: One record; only positions 1, 2 and 3 are read.

    Returns:
        The three fields joined with underscores, as a string.
    """
    # Rows coming from DataFrame.apply(axis=1) are label-indexed Series;
    # go through .iloc so the integers are taken as positions, not labels.
    fields = row.iloc if hasattr(row, 'iloc') else row
    # str() every part: the first field is not guaranteed to be a string
    # (a numeric-only column is read as int), and `int + '_'` raises TypeError.
    return '_'.join(str(fields[pos]) for pos in (1, 2, 3))
81
73
master_df ['junction' ] = master_df .apply (lambda row : createkey (row ), axis = 1 )
82
- # print(master_df.info(verbose=True))
83
74
75
+ # subset the data to the samples that carry a splicing variant of interest
84
76
samples_w_variant_df = master_df .loc [master_df ['key' ].isin (all_splicing_variants ['key2' ])]
85
77
# print(samples_w_variant_df.info(verbose=True))
86
78
87
79
# start performing the calculations for this subset of data
88
80
print ('Calculating for samples with variants of interest' )
81
+ mode = 'blah'
89
82
print (samples_w_variant_df .head (10 ))
90
- samples_w_variant_df = (samples_w_variant_df >>
83
+ if mode == 'group' :
84
+ samples_w_variant_df = (samples_w_variant_df >>
91
85
group_by (X .key ) >>
92
86
summarize (score_tmp = X .score .sum ()) >>
93
87
outer_join (samples_w_variant_df , by = 'key' )
94
88
)
95
- samples_w_variant_df ['norm_score' ] = samples_w_variant_df ['score' ]/ samples_w_variant_df ['score_tmp' ]
96
- # tmp_df = samples_w_variant_df.groupby('info')['norm_score'].agg([np.mean, np.std])
97
- samples_w_variant_df = (samples_w_variant_df >>
89
+ samples_w_variant_df ['norm_score' ] = samples_w_variant_df ['score' ]/ samples_w_variant_df ['score_tmp' ]
90
+ samples_w_variant_df = (samples_w_variant_df >>
91
+ group_by ('junction' ) >>
92
+ summarize (mean_norm_score_variant = X .norm_score .mean (), sd_norm_score_variant = X .norm_score .std (), total_score_variant = X .score .sum ()) >>
93
+ outer_join (samples_w_variant_df , by = 'junction' )
94
+ )
95
+ tmp_df = samples_w_variant_df .groupby ('junction' )[['norm_score' , 'score' , 'variant_info' , 'sample' , 'name' , 'info' ]].aggregate (lambda x : x .tolist ()).reset_index ()
96
+ samples_w_variant_df = pd .merge (samples_w_variant_df , tmp_df , on = 'junction' )
97
+ samples_w_variant_df = samples_w_variant_df [['sample_y' , 'variant_info_y' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
98
+ 'info_y' , 'genes' , 'name_y' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
99
+ 'norm_score_y' , 'score_y' , 'junction' , 'total_score_variant' ]]
100
+ samples_w_variant_df .columns = ['samples' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
101
+ 'info' , 'genes' , 'names' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
102
+ 'norm_scores_variant' , 'scores' , 'junction' , 'total_score_variant' ]
103
+ samples_w_variant_df = samples_w_variant_df .astype (str ).drop_duplicates ()
104
+ else :
105
+ samples_w_variant_df = (samples_w_variant_df >>
106
+ group_by (X .key ) >>
107
+ summarize (score_tmp = X .score .sum ()) >>
108
+ outer_join (samples_w_variant_df , by = 'key' )
109
+ )
110
+ samples_w_variant_df ['norm_score' ] = samples_w_variant_df ['score' ]/ samples_w_variant_df ['score_tmp' ]
111
+ samples_w_variant_df = (samples_w_variant_df >>
98
112
group_by ('info' ) >>
99
113
summarize (mean_norm_score_variant = X .norm_score .mean (), sd_norm_score_variant = X .norm_score .std (), total_score_variant = X .score .sum ()) >>
100
114
outer_join (samples_w_variant_df , by = 'info' )
101
115
)
102
- # samples_w_variant_df = (samples_w_variant_df >>
103
- # group_by(X.info) >>
104
- # summarize_each([np.mean, np.std], X.norm_score) >>
105
- # outer_join(samples_w_variant_df, by='info')
106
- # )
107
- print (samples_w_variant_df .head (10 ))
108
- tmp_df = samples_w_variant_df .groupby ('info' )[['norm_score' , 'score' , 'sd_norm_score_variant' , 'mean_norm_score_variant' , 'sample' ]].aggregate (lambda x : x .tolist ()).reset_index ()
109
- samples_w_variant_df = pd .merge (samples_w_variant_df , tmp_df , on = 'info' )
110
- samples_w_variant_df = samples_w_variant_df [['sample_y' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
111
- 'info' , 'genes' , 'name' , 'mean_norm_score_variant_y' , 'sd_norm_score_variant_y' ,
116
+ tmp_df = samples_w_variant_df .groupby ('info' )[['norm_score' , 'score' , 'sd_norm_score_variant' , 'mean_norm_score_variant' , 'sample' , 'name' ]].aggregate (lambda x : x .tolist ()).reset_index ()
117
+ samples_w_variant_df = pd .merge (samples_w_variant_df , tmp_df , on = 'info' )
118
+ samples_w_variant_df = samples_w_variant_df [['sample_y' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
119
+ 'info' , 'genes' , 'name_y' , 'mean_norm_score_variant_y' , 'sd_norm_score_variant_y' ,
112
120
'norm_score_y' , 'score_y' , 'junction' , 'total_score_variant' ]]
113
- samples_w_variant_df .columns = ['samples' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
114
- 'info' , 'genes' , 'name' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
115
- 'norm_scores_variant' , 'scores' , 'junction_key' , 'total_score_variant' ]
116
- # samples_w_variant_df['mean_norm_score_variant'] = samples_w_variant_df.groupby(['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score_norm.mean().reset_index()
117
- # samples_w_variant_df['sd_norm_score_variant'] = samples_w_variant_df.groupby(['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score_norm.sd().reset_index()
118
- # samples_w_variant_df['total_score_variant'] = samples_w_variant_df.groupby(['variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score.sum().reset_index()
119
- print (samples_w_variant_df .head (10 ))
121
+ samples_w_variant_df .columns = ['samples' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
122
+ 'info' , 'genes' , 'names' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
123
+ 'norm_scores_variant' , 'scores' , 'junction' , 'total_score_variant' ]
120
124
121
125
# work on samples that don't have the variant of interest
122
126
@@ -130,7 +134,7 @@ def createkey(row):
130
134
131
135
mode = 'strict' #others include 'exclude' and 'group'
132
136
133
- if mode == 'strict' :
137
+ # if mode == 'strict':
134
138
135
139
136
140
0 commit comments