Skip to content

Commit 519ff7d

Browse files
committed
add working version
1 parent f51efb7 commit 519ff7d

File tree

1 file changed

+28
-8
lines changed

1 file changed

+28
-8
lines changed

scripts/compare_junctions_hist.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def createkey(row):
153153
# Assign the working column names positionally (17 columns).
samples_w_variant_df.columns = [
    'sample_x', 'sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand',
    'anchor', 'info', 'genes', 'names', 'mean_norm_score_variant',
    'sd_norm_score_variant', 'norm_scores_variant', 'scores', 'junction',
    'total_score_variant',
]
# Collect the distinct 'sample_x' values seen for each variant as a list.
# dict.fromkeys de-duplicates while keeping first-seen order, so the list is
# identical on every run; list(set(...)) ordering changes with string hash
# randomisation, which makes anything derived from "the first sample"
# non-reproducible.
tmp_df = (
    samples_w_variant_df
    .groupby('variant_info')[['sample_x']]
    .aggregate(lambda x: list(dict.fromkeys(x.tolist())))
    .reset_index()
)
# Attach the per-variant sample list back onto every row of that variant.
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='variant_info')
158158
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
159159
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
@@ -227,12 +227,13 @@ def createkey(row):
227227
def add_zeros(row):
    """Pad a row's score list with zeros and return it as a comma-joined string.

    Positional fields read from ``row``:
      * ``row[1]`` -- list of scores, or the scalar ``0`` when there are none
      * ``row[2]`` -- count named ``samples_wout_variant``
      * ``row[3]`` -- count named ``samples_w_variant``

    The number of zeros appended is
    ``num_of_samples - row[2] - row[3]`` where ``num_of_samples`` is a
    module-level value defined elsewhere in the script.

    Returns:
        str: the padded scores, sorted in descending order, joined by ','.
    """
    norm_scores = row[1]
    if norm_scores == 0:
        # Use a numeric zero (not the string '0') so the list stays sortable
        # together with the numeric padding added below.
        norm_scores = [0]
    samples_wout_variant = row[2]
    samples_w_variant = row[3]
    num_of_zeros_toadd = num_of_samples - samples_wout_variant - samples_w_variant
    # Concatenation builds a new list, so the list stored in the row is not
    # mutated by the in-place sort.
    padded = norm_scores + np.repeat(0, num_of_zeros_toadd).tolist()
    padded.sort(reverse=True)
    return ','.join(map(str, padded))
238239

@@ -241,15 +242,15 @@ def add_zeros(row):
241242
del tmp_df
242243

243244
def get_mean(row):
    """Return the arithmetic mean of the comma-separated scores in ``row[33]``.

    ``row[33]`` must be a string of numbers joined by ',' (presumably the
    zero-padded non-variant scores -- confirm against the column order of
    ``master_df``). A fixed position is used instead of a negative index
    because the frame gains columns as the script runs.

    Returns:
        numpy.float64: mean of the parsed values.
    """
    values = [float(item) for item in row[33].split(',')]
    return np.mean(values)
248249

249250
# Row-wise mean of the score string; the function is passed directly, which
# is equivalent to wrapping it in a lambda.
master_df['mean_norm_score_non'] = master_df.apply(get_mean, axis=1)
250251

251252
def get_sd(row):
    """Return the standard deviation of the comma-separated scores in ``row[33]``.

    Reads the same positional field as ``get_mean``. ``np.std`` is called
    with its defaults, i.e. the population standard deviation (``ddof=0``).

    Returns:
        numpy.float64: standard deviation of the parsed values.
    """
    values = [float(item) for item in row[33].split(',')]
    return np.std(values)
@@ -265,23 +266,42 @@ def get_min(row):
265266
# Row-wise result of get_min; passing the function directly is equivalent to
# wrapping it in a lambda.
master_df['min_norm_score_variant'] = master_df.apply(get_min, axis=1)
266267

267268
def get_pvalue_mean(row):
    """Return an empirical p-value for ``row[10]`` against the scores in ``row[33]``.

    ``row[33]`` is a comma-separated string of numbers and ``row[10]`` is the
    score being tested (held in a local named ``mean_value``). The p-value is
    one minus the percentile rank of that score within the parsed values,
    expressed as a fraction.

    Returns:
        str: the p-value formatted as text with any surrounding array
        brackets removed (e.g. ``'0.25'``).
    """
    values = [float(item) for item in row[33].split(',')]
    mean_value = row[10]
    pvalue = 1 - (stats.percentileofscore(values, mean_value) / 100.0)
    # percentileofscore may return a scalar or an array depending on the
    # score it is given; np.asarray makes np.array_str accept both, and the
    # raw-string regex then strips the brackets of the array form.
    return re.sub(r'[\[\]]', '', np.array_str(np.asarray(pvalue)))
274276

275277
# Row-wise p-value from get_pvalue_mean; passing the function directly is
# equivalent to wrapping it in a lambda.
master_df['p_value_mean'] = master_df.apply(get_pvalue_mean, axis=1)
276278

277279
def get_pvalue_min(row):
    """Return an empirical p-value for ``row[36]`` against the scores in ``row[33]``.

    ``row[33]`` is a comma-separated string of numbers and ``row[36]`` is the
    score being tested (held in a local named ``min_value``). The p-value is
    one minus the percentile rank of that score within the parsed values,
    expressed as a fraction.

    Returns:
        str: the p-value formatted as text with any surrounding array
        brackets removed (e.g. ``'0.75'``).
    """
    values = [float(item) for item in row[33].split(',')]
    min_value = row[36]
    pvalue = 1 - (stats.percentileofscore(values, min_value) / 100.0)
    # percentileofscore may return a scalar or an array depending on the
    # score it is given; np.asarray makes np.array_str accept both, and the
    # raw-string regex then strips the brackets of the array form.
    return re.sub(r'[\[\]]', '', np.array_str(np.asarray(pvalue)))
284287

285288
# Use get_pvalue_min here: calling get_pvalue_mean made this column an exact
# copy of 'p_value_mean' and left get_pvalue_min unused.
master_df['p_value_min'] = master_df.apply(lambda row: get_pvalue_min(row), axis=1)
286289

290+
# Keep only the reporting columns, in output order (21 columns).
master_df = master_df[[
    'variant_samples', 'variant_info_x', 'genes_x', 'junction_samples',
    'chrom_x', 'start_x', 'end_x', 'strand_x', 'anchor_x', 'info',
    'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
    'norm_scores_variant', 'total_score_variant', 'mean_norm_score_non',
    'sd_norm_score_non', 'new_norm_scores', 'total_score_non',
    'p_value_mean', 'p_value_min',
]]
# Drop the merge suffixes and give the columns their final names.
master_df.columns = [
    'variant_samples', 'variant_info', 'genes', 'junction_samples',
    'chrom', 'start', 'end', 'strand', 'anchor', 'variant_junction_info',
    'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
    'norm_scores_variant', 'total_score_variant', 'mean_norm_score_non',
    'sd_norm_score_non', 'norm_scores_non', 'total_score_non',
    'p_value_mean', 'p_value_min',
]


def _collapse_list_cell(value):
    """Turn a list-valued cell into a scalar without discarding entries.

    A one-element list yields that element unchanged; longer lists are
    joined with ',' (an empty list becomes ''). Non-list values pass through.
    """
    if not isinstance(value, list):
        return value
    if len(value) == 1:
        return value[0]
    return ','.join(map(str, value))


# Taking only value[0] here dropped every sample after the first, which is
# why rows with more than one entry in 'variant_samples' appeared to be
# missing from the output.
master_df = master_df.applymap(_collapse_list_cell)
master_df = master_df.fillna(0)

master_df.to_csv('test_results.tsv', sep='\t', index=False)
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print().
master_df.info()

0 commit comments

Comments
 (0)