Skip to content

Commit 0a230c4

Browse files
committed
first part working for new python script
1 parent e3be7ac commit 0a230c4

File tree

1 file changed

+36
-32
lines changed

1 file changed

+36
-32
lines changed

scripts/compare_junctions_hist.py

Lines changed: 36 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
# read in all splicing variants
1616
all_splicing_variants = pd.read_csv(splicing_variants_inputfile, delimiter='\t', header=0)
17-
# print(all_splicing_variants.head(20))
1817

1918
# create `key`, which matches the regtools variant_info column, and `key2`, which is the same as `key` but with the sample name added
2019
def createkey(row):
@@ -47,22 +46,15 @@ def createkey(row):
4746
df = pd.read_csv(path, delimiter='\t', header=0)
4847
df['sample'] = sample
4948
df = df[['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'score', 'name', 'genes']]
50-
# print(df.info(verbose=True))
5149
df = df.dropna(subset=['variant_info'])
52-
# print(df.info(verbose=True))
5350
df = df.set_index(['sample', 'chrom', 'start', 'end', 'strand', 'anchor', 'score', 'name', 'genes']).apply(lambda x: x.str.split(',').explode()).reset_index()
54-
# print(df.info(verbose=True))
55-
# print(df.head(20))
56-
# print(all_splicing_variants.head(20))
5751
df = df.loc[df['variant_info'].isin(all_splicing_variants['key'])]
58-
# print(df.info(verbose=True))
5952
dfs.append(df)
6053

6154
# concat all individual dfs into one df
6255
print("Concatenating each sample's df together")
6356
master_df = pd.concat(dfs, axis=0, ignore_index=True)
6457
del dfs
65-
# print(master_df.info(verbose=True))
6658

6759
# create various keys
6860
def createkey(row):
@@ -76,47 +68,59 @@ def createkey(row):
7668
master_df['key'] = master_df.apply(lambda row: createkey(row), axis=1)
7769

7870
def createkey(row):
79-
key = row[1] + '_' + str(row[2]) + '_' + str(row[3]) + '_' + row[0]
71+
key = row[1] + '_' + str(row[2]) + '_' + str(row[3])
8072
return key
8173
master_df['junction'] = master_df.apply(lambda row: createkey(row), axis=1)
82-
# print(master_df.info(verbose=True))
8374

75+
# subset the data to the samples that have a splicing variant of interest
8476
samples_w_variant_df = master_df.loc[master_df['key'].isin(all_splicing_variants['key2'])]
8577
# print(samples_w_variant_df.info(verbose=True))
8678

8779
# start performing the calculations for this subset of data
8880
print('Calculating for samples with variants of interest')
81+
mode = 'blah'
8982
print(samples_w_variant_df.head(10))
90-
samples_w_variant_df = (samples_w_variant_df >>
83+
if mode == 'group':
84+
samples_w_variant_df = (samples_w_variant_df >>
9185
group_by(X.key) >>
9286
summarize(score_tmp = X.score.sum()) >>
9387
outer_join(samples_w_variant_df, by='key')
9488
)
95-
samples_w_variant_df['norm_score'] = samples_w_variant_df['score']/samples_w_variant_df['score_tmp']
96-
# tmp_df = samples_w_variant_df.groupby('info')['norm_score'].agg([np.mean, np.std])
97-
samples_w_variant_df = (samples_w_variant_df >>
89+
samples_w_variant_df['norm_score'] = samples_w_variant_df['score']/samples_w_variant_df['score_tmp']
90+
samples_w_variant_df = (samples_w_variant_df >>
91+
group_by('junction') >>
92+
summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
93+
outer_join(samples_w_variant_df, by='junction')
94+
)
95+
tmp_df = samples_w_variant_df.groupby('junction')[['norm_score', 'score', 'variant_info', 'sample', 'name', 'info']].aggregate(lambda x: x.tolist()).reset_index()
96+
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='junction')
97+
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info_y', 'chrom', 'start', 'end', 'strand', 'anchor',
98+
'info_y', 'genes', 'name_y', 'mean_norm_score_variant', 'sd_norm_score_variant',
99+
'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
100+
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
101+
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
102+
'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
103+
samples_w_variant_df = samples_w_variant_df.astype(str).drop_duplicates()
104+
else:
105+
samples_w_variant_df = (samples_w_variant_df >>
106+
group_by(X.key) >>
107+
summarize(score_tmp = X.score.sum()) >>
108+
outer_join(samples_w_variant_df, by='key')
109+
)
110+
samples_w_variant_df['norm_score'] = samples_w_variant_df['score']/samples_w_variant_df['score_tmp']
111+
samples_w_variant_df = (samples_w_variant_df >>
98112
group_by('info') >>
99113
summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
100114
outer_join(samples_w_variant_df, by='info')
101115
)
102-
# samples_w_variant_df = (samples_w_variant_df >>
103-
# group_by(X.info) >>
104-
# summarize_each([np.mean, np.std], X.norm_score) >>
105-
# outer_join(samples_w_variant_df, by='info')
106-
# )
107-
print(samples_w_variant_df.head(10))
108-
tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant', 'mean_norm_score_variant', 'sample']].aggregate(lambda x: x.tolist()).reset_index()
109-
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
110-
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
111-
'info', 'genes', 'name', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
116+
tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant', 'mean_norm_score_variant', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
117+
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
118+
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
119+
'info', 'genes', 'name_y', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
112120
'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
113-
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
114-
'info', 'genes', 'name', 'mean_norm_score_variant', 'sd_norm_score_variant',
115-
'norm_scores_variant', 'scores', 'junction_key', 'total_score_variant']
116-
# samples_w_variant_df['mean_norm_score_variant'] = samples_w_variant_df.groupby(['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score_norm.mean().reset_index()
117-
# samples_w_variant_df['sd_norm_score_variant'] = samples_w_variant_df.groupby(['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score_norm.sd().reset_index()
118-
# samples_w_variant_df['total_score_variant'] = samples_w_variant_df.groupby(['variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score.sum().reset_index()
119-
print(samples_w_variant_df.head(10))
121+
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
122+
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
123+
'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
120124

121125
# work on samples that don't have the variant of interest
122126

@@ -130,7 +134,7 @@ def createkey(row):
130134

131135
mode = 'strict' #others include 'exclude' and 'group'
132136

133-
if mode == 'strict':
137+
# if mode == 'strict':
134138

135139

136140

0 commit comments

Comments
 (0)