
Commit e3be7ac

initial commit of python version of stats script
1 parent 2e3a291 commit e3be7ac

File tree

1 file changed: +136 -0 lines changed


scripts/compare_junctions_hist.py

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
import csv
from itertools import groupby
import pandas as pd
from dfply import *
import numpy as np
import glob
import os

tag = 'E'
os.chdir('/Users/kcotto/Desktop/CHOL/')
splicing_variants_inputfile = '/Users/kcotto/Desktop/CHOL/all_splicing_variants_E.bed'
samples_inputfile = '/Users/kcotto/Desktop/CHOL/dir_names.tsv'

# read in all splicing variants
all_splicing_variants = pd.read_csv(splicing_variants_inputfile, delimiter='\t', header=0)
# print(all_splicing_variants.head(20))

# create key to match regtools variant_info column and key2 that is the same as key but with sample name added
def createkey(row):
    key = row[0] + ':' + str(row[1]) + '-' + str(row[2])
    return key
all_splicing_variants['key'] = all_splicing_variants.apply(lambda row: createkey(row), axis=1)

def createkey(row):
    key = row[0] + ':' + str(row[1]) + '-' + str(row[2]) + '_' + row[3]
    return key
all_splicing_variants['key2'] = all_splicing_variants.apply(lambda row: createkey(row), axis=1)

# read in the sample names
all_samples = []
with open(samples_inputfile, 'r') as samples:
    reader = csv.reader(samples, delimiter='\t')
    for line in reader:
        all_samples.append(line[0])

### read in all of the regtools cse output for this cohort ###
# create list to hold each sample's df
dfs = []

# read each sample's output file into a df and subset columns, split variants into multirows,
# and require that variant is in all_splicing_variants
for sample in all_samples:
    path = f'samples/{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
    print(f'Reading in {sample}')
    df = pd.read_csv(path, delimiter='\t', header=0)
    df['sample'] = sample
    df = df[['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'score', 'name', 'genes']]
    # print(df.info(verbose=True))
    df = df.dropna(subset=['variant_info'])
    # print(df.info(verbose=True))
    df = df.set_index(['sample', 'chrom', 'start', 'end', 'strand', 'anchor', 'score', 'name', 'genes']).apply(lambda x: x.str.split(',').explode()).reset_index()
    # print(df.info(verbose=True))
    # print(df.head(20))
    # print(all_splicing_variants.head(20))
    df = df.loc[df['variant_info'].isin(all_splicing_variants['key'])]
    # print(df.info(verbose=True))
    dfs.append(df)

# concat all individual dfs into one df
print("Concatenating each sample's df together")
master_df = pd.concat(dfs, axis=0, ignore_index=True)
del dfs
# print(master_df.info(verbose=True))

# create various keys (positional columns of master_df: 0=sample, 1=chrom, 2=start, 3=end, 5=anchor, 9=variant_info)
# 'info': chrom_start_end_anchor_variant
def createkey(row):
    key = row[1] + '_' + str(row[2]) + '_' + str(row[3]) + '_' + row[5] + '_' + row[9]
    return key
master_df['info'] = master_df.apply(lambda row: createkey(row), axis=1)

# 'key': variant_sample, matches the key2 format in all_splicing_variants
def createkey(row):
    key = row[9] + '_' + row[0]
    return key
master_df['key'] = master_df.apply(lambda row: createkey(row), axis=1)

# 'junction': chrom_start_end_sample
def createkey(row):
    key = row[1] + '_' + str(row[2]) + '_' + str(row[3]) + '_' + row[0]
    return key
master_df['junction'] = master_df.apply(lambda row: createkey(row), axis=1)
# print(master_df.info(verbose=True))

samples_w_variant_df = master_df.loc[master_df['key'].isin(all_splicing_variants['key2'])]
# print(samples_w_variant_df.info(verbose=True))

# start performing the calculations for this subset of data
print('Calculating for samples with variants of interest')
print(samples_w_variant_df.head(10))
samples_w_variant_df = (samples_w_variant_df >>
                        group_by(X.key) >>
                        summarize(score_tmp=X.score.sum()) >>
                        outer_join(samples_w_variant_df, by='key')
                        )
samples_w_variant_df['norm_score'] = samples_w_variant_df['score']/samples_w_variant_df['score_tmp']
# tmp_df = samples_w_variant_df.groupby('info')['norm_score'].agg([np.mean, np.std])
samples_w_variant_df = (samples_w_variant_df >>
                        group_by('info') >>
                        summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
                        outer_join(samples_w_variant_df, by='info')
                        )
# samples_w_variant_df = (samples_w_variant_df >>
#                         group_by(X.info) >>
#                         summarize_each([np.mean, np.std], X.norm_score) >>
#                         outer_join(samples_w_variant_df, by='info')
#                         )
print(samples_w_variant_df.head(10))
tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant', 'mean_norm_score_variant', 'sample']].aggregate(lambda x: x.tolist()).reset_index()
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                             'info', 'genes', 'name', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
                                             'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                'info', 'genes', 'name', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                'norm_scores_variant', 'scores', 'junction_key', 'total_score_variant']
# samples_w_variant_df['mean_norm_score_variant'] = samples_w_variant_df.groupby(['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score_norm.mean().reset_index()
# samples_w_variant_df['sd_norm_score_variant'] = samples_w_variant_df.groupby(['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score_norm.sd().reset_index()
# samples_w_variant_df['total_score_variant'] = samples_w_variant_df.groupby(['variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score.sum().reset_index()
print(samples_w_variant_df.head(10))

# work on samples that don't have the variant of interest

samples_wout_variant_df = master_df[~master_df['key'].isin(all_splicing_variants['key2'])]
samples_wout_variant_df = (samples_wout_variant_df >>
                           group_by(X.key) >>
                           summarize(score_tmp=X.score.sum()) >>
                           outer_join(samples_wout_variant_df, by='key')
                           )
samples_wout_variant_df['norm_score'] = samples_wout_variant_df['score']/samples_wout_variant_df['score_tmp']

mode = 'strict'  # others include 'exclude' and 'group'

if mode == 'strict':
    # strict-mode handling is not yet implemented in this commit
    pass
