import csv
from itertools import groupby
import pandas as pd
from dfply import *
import numpy as np
import glob
import os

tag = 'E'
os.chdir('/Users/kcotto/Desktop/CHOL/')
splicing_variants_inputfile = '/Users/kcotto/Desktop/CHOL/all_splicing_variants_E.bed'
samples_inputfile = '/Users/kcotto/Desktop/CHOL/dir_names.tsv'

# read in all splicing variants
all_splicing_variants = pd.read_csv(splicing_variants_inputfile, delimiter='\t', header=0)
# print(all_splicing_variants.head(20))

# create 'key' to match the regtools variant_info column (chrom:start-end) and
# 'key2', which is the same key with the sample name (BED column 4) appended
def create_variant_key(row):
    return row[0] + ':' + str(row[1]) + '-' + str(row[2])

all_splicing_variants['key'] = all_splicing_variants.apply(create_variant_key, axis=1)

def create_variant_sample_key(row):
    return row[0] + ':' + str(row[1]) + '-' + str(row[2]) + '_' + row[3]

all_splicing_variants['key2'] = all_splicing_variants.apply(create_variant_sample_key, axis=1)

# read in the sample names
all_samples = []
with open(samples_inputfile, 'r') as samples:
    reader = csv.reader(samples, delimiter='\t')
    for line in reader:
        all_samples.append(line[0])

### read in all of the regtools cse output for this cohort ###
# create list to hold each sample's df
dfs = []

# read each sample's output file into a df and subset columns, split variants into multirows,
# and require that variant is in all_splicing_variants
for sample in all_samples:
    path = f'samples/{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
    print(f'Reading in {sample}')
    df = pd.read_csv(path, delimiter='\t', header=0)
    df['sample'] = sample
    df = df[['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'score', 'name', 'genes']]
    # print(df.info(verbose=True))
    df = df.dropna(subset=['variant_info'])
    # print(df.info(verbose=True))
    # split comma-separated variant_info values into one row per variant
    df = df.set_index(['sample', 'chrom', 'start', 'end', 'strand', 'anchor', 'score', 'name', 'genes']).apply(lambda x: x.str.split(',').explode()).reset_index()
    # print(df.info(verbose=True))
    # print(df.head(20))
    # print(all_splicing_variants.head(20))
    df = df.loc[df['variant_info'].isin(all_splicing_variants['key'])]
    # print(df.info(verbose=True))
    dfs.append(df)

# concat all individual dfs into one df
print("Concatenating each sample's df together")
master_df = pd.concat(dfs, axis=0, ignore_index=True)
del dfs
# print(master_df.info(verbose=True))

# create various keys; after concat, master_df columns are
# [sample, chrom, start, end, strand, anchor, score, name, genes, variant_info]
def make_info_key(row):
    # chrom_start_end_anchor_variant
    return row[1] + '_' + str(row[2]) + '_' + str(row[3]) + '_' + row[5] + '_' + row[9]

master_df['info'] = master_df.apply(make_info_key, axis=1)

def make_variant_sample_key(row):
    # variant_sample, comparable to 'key2' in all_splicing_variants
    return row[9] + '_' + row[0]

master_df['key'] = master_df.apply(make_variant_sample_key, axis=1)

def make_junction_key(row):
    # chrom_start_end_sample
    return row[1] + '_' + str(row[2]) + '_' + str(row[3]) + '_' + row[0]

master_df['junction'] = master_df.apply(make_junction_key, axis=1)
# print(master_df.info(verbose=True))

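# keep only junction rows from samples that actually carry the splicing variant
# (master_df 'key' = variant_sample matches all_splicing_variants 'key2')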
samples_w_variant_df = master_df.loc[master_df['key'].isin(all_splicing_variants['key2'])]
# print(samples_w_variant_df.info(verbose=True))

# start performing the calculations for this subset of data
print('Calculating for samples with variants of interest')
print(samples_w_variant_df.head(10))
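# for each variant/sample pair ('key'), total the junction scores and normalize each
# junction's score by that total, so norm_score is the junction's share of the sample's
# read support for that variant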
samples_w_variant_df = (samples_w_variant_df >>
                        group_by(X.key) >>
                        summarize(score_tmp=X.score.sum()) >>
                        outer_join(samples_w_variant_df, by='key')
                        )
samples_w_variant_df['norm_score'] = samples_w_variant_df['score'] / samples_w_variant_df['score_tmp']
# tmp_df = samples_w_variant_df.groupby('info')['norm_score'].agg([np.mean, np.std])
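# per junction/variant combination ('info'), summarize across the variant-bearing samples:
# mean and standard deviation of the normalized scores plus the total raw score,
# joined back onto the per-sample rows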
samples_w_variant_df = (samples_w_variant_df >>
                        group_by('info') >>
                        summarize(mean_norm_score_variant=X.norm_score.mean(),
                                  sd_norm_score_variant=X.norm_score.std(),
                                  total_score_variant=X.score.sum()) >>
                        outer_join(samples_w_variant_df, by='info')
                        )
# samples_w_variant_df = (samples_w_variant_df >>
#                         group_by(X.info) >>
#                         summarize_each([np.mean, np.std], X.norm_score) >>
#                         outer_join(samples_w_variant_df, by='info')
#                         )
print(samples_w_variant_df.head(10))
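# collapse the per-sample values for each 'info' group into lists and merge them back;
# pandas suffixes the overlapping columns with _x/_y, so the list-valued _y columns are
# kept and renamed below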
tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant', 'mean_norm_score_variant', 'sample']].aggregate(lambda x: x.tolist()).reset_index()
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                             'info', 'genes', 'name', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
                                             'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                'info', 'genes', 'name', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                'norm_scores_variant', 'scores', 'junction_key', 'total_score_variant']
# samples_w_variant_df['mean_norm_score_variant'] = samples_w_variant_df.groupby(['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score_norm.mean().reset_index()
# samples_w_variant_df['sd_norm_score_variant'] = samples_w_variant_df.groupby(['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score_norm.sd().reset_index()
# samples_w_variant_df['total_score_variant'] = samples_w_variant_df.groupby(['variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'info']).score.sum().reset_index()
print(samples_w_variant_df.head(10))

# work on samples that don't have the variant of interest

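# apply the same per-(variant, sample) score normalization used above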
samples_wout_variant_df = master_df[~master_df['key'].isin(all_splicing_variants['key2'])]
samples_wout_variant_df = (samples_wout_variant_df >>
                           group_by(X.key) >>
                           summarize(score_tmp=X.score.sum()) >>
                           outer_join(samples_wout_variant_df, by='key')
                           )
samples_wout_variant_df['norm_score'] = samples_wout_variant_df['score'] / samples_wout_variant_df['score_tmp']

mode = 'strict'  # other options include 'exclude' and 'group'

if mode == 'strict':
    pass  # strict-mode handling goes here