|
7 | 7 | import re |
8 | 8 | import io |
9 | 9 |
|
| 10 | + |
10 | 11 | def main(): |
11 | 12 | """ Main function """ |
12 | 13 | __doc__ = "Evaluate the run based on expected values" |
13 | 14 | outpath = os.path.dirname(os.path.realpath(__file__)) |
14 | 15 | os.chdir(outpath) |
15 | | - a = pd.read_csv( |
16 | | - 'results/GN/PUM2_K562_ENCSR661ICQ_2/PUM2_K562_ENCSR661ICQ_2_total_peaks.csv', |
17 | | - sep='\t', header=0, index_col=None) |
18 | | - expected = pd.read_csv( |
19 | | - 'means_crosslink.tsv', |
20 | | - sep='\t', header=0, index_col=None) |
21 | | - a['center'] = a['start'] + a['crosslink'] |
22 | | - |
23 | | - for index, row in a.iterrows(): |
24 | | - distance = ( |
25 | | - row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min() |
26 | | - if distance <= 50: |
27 | | - a.loc[index, 'distance'] = distance |
28 | | - else: |
29 | | - a.loc[index, 'distance'] = np.nan |
30 | | - rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()])) |
31 | | - outlier_percentage = len(a[a.distance.isna()])/len(a) * 100 |
32 | | - |
33 | | - |
34 | | - # expected = pd.read_csv( |
35 | | - # 'means_peak_center.tsv', |
36 | | - # sep='\t', header=0, index_col=None) |
37 | | - # a['center'] = a['start'] + a['mi'] |
38 | | - |
39 | | - # for index, row in a.iterrows(): |
40 | | - # distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min() |
41 | | - # if distance <= 50: |
42 | | - # a.loc[index, 'distance'] = distance |
43 | | - # else: |
44 | | - # a.loc[index, 'distance'] = np.nan |
45 | | - # sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n' |
46 | | - # ) |
47 | | - # sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n' |
48 | | - # ) |
49 | | - |
50 | | - # add the new test best pwm |
51 | | - path = f'results/GN/PUM2_K562_ENCSR661ICQ_2/motif_analysis_final/peak_center/motif_enrichment.tsv' |
52 | | - a = pd.read_csv(path, sep='\t', header=0, index_col=0) |
53 | | - b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif'] |
54 | | - run = b.split('_')[-1].split('.')[0] |
55 | | - name = path.split('/')[-4] |
56 | | - path_new = path.replace( |
57 | | - 'motif_analysis_final', |
58 | | - f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}') |
59 | | - shutil.copy(path_new, f'wlib/{name}') |
60 | | - |
61 | | - df = pd.DataFrame() |
62 | | - for path1 in glob.glob('wlib/*'): |
63 | | - name1 = path1.split('/')[-1] |
64 | | - for path2 in glob.glob('wlib/*'): |
65 | | - name2 = path2.split('/')[-1] |
66 | | - sim = getsimilarity(path1, path2) |
67 | | - df.loc[name1, name2] = sim |
68 | | - mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()]) |
69 | | - |
70 | | - sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n') |
71 | | - sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n') |
72 | | - sys.stdout.write(f'motif similarity: {mean_sim}\n') |
73 | | - if rmsd < 1.5: |
74 | | - sys.stdout.write('Rmsd is low. Test passed. 1/3\n') |
75 | | - else: |
76 | | - sys.stdout.write('Rmsd seems to be too high 1/3\n') |
77 | | - if outlier_percentage < 5: |
78 | | - sys.stdout.write('Few outliers detected. Test passed. 2/3\n') |
79 | | - else: |
80 | | - sys.stdout.write('Rmsd seems to be too high 2/3\n') |
81 | | - if mean_sim > 0.75: |
82 | | - sys.stdout.write('Motif similarity is high.Test passed 3/3\n') |
83 | | - else: |
84 | | - sys.stdout.write('Similarity seems to be low 3/3\n') |
| 16 | + for runtype in ['GN', 'TR']: |
| 17 | + for sample in ["PUM2_K562_ENCSR661ICQ_2_paired_end", "PUM2_K562_ENCSR661ICQ_2_single_end"]: |
| 18 | + a = pd.read_csv( |
| 19 | + f'results/{runtype}/{sample}/{sample}_total_peaks.csv', |
| 20 | + sep='\t', header=0, index_col=None) |
| 21 | + expected = pd.read_csv( |
| 22 | + 'means_crosslink.tsv', |
| 23 | + sep='\t', header=0, index_col=None) |
| 24 | + a['center'] = a['start'] + a['crosslink'] |
| 25 | + |
| 26 | + for index, row in a.iterrows(): |
| 27 | + distance = ( |
| 28 | + row['center'] - expected['crosslink'][expected['strand'] == row['strand']]).abs().min() |
| 29 | + if distance <= 50: |
| 30 | + a.loc[index, 'distance'] = distance |
| 31 | + else: |
| 32 | + a.loc[index, 'distance'] = np.nan |
| 33 | + if runtype == "GN": |
| 34 | + rmsd = np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()])) |
| 35 | + outlier_percentage = len(a[a.distance.isna()])/len(a) * 100 |
| 36 | + |
| 37 | + |
| 38 | + # expected = pd.read_csv( |
| 39 | + # 'means_peak_center.tsv', |
| 40 | + # sep='\t', header=0, index_col=None) |
| 41 | + # a['center'] = a['start'] + a['mi'] |
| 42 | + |
| 43 | + # for index, row in a.iterrows(): |
| 44 | + # distance = (row['center'] - expected['peak_center'][expected['strand'] == row['strand']]).abs().min() |
| 45 | + # if distance <= 50: |
| 46 | + # a.loc[index, 'distance'] = distance |
| 47 | + # else: |
| 48 | + # a.loc[index, 'distance'] = np.nan |
| 49 | + # sys.stdout.write(f'rmsd of peak_center as center: {np.sqrt(np.sum((a["distance"]**2)) / len(a[~a.distance.isna()]))}\n' |
| 50 | + # ) |
| 51 | + # sys.stdout.write(f'percentage of outlier peaks: {len(a[a.distance.isna()])/len(a) * 100} %\n' |
| 52 | + # ) |
| 53 | + |
| 54 | + # add the new test best pwm |
| 55 | + path = f'results/{runtype}/{sample}/motif_analysis_final/peak_center/motif_enrichment.tsv' |
| 56 | + a = pd.read_csv(path, sep='\t', header=0, index_col=0) |
| 57 | + b = a.loc[a['mean_enrichment'][a['motif'].str.contains('trimmed')].idxmax(), 'motif'] |
| 58 | + run = b.split('_')[-1].split('.')[0] |
| 59 | + name = path.split('/')[-4] |
| 60 | + path_new = path.replace( |
| 61 | + 'motif_analysis_final', |
| 62 | + f'motif_analysis_crossvalidation/{run}').replace('motif_enrichment.tsv', f'wlib/{b}') |
| 63 | + shutil.copy(path_new, f'wlib/PUM2_K562_ENCSR661ICQ_2') |
| 64 | + |
| 65 | + df = pd.DataFrame() |
| 66 | + for path1 in glob.glob('wlib/*'): |
| 67 | + name1 = path1.split('/')[-1] |
| 68 | + for path2 in glob.glob('wlib/*'): |
| 69 | + name2 = path2.split('/')[-1] |
| 70 | + sim = getsimilarity(path1, path2) |
| 71 | + df.loc[name1, name2] = sim |
| 72 | + mean_sim = np.mean([df.loc['PUM2_K562_ENCSR661ICQ_2', :].mean(), |
| 73 | + df.loc[:, 'PUM2_K562_ENCSR661ICQ_2'].mean()]) |
| 74 | + sys.stdout.write(f'''{runtype.replace("GN","Genomic").replace("TR","Transcriptomic")} {"-".join(sample.split("_")[-2:])} evaluation\n''') |
| 75 | + sys.stdout.write(f'motif similarity: {mean_sim}\n') |
| 76 | + if runtype == "GN": |
| 77 | + sys.stdout.write(f'rmsd of crosslink centers: {rmsd}\n') |
| 78 | + sys.stdout.write(f'percentage of outlier peaks: {outlier_percentage} % \n') |
| 79 | + |
| 80 | + |
| 81 | + if mean_sim > 0.75: |
| 82 | + sys.stdout.write('Motif similarity is high.Test passed 1/3\n') |
| 83 | + else: |
| 84 | + sys.stdout.write('Motif similarity seems to be low 1/3\n') |
| 85 | + if runtype == "GN": |
| 86 | + if rmsd < 1.5: |
| 87 | + sys.stdout.write('Rmsd is low. Peaks are highly overlapping. Test passed. 2/3\n') |
| 88 | + else: |
| 89 | + sys.stdout.write('Rmsd seems to be too high. Peaks are far apart. 2/3\n') |
| 90 | + if outlier_percentage < 5: |
| 91 | + sys.stdout.write('Few outliers detected. Test passed. 3/3\n') |
| 92 | + else: |
| 93 | + sys.stdout.write('Too many peaks are not found in the test runs 3/3\n') |
| 94 | + sys.stdout.write('\n') |
85 | 95 | return |
86 | 96 |
|
87 | 97 |
|
88 | | - |
89 | 98 | def getSimilarityScore(wm1, wm2): |
90 | 99 | wm1 = np.array(wm1) |
91 | 100 | wm2 = np.array(wm2) |
@@ -119,7 +128,7 @@ def get_wm(path): |
119 | 128 | def getsimilarity(wm1_path, wm2_path): |
120 | 129 | wm1 = get_wm(wm1_path) |
121 | 130 | wm2 = get_wm(wm2_path) |
122 | | - similarity = ( |
| 131 | + similarity = ( |
123 | 132 | (2 * getSimilarityScore(wm1, wm2)) / ( |
124 | 133 | getSimilarityScore(wm1, wm1) + getSimilarityScore(wm2, wm2))) |
125 | 134 | return similarity |
|
0 commit comments