1+ #!/usr/bin/python
2+ # -*- coding: utf-8 -*-
3+ """
4+ Created on Fri Oct 13 17:49:59 2017
5+
6+ @author: philipp
7+ """
8+
9+ # Average counts across replicates
10+ # =======================================================================
11+ # Imports
12+ from __future__ import division # floating point division by default
13+ import pandas
14+ import numpy
15+ import os
16+ import glob
17+ import time
18+ import yaml
19+ import sys
20+
21+
def _load_sorted_counts(repl_path, pattern, colnames, sort_keys):
    """Read the count table matching *pattern* inside directory *repl_path*.

    The file is parsed as a header-less, tab-separated table with columns
    *colnames* and returned sorted by *sort_keys*.

    Raises:
        IOError: if no file in *repl_path* matches *pattern*.
    """
    matches = glob.glob(os.path.join(repl_path, pattern))
    if not matches:
        raise IOError("No file matching '" + pattern + "' found in " + repl_path)
    table = pandas.read_table(matches[0], sep='\t', names=colnames)
    return table.sort_values(sort_keys)


def _average_counts(tables, id_columns, repl_avg, avg_column):
    """Collapse per-replicate count tables into identifiers plus one average.

    Rows are matched by position, so every table must have been sorted the
    same way and contain the same number of rows (pandas raises ValueError
    otherwise). The identifier columns are taken from the last table.

    Raises:
        ValueError: if *repl_avg* is neither 'median' nor 'mean'.
    """
    # Plain lists (not Series) so that a length mismatch raises instead of
    # being silently index-aligned and NaN-filled.
    replicate_counts = pandas.DataFrame(
        dict((i, list(table['counts'])) for i, table in enumerate(tables)))
    if repl_avg == 'median':
        avg_counts = replicate_counts.median(axis=1)
    elif repl_avg == 'mean':
        avg_counts = replicate_counts.mean(axis=1)
    else:
        raise ValueError("Unknown 'repl_avg' setting: " + repr(repl_avg) +
                         " (expected 'median' or 'mean')")
    last = tables[-1]
    result = pandas.DataFrame(
        dict((col, list(last[col])) for col in id_columns),
        columns=id_columns)
    result[avg_column] = avg_counts
    return result


def AverageReadCounts(treatment):
    """Average the read counts of all replicates of *treatment*.

    Settings are read from 'configuration.yaml' in the current working
    directory (keys 'ScriptsDir', 'AlnQCDir' and 'repl_avg'). Every
    sub-directory of AlnQCDir whose name contains *treatment* but not '_avg'
    is treated as one replicate. If at least two replicates exist, the
    '*GuideCounts.txt', '*GuideCounts_0.txt', '*GeneCounts.txt' and
    '*GeneCounts_0.txt' tables of the replicates are averaged row-wise
    (median or mean, per 'repl_avg') and written as header-less,
    tab-separated files to AlnQCDir/<treatment>_avg/.

    Side effects: prints progress and elapsed time, and finishes by changing
    the working directory to ScriptsDir.

    Raises:
        ValueError: if replicates are averaged and 'repl_avg' is neither
            'median' nor 'mean'.
        IOError: if a replicate directory lacks one of the count files.
    """
    # ------------------------------------------------
    # Get parameters
    # ------------------------------------------------
    with open('configuration.yaml', 'r') as configFile:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # recent PyYAML releases and may construct arbitrary objects.
        config = yaml.safe_load(configFile)
    ScriptsDir = config['ScriptsDir']
    AlnQCDir = config['AlnQCDir']
    repl_avg = config['repl_avg']
    AvgDir = treatment + '_avg'

    # ------------------------------------------------
    # Get counts from each replicate
    # ------------------------------------------------
    start = time.time()
    print('Processing ' + treatment + ' ...')
    # (file suffix, column names, sort keys) of the four tables to average
    datasets = [
        ('GuideCounts', ['sgRNA', 'gene', 'counts'], ['gene', 'sgRNA']),
        ('GuideCounts_0', ['sgRNA', 'gene', 'counts'], ['gene', 'sgRNA']),
        ('GeneCounts', ['gene', 'counts'], ['gene']),
        ('GeneCounts_0', ['gene', 'counts'], ['gene']),
    ]
    # NOTE(review): substring match - a treatment whose name is contained in
    # another treatment's name also picks up that one's replicates; confirm
    # against the directory naming convention.
    ReplDirs = [d for d in os.listdir(AlnQCDir)
                if treatment in d and '_avg' not in d
                and os.path.isdir(os.path.join(AlnQCDir, d))]
    R = len(ReplDirs)
    if R >= 2:
        print('Averaging read counts over ' + str(R) + ' replicates ...')
        # ------------------------------------------------
        # Compute averages (all of them before anything is written, so a
        # failure does not leave a partial result directory)
        # ------------------------------------------------
        averaged = []
        for suffix, colnames, sort_keys in datasets:
            tables = [
                _load_sorted_counts(os.path.join(AlnQCDir, replicate),
                                    '*' + suffix + '.txt', colnames, sort_keys)
                for replicate in ReplDirs
            ]
            averaged.append(
                (suffix, _average_counts(tables, colnames[:-1], repl_avg, AvgDir)))
        # ------------------------------------------------
        # Write result dataframes
        # ------------------------------------------------
        out_dir = os.path.join(AlnQCDir, AvgDir)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        for suffix, table in averaged:
            out_file = os.path.join(out_dir, treatment + '_avg_' + suffix + '.txt')
            table.to_csv(out_file, sep='\t', index=False, header=False)
    else:
        print('(No replicates found)')

    # --------------------------------------
    # Time stamp
    # --------------------------------------
    os.chdir(ScriptsDir)
    end = time.time()
    # Final time stamp
    sec_elapsed = end - start
    if sec_elapsed < 60:
        time_elapsed = sec_elapsed
        print('Time elapsed [secs]: ' + '%.3f' % time_elapsed)
    elif sec_elapsed < 3600:
        time_elapsed = sec_elapsed / 60
        print('Time elapsed [mins]: ' + '%.3f' % time_elapsed)
    else:
        time_elapsed = sec_elapsed / 3600
        print('Time elapsed [hours]: ' + '%.3f' % time_elapsed)
159+
if __name__ == "__main__":
    # The treatment name is the single required command-line argument; fail
    # with a usage hint instead of a bare IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: ' + sys.argv[0] + ' <treatment>')
    AverageReadCounts(sys.argv[1])
0 commit comments