11import sys
22import os
3- import scipy as sp
3+ import numpy as np
44import pickle
55import h5py
66
@@ -34,11 +34,11 @@ def _prepare_count_hdf5(options, OUT, event_features, sample_idx=None):
3434 OUT .create_dataset (name = 'strains' , data = codeUTF8 (options .strains ))
3535 feat = OUT .create_group (name = 'event_features' )
3636 for f in event_features :
37- feat .create_dataset (name = f , data = codeUTF8 (sp .array (event_features [f ], dtype = 'str' )))
38- OUT .create_dataset (name = 'gene_names' , data = codeUTF8 (sp .array ([x .name for x in genes ], dtype = 'str' )))
39- OUT .create_dataset (name = 'gene_chr' , data = codeUTF8 (sp .array ([x .chr for x in genes ], dtype = 'str' )))
40- OUT .create_dataset (name = 'gene_strand' , data = codeUTF8 (sp .array ([x .strand for x in genes ], dtype = 'str' )))
41- OUT .create_dataset (name = 'gene_pos' , data = sp .array ([[x .start , x .stop ] for x in genes ], dtype = 'int' ))
37+ feat .create_dataset (name = f , data = codeUTF8 (np .array (event_features [f ], dtype = 'str' )))
38+ OUT .create_dataset (name = 'gene_names' , data = codeUTF8 (np .array ([x .name for x in genes ], dtype = 'str' )))
39+ OUT .create_dataset (name = 'gene_chr' , data = codeUTF8 (np .array ([x .chr for x in genes ], dtype = 'str' )))
40+ OUT .create_dataset (name = 'gene_strand' , data = codeUTF8 (np .array ([x .strand for x in genes ], dtype = 'str' )))
41+ OUT .create_dataset (name = 'gene_pos' , data = np .array ([[x .start , x .stop ] for x in genes ], dtype = 'int' ))
4242
4343
4444def analyze_events (options , event_type , sample_idx = None ):
@@ -90,25 +90,25 @@ def analyze_events(options, event_type, sample_idx=None):
9090 events_all_strains = options .strains
9191
9292 ### handle case where we did not find any event of this type
93- if sp .sum ([x .event_type == event_type for x in events_all ]) == 0 :
93+ if np .sum ([x .event_type == event_type for x in events_all ]) == 0 :
9494 OUT = h5py .File (fn_out_count , 'w' )
9595 OUT .create_dataset (name = 'event_counts' , data = [0 ])
9696 _prepare_count_hdf5 (options , OUT , event_features , sample_idx = sample_idx )
9797 OUT .close ()
98- confirmed_idx = sp .array ([], dtype = 'int' )
98+ confirmed_idx = np .array ([], dtype = 'int' )
9999 else :
100100 if not options .pyproc :
101101 if options .merge == 'single' :
102102 (events_all , counts ) = verify_all_events (events_all , sample_idx , options .bam_fnames , event_type , options )
103103 else :
104- (events_all , counts ) = verify_all_events (events_all , sp .arange (len (options .strains )), options .bam_fnames , event_type , options )
105- verified = sp .array ([x .verified for x in events_all ], dtype = 'bool' )
104+ (events_all , counts ) = verify_all_events (events_all , np .arange (len (options .strains )), options .bam_fnames , event_type , options )
105+ verified = np .array ([x .verified for x in events_all ], dtype = 'bool' )
106106 for ev in events_all :
107107 ev .verified = []
108108
109- psi = sp .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'float' )
110- iso1 = sp .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'int32' )
111- iso2 = sp .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'int32' )
109+ psi = np .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'float' )
110+ iso1 = np .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'int32' )
111+ iso2 = np .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'int32' )
112112 for i in range (counts .shape [2 ]):
113113 (psi [:, i ], iso1 [:, i ], iso2 [:, i ]) = compute_psi (counts [:, :, i ], event_type , options )
114114
@@ -117,7 +117,7 @@ def analyze_events(options, event_type, sample_idx=None):
117117 OUT .create_dataset (name = 'psi' , data = psi , compression = 'gzip' )
118118 OUT .create_dataset (name = 'iso1' , data = iso1 , compression = 'gzip' )
119119 OUT .create_dataset (name = 'iso2' , data = iso2 , compression = 'gzip' )
120- OUT .create_dataset (name = 'gene_idx' , data = sp .array ([x .gene_idx for x in events_all ], dtype = 'int' ), compression = 'gzip' )
120+ OUT .create_dataset (name = 'gene_idx' , data = np .array ([x .gene_idx for x in events_all ], dtype = 'int' ), compression = 'gzip' )
121121 OUT .create_dataset (name = 'verified' , data = verified , compression = 'gzip' )
122122 _prepare_count_hdf5 (options , OUT , event_features , sample_idx = sample_idx )
123123 else :
@@ -126,9 +126,9 @@ def analyze_events(options, event_type, sample_idx=None):
126126 chunk_size_events = 5000
127127 chunk_size_strains = 500
128128 for i in range (0 , events_all .shape [0 ], chunk_size_events ):
129- idx_events = sp .arange (i , min (i + chunk_size_events , events_all .shape [0 ]))
129+ idx_events = np .arange (i , min (i + chunk_size_events , events_all .shape [0 ]))
130130 for j in range (0 , len (options .strains ), chunk_size_strains ):
131- idx_strains = sp .arange (j , min (j + chunk_size_strains , len (options .strains )))
131+ idx_strains = np .arange (j , min (j + chunk_size_strains , len (options .strains )))
132132 PAR ['ev' ] = events_all [idx_events ].copy ()
133133 PAR ['strain_idx' ] = idx_strains
134134 PAR ['list_bam' ] = options .bam_fnames
@@ -151,9 +151,9 @@ def analyze_events(options, event_type, sample_idx=None):
151151 print ('Collecting results from chunks ...' )
152152 OUT = h5py .File (fn_out_count , 'w' )
153153 for i in range (0 , events_all .shape [0 ], chunk_size_events ):
154- idx_events = sp .arange (i , min (i + chunk_size_events , events_all .shape [0 ]))
154+ idx_events = np .arange (i , min (i + chunk_size_events , events_all .shape [0 ]))
155155 for j in range (0 , len (options .strains ), chunk_size_strains ):
156- idx_strains = sp .arange (j , min (j + chunk_size_strains , len (options .strains )))
156+ idx_strains = np .arange (j , min (j + chunk_size_strains , len (options .strains )))
157157 print ('\r %i (%i), %i (%i)' % (i , events_all .shape [0 ], j , len (options .strains )))
158158 out_fn = '%s/event_count_chunks/%s_%i_%i_C%i.pickle' % (options .outdir , event_type , i , j , options .confidence )
159159 if not os .path .exists (out_fn ):
@@ -166,22 +166,22 @@ def analyze_events(options, event_type, sample_idx=None):
166166 verified_ = [x .verified .astype ('bool' ) for x in ev ]
167167 collect_ids_ = [x .id for x in ev ]
168168 else :
169- counts = sp .r_ [counts , counts_ ]
169+ counts = np .r_ [counts , counts_ ]
170170 for jj in range (len (ev_ )):
171- verified_ [jj ] = sp .r_ [verified_ [jj ], ev_ [jj ].verified ]
171+ verified_ [jj ] = np .r_ [verified_ [jj ], ev_ [jj ].verified ]
172172 del counts_
173173
174- psi = sp .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'float' )
175- iso1 = sp .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'int32' )
176- iso2 = sp .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'int32' )
174+ psi = np .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'float' )
175+ iso1 = np .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'int32' )
176+ iso2 = np .empty ((counts .shape [0 ], counts .shape [2 ]), dtype = 'int32' )
177177 for j in range (counts .shape [2 ]):
178178 (psi [:, j ], iso1 [:, j ], iso2 [:, j ]) = compute_psi (counts [:, :, j ], event_type , options )
179179
180180 if i == 0 :
181181 OUT .create_dataset (name = 'event_counts' , data = counts , maxshape = (len (options .strains ), len (event_features [event_type ]), None ), compression = 'gzip' )
182- OUT .create_dataset (name = 'psi' , data = sp .atleast_2d (psi ), maxshape = (psi .shape [0 ], None ), compression = 'gzip' )
183- OUT .create_dataset (name = 'iso1' , data = sp .atleast_2d (iso1 ), maxshape = (iso1 .shape [0 ], None ), compression = 'gzip' )
184- OUT .create_dataset (name = 'iso2' , data = sp .atleast_2d (iso2 ), maxshape = (iso2 .shape [0 ], None ), compression = 'gzip' )
182+ OUT .create_dataset (name = 'psi' , data = np .atleast_2d (psi ), maxshape = (psi .shape [0 ], None ), compression = 'gzip' )
183+ OUT .create_dataset (name = 'iso1' , data = np .atleast_2d (iso1 ), maxshape = (iso1 .shape [0 ], None ), compression = 'gzip' )
184+ OUT .create_dataset (name = 'iso2' , data = np .atleast_2d (iso2 ), maxshape = (iso2 .shape [0 ], None ), compression = 'gzip' )
185185 else :
186186 tmp = OUT ['event_counts' ].shape
187187 OUT ['event_counts' ].resize ((tmp [0 ], tmp [1 ], tmp [2 ] + len (ev )))
@@ -197,33 +197,33 @@ def analyze_events(options, event_type, sample_idx=None):
197197 OUT ['iso2' ][:, tmp [1 ]:] = iso2
198198 verified .extend (verified_ )
199199 collect_ids .extend (collect_ids_ )
200- gene_idx_ = sp .r_ [gene_idx_ , [x .gene_idx for x in ev ]]
200+ gene_idx_ = np .r_ [gene_idx_ , [x .gene_idx for x in ev ]]
201201 del iso1 , iso2 , psi , counts , ev , ev_
202202
203- verified = sp .array (verified , dtype = 'bool' )
203+ verified = np .array (verified , dtype = 'bool' )
204204
205205 assert (events_all .shape [0 ] == verified .shape [0 ])
206- assert (sp .all ([events_all [e ].id for e in range (events_all .shape [0 ])] == collect_ids ))
206+ assert (np .all ([events_all [e ].id for e in range (events_all .shape [0 ])] == collect_ids ))
207207
208208 OUT .create_dataset (name = 'verified' , data = verified , dtype = 'bool' , compression = 'gzip' )
209209 OUT .create_dataset (name = 'gene_idx' , data = gene_idx_ )
210210 _prepare_count_hdf5 (options , OUT , event_features , sample_idx = sample_idx )
211211
212212 ### write more event infos to hdf5
213213 if event_type == 'exon_skip' :
214- event_pos = sp .array ([x .exons2 .ravel () for x in events_all ])
214+ event_pos = np .array ([x .exons2 .ravel () for x in events_all ])
215215 elif event_type == 'intron_retention' :
216- event_pos = sp .array ([x .exons1 .ravel () for x in events_all ])
216+ event_pos = np .array ([x .exons1 .ravel () for x in events_all ])
217217 elif event_type in ['alt_3prime' , 'alt_5prime' ]:
218- event_pos = sp .array ([unique_rows (sp .c_ [x .exons1 , x .exons2 ]).ravel () for x in events_all ])
218+ event_pos = np .array ([unique_rows (np .c_ [x .exons1 , x .exons2 ]).ravel () for x in events_all ])
219219 elif event_type == 'mult_exon_skip' :
220- event_pos = sp .array ([x .exons2 [[0 , 1 , - 2 , - 1 ], :].ravel () for x in events_all ])
220+ event_pos = np .array ([x .exons2 [[0 , 1 , - 2 , - 1 ], :].ravel () for x in events_all ])
221221 elif event_type == 'mutex_exons' :
222- event_pos = sp .array ([sp .c_ [x .exons1 [0 , :], x .exons1 [1 , :], x .exons2 [1 , :], x .exons2 [2 , :]] for x in events_all ])
222+ event_pos = np .array ([np .c_ [x .exons1 [0 , :], x .exons1 [1 , :], x .exons2 [1 , :], x .exons2 [2 , :]] for x in events_all ])
223223
224224 OUT .create_dataset (name = 'event_pos' , data = event_pos )
225225
226- num_verified = sp .sum (verified , axis = 1 )
226+ num_verified = np .sum (verified , axis = 1 )
227227 confirmed = num_verified .min (axis = 1 )
228228 OUT .create_dataset (name = 'num_verified' , data = num_verified )
229229 OUT .create_dataset (name = 'confirmed' , data = confirmed )
@@ -232,7 +232,7 @@ def analyze_events(options, event_type, sample_idx=None):
232232 #for min_verified = 1:length(options.strains),
233233 # verified_count(min_verified) = sum([events_all.confirmed] >= min_verified) ;
234234
235- confirmed_idx = sp .where (confirmed >= 1 )[0 ]
235+ confirmed_idx = np .where (confirmed >= 1 )[0 ]
236236 if confirmed_idx .shape [0 ] > 0 :
237237 OUT .create_dataset (name = 'conf_idx' , data = confirmed_idx )
238238
@@ -275,7 +275,7 @@ def analyze_events(options, event_type, sample_idx=None):
275275 if isinstance (sample_idx , int ):
276276 sample_idx = [sample_idx ]
277277 elif sample_idx is None :
278- sample_idx = sp .arange (options .strains .shape [0 ])
278+ sample_idx = np .arange (options .strains .shape [0 ])
279279
280280 if options .output_gff3 :
281281 if os .path .exists (fn_out_gff3 ):
@@ -331,13 +331,13 @@ def analyze_events(options, event_type, sample_idx=None):
331331 print ('%s already exists' % fn_out_conf_txt )
332332 else :
333333 print ('\n Writing filtered events (sample freq 0.05):' )
334- cf_idx = sp .where ([x .confirmed for x in events_all [confirmed_idx ]] >= (0.05 * options .strains .shape [0 ]))[0 ]
334+ cf_idx = np .where ([x .confirmed for x in events_all [confirmed_idx ]] >= (0.05 * options .strains .shape [0 ]))[0 ]
335335 write_events_txt (fn_out_conf_txt , options .strains [sample_idx ], events_all , fn_out_count , event_idx = confirmed_idx [cf_idx ])
336336
337337 fn_out_conf_txt = fn_out_conf .replace ('.pickle' , '.filt0.1.txt' )
338338 if os .path .exists (fn_out_conf_txt ):
339339 print ('%s already exists' % fn_out_conf_txt )
340340 else :
341341 print ('\n Writing filtered events (sample freq 0.01):' )
342- cf_idx = sp .where ([x .confirmed for x in events_all [confirmed_idx ]] >= (0.01 * options .strains .shape [0 ]))[0 ]
342+ cf_idx = np .where ([x .confirmed for x in events_all [confirmed_idx ]] >= (0.01 * options .strains .shape [0 ]))[0 ]
343343 write_events_txt (fn_out_conf_txt , options .strains [sample_idx ], events_all , fn_out_count , event_idx = confirmed_idx [cf_idx ])
0 commit comments