 @author: philipp
 """
 
-# Perform alignment, count & normalize reads
+# Perform Bowtie2 alignment
 # =======================================================================
 # Imports
 from __future__ import division  # floating point division by default
 import glob
 import time
 import sys
-from collections import Counter
-from joblib import Parallel, delayed
-import multiprocessing
-from mpl_toolkits.mplot3d import axes3d
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-import numpy
-import pysam
-from matplotlib.ticker import FuncFormatter
 
-def millions(x, pos):
-    return '%1.1fM' % (x*1e-6)
 
-def millions2(x, pos):
-    return '%1.2fM' % (x*1e-6)
-
-def CountReadsPerGene(g):
-    global GeneList
-    global geneIDs
-    global ReadsPerGuide
-    gene = GeneList[g]
-    geneIndex = [i for i, x in enumerate(geneIDs) if x == gene]
-    sgCounts = [ReadsPerGuide[i] for i in geneIndex]
-    g_counts = sum(sgCounts)
-    return g_counts
-
-def CountReadsPerGeneX(g):
-    gene = GeneList[g]
-    I = geneIDs.index(gene)
-    j = I
-    g_counts = 0
-    terminate = False
-    while geneIDs[j] == gene and terminate == False:
-        g_counts = g_counts + ReadsPerGuide[j]
-        if j <= L - 2:
-            j += 1
-        else:
-            terminate = True
-    return g_counts
-
-
-def MapAndCount(sample):
+def ReadAlignment(sample):
     # ------------------------------------------------
     # Print header
     # ------------------------------------------------
@@ -75,36 +35,21 @@ def MapAndCount(sample):
     ScriptsDir = config['ScriptsDir']
     WorkingDir = config['WorkingDir']
     TempDataDir = config['TempDataDir']
-    AnalysisDir = config['AnalysisDir']
-    CutAdaptDir = config['CutAdaptDir']
     bw2Dir = config['bw2Dir']
     IndexDir = config['IndexDir']
     LibDir = config['LibDir']
     AlnStemDir = config['AlignDir']
     AlnDir = AlnStemDir + sample + '/'
-    OutputDir = config['AlnQCDir'] + sample
-    minN = config['Cutoff']
     LibFilename = config['LibFilename']
     LibFormat = LibFilename[-3:]
     if LibFormat == 'tsv':
         libsep = '\t'
     elif LibFormat == 'csv':
         libsep = ','
-    Theta = config['Theta']
-    AS_min = config['AS_min']
     L_bw = config['L_bw']
     N_bw = config['N_bw']
     i_bw = config['i_bw']
-    N0 = 1000000
-    res = config['dpi']
-    svg = config['svg']
-    AlnOutput = config['AlnOutput']
-    keepCutReads = config['keepCutReads']
-    AlnFileSuffix = '_bw2Aln.txt'
-    GuideCount_Suffix = '_GuideCounts.txt'
-    GeneCount_Suffix = '_GeneCounts.txt'
-    cutadaptLog = sample + '_cutadapt_log.txt'
-    logfilename = sample + '_AlignmentResults.txt'
+
 
     # ------------------------------------------------
     # Read library
@@ -152,278 +97,6 @@ def MapAndCount(sample):
         print('Time elapsed (Alignment) [hours]: ' + '%.3f' % time_elapsed)
 
 
-    # ------------------------------------------
-    # Extract and analyze alignments
-    # ------------------------------------------
-    start = time.time()
-    print('Analyzing alignment ...')
-    print('Applying matching threshold ...')
-    print('Applying ambiguity threshold ...')
-    # CLASSIFY ALIGNMENTS
-    os.chdir(AlnDir)
-    bw2outputFilename = ReadsFilename0 + '_bw2output.sam'
-    bw2sam = pysam.AlignmentFile(bw2outputFilename, 'rb')
-    NFail = 0; NUnique = 0; NTol = 0; NAmb = 0
-    mapQ = list()
-    primScore = list()
-    secScore = list()
-    AlnStatus = list()
-    sgRNA_Hitlist = list()
-    for read in bw2sam.fetch():
-        mapQ.append(read.mapping_quality)
-        # read with primary and secondary alignment
-        if read.has_tag('AS'):
-            AS = read.get_tag('AS')
-            if AS >= AS_min:
-                if read.has_tag('XS'):
-                    XS = read.get_tag('XS')
-                    primScore.append(AS)
-                    secScore.append(XS)
-                    if XS <= AS - Theta:
-                        NTol += 1
-                        AlnStatus.append('Tolerate')
-                        sgRNA_Hitlist.append(read.reference_name)
-                    else:
-                        NAmb += 1
-                        AlnStatus.append('Ambiguous')
-                # read with only primary alignment
-                else:
-                    primScore.append(AS)
-                    secScore.append(0)
-                    NUnique += 1
-                    AlnStatus.append('Unique')
-                    sgRNA_Hitlist.append(read.reference_name)
-            else:
-                # read with insufficient primary score
-                primScore.append(AS)
-                if read.has_tag('XS'):
-                    XS = read.get_tag('XS')
-                    secScore.append(XS)
-                else:
-                    secScore.append(0)
-                NFail += 1
-                AlnStatus.append('Fail')
-        # read with failed alignment
-        else:
-            primScore.append(0)
-            secScore.append(0)
-            NFail += 1
-            AlnStatus.append('Fail')
-    bw2sam.close()
-    NReads = NTol + NAmb + NUnique + NFail
-    FracUnique = round(NUnique/NReads*1000)/10
-    FracTol = round(NTol/NReads*1000)/10
-    FracAmb = round(NAmb/NReads*1000)/10
-    FracFail = round(NFail/NReads*1000)/10
-    print('*** Successfully mapped reads: ' \
-          + str(NUnique+NTol) + ' (' + str(FracUnique+FracTol) + '%) ***')
-
-
-    # ------------------------------------------
-    # Text output and plots
-    # ------------------------------------------
-    print('Writing alignment logfile ...')
-    if aln_time < 60:
-        time_elapsed = aln_time
-        time_unit = ' [secs]'
-    elif aln_time < 3600:
-        time_elapsed = aln_time/60
-        time_unit = ' [mins]'
-    else:
-        time_elapsed = aln_time/3600
-        time_unit = ' [hours]'
-    if not os.path.exists(OutputDir):
-        os.makedirs(OutputDir)
-    os.chdir(OutputDir)
-    LogFile = open(logfilename, 'w')
-    LogFile.write(sample + ' Alignment Results\n')
-    LogFile.write('**************************************\n')
-    LogFile.write('\n')
-    LogFile.write('Total Number of Reads: \t' + str(NReads) + '\n')
-    LogFile.write('\n')
-    LogFile.write('Number of Reads with unique Alignments: \t' + str(NUnique) + ' (' + str(FracUnique) + '%)\n')
-    LogFile.write('Number of Reads above Ambiguity Tolerance: \t' + str(NTol) + ' (' + str(FracTol) + '%)\n')
-    LogFile.write('---------------------------------------------------------------\n')
-    LogFile.write('Total Number of Reads matched: \t\t\t' + str(NUnique+NTol) + ' (' + str(FracUnique+FracTol) + '%)\n')
-    LogFile.write('\n\n')
-    LogFile.write('Number of Reads below Ambiguity Tolerance: \t' + str(NAmb) + ' (' + str(FracAmb) + '%)\n')
-    LogFile.write('Number of Reads with failed Alignment: \t\t' + str(NFail) + ' (' + str(FracFail) + '%)\n')
-    LogFile.write('---------------------------------------------------------------\n')
-    LogFile.write('Total Number of Reads discarded: \t\t' + str(NFail+NAmb) + ' (' + str(FracFail+FracAmb) + '%)\n')
-    LogFile.write('\n\n\n')
-    LogFile.write('---- Alignment completed in %.2f' % time_elapsed + time_unit + ' ----')
-    LogFile.write('\n\n\n')
-    LogFile.write('Parameter settings\n')
-    LogFile.write('L_bw (Seed Length): ' + str(L_bw) + '\n')
-    LogFile.write('N_bw (Seed Mismatch): ' + str(N_bw) + '\n')
-    LogFile.write('i_bw (Interval Function): ' + str(i_bw) + '\n')
-    LogFile.write('Theta (Ambiguity Threshold): ' + str(Theta) + '\n')
-    LogFile.write('AS_min (Matching Threshold): ' + str(AS_min) + '\n')
-    LogFile.close()
-    # DRAW PLOTS
-    # MAPPING QUALITY HISTOGRAM
-    os.chdir(OutputDir)
-    print('Plotting mapping quality ...')
-    maxQuality = max(mapQ)
-    fig, ax = plt.subplots(figsize=(5, 3.5))
-    plt.hist(mapQ, bins=range(maxQuality+1), width=1, align='left')
-    plt.xlim([-1, maxQuality+1])
-    plt.title('Read Alignment Rates', fontsize=14)
-    plt.xlabel('Mapping Quality', fontsize=14)
-    plt.ylabel('Number of Reads', fontsize=14)
-    plt.tick_params(labelsize=14)
-    formatter = FuncFormatter(millions2)
-    ax.yaxis.set_major_formatter(formatter)
-    plt.tight_layout()
-    plt.savefig(sample + '_MappingQuality.png', dpi=res)
-    if svg:
-        plt.savefig(sample + '_MappingQuality.svg')
-    # ALIGNMENT SCORE BARPLOT
-    print('Plotting alignment scores ...')
-    primScoreKeep = [primScore[k] for k in range(NReads) if AlnStatus[k] in ['Unique', 'Tolerate']]
-    secScoreKeep = [secScore[k] for k in range(NReads) if AlnStatus[k] in ['Unique', 'Tolerate']]
-    primScoreToss = [primScore[k] for k in range(NReads) if AlnStatus[k] in ['Fail', 'Ambiguous']]
-    secScoreToss = [secScore[k] for k in range(NReads) if AlnStatus[k] in ['Fail', 'Ambiguous']]
-    # Plot bars (matched reads)
-    xedges = range(42)
-    yedges = range(42)
-    H, xedges, yedges = numpy.histogram2d(secScoreKeep, primScoreKeep, bins=(xedges, yedges))
-    Y, X = numpy.nonzero(H)
-    Z = H[Y, X]
-    Z_off = numpy.zeros(len(Y))
-    dX = numpy.ones(len(X))
-    dY = numpy.ones(len(Y))
-    # Plot bars (discarded reads)
-    h, xedges, yedges = numpy.histogram2d(secScoreToss, primScoreToss, bins=(xedges, yedges))
-    y, x = numpy.nonzero(h)
-    z = h[y, x]
-    z_off = numpy.zeros(len(y))
-    dx = numpy.ones(len(x))
-    dy = numpy.ones(len(y))
-    # Show plot
-    fig = plt.figure(figsize=(6, 5))
-    ax = fig.gca(projection='3d')
-    ax.bar3d(X, Y, Z_off, dX, dY, Z, color=(97/255, 252/255, 80/255))  # bar3d has a bug with hex code
-    green_proxy = plt.Rectangle((0, 0), 1, 1, fc=(97/255, 252/255, 80/255))
-    ax.bar3d(x, y, z_off, dx, dy, z, color=(255/255, 51/255, 51/255))
-    red_proxy = plt.Rectangle((0, 0), 1, 1, fc=(255/255, 51/255, 51/255))
-    ax.legend([green_proxy, red_proxy], ['Reads Accepted', 'Reads Discarded'], loc='upper left', prop={'size': 10})
-    ax.set_title('Alignment Analysis', fontsize=14)
-    ax.set_xlabel('Prim. Alignment Score', fontsize=12)
-    ax.set_ylabel('Sec. Alignment Score', fontsize=12)
-    formatter = FuncFormatter(millions)
-    ax.zaxis.set_major_formatter(formatter)
-    ax.set_zlabel('Number of Reads', fontsize=12)
-    ax.set_xticks([0, 10, 20, 30, 40]); ax.set_yticks([0, 10, 20, 30, 40])
-    plt.tight_layout()
-    plt.savefig(sample + '_AlignmentScores.png', dpi=res)
-    if svg:
-        plt.savefig(sample + '_AlignmentScores.svg', dpi=res)
-    print('Alignment analysis completed.')
-    end = time.time()
-    # Time stamp
-    sec_elapsed = end - start
-    if sec_elapsed < 60:
-        time_elapsed = sec_elapsed
-        print('Time elapsed (Alignment analysis) [secs]: ' + '%.3f' % time_elapsed)
-    elif sec_elapsed < 3600:
-        time_elapsed = sec_elapsed/60
-        print('Time elapsed (Alignment analysis) [mins]: ' + '%.3f' % time_elapsed)
-    else:
-        time_elapsed = sec_elapsed/3600
-        print('Time elapsed (Alignment analysis) [hours]: ' + '%.3f' % time_elapsed)
-
-
-    # --------------------------------------
-    # Get read counts
-    # --------------------------------------
-    start = time.time()
-    print('Read counting in progress ...')
-    # Read counts per sgRNA
-    print('Counting reads per sgRNA ...')
-    ReadCounts = Counter()
-    for sgRNA in sgRNA_Hitlist:
-        ReadCounts[sgRNA] += 1
-    global ReadsPerGuide
-    ReadsPerGuide = list()
-    for sgRNA in sgIDs:
-        ReadsPerGuide.append(ReadCounts[sgRNA])
-    # Apply read count cut-off
-    ReadSel = [ReadsPerGuide[k] >= NReads/N0*minN for k in range(L)]
-    ReadsPerGuide = [ReadSel[k]*ReadsPerGuide[k] for k in range(L)]
-    # Write read counts
-    os.chdir(OutputDir)
-    GuideCountsFilename = sample + GuideCount_Suffix
-    GuideCounts = open(GuideCountsFilename, 'w')
-    for k in range(L):
-        GuideCounts.write(str(sgIDs[k]) + '\t' + str(geneIDs[k]) + '\t' + str(ReadsPerGuide[k]) + '\n')
-    GuideCounts.close()
-    # Read counts per gene in library
-    print('Counting reads per gene ...')
-    global GeneList
-    GeneList = list(set(geneIDs))
-    G = len(GeneList)
-    num_cores = multiprocessing.cpu_count()
-    ReadsPerGene = Parallel(n_jobs=num_cores)(delayed(CountReadsPerGeneX)(g) for g in range(G))
-    GeneCountsFilename = sample + GeneCount_Suffix
-    GeneCounts = open(GeneCountsFilename, 'w')
-    for g in range(G):
-        GeneCounts.write(str(GeneList[g]) + '\t' + str(ReadsPerGene[g]) + '\n')
-    GeneCounts.close()
-    end = time.time()
-    print('Read counting completed.')
-    # No-mapping warning
-    if sum(ReadsPerGuide) == 0:
-        print('### ERROR: Zero read counts! Check library and alignment ###')
-    # Time stamp
-    sec_elapsed = end - start
-    if sec_elapsed < 60:
-        time_elapsed = sec_elapsed
-        print('Time elapsed (Read Counting) [secs]: ' + '%.3f' % time_elapsed)
-    elif sec_elapsed < 3600:
-        time_elapsed = sec_elapsed/60
-        print('Time elapsed (Read Counting) [mins]: ' + '%.3f' % time_elapsed)
-    else:
-        time_elapsed = sec_elapsed/3600
-        print('Time elapsed (Read Counting) [hours]: ' + '%.3f' % time_elapsed)
-
-
-    # --------------------------------------
-    # Cleaning up...
-    # --------------------------------------
-    start = time.time()
-    # clipped reads file
-    if not keepCutReads:
-        os.chdir(TempDataDir)
-        os.system('rm ' + ReadsFilename0)
-    # alignment output
-    if AlnOutput == 'Compress':
-        print('Compressing raw alignment output...')
-        os.chdir(AlnDir)
-        # converting SAM to BAM
-        SAM_output = glob.glob('*bw2output.sam')[0]
-        BAM_output = SAM_output[:-3] + 'bam'
-        os.system('samtools view -buSH ' + SAM_output + ' > ' + BAM_output)
-        os.system('rm ' + SAM_output)
-    elif AlnOutput == 'Delete':
-        print('Removing raw alignment output...')
-        os.chdir(AlnDir)
-        os.system('rm *')
-    elif AlnOutput == 'Keep':
-        print('Keeping raw alignment output ...')
-    end = time.time()
-    # Time stamp
-    sec_elapsed = end - start
-    if sec_elapsed < 60:
-        time_elapsed = sec_elapsed
-        print('Time elapsed (Clean-up) [secs]: ' + '%.3f' % time_elapsed)
-    elif sec_elapsed < 3600:
-        time_elapsed = sec_elapsed/60
-        print('Time elapsed (Clean-up) [mins]: ' + '%.3f' % time_elapsed)
-    else:
-        time_elapsed = sec_elapsed/3600
-        print('Time elapsed (Clean-up) [hours]: ' + '%.3f' % time_elapsed)
-
     # --------------------------------------
     # Final time stamp
     # --------------------------------------
@@ -445,4 +118,4 @@ def MapAndCount(sample):
 
 if __name__ == "__main__":
     input1 = sys.argv[1]
-    MapAndCount(input1)
+    ReadAlignment(input1)
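
Note: after this commit, `ReadAlignment()` only wraps the Bowtie2 step; the counting, plotting and clean-up code removed above presumably moves to separate modules. The retained config keys (`bw2Dir`, `IndexDir`, `L_bw`, `N_bw`, `i_bw`) feed the Bowtie2 call, which lives in an unchanged hunk that this diff does not show. Below is a minimal sketch of what that invocation could look like; the index basename and the helper name `run_bowtie2` are assumptions, while the SAM naming (`ReadsFilename0 + '_bw2output.sam'`) is taken from the script itself.

```python
import os

def run_bowtie2(bw2Dir, IndexDir, AlnDir, ReadsFilename0, L_bw, N_bw, i_bw):
    """Hypothetical sketch of the Bowtie2 call wrapped by ReadAlignment().
    The index basename 'Library' is an assumption; the flags map to the
    parameters logged by the pipeline (seed length, seed mismatches,
    seed interval function)."""
    SAM_output = os.path.join(AlnDir, ReadsFilename0 + '_bw2output.sam')
    cmd = (bw2Dir + 'bowtie2'
           + ' -x ' + os.path.join(IndexDir, 'Library')  # assumed index basename
           + ' -U ' + ReadsFilename0                     # unpaired (trimmed) reads
           + ' -S ' + SAM_output                         # SAM consumed downstream
           + ' -L ' + str(L_bw)                          # seed substring length
           + ' -N ' + str(N_bw)                          # mismatches allowed in seed
           + ' -i ' + str(i_bw))                         # interval function, e.g. 'S,1,0.75'
    os.system(cmd)
    return SAM_output
```

Bowtie2's `-L`, `-N` and `-i` options do carry these meanings; the paths, index basename and return value in the sketch are illustrative only.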