Skip to content

Commit eb3a937

Browse files
committed
v2.9
1 parent ccfee65 commit eb3a937

30 files changed

+2931
-1249
lines changed

Scripts/AlignReads.py

Lines changed: 4 additions & 331 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
@author: philipp
77
"""
88

9-
# Perform alignment, count & normalize reads
9+
# Perform Bowtie2 alignment
1010
# =======================================================================
1111
# Imports
1212
from __future__ import division # floating point division by default
@@ -17,49 +17,9 @@
1717
import glob
1818
import time
1919
import sys
20-
from collections import Counter
21-
from joblib import Parallel, delayed
22-
import multiprocessing
23-
from mpl_toolkits.mplot3d import axes3d
24-
import matplotlib
25-
matplotlib.use('Agg')
26-
import matplotlib.pyplot as plt
27-
import numpy
28-
import pysam
29-
from matplotlib.ticker import FuncFormatter
3020

31-
def millions(x, pos):
32-
return '%1.1fM' % (x*1e-6)
3321

34-
def millions2(x, pos):
35-
return '%1.2fM' % (x*1e-6)
36-
37-
def CountReadsPerGene(g):
38-
global GeneList
39-
global geneIDs
40-
global ReadsPerGuide
41-
gene = GeneList[g]
42-
geneIndex = [i for i,x in enumerate(geneIDs) if x==gene]
43-
sgCounts = [ReadsPerGuide[i] for i in geneIndex]
44-
g_counts = sum(sgCounts)
45-
return g_counts
46-
47-
def CountReadsPerGeneX(g):
48-
gene = GeneList[g]
49-
I = geneIDs.index(gene)
50-
j = I
51-
g_counts = 0
52-
terminate = False
53-
while geneIDs[j] == gene and terminate == False:
54-
g_counts = g_counts + ReadsPerGuide[j]
55-
if j <= L-2:
56-
j+=1
57-
else:
58-
terminate = True
59-
return g_counts
60-
61-
62-
def MapAndCount(sample):
22+
def ReadAlignment(sample):
6323
# ------------------------------------------------
6424
# Print header
6525
# ------------------------------------------------
@@ -75,36 +35,21 @@ def MapAndCount(sample):
7535
ScriptsDir = config['ScriptsDir']
7636
WorkingDir = config['WorkingDir']
7737
TempDataDir = config['TempDataDir']
78-
AnalysisDir = config['AnalysisDir']
79-
CutAdaptDir = config['CutAdaptDir']
8038
bw2Dir = config['bw2Dir']
8139
IndexDir = config['IndexDir']
8240
LibDir = config['LibDir']
8341
AlnStemDir = config['AlignDir']
8442
AlnDir = AlnStemDir+sample+'/'
85-
OutputDir = config['AlnQCDir']+sample
86-
minN = config['Cutoff']
8743
LibFilename = config['LibFilename']
8844
LibFormat = LibFilename[-3:]
8945
if LibFormat == 'tsv':
9046
libsep = '\t'
9147
elif LibFormat == 'csv':
9248
libsep = ','
93-
Theta = config['Theta']
94-
AS_min = config['AS_min']
9549
L_bw = config['L_bw']
9650
N_bw = config['N_bw']
9751
i_bw = config['i_bw']
98-
N0 = 1000000
99-
res = config['dpi']
100-
svg = config['svg']
101-
AlnOutput = config['AlnOutput']
102-
keepCutReads = config['keepCutReads']
103-
AlnFileSuffix = '_bw2Aln.txt'
104-
GuideCount_Suffix = '_GuideCounts.txt'
105-
GeneCount_Suffix = '_GeneCounts.txt'
106-
cutadaptLog = sample+'_cutadapt_log.txt'
107-
logfilename = sample+'_AlignmentResults.txt'
52+
10853

10954
# ------------------------------------------------
11055
# Read library
@@ -152,278 +97,6 @@ def MapAndCount(sample):
15297
print('Time elapsed (Alignment) [hours]: ' + '%.3f' % time_elapsed)
15398

15499

155-
# ------------------------------------------
156-
# Extract and analyze alignments
157-
# ------------------------------------------
158-
start = time.time()
159-
print('Analyzing alignment ...')
160-
print('Applying matching threshold ...')
161-
print('Applying ambiguity threshold ...')
162-
# CLASSIFY ALIGNMENTS
163-
os.chdir(AlnDir)
164-
bw2outputFilename = ReadsFilename0 + '_bw2output.sam'
165-
bw2sam = pysam.AlignmentFile(bw2outputFilename,'rb')
166-
NFail = 0; NUnique = 0; NTol = 0; NAmb = 0
167-
mapQ = list()
168-
primScore = list()
169-
secScore = list()
170-
AlnStatus = list()
171-
sgRNA_Hitlist = list()
172-
for read in bw2sam.fetch():
173-
mapQ.append(read.mapping_quality)
174-
# read with primary and seconday alignment
175-
if read.has_tag('AS'):
176-
AS = read.get_tag('AS')
177-
if AS >= AS_min:
178-
if read.has_tag('XS'):
179-
XS = read.get_tag('XS')
180-
primScore.append(AS)
181-
secScore.append(XS)
182-
if XS <= AS - Theta:
183-
NTol += 1
184-
AlnStatus.append('Tolerate')
185-
sgRNA_Hitlist.append(read.reference_name)
186-
else:
187-
NAmb += 1
188-
AlnStatus.append('Ambiguous')
189-
# read with only primary alignment
190-
else:
191-
primScore.append(AS)
192-
secScore.append(0)
193-
NUnique += 1
194-
AlnStatus.append('Unique')
195-
sgRNA_Hitlist.append(read.reference_name)
196-
else:
197-
# read with insufficient primary score
198-
primScore.append(AS)
199-
if read.has_tag('XS'):
200-
XS = read.get_tag('XS')
201-
secScore.append(XS)
202-
else:
203-
secScore.append(0)
204-
NFail += 1
205-
AlnStatus.append('Fail')
206-
# read with failed alignment
207-
else:
208-
primScore.append(0)
209-
secScore.append(0)
210-
NFail += 1
211-
AlnStatus.append('Fail')
212-
bw2sam.close();
213-
NReads = NTol + NAmb + NUnique + NFail
214-
FracUnique = round(NUnique/NReads*1000)/10
215-
FracTol = round(NTol/NReads*1000)/10
216-
FracAmb = round(NAmb/NReads*1000)/10
217-
FracFail = round(NFail/NReads*1000)/10
218-
print('*** Successfully mapped reads: '\
219-
+str(NUnique+NTol)+' ('+str(FracUnique+FracTol)+'%) ***')
220-
221-
222-
# ------------------------------------------
223-
# Text output and plots
224-
# ------------------------------------------
225-
print('Writing alignment logfile ...')
226-
if aln_time < 60:
227-
time_elapsed = aln_time
228-
time_unit = ' [secs]'
229-
elif aln_time < 3600:
230-
time_elapsed = aln_time/60
231-
time_unit = ' [mins]'
232-
else:
233-
aln_time = aln_time/3600
234-
time_unit = ' [hours]'
235-
if not os.path.exists(OutputDir):
236-
os.makedirs(OutputDir)
237-
os.chdir(OutputDir)
238-
LogFile = open(logfilename,'w')
239-
LogFile.write(sample+' Alignment Results\n')
240-
LogFile.write('**************************************\n')
241-
LogFile.write('\n')
242-
LogFile.write('Total Number of Reads: \t'+str(NReads)+'\n')
243-
LogFile.write('\n')
244-
LogFile.write('Number of Reads with unique Alignments: \t'+str(NUnique)+' ('+str(FracUnique)+'%)\n')
245-
LogFile.write('Number of Reads above Ambiguity Tolerance: \t'+str(NTol)+' ('+str(FracTol)+'%)\n')
246-
LogFile.write('---------------------------------------------------------------\n')
247-
LogFile.write('Total Number of Reads matched: \t\t\t'+str(NUnique+NTol)+' ('+str(FracUnique+FracTol)+'%)\n')
248-
LogFile.write('\n\n')
249-
LogFile.write('Number of Reads below Ambiguity Tolerance: \t'+str(NAmb)+' ('+str(FracAmb)+'%)\n')
250-
LogFile.write('Number of Reads with failed Alignment: \t\t'+str(NFail)+' ('+str(FracFail)+'%)\n')
251-
LogFile.write('---------------------------------------------------------------\n')
252-
LogFile.write('Total Number of Reads discarded: \t\t'+str(NFail+NAmb)+' ('+str(FracFail+FracAmb)+'%)\n')
253-
LogFile.write('\n\n\n')
254-
LogFile.write('---- Alignment completed in %.2f' % time_elapsed + time_unit+' ----')
255-
LogFile.write('\n\n\n')
256-
LogFile.write('Parameter settings\n')
257-
LogFile.write('L_bw (Seed Length): '+str(L_bw)+'\n')
258-
LogFile.write('N_bw (Seed Mismatch): '+str(N_bw)+'\n')
259-
LogFile.write('i_bw (Interval Function): '+str(i_bw)+'\n')
260-
LogFile.write('Theta (Ambiguity Threshold): '+str(Theta)+'\n')
261-
LogFile.write('AS_min (Matching Threshold): '+str(AS_min)+'\n')
262-
LogFile.close()
263-
# DRAW PLOTS
264-
# MAPPING QUALITY HISTOGRAM
265-
os.chdir(OutputDir)
266-
print('Plotting mapping quality ...')
267-
maxQuality = max(mapQ)
268-
fig, ax = plt.subplots(figsize=(5,3.5))
269-
plt.hist(mapQ, bins = range(maxQuality+1), width = 1, align = 'left')
270-
plt.xlim([-1,maxQuality+1])
271-
plt.title('Read Alignment Rates', fontsize=14)
272-
plt.xlabel('Mapping Quality', fontsize=14)
273-
plt.ylabel('Number of Reads', fontsize=14)
274-
plt.tick_params(labelsize=14)
275-
formatter = FuncFormatter(millions2)
276-
ax.yaxis.set_major_formatter(formatter)
277-
plt.tight_layout()
278-
plt.savefig(sample+'_MappingQuality.png',dpi=res)
279-
if svg:
280-
plt.savefig(sample+'_MappingQuality.svg')
281-
# ALIGNMENT SCORE BARPLOT
282-
print('Plotting alignment scores ...')
283-
primScoreKeep = [primScore[k] for k in range(NReads) if AlnStatus[k] in ['Unique','Tolerate']]
284-
secScoreKeep = [secScore[k] for k in range(NReads) if AlnStatus[k] in ['Unique','Tolerate']]
285-
primScoreToss = [primScore[k] for k in range(NReads) if AlnStatus[k] in ['Fail','Ambiguous']]
286-
secScoreToss = [secScore[k] for k in range(NReads) if AlnStatus[k] in ['Fail','Ambiguous']]
287-
# Plot bars (matched reads)
288-
xedges = range(42)
289-
yedges = range(42)
290-
H, xedges, yedges = numpy.histogram2d(secScoreKeep, primScoreKeep, bins=(xedges, yedges))
291-
Y, X = numpy.nonzero(H)
292-
Z = H[Y,X]
293-
Z_off = numpy.zeros(len(Y))
294-
dX = numpy.ones(len(X))
295-
dY = numpy.ones(len(Y))
296-
# Plot bars (discarded reads)
297-
h, xedges, yedges = numpy.histogram2d(secScoreToss, primScoreToss, bins=(xedges, yedges))
298-
y, x = numpy.nonzero(h)
299-
z = h[y,x]
300-
z_off = numpy.zeros(len(y))
301-
dx = numpy.ones(len(x))
302-
dy = numpy.ones(len(y))
303-
# Show plot
304-
fig = plt.figure(figsize=(6,5))
305-
ax = fig.gca(projection='3d')
306-
ax.bar3d(X,Y,Z_off,dX,dY,Z, color=(97/255, 252/255, 80/255)) # bar3d has a bug with hex code
307-
green_proxy = plt.Rectangle((0, 0), 1, 1, fc=(97/255, 252/255, 80/255))
308-
ax.bar3d(x,y,z_off,dx,dy,z,color=(255/255, 51/255, 51/255))
309-
red_proxy = plt.Rectangle((0, 0), 1, 1, fc=(255/255, 51/255, 51/255))
310-
ax.legend([green_proxy,red_proxy],['Reads Accepted','Reads Discarded'],loc='upper left',prop={'size':10})
311-
ax.set_title('Alignment Analysis',fontsize=14)
312-
ax.set_xlabel('Prim. Alignment Score', fontsize=12)
313-
ax.set_ylabel('Sec. Alignment Score', fontsize=12)
314-
formatter = FuncFormatter(millions)
315-
ax.zaxis.set_major_formatter(formatter)
316-
ax.set_zlabel('Number of Reads',fontsize=12)
317-
ax.set_xticks([0,10,20,30,40]); ax.set_yticks([0,10,20,30,40]);
318-
plt.tight_layout()
319-
plt.savefig(sample+'_AlignmentScores.png',dpi=res)
320-
if svg:
321-
plt.savefig(sample+'_AlignmentScores.svg',dpi=res)
322-
print('Alignment analysis completed.')
323-
end = time.time()
324-
# Time stamp
325-
sec_elapsed = end-start
326-
if sec_elapsed < 60:
327-
time_elapsed = sec_elapsed
328-
print('Time elapsed (Alignment analysis) [secs]: ' + '%.3f' % time_elapsed)
329-
elif sec_elapsed < 3600:
330-
time_elapsed = sec_elapsed/60
331-
print('Time elapsed (Alignment analysis) [mins]: ' + '%.3f' % time_elapsed)
332-
else:
333-
time_elapsed = sec_elapsed/3600
334-
print('Time elapsed (Alignment analysis) [hours]: ' + '%.3f' % time_elapsed)
335-
336-
337-
# --------------------------------------
338-
# Get read counts
339-
# --------------------------------------
340-
start = time.time()
341-
print('Read counting in progress ...')
342-
# Read counts per sgRNA
343-
print('Counting reads per sgRNA ...')
344-
ReadCounts = Counter()
345-
for sgRNA in sgRNA_Hitlist:
346-
ReadCounts[sgRNA] += 1
347-
global ReadsPerGuide
348-
ReadsPerGuide = list()
349-
for sgRNA in sgIDs:
350-
ReadsPerGuide.append(ReadCounts[sgRNA])
351-
# Apply read count cut-off
352-
ReadSel = [ReadsPerGuide[k] >= NReads/N0*minN for k in range(L)]
353-
ReadsPerGuide = [ReadSel[k]*ReadsPerGuide[k] for k in range(L)]
354-
# Write read counts
355-
os.chdir(OutputDir)
356-
GuideCountsFilename = sample + GuideCount_Suffix
357-
GuideCounts = open(GuideCountsFilename,'w')
358-
for k in range(L):
359-
GuideCounts.write(str(sgIDs[k]) + '\t'+ str(geneIDs[k]) + '\t' + str(ReadsPerGuide[k]) + '\n')
360-
GuideCounts.close()
361-
# Read counts per gene in library
362-
print('Counting reads per gene ...')
363-
global GeneList
364-
GeneList = list(set(geneIDs))
365-
G = len(GeneList)
366-
num_cores = multiprocessing.cpu_count()
367-
ReadsPerGene = Parallel(n_jobs=num_cores)(delayed(CountReadsPerGeneX)(g) for g in range(G))
368-
GeneCountsFilename = sample + GeneCount_Suffix
369-
GeneCounts = open(GeneCountsFilename,'w')
370-
for g in range(G):
371-
GeneCounts.write(str(GeneList[g]) + '\t' + str(ReadsPerGene[g]) + '\n')
372-
GeneCounts.close()
373-
end = time.time()
374-
print('Read counting completed.')
375-
# No-mapping warning
376-
if sum(ReadsPerGuide) == 0:
377-
print('### ERROR: Zero read counts! Check library and alignment ###')
378-
# Time stamp
379-
sec_elapsed = end-start
380-
if sec_elapsed < 60:
381-
time_elapsed = sec_elapsed
382-
print('Time elapsed (Read Counting) [secs]: ' + '%.3f' % time_elapsed)
383-
elif sec_elapsed < 3600:
384-
time_elapsed = sec_elapsed/60
385-
print('Time elapsed (Read Counting) [mins]: ' + '%.3f' % time_elapsed)
386-
else:
387-
time_elapsed = sec_elapsed/3600
388-
print('Time elapsed (Read Counting) [hours]: ' + '%.3f' % time_elapsed)
389-
390-
391-
# --------------------------------------
392-
# Cleaning up...
393-
# --------------------------------------
394-
start = time.time()
395-
# clipped reads file
396-
if not keepCutReads:
397-
os.chdir(TempDataDir)
398-
os.system('rm '+ReadsFilename0)
399-
# alignment output
400-
if AlnOutput == 'Compress':
401-
print('Compressing raw alignment output...')
402-
os.chdir(AlnDir)
403-
# converting SAM to BAM
404-
SAM_output = glob.glob('*bw2output.sam')[0]
405-
BAM_output = SAM_output[:-3] + 'bam'
406-
os.system('samtools view -buSH '+SAM_output+' > '+BAM_output)
407-
os.system('rm '+SAM_output)
408-
elif AlnOutput == 'Delete':
409-
print('Removing raw alignment output...')
410-
os.chdir(AlnDir)
411-
os.system('rm *')
412-
elif AlnOutput == 'Keep':
413-
print('Keeping raw alignment output ...')
414-
end = time.time()
415-
# Time stamp
416-
sec_elapsed = end-start
417-
if sec_elapsed < 60:
418-
time_elapsed = sec_elapsed
419-
print('Time elapsed (Clean-up) [secs]: ' + '%.3f' % time_elapsed)
420-
elif sec_elapsed < 3600:
421-
time_elapsed = sec_elapsed/60
422-
print('Time elapsed (Clean-up) [mins]: ' + '%.3f' % time_elapsed)
423-
else:
424-
time_elapsed = sec_elapsed/3600
425-
print('Time elapsed (Clean-up) [hours]: ' + '%.3f' % time_elapsed)
426-
427100
# --------------------------------------
428101
# Final time stamp
429102
# --------------------------------------
@@ -445,4 +118,4 @@ def MapAndCount(sample):
445118

446119
if __name__ == "__main__":
447120
input1 = sys.argv[1]
448-
MapAndCount(input1)
121+
ReadAlignment(input1)

0 commit comments

Comments
 (0)