-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathkrakenAnalysis.py
More file actions
259 lines (173 loc) · 9.7 KB
/
krakenAnalysis.py
File metadata and controls
259 lines (173 loc) · 9.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 01 11:56:57 2016
@author: Ben
"""
############## Modules ##############
from __future__ import division
import sqlite3
import shlex
import subprocess
import numpy as np
from collections import Counter
import math
import argparse
from scipy.stats import mode
### Matplotlib imports
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
matplotlib.rcParams['font.family'] = 'Ubuntu'
### Pandas imports
import pandas as pd
from sqlalchemy import create_engine
############## Functions ##############
def getNumClassified (table):
# Inserts the number of classified reads for each bargroup into the database.
# Pull all data from reads with phylogenetic data in this column:
df = pd.read_sql_query('select barcode,classified from "%s" order by barcode' % table, db)
barcode = list(df['barcode']) # Barcodes from classified reads
classified = list(df['classified']) # Classification status of a read
numClassified = [] # Number of classified reads in a bargroup
barcodes = [] # Barcodes analyzed
last = 0 # Index for beginning for current group
for i in range(len(barcode) - 1):
# Continue while still in the same bargroup:
if barcode[i] == barcode[i + 1]:
continue
# Once we reach the end of the bargroup:
else:
barSize = i - last + 1 # Number of reads in the group (this layer)
barcodes += [barcode[i]] # Add to barcodes list
numClassified += [classified[last:(i + 1)].count('Y')] # Number of classified reads
last = i + 1 # Set bargroup size counter to current position
# Add last bargroup to the list:
barSize = len(barcode) - last
barcodes += [barcode[-1]]
numClassified += [classified[last:len(barcode)].count('Y')]
# Import into db:
sql = sqlite3.connect(sqliteDB, isolation_level = 'Exclusive')
s = sql.cursor() # Create cursor
# These settings allow DB to be stored in memory and written periodically to the disk:
s.execute('PRAGMA synchronous = 0')
s.execute('PRAGMA journal_mode = OFF')
# Add column for numClassified:
try:
s.execute('alter table "%s" add column numClassified int' % table)
except sqlite3.Error:
pass
# Insert into db:
for i in range(len(barcodes)):
s.execute('update "%s" set numCLassified="%s" where barcode="%s"' %
(table, numClassified[i], barcodes[i]))
if (i % 1e3 == 0 and i != 0):
sql.commit() # Write reads to persistent DB
print "Statistics for %d barcode groups exported to database so far..." % i
sql.commit() # Commit db changes to disk
sql.close() # Close the db
def computeStats (table, column, classifiedSizes):
# Calculates the statistics for Kraken data in a specified table.
# Pull all data from reads with phylogenetic data in this column:
df = pd.read_sql_query('select barcode,barSize,numClassified,"%s" from "%s" where "%s"!="" order by barcode' % (column, table, column), db)
barcode = list(df['barcode']) # Barcodes from classified reads
sizeAll = list(df['barSize']) # Size of ENTIRE group
columndb = list(df[column]) # Column of interest
numClassified = list(df['numClassified']) # Number of classified reads in bargroup
purities = [] # Purity for given column
fracClassifiedLayer = [] # Fraction of classified reads for this phylogenetic layer
fracClassified = [] # Fraction of classified reads in a bargroup
mostCommon = [] # Most common entry in given column
barcodes = [] # Barcodes analyzed
last = 0 # Index for beginning for current group
for i in range(len(barcode) - 1):
# Continue while still in the same bargroup:
if barcode[i] == barcode[i + 1]:
continue
# Once we reach the end of the bargroup:
else:
barSize = i - last + 1 # Number of classified reads in the group (this layer)
fracClassifiedLayer += [barSize / numClassified[i]] # % classified (specified layer only)
fracClassified += [barSize / sizeAll[i]] # % classified (all reads)
barcodes += [barcode[i]] # Add to barcodes list
modeCount = mode(columndb[last:(i + 1)]) # Find mode of column (i.e. most common)
mostCommon += [modeCount[0][0]] # Add to most common species list
purities += [modeCount[1][0] / barSize] # Divide count of most common by count of aligned
last = i + 1 # Set bargroup size counter to current position
# Add last bargroup to the list:
barSize = len(barcode) - last
fracClassifiedLayer += [barSize / numClassified[-1]]
fracClassified += [barSize / sizeAll[-1]]
barcodes += [barcode[-1]]
modeCount = mode(columndb[last:len(barcode)])
mostCommon += [modeCount[0][0]]
purities += [modeCount[1][0] / barSize]
print 'Data computed for column %s!' % column
return purities, fracClassified, fracClassifiedLayer, mostCommon, barcodes
def insertStats (table, column, purities, fracClassified, fracClassifiedLayer, mostCommon, barcodes):
# Creates new columns in the table with the computed statistics
print 'Inserting phylogenetic statistics into database...'
sql = sqlite3.connect(sqliteDB, isolation_level = 'Exclusive')
s = sql.cursor() # Create cursor
# These settings allow DB to be stored in memory and written periodically to the disk:
s.execute('PRAGMA synchronous = 0')
s.execute('PRAGMA journal_mode = OFF')
# Classified reads fraction (all layers) - only need to do this once:
if column == 'domain':
try:
s.execute('alter table "%s" add column fracClassified float default 0' % table)
except sqlite3.Error:
pass
# Add index for barcodes (if not already done):
try:
# Create index on barcodes:
s.execute('create index indexMain on "%s" (barcode)' % table)
except sqlite3.Error:
pass
# Add layer-specific columns:
try:
s.execute('alter table "%s" add column "%s" float' % (table, 'purity' + column))
s.execute('alter table "%s" add column "%s" float' % (table, 'fracClassified' + column))
s.execute('alter table "%s" add column "%s" text' % (table, 'mostCommon' + column))
except sqlite3.Error:
pass
# Insert phylogenetic statistics data:
for i in range(len(barcodes)):
# Insert bargroup classified fraction with domain data only:
if column == 'domain':
s.execute('update "%s" set fracClassified="%s","%s"="%s","%s"="%s","%s"="%s" where barcode="%s"' %
(table, fracClassified[i], 'purity' + column, purities[i], 'fracClassified' + column, fracClassifiedLayer[i], 'mostCommon' + column, mostCommon[i], barcodes[i]))
if (i % 1e3 == 0 and i != 0):
sql.commit() # Write reads to persistent DB
print "Statistics for %d barcode groups exported to database so far..." % i
else:
s.execute('update "%s" set "%s"="%s","%s"="%s","%s"="%s" where barcode="%s"' %
(table, 'purity' + column, purities[i], 'fracClassified' + column, fracClassifiedLayer[i], 'mostCommon' + column, mostCommon[i], barcodes[i]))
if (i % 1e3 == 0 and i != 0):
sql.commit() # Write reads to persistent DB
print "Statistics for %d barcode groups exported to database so far..." % i
sql.commit() # Commit db changes to disk
sql.close() # Close the db
print 'Phylogenetic statistics for column %s inserted into database!' % column
if __name__ == "__main__":
### Setup argument parser ###
parser = argparse.ArgumentParser(description="""
A script for analyzing phylogenetic data produced by Kraken.
@author: Ben Demaree
""", formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('db', metavar='<db_filename>', type=str,
help='SQLite input database filename')
parser.add_argument('-t', metavar='table', type=str, default='kraken',
help='table in database to analyze [default: kraken]')
args = parser.parse_args() # Parse arguments
sqliteDB = args.db # DB filename
table = args.t # Table to analyze
db = create_engine('sqlite:///' + sqliteDB) # Create DB interface
### Compute phylogenetic statistics ###
columns = ['domain', 'phylum', 'class', "order", 'family', 'genus', 'species']
classifiedSizes = []
getNumClassified(table)
for level in columns:
# Compute phyologenetic statistics:
purities, fracClassified, fracClassifiedLayer, mostCommon, barcodes = computeStats(table, level, classifiedSizes)
# Insert data into database:
insertStats(table, level, purities, fracClassified, fracClassifiedLayer, mostCommon, barcodes)