-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpipe_BrainDiseaseVariantCaller.py
More file actions
101 lines (84 loc) · 2.92 KB
/
pipe_BrainDiseaseVariantCaller.py
File metadata and controls
101 lines (84 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import sys
import collections
import gzip
def read_annotation(INFO) :
INFO = INFO.split(';')
annotation = {}
for i in INFO :
if i[0:4] == 'CSQ=' :
ANN = i.split('|')
annotation['TYPE'] = ANN[1]
annotation['IMPACT'] = ANN[2]
annotation['GENE'] = ANN[3]
annotation['HGVSc'] = ANN[10]
annotation['HGVSp'] = ANN[11]
annotation['Existing_variation'] = ANN[17]
annotation['SIFT'] = ANN[36]
annotation['gnomAD_EAS_AF'] = ANN[51]
return annotation
def read_DATA(FORMAT, DATA) :
data = {}
for index, value in enumerate(FORMAT) :
data[value] = DATA[index]
return data
def read_brainDB(db) :
DB = open(db,'r')
db_dic = {}
for DBline in DB :
DBtab = DBline.split('\t')
db_dic[DBtab[0]] = DBtab[1].rstrip('\n')
return db_dic
def write_header(output_file) :
output_file.write('CHR\tPOS\tREF\tALT\tREF_count\tALT_count\tAF\tID\tGENE\tTYPE\tIMPACT\tHGVS.c\tHGVS.p\tSIFT\tgnomAD_EAS_AF\tBrain_DB\n')
if __name__== '__main__' :
vcf_file = sys.argv[1]
output_path = os.path.dirname(vcf_file)
filename = os.path.basename(vcf_file).rstrip('vcf.gz')
vcf_file = gzip.open('%s'%vcf_file,'rb')
output1_file = open('%s/%s.tsv'%(output_path,filename),'w')
output2_file = open('%s/%s.rare.tsv'%(output_path,filename),'w')
output3_file = open('%s/%s.rare.brain.tsv'%(output_path,filename),'w')
write_header(output1_file)
write_header(output2_file)
write_header(output3_file)
brainDB = read_brainDB("BrainDB.txt")
for line in vcf_file :
if line[:1] == '#' : continue
line = line.rstrip()
col = line.split('\t')
CHROM = col[0]
POS = col[1]
ID = col[2]
REF = col[3]
ALT = col[4]
INFO = col[7]
FORMAT = col[8].split(':')
DATA = col[9].split(':')
key = CHROM+'\t'+POS+'\t'+REF+'\t'+ALT
annotation = read_annotation(INFO)
data = read_DATA(FORMAT, DATA)
AD = data['AD'].split(',')
try :
AF = float(AD[1]) / ( float(AD[0]) + float(AD[1]) )
except :
print (AD)
continue
try :
output_line = '\t'.join([key, AD[0], AD[1], str(round(AF,2)), annotation['Existing_variation'], annotation['GENE'],annotation['TYPE'],annotation['IMPACT'],annotation['HGVSc'],annotation['HGVSp'],annotation['SIFT'],annotation['gnomAD_EAS_AF'], brainDB[annotation['GENE']]])+'\n'
except :
output_line = '\t'.join([key, AD[0], AD[1], str(round(AF,2)), annotation['Existing_variation'] , annotation['GENE'],annotation['TYPE'],annotation['IMPACT'],annotation['HGVSc'],annotation['HGVSp'],annotation['SIFT'],annotation['gnomAD_EAS_AF'],''])+'\n'
output1_file.write(output_line)
try :
EAS_AF = float(annotation['gnomAD_EAS_AF'])
if EAS_AF > 0.01 : continue
except :
EAS_AF = ''
if ID == '.' and annotation['IMPACT'] == 'HIGH' :
output2_file.write(output_line)
if annotation['GENE'] in brainDB :
output3_file.write(output_line)
if ID == '.' and annotation['SIFT'].split('(')[0] == 'deleterious' :
output2_file.write(output_line)
if annotation['GENE'] in brainDB :
output3_file.write(output_line)