-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathExmFilt.GetSpecificGeneInFamily.py
More file actions
171 lines (145 loc) · 8.73 KB
/
ExmFilt.GetSpecificGeneInFamily.py
File metadata and controls
171 lines (145 loc) · 8.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python
#$ -N CustFilt -cwd
################################################################################################################
##### USAGE ###
################################################################################################################
#### The purpose of this script is to filter a VCF file for all variants in a specific gene found in a group of samples
#### For example: To filter according to an autosomal dominant inheritance model, filter all mutations that are homozygous reference (0/0) in one parent and heterozygous (0/1) in the other parent and the proband.
#### -v/--vcf <required> The script requires an input vcf
#### -o/--out <required> The use should specify a base name for the output files.The script outputs the filtered results as both a tab delimited file and a vcf file. The script also produces a log file and an auxilary file that contains the results of every test of every variant as "TRUE" or "FALSE" (primarily for debugging purposes).
#### -s/--sam Sample ids as in the vcf separated by a comma. Leave blank for all variants and no sample data
#### -g/--gene <required> Gene Symbol to search for in the INFO field
####
################################################################################################################
################################################################################################################
################################################################################################################
################################################################################################################
##### GET PARAMETERS ###
################################################################################################################
import os
from optparse import OptionParser
parser = OptionParser()
## Basic Input Files, required
parser.add_option("-v", "--vcf", dest="VCFfile",help="Input VCF file", metavar="VCFfile")
parser.add_option("-o", "--out", dest="OutputFileName",help="Base of names for output files", metavar="OutputFileName")
## Assign samples to heterozygous, homozygous or reference for genotype filtering
parser.set_defaults(sampleIDs="ALL")
parser.add_option("-s", "--sam", dest="sampleIDs",help="Samples IDs, must match vcf CHROM line", metavar="sampleIDs")
parser.add_option("-g", "--gene", dest="geneName",help="Gene Symbol to search for in the INFO field", metavar="geneName")
(options, args) = parser.parse_args()
################################################################################################################
##### OPEN INPUT FILES AND INITIALISE OUTPUT FILES ###
################################################################################################################
##Assign input and output files
VCF=open(options.VCFfile,'r')
BaseName=str(options.OutputFileName)
TabOutputFilename=BaseName+'.tsv'
VcfOutputFilename=BaseName+'.vcf'
LogOutputFilename=BaseName+'.log'
Output=open(TabOutputFilename,'w')
Outvcf=open(VcfOutputFilename,'w')
Outlog=open(LogOutputFilename,'w')
################################################################################################################
##### ASSIGN FILTER VARIABLES ###
################################################################################################################
## Gene/Text
SearchGene=str(options.geneName)
# initialise empty lists so that if no samples are specified we have an empty list for comparing to later
Samples=options.sampleIDs
Sample=[]
if Samples is not None:
Sample=Samples.split(',') #.upper().split(',')
################################################################################################################
##### INITIALISE OUTPUT FILES ###
################################################################################################################
##start log file
import datetime
TimeNow=str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))
Outlog.write("Filtering log: "+TimeNow+"\n")
Outlog.write("VCF: "+str(options.VCFfile)+"\n")
Outlog.write("OutputName: "+str(options.OutputFileName)+"\n")
Outlog.write("\n")
Outlog.write("Samples: "+str(Samples)+"\n")
Outlog.write("Gene: "+str(SearchGene)+"\n")
#adjust search term to get exact matchs only
SearchGene='='+SearchGene+";"
#initialise arrays
NameToColumn={}
ColumnToName={}
BadGT=['./.', '0/0']
for line in VCF:
if '#' in line:
Outvcf.write(line)
if line.startswith('#CHROM'):
################################################################################################################
##### MAP COLUMN NAME TO NUMBER AND THEN LIST COLUMN NUMBERS CORRESPONDING TO EACH SAMPLE ###
################################################################################################################
linelist=line.strip().split("\t")
ColumnAndNumber=enumerate(linelist)
print Sample
if Sample[0]=="ALL":
Sample=linelist[9:]
print Sample
for i in ColumnAndNumber:
NameToColumn[i[1]]=i[0]
ColumnToName[i[0]]=i[1]
ColumnNumber=[ NameToColumn[i] for i in Sample ]
##Start output tables
headerlist=['Chromosome','Position','ID','REF','ALT','Gene','VariantFunction','VariantClass','AAchange','AlleleFrequency.ExAC','AlleleFrequency.1KG','AlleleFrequency.ESP','SIFTprediction','PP2prediction','MAprediction','MTprediction','GERP++','CADDscore','SegmentalDuplication']+Sample+['FILTER', 'INFO']+Sample
Output.write("\t".join(headerlist)+"\n")
if '#' not in line:
################################################################################################################
##### PARSE LINE AND GET VARIANT INFO ###
################################################################################################################
linelist=line.split("\t")
##Get Alternate Alleles
AltAlls=linelist[4]
AltAlls=AltAlls.split(",")
AltNum=len(AltAlls)
QUAL=float(linelist[5])
FILTER=linelist[6]
## Get variant data from INFO Field
INFOstring=linelist[7]
INFOcolumn=INFOstring.split(";")
INFOdict={}
for element in INFOcolumn:
if '=' in element:
FieldName,FieldValue=element.split('=',1)
INFOdict[FieldName]=FieldValue
DPnumber=float(INFOdict.get('DP','.'))
MQ0number=float(INFOdict.get('MQ0','.'))
GeneName=INFOdict.get('GeneName','.')
VariantFunction=INFOdict.get('VarFunc','none')
SegDup=INFOdict.get('SegDup','none')
KGFreq=str(INFOdict.get('1KGfreq','.'))
ESPFreq=str(INFOdict.get('ESPfreq','.'))
ExACFreq=str(INFOdict.get('ExACfreq','.'))
VCFFreq=str(INFOdict.get('AF',0))
VariantClass=INFOdict.get('VarClass','none')
AAchange=INFOdict.get('AAChange','.')
SIFTprediction=INFOdict.get('SIFTprd','.')
PP2prediction=INFOdict.get('PP2.hvar.prd','.')
MAprediction=INFOdict.get('MutAprd','.')
MTprediction=INFOdict.get('MutTprd','.')
GERPscore=str(INFOdict.get('GERP','.'))
CADDscore=str(INFOdict.get('CADDphred','.'))
## Get Sample Level Variant information
QualityString=[ linelist[i].strip() for i in ColumnNumber ]
QualityList=[ i.split(':') for i in QualityString ]
GT=[ QualityList[i][0] for i in range(0,len(QualityString)) ]
## Check if all genotypes are present
PassGeno=False
if any(i not in BadGT for i in GT):
PassGeno=True
PassGene=False
if SearchGene in INFOstring:
PassGene=True
if PassGeno and PassGene:
OutputList=linelist[0:5]+[GeneName,VariantFunction,VariantClass,AAchange,ExACFreq,KGFreq,ESPFreq,SIFTprediction,PP2prediction,MAprediction,MTprediction,GERPscore,CADDscore,SegDup]+GT+[FILTER,INFOstring]+QualityString
OutputList= [ str(i) for i in OutputList ]
OutputString="\t".join(OutputList)
Output.write(OutputString+"\n")
Outvcf.write(line)
TimeNow=str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))
Outlog.write("Filtering Finished: "+TimeNow+"\n")
print "Filtering finished "+TimeNow