Skip to content

Commit c565b13

Browse files
authored
Adding a script to filter large VCF files
Filters big VCF files to contain only frequent SNPs; add CHR to the chromosome names
1 parent cb86f20 commit c565b13

File tree

1 file changed

+125
-0
lines changed

1 file changed

+125
-0
lines changed

scripts/vcf2vcf_forFreec.pl

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#!/usr/bin/perl -w
2+
#translate dbSNP vcf file into the format acceptable by FREEC (for dbSNP files)
3+
4+
use strict;
5+
6+
my $usage = qq{
7+
$0
8+
9+
-f file dbSNP vcf file
10+
11+
####
12+
13+
OUTPUT will be in the standard output!!! pipe it if you need with gzip!!!
14+
15+
};
16+
17+
if(scalar(@ARGV) == 0){
18+
print $usage;
19+
exit(0);
20+
}
21+
22+
## mandatory arguments
23+
24+
my $filename = "";
25+
26+
my $minFreq = 0.05;
27+
28+
my $keepSingleOnly = 1;
29+
30+
## parse command line arguments
31+
32+
while(scalar(@ARGV) > 0){
33+
my $this_arg = shift @ARGV;
34+
if ( $this_arg eq '-h') {print "$usage\n"; exit; }
35+
36+
elsif ( $this_arg eq '-f') {$filename = shift @ARGV;}
37+
38+
39+
elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
40+
}
41+
42+
if( $filename eq ""){
43+
die "you should specify VCF file from dbSNP\n";
44+
}
45+
46+
47+
48+
#-----------read file with pairs-----
49+
50+
if ($filename =~ /.gz$/) {
51+
open(FILE, "gunzip -c $filename |") or die "$0: can't open ".$filename.":$!\n";
52+
} else {
53+
open (FILE, "<$filename") or die "Cannot open file $filename!!!!: $!";
54+
}
55+
56+
##INFO=<ID=MUT,Number=0,Type=Flag,Description="Is mutation (journal citation, explicit fact): a low frequency variation that is cited in journal and other reputable sources">
57+
#1 10389 rs373144384 AC A . . RS=373144384;RSPOS=10390;dbSNPBuildID=138;SSR=0;SAO=0;VP=0x050000020005000002000200;WGT=1;VC=DIV;R5;ASP
58+
#1 10433 rs56289060 A AC . . RS=56289060;RSPOS=10433;dbSNPBuildID=129;SSR=0;SAO=0;VP=0x050000020005000002000200;WGT=1;VC=DIV;R5;ASP
59+
#1 10440 rs112155239 C A . . RS=112155239;RSPOS=10440;dbSNPBuildID=132;SSR=0;SAO=0;VP=0x050000020005000002000100;WGT=1;VC=SNV;R5;ASP
60+
#CHROM POS ID REF ALT QUAL FILTER INFO
61+
62+
#or
63+
64+
#chr1 10177 rs367896724 A T . . RS=367896724;RSPOS=10177;dbSNPBuildID=138;SSR=0;SAO=0;VP=0x050000020005170026000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP;VLD;G5A;G5;KGPhase3;CAF=0.5747,0.4253;COMMON=1;TOPMED=0.76728147298674821,0.23271852701325178
65+
#chr1 10352 rs555500075 T G . . RS=555500075;RSPOS=10352;dbSNPBuildID=142;SSR=0;SAO=0;VP=0x050000020005170026000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP;VLD;G5A;G5;KGPhase3;CAF=0.5625,0.4375;COMMON=1;TOPMED=0.86356396534148827,0.13643603465851172
66+
#chr1 10616 rs376342519 CCGCCGTTGCAAAGGCGCGCCG C . . RS=376342519;RSPOS=10617;dbSNPBuildID=142;SSR=0;SAO=0;VP=0x050000020005040026000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP;VLD;KGPhase3;CAF=0.006989,0.993;COMMON=1
67+
#chr1 11012 rs544419019 C G . . RS=544419019;RSPOS=11012;dbSNPBuildID=142;SSR=0;SAO=0;VP=0x050000020005150024000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;R5;ASP;VLD;G5;KGPhase3;CAF=0.9119,0.08806;COMMON=1
68+
#chr1 11063 rs561109771 T G . . RS=561109771;RSPOS=11063;dbSNPBuildID=142;SSR=0;SAO=0;VP=0x050000020005040026000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;R5;ASP;VLD;KGPhase3;CAF=0.997,0.002995;COMMON=1;TOPMED=0.99760289245667686,0.00239710754332313
69+
70+
# to (Freq >0.01 by default; $minFreq) and keep only single nucleotide variants ($keepSingleOnly)
71+
72+
#chr1 10177 rs367896724 A T . . RS=367896724;TOPMED=0.76728147298674821,0.23271852701325178
73+
#chr1 10352 rs555500075 T G . . RS=555500075;TOPMED=0.86356396534148827,0.13643603465851172
74+
75+
76+
my ($chr,$start,$possibilities,$refAllele,$ID, $freqString) = ("","","","+","","");
77+
78+
my $numberOfSites = 0;
79+
my $totalCount=0;
80+
my $lines = 0;
81+
82+
my($dot,$dot2,$info,$AltAllele);
83+
84+
while (<FILE>) {
85+
$lines++;
86+
next if (m/^\#\#/);
87+
chomp;
88+
($chr,$start,$ID,$refAllele,$AltAllele,$dot,$dot2,$info) = split /\t/;
89+
$totalCount++;
90+
if ($keepSingleOnly) {next unless (length($refAllele)==1 && length($AltAllele)==1);}
91+
my @infoElements = split /TOPMED=/, $info;
92+
my $isPrint = 0;
93+
my $isAlleleFreqProvided = 0;
94+
if (scalar(@infoElements)>1) {
95+
$isAlleleFreqProvided = 1;
96+
my @infoElements2 = split (',' , $infoElements[1]);
97+
# print $infoElements2[0],"\n";
98+
if ($infoElements2[0]<(1-$minFreq)) {
99+
$isPrint=1;
100+
}
101+
102+
}
103+
if (!$isPrint) {
104+
@infoElements = split /CAF=/, $info;
105+
if (scalar(@infoElements)>1) {
106+
$isAlleleFreqProvided = 1;
107+
my @infoElements2 = split (',' , $infoElements[1]);
108+
if ($infoElements2[0]<(1-$minFreq)) {
109+
$isPrint=1;
110+
}
111+
}
112+
}
113+
if ($isPrint || !$isPrint&&!$isAlleleFreqProvided) { #checked whether TOPMED or CAF were actually provided
114+
unless($chr =~ m/^chr/) {
115+
$chr="chr".$chr;
116+
}
117+
$numberOfSites++;
118+
print $_,"\n";
119+
}
120+
}
121+
close FILE;
122+
123+
124+
print STDERR "Read: $filename\tlines: $lines;\ttotal sites: $totalCount\taccepted sites: $numberOfSites\n";
125+

0 commit comments

Comments
 (0)