Skip to content

Commit 6bbe079

Browse files
committed
vcf and TagDigger outputs
1 parent 76b1a0c commit 6bbe079

File tree

2 files changed

+39
-20
lines changed

2 files changed

+39
-20
lines changed

GBS-Chip-Gmatrix.R

Lines changed: 39 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/echo Source me don't execute me
22

3-
KGDver <- "1.2.1"
3+
KGDver <- "1.2.2"
44
cat("KGD version:",KGDver,"\n")
55
if (!exists("nogenos")) nogenos <- FALSE
66
if (!exists("gform")) gform <- "uneak"
@@ -130,7 +130,7 @@ depth2Kchoose <- function(dmodel="bb",param) { # function to choose redefine de
130130

131131

132132
readGBS <- function(genofilefn = genofile, usedt="recommended") {
133-
gform <- tolower(gform)
133+
gform <<- tolower(gform)
134134
if (gform == "chip") readChip(genofilefn)
135135
if (gform == "angsdcounts") readANGSD(genofilefn)
136136
if (gform == "tagdigger") readTD(genofilefn)
@@ -232,7 +232,7 @@ readANGSD <- function(genofilefn0 = genofile) {
232232
nind <<- length(seqID)
233233
chrom <<- vcfin$`#CHROM`
234234
pos <<- vcfin$POS
235-
if(all(SNP_Names==".")) SNP_Names <<- paste(chrom,pos,sep="_")
235+
if(all(SNP_Names==".")) SNP_Names <<- paste(chrom,as.integer(pos),sep="_")
236236
nfields <- 1+ lengths(regmatches(vcfin$FORMAT,gregexpr(":",vcfin$FORMAT)))
237237
tempformats <- read.table(text=vcfin$FORMAT,sep=":",fill=TRUE,stringsAsFactors=FALSE,col.names=paste0("V",1:max(nfields)))
238238
gthave = apply(tempformats=="GT",1,any)
@@ -306,7 +306,7 @@ readTassel <- function(genofilefn0 = genofile, usedt="recommended") {
306306
tempin <- scan(genofilefn0, what=list(CHROM = "", POS = 0),skip=1,quote="",flush=TRUE, quiet=TRUE) # 1st 2 columns only
307307
chrom <<- tempin$CHROM
308308
pos <<- tempin$POS
309-
SNP_Names <<- paste(chrom,pos,sep="_")
309+
SNP_Names <<- paste(chrom,as.integer(pos),sep="_")
310310
}
311311
nsnps <<- length(SNP_Names)
312312
cat("Data file has", nsnps, "SNPs \n")
@@ -2442,7 +2442,8 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
24422442
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
24432443
'##FORMAT=<ID=GP,Number=G,Type=Float,Description="Genotype Probability">',
24442444
metalik,
2445-
'##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allele Read Counts">\n',sep="\n")
2445+
'##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allele Read Counts">\n',
2446+
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n',sep="\n")
24462447
cat(metaInfo, file=filename)
24472448
if(contig.meta) write.table(cbind("##contig=<ID=",SNP_Names[snpsubset],">"),file=filename,sep="",quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE)
24482449
## colnames:
@@ -2452,7 +2453,7 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
24522453
out <- matrix(nrow=length(snpsubset),ncol=9+length(indsubset))
24532454

24542455
temp <- options()$scipen
2455-
options(scipen=10) #needed for formating
2456+
options(scipen=15) #needed for formating. Increased from 10
24562457
## Compute the Data line fields
24572458
if(gform == "tassel"){
24582459
out[,1] <- chrom[snpsubset]
@@ -2472,7 +2473,6 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
24722473
out[,8] <- rep(".", length(snpsubset))
24732474
out[,9] <- rep("GT:GP:GL:AD:DP", length(snpsubset))
24742475
if(usePL) out[,9] <- rep("GT:GP:PL:AD:DP", length(snpsubset))
2475-
24762476

24772477
## compute probs
24782478
paa <- (1-ep)^ref*ep^alt*pmat^2
@@ -2483,13 +2483,19 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
24832483
pab <- round(pab/psum,4)
24842484
pbb <- round(pbb/psum,4)
24852485
## compute likelihood values
2486-
compLike <- function(x) 1/2^(ref+alt)*((2-x)*ep + x*(1-ep))^ref * ((2-x)*(1-ep) + x*ep)^alt
2487-
llaa <- log10(compLike(2))
2488-
llab <- log10(compLike(1))
2489-
llbb <- log10(compLike(0))
2490-
llaa[is.infinite(llaa)] <- -1000
2491-
llab[is.infinite(llab)] <- -1000
2492-
llbb[is.infinite(llbb)] <- -1000
2486+
compLike0 <- function(x) ((2-x)*ep + x*(1-ep))^ref * ((2-x)*(1-ep) + x*ep)^alt
2487+
2488+
compLike <- function(x) max(.Machine$double.xmin,1/2^(ref+alt))*((2-x)*ep + x*(1-ep))^ref * ((2-x)*(1-ep) + x*ep)^alt
2489+
2490+
#llaa <- log10(compLike(2)) # alternative method
2491+
#llab <- log10(compLike(1))
2492+
#llbb <- log10(compLike(0))
2493+
llaa <- (ref+alt)*log10(1/2)+log10(compLike0(2))
2494+
llab <- (ref+alt)*log10(1/2)+log10(compLike0(1))
2495+
llbb <- (ref+alt)*log10(1/2)+log10(compLike0(0))
2496+
llaa <- pmin(pmax(llaa,-1000),1000) # deal with + or - Inf
2497+
llab <- pmin(pmax(llab,-1000),1000)
2498+
llbb <- pmin(pmax(llbb,-1000),1000)
24932499
#phred-scaled ...
24942500
minll <- pmax(llaa,llab,llbb)
24952501
plaa <- -10 * round(llaa - minll,0)
@@ -2517,15 +2523,15 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
25172523
genon0_tmp[which((pab > paa) & (pab > pbb))] = 1
25182524
genon0_tmp[is.na(genon0)] <- -1
25192525
genon0 = genon0_tmp
2520-
} else if (GTmethod == "observed"){
2526+
} else { # if (GTmethod == "observed"){ # do this for anything other than GP
25212527
genon0[is.na(genon0)] <- -1
25222528
}
25232529
if (is.big) {
25242530
gt <- apply(genon0+2, 2, function(x) c("./.","1/1","0/1","0/0")[x])
25252531
out[,-c(1:9)] <- apply(cbind(gt,paa,pab,pbb,llaa,llab,llbb,ref,alt,depthsub),1,genostring)
25262532
} else {
25272533
gt <- sapply(as.vector(genon0), function(x) switch(x+2,"./.","1/1","0/1","0/0"))
2528-
out[,-c(1:9)] <- matrix(gsub("NA",".",gsub("NA,","",paste(gt,paste(paa,pab,pbb,sep=","),paste(llaa,llab,llbb,sep=","), paste(ref,alt,sep=","),depthsub, sep=":"))),
2534+
out[,-c(1:9)] <- matrix(gsub("NA",".",gsub("NA,","",paste(gt,paste(paa,pab,pbb,sep=","),paste(llaa,llab,llbb,sep=","), paste(ref,alt,sep=","),depthsub, sep=":"))),
25292535
nrow=length(snpsubset), ncol=length(indsubset), byrow=TRUE)
25302536
# for missings, first change NA, to empty so that any set of NA,NA,...,NA changes to NA, then can set that to . which is vcf missing for the whole field
25312537
}
@@ -2541,18 +2547,20 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
25412547
return(invisible(NULL))
25422548
}
25432549

2544-
writeGBS <- function(indsubset,snpsubset,outname="HapMap.hmc.txt",outformat=gform,seqIDuse=seqID) {
2550+
writeGBS <- function(indsubset, snpsubset, outname="HapMap.hmc.txt", outformat=gform, seqIDuse=seqID, allele.ref="C", allele.alt="G") {
25452551
outformat <- tolower(outformat)
25462552
written <- FALSE
25472553
if(missing(indsubset)) indsubset <- 1:nind
25482554
if(missing(snpsubset)) snpsubset <- 1:nsnps
25492555
if(length(seqIDuse) == nind) seqIDuse <- seqIDuse[indsubset]
2550-
if(tolower(outformat) == "uneak") {
2556+
if(outformat != "chip") {
25512557
if(!exists("alleles"))
25522558
stop("Allele matrix does not exist. Change the 'alleles.keep' argument to TRUE and rerun KGD")
25532559
else if(is.null(alleles))
25542560
stop("Allele matrix object `alleles` is set to NULL.")
25552561
if(nrow(alleles) != nind | ncol(alleles) != 2* nsnps) stop("Allele matrix does not correspond to genotype matrix")
2562+
}
2563+
if(outformat == "uneak") {
25562564
ref <- alleles[indsubset, seq(1, 2 * nsnps - 1, 2)[snpsubset], drop=FALSE]
25572565
alt <- alleles[indsubset, seq(2, 2 * nsnps, 2)[snpsubset], drop=FALSE]
25582566
depthsub <- ref + alt
@@ -2575,7 +2583,19 @@ writeGBS <- function(indsubset,snpsubset,outname="HapMap.hmc.txt",outformat=gfor
25752583
outname,row.names=FALSE,quote=FALSE,sep="\t")
25762584
written <- TRUE
25772585
}
2578-
if(tolower(outformat) == "chip") {
2586+
if(outformat == "tagdigger") {
2587+
nsnpsub <- length(snpsubset)
2588+
if(length(allele.ref) == nsnps) allele.ref <- allele.ref[snpsubset]
2589+
if(length(allele.alt) == nsnps) allele.alt <- allele.alt[snpsubset]
2590+
if(length(allele.ref) == 1) allele.ref <- rep(allele.ref,nsnpsub)
2591+
if(length(allele.alt) == 1) allele.alt <- rep(allele.alt,nsnpsub)
2592+
alleles <- alleles[indsubset,c(1,2) + rep(2*snpsubset-2,each=2)]
2593+
colnames(alleles) <- paste(rep(SNP_Names, each=2),c(allele.ref,allele.alt)[c(0,nsnpsub)+rep(1:nsnpsub,each=2)],sep="_")
2594+
if(require("data.table")) fwrite(data.frame(seqID=seqIDuse,alleles),outname) else
2595+
write.csv(cbind(seqID=seqIDuse,alleles),outname,row.names=FALSE,quote=FALSE)
2596+
written <- TRUE
2597+
}
2598+
if(outformat == "chip") {
25792599
if(!exists("genon") | !exists("depth"))
25802600
stop("genon and/or depth matrix does not exist")
25812601
if(!all(depth==Inf | depth == 0)) cat("Warning: Some depths are not 0 or Inf\n")
@@ -2664,4 +2684,3 @@ genderassign <- function(ped.df, index_Y_SNPs, index_X_SNPs, sfx="", hetgamsex =
26642684
dev.off()
26652685
gender_output
26662686
}
2667-

KGDManual.pdf

4.81 KB
Binary file not shown.

0 commit comments

Comments
 (0)