1
1
# !/bin/echo Source me don't execute me
2
2
3
- KGDver <- " 1.2.1 "
3
+ KGDver <- " 1.2.2 "
4
4
cat(" KGD version:" ,KGDver ," \n " )
5
5
if (! exists(" nogenos" )) nogenos <- FALSE
6
6
if (! exists(" gform" )) gform <- " uneak"
@@ -130,7 +130,7 @@ depth2Kchoose <- function(dmodel="bb",param) { # function to choose redefine de
130
130
131
131
132
132
readGBS <- function (genofilefn = genofile , usedt = " recommended" ) {
133
- gform <- tolower(gform )
133
+ gform << - tolower(gform )
134
134
if (gform == " chip" ) readChip(genofilefn )
135
135
if (gform == " angsdcounts" ) readANGSD(genofilefn )
136
136
if (gform == " tagdigger" ) readTD(genofilefn )
@@ -232,7 +232,7 @@ readANGSD <- function(genofilefn0 = genofile) {
232
232
nind <<- length(seqID )
233
233
chrom <<- vcfin $ `#CHROM`
234
234
pos <<- vcfin $ POS
235
- if (all(SNP_Names == " ." )) SNP_Names <<- paste(chrom ,pos ,sep = " _" )
235
+ if (all(SNP_Names == " ." )) SNP_Names <<- paste(chrom ,as.integer( pos ) ,sep = " _" )
236
236
nfields <- 1 + lengths(regmatches(vcfin $ FORMAT ,gregexpr(" :" ,vcfin $ FORMAT )))
237
237
tempformats <- read.table(text = vcfin $ FORMAT ,sep = " :" ,fill = TRUE ,stringsAsFactors = FALSE ,col.names = paste0(" V" ,1 : max(nfields )))
238
238
gthave = apply(tempformats == " GT" ,1 ,any )
@@ -306,7 +306,7 @@ readTassel <- function(genofilefn0 = genofile, usedt="recommended") {
306
306
tempin <- scan(genofilefn0 , what = list (CHROM = " " , POS = 0 ),skip = 1 ,quote = " " ,flush = TRUE , quiet = TRUE ) # 1st 2 columns only
307
307
chrom <<- tempin $ CHROM
308
308
pos <<- tempin $ POS
309
- SNP_Names <<- paste(chrom ,pos ,sep = " _" )
309
+ SNP_Names <<- paste(chrom ,as.integer( pos ) ,sep = " _" )
310
310
}
311
311
nsnps <<- length(SNP_Names )
312
312
cat(" Data file has" , nsnps , " SNPs \n " )
@@ -2442,7 +2442,8 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
2442
2442
' ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' ,
2443
2443
' ##FORMAT=<ID=GP,Number=G,Type=Float,Description="Genotype Probability">' ,
2444
2444
metalik ,
2445
- ' ##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allele Read Counts">\n ' ,sep = " \n " )
2445
+ ' ##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allele Read Counts">\n ' ,
2446
+ ' ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n ' ,sep = " \n " )
2446
2447
cat(metaInfo , file = filename )
2447
2448
if (contig.meta ) write.table(cbind(" ##contig=<ID=" ,SNP_Names [snpsubset ]," >" ),file = filename ,sep = " " ,quote = FALSE ,row.names = FALSE ,col.names = FALSE ,append = TRUE )
2448
2449
# # colnames:
@@ -2452,7 +2453,7 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
2452
2453
out <- matrix (nrow = length(snpsubset ),ncol = 9 + length(indsubset ))
2453
2454
2454
2455
temp <- options()$ scipen
2455
- options(scipen = 10 ) # needed for formating
2456
+ options(scipen = 15 ) # needed for formating. Increased from 10
2456
2457
# # Compute the Data line fields
2457
2458
if (gform == " tassel" ){
2458
2459
out [,1 ] <- chrom [snpsubset ]
@@ -2472,7 +2473,6 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
2472
2473
out [,8 ] <- rep(" ." , length(snpsubset ))
2473
2474
out [,9 ] <- rep(" GT:GP:GL:AD:DP" , length(snpsubset ))
2474
2475
if (usePL ) out [,9 ] <- rep(" GT:GP:PL:AD:DP" , length(snpsubset ))
2475
-
2476
2476
2477
2477
# # compute probs
2478
2478
paa <- (1 - ep )^ ref * ep ^ alt * pmat ^ 2
@@ -2483,13 +2483,19 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
2483
2483
pab <- round(pab / psum ,4 )
2484
2484
pbb <- round(pbb / psum ,4 )
2485
2485
# # compute likelihood values
2486
- compLike <- function (x ) 1 / 2 ^ (ref + alt )* ((2 - x )* ep + x * (1 - ep ))^ ref * ((2 - x )* (1 - ep ) + x * ep )^ alt
2487
- llaa <- log10(compLike(2 ))
2488
- llab <- log10(compLike(1 ))
2489
- llbb <- log10(compLike(0 ))
2490
- llaa [is.infinite(llaa )] <- - 1000
2491
- llab [is.infinite(llab )] <- - 1000
2492
- llbb [is.infinite(llbb )] <- - 1000
2486
+ compLike0 <- function (x ) ((2 - x )* ep + x * (1 - ep ))^ ref * ((2 - x )* (1 - ep ) + x * ep )^ alt
2487
+
2488
+ compLike <- function (x ) max(.Machine $ double.xmin ,1 / 2 ^ (ref + alt ))* ((2 - x )* ep + x * (1 - ep ))^ ref * ((2 - x )* (1 - ep ) + x * ep )^ alt
2489
+
2490
+ # llaa <- log10(compLike(2)) # alternative method
2491
+ # llab <- log10(compLike(1))
2492
+ # llbb <- log10(compLike(0))
2493
+ llaa <- (ref + alt )* log10(1 / 2 )+ log10(compLike0(2 ))
2494
+ llab <- (ref + alt )* log10(1 / 2 )+ log10(compLike0(1 ))
2495
+ llbb <- (ref + alt )* log10(1 / 2 )+ log10(compLike0(0 ))
2496
+ llaa <- pmin(pmax(llaa ,- 1000 ),1000 ) # deal with + or - Inf
2497
+ llab <- pmin(pmax(llab ,- 1000 ),1000 )
2498
+ llbb <- pmin(pmax(llbb ,- 1000 ),1000 )
2493
2499
# phred-scaled ...
2494
2500
minll <- pmax(llaa ,llab ,llbb )
2495
2501
plaa <- - 10 * round(llaa - minll ,0 )
@@ -2517,15 +2523,15 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
2517
2523
genon0_tmp [which((pab > paa ) & (pab > pbb ))] = 1
2518
2524
genon0_tmp [is.na(genon0 )] <- - 1
2519
2525
genon0 = genon0_tmp
2520
- } else if (GTmethod == " observed" ){
2526
+ } else { # if (GTmethod == "observed"){ # do this for anything other than GP
2521
2527
genon0 [is.na(genon0 )] <- - 1
2522
2528
}
2523
2529
if (is.big ) {
2524
2530
gt <- apply(genon0 + 2 , 2 , function (x ) c(" ./." ," 1/1" ," 0/1" ," 0/0" )[x ])
2525
2531
out [,- c(1 : 9 )] <- apply(cbind(gt ,paa ,pab ,pbb ,llaa ,llab ,llbb ,ref ,alt ,depthsub ),1 ,genostring )
2526
2532
} else {
2527
2533
gt <- sapply(as.vector(genon0 ), function (x ) switch (x + 2 ," ./." ," 1/1" ," 0/1" ," 0/0" ))
2528
- out [,- c(1 : 9 )] <- matrix (gsub(" NA" ," ." ,gsub(" NA," ," " ,paste(gt ,paste(paa ,pab ,pbb ,sep = " ," ),paste(llaa ,llab ,llbb ,sep = " ," ), paste(ref ,alt ,sep = " ," ),depthsub , sep = " :" ))),
2534
+ out [,- c(1 : 9 )] <- matrix (gsub(" NA" ," ." ,gsub(" NA," ," " ,paste(gt ,paste(paa ,pab ,pbb ,sep = " ," ),paste(llaa ,llab ,llbb ,sep = " ," ), paste(ref ,alt ,sep = " ," ),depthsub , sep = " :" ))),
2529
2535
nrow = length(snpsubset ), ncol = length(indsubset ), byrow = TRUE )
2530
2536
# for missings, first change NA, to empty so that any set of NA,NA,...,NA changes to NA, then can set that to . which is vcf missing for the whole field
2531
2537
}
@@ -2541,18 +2547,20 @@ writeVCF <- function(indsubset, snpsubset, outname=NULL, ep=0.001, puse = p, IDu
2541
2547
return (invisible (NULL ))
2542
2548
}
2543
2549
2544
- writeGBS <- function (indsubset ,snpsubset ,outname = " HapMap.hmc.txt" ,outformat = gform ,seqIDuse = seqID ) {
2550
+ writeGBS <- function (indsubset , snpsubset , outname = " HapMap.hmc.txt" , outformat = gform , seqIDuse = seqID , allele.ref = " C " , allele.alt = " G " ) {
2545
2551
outformat <- tolower(outformat )
2546
2552
written <- FALSE
2547
2553
if (missing(indsubset )) indsubset <- 1 : nind
2548
2554
if (missing(snpsubset )) snpsubset <- 1 : nsnps
2549
2555
if (length(seqIDuse ) == nind ) seqIDuse <- seqIDuse [indsubset ]
2550
- if (tolower( outformat ) == " uneak " ) {
2556
+ if (outformat != " chip " ) {
2551
2557
if (! exists(" alleles" ))
2552
2558
stop(" Allele matrix does not exist. Change the 'alleles.keep' argument to TRUE and rerun KGD" )
2553
2559
else if (is.null(alleles ))
2554
2560
stop(" Allele matrix object `alleles` is set to NULL." )
2555
2561
if (nrow(alleles ) != nind | ncol(alleles ) != 2 * nsnps ) stop(" Allele matrix does not correspond to genotype matrix" )
2562
+ }
2563
+ if (outformat == " uneak" ) {
2556
2564
ref <- alleles [indsubset , seq(1 , 2 * nsnps - 1 , 2 )[snpsubset ], drop = FALSE ]
2557
2565
alt <- alleles [indsubset , seq(2 , 2 * nsnps , 2 )[snpsubset ], drop = FALSE ]
2558
2566
depthsub <- ref + alt
@@ -2575,7 +2583,19 @@ writeGBS <- function(indsubset,snpsubset,outname="HapMap.hmc.txt",outformat=gfor
2575
2583
outname ,row.names = FALSE ,quote = FALSE ,sep = " \t " )
2576
2584
written <- TRUE
2577
2585
}
2578
- if (tolower(outformat ) == " chip" ) {
2586
+ if (outformat == " tagdigger" ) {
2587
+ nsnpsub <- length(snpsubset )
2588
+ if (length(allele.ref ) == nsnps ) allele.ref <- allele.ref [snpsubset ]
2589
+ if (length(allele.alt ) == nsnps ) allele.alt <- allele.alt [snpsubset ]
2590
+ if (length(allele.ref ) == 1 ) allele.ref <- rep(allele.ref ,nsnpsub )
2591
+ if (length(allele.alt ) == 1 ) allele.alt <- rep(allele.alt ,nsnpsub )
2592
+ alleles <- alleles [indsubset ,c(1 ,2 ) + rep(2 * snpsubset - 2 ,each = 2 )]
2593
+ colnames(alleles ) <- paste(rep(SNP_Names , each = 2 ),c(allele.ref ,allele.alt )[c(0 ,nsnpsub )+ rep(1 : nsnpsub ,each = 2 )],sep = " _" )
2594
+ if (require(" data.table" )) fwrite(data.frame (seqID = seqIDuse ,alleles ),outname ) else
2595
+ write.csv(cbind(seqID = seqIDuse ,alleles ),outname ,row.names = FALSE ,quote = FALSE )
2596
+ written <- TRUE
2597
+ }
2598
+ if (outformat == " chip" ) {
2579
2599
if (! exists(" genon" ) | ! exists(" depth" ))
2580
2600
stop(" genon and/or depth matrix does not exist" )
2581
2601
if (! all(depth == Inf | depth == 0 )) cat(" Warning: Some depths are not 0 or Inf\n " )
@@ -2664,4 +2684,3 @@ genderassign <- function(ped.df, index_Y_SNPs, index_X_SNPs, sfx="", hetgamsex =
2664
2684
dev.off()
2665
2685
gender_output
2666
2686
}
2667
-
0 commit comments