From 1706189a7e34165917666ca40002fc03e99e0879 Mon Sep 17 00:00:00 2001 From: Macgregor Todd Date: Thu, 6 Jan 2022 14:44:29 +1100 Subject: [PATCH] add script to generate obs counts for mito variants --- .../CreateMitoPopulationStatisticsVCF.groovy | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 src/main/groovy/gngs/tools/CreateMitoPopulationStatisticsVCF.groovy diff --git a/src/main/groovy/gngs/tools/CreateMitoPopulationStatisticsVCF.groovy b/src/main/groovy/gngs/tools/CreateMitoPopulationStatisticsVCF.groovy new file mode 100644 index 00000000..e87713d4 --- /dev/null +++ b/src/main/groovy/gngs/tools/CreateMitoPopulationStatisticsVCF.groovy @@ -0,0 +1,214 @@ +/* + * Groovy NGS Utils - Some simple utilites for processing Next Generation Sequencing data. + * + * Copyright (C) 2018 Simon Sadedin, ssadedingmail.com + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package gngs.tools + +import gngs.* +import groovy.transform.CompileStatic +import groovy.transform.Memoized +import groovy.util.logging.Log +import htsjdk.variant.variantcontext.Genotype +import htsjdk.variant.variantcontext.GenotypesContext +import htsjdk.variant.variantcontext.VariantContext + +/** + * A tool to create a "population statistics VCF" that represents the frequency of + * variants from a given set of samples by entries in the info field. + *

+ * The following population metrics are calculated: + *

  • AC - the number of alleles that were observed at the site, binned according to frequencies [0-0.25, 0.25-0.5, 0.5-0.75, 0.75-1] + *
  • AN - the total number of observed alleles + *

    + * Limitations: + * The current implementation has a number of limitations: + * + *
  • Only VCF information is considered, not gVCF, which is to say, an allele is counted as + * unobserved at a site even if there was no coverage and therefore was not ascertained at + * that site. + * + * These limitations should be corrected to generate truly accurate counts. + * + * @author Macgregor Todd & Simon Sadedin + */ +@Log +class CreateMitoPopulationStatisticsVCF extends ToolBase { + + int numVCFs + + PrintWriter out = null + + @Override + public void run() { + + if(opts.arguments().isEmpty()) { + parser.usage() + System.exit(0) + } + + if(out != null) { + // Use it as is + } + else + if(opts.o) { + out = Utils.writer(opts.o) + } + else { + out = new PrintWriter(System.out) + } + + out.withWriter { + + List vcfs = opts.arguments().collect { new VCF(it) } // will help us compute the number of samples + + int numVCFs = vcfs*.samples*.size().sum() + + log.info "Computing allele count bins for $numVCFs samples ..." + + printVCFHeader(numVCFs) + + out.flush() + + VariantContext lastVariant = null + ProgressCounter progress = new ProgressCounter(withRate: true, withTime:true, extra: { + "chr: $lastVariant.contig pos: $lastVariant.start (${lastVariant.genotypes[0].sampleName})" + }) + VCFSiteWalker locusWalker = new VCFSiteWalker(opts.arguments()) + locusWalker.walk { List variants -> + lastVariant = variants[0] + processSite(variants) + progress.count() + } + progress.end() + out.close() + } + } + + @CompileStatic + void processSite(List variants) { + // + + Map> alleles = + variants.groupBy { VariantContext ctx -> + ctx.getAlternateAllele(0).baseString + } + + if(alleles.size()>1) { + log.info "Site ${variants[0].contig}:${variants[0].start} is multiallelic" + } + + for(List allele : alleles.values()) { + VariantContext v0 = allele[0] + List ac = computeAlleleCount(allele) + int an = (int)ac.sum() + printVCFSite(v0, ac, an) + } + } + + /** + * Print out a line of the output VCF based on the given computed statistics + * + * @param v0 + * @param ac + * @param an + */ + @CompileStatic + protected void printVCFSite(VariantContext v0, List ac, int an) { + String ac_forprint = ac.toString()[1..-2].replaceAll(' ', '') + out.println([ + v0.contig, + v0.start, + v0.getID(), + v0.reference.baseString, + v0.alternateAlleles[0], + '.', + '.', + "AC=$ac_forprint;AN=$an" + ].join('\t')) + } + + /** + * Compute the allele count at a site, based on the given list of variants at that site. + * + * @return the number of alleles observed, based on the het / hom status of the variants + * and the sex of the respective samples + */ + @CompileStatic + List computeAlleleCount(List variants) { + Set samples = new HashSet(numVCFs*2) + List ac_raw = (List)variants.collect { VariantContext vc -> // count across VCFs + vc.genotypes.collect { Genotype gt -> // sum across samples within this vcf // *** typically no trio vcfs in mito, but this closure maintains compatibility + + LinkedHashMap bins = [ + b1:0, + b2:0, + b3:0, + b4:0 + ] + + // Make us robust to samples being provided twice + if(samples.contains(gt.sampleName in samples)) + return bins + else + samples.add(gt.sampleName) + + if((gt.extendedAttributes.AF as Double) <= 0.25) { + //log.info "$gt.extendedAttributes.AF into bin 1" + bins.b1 += 1 + } + else if((gt.extendedAttributes.AF as Double) <= 0.5) { + bins.b2 += 1 + } + else if((gt.extendedAttributes.AF as Double) <= 0.75) { + bins.b3 += 1 + } + else { + bins.b4 += 1 + } + return bins + + }.collectMany{it.entrySet()}.groupBy{it.key}.collectEntries{[it.key, it.value.sum{it.value}]} // maintains trio compatibility + } + // sum across VCFs + LinkedHashMap ac_summed = (LinkedHashMap)ac_raw.collectMany{it.entrySet()}.groupBy{it.key}.collectEntries{[it.key, it.value.sum{it.value}]} + List ac = ac_summed.collect{it.value} + } + + /** + * Generate a header for the population VCF. This is accomplished by copying headers from the first VCF and + * adding INFO declarations for the population statistics entries. + */ + void printVCFHeader(int numVCFs) { + VCF result = new VCF() + result.headerLines = new VCF(opts.arguments()[0]).headerLines + result.addInfoHeaders([ + '##INFO=,Description="Allele counts in genotypes, for each ALT allele, PER allele frequency bin. Frequency bins are [0-0.25, 0.25-0.5, 0.5-0.75, 0.75-1]">', + '##INFO=', + "##These mito obs counts were calculated from $numVCFs VCFs" + ]) + result.headerLines[-1] = result.headerLines[-1].tokenize('\t')[0..7].join('\t') + result.print(out) + } + + static void main(String [] args) { + cli('CreateMitoPopulationStatisticsVCF ...', args) { + o 'Output file', args:1, required: false + args: Cli.UNLIMITED + } + } +}