diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bd1ee8..7706037 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,13 @@ Please add a new candidate release at the top after changing the latest one. Fee Try to use the following format: +## [unreleased] +### Changed +- Return phased genotype string from Genotype class if variant is phased ([#195](https://github.com/Clinical-Genomics/genmod/pull/195)) +- Slightly more readable Genotype class ([#195](https://github.com/Clinical-Genomics/genmod/pull/195)) + ## [3.10.2] -### Fixed +### Fixed - Add scoring normalisation for flag lookup mode ([#177](https://github.com/Clinical-Genomics/genmod/pull/177)) - Fix crash on test for missing annotation key for phased mode ([#178](https://github.com/Clinical-Genomics/genmod/pull/178)) - Bugfixes related to multiprocessing stability and error handling ([#183](https://github.com/Clinical-Genomics/genmod/pull/183) and [#186](https://github.com/Clinical-Genomics/genmod/pull/186)) @@ -33,7 +38,7 @@ Try to use the following format: - Fixed sorting of variants ([#152](https://github.com/Clinical-Genomics/genmod/pull/152)) - genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157)) - Fix linting issues ([#154](https://github.com/Clinical-Genomics/genmod/issues/154)) -- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160)) +- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160)) ## [3.9] - Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135)) diff --git a/genmod/vcf_tools/genotype.py b/genmod/vcf_tools/genotype.py index 43742f5..ceb4b65 100755 --- a/genmod/vcf_tools/genotype.py +++ b/genmod/vcf_tools/genotype.py @@ -5,7 +5,8 @@ This is a class with information about genotypecalls that follows the (GATK) .vcf standard. -The indata, that is the genotype call, is allways on the form x/x, so they look like 0/0, 1/2, 1/1 and so on. +The indata, that is the genotype call, is typically on the form x/x (unphased) or x|x (phased), +so they look like 0/0, 1/2, 1/1, 0|1 and so on. The first sign inidcates what we find on the first allele, the second is a separator on the form '/' or '|' and the third indicates what is seen on the second allele. The alleles are unordered. @@ -43,81 +44,75 @@ class Genotype(object): """Holds information about a genotype""" - def __init__(self, **kwargs): - super(Genotype, self).__init__() - # These are the different genotypes: - GT = kwargs.get("GT", "./.") - AD = kwargs.get("AD", ".,.") - DP = kwargs.get("DP", "0") - GQ = kwargs.get("GQ", "0") - PL = kwargs.get("PL", None) + def __init__(self, GT="./.", AD=".,.", DP="0", GQ="0", **kwargs): + GT = GT or "./." + AD = AD or ".,." + DP = DP or "0" + GQ = GQ or "0" + + self.phased = "|" in GT + self.separator = "|" if self.phased else "/" + + # Split alleles safely + if self.separator in GT: + self.alleles = GT.split(self.separator) + else: + # Haploid calls (e.g. "0", "1") or missing (".") + self.alleles = [GT] if GT else ["."] + while len(self.alleles) < 2: + self.alleles.append(".") + + self.allele_1 = self.alleles[0] + self.allele_2 = self.alleles[1] + + # Flags + self.genotyped = GT not in ("./.", ".|.", ".") self.heterozygote = False - self.allele_depth = False self.homo_alt = False self.homo_ref = False self.has_variant = False - self.genotyped = False - self.phased = False self.depth_of_coverage = 0 self.quality_depth = 0 - self.genotype_quality = 0 - # Check phasing - if "|" in GT: - self.phased = True - # Check the genotyping: - # This is the case when only one allele is present(eg. X-chromosome) and presented like '0' or '1': - if len(GT) < 3: - self.allele_1 = GT - self.allele_2 = "." - else: - self.allele_1 = GT[0] - self.allele_2 = GT[-1] - # The genotype should allways be represented on the same form - self.genotype = self.allele_1 + "/" + self.allele_2 - - if self.genotype != "./.": - self.genotyped = True - # Check allele status - if self.genotype in ["0/0", "./0", "0/."]: + self.genotype_quality = 0.0 + self.genotype = self.separator.join(self.alleles) + + # Check if a variant is homozygote reference, homozygote alternative or heterozygote + if self.genotyped: + if all(allele in ("0", ".") for allele in (self.allele_1, self.allele_2)): self.homo_ref = True + # Treat e.g. ./1 or 1/. as heterozygous elif self.allele_1 == self.allele_2: self.homo_alt = True - self.has_variant = True else: self.heterozygote = True - self.has_variant = True - # Check the allele depth: - self.ref_depth = 0 - self.alt_depth = 0 - allele_depths = AD.split(",") + self.has_variant = self.homo_alt or self.heterozygote - if len(allele_depths) > 1: - if allele_depths[0].isdigit(): - self.ref_depth = int(allele_depths[0]) - if allele_depths[1].isdigit(): - self.alt_depth = int(allele_depths[1]) + # Parse allele depth + self.ref_depth = 0 + self.alt_depth = 0 + try: + allele_depths = [int(depth) if depth.isdigit() else 0 for depth in AD.split(",")] + self.ref_depth = allele_depths[0] if len(allele_depths) > 0 else 0 + self.alt_depth = sum(allele_depths[1:]) if len(allele_depths) > 1 else 0 + except Exception: + self.ref_depth = 0 + self.alt_depth = 0 self.quality_depth = self.ref_depth + self.alt_depth + # Check the depth of coverage: try: self.depth_of_coverage = int(DP) except ValueError: pass + # Check the genotype quality try: self.genotype_quality = float(GQ) except ValueError: pass - # Check the genotype likelihoods - self.phred_likelihoods = [] - - if PL: - try: - self.phred_likelihoods = [int(score) for score in PL.split(",")] - except ValueError: - pass def __str__(self): """Specifies what will be printed when printing the object.""" - return self.allele_1 + "/" + self.allele_2 + return self.genotype diff --git a/tests/vcf_tools/test_genotype.py b/tests/vcf_tools/test_genotype.py index 8616bce..2732f79 100755 --- a/tests/vcf_tools/test_genotype.py +++ b/tests/vcf_tools/test_genotype.py @@ -45,13 +45,31 @@ def test_nocall(): assert not my_nocall.genotyped +def test_nocall_phased(): + """ + A nocall is when no informations is found on this position for the + individual. It should be False on all questions except nocall. + Also in the case of haploidity the result should be the same. + """ + my_nocall = Genotype(**{"GT": ".|."}) + assert ( + my_nocall.genotype == ".|." + ) # We never need to look at the alleles since genotype is defined by 'allele_1/allele_2' + assert not my_nocall.heterozygote + assert not my_nocall.homo_ref + assert not my_nocall.homo_alt + assert not my_nocall.has_variant + assert not my_nocall.genotyped + + def test_haploid_genotype(): """ Test how genotype behaves with haploid call """ haploid_call = Genotype(**{"GT": "1"}) assert haploid_call.genotype == "1/." - # assert not haploid_call.heterozygote + assert haploid_call.heterozygote + # TODO: Add a good description on why this test contains these commented lines. Do we not care about haploid calls? # assert not haploid_call.homo_ref # assert haploid_call.homo_alt # assert haploid_call.has_variant @@ -104,14 +122,6 @@ def test_bad_gq(): assert my_genotype.genotype_quality == 0 -def test_phred_likelihoods(): - """ - A normal heterozygote call, has_variant and heterozygote is true. - """ - my_genotype = Genotype(**{"GT": "0/1", "PL": "60,70,80"}) - assert my_genotype.phred_likelihoods == [60, 70, 80] - - def test_genotype_1_2(): """ A normal heterozygote call, has_variant and heterozygote is true. @@ -174,7 +184,7 @@ def test_phased_data(): """ my_genotype = Genotype(**{"GT": "1|0"}) assert ( - my_genotype.genotype == "1/0" + my_genotype.genotype == "1|0" ) # If asked about the genotype, it should still be on the same form. assert my_genotype.heterozygote assert not my_genotype.homo_ref