From a6e2a92e621181f72fe035730ab1fcbb109a8997 Mon Sep 17 00:00:00 2001 From: Felix Lenner Date: Mon, 9 Mar 2026 09:05:31 +0100 Subject: [PATCH 1/6] More readable Genotype class --- genmod/vcf_tools/genotype.py | 92 +++++++++++++++++--------------- tests/vcf_tools/test_genotype.py | 19 ++++++- 2 files changed, 65 insertions(+), 46 deletions(-) diff --git a/genmod/vcf_tools/genotype.py b/genmod/vcf_tools/genotype.py index 43742f5..70b7443 100755 --- a/genmod/vcf_tools/genotype.py +++ b/genmod/vcf_tools/genotype.py @@ -5,7 +5,8 @@ This is a class with information about genotypecalls that follows the (GATK) .vcf standard. -The indata, that is the genotype call, is allways on the form x/x, so they look like 0/0, 1/2, 1/1 and so on. +The indata, that is the genotype call, is typically on the form x/x (unphased) or x|x (phased), +so they look like 0/0, 1/2, 1/1, 0|1 and so on. The first sign inidcates what we find on the first allele, the second is a separator on the form '/' or '|' and the third indicates what is seen on the second allele. The alleles are unordered. @@ -43,75 +44,78 @@ class Genotype(object): """Holds information about a genotype""" - def __init__(self, **kwargs): - super(Genotype, self).__init__() - # These are the different genotypes: - GT = kwargs.get("GT", "./.") - AD = kwargs.get("AD", ".,.") - DP = kwargs.get("DP", "0") - GQ = kwargs.get("GQ", "0") - PL = kwargs.get("PL", None) + def __init__(self, GT="./.", AD=".,.", DP="0", GQ="0", PL=None, **kwargs): + GT = GT or "./." + AD = AD or ".,." + DP = DP or "0" + GQ = GQ or "0" + PL = PL or None + + self.phased = "|" in GT + self.separator = "|" if self.phased else "/" + + # Split alleles safely + if self.separator in GT: + self.alleles = GT.split(self.separator) + else: + # Haploid calls (e.g. "0", "1") or missing (".") + self.alleles = [GT] if GT else ["."] + while len(self.alleles) < 2: + self.alleles.append(".") + + self.allele_1 = self.alleles[0] + self.allele_2 = self.alleles[1] + + # Flags + self.genotyped = GT not in ("./.", ".|.", ".") self.heterozygote = False - self.allele_depth = False self.homo_alt = False self.homo_ref = False self.has_variant = False - self.genotyped = False - self.phased = False self.depth_of_coverage = 0 self.quality_depth = 0 - self.genotype_quality = 0 - # Check phasing - if "|" in GT: - self.phased = True - # Check the genotyping: - # This is the case when only one allele is present(eg. X-chromosome) and presented like '0' or '1': - if len(GT) < 3: - self.allele_1 = GT - self.allele_2 = "." - else: - self.allele_1 = GT[0] - self.allele_2 = GT[-1] - # The genotype should allways be represented on the same form - self.genotype = self.allele_1 + "/" + self.allele_2 - - if self.genotype != "./.": - self.genotyped = True - # Check allele status - if self.genotype in ["0/0", "./0", "0/."]: + self.genotype_quality = 0.0 + self.genotype = self.separator.join(self.alleles) + + # Check if a variant is homozygote reference, homozygote alternative or heterozygote + if self.genotyped: + if all(allele in ("0", ".") for allele in (self.allele_1, self.allele_2)): self.homo_ref = True + # Treat e.g. ./1 or 1/. as heterozygous elif self.allele_1 == self.allele_2: self.homo_alt = True - self.has_variant = True else: self.heterozygote = True - self.has_variant = True - # Check the allele depth: - self.ref_depth = 0 - self.alt_depth = 0 - allele_depths = AD.split(",") + self.has_variant = self.homo_alt or self.heterozygote - if len(allele_depths) > 1: - if allele_depths[0].isdigit(): - self.ref_depth = int(allele_depths[0]) - if allele_depths[1].isdigit(): - self.alt_depth = int(allele_depths[1]) + # Parse allele depth + self.ref_depth = 0 + self.alt_depth = 0 + try: + allele_depths = [int(depth) if depth.isdigit() else 0 for depth in AD.split(",")] + self.ref_depth = allele_depths[0] if len(allele_depths) > 0 else 0 + self.alt_depth = sum(allele_depths[1:]) if len(allele_depths) > 1 else 0 + except Exception: + self.ref_depth = 0 + self.alt_depth = 0 self.quality_depth = self.ref_depth + self.alt_depth + # Check the depth of coverage: try: self.depth_of_coverage = int(DP) except ValueError: pass + # Check the genotype quality try: self.genotype_quality = float(GQ) except ValueError: pass + # Check the genotype likelihoods self.phred_likelihoods = [] - if PL: try: self.phred_likelihoods = [int(score) for score in PL.split(",")] @@ -120,4 +124,4 @@ def __init__(self, **kwargs): def __str__(self): """Specifies what will be printed when printing the object.""" - return self.allele_1 + "/" + self.allele_2 + return self.genotype diff --git a/tests/vcf_tools/test_genotype.py b/tests/vcf_tools/test_genotype.py index 8616bce..d7b7b3d 100755 --- a/tests/vcf_tools/test_genotype.py +++ b/tests/vcf_tools/test_genotype.py @@ -44,6 +44,21 @@ def test_nocall(): assert not my_nocall.has_variant assert not my_nocall.genotyped +def test_nocall_phased(): + """ + A nocall is when no informations is found on this position for the + individual. It should be False on all questions except nocall. + Also in the case of haploidity the result should be the same. + """ + my_nocall = Genotype(**{"GT": ".|."}) + assert ( + my_nocall.genotype == ".|." + ) # We never need to look at the alleles since genotype is defined by 'allele_1/allele_2' + assert not my_nocall.heterozygote + assert not my_nocall.homo_ref + assert not my_nocall.homo_alt + assert not my_nocall.has_variant + assert not my_nocall.genotyped def test_haploid_genotype(): """ @@ -51,7 +66,7 @@ def test_haploid_genotype(): """ haploid_call = Genotype(**{"GT": "1"}) assert haploid_call.genotype == "1/." - # assert not haploid_call.heterozygote + assert haploid_call.heterozygote # assert not haploid_call.homo_ref # assert haploid_call.homo_alt # assert haploid_call.has_variant @@ -174,7 +189,7 @@ def test_phased_data(): """ my_genotype = Genotype(**{"GT": "1|0"}) assert ( - my_genotype.genotype == "1/0" + my_genotype.genotype == "1|0" ) # If asked about the genotype, it should still be on the same form. assert my_genotype.heterozygote assert not my_genotype.homo_ref From 1c0336d90a45e04abd45b2fc9a574ab1e909e1b8 Mon Sep 17 00:00:00 2001 From: Felix Lenner Date: Mon, 9 Mar 2026 14:04:43 +0100 Subject: [PATCH 2/6] CHANGELOG --- CHANGELOG.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bd1ee8..7706037 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,13 @@ Please add a new candidate release at the top after changing the latest one. Fee Try to use the following format: +## [unreleased] +### Changed +- Return phased genotype string from Genotype class if variant is phased ([#195](https://github.com/Clinical-Genomics/genmod/pull/195)) +- Slightly more readable Genotype class ([#195](https://github.com/Clinical-Genomics/genmod/pull/195)) + ## [3.10.2] -### Fixed +### Fixed - Add scoring normalisation for flag lookup mode ([#177](https://github.com/Clinical-Genomics/genmod/pull/177)) - Fix crash on test for missing annotation key for phased mode ([#178](https://github.com/Clinical-Genomics/genmod/pull/178)) - Bugfixes related to multiprocessing stability and error handling ([#183](https://github.com/Clinical-Genomics/genmod/pull/183) and [#186](https://github.com/Clinical-Genomics/genmod/pull/186)) @@ -33,7 +38,7 @@ Try to use the following format: - Fixed sorting of variants ([#152](https://github.com/Clinical-Genomics/genmod/pull/152)) - genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157)) - Fix linting issues ([#154](https://github.com/Clinical-Genomics/genmod/issues/154)) -- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160)) +- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160)) ## [3.9] - Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135)) From 936b0c9834d94ec81e37e2f101f20806304c7184 Mon Sep 17 00:00:00 2001 From: Felix Lenner Date: Mon, 9 Mar 2026 14:07:58 +0100 Subject: [PATCH 3/6] ruf --- tests/vcf_tools/test_genotype.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/vcf_tools/test_genotype.py b/tests/vcf_tools/test_genotype.py index d7b7b3d..005333c 100755 --- a/tests/vcf_tools/test_genotype.py +++ b/tests/vcf_tools/test_genotype.py @@ -44,6 +44,7 @@ def test_nocall(): assert not my_nocall.has_variant assert not my_nocall.genotyped + def test_nocall_phased(): """ A nocall is when no informations is found on this position for the @@ -60,6 +61,7 @@ def test_nocall_phased(): assert not my_nocall.has_variant assert not my_nocall.genotyped + def test_haploid_genotype(): """ Test how genotype behaves with haploid call From 4fa2f649b686134766adf77747f3e07b9e223c3e Mon Sep 17 00:00:00 2001 From: Felix Lenner Date: Mon, 9 Mar 2026 14:11:17 +0100 Subject: [PATCH 4/6] Add todo string --- tests/vcf_tools/test_genotype.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/vcf_tools/test_genotype.py b/tests/vcf_tools/test_genotype.py index 005333c..0e94b8e 100755 --- a/tests/vcf_tools/test_genotype.py +++ b/tests/vcf_tools/test_genotype.py @@ -69,6 +69,7 @@ def test_haploid_genotype(): haploid_call = Genotype(**{"GT": "1"}) assert haploid_call.genotype == "1/." assert haploid_call.heterozygote + # TODO: Add a good description on why this test contains these commented lines. Do we not care about haploid calls? # assert not haploid_call.homo_ref # assert haploid_call.homo_alt # assert haploid_call.has_variant From a0c710b1c8278c761ebb24770fc33391b76d63a8 Mon Sep 17 00:00:00 2001 From: Felix Lenner Date: Mon, 9 Mar 2026 18:33:37 +0100 Subject: [PATCH 5/6] Unused code --- genmod/vcf_tools/genotype.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/genmod/vcf_tools/genotype.py b/genmod/vcf_tools/genotype.py index 70b7443..ceb4b65 100755 --- a/genmod/vcf_tools/genotype.py +++ b/genmod/vcf_tools/genotype.py @@ -44,12 +44,11 @@ class Genotype(object): """Holds information about a genotype""" - def __init__(self, GT="./.", AD=".,.", DP="0", GQ="0", PL=None, **kwargs): + def __init__(self, GT="./.", AD=".,.", DP="0", GQ="0", **kwargs): GT = GT or "./." AD = AD or ".,." DP = DP or "0" GQ = GQ or "0" - PL = PL or None self.phased = "|" in GT self.separator = "|" if self.phased else "/" @@ -114,14 +113,6 @@ def __init__(self, GT="./.", AD=".,.", DP="0", GQ="0", PL=None, **kwargs): except ValueError: pass - # Check the genotype likelihoods - self.phred_likelihoods = [] - if PL: - try: - self.phred_likelihoods = [int(score) for score in PL.split(",")] - except ValueError: - pass - def __str__(self): """Specifies what will be printed when printing the object.""" return self.genotype From 2474d87e729fa678f3367b4a806fdedbde56afc9 Mon Sep 17 00:00:00 2001 From: Felix Lenner Date: Mon, 9 Mar 2026 18:44:40 +0100 Subject: [PATCH 6/6] now unused test --- tests/vcf_tools/test_genotype.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/vcf_tools/test_genotype.py b/tests/vcf_tools/test_genotype.py index 0e94b8e..2732f79 100755 --- a/tests/vcf_tools/test_genotype.py +++ b/tests/vcf_tools/test_genotype.py @@ -122,14 +122,6 @@ def test_bad_gq(): assert my_genotype.genotype_quality == 0 -def test_phred_likelihoods(): - """ - A normal heterozygote call, has_variant and heterozygote is true. - """ - my_genotype = Genotype(**{"GT": "0/1", "PL": "60,70,80"}) - assert my_genotype.phred_likelihoods == [60, 70, 80] - - def test_genotype_1_2(): """ A normal heterozygote call, has_variant and heterozygote is true.