Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,13 @@ Please add a new candidate release at the top after changing the latest one. Fee

Try to use the following format:

## [unreleased]
### Changed
- Return phased genotype string from Genotype class if variant is phased ([#195](https://github.com/Clinical-Genomics/genmod/pull/195))
- Slightly more readable Genotype class ([#195](https://github.com/Clinical-Genomics/genmod/pull/195))

## [3.10.2]
### Fixed
### Fixed
- Add scoring normalisation for flag lookup mode ([#177](https://github.com/Clinical-Genomics/genmod/pull/177))
- Fix crash on test for missing annotation key for phased mode ([#178](https://github.com/Clinical-Genomics/genmod/pull/178))
- Bugfixes related to multiprocessing stability and error handling ([#183](https://github.com/Clinical-Genomics/genmod/pull/183) and [#186](https://github.com/Clinical-Genomics/genmod/pull/186))
Expand All @@ -33,7 +38,7 @@ Try to use the following format:
- Fixed sorting of variants ([#152](https://github.com/Clinical-Genomics/genmod/pull/152))
- genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157))
- Fix linting issues ([#154](https://github.com/Clinical-Genomics/genmod/issues/154))
- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160))
- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160))

## [3.9]
- Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135))
Expand Down
97 changes: 46 additions & 51 deletions genmod/vcf_tools/genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

This is a class with information about genotypecalls that follows the (GATK) .vcf standard.

The indata, that is the genotype call, is allways on the form x/x, so they look like 0/0, 1/2, 1/1 and so on.
The indata, that is the genotype call, is typically on the form x/x (unphased) or x|x (phased),
so they look like 0/0, 1/2, 1/1, 0|1 and so on.
The first sign inidcates what we find on the first allele, the second is a separator on the form '/' or '|' and the third indicates what is seen on the second allele.
The alleles are unordered.

Expand Down Expand Up @@ -43,81 +44,75 @@
class Genotype(object):
"""Holds information about a genotype"""

def __init__(self, **kwargs):
super(Genotype, self).__init__()
# These are the different genotypes:
GT = kwargs.get("GT", "./.")
AD = kwargs.get("AD", ".,.")
DP = kwargs.get("DP", "0")
GQ = kwargs.get("GQ", "0")
PL = kwargs.get("PL", None)
def __init__(self, GT="./.", AD=".,.", DP="0", GQ="0", **kwargs):
GT = GT or "./."
AD = AD or ".,."
DP = DP or "0"
GQ = GQ or "0"

self.phased = "|" in GT
self.separator = "|" if self.phased else "/"

# Split alleles safely
if self.separator in GT:
self.alleles = GT.split(self.separator)
else:
# Haploid calls (e.g. "0", "1") or missing (".")
self.alleles = [GT] if GT else ["."]
while len(self.alleles) < 2:
self.alleles.append(".")

self.allele_1 = self.alleles[0]
self.allele_2 = self.alleles[1]

# Flags
self.genotyped = GT not in ("./.", ".|.", ".")
self.heterozygote = False
self.allele_depth = False
self.homo_alt = False
self.homo_ref = False
self.has_variant = False
self.genotyped = False
self.phased = False
self.depth_of_coverage = 0
self.quality_depth = 0
self.genotype_quality = 0
# Check phasing
if "|" in GT:
self.phased = True
# Check the genotyping:
# This is the case when only one allele is present(eg. X-chromosome) and presented like '0' or '1':
if len(GT) < 3:
self.allele_1 = GT
self.allele_2 = "."
else:
self.allele_1 = GT[0]
self.allele_2 = GT[-1]
# The genotype should allways be represented on the same form
self.genotype = self.allele_1 + "/" + self.allele_2

if self.genotype != "./.":
self.genotyped = True
# Check allele status
if self.genotype in ["0/0", "./0", "0/."]:
self.genotype_quality = 0.0
self.genotype = self.separator.join(self.alleles)

# Check if a variant is homozygote reference, homozygote alternative or heterozygote
if self.genotyped:
if all(allele in ("0", ".") for allele in (self.allele_1, self.allele_2)):
self.homo_ref = True
# Treat e.g. ./1 or 1/. as heterozygous
elif self.allele_1 == self.allele_2:
self.homo_alt = True
self.has_variant = True
else:
self.heterozygote = True
self.has_variant = True
# Check the allele depth:
self.ref_depth = 0
self.alt_depth = 0

allele_depths = AD.split(",")
self.has_variant = self.homo_alt or self.heterozygote

if len(allele_depths) > 1:
if allele_depths[0].isdigit():
self.ref_depth = int(allele_depths[0])
if allele_depths[1].isdigit():
self.alt_depth = int(allele_depths[1])
# Parse allele depth
self.ref_depth = 0
self.alt_depth = 0
try:
allele_depths = [int(depth) if depth.isdigit() else 0 for depth in AD.split(",")]
self.ref_depth = allele_depths[0] if len(allele_depths) > 0 else 0
self.alt_depth = sum(allele_depths[1:]) if len(allele_depths) > 1 else 0
except Exception:
self.ref_depth = 0
self.alt_depth = 0

self.quality_depth = self.ref_depth + self.alt_depth

# Check the depth of coverage:
try:
self.depth_of_coverage = int(DP)
except ValueError:
pass

# Check the genotype quality
try:
self.genotype_quality = float(GQ)
except ValueError:
pass
# Check the genotype likelihoods
self.phred_likelihoods = []

if PL:
try:
self.phred_likelihoods = [int(score) for score in PL.split(",")]
except ValueError:
pass

def __str__(self):
"""Specifies what will be printed when printing the object."""
return self.allele_1 + "/" + self.allele_2
return self.genotype
30 changes: 20 additions & 10 deletions tests/vcf_tools/test_genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,31 @@ def test_nocall():
assert not my_nocall.genotyped


def test_nocall_phased():
"""
A nocall is when no informations is found on this position for the
individual. It should be False on all questions except nocall.
Also in the case of haploidity the result should be the same.
"""
my_nocall = Genotype(**{"GT": ".|."})
assert (
my_nocall.genotype == ".|."
) # We never need to look at the alleles since genotype is defined by 'allele_1/allele_2'
assert not my_nocall.heterozygote
assert not my_nocall.homo_ref
assert not my_nocall.homo_alt
assert not my_nocall.has_variant
assert not my_nocall.genotyped


def test_haploid_genotype():
"""
Test how genotype behaves with haploid call
"""
haploid_call = Genotype(**{"GT": "1"})
assert haploid_call.genotype == "1/."
# assert not haploid_call.heterozygote
assert haploid_call.heterozygote
# TODO: Add a good description on why this test contains these commented lines. Do we not care about haploid calls?
# assert not haploid_call.homo_ref
# assert haploid_call.homo_alt
# assert haploid_call.has_variant
Expand Down Expand Up @@ -104,14 +122,6 @@ def test_bad_gq():
assert my_genotype.genotype_quality == 0


def test_phred_likelihoods():
"""
A normal heterozygote call, has_variant and heterozygote is true.
"""
my_genotype = Genotype(**{"GT": "0/1", "PL": "60,70,80"})
assert my_genotype.phred_likelihoods == [60, 70, 80]


def test_genotype_1_2():
"""
A normal heterozygote call, has_variant and heterozygote is true.
Expand Down Expand Up @@ -174,7 +184,7 @@ def test_phased_data():
"""
my_genotype = Genotype(**{"GT": "1|0"})
assert (
my_genotype.genotype == "1/0"
my_genotype.genotype == "1|0"
) # If asked about the genotype, it should still be on the same form.
assert my_genotype.heterozygote
assert not my_genotype.homo_ref
Expand Down