Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,15 @@ Please add a new candidate release at the top after changing the latest one. Fee

Try to use the following format:

## [unreleased]
### Changed
- Changed to use PS-tag instead of intervals to check for variants in phase with `genmod models --phased` ([#197](https://github.com/Clinical-Genomics/genmod/pull/197))
- Refactored the implementation of `check_compounds()` to improve maintainability ([#197](https://github.com/Clinical-Genomics/genmod/pull/197))
### Fixed
- KeyError when running `genmod models --phased` with multiple individuals ([#197](https://github.com/Clinical-Genomics/genmod/pull/197))

## [3.10.2]
### Fixed
### Fixed
- Add scoring normalisation for flag lookup mode ([#177](https://github.com/Clinical-Genomics/genmod/pull/177))
- Fix crash on test for missing annotation key for phased mode ([#178](https://github.com/Clinical-Genomics/genmod/pull/178))
- Bugfixes related to multiprocessing stability and error handling ([#183](https://github.com/Clinical-Genomics/genmod/pull/183) and [#186](https://github.com/Clinical-Genomics/genmod/pull/186))
Expand All @@ -33,7 +40,7 @@ Try to use the following format:
- Fixed sorting of variants ([#152](https://github.com/Clinical-Genomics/genmod/pull/152))
- genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157))
- Fix linting issues ([#154](https://github.com/Clinical-Genomics/genmod/issues/154))
- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160))
- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160))

## [3.9]
- Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135))
Expand Down
3 changes: 1 addition & 2 deletions genmod/annotate_models/genetic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ def check_genetic_models(variant_batch, families, phased=False, strict=False):
# A variant batch is a dictionary on the form
# {variant_id:variant_dict, variant_2_id:variant_dict_2, ...}
logger = logging.getLogger(__name__)
intervals = variant_batch.pop("haploblocks", {})

# We check the genetic models for one family at a time
for family_id in families:
Expand Down Expand Up @@ -192,7 +191,7 @@ def check_genetic_models(variant_batch, families, phased=False, strict=False):
variant_1["inheritance_models"][family_id]["AR_comp"] = True
variant_2["inheritance_models"][family_id]["AR_comp"] = True
# We know from check_compound_candidates that all variants are present in all affected
elif check_compounds(variant_1, variant_2, family, intervals, phased):
elif check_compounds(variant_1, variant_2, family, phased):
parents_found = False
for individual_id in individuals:
individual = individuals[individual_id]
Expand Down
161 changes: 93 additions & 68 deletions genmod/annotate_models/models/compound_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@
from __future__ import print_function

import logging
from typing import Union

from genmod.vcf_tools.genotype import Genotype

def check_compounds(variant_1, variant_2, family, intervals, phased):

def check_compounds(variant_1: dict, variant_2: dict, family, phased: bool) -> bool:
"""
Check if two variants of a pair follow the compound heterozygous model.

Expand All @@ -34,90 +37,112 @@ def check_compounds(variant_1, variant_2, family, intervals, phased):
Args:
variant_1, variant_2: Variants in a potential compound pair
family: A family object with the individuals
intervals: A interval tree that describes the phased intervals
phased: A bool that tells if the individuals are phased

Returns:
bool: depending on if the pair follow the rules stated above

"""
# Check in all individuals what genotypes that are in the trio based of the
# individual picked.
logger = logging.getLogger(__name__)

for individual_id in family.individuals:
for individual_id, individual in family.individuals.items():
logger.debug("Check compounds for individual {0}".format(individual_id))
individual = family.individuals[individual_id]

genotype_1 = variant_1["genotypes"][individual_id]
genotype_2 = variant_2["genotypes"][individual_id]

# If the individual has parents we can check if the parents are healthy and have both variants, if they do they can not be a compound pair, since the variants could be on different alleles.
# The parents would then be carriers of the disease, which is not possible if the parents are healthy.
if individual.has_parents:
mother_id = individual.mother
father_id = individual.father

if mother_id != "0":
# mother_genotype_1 = variant_1['genotypes'][mother_id]
# mother_genotype_2 = variant_2['genotypes'][mother_id]
mother = family.individuals[mother_id]

if father_id != "0":
# father_genotype_1 = variant_1['genotypes'][father_id]
# father_genotype_2 = variant_2['genotypes'][father_id]
father = family.individuals[father_id]

if mother_id != "0" and mother.healthy:
if (
variant_1["genotypes"][mother_id].has_variant
and variant_2["genotypes"][mother_id].has_variant
):
return False

if father_id != "0" and father.healthy:
if (
variant_1["genotypes"][father_id].has_variant
and variant_2["genotypes"][father_id].has_variant
):
return False

# check if variants are in the same phased interval:

if phased:
variant_1_interval = intervals[individual_id].find_range(
[int(variant_1["POS"]), int(variant_1["POS"])]
)
variant_2_interval = intervals[individual_id].find_range(
[int(variant_2["POS"]), int(variant_2["POS"])]
)

# If phased a healthy individual can have both variants if they are on
# the same haploblock
# if not phased:

if individual.healthy:
if genotype_1.heterozygote and genotype_2.heterozygote:
return False
# The case where the individual is affected
# We know since ealier that all affected are heterozygotes
# for these variants
# So we only need to know if the variants are on the same phase
elif individual.affected:
# If the individual is sick and phased it has to have one variant on
# each allele
if phased:
# Variants need to be in the same phased interval, othervise we
# do not have any extra info
if variant_1_interval == variant_2_interval:
# If they are in the same interval they can not be on same
# allele
if (genotype_1.allele_1 == genotype_2.allele_1) or (
genotype_1.allele_2 == genotype_2.allele_2
for parent_id in (individual.mother, individual.father):
if parent_id != "0":
parent = family.individuals[parent_id]
parent_genotypes = [
get_genotype(variant, parent_id) for variant in (variant_1, variant_2)
]
if parent.healthy and all(
genotype.has_variant for genotype in parent_genotypes
):
return False

genotype_1 = get_genotype(variant_1, individual_id)
genotype_2 = get_genotype(variant_2, individual_id)

# If a healthy individual is not phased and has both variants it can not be a compound pair, since the variants could be on different alleles.
# The individual would then be a carrier of the disease, which is not possible if the individual is healthy.
if individual.healthy and (genotype_1.heterozygote and genotype_2.heterozygote):
return False

# If the individual is affected and phased we can say that the variants need to be on different alleles, otherwise it can not be a compound pair.
if (
individual.affected
and phased
and variants_on_same_allele(individual.individual_id, variant_1, variant_2)
):
return False

# If the individual is affected and not phased we can not say anything about the phase, so we say it is a compound pair, since it could be a compound pair.
return True


def get_genotype(variant: dict, individual_id: str) -> Genotype:
"""
Return the Genotype object for a variants for a given individual.

Args:
variant (dict): A dictionary representing a variant (with per-sample genotypes)
individual_id (str): Sample/individual ID

Returns:
Genotype: The genotype object for the individual at the given variant
"""
return variant["genotypes"][individual_id]


def get_phase_set(variant_dict: dict, sample_id: str) -> Union[str, None]:
"""
Extracts the PS (phase set) field for a given sample from a VCF-like variant dictionary.

Parameters:
variant_dict (dict): A dictionary representing a variant (with FORMAT and per-sample fields)
sample_id (str): The sample/individual ID to extract the PS for

Returns:
str or None: The PS value for the sample, or None if missing
"""
format_fields = variant_dict["FORMAT"].split(":")
sample_values = variant_dict.get(sample_id, "").split(":")
field_map = dict(zip(format_fields, sample_values))
return field_map.get("PS")


def variants_on_same_allele(individual_id: str, variant_1: dict, variant_2: dict) -> bool:
"""
Determine if two phased variants are on the same haplotype for a given individual.

This function assumes that both variants are phased and that the PS (phase set)
field is present in the genotype information. Two variants are considered to be
on the same allele if they belong to the same phase set and have at least one
allele in the same haplotype position (allele_1 with allele_1 or allele_2 with allele_2).

Parameters:
individual_id (str): ID of the individual/sample.
variant_1 (dict): First variant dictionary containing per-sample genotypes.
variant_2 (dict): Second variant dictionary containing per-sample genotypes.

Returns:
bool: True if the variants are on the same allele in the same phase set,
False otherwise.
"""
genotype_1 = get_genotype(variant_1, individual_id)
genotype_2 = get_genotype(variant_2, individual_id)

same_phase = get_phase_set(variant_1, individual_id) == get_phase_set(variant_2, individual_id)
overlapping_allele = (
genotype_1.allele_1 == genotype_2.allele_1 or genotype_1.allele_2 == genotype_2.allele_2
)

# Variants are on the same allele if they overlap in the same haplotype and are in the same phase set
return same_phase and overlapping_allele


def main():
pass

Expand Down
8 changes: 6 additions & 2 deletions genmod/commands/annotate_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@
@click.option(
"--vep", is_flag=True, help="If variants are annotated with the Variant Effect Predictor."
)
@click.option("--phased", is_flag=True, help="If data is phased use this flag.")
@click.option(
"--phased",
is_flag=True,
help="If data is phased use this flag. Requires PS (Phase set) genotype fields.",
)
@click.option(
"-s",
"--strict",
Expand All @@ -78,7 +82,7 @@
"-k",
"--keyword",
default="Annotation",
help="""What annotation keyword that should be used when
help="""What annotation keyword that should be used when
searching for features.""",
)
@outfile
Expand Down
138 changes: 138 additions & 0 deletions tests/genetic_models/test_compound_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from genmod.annotate_models.models.compound_model import (
check_compounds,
get_genotype,
get_phase_set,
variants_on_same_allele,
)
from genmod.vcf_tools import Genotype
from ped_parser import FamilyParser


def get_family(family_lines):
return FamilyParser(family_lines)


def make_variant(*, genotypes, format_str="GT:PS", sample_fields=None):
variant = {"genotypes": dict(genotypes), "FORMAT": format_str}
if sample_fields:
variant.update(sample_fields)
return variant


def test_get_genotype_returns_expected_object():
genotype = Genotype(**{"GT": "0/1"})
variant = {"genotypes": {"sample": genotype}}
assert get_genotype(variant, "sample") is genotype


def test_get_phase_set_returns_ps_when_present():
variant = {"FORMAT": "GT:AD:PS", "sample": "0/1:10,5:12345"}
assert get_phase_set(variant, "sample") == "12345"


def test_get_phase_set_returns_none_when_ps_missing_in_format():
variant = {"FORMAT": "GT:AD", "sample": "0/1:10,5"}
assert get_phase_set(variant, "sample") is None


def test_get_phase_set_returns_none_when_sample_missing_or_unset():
variant_missing_sample = {"FORMAT": "GT:PS"}
assert get_phase_set(variant_missing_sample, "sample") is None

variant_empty_sample = {"FORMAT": "GT:PS", "sample": ""}
assert get_phase_set(variant_empty_sample, "sample") is None

variant_short_sample = {"FORMAT": "GT:PS", "sample": "0|1"}
assert get_phase_set(variant_short_sample, "sample") is None


def test_variants_on_same_allele_true_same_ps_and_overlap():
variant_1 = make_variant(
genotypes={"sample": Genotype(**{"GT": "0|1"})},
sample_fields={"sample": "0|1:PS1"},
)
variant_2 = make_variant(
genotypes={"sample": Genotype(**{"GT": "0|1"})},
sample_fields={"sample": "0|1:PS1"},
)
assert variants_on_same_allele("sample", variant_1, variant_2) is True


def test_variants_on_same_allele_false_different_phase_set():
variant_1 = make_variant(
genotypes={"sample": Genotype(**{"GT": "0|1"})},
sample_fields={"sample": "0|1:1"},
)
variant_2 = make_variant(
genotypes={"sample": Genotype(**{"GT": "0|1"})},
sample_fields={"sample": "0|1:2"},
)
assert variants_on_same_allele("sample", variant_1, variant_2) is False


def test_variants_on_same_allele_false_same_phase_set_no_overlap():
variant_1 = make_variant(
genotypes={"sample": Genotype(**{"GT": "0|1"})},
sample_fields={"sample": "0|1:1"},
)
variant_2 = make_variant(
genotypes={"sample": Genotype(**{"GT": "1|0"})},
sample_fields={"sample": "1|0:1"},
)
assert variants_on_same_allele("sample", variant_1, variant_2) is False


def test_check_compounds_false_when_affected_phased_same_allele():
family_lines = [
"#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n",
"1\tchild\tfather\tmother\t1\t2\n",
"1\tfather\t0\t0\t1\t1\n",
"1\tmother\t0\t0\t2\t1\n",
]
family = get_family(family_lines)

variant_1 = make_variant(
genotypes={
"child": Genotype(**{"GT": "0|1"}),
"father": Genotype(**{"GT": "0|0"}),
"mother": Genotype(**{"GT": "0|0"}),
},
sample_fields={"child": "0|1:1"},
)
variant_2 = make_variant(
genotypes={
"child": Genotype(**{"GT": "0|1"}),
"father": Genotype(**{"GT": "0|0"}),
"mother": Genotype(**{"GT": "0|0"}),
},
sample_fields={"child": "0|1:1"},
)

assert check_compounds(variant_1, variant_2, family=family, phased=True) is False


def test_check_compounds_false_when_healthy_parent_has_both_variants():
family_lines = [
"#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n",
"1\tchild\tfather\tmother\t1\t2\n",
"1\tfather\t0\t0\t1\t1\n",
"1\tmother\t0\t0\t2\t1\n",
]
family = get_family(family_lines)

variant_1 = make_variant(
genotypes={
"child": Genotype(**{"GT": "0/1"}),
"father": Genotype(**{"GT": "0/1"}),
"mother": Genotype(**{"GT": "0/0"}),
}
)
variant_2 = make_variant(
genotypes={
"child": Genotype(**{"GT": "0/1"}),
"father": Genotype(**{"GT": "0/1"}),
"mother": Genotype(**{"GT": "0/0"}),
}
)

assert check_compounds(variant_1, variant_2, family=family, phased=False) is False