diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bd1ee8..97eee79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,15 @@ Please add a new candidate release at the top after changing the latest one. Fee Try to use the following format: +## [unreleased] +### Changed +- Changed to use PS-tag instead of intervals to check for variants in phase with `genmod models --phased` ([#197](https://github.com/Clinical-Genomics/genmod/pull/197)) +- Refactored the implementation of `check_compounds()` to improve maintainability ([#197](https://github.com/Clinical-Genomics/genmod/pull/197)) +### Fixed +- KeyError when running `genmod models --phased` with multiple individuals ([#197](https://github.com/Clinical-Genomics/genmod/pull/197)) + ## [3.10.2] -### Fixed +### Fixed - Add scoring normalisation for flag lookup mode ([#177](https://github.com/Clinical-Genomics/genmod/pull/177)) - Fix crash on test for missing annotation key for phased mode ([#178](https://github.com/Clinical-Genomics/genmod/pull/178)) - Bugfixes related to multiprocessing stability and error handling ([#183](https://github.com/Clinical-Genomics/genmod/pull/183) and [#186](https://github.com/Clinical-Genomics/genmod/pull/186)) @@ -33,7 +40,7 @@ Try to use the following format: - Fixed sorting of variants ([#152](https://github.com/Clinical-Genomics/genmod/pull/152)) - genmod annotate for mitochondrial variants when using the `chrM` notation ([#157](https://github.com/Clinical-Genomics/genmod/pull/157)) - Fix linting issues ([#154](https://github.com/Clinical-Genomics/genmod/issues/154)) -- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160)) +- genmod models adds headers to VCF even if it contains no variants ([#160](https://github.com/Clinical-Genomics/genmod/pull/160)) ## [3.9] - Fixed wrong models when chromosome X was named `chrX` and not `X` ([#135](https://github.com/Clinical-Genomics/genmod/pull/135)) diff --git a/genmod/annotate_models/genetic_models.py b/genmod/annotate_models/genetic_models.py index 0dd9de7..8510a72 100755 --- a/genmod/annotate_models/genetic_models.py +++ b/genmod/annotate_models/genetic_models.py @@ -94,7 +94,6 @@ def check_genetic_models(variant_batch, families, phased=False, strict=False): # A variant batch is a dictionary on the form # {variant_id:variant_dict, variant_2_id:variant_dict_2, ...} logger = logging.getLogger(__name__) - intervals = variant_batch.pop("haploblocks", {}) # We check the genetic models for one family at a time for family_id in families: @@ -192,7 +191,7 @@ def check_genetic_models(variant_batch, families, phased=False, strict=False): variant_1["inheritance_models"][family_id]["AR_comp"] = True variant_2["inheritance_models"][family_id]["AR_comp"] = True # We know from check_compound_candidates that all variants are present in all affected - elif check_compounds(variant_1, variant_2, family, intervals, phased): + elif check_compounds(variant_1, variant_2, family, phased): parents_found = False for individual_id in individuals: individual = individuals[individual_id] diff --git a/genmod/annotate_models/models/compound_model.py b/genmod/annotate_models/models/compound_model.py index 204ec75..606f04e 100755 --- a/genmod/annotate_models/models/compound_model.py +++ b/genmod/annotate_models/models/compound_model.py @@ -13,9 +13,12 @@ from __future__ import print_function import logging +from typing import Union +from genmod.vcf_tools.genotype import Genotype -def check_compounds(variant_1, variant_2, family, intervals, phased): + +def check_compounds(variant_1: dict, variant_2: dict, family, phased: bool) -> bool: """ Check if two variants of a pair follow the compound heterozygous model. @@ -34,90 +37,112 @@ def check_compounds(variant_1, variant_2, family, intervals, phased): Args: variant_1, variant_2: Variants in a potential compound pair family: A family object with the individuals - intervals: A interval tree that describes the phased intervals phased: A bool that tells if the individuals are phased Returns: bool: depending on if the pair follow the rules stated above """ - # Check in all individuals what genotypes that are in the trio based of the - # individual picked. logger = logging.getLogger(__name__) - - for individual_id in family.individuals: + for individual_id, individual in family.individuals.items(): logger.debug("Check compounds for individual {0}".format(individual_id)) individual = family.individuals[individual_id] - genotype_1 = variant_1["genotypes"][individual_id] - genotype_2 = variant_2["genotypes"][individual_id] - + # If the individual has parents we can check if the parents are healthy and have both variants, if they do they can not be a compound pair, since the variants could be on different alleles. + # The parents would then be carriers of the disease, which is not possible if the parents are healthy. if individual.has_parents: - mother_id = individual.mother - father_id = individual.father - - if mother_id != "0": - # mother_genotype_1 = variant_1['genotypes'][mother_id] - # mother_genotype_2 = variant_2['genotypes'][mother_id] - mother = family.individuals[mother_id] - - if father_id != "0": - # father_genotype_1 = variant_1['genotypes'][father_id] - # father_genotype_2 = variant_2['genotypes'][father_id] - father = family.individuals[father_id] - - if mother_id != "0" and mother.healthy: - if ( - variant_1["genotypes"][mother_id].has_variant - and variant_2["genotypes"][mother_id].has_variant - ): - return False - - if father_id != "0" and father.healthy: - if ( - variant_1["genotypes"][father_id].has_variant - and variant_2["genotypes"][father_id].has_variant - ): - return False - - # check if variants are in the same phased interval: - - if phased: - variant_1_interval = intervals[individual_id].find_range( - [int(variant_1["POS"]), int(variant_1["POS"])] - ) - variant_2_interval = intervals[individual_id].find_range( - [int(variant_2["POS"]), int(variant_2["POS"])] - ) - - # If phased a healthy individual can have both variants if they are on - # the same haploblock - # if not phased: - - if individual.healthy: - if genotype_1.heterozygote and genotype_2.heterozygote: - return False - # The case where the individual is affected - # We know since ealier that all affected are heterozygotes - # for these variants - # So we only need to know if the variants are on the same phase - elif individual.affected: - # If the individual is sick and phased it has to have one variant on - # each allele - if phased: - # Variants need to be in the same phased interval, othervise we - # do not have any extra info - if variant_1_interval == variant_2_interval: - # If they are in the same interval they can not be on same - # allele - if (genotype_1.allele_1 == genotype_2.allele_1) or ( - genotype_1.allele_2 == genotype_2.allele_2 + for parent_id in (individual.mother, individual.father): + if parent_id != "0": + parent = family.individuals[parent_id] + parent_genotypes = [ + get_genotype(variant, parent_id) for variant in (variant_1, variant_2) + ] + if parent.healthy and all( + genotype.has_variant for genotype in parent_genotypes ): return False + genotype_1 = get_genotype(variant_1, individual_id) + genotype_2 = get_genotype(variant_2, individual_id) + + # If a healthy individual is not phased and has both variants it can not be a compound pair, since the variants could be on different alleles. + # The individual would then be a carrier of the disease, which is not possible if the individual is healthy. + if individual.healthy and (genotype_1.heterozygote and genotype_2.heterozygote): + return False + + # If the individual is affected and phased we can say that the variants need to be on different alleles, otherwise it can not be a compound pair. + if ( + individual.affected + and phased + and variants_on_same_allele(individual.individual_id, variant_1, variant_2) + ): + return False + + # If the individual is affected and not phased we can not say anything about the phase, so we say it is a compound pair, since it could be a compound pair. return True +def get_genotype(variant: dict, individual_id: str) -> Genotype: + """ + Return the Genotype object for a variants for a given individual. + + Args: + variant (dict): A dictionary representing a variant (with per-sample genotypes) + individual_id (str): Sample/individual ID + + Returns: + Genotype: The genotype object for the individual at the given variant + """ + return variant["genotypes"][individual_id] + + +def get_phase_set(variant_dict: dict, sample_id: str) -> Union[str, None]: + """ + Extracts the PS (phase set) field for a given sample from a VCF-like variant dictionary. + + Parameters: + variant_dict (dict): A dictionary representing a variant (with FORMAT and per-sample fields) + sample_id (str): The sample/individual ID to extract the PS for + + Returns: + str or None: The PS value for the sample, or None if missing + """ + format_fields = variant_dict["FORMAT"].split(":") + sample_values = variant_dict.get(sample_id, "").split(":") + field_map = dict(zip(format_fields, sample_values)) + return field_map.get("PS") + + +def variants_on_same_allele(individual_id: str, variant_1: dict, variant_2: dict) -> bool: + """ + Determine if two phased variants are on the same haplotype for a given individual. + + This function assumes that both variants are phased and that the PS (phase set) + field is present in the genotype information. Two variants are considered to be + on the same allele if they belong to the same phase set and have at least one + allele in the same haplotype position (allele_1 with allele_1 or allele_2 with allele_2). + + Parameters: + individual_id (str): ID of the individual/sample. + variant_1 (dict): First variant dictionary containing per-sample genotypes. + variant_2 (dict): Second variant dictionary containing per-sample genotypes. + + Returns: + bool: True if the variants are on the same allele in the same phase set, + False otherwise. + """ + genotype_1 = get_genotype(variant_1, individual_id) + genotype_2 = get_genotype(variant_2, individual_id) + + same_phase = get_phase_set(variant_1, individual_id) == get_phase_set(variant_2, individual_id) + overlapping_allele = ( + genotype_1.allele_1 == genotype_2.allele_1 or genotype_1.allele_2 == genotype_2.allele_2 + ) + + # Variants are on the same allele if they overlap in the same haplotype and are in the same phase set + return same_phase and overlapping_allele + + def main(): pass diff --git a/genmod/commands/annotate_models.py b/genmod/commands/annotate_models.py index 9e1009a..63b2f66 100755 --- a/genmod/commands/annotate_models.py +++ b/genmod/commands/annotate_models.py @@ -62,7 +62,11 @@ @click.option( "--vep", is_flag=True, help="If variants are annotated with the Variant Effect Predictor." ) -@click.option("--phased", is_flag=True, help="If data is phased use this flag.") +@click.option( + "--phased", + is_flag=True, + help="If data is phased use this flag. Requires PS (Phase set) genotype fields.", +) @click.option( "-s", "--strict", @@ -78,7 +82,7 @@ "-k", "--keyword", default="Annotation", - help="""What annotation keyword that should be used when + help="""What annotation keyword that should be used when searching for features.""", ) @outfile diff --git a/tests/genetic_models/test_compound_model.py b/tests/genetic_models/test_compound_model.py new file mode 100644 index 0000000..b9749fc --- /dev/null +++ b/tests/genetic_models/test_compound_model.py @@ -0,0 +1,138 @@ +from genmod.annotate_models.models.compound_model import ( + check_compounds, + get_genotype, + get_phase_set, + variants_on_same_allele, +) +from genmod.vcf_tools import Genotype +from ped_parser import FamilyParser + + +def get_family(family_lines): + return FamilyParser(family_lines) + + +def make_variant(*, genotypes, format_str="GT:PS", sample_fields=None): + variant = {"genotypes": dict(genotypes), "FORMAT": format_str} + if sample_fields: + variant.update(sample_fields) + return variant + + +def test_get_genotype_returns_expected_object(): + genotype = Genotype(**{"GT": "0/1"}) + variant = {"genotypes": {"sample": genotype}} + assert get_genotype(variant, "sample") is genotype + + +def test_get_phase_set_returns_ps_when_present(): + variant = {"FORMAT": "GT:AD:PS", "sample": "0/1:10,5:12345"} + assert get_phase_set(variant, "sample") == "12345" + + +def test_get_phase_set_returns_none_when_ps_missing_in_format(): + variant = {"FORMAT": "GT:AD", "sample": "0/1:10,5"} + assert get_phase_set(variant, "sample") is None + + +def test_get_phase_set_returns_none_when_sample_missing_or_unset(): + variant_missing_sample = {"FORMAT": "GT:PS"} + assert get_phase_set(variant_missing_sample, "sample") is None + + variant_empty_sample = {"FORMAT": "GT:PS", "sample": ""} + assert get_phase_set(variant_empty_sample, "sample") is None + + variant_short_sample = {"FORMAT": "GT:PS", "sample": "0|1"} + assert get_phase_set(variant_short_sample, "sample") is None + + +def test_variants_on_same_allele_true_same_ps_and_overlap(): + variant_1 = make_variant( + genotypes={"sample": Genotype(**{"GT": "0|1"})}, + sample_fields={"sample": "0|1:PS1"}, + ) + variant_2 = make_variant( + genotypes={"sample": Genotype(**{"GT": "0|1"})}, + sample_fields={"sample": "0|1:PS1"}, + ) + assert variants_on_same_allele("sample", variant_1, variant_2) is True + + +def test_variants_on_same_allele_false_different_phase_set(): + variant_1 = make_variant( + genotypes={"sample": Genotype(**{"GT": "0|1"})}, + sample_fields={"sample": "0|1:1"}, + ) + variant_2 = make_variant( + genotypes={"sample": Genotype(**{"GT": "0|1"})}, + sample_fields={"sample": "0|1:2"}, + ) + assert variants_on_same_allele("sample", variant_1, variant_2) is False + + +def test_variants_on_same_allele_false_same_phase_set_no_overlap(): + variant_1 = make_variant( + genotypes={"sample": Genotype(**{"GT": "0|1"})}, + sample_fields={"sample": "0|1:1"}, + ) + variant_2 = make_variant( + genotypes={"sample": Genotype(**{"GT": "1|0"})}, + sample_fields={"sample": "1|0:1"}, + ) + assert variants_on_same_allele("sample", variant_1, variant_2) is False + + +def test_check_compounds_false_when_affected_phased_same_allele(): + family_lines = [ + "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", + "1\tchild\tfather\tmother\t1\t2\n", + "1\tfather\t0\t0\t1\t1\n", + "1\tmother\t0\t0\t2\t1\n", + ] + family = get_family(family_lines) + + variant_1 = make_variant( + genotypes={ + "child": Genotype(**{"GT": "0|1"}), + "father": Genotype(**{"GT": "0|0"}), + "mother": Genotype(**{"GT": "0|0"}), + }, + sample_fields={"child": "0|1:1"}, + ) + variant_2 = make_variant( + genotypes={ + "child": Genotype(**{"GT": "0|1"}), + "father": Genotype(**{"GT": "0|0"}), + "mother": Genotype(**{"GT": "0|0"}), + }, + sample_fields={"child": "0|1:1"}, + ) + + assert check_compounds(variant_1, variant_2, family=family, phased=True) is False + + +def test_check_compounds_false_when_healthy_parent_has_both_variants(): + family_lines = [ + "#FamilyID\tSampleID\tFather\tMother\tSex\tPhenotype\n", + "1\tchild\tfather\tmother\t1\t2\n", + "1\tfather\t0\t0\t1\t1\n", + "1\tmother\t0\t0\t2\t1\n", + ] + family = get_family(family_lines) + + variant_1 = make_variant( + genotypes={ + "child": Genotype(**{"GT": "0/1"}), + "father": Genotype(**{"GT": "0/1"}), + "mother": Genotype(**{"GT": "0/0"}), + } + ) + variant_2 = make_variant( + genotypes={ + "child": Genotype(**{"GT": "0/1"}), + "father": Genotype(**{"GT": "0/1"}), + "mother": Genotype(**{"GT": "0/0"}), + } + ) + + assert check_compounds(variant_1, variant_2, family=family, phased=False) is False