diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a8fb33..1d94fa5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ Try to use the following format: ### Fixed - Add scoring normalisation for flag lookup mode ([#177](https://github.com/Clinical-Genomics/genmod/pull/177)) - Fix crash on test for missing annotation key for phased mode ([#178](https://github.com/Clinical-Genomics/genmod/pull/178)) +- Fix for deadlock in compound scoring, improved error logging ([#180](https://github.com/Clinical-Genomics/genmod/pull/180)) +### Changed +- Add option to allow rescoring of (compound) variants ([#180](https://github.com/Clinical-Genomics/genmod/pull/180)) ## [3.10.1] ### Fixed diff --git a/Makefile b/Makefile index 2e71c3f..7d6b2bd 100644 --- a/Makefile +++ b/Makefile @@ -20,5 +20,17 @@ test_dist: docker-build pip3 install dist/genmod*.tar.gz && \ uv run genmod -v annotate --annotate-regions --genome-build 37 examples/test_vcf.vcf" +test_rescore: docker-build + $(DOCKER) run -i -l genmod-test genmod/test -v -s -o log_cli=true -k test_rescore_with_annotation_suffix 2>&1 + +test_mivmir: docker-build + $(DOCKER) run -i -l genmod-test genmod/test -v -s -o log_cli=true -k test_mivmir_minimal_score_config 2>&1 + +build-export-singularity: + $(DOCKER) build -t genmod/hasta --force-rm=true --rm=true -f Dockerfile . + docker save genmod/hasta -o genmod.tar + singularity build -F genmod.sif docker-archive://genmod.tar + rm genmod.tar + docker-clean-images: docker system prune diff --git a/genmod/commands/score_compounds.py b/genmod/commands/score_compounds.py index ef15273..e856385 100755 --- a/genmod/commands/score_compounds.py +++ b/genmod/commands/score_compounds.py @@ -17,8 +17,9 @@ import sys from codecs import open from datetime import datetime -from multiprocessing import JoinableQueue, Manager, cpu_count, util +from multiprocessing import JoinableQueue, Manager, cpu_count, log_to_stderr, util from tempfile import NamedTemporaryFile +from time import sleep import click @@ -29,7 +30,8 @@ from .utils import get_file_handle, outfile, processes, silent, temp_dir, variant_file -logger = logging.getLogger(__name__) +logger = log_to_stderr(logging.INFO) +logging.basicConfig(stream=sys.stderr, force=True) util.abstract_sockets_supported = False @@ -49,9 +51,24 @@ default=9, ) @click.option("--penalty", type=int, help="Penalty applied together with --threshold", default=6) +@click.option( + "-s", + "--annotation_suffix", + default=None, + help="Target score with SUFFIX and append suffix to compound INFO fields (to not overwrite existing compound score entries).", +) @click.pass_context def compound( - context, variant_file, silent, outfile, vep, threshold: int, penalty: int, processes, temp_dir + context, + variant_file, + silent, + outfile, + vep, + threshold: int, + penalty: int, + annotation_suffix: str, + processes, + temp_dir, ): """ Score compound variants in a vcf file based on their rank score. @@ -75,6 +92,13 @@ def compound( else: break + # Setup INFO field name suffix + if annotation_suffix is None: + annotation_suffix: str = "" # i.e. add no suffix to INFO field name + else: + annotation_suffix: str = f"{annotation_suffix}" + logger.debug(f"Adding scoring suffix: {annotation_suffix}") + logger.info("Headers parsed") if not line.startswith("#"): @@ -88,7 +112,7 @@ def compound( add_metadata( head, "info", - "CompoundsNormalized", + "CompoundsNormalized" + annotation_suffix, annotation_number=".", entry_type="String", description="Rank score as provided by compound analysis, based on RankScoreNormalized. family_id:rank_score", @@ -119,6 +143,7 @@ def compound( individuals=individuals, threshold=threshold, penalty=penalty, + annotation_suffix=annotation_suffix, ) for i in range(num_scorers) ] @@ -160,6 +185,15 @@ def compound( for i in range(num_scorers): variant_queue.put(None) + # Before joining on variant_queue, check whether workers have completed + # or failed, to avoid main process deadlock on never-decreasing queue semaphore. + while any([worker.is_alive() for worker in compound_scorers]): + sleep(1) # Don't churn CPU + for worker in compound_scorers: + if not worker.is_alive() and worker.exitcode != 0: + raise RuntimeError(f"Worker {worker} failed") + logger.debug(f"Worker {worker} alive") + variant_queue.join() results.put(None) variant_printer.join() @@ -172,7 +206,7 @@ def compound( for line in f: print_variant(variant_line=line, outfile=outfile, mode="modified", silent=silent) except Exception as e: - logger.warning(e) + logger.error(e) for worker in compound_scorers: worker.terminate() variant_printer.terminate() diff --git a/genmod/commands/score_variants.py b/genmod/commands/score_variants.py index afa5f10..ed6c44c 100755 --- a/genmod/commands/score_variants.py +++ b/genmod/commands/score_variants.py @@ -66,6 +66,17 @@ @click.option( "-c", "--score_config", type=click.Path(exists=True), help="The plug-in config file(.ini)" ) +@click.option( + "--skip_is_previously_scored_check", + is_flag=True, + help="Allow rescoring of previously scored VCF", +) +@click.option( + "-s", + "--annotation_suffix", + default=None, + help="Append suffix to all INFO fields related to scoring (to not overwrite existing entries).", +) @click.pass_context def score( context, @@ -77,6 +88,8 @@ def score( silent, skip_plugin_check, rank_results, + skip_is_previously_scored_check, + annotation_suffix, outfile, ): """ @@ -137,6 +150,13 @@ def score( else: logger.info("All plugins are defined in vcf") + # Setup INFO field name suffix + if annotation_suffix is None: + annotation_suffix: str = "" # i.e. add no suffix to INFO field name + else: + annotation_suffix: str = f"{annotation_suffix}" + logger.debug(f"Adding scoring suffix: {annotation_suffix}") + csq_format = head.vep_columns # Add the first variant to the iterator if not line.startswith("#"): @@ -147,7 +167,7 @@ def score( header_line = head.header - if "RankScore" in head.info_dict: + if "RankScore" in head.info_dict and not skip_is_previously_scored_check: logger.warning("Variants already scored according to VCF header") logger.info("Please check VCF file") context.abort() @@ -156,7 +176,7 @@ def score( add_metadata( head, "info", - rank_score_type, + rank_score_type + annotation_suffix, annotation_number=".", entry_type="String", description=rank_score_description, @@ -165,7 +185,7 @@ def score( add_metadata( head, "info", - "RankScoreMinMax", + f"RankScoreMinMax{annotation_suffix}", annotation_number=".", entry_type="String", description="The rank score MIN-MAX bounds. family_id:min:max.", @@ -175,7 +195,7 @@ def score( add_metadata( head, "info", - "RankResult", + "RankResult" + annotation_suffix, annotation_number=".", entry_type="String", description="|".join(score_categories), @@ -221,19 +241,19 @@ def score( ) variant = add_vcf_info( - keyword="RankScore", + keyword="RankScore" + annotation_suffix, variant_dict=variant, annotation="{0}:{1}".format(family_id, rank_score), ) variant: dict = add_vcf_info( - keyword="RankScoreNormalized", + keyword=f"RankScoreNormalized{annotation_suffix}", variant_dict=variant, annotation="{0}:{1}".format(family_id, rank_score_normalized), ) variant: dict = add_vcf_info( - keyword="RankScoreMinMax", + keyword=f"RankScoreMinMax{annotation_suffix}", variant_dict=variant, annotation="{0}:{1}:{2}".format( family_id, category_scores_min, category_scores_max @@ -242,7 +262,9 @@ def score( if rank_results: variant = add_vcf_info( - keyword="RankResult", variant_dict=variant, annotation="|".join(category_scores) + keyword="RankResult" + annotation_suffix, + variant_dict=variant, + annotation="|".join(category_scores), ) print_variant( diff --git a/genmod/score_variants/compound_scorer.py b/genmod/score_variants/compound_scorer.py index 9448e21..d03ca65 100755 --- a/genmod/score_variants/compound_scorer.py +++ b/genmod/score_variants/compound_scorer.py @@ -14,7 +14,8 @@ from __future__ import division, print_function import logging -from multiprocessing import Process +import traceback +from multiprocessing import Process, log_to_stderr from typing import Dict, List, Tuple, Union from genmod.score_variants.cap_rank_score_to_min_bound import cap_rank_score_to_min_bound @@ -26,72 +27,7 @@ ) from genmod.vcf_tools import add_vcf_info, replace_vcf_info -logger = logging.getLogger(__name__) - - -def get_rank_score( - rank_score_type: str, - threshold: Union[int, float], - min_rank_score_value: float, - max_rank_score_value: float, -) -> Union[int, float]: - """ - Return raw rank score or normalized rank score. - - Args: - rank_score_type: Type according to RANK_SCORE_TYPE_NAMES - threshold: A rank score like value - min_rank_score_value: input to as_normalized_max_min - max_rank_score_value: input to as_normalized_max_min - Returns: - A rank score like value, possibly normalized - """ - if rank_score_type == "RankScore": - return threshold - elif rank_score_type == "RankScoreNormalized": - # Normalize raw rank score - return as_normalized_max_min( - score=float(threshold), - min_score_value=min_rank_score_value, - max_score_value=max_rank_score_value, - ) - raise ValueError("Unknown RANK_SCORE_TYPE_NAMES config", rank_score_type) - - -def get_rank_score_as_magnitude( - rank_score_type: str, - rank_score: Union[int, float], - min_rank_score_value: float, - max_rank_score_value: float, -) -> float: - """ - Returns rank score as a magnitude (delta), to make the rank score - suitable for addition/subtraction operations. - - In case of raw rank score, return the input value as the magnitude. - - In case of normalized rank score, return the input value as the - percentage of the rank score dynamic range. - - Args: - rank_score_type: Type according to RANK_SCORE_TYPE_NAMES - rank_score: A rank score like value, to be used in an addition/subtraction operation - min_rank_score_value: input to as_normalized_max_min - max_rank_score_value: input to as_normalized_max_min - Returns: - A value, magnitude, compatible with raw or normalized rank score values - - """ - if rank_score_type == "RankScore": - return rank_score - elif rank_score_type == "RankScoreNormalized": - normalized_rank_score: float = rank_score / (max_rank_score_value - min_rank_score_value) - if not (MIN_SCORE_NORMALIZED <= normalized_rank_score <= MAX_SCORE_NORMALIZED): - raise ValueError( - f"Failed to normalize to within expected bounds {normalized_rank_score}" - ) - return normalized_rank_score - raise ValueError(f"Unknown rank score type {rank_score_type}") +logger = log_to_stderr(level=logging.INFO) class CompoundScorer(Process): @@ -100,7 +36,15 @@ class CompoundScorer(Process): the results queue. """ - def __init__(self, task_queue, results_queue, individuals, threshold: int, penalty: int): + def __init__( + self, + task_queue, + results_queue, + individuals, + threshold: int, + penalty: int, + annotation_suffix: str, + ): """ Initialize the VariantAnnotator @@ -112,6 +56,10 @@ def __init__(self, task_queue, results_queue, individuals, threshold: int, penal task_queue (Queue) results_queue (Queue) individuals (list) + annotation_suffix: Suffix to target and append to INFO annotations. + If this is used, the Compound Scorer will also target a RankScore[SUFFIX] score + in contrast to the per-default RankScore annotation. I.e. this step has a + pre-requisite on generating variant scores with SUFFIX in a previous step. """ Process.__init__(self) @@ -136,8 +84,87 @@ def __init__(self, task_queue, results_queue, individuals, threshold: int, penal else: self.models = ["AR_comp", "AR_comp_dn"] - @staticmethod - def _get_rankscore_normalization_bounds(variant_batch: Dict[str, Dict]) -> Dict[str, Tuple]: + self._annotation_suffix = annotation_suffix + # Create RankScore names to load during compound score computation and rank score adjustment + # Treat the suffix name no different than any other entry in RANK_SCORE_TYPE_NAMES + self._rank_score_type_names = [ + f"{name}{self._annotation_suffix}" for name in RANK_SCORE_TYPE_NAMES + ] + logger.debug(f"Rank score type names: {self._rank_score_type_names}") + + def get_rank_score( + self, + rank_score_type: str, + threshold: Union[int, float], + min_rank_score_value: float, + max_rank_score_value: float, + ) -> Union[int, float]: + """ + Return raw rank score or normalized rank score. + + Args: + rank_score_type: Type according to RANK_SCORE_TYPE_NAMES + threshold: A rank score like value + min_rank_score_value: input to as_normalized_max_min + max_rank_score_value: input to as_normalized_max_min + Returns: + A rank score like value, possibly normalized + """ + if rank_score_type == f"RankScore{self._annotation_suffix}": + return threshold + elif rank_score_type == f"RankScoreNormalized{self._annotation_suffix}": + # Normalize raw rank score + return as_normalized_max_min( + score=float(threshold), + min_score_value=min_rank_score_value, + max_score_value=max_rank_score_value, + ) + raise ValueError( + f"Unknown RANK_SCORE_TYPE_NAMES config {rank_score_type}" + + f"expected any in {self._rank_score_type_names}" + ) + + def get_rank_score_as_magnitude( + self, + rank_score_type: str, + rank_score: Union[int, float], + min_rank_score_value: float, + max_rank_score_value: float, + ) -> float: + """ + Returns rank score as a magnitude (delta), to make the rank score + suitable for addition/subtraction operations. + + In case of raw rank score, return the input value as the magnitude. + + In case of normalized rank score, return the input value as the + percentage of the rank score dynamic range. + + Args: + rank_score_type: Type according to RANK_SCORE_TYPE_NAMES + rank_score: A rank score like value, to be used in an addition/subtraction operation + min_rank_score_value: input to as_normalized_max_min + max_rank_score_value: input to as_normalized_max_min + Returns: + A value, magnitude, compatible with raw or normalized rank score values + + """ + if rank_score_type == f"RankScore{self._annotation_suffix}": + return rank_score + elif rank_score_type == f"RankScoreNormalized{self._annotation_suffix}": + normalized_rank_score: float = rank_score / ( + max_rank_score_value - min_rank_score_value + ) + if not (MIN_SCORE_NORMALIZED <= normalized_rank_score <= MAX_SCORE_NORMALIZED): + raise ValueError( + f"Failed to normalize to within expected bounds {normalized_rank_score}" + ) + return normalized_rank_score + raise ValueError(f"Unknown rank score type {rank_score_type}") + + def _get_rankscore_normalization_bounds( + self, variant_batch: Dict[str, Dict] + ) -> Dict[str, Tuple]: """ For all variants in a variant batch, find the rank score normalization min-max bounds. @@ -148,7 +175,7 @@ def _get_rankscore_normalization_bounds(variant_batch: Dict[str, Dict]) -> Dict[ variant_rankscore_normalization_bounds: dict = {} for variant_id in variant_batch: entry_minmax: List[str] = variant_batch[variant_id]["info_dict"][ - "RankScoreMinMax" + f"RankScoreMinMax{self._annotation_suffix}" ].split(":") rankscore_normalization_min_max: tuple = ( float(entry_minmax[1]), @@ -169,7 +196,7 @@ def _get_rankscore_normalization_bounds(variant_batch: Dict[str, Dict]) -> Dict[ ) return variant_rankscore_normalization_bounds - def run(self): + def _run(self): """Run the consuming""" logger.info("%s: Starting!" % self.proc_name) # Check if there are any batches in the queue @@ -188,7 +215,7 @@ def run(self): # This is a dictionary on the form {'variant_id: rank_score} rank_scores = {} # We are now going to score the compounds - for rank_score_type in RANK_SCORE_TYPE_NAMES: + for rank_score_type in self._rank_score_type_names: # Prepare rank_scores dict to contain raw RankScore and RankScoreNormalized compound scores rank_scores.update({rank_score_type: dict()}) @@ -219,7 +246,7 @@ def run(self): # we want to pennalise the score if the compounds have low scores variant = variant_batch[variant_id] raw_compounds = variant["info_dict"].get("Compounds", None) - for rank_score_type in RANK_SCORE_TYPE_NAMES: + for rank_score_type in self._rank_score_type_names: if raw_compounds: logger.debug("Scoring compound for variant %s" % variant_id) # Variable to see if we should correct the rank score @@ -256,7 +283,7 @@ def run(self): # Loop through compounds to check if they are only low scored for compound_id in compound_list: compound_rank_score = rank_scores[rank_score_type][compound_id] - if compound_rank_score > get_rank_score( + if compound_rank_score > self.get_rank_score( rank_score_type=rank_score_type, threshold=self.threshold, min_rank_score_value=variant_rankscore_normalization_bounds[ @@ -271,7 +298,7 @@ def run(self): if correct_score and only_low: logger.debug("correcting rank score for {0}".format(variant_id)) - current_rank_score -= get_rank_score_as_magnitude( + current_rank_score -= self.get_rank_score_as_magnitude( rank_score_type=rank_score_type, rank_score=self.penalty, min_rank_score_value=variant_rankscore_normalization_bounds[ @@ -284,7 +311,10 @@ def run(self): # In case the current_rank_score falls outside normalization bounds after modification, # cap it to within the MIN normalization bound. current_rank_score = cap_rank_score_to_min_bound( - rank_score_type=rank_score_type, + # Strip suffix in rank_score_type to adhere to default naming convention + rank_score_type=rank_score_type.replace( + self._annotation_suffix, "" + ), rank_score=current_rank_score, min_rank_score_value=variant_rankscore_normalization_bounds[ variant_id @@ -316,9 +346,9 @@ def run(self): # variant['info_dict']['IndividualRankScore'] = current_rank_score_string variant["info_dict"][f"{rank_score_type}"] = new_rank_score_string - variant["info_dict"][f"Compounds{rank_score_type.strip('RankScore')}"] = ( - new_compound_string - ) + variant["info_dict"][ + f"Compounds{rank_score_type.replace('RankScore', '')}" + ] = new_compound_string variant = replace_vcf_info( keyword=f"{rank_score_type}", @@ -328,7 +358,7 @@ def run(self): # CompoundsNormalized is not previously added to VCF. # For this case, perform an VCF INFO ADD operation, rather than a REPLACE - keyword_compounds = f"Compounds{rank_score_type.strip('RankScore')}" + keyword_compounds = f"Compounds{rank_score_type.replace('RankScore', '')}" fn_add_replace_vcf_info = replace_vcf_info if not ( keyword_compounds in variant["INFO"] @@ -349,3 +379,11 @@ def run(self): self.task_queue.task_done() return + + def run(self, *args, **kwargs): + # Wrapper for catching errors in main method + try: + return self._run(*args, **kwargs) + except Exception as e: + logger.fatal(f"{e}:{traceback.format_exc()}") + raise e diff --git a/genmod/utils/variant_printer.py b/genmod/utils/variant_printer.py index eec4b83..f8dbf24 100755 --- a/genmod/utils/variant_printer.py +++ b/genmod/utils/variant_printer.py @@ -12,12 +12,16 @@ from __future__ import print_function import logging +import sys from codecs import open -from multiprocessing import Process +from multiprocessing import Process, log_to_stderr from genmod.utils import get_chromosome_priority, get_rank_score from genmod.vcf_tools import print_variant +logger = log_to_stderr(logging.INFO) +logging.basicConfig(stream=sys.stderr, force=True) + class VariantPrinter(Process): """ @@ -45,7 +49,7 @@ class VariantPrinter(Process): def __init__(self, task_queue, head, mode="chromosome", outfile=None, silent=False): Process.__init__(self) - self.logger = logging.getLogger(__name__) + self.logger = logger self.task_queue = task_queue self.outfile = outfile self.header = head.header diff --git a/tests/fixtures/score_variants/rank_model_-v1.39-mivmir.ini b/tests/fixtures/score_variants/rank_model_-v1.39-mivmir.ini new file mode 100644 index 0000000..be95531 --- /dev/null +++ b/tests/fixtures/score_variants/rank_model_-v1.39-mivmir.ini @@ -0,0 +1,155 @@ +[Version] + version = 1.39 + name = rank_model_mivmir + +[Categories] + [[allele_frequency]] + category_aggregation = min + + [[inheritance_models]] + category_aggregation = min + + [[variant_call_quality_filter]] + category_aggregation = sum + +[model_score] + category = variant_call_quality_filter + data_type = integer + description = Inheritance model score + field = INFO + info_key = ModelScore + record_rule = min + separators = ',',':', + + [[not_reported]] + score = 0 + + [[low_qual]] + score = -5 + lower = 0 + upper = 10 + + [[medium_qual]] + score = -2 + lower = 10 + upper = 20 + + [[high_qual]] + score = 0 + lower = 20 + upper = 300 + +[mtaf] + category = allele_frequency + data_type = float + description = genbank MT frequency + field = INFO + info_key = MTAF + record_rule = max + separators = ',', + + [[not_reported]] + score = 4 + + [[common]] + score = -12 + lower = 0.02 + upper = 1.1 + + [[intermediate]] + score = 1 + lower = 0.005 + upper = 0.02 + + [[rare]] + score = 2 + lower = 0.0005 + upper = 0.005 + + [[very_rare]] + score = 3 + lower = 0 + upper = 0.0005 + +[genetic_models] + category = inheritance_models + data_type = string + description = Inheritance models followed for the variant + field = INFO + info_key = GeneticModels + record_rule = max + separators = ',', ':', '|', + + [[ad]] + priority = 1 + score = 1 + string = 'AD' + + [[ad_dn]] + score = 1 + priority = 1 + string = 'AD_dn' + + [[ar]] + score = 1 + priority = 1 + string = 'AR_hom' + + [[ar_dn]] + score = 1 + priority = 1 + string = 'AR_hom_dn' + + [[ar_comp]] + score = 1 + priority = 1 + string = 'AR_comp' + + [[ar_comp_dn]] + score = 1 + priority = 1 + string = 'AR_comp_dn' + + [[xr]] + score = 1 + priority = 1 + string = 'XR' + + [[xr_dn]] + score = 1 + priority = 1 + string = 'XR_dn' + + [[xd]] + score = 1 + priority = 1 + string = 'XD' + + [[xd_dn]] + score = 1 + priority = 1 + string = 'XD_dn' + + [[not_reported]] + score = -12 + +[filter] + category = variant_call_quality_filter + data_type = string + description = The filters for the variant + field = FILTER + record_rule = min + separators = ';', + + [[not_reported]] + score = 0 + + [[pass]] + score = 3 + priority = 1 + string = 'PASS' + + [[dot]] + score = 3 + priority = 2 + string = '.' diff --git a/tests/fixtures/test_vcf_annotated_scored.vcf b/tests/fixtures/test_vcf_annotated_scored.vcf index 394ae90..589d318 100644 --- a/tests/fixtures/test_vcf_annotated_scored.vcf +++ b/tests/fixtures/test_vcf_annotated_scored.vcf @@ -5,23 +5,26 @@ ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= +##INFO= +##INFO= +##INFO= ##contig= ##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta ##Software= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT father mother proband father_2 mother_2 proband_2 -1 879537 . T C 100 PASS MQ=1;GeneticModels=1:AR_hom;ModelScore=1:55;Annotation=SAMD11;CADD=1.248;1000GAF=0.000199681;RankScore=1:6.0 GT:AD:GQ 0/1:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 -1 879541 . G A 100 PASS MQ=1;GeneticModels=1:AR_hom|AR_hom_dn;ModelScore=1:57;Annotation=SAMD11;CADD=4.003;1000GAF=0.000599042;RankScore=1:6.0 GT:AD:GQ ./. 0/1:10,10:60 1/1:10,10:60 ./. 0/1:10,10:60 0/1:10,10:60 -1 879595 . C T 100 PASS MQ=1;GeneticModels=1:AR_hom_dn;ModelScore=1:55;Annotation=NOC2L,SAMD11;CADD=8.271;1000GAF=0.000399361;RankScore=1:6.0 GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 1/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 -1 879676 . G A 100 PASS MQ=1;Annotation=NOC2L,SAMD11;CADD=7.019;1000GAF=0.885982;RankScore=1:-23.0 GT:AD:GQ 0/1:10,10:60 1/1:10,10:60 1/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 -1 879911 . G A 100 PASS MQ=1;Compounds=1:1_880086_T_C|1_880012_A_G;GeneticModels=1:AR_comp|AR_comp_dn;ModelScore=1:55;Annotation=NOC2L,SAMD11;CADD=4.408;1000GAF=0.00998403;RankScore=1:6.0 GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 -1 880012 . A G 100 PASS MQ=1;Compounds=1:1_880086_T_C|1_879911_G_A;GeneticModels=1:AR_comp|AR_comp_dn;ModelScore=1:55;Annotation=NOC2L;CADD=3.326;1000GAF=0.00119808;RankScore=1:6.0 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 -1 880086 . T C 100 PASS MQ=1;Compounds=1:1_880012_A_G|1_879911_G_A;GeneticModels=1:AR_comp_dn|AD_dn;ModelScore=1:55;Annotation=NOC2L;CADD=0.091;1000GAF=0.000399361;RankScore=1:6.0 GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 -1 880199 . G A 100 PASS MQ=1;GeneticModels=1:AD_dn;ModelScore=1:55;Annotation=NOC2L;CADD=3.450;RankScore=1:7.0 GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 -1 880217 . T G 100 PASS MQ=1;GeneticModels=1:AD_dn;ModelScore=1:55;Annotation=NOC2L;CADD=4.208;RankScore=1:7.0 GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 -10 76154051 . A G 100 PASS MQ=1;Compounds=1:10_76154073_T_G;GeneticModels=1:AR_comp_dn;ModelScore=1:55;Annotation=ADK;CADD=9.261;1000GAF=0.000199681;RankScore=1:6.0 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 -10 76154073 . T G 100 PASS MQ=1;Compounds=1:10_76154051_A_G;GeneticModels=1:AR_comp_dn|AD_dn;ModelScore=1:55;Annotation=ADK;CADD=22.4;RankScore=1:9.0 GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 -10 76154074 . C G 100 PASS MQ=1;Annotation=ADK;CADD=8.374;RankScore=1:-8.0 GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 -10 76154076 . G C 100 PASS MQ=1;GeneticModels=1:AD|AD_dn;ModelScore=1:57;Annotation=ADK;CADD=11.65;RankScore=1:8.0 GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60 -X 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;GeneticModels=1:XD|XR;ModelScore=1:55;Annotation=PPP2R3B;1000GAF=0.04;RankScore=1:4.0 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 -MT 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;GeneticModels=1:AR_hom_dn;ModelScore=1:55;RankScore=1:6.0 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 +1 879537 . T C 100 PASS MQ=1;GeneticModels=1:AR_hom;ModelScore=1:55;Annotation=SAMD11;CADD=1.248;1000GAF=0.000199681;RankScore=1:6.0;RankScoreNormalized=1:0.8378378378378378;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/1:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 +1 879541 . G A 100 PASS MQ=1;GeneticModels=1:AR_hom|AR_hom_dn;ModelScore=1:57;Annotation=SAMD11;CADD=4.003;1000GAF=0.000599042;RankScore=1:6.0;RankScoreNormalized=1:0.8378378378378378;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ ./. 0/1:10,10:60 1/1:10,10:60 ./. 0/1:10,10:60 0/1:10,10:60 +1 879595 . C T 100 PASS MQ=1;GeneticModels=1:AR_hom_dn;ModelScore=1:55;Annotation=NOC2L,SAMD11;CADD=8.271;1000GAF=0.000399361;RankScore=1:6.0;RankScoreNormalized=1:0.8378378378378378;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 1/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 +1 879676 . G A 100 PASS MQ=1;Annotation=NOC2L,SAMD11;CADD=7.019;1000GAF=0.885982;RankScore=1:-23.0;RankScoreNormalized=1:0.05405405405405406;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/1:10,10:60 1/1:10,10:60 1/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 +1 879911 . G A 100 PASS MQ=1;Compounds=1:1_880086_T_C|1_880012_A_G;GeneticModels=1:AR_comp|AR_comp_dn;ModelScore=1:55;Annotation=NOC2L,SAMD11;CADD=4.408;1000GAF=0.00998403;RankScore=1:6.0;RankScoreNormalized=1:0.8378378378378378;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 +1 880012 . A G 100 PASS MQ=1;Compounds=1:1_880086_T_C|1_879911_G_A;GeneticModels=1:AR_comp|AR_comp_dn;ModelScore=1:55;Annotation=NOC2L;CADD=3.326;1000GAF=0.00119808;RankScore=1:6.0;RankScoreNormalized=1:0.8378378378378378;RankScoreMinMax=1:-25.0:12.0GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 +1 880086 . T C 100 PASS MQ=1;Compounds=1:1_880012_A_G|1_879911_G_A;GeneticModels=1:AR_comp_dn|AD_dn;ModelScore=1:55;Annotation=NOC2L;CADD=0.091;1000GAF=0.000399361;RankScore=1:6.0;RankScoreNormalized=1:0.8378378378378378;RankScoreMinMax=1:-25.0:12.0GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 +1 880199 . G A 100 PASS MQ=1;GeneticModels=1:AD_dn;ModelScore=1:55;Annotation=NOC2L;CADD=3.450;RankScore=1:7.0;RankScoreNormalized=1:0.8648648648648649;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 +1 880217 . T G 100 PASS MQ=1;GeneticModels=1:AD_dn;ModelScore=1:55;Annotation=NOC2L;CADD=4.208;RankScore=1:7.0;RankScoreNormalized=1:0.8648648648648649;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 +10 76154051 . A G 100 PASS MQ=1;Compounds=1:10_76154073_T_G;GeneticModels=1:AR_comp_dn;ModelScore=1:55;Annotation=ADK;CADD=9.261;1000GAF=0.000199681;RankScore=1:6.0;RankScoreNormalized=1:0.8378378378378378;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 0/0:10,10:60 0/1:10,10:60 0/1:10,10:60 +10 76154073 . T G 100 PASS MQ=1;Compounds=1:10_76154051_A_G;GeneticModels=1:AR_comp_dn|AD_dn;ModelScore=1:55;Annotation=ADK;CADD=22.4;RankScore=1:9.0;RankScoreNormalized=1:0.918918918918919;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 0/0:10,10:60 0/0:10,10:60 0/1:10,10:60 +10 76154074 . C G 100 PASS MQ=1;Annotation=ADK;CADD=8.374;RankScore=1:-8.0;RankScoreNormalized=1:0.4594594594594595;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ ./. 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 0/1:10,10:60 +10 76154076 . G C 100 PASS MQ=1;GeneticModels=1:AD|AD_dn;ModelScore=1:57;Annotation=ADK;CADD=11.65;RankScore=1:8.0;RankScoreNormalized=1:0.8918918918918919;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ ./. 0/0:10,10:60 0/1:10,10:60 ./. 0/0:10,10:60 0/1:10,10:60 +X 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;GeneticModels=1:XD|XR;ModelScore=1:55;Annotation=PPP2R3B;1000GAF=0.04;RankScore=1:4.0;RankScoreNormalized=1:0.7837837837837838;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 +MT 302253 . CCCTCCTGCCCCT C 100 PASS MQ=1;GeneticModels=1:AR_hom_dn;ModelScore=1:55;RankScore=1:6.0;RankScoreNormalized=1:0.8378378378378378;RankScoreMinMax=1:-25.0:12.0 GT:AD:GQ 0/0:10,10:60 0/1:10,10:60 1/1:10,10:60 0/0:10,10:60 1/1:10,10:60 1/1:10,10:60 diff --git a/tests/rescoring/test_rescoring_functionality.py b/tests/rescoring/test_rescoring_functionality.py new file mode 100644 index 0000000..9c75d97 --- /dev/null +++ b/tests/rescoring/test_rescoring_functionality.py @@ -0,0 +1,153 @@ +from tempfile import NamedTemporaryFile +from typing import Dict, List + +from click.testing import CliRunner +from genmod.commands import score_command, score_compounds_command +from genmod.vcf_tools import HeaderParser, get_info_dict, get_variant_dict, get_variant_id + +SCORED_VCF = "tests/fixtures/test_vcf_annotated_scored.vcf" +SCORE_CONFIG = "tests/fixtures/score_variants/genmod_example.ini" + + +def _flatten_dict(d: Dict) -> Dict: + """ + Flatten dictionary + """ + output = d.copy() + for key, value in d.items(): + if isinstance(value, dict): + output = d | value + return output + + +def _yield_variants_from_vcf(vcf_file_path: str): + head = HeaderParser() + # Parse file header + with open(vcf_file_path, "r") as fp: + for line in fp: + line = line.rstrip() + if line.startswith("#"): + if line.startswith("##"): + head.parse_meta_data(line) + else: + head.parse_header_line(line) + else: + break + # Yield variants + with open(vcf_file_path, "r") as fp: + try: + for line in fp: + if line.startswith("#"): + continue + variant = get_variant_dict(line, head.header) + variant_id = get_variant_id(variant) + variant["variant_id"] = variant_id + variant["info_dict"] = get_info_dict(variant["INFO"]) + variant = _flatten_dict(variant) + yield variant + except Exception as e: + print(f"Failed parsing VCF line; '{line}'") + raise e + + +def _sort_variants_on_id(variants: List[Dict]) -> List[Dict]: + return sorted(variants, key=lambda variant: variant["variant_id"]) + + +def test_rescore_with_annotation_suffix(): + """ + Test for asserting secondary Genmod annotation is present in VCF. + This is intended for secondary scoring using a different rank config, + and where score annotations are added with a suffix appended. + + Applies only to: + - Variant scoring + - Compound scoring + remaining annotations, models etc are out of scope for this test. + + Expected: + - All previously existing GENMOD annotations are present and unaltered + - New scoring annotations are added that's based on an additional scoring config + """ + SUFFIX = "Suffix" + + # Variant Scoring with annotation suffix + runner = CliRunner() + result = runner.invoke( + score_command, + [ + SCORED_VCF, + "-c", + SCORE_CONFIG, + "--skip_is_previously_scored_check", + "--annotation_suffix", + SUFFIX, + ], + ) + assert result.exit_code == 0 + rescored_variants = NamedTemporaryFile(suffix=".vcf") + with open(rescored_variants.name, "w") as file: + file.write(result.stdout_bytes.decode("utf-8")) # Save processed VCF to file + + # Compound Scoring with annotation suffix + runner = CliRunner() + # WHEN computing compound score + result_compounds = runner.invoke( + score_compounds_command, + [rescored_variants.name, "--annotation_suffix", SUFFIX], + ) + assert result_compounds.exit_code == 0 + rescored_variants_with_compounds = NamedTemporaryFile(suffix=".vcf") + with open(rescored_variants_with_compounds.name, "w") as file: + file.write(result_compounds.stdout_bytes.decode("utf-8")) # Save processed VCF to file + + # WHEN comparing the variants pre and post suffix annotation + original_variants = [] + for variant in _yield_variants_from_vcf(SCORED_VCF): + original_variants.append(variant) + original_variants = _sort_variants_on_id(original_variants) + + rescored_variants = [] + for variant in _yield_variants_from_vcf(rescored_variants_with_compounds.name): + rescored_variants.append(variant) + rescored_variants = _sort_variants_on_id(rescored_variants) + + assert len(original_variants) == len(rescored_variants), ( + len(original_variants), + len(rescored_variants), + ) + + unaltered_annotations = ["RankScore", "RankScoreNormalized", "RankScoreMinMax", "Compounds"] + added_annotations = [f"{name}{SUFFIX}" for name in unaltered_annotations] + + # THEN expect that the previously annotated information is unchanged ... + for unalted_annotation in unaltered_annotations: + for original_variant, rescored_variant in zip(original_variants, rescored_variants): + # Compounds annotation not present for all variants (continue if missing) + if ( + unalted_annotation == "Compounds" + and unalted_annotation not in original_variant.keys() + ): + continue + assert original_variant[unalted_annotation] == rescored_variant[unalted_annotation], ( + original_variant["variant_id"], + rescored_variant["variant_id"], + unalted_annotation, + original_variant[unalted_annotation], + rescored_variant[unalted_annotation], + ) + + # THEN expect that new annotations are added + for added_annotation in added_annotations: + for original_variant, rescored_variant in zip(original_variants, rescored_variants): + # Compounds annotation not present for all variants (continue if missing) + if ( + added_annotation.replace(SUFFIX, "") == "Compounds" + and added_annotation not in original_variant.keys() + ): + continue + assert added_annotation not in original_variant.keys(), ( + added_annotation, + original_variant, + ) + assert added_annotation in rescored_variant.keys(), (added_annotation, rescored_variant) diff --git a/tests/score_variants/test_minimal_mivmir_score_config.py b/tests/score_variants/test_minimal_mivmir_score_config.py new file mode 100644 index 0000000..318ba39 --- /dev/null +++ b/tests/score_variants/test_minimal_mivmir_score_config.py @@ -0,0 +1,30 @@ +import pytest as pt + +from tempfile import NamedTemporaryFile +from typing import Dict, List + +from click.testing import CliRunner +from genmod.commands import score_command, score_compounds_command +from genmod.vcf_tools import HeaderParser, get_info_dict, get_variant_dict, get_variant_id + + +SCORED_VCF = "tests/fixtures/test_vcf_annotated_scored.vcf" +SCORE_CONFIG = "tests/fixtures/score_variants/rank_model_-v1.39-mivmir.ini" +SUFFIX = "MIVMIR" + +def test_mivmir_minimal_score_config(): + runner = CliRunner() + result = runner.invoke( + score_command, + [ + SCORED_VCF, + "-c", + SCORE_CONFIG, + "--skip_is_previously_scored_check", + "--annotation_suffix", + SUFFIX, + ], + ) + assert result.exit_code == 0 + print(result.stdout_bytes.decode("utf-8")) + assert False \ No newline at end of file