Skip to content

Commit f117508

Browse files
committed
Variant Rank Score Normalization
** Variant Ranking ** Normalize the variant rank score by means of min-max normalization. This is preferable because it makes output values agnostic to the number of plugins and their scoring behavior. Max and min are computed as the total sums of the upper and lower bounds of each plugin. The VCF attribute 'RankScoreNormalized' (bounded (0, 1)) is added to the scored VCF file. No changes to the existing 'RankScore' behavior, which is still present in the VCF. ** Compound Ranking ** This patch makes the CompoundScorer class use the additional RankScoreNormalized VCF field for scoring purposes. This patch adds a CompoundsNormalized INFO field to the VCF, based on RankScoreNormalized. * Export RankScore bounds MIN, MAX to the VCF file. This is required by CompoundScorer because it thresholds and edits RankScore(Normalized) during the scoring process. * Configure CompoundScorer to use the normalized rank score in addition to the legacy RankScore. * Apply CompoundScorer custom thresholds normalized to the RankScore Min-Max bounds. Signed-off-by: Tor Björgen <tor.bjorgen@scilifelab.se>
1 parent e880836 commit f117508

File tree

12 files changed

+846
-134
lines changed

12 files changed

+846
-134
lines changed

docs/commands/score-variants.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Score Variant
2+
3+
## Rank Score Normalization
4+
5+
The rank score is min-max normalized into the range [0, 1] according to the following formula:
6+
7+
```
8+
RankScoreNormalized = (RankScore - CategorySumMin) / (CategorySumMax - CategorySumMin)
9+
```
10+
where `RankScore` is the sum of the rank scores across all categories (including rules such as min, max, sum, etc.),
11+
`RankScore = SUM(Score_category_n) for 0...n categories`
12+
and `CategorySumMin` is the sum of minimal score values for all categories,
13+
i.e. `CategorySumMin = SUM(CategoryMin_n) for 0...n categories`.
14+
The same applies to `CategorySumMax = SUM(CategoryMax_n) for 0...n categories`.
15+
16+
Refer to the `score()` function in `genmod/commands/score_variants.py` for implementation details.

genmod/commands/score_compounds.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,13 @@ def compound(context, variant_file, silent, outfile, vep, processes, temp_dir):
8080
header_line = head.header
8181
individuals = head.individuals
8282

83+
add_metadata(head,
84+
'info',
85+
'CompoundsNormalized',
86+
annotation_number='.',
87+
entry_type='String',
88+
description='Rank score as provided by compound analysis, based on RankScoreNormalized. family_id:rank_score')
89+
8390
###################################################################
8491
### The task queue is where all jobs(in this case batches that ###
8592
### represents variants in a region) is put. The consumers will ###

genmod/commands/score_variants.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
print_headers, HeaderParser, get_variant_dict, get_info_dict)
2828

2929
from genmod.score_variants import (ConfigParser, get_category_score,
30-
check_plugins)
30+
check_plugins, as_normalized_max_min, RANK_SCORE_TYPES)
3131

3232
from genmod import __version__
3333

@@ -133,14 +133,22 @@ def score(context, variant_file, family_id, family_file, family_type, score_conf
133133
logger.warning("Variants already scored according to VCF header")
134134
logger.info("Please check VCF file")
135135
context.abort()
136-
136+
137+
for rank_score_type, rank_score_description in RANK_SCORE_TYPES.items():
138+
add_metadata(head,
139+
'info',
140+
rank_score_type,
141+
annotation_number='.',
142+
entry_type='String',
143+
description=rank_score_description)
144+
137145
add_metadata(
138146
head,
139147
'info',
140-
'RankScore',
141-
annotation_number='.',
142-
entry_type='String',
143-
description="The rank score for this variant in this family. family_id:rank_score."
148+
'RankScoreMinMax',
149+
annotation_number='.',
150+
entry_type='String',
151+
description="The rank score MIN-MAX bounds. family_id:min:max."
144152
)
145153

146154
if rank_results:
@@ -169,26 +177,46 @@ def score(context, variant_file, family_id, family_file, family_type, score_conf
169177
rank_score = 0
170178
# This is for printing results to vcf:
171179
category_scores = []
180+
# Keep track of the summed per-category min and max scores for normalization purposes
181+
category_scores_max: float = 0.0
182+
category_scores_min: float = 0.0
172183
for category in score_categories:
173-
category_score = get_category_score(
184+
category_score, category_score_min, category_score_max = get_category_score(
174185
variant=variant,
175186
category=category,
176187
config_parser=config_parser,
177188
csq_format=csq_format
178189
)
179190
logger.debug("Adding category score {0} to rank_score".format(category_score))
180-
181191
rank_score += category_score
182192
logger.debug("Updating rank score to {0}".format(rank_score))
193+
category_scores_min += category_score_min
194+
category_scores_max += category_score_max
183195

184196
category_scores.append(str(category_score))
185197

186-
198+
# Normalize the rank score (across all categories)
199+
rank_score_normalized: float = as_normalized_max_min(score=float(rank_score),
200+
min_score_value=category_scores_min,
201+
max_score_value=category_scores_max)
202+
187203
variant = add_vcf_info(
188204
keyword = 'RankScore',
189205
variant_dict=variant,
190206
annotation="{0}:{1}".format(family_id, rank_score)
191207
)
208+
209+
variant: dict = add_vcf_info(
210+
keyword = 'RankScoreNormalized',
211+
variant_dict=variant,
212+
annotation="{0}:{1}".format(family_id, rank_score_normalized)
213+
)
214+
215+
variant: dict = add_vcf_info(
216+
keyword = 'RankScoreMinMax',
217+
variant_dict=variant,
218+
annotation="{0}:{1}:{2}".format(family_id, category_scores_min, category_scores_max)
219+
)
192220

193221
if rank_results:
194222
variant = add_vcf_info(

genmod/score_variants/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from .score_function import ScoreFunction
44
from .config_parser import ConfigParser
5-
from .score_variant import get_category_score
5+
from .score_variant import get_category_score, as_normalized_max_min
66
from .compound_scorer import CompoundScorer
7-
from .check_plugins import check_plugins
7+
from .check_plugins import check_plugins
8+
from .rank_score_variant_definitions import RANK_SCORE_TYPES, RANK_SCORE_TYPE_NAMES

0 commit comments

Comments
 (0)