Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 193 additions & 6 deletions gnomad_qc/v5/annotations/generate_variant_qc_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,31 @@
import logging

import hail as hl
from gnomad.utils.annotations import annotate_allele_info, get_lowqual_expr
from gnomad.resources.grch38.gnomad import GROUPS
from gnomad.resources.grch38.reference_data import get_truth_ht
from gnomad.utils.annotations import (
annotate_allele_info,
get_lowqual_expr,
pab_max_expr,
)
from gnomad.utils.sparse_mt import split_info_annotation
from gnomad.variant_qc.pipeline import generate_sib_stats, generate_trio_stats
from gnomad.variant_qc.random_forest import median_impute_features

from gnomad_qc.resource_utils import check_resource_existence
from gnomad_qc.v4.annotations.generate_variant_qc_annotations import (
INFO_FEATURES,
NON_INFO_FEATURES,
TRUTH_DATA,
)
from gnomad_qc.v5.annotations.annotation_utils import annotate_adj_no_dp, get_adj_expr
from gnomad_qc.v5.resources.annotations import (
aou_annotated_sites_only_vcf,
aou_vcf_header,
get_info_ht,
get_sib_stats,
get_trio_stats,
get_variant_qc_annotations,
)
from gnomad_qc.v5.resources.basics import get_aou_vds, get_logging_path
from gnomad_qc.v5.resources.constants import GNOMAD_TMP_BUCKET, WORKSPACE_BUCKET
Expand All @@ -30,6 +43,8 @@ def generate_ac_info_ht(vds: hl.vds.VariantDataset) -> hl.Table:
"""
Compute AC and AC_raw annotations for each allele count filter group.

Function also adds `AS_pab_max` and `allele_info` annotations.

:param vds: VariantDataset to use for computing AC and AC_raw annotations.
:return: Table with AC and AC_raw annotations split by high quality, release, and unrelated.
"""
Expand Down Expand Up @@ -80,6 +95,10 @@ def generate_ac_info_ht(vds: hl.vds.VariantDataset) -> hl.Table:
},
)

ac_info_expr = ac_info_expr.annotate(
AS_pab_max=pab_max_expr(mt.LGT, mt.LAD, mt.LA, hl.len(mt.alleles))
)

ac_info_ht = mt.select_rows(AC_info=ac_info_expr).rows()

# Split multi-allelic sites.
Expand Down Expand Up @@ -163,8 +182,15 @@ def create_info_ht(

logger.info("Adding AC info annotations to info ht...")
ac_info_ht = generate_ac_info_ht(vds)
ht = ht.annotate(**ac_info_ht[ht.key])
return ht
ac_info_ht = ac_info_ht.checkpoint(hl.utils.new_temp_file("ac_info_ht", "ht"))
# Annotate ac_info_ht with info annotations because info VCF
# provided by AoU has variants not present in samples we consider high quality.
info_ht = ac_info_ht.annotate(**ht[ac_info_ht.key])
info_ht = info_ht.annotate(
info=info_ht.info.annotate(AS_pab_max=info_ht.AC_info.AS_pab_max),
AC_info=info_ht.AC_info.drop("AS_pab_max"),
)
return info_ht


def run_generate_trio_stats(
Expand Down Expand Up @@ -202,6 +228,126 @@ def run_generate_sib_stats(
return generate_sib_stats(mt, relatedness_ht)


def create_variant_qc_annotation_ht(
info_ht: hl.Table,
trio_stats_ht: hl.Table,
sib_stats_ht: hl.Table,
impute_features: bool = True,
n_partitions: int = 5000,
) -> hl.Table:
"""
Create a Table with all necessary annotations for variant QC.

Annotations that are included:

Features for RF:
- variant_type
- allele_type
- n_alt_alleles
- has_star
- AS_QD
- AS_pab_max
- AS_MQRankSum
- AS_SOR
- AS_ReadPosRankSum

Training sites (bool):
- transmitted_singleton
- sibling_singleton
- fail_hard_filters - (ht.QD < 2) | (ht.FS > 60) | (ht.MQ < 30)

:param info_ht: Info Table with split multi-allelics.
:param trio_stats_ht: Table with trio statistics.
:param sib_stats_ht: Table with sibling statistics.
:param impute_features: Whether to impute features using feature medians (this is
done by variant type).
:param n_partitions: Number of partitions to use for final annotated Table.
:return: Hail Table with all annotations needed for variant QC.
"""
truth_data_ht = get_truth_ht()

ht = info_ht.transmute(**info_ht.AC_info, **info_ht.allele_info)
ht = ht.annotate(**ht.info.select(*INFO_FEATURES))

if impute_features:
impute_ht = ht.select("variant_type", **ht.info)
impute_ht = median_impute_features(
impute_ht, {"variant_type": impute_ht.variant_type}
).checkpoint(hl.utils.new_temp_file("median_impute", "ht"))

impute_result = impute_ht[ht.key]
ht = ht.annotate(
info=impute_result.drop("feature_imputed", "variant_type"),
feature_imputed=impute_result.feature_imputed,
)
ht = ht.annotate_globals(feature_medians=hl.eval(impute_ht.feature_medians))

logger.info("Annotating Table with trio and sibling stats and reference truth data")
trio_stats_ht = trio_stats_ht.select(
*[f"{a}_{group}" for a in ["n_transmitted", "ac_children"] for group in GROUPS]
)
ht = ht.annotate(
**trio_stats_ht[ht.key],
**sib_stats_ht[ht.key],
**truth_data_ht[ht.key],
)
tp_map = {
"transmitted_singleton": "n_transmitted",
"sibling_singleton": "n_sib_shared_variants",
}

# Filter to only variants found in high quality samples and are not lowqual.
ht = ht.filter((ht.AC_high_quality_raw > 0) & ~ht.AS_lowqual)

select_dict = {tp: hl.or_else(ht[tp], False) for tp in TRUTH_DATA}
select_dict.update(
{
f"{tp}_{group}": hl.or_else(
(ht[f"{n}_{group}"] == 1)
& (ht[f"AC_high_quality{'' if group == 'adj' else f'_raw'}"] == 2),
False,
)
for tp, n in tp_map.items()
for group in GROUPS
}
)
select_dict.update(
{
# NOTE: Previous versions used QD < 2, but we decided this was too
# stringent based on the distribution of this metric in AoU.
"fail_hard_filters": (ht.AS_QD < 0.5) | (ht.AS_FS > 60) | (ht.AS_MQ < 30),
"singleton": ht.AC_release_raw == 1,
"ac_raw": ht.AC_high_quality_raw,
"ac": ht.AC_release,
"ac_unrelated_raw": ht.AC_unrelated_raw,
}
)

if impute_features:
select_dict["feature_imputed"] = ht.feature_imputed

ht = ht.select(
"a_index",
"was_split",
*NON_INFO_FEATURES,
"info",
**select_dict,
)

temp_path = hl.utils.new_temp_file("variant_qc_annotations", "ht")
ht.write(temp_path)
ht = hl.read_table(temp_path, _n_partitions=n_partitions)
ht.describe()

summary = ht.group_by(
*TRUTH_DATA, *[f"{tp}_{group}" for tp in tp_map for group in GROUPS]
).aggregate(n=hl.agg.count())
logger.info("Summary of truth data annotations:")
summary.show(-1)

return ht


def main(args):
"""Generate all variant annotations needed for variant QC."""
if args.rwb:
Expand All @@ -223,6 +369,13 @@ def main(args):
test_n_partitions = args.test_n_partitions
test = args.test or test_n_partitions is not None

info_ht_path = get_info_ht(test=test, environment=environment).path
trio_stats_ht_path = get_trio_stats(test=test, environment=environment).path
sib_stats_ht_path = get_sib_stats(test=test, environment=environment).path
variant_qc_annotation_ht_path = get_variant_qc_annotations(
test=test, environment=environment
).path

# NOTE: VDS will have 'aou_' prefix on sample IDs.
vds = get_aou_vds(
high_quality_only=True,
Expand All @@ -235,7 +388,6 @@ def main(args):

try:
if args.create_info_ht:
info_ht_path = get_info_ht(test=test, environment=environment).path
check_resource_existence(
output_step_resources={
"info_ht": [info_ht_path],
Expand All @@ -254,7 +406,6 @@ def main(args):

if args.generate_trio_stats:
logger.info("Generating trio stats...")
trio_stats_ht_path = get_trio_stats(test=test, environment=environment).path
check_resource_existence(
output_step_resources={"trio_stats_ht": [trio_stats_ht_path]},
overwrite=overwrite,
Expand All @@ -268,7 +419,6 @@ def main(args):

if args.generate_sibling_stats:
logger.info("Generating sibling stats...")
sib_stats_ht_path = get_sib_stats(test=test, environment=environment).path
check_resource_existence(
output_step_resources={"sib_stats_ht": [sib_stats_ht_path]},
overwrite=overwrite,
Expand All @@ -277,6 +427,23 @@ def main(args):
ht = run_generate_sib_stats(vds.variant_data, relatedness().ht())
ht.write(sib_stats_ht_path, overwrite=overwrite)

if args.create_variant_qc_annotation_ht:
logger.info("Creating variant QC annotation HT...")
check_resource_existence(
output_step_resources={
"variant_qc_annotation_ht": [variant_qc_annotation_ht_path]
},
overwrite=overwrite,
)
ht = create_variant_qc_annotation_ht(
hl.read_table(info_ht_path),
hl.read_table(trio_stats_ht_path),
hl.read_table(sib_stats_ht_path),
impute_features=args.impute_features,
n_partitions=args.n_partitions,
)
ht.write(variant_qc_annotation_ht_path, overwrite=overwrite)

finally:
logger.info("Copying log to logging bucket...")
hl.copy_log(
Expand Down Expand Up @@ -331,6 +498,26 @@ def get_script_argument_parser() -> argparse.ArgumentParser:
type=int,
)

variant_qc_annotation_args = parser.add_argument_group(
"Variant QC annotation HT parameters"
)
variant_qc_annotation_args.add_argument(
"--create-variant-qc-annotation-ht",
help="Creates an annotated HT with features for variant QC.",
action="store_true",
)
variant_qc_annotation_args.add_argument(
"--impute-features",
help="If set, imputation is performed for variant QC features.",
action="store_true",
)
variant_qc_annotation_args.add_argument(
"--n-partitions",
help="Desired number of partitions for variant QC annotation HT .",
type=int,
default=5000,
)

return parser


Expand Down
Loading