diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index db56768ab..bb76b05bd 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -715,19 +715,31 @@ def import_sites_vcf(**kwargs) -> hl.Table: return hl.import_vcf(**kwargs).rows() -def import_gencode(gtf_path: str, **kwargs) -> hl.Table: +def import_gencode( + gtf_path: str, + include_version: bool = False, + **kwargs, +) -> hl.Table: """ Import GENCODE annotations GTF file as a Hail Table. :param gtf_path: Path to GENCODE GTF file. + :param include_version: Whether to include two additional fields that contain the full gene or transcript + ID and the version number (`gene_id_version`, `transcript_id_version`). :return: Table with GENCODE annotation information. """ ht = hl.experimental.import_gtf(gtf_path, **kwargs) # Only get gene and transcript stable IDs (without version numbers if they # exist), early versions of GENCODE have no version numbers but later ones do. - ht = ht.annotate( - gene_id=ht.gene_id.split("\\.")[0], - transcript_id=ht.transcript_id.split("\\.")[0], - ) + ann_expr = { + "gene_id": ht.gene_id.split("\\.")[0], + "transcript_id": ht.transcript_id.split("\\.")[0], + } + if include_version: + ann_expr["gene_id_version"] = ht.gene_id + ann_expr["transcript_id_version"] = ht.transcript_id + + ht = ht.annotate(**ann_expr) + return ht diff --git a/gnomad/utils/constraint.py b/gnomad/utils/constraint.py index baf032d65..a1e405b6a 100644 --- a/gnomad/utils/constraint.py +++ b/gnomad/utils/constraint.py @@ -1710,15 +1710,26 @@ def calculate_raw_z_score_sd( def add_gencode_transcript_annotations( ht: hl.Table, gencode_ht: hl.Table, - annotations: List[str] = ["level", "transcript_type"], + annotations: List[str] = [ + "transcript_id_version", + "gene_id_version", + "level", + "transcript_type", + "start_position", + "end_position", + ], + remove_y_par: bool = True, ) -> hl.Table: """ Add GENCODE annotations to Table based on transcript id. .. note:: + Added annotations by default are: - level - transcript_type + - start_position (start of the transcript) + - end_position (end of the transcript) Computed annotations are: - chromosome @@ -1727,22 +1738,51 @@ def add_gencode_transcript_annotations( :param ht: Input Table. :param gencode_ht: Table with GENCODE annotations. - :param annotations: List of GENCODE annotations to add. Default is ["level", "transcript_type"]. - Added annotations also become keys for the group by when computing "cds_length" and "num_coding_exons". + :param annotations: List of GENCODE annotations to add. Default is + ["transcript_id_version", "gene_id_version", "level", "transcript_type", + "start_position", "end_position"]. + :param remove_y_par: Whether to remove features for the Y chromosome PAR regions. + Default is True because the Y chromosome PAR regions are typically not included + in the constraint calculations and both chrX and chrY will have the same + 'transcript_id' field for these regions. This parameter can only be True if + `gencode_ht` includes a 'transcript_id_version' field because Y_PAR is included + in the version of the transcript, which has been stripped from the + 'transcript_id' field. :return: Table with transcript annotations from GENCODE added. """ + if remove_y_par and "transcript_id_version" not in gencode_ht.row: + raise ValueError( + "remove_y_par is True but 'transcript_id_version' is not in gencode_ht" + ) + + if remove_y_par: + gencode_ht = gencode_ht.filter( + ~gencode_ht.transcript_id_version.endswith("Y_PAR") + ) + gencode_ht = gencode_ht.annotate( length=gencode_ht.interval.end.position - gencode_ht.interval.start.position + 1, chromosome=gencode_ht.interval.start.contig, + start_position=gencode_ht.interval.start.position, + end_position=gencode_ht.interval.end.position, + ) + + # Add transcript annotations to input Table. + annotations_to_add = set(annotations + ["chromosome", "transcript_id"]) + gencode_transcript_ht = ( + gencode_ht.filter(gencode_ht.feature == "transcript") + .select(*annotations_to_add) + .key_by("transcript_id") + .drop("interval") ) # Obtain CDS annotations from GENCODE file and calculate CDS length and # number of exons. annotations_to_add = set(annotations + ["chromosome", "transcript_id", "length"]) - gencode_cds = ( + gencode_cds_ht = ( gencode_ht.filter(gencode_ht.feature == "CDS") .select(*annotations_to_add) .key_by("transcript_id") @@ -1751,19 +1791,17 @@ def add_gencode_transcript_annotations( annotations_to_add.remove("length") - gencode_cds = ( - gencode_cds.group_by(*annotations_to_add) - .aggregate( - cds_length=hl.agg.sum(gencode_cds.length), num_coding_exons=hl.agg.count() - ) - .key_by("transcript_id") + gencode_cds_ht = gencode_cds_ht.group_by("transcript_id").aggregate( + cds_length=hl.agg.sum(gencode_cds_ht.length), + num_coding_exons=hl.agg.count(), ) - gencode_cds = gencode_cds.checkpoint( - new_temp_file(prefix="gencode_cds", extension="ht") + # Join the transcript and CDS annotations. + gencode_transcript_ht = gencode_transcript_ht.join(gencode_cds_ht).checkpoint( + new_temp_file(prefix="gencode", extension="ht") ) # Add GENCODE annotations to input Table. - ht = ht.annotate(**gencode_cds[ht.transcript]) + ht = ht.annotate(**gencode_transcript_ht[ht.transcript]) return ht