@@ -1710,15 +1710,26 @@ def calculate_raw_z_score_sd(
17101710def add_gencode_transcript_annotations (
17111711 ht : hl .Table ,
17121712 gencode_ht : hl .Table ,
1713- annotations : List [str ] = ["level" , "transcript_type" ],
1713+ annotations : List [str ] = [
1714+ "transcript_id_version" ,
1715+ "gene_id_version" ,
1716+ "level" ,
1717+ "transcript_type" ,
1718+ "start_position" ,
1719+ "end_position" ,
1720+ ],
1721+ remove_y_par : bool = True ,
17141722) -> hl .Table :
17151723 """
17161724 Add GENCODE annotations to Table based on transcript id.
17171725
17181726 .. note::
1727+
17191728 Added annotations by default are:
17201729 - level
17211730 - transcript_type
1731+ - start_position (start of the transcript)
1732+ - end_position (end of the transcript)
17221733
17231734 Computed annotations are:
17241735 - chromosome
@@ -1727,22 +1738,51 @@ def add_gencode_transcript_annotations(
17271738
17281739 :param ht: Input Table.
17291740 :param gencode_ht: Table with GENCODE annotations.
1730- :param annotations: List of GENCODE annotations to add. Default is ["level", "transcript_type"].
1731- Added annotations also become keys for the group by when computing "cds_length" and "num_coding_exons".
1741+ :param annotations: List of GENCODE annotations to add. Default is
1742+ ["transcript_id_version", "gene_id_version", "level", "transcript_type",
1743+ "start_position", "end_position"].
1744+ :param remove_y_par: Whether to remove features for the Y chromosome PAR regions.
1745+ Default is True because the Y chromosome PAR regions are typically not included
1746+ in the constraint calculations and both chrX and chrY will have the same
1747+ 'transcript_id' field for these regions. This parameter can only be True if
1748+ `gencode_ht` includes a 'transcript_id_version' field because Y_PAR is included
1749+ in the version of the transcript, which has been stripped from the
1750+ 'transcript_id' field.
17321751 :return: Table with transcript annotations from GENCODE added.
17331752 """
1753+ if remove_y_par and "transcript_id_version" not in gencode_ht .row :
1754+ raise ValueError (
1755+ "remove_y_par is True but 'transcript_id_version' is not in gencode_ht"
1756+ )
1757+
1758+ if remove_y_par :
1759+ gencode_ht = gencode_ht .filter (
1760+ ~ gencode_ht .transcript_id_version .endswith ("Y_PAR" )
1761+ )
1762+
17341763 gencode_ht = gencode_ht .annotate (
17351764 length = gencode_ht .interval .end .position
17361765 - gencode_ht .interval .start .position
17371766 + 1 ,
17381767 chromosome = gencode_ht .interval .start .contig ,
1768+ start_position = gencode_ht .interval .start .position ,
1769+ end_position = gencode_ht .interval .end .position ,
1770+ )
1771+
1772+ # Add transcript annotations to input Table.
1773+ annotations_to_add = set (annotations + ["chromosome" , "transcript_id" ])
1774+ gencode_transcript_ht = (
1775+ gencode_ht .filter (gencode_ht .feature == "transcript" )
1776+ .select (* annotations_to_add )
1777+ .key_by ("transcript_id" )
1778+ .drop ("interval" )
17391779 )
17401780
17411781 # Obtain CDS annotations from GENCODE file and calculate CDS length and
17421782 # number of exons.
17431783 annotations_to_add = set (annotations + ["chromosome" , "transcript_id" , "length" ])
17441784
1745- gencode_cds = (
1785+ gencode_cds_ht = (
17461786 gencode_ht .filter (gencode_ht .feature == "CDS" )
17471787 .select (* annotations_to_add )
17481788 .key_by ("transcript_id" )
@@ -1751,19 +1791,17 @@ def add_gencode_transcript_annotations(
17511791
17521792 annotations_to_add .remove ("length" )
17531793
1754- gencode_cds = (
1755- gencode_cds .group_by (* annotations_to_add )
1756- .aggregate (
1757- cds_length = hl .agg .sum (gencode_cds .length ), num_coding_exons = hl .agg .count ()
1758- )
1759- .key_by ("transcript_id" )
1794+ gencode_cds_ht = gencode_cds_ht .group_by ("transcript_id" ).aggregate (
1795+ cds_length = hl .agg .sum (gencode_cds_ht .length ),
1796+ num_coding_exons = hl .agg .count (),
17601797 )
17611798
1762- gencode_cds = gencode_cds .checkpoint (
1763- new_temp_file (prefix = "gencode_cds" , extension = "ht" )
1799+ # Join the transcript and CDS annotations.
1800+ gencode_transcript_ht = gencode_transcript_ht .join (gencode_cds_ht ).checkpoint (
1801+ new_temp_file (prefix = "gencode" , extension = "ht" )
17641802 )
17651803
17661804 # Add GENCODE annotations to input Table.
1767- ht = ht .annotate (** gencode_cds [ht .transcript ])
1805+ ht = ht .annotate (** gencode_transcript_ht [ht .transcript ])
17681806
17691807 return ht
0 commit comments