Skip to content

Commit 83d2603

Browse files
authored
Merge pull request #806 from broadinstitute/jg/updates_to_gencode_for_constraint
Add `include_version` parameter to `import_gencode` and `remove_y_par` to `add_gencode_transcript_annotations`
2 parents 897d750 + b9b5ffb commit 83d2603

File tree

2 files changed

+68
-18
lines changed

2 files changed

+68
-18
lines changed

gnomad/resources/resource_utils.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -715,19 +715,31 @@ def import_sites_vcf(**kwargs) -> hl.Table:
715715
return hl.import_vcf(**kwargs).rows()
716716

717717

718-
def import_gencode(gtf_path: str, **kwargs) -> hl.Table:
718+
def import_gencode(
719+
gtf_path: str,
720+
include_version: bool = False,
721+
**kwargs,
722+
) -> hl.Table:
719723
"""
720724
Import GENCODE annotations GTF file as a Hail Table.
721725
722726
:param gtf_path: Path to GENCODE GTF file.
727+
:param include_version: Whether to include two additional fields that contain the full gene or transcript
728+
ID and the version number (`gene_id_version`, `transcript_id_version`).
723729
:return: Table with GENCODE annotation information.
724730
"""
725731
ht = hl.experimental.import_gtf(gtf_path, **kwargs)
726732

727733
# Only get gene and transcript stable IDs (without version numbers if they
728734
# exist), early versions of GENCODE have no version numbers but later ones do.
729-
ht = ht.annotate(
730-
gene_id=ht.gene_id.split("\\.")[0],
731-
transcript_id=ht.transcript_id.split("\\.")[0],
732-
)
735+
ann_expr = {
736+
"gene_id": ht.gene_id.split("\\.")[0],
737+
"transcript_id": ht.transcript_id.split("\\.")[0],
738+
}
739+
if include_version:
740+
ann_expr["gene_id_version"] = ht.gene_id
741+
ann_expr["transcript_id_version"] = ht.transcript_id
742+
743+
ht = ht.annotate(**ann_expr)
744+
733745
return ht

gnomad/utils/constraint.py

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1710,15 +1710,26 @@ def calculate_raw_z_score_sd(
17101710
def add_gencode_transcript_annotations(
17111711
ht: hl.Table,
17121712
gencode_ht: hl.Table,
1713-
annotations: List[str] = ["level", "transcript_type"],
1713+
annotations: List[str] = [
1714+
"transcript_id_version",
1715+
"gene_id_version",
1716+
"level",
1717+
"transcript_type",
1718+
"start_position",
1719+
"end_position",
1720+
],
1721+
remove_y_par: bool = True,
17141722
) -> hl.Table:
17151723
"""
17161724
Add GENCODE annotations to Table based on transcript id.
17171725
17181726
.. note::
1727+
17191728
Added annotations by default are:
17201729
- level
17211730
- transcript_type
1731+
- start_position (start of the transcript)
1732+
- end_position (end of the transcript)
17221733
17231734
Computed annotations are:
17241735
- chromosome
@@ -1727,22 +1738,51 @@ def add_gencode_transcript_annotations(
17271738
17281739
:param ht: Input Table.
17291740
:param gencode_ht: Table with GENCODE annotations.
1730-
:param annotations: List of GENCODE annotations to add. Default is ["level", "transcript_type"].
1731-
Added annotations also become keys for the group by when computing "cds_length" and "num_coding_exons".
1741+
:param annotations: List of GENCODE annotations to add. Default is
1742+
["transcript_id_version", "gene_id_version", "level", "transcript_type",
1743+
"start_position", "end_position"].
1744+
:param remove_y_par: Whether to remove features for the Y chromosome PAR regions.
1745+
Default is True because the Y chromosome PAR regions are typically not included
1746+
in the constraint calculations and both chrX and chrY will have the same
1747+
'transcript_id' field for these regions. This parameter can only be True if
1748+
`gencode_ht` includes a 'transcript_id_version' field because Y_PAR is included
1749+
in the version of the transcript, which has been stripped from the
1750+
'transcript_id' field.
17321751
:return: Table with transcript annotations from GENCODE added.
17331752
"""
1753+
if remove_y_par and "transcript_id_version" not in gencode_ht.row:
1754+
raise ValueError(
1755+
"remove_y_par is True but 'transcript_id_version' is not in gencode_ht"
1756+
)
1757+
1758+
if remove_y_par:
1759+
gencode_ht = gencode_ht.filter(
1760+
~gencode_ht.transcript_id_version.endswith("Y_PAR")
1761+
)
1762+
17341763
gencode_ht = gencode_ht.annotate(
17351764
length=gencode_ht.interval.end.position
17361765
- gencode_ht.interval.start.position
17371766
+ 1,
17381767
chromosome=gencode_ht.interval.start.contig,
1768+
start_position=gencode_ht.interval.start.position,
1769+
end_position=gencode_ht.interval.end.position,
1770+
)
1771+
1772+
# Add transcript annotations to input Table.
1773+
annotations_to_add = set(annotations + ["chromosome", "transcript_id"])
1774+
gencode_transcript_ht = (
1775+
gencode_ht.filter(gencode_ht.feature == "transcript")
1776+
.select(*annotations_to_add)
1777+
.key_by("transcript_id")
1778+
.drop("interval")
17391779
)
17401780

17411781
# Obtain CDS annotations from GENCODE file and calculate CDS length and
17421782
# number of exons.
17431783
annotations_to_add = set(annotations + ["chromosome", "transcript_id", "length"])
17441784

1745-
gencode_cds = (
1785+
gencode_cds_ht = (
17461786
gencode_ht.filter(gencode_ht.feature == "CDS")
17471787
.select(*annotations_to_add)
17481788
.key_by("transcript_id")
@@ -1751,19 +1791,17 @@ def add_gencode_transcript_annotations(
17511791

17521792
annotations_to_add.remove("length")
17531793

1754-
gencode_cds = (
1755-
gencode_cds.group_by(*annotations_to_add)
1756-
.aggregate(
1757-
cds_length=hl.agg.sum(gencode_cds.length), num_coding_exons=hl.agg.count()
1758-
)
1759-
.key_by("transcript_id")
1794+
gencode_cds_ht = gencode_cds_ht.group_by("transcript_id").aggregate(
1795+
cds_length=hl.agg.sum(gencode_cds_ht.length),
1796+
num_coding_exons=hl.agg.count(),
17601797
)
17611798

1762-
gencode_cds = gencode_cds.checkpoint(
1763-
new_temp_file(prefix="gencode_cds", extension="ht")
1799+
# Join the transcript and CDS annotations.
1800+
gencode_transcript_ht = gencode_transcript_ht.join(gencode_cds_ht).checkpoint(
1801+
new_temp_file(prefix="gencode", extension="ht")
17641802
)
17651803

17661804
# Add GENCODE annotations to input Table.
1767-
ht = ht.annotate(**gencode_cds[ht.transcript])
1805+
ht = ht.annotate(**gencode_transcript_ht[ht.transcript])
17681806

17691807
return ht

0 commit comments

Comments
 (0)