Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions gnomad/resources/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,19 +715,31 @@ def import_sites_vcf(**kwargs) -> hl.Table:
return hl.import_vcf(**kwargs).rows()


def import_gencode(gtf_path: str, **kwargs) -> hl.Table:
def import_gencode(
gtf_path: str,
include_version: bool = False,
**kwargs,
) -> hl.Table:
"""
Import GENCODE annotations GTF file as a Hail Table.
:param gtf_path: Path to GENCODE GTF file.
:param include_version: Whether to include two additional fields that contain the full gene or transcript
ID and the version number (`gene_id_version`, `transcript_id_version`).
:return: Table with GENCODE annotation information.
"""
ht = hl.experimental.import_gtf(gtf_path, **kwargs)

# Only get gene and transcript stable IDs (without version numbers if they
# exist), early versions of GENCODE have no version numbers but later ones do.
ht = ht.annotate(
gene_id=ht.gene_id.split("\\.")[0],
transcript_id=ht.transcript_id.split("\\.")[0],
)
ann_expr = {
"gene_id": ht.gene_id.split("\\.")[0],
"transcript_id": ht.transcript_id.split("\\.")[0],
}
if include_version:
ann_expr["gene_id_version"] = ht.gene_id
ann_expr["transcript_id_version"] = ht.transcript_id

ht = ht.annotate(**ann_expr)

return ht
64 changes: 51 additions & 13 deletions gnomad/utils/constraint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1710,15 +1710,26 @@ def calculate_raw_z_score_sd(
def add_gencode_transcript_annotations(
ht: hl.Table,
gencode_ht: hl.Table,
annotations: List[str] = ["level", "transcript_type"],
annotations: List[str] = [
"transcript_id_version",
"gene_id_version",
"level",
"transcript_type",
"start_position",
"end_position",
],
remove_y_par: bool = True,
) -> hl.Table:
"""
Add GENCODE annotations to Table based on transcript id.
.. note::
Added annotations by default are:
- level
- transcript_type
- start_position (start of the transcript)
- end_position (end of the transcript)
Computed annotations are:
- chromosome
Expand All @@ -1727,22 +1738,51 @@ def add_gencode_transcript_annotations(
:param ht: Input Table.
:param gencode_ht: Table with GENCODE annotations.
:param annotations: List of GENCODE annotations to add. Default is ["level", "transcript_type"].
Added annotations also become keys for the group by when computing "cds_length" and "num_coding_exons".
:param annotations: List of GENCODE annotations to add. Default is
["transcript_id_version", "gene_id_version", "level", "transcript_type",
"start_position", "end_position"].
:param remove_y_par: Whether to remove features for the Y chromosome PAR regions.
Default is True because the Y chromosome PAR regions are typically not included
in the constraint calculations and both chrX and chrY will have the same
'transcript_id' field for these regions. This parameter can only be True if
`gencode_ht` includes a 'transcript_id_version' field because Y_PAR is included
in the version of the transcript, which has been stripped from the
'transcript_id' field.
:return: Table with transcript annotations from GENCODE added.
"""
if remove_y_par and "transcript_id_version" not in gencode_ht.row:
raise ValueError(
"remove_y_par is True but 'transcript_id_version' is not in gencode_ht"
)

if remove_y_par:
gencode_ht = gencode_ht.filter(
~gencode_ht.transcript_id_version.endswith("Y_PAR")
)

gencode_ht = gencode_ht.annotate(
length=gencode_ht.interval.end.position
- gencode_ht.interval.start.position
+ 1,
chromosome=gencode_ht.interval.start.contig,
start_position=gencode_ht.interval.start.position,
end_position=gencode_ht.interval.end.position,
)

# Add transcript annotations to input Table.
annotations_to_add = set(annotations + ["chromosome", "transcript_id"])
gencode_transcript_ht = (
gencode_ht.filter(gencode_ht.feature == "transcript")
.select(*annotations_to_add)
.key_by("transcript_id")
.drop("interval")
)

# Obtain CDS annotations from GENCODE file and calculate CDS length and
# number of exons.
annotations_to_add = set(annotations + ["chromosome", "transcript_id", "length"])

gencode_cds = (
gencode_cds_ht = (
gencode_ht.filter(gencode_ht.feature == "CDS")
.select(*annotations_to_add)
.key_by("transcript_id")
Expand All @@ -1751,19 +1791,17 @@ def add_gencode_transcript_annotations(

annotations_to_add.remove("length")

gencode_cds = (
gencode_cds.group_by(*annotations_to_add)
.aggregate(
cds_length=hl.agg.sum(gencode_cds.length), num_coding_exons=hl.agg.count()
)
.key_by("transcript_id")
gencode_cds_ht = gencode_cds_ht.group_by("transcript_id").aggregate(
cds_length=hl.agg.sum(gencode_cds_ht.length),
num_coding_exons=hl.agg.count(),
)

gencode_cds = gencode_cds.checkpoint(
new_temp_file(prefix="gencode_cds", extension="ht")
# Join the transcript and CDS annotations.
gencode_transcript_ht = gencode_transcript_ht.join(gencode_cds_ht).checkpoint(
new_temp_file(prefix="gencode", extension="ht")
)

# Add GENCODE annotations to input Table.
ht = ht.annotate(**gencode_cds[ht.transcript])
ht = ht.annotate(**gencode_transcript_ht[ht.transcript])

return ht