ensembl ID to Symbol

DustinSokolowski · web-flow · commit 5eab912dbaff · 2025-06-16T17:35:59.000-04:00
Ensembl and GenAnT use gene IDs with gene symbols. By default we were transferring gene IDs over, which made finding the gene symbol in the gene_symbol_table inconvenient. We added an R script to the reference-directory processing so that gene symbols get transferred instead of gene IDs when possible. It also deals with redundant gene symbols and pseudogenes.
diff --git a/setup/Preprocess Reference Species.md b/setup/Preprocess Reference Species.md
@@ -200,3 +200,5 @@ This script and workflow is nearly identical to ncbi. The few differences are.
 * Ensembl uses `Name` to denote gene symbol, while ncbi uses `gene`
 * Ensembl tags gene and transcript with `gene:` and `transcript:` while Refseq uses `gene-` and `rna-`.
 * Ensembl includes transcript number in gene symbol, NCBI does not.
+* On its own, this script ends up transferring gene IDs and not gene symbols, which is inconvenient. We added a `ShiftNamesToIDs.R` script to `reference_directory_ensembl.sh` so that TOGA/LiftOff automatically transfer gene symbols to your reference instead of IDs, meaning you won't have to map gene symbols manually later.
+
diff --git a/setup/ShiftNamesToIDs.R b/setup/ShiftNamesToIDs.R
@@ -0,0 +1,96 @@
+# Replace gene/transcript IDs in a cleaned Ensembl GFF with unique gene symbols.
+#
+# Usage:  Rscript --vanilla ShiftNamesToIDs.R -g $prefix.full.gffread.gff
+# Output: $prefix.gffread.F.nameIDs.gff3
+
+library(optparse)
+library(rtracklayer)
+
+option_list <- list(
+  make_option(c("-g", "--gff"), type = "character", default = NULL,
+              help = "The gff file for the reference assembly after initial cleaning",
+              metavar = "character")
+)
+
+opt_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(opt_parser)
+
+# Look the option up by name rather than by position, and stop with the usage
+# text when it is missing instead of trying to read a file called "FALSE".
+
+gffname <- opt$gff
+
+if (is.null(gffname)) {
+  print_help(opt_parser)
+  stop("A GFF file must be supplied with -g/--gff.", call. = FALSE)
+}
+
+# The input is expected to be named $prefix.full.gffread.gff. Strip that suffix
+# literally and only at the end of the name to recover the prefix.
+
+prefix <- sub("\\.full\\.gffread\\.gff$", "", gffname)
+
+# Read in GFF
+
+gff <- as.data.frame(rtracklayer::readGFF(gffname))
+
+# Fail with a clear message if a column this script relies on is absent.
+
+missing_cols <- setdiff(c("ID", "Name", "type", "Parent"), colnames(gff))
+if (length(missing_cols) > 0) {
+  stop("GFF is missing required column(s): ",
+       paste(missing_cols, collapse = ", "), call. = FALSE)
+}
+
+# Isolate the ID, Name, and type columns. This will create a key that we will use later to keep track of which names match with which IDs
+
+key <- gff[, c("ID", "Name", "type")]
+
+# Only keep gene and transcript features. The following `grepl` statement will match any feature type that contains "gene", "transcript", or "RNA" (so will match "pseudogene", "ncRNA", etc.).
+
+key <- key[grepl("gene|transcript|RNA", key$type), ]
+
+# Remove anything that has NA in the "Name" column. Rows without an ID are dropped too: they can never be looked up, and would otherwise still claim a name when names are made unique below.
+
+key <- key[!is.na(key$Name) & !is.na(key$ID), ]
+
+# Since genes and pseudogenes share the same gene name but we want gene IDs to be unique, we can add "pseudo" after the gene name if something is a pseudogene.
+
+is_pseudogene <- key$type == "pseudogene"
+key$Name[is_pseudogene] <- paste0(key$Name[is_pseudogene], "-pseudo")
+
+# If a feature is not a gene, add "rna-" in front of the name
+
+is_rna <- grepl("transcript|RNA", key$type)
+key$Name[is_rna] <- paste0("rna-", key$Name[is_rna])
+
+# For any features that still don't have a unique gene name, add a copy number to the end
+
+key$Name <- make.unique(key$Name, sep = "-copy")
+
+# Now we want to replace the IDs and Parent features with the new unique names. Whenever there is an exact match between something in the ID or Parent column in the GFF with the ID in the key, the new name will replace the old ID or Parent feature.
+
+# Map a vector of IDs to their new names; IDs that are not in the key (including NA) are returned unchanged. When an ID occurs more than once in the key, the first occurrence wins.
+rename_ids <- function(ids) {
+  hit <- match(ids, key$ID)
+  ifelse(is.na(hit), ids, key$Name[hit])
+}
+
+# Replace values in gff$ID if they match any key$ID
+gff$ID <- rename_ids(gff$ID)
+
+# Unlist Parent. The lookup below needs exactly one parent per feature, so a feature with several parents is reported explicitly
+# instead of failing later with an obscure subscript error. Features without a parent get NA.
+n_parents <- lengths(gff$Parent)
+if (any(n_parents > 1)) {
+  stop("Features with more than one Parent are not supported.", call. = FALSE)
+}
+gff$Parent <- vapply(gff$Parent, function(x) {
+  if (length(x) == 0) NA_character_ else as.character(x)
+}, character(1), USE.NAMES = FALSE)
+# Replace values in gff$Parent if they match any key$ID
+gff$Parent <- rename_ids(gff$Parent)
+
+rtracklayer::export.gff3(gff,
+                         paste0(prefix, ".gffread.F.nameIDs.gff3"),
+                         format = "gff3")
diff --git a/setup/reference_directory_ensembl.sh b/setup/reference_directory_ensembl.sh
@@ -13,7 +13,13 @@ prefix=`basename $gff .gff3`
 
 # clean GFF
 
-gffread $gff --keep-genes -o $prefix.gffread.gff
+# first pass: clean the GFF with -F so that gene IDs can be replaced by gene symbols (variables are quoted so paths with spaces or glob characters survive)
+
+gffread --keep-genes -F "$gff" -o "${prefix}.full.gffread.gff"
+
+Rscript --vanilla ShiftNamesToIDs.R -g "${prefix}.full.gffread.gff"
+
+gffread "${prefix}.gffread.F.nameIDs.gff3" --keep-genes -o "${prefix}.gffread.gff"
 
 # make protein faa for orthofinder