|
| 1 | +library(optparse) |
| 2 | +library(rtracklayer) |
| 3 | + |
| 4 | +option_list <- list( |
| 5 | + make_option(c("-g", "--gff"), type = "character", default = "FALSE", |
| 6 | + help = "The gff file for the reference assembly after initial cleaning", |
| 7 | + metavar = "character") |
| 8 | +) |
| 9 | + |
| 10 | +opt_parser <- OptionParser(option_list = option_list) |
| 11 | +opt <- parse_args(opt_parser) |
| 12 | + |
| 13 | +gffname <- opt[[1]] |
| 14 | + |
| 15 | +# $prefix.full.gffread.gff |
| 16 | + |
| 17 | +prefix <- gsub(".full.gffread.gff","",gffname) |
| 18 | + |
| 19 | +# Read in GFF |
| 20 | + |
| 21 | +gff <- as.data.frame(rtracklayer::readGFF(gffname)) |
| 22 | + |
| 23 | +# Isolate the ID, Name, and type columns. This will create a key that we will use later to keep track of which names match with which IDs |
| 24 | + |
| 25 | +key <- gff[,c("ID","Name","type")] |
| 26 | + |
| 27 | +# Only keep gene and transcript features. We can do this by using `grepl` to return a vector of TRUE or FALSE values and using this to isolate the rows in the key. The following `grepl` statement will match any feature that contains "gene", "transcript", or "RNA" (so will match "pseudogene", "ncRNA", etc.). |
| 28 | + |
| 29 | +key <- key[grepl("gene|transcript|RNA", key$type),] |
| 30 | + |
| 31 | +# Since genes and pseudogenes share the same gene name but we want gene IDs to be unique, we can add "pseudo" after the gene name if something is a pseudogene. |
| 32 | + |
| 33 | +key$Name[key$type == "pseudogene"] <- gsub("$","-pseudo", |
| 34 | + key$Name[key$type == "pseudogene"]) |
| 35 | + |
| 36 | +# Remove anything that has NA in the "Name" column |
| 37 | + |
| 38 | +key <- key[!is.na(key$Name),] |
| 39 | + |
| 40 | +# If a feature is not a gene, add "rna-" in front of the name |
| 41 | + |
| 42 | +key$Name[grepl("transcript|RNA", key$type)] <- gsub("^","rna-", |
| 43 | + key$Name[grepl("transcript|RNA", key$type)]) |
| 44 | + |
| 45 | +# For any features that still don't have a unique gene name, add a copy number to the end |
| 46 | + |
| 47 | +key$Name <- make.unique(key$Name, sep = "-copy") |
| 48 | + |
| 49 | +# Now we want to replace the IDs and Parent features with the new unique names. Whenever there is an exact match between something in the ID or Parent column in the GFF with the ID in the key, the new name will replace the old ID or Parent feature. |
| 50 | + |
| 51 | +# First, create a named vector for fast lookup |
| 52 | +id_to_name <- setNames(key$Name, key$ID) |
| 53 | +# Replace values in gff$ID if they match any key$ID |
| 54 | +gff$ID <- ifelse(gff$ID %in% key$ID, id_to_name[gff$ID], gff$ID) |
| 55 | +# Unlist Parent |
| 56 | +gff$Parent <- sapply(gff$Parent, function(x) { |
| 57 | + if (length(x) == 0) NA_character_ else as.character(x) |
| 58 | +}) |
| 59 | +# Replace values in gff$Parent if they match any key$ID |
| 60 | +gff$Parent <- ifelse(gff$Parent %in% key$ID, id_to_name[gff$Parent], gff$Parent) |
| 61 | + |
| 62 | +rtracklayer::export.gff3(gff, |
| 63 | + paste0(prefix,".gffread.F.nameIDs.gff3"), |
| 64 | + format = "gff3") |
0 commit comments