Skip to content

Commit e7197fb

Browse files
committed
Make GFF parsing more robust to variations in gene nomenclature
1 parent 070517b commit e7197fb

File tree

1 file changed

+54
-19
lines changed

1 file changed

+54
-19
lines changed

bin/count_reads.R

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,33 @@ threads <- opt$threads
4545

4646

4747

48-
## ------------------------------------------------------------------------------
48+
## -----------------------------------------------------------------------------
49+
## Functions
50+
## -----------------------------------------------------------------------------
51+
parse_gff_attributes <- function(
52+
attributes_string, attribute_name, start_pos, end_pos, default_val = NULL) {
53+
attr_pairs_raw <- stringr::str_split(attributes_string, ";")[[1]]
54+
attr_pairs <- stringr::str_trim(
55+
stringr::str_replace_all(attr_pairs_raw, "[\\s\\p{Zs}\\p{C}]", " "))
56+
search_pattern_regex <- paste0("^", attribute_name, "=")
57+
target_attr <- attr_pairs[grepl(search_pattern_regex, attr_pairs)]
58+
if (length(target_attr) > 0) {
59+
value <- stringr::str_remove(target_attr[1], search_pattern_regex)
60+
return(value)
61+
} else {
62+
if (!is.null(default_val)) {
63+
return(default_val)
64+
} else {
65+
default_value_generated <- paste0("unknown_", start_pos, "_", end_pos)
66+
return(default_value_generated)
67+
}
68+
}
69+
}
70+
71+
72+
## -----------------------------------------------------------------------------
4973
## Read data
50-
## ------------------------------------------------------------------------------
74+
## -----------------------------------------------------------------------------
5175
meta_tab <- read_tsv(meta_f)
5276

5377
total_counts_list <- map(meta_tab$sample, function(x) {
@@ -59,25 +83,36 @@ merged_total_counts <- bind_rows(total_counts_list)
5983

6084

6185

62-
## ------------------------------------------------------------------------------
86+
## -----------------------------------------------------------------------------
6387
## Read genome annotation
64-
## ------------------------------------------------------------------------------
88+
## -----------------------------------------------------------------------------
6589
ref_annot <- ape::read.gff(gff_f, na.strings = c(".", "?"), GFF3 = TRUE)
6690

6791
ref_annot <- subset(ref_annot, type == "gene")
6892

6993
gene_attr <- stringr::str_split(ref_annot$attributes, ";")
70-
locus_tags <- unlist(lapply(gene_attr, function(x) {
71-
x[grepl("^locus_tag", x)]
72-
}))
73-
gene_biotypes <- unlist(lapply(gene_attr, function(x) {
74-
x[grepl("^gene_biotype", x)]
75-
}))
76-
common_gene_names <- unlist(lapply(gene_attr, function(x) {
77-
x <- x[grepl("^gene=", x)]
78-
x[identical(x, character(0))] <- ""
79-
x
80-
}))
94+
95+
locus_tags <- unname(mapply(parse_gff_attributes,
96+
ref_annot$attributes,
97+
attribute_name = "locus_tag",
98+
start_pos = ref_annot$start,
99+
end_pos = ref_annot$end))
100+
101+
gene_biotypes <- unname(mapply(parse_gff_attributes,
102+
ref_annot$attributes,
103+
attribute_name = "gene_biotype",
104+
start_pos = ref_annot$start,
105+
end_pos = ref_annot$end,
106+
"unknown"))
107+
108+
common_gene_names <- unname(mapply(parse_gff_attributes,
109+
ref_annot$attributes,
110+
attribute_name = "gene",
111+
start_pos = ref_annot$start,
112+
end_pos = ref_annot$end,
113+
"unknown"))
114+
115+
81116
gene_lengths <- (ref_annot$end - ref_annot$start) + 1
82117

83118
ref_gene_df <- data.frame(
@@ -93,9 +128,9 @@ ref_gene_df$gene_name <- sub("^.*=", "", ref_gene_df$gene_name)
93128
write_tsv(ref_gene_df, "ref_gene_df.tsv")
94129

95130

96-
## ------------------------------------------------------------------------------
131+
## -----------------------------------------------------------------------------
97132
## Count reads mapping to genes
98-
## ------------------------------------------------------------------------------
133+
## -----------------------------------------------------------------------------
99134
bamfilesCount <- paste0(meta_tab$sample, ".bam")
100135

101136
# 0 (unstranded), 1 (stranded) and 2 (reversely stranded)
@@ -132,9 +167,9 @@ gene_counts_pc <- counts_mat[ref_gene_df$biotype == "protein_coding", ]
132167
write_tsv(gene_counts_pc, "gene_counts_pc.tsv")
133168

134169

135-
## ------------------------------------------------------------------------------
170+
## -----------------------------------------------------------------------------
136171
## Plot library composition
137-
## ------------------------------------------------------------------------------
172+
## -----------------------------------------------------------------------------
138173
## set up plots
139174
brewer_pallette1 <- brewer.pal(9, "Set1")
140175
brewer_pallette3 <- brewer.pal(8, "Dark2")

0 commit comments

Comments
 (0)