Skip to content

Commit e188f4c

Browse files
committed
Gene extractor and gene association rule translation functions. Class reactDB for storage of ready-to-use gene annotation objects. Removal of redundant data frames with metabolite and reaction association information.
1 parent 3f8078c commit e188f4c

17 files changed

+470
-129
lines changed

NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
export(as_reactDB)
4+
export(extract_genes)
35
export(geneSBML)
46
export(is_geneSBML)
57
export(is_memoSaver)
8+
export(is_reactDB)
9+
export(reactDB)
610
importFrom(Rcpp,sourceCpp)
711
importFrom(dplyr,all_of)
812
importFrom(dplyr,arrange)
@@ -56,6 +60,7 @@ importFrom(rlang,`.env`)
5660
importFrom(rlang,set_names)
5761
importFrom(stats,complete.cases)
5862
importFrom(stats,median)
63+
importFrom(stats,na.omit)
5964
importFrom(stats,p.adjust)
6065
importFrom(stats,pnorm)
6166
importFrom(stats,qnorm)

R/classes.R

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# S3 class definitions
22

3-
# geneSBML class ------
3+
# `geneSBML` class ------
44

55
#' Create a `geneSBML` object.
66
#'
@@ -106,4 +106,93 @@
106106

107107
is_memoSaver <- function(x) inherits(x, "geneSBML") & inherits(x, "memoSaver")
108108

109+
# `reactDB` class --------
110+
111+
#' Database of reaction annotation: `reactDB` class.
112+
#'
113+
#' @description
114+
#' Constructs an instance of a reaction annotation database data frame with
115+
#' processed gene - reaction association rules and gene - reaction rule evaluation
116+
#' expressions.
117+
#'
118+
#' @details
119+
#' The function conducts basic validation for conformity with reaction
120+
#' evaluation tools.
121+
#' The input and output data frames have the following columns:
122+
#' * __id__: reaction identifier beginning with `"R_"` string.
123+
#' * __name__: character strings with reaction names.
124+
#' * __subsystem__: character strings with names of Recon subsystems.
125+
#' * __gene_association__: character strings with gene - reaction association rules.
126+
#' * __entrez_id__: a column with lists of Entrez IDs of genes associated with
127+
#' the reactions.
128+
#' * __exprs__: a column with `NULL`, R symbols or R language expressions used
129+
#' to evaluate the gene - reaction association rules.
130+
#'
131+
#' @return a data frame of class `reactDB`.
132+
#'
133+
#' @param x a data frame with columns specified in Details.
134+
#'
135+
#' @md
136+
#' @export
137+
138+
reactDB <- function(x) {
139+
140+
## input controls --------
141+
142+
if(is_reactDB(x)) return(x)
143+
144+
if(!is.data.frame(x)) stop("`x` has to be a data frame.", call. = FALSE)
145+
146+
fix_cols <-
147+
c("id", "name", "subsystem", "gene_association", "entrez_id", "exprs" )
148+
149+
missing_cols <- setdiff(fix_cols, names(x))
150+
151+
if(length(missing_cols) > 0) {
152+
153+
stop(paste("The following obligatory columns are missing from `x`:",
154+
paste(missing_cols, collapse = ", ")),
155+
call. = FALSE)
156+
157+
}
158+
159+
char_cols <- c("id", "name", "subsystem", "gene_association")
160+
161+
class_check <- map_lgl(x[char_cols], is.character)
162+
163+
if(any(!class_check)) {
164+
165+
stop(paste("The following obligatory column in `x` must be of character type:",
166+
paste(char_cols[!class_check]), collapse = ", "),
167+
call. = FALSE)
168+
169+
}
170+
171+
if(!is.list(x[["entrez_id"]])) stop("Column `entrez_id` must be a list of Entrez IDs.", call. = FALSE)
172+
173+
if(!is.list(x[["exprs"]])) stop("Column `exprs` must be a list.", call. = FALSE)
174+
175+
class_check <-
176+
map_lgl(x[["exprs"]], function(x) is.call(x) | is.name(x) | is.null(x))
177+
178+
if(any(!class_check)) {
179+
180+
stop(paste("Unrecognized objects in `exprs` columns.",
181+
"The allowed formats are NULL, R calls, or names."),
182+
call. = FALSE)
183+
184+
}
185+
186+
## the structure -------
187+
188+
structure(x, class = c("reactDB", class(x)))
189+
190+
}
191+
192+
#' @rdname reactDB
193+
#' @export
194+
195+
is_reactDB <- function(x) inherits(x, "reactDB")
196+
197+
109198
# END ------

R/data_desc.R

Lines changed: 2 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -33,58 +33,6 @@
3333

3434
# Reaction annotation data -------
3535

36-
#' BiGG reaction annotation data.
37-
#'
38-
#' @description
39-
#' A data frame with BiGG database annotation of metabolic
40-
#' reactions.
41-
#'
42-
#' @format
43-
#' A data frame with 28301 rows and 6 variables:
44-
#' * __bigg_id__: BiGG reaction identifier
45-
#' * __name__: reaction name
46-
#' * __reaction_string__: reaction equation
47-
#' * __model_list__: list of BiGG models containing the reaction
48-
#' * __database_links__: links to the BiGG on-line database
49-
#' * __old_bigg_ids__: legacy BiGG ID
50-
#'
51-
#' @source BiGG: http://bigg.ucsd.edu/data_access
52-
#'
53-
#' @docType data
54-
#'
55-
#' @name reactions
56-
#'
57-
#' @usage data(reactions)
58-
59-
NULL
60-
61-
# Metabolite annotation data ------
62-
63-
#' BiGG metabolite annotation data.
64-
#'
65-
#' @description
66-
#' A data frame with BiGG database annotation of metabolites.
67-
#'
68-
#' @format A data frame with 15724 rows and 6 variables:
69-
#' * __bigg_id__: BiGG metabolite identifier
70-
#' * __universal_bigg_id__: universal BiGG metabolite identifier
71-
#' * __name__: metabolite name
72-
#' * __model_list__: list of models with containing the metabolite
73-
#' * __database_links__: links to the BiGG on-line database
74-
#' * __old_bigg_ids__: legacy BiGG ID
75-
#'
76-
#' @source BiGG: http://bigg.ucsd.edu/data_access
77-
#'
78-
#' @docType data
79-
#'
80-
#' @name metabolites
81-
#'
82-
#' @usage data(metabolites)
83-
84-
NULL
85-
86-
# Reaction annotation data -------
87-
8836
#' Reaction association rules for the Recon2 model.
8937
#'
9038
#' @description
@@ -141,10 +89,10 @@
14189
#'
14290
#' @docType data
14391
#'
144-
#' @name Recon2D
92+
#' @name Recon2_2D
14593
#'
14694
#' @md
147-
#' @usage data(Recon2D)
95+
#' @usage data(Recon2_2D)
14896

14997
NULL
15098

R/extraction_funs.R

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
# Extraction of features from Recon data frames with reaction annotation
2+
3+
# Mapping of reactions to gene identifiers -------
4+
5+
#' Extract gene identifiers from reaction annotations.
6+
#'
7+
#' @description
8+
#' The functions extract Entrez ID gene identifiers mapped to all or
9+
#' user-specified reactions present in a data frame with reaction annotation
10+
#' and gene - reaction association rules.
11+
#' Entrez IDs of genes associated with reactions are listed, and character
12+
#' strings with the gene - reaction association rules are translated to R
13+
#' expressions.
14+
#' `as_reactDB()` offers a simplified tools for generation of
15+
#' \code{\link{reactDB}} data frames,
16+
#' while `extract_genes()` allows for selection of reactions of interest and
17+
#' parsing error diagnostics.
18+
#'
19+
#' @details
20+
#' Association rules with unrecognized Entrez IDs are removed from the output
21+
#' and a parsing warning is raised.
22+
#' The gene association rules have to operate with "bare" Entrez IDs without
23+
#' the version information (i.e. numbers after a dot).
24+
#'
25+
#' @return a data frame of class \code{\link{reactDB}} containing reaction
26+
#' IDs (`id`), reaction names (`name`), subsystem information (`subsystem`),
27+
#' list of gene identifiers (`entrez_id`) and a list of expressions to be
28+
#' evaluated by at calculating reaction activity estimates.
29+
#'
30+
#' @param x a data frame with the following obligatory columns: `id` with reaction
31+
#' identifiers, `name` with reaction names, `subsystem` with subsystem assignment,
32+
#' and `gene_association` with character strings with gene association rules.
33+
#' @param react_id BiGG reaction ID, with or without the leading 'R_' string.
34+
#' Defaults to `NULL`, which means that all reactions are mapped to genes.
35+
#' @param inspect_errors logical. If `TRUE`, the function returns parsing errors.
36+
#' @param ... additional arguments passed to `extract_genes()`.
37+
#'
38+
#' @export
39+
40+
extract_genes <- function(x,
41+
react_id = NULL,
42+
inspect_errors = FALSE) {
43+
44+
## entry control ----------
45+
46+
if(!is.data.frame(x)) stop("`x` has to be a data frame.", call. = FALSE)
47+
48+
fix_cols <- c("id", "name", "subsystem", "gene_association")
49+
50+
missing_cols <- setdiff(fix_cols, names(x))
51+
52+
if(length(missing_cols) > 0) {
53+
54+
stop(paste("The following obligatory columns are missing from `x`:",
55+
paste(missing_cols, collapse = ", ")),
56+
call. = FALSE)
57+
58+
}
59+
60+
x <- x[, fix_cols]
61+
62+
class_check <- map_lgl(x, is.character)
63+
64+
if(any(!class_check)) {
65+
66+
stop(paste("The following obligatory column in `x` must be of character type:",
67+
paste(fix_cols[!class_check]), collapse = ", "),
68+
call. = FALSE)
69+
70+
}
71+
72+
if(any(stri_detect(na.omit(x[["gene_association"]]),
73+
regex = "\\d+\\.\\d+"))) {
74+
75+
stop(paste("Gene association rules contain identifiers with",
76+
"version information, i.e. number after a dot.",
77+
"Please remove it to proceed."),
78+
call. = FALSE)
79+
80+
}
81+
82+
if(!is.null(react_id)) {
83+
84+
stopifnot(is.character(react_id))
85+
86+
react_id <-
87+
ifelse(!stri_detect(react_id, regex = '^R_'),
88+
paste0('R_', react_id), react_id)
89+
90+
x <- filter(x, .data[["id"]] %in% react_id)
91+
92+
if(nrow(x) == 0) {
93+
94+
stop("No reactions to precess after filtering with `react_id`.",
95+
call. = FALSE)
96+
97+
}
98+
99+
}
100+
101+
stopifnot(is.logical(inspect_errors))
102+
inspect_errors <- inspect_errors[1]
103+
104+
## filtering the reaction annotation df ---------
105+
106+
### getting rid of empty rules and space-only rules in the annotation
107+
### data frame
108+
109+
proc_df <-
110+
filter(x[, c("id", "gene_association")],
111+
!is.na(.data[["id"]]),
112+
.data[["id"]] != "",
113+
!is.na(.data[["gene_association"]]),
114+
.data[["gene_association"]] != "",
115+
!stri_detect(.data[["gene_association"]],
116+
regex = "^\\s+$"))
117+
118+
### removal of leading and trailing spaces in the gene association rules
119+
120+
proc_df[["gene_association"]] <-
121+
stri_replace_all(proc_df[["gene_association"]],
122+
regex = "^\\s+",
123+
replacement = "")
124+
125+
proc_df[["gene_association"]] <-
126+
stri_replace_all(proc_df[["gene_association"]],
127+
regex = "\\s+$",
128+
replacement = "")
129+
130+
## extraction of the Entrez IDs --------
131+
132+
proc_df[["entrez_id"]] <-
133+
map(proc_df[["gene_association"]],
134+
stri_extract_all, regex = "\\d+")
135+
136+
proc_df[["entrez_id"]] <- map(proc_df[["entrez_id"]],
137+
unlist)
138+
139+
proc_df[["entrez_id"]] <- map(proc_df[["entrez_id"]],
140+
unique)
141+
142+
## translation of gene assignment rules to R expressions ------
143+
144+
exp_lst <-
145+
map(set_names(proc_df[["gene_association"]],
146+
proc_df[["id"]]),
147+
escape_numbers)
148+
149+
exp_lst <-
150+
map(exp_lst,
151+
stri_replace_all,
152+
regex = "and|AND",
153+
replacement = "%AND%")
154+
155+
exp_lst <-
156+
map(exp_lst,
157+
stri_replace_all,
158+
regex = "or|OR",
159+
replacement = "%OR%")
160+
161+
exp_lst <- map(exp_lst, safely(str2lang))
162+
163+
parse_errors <- compact(map(exp_lst, ~.x$error))
164+
165+
if(length(parse_errors) > 0) {
166+
167+
warning(paste('There were', length(parse_errors), 'parsing errors.'),
168+
call. = FALSE)
169+
170+
}
171+
172+
if(inspect_errors) return(parse_errors)
173+
174+
exp_lst <- compact(map(exp_lst, ~.x$result))
175+
176+
## output -------
177+
178+
id <- NULL
179+
exprs <- NULL
180+
181+
map_df <- tibble(id = names(exp_lst),
182+
exprs = exp_lst)
183+
184+
out_df <- left_join(proc_df, map_df, by = "id")
185+
186+
out_tbl <- left_join(x[, c("id", "name", "subsystem")],
187+
out_df,
188+
by = "id")
189+
190+
return(reactDB(out_tbl))
191+
192+
}
193+
194+
#' @rdname extract_genes
195+
#' @export
196+
197+
as_reactDB <- function(x, ...) extract_genes(x, ...)
198+
199+
# END ------

0 commit comments

Comments
 (0)