PiotrTymoszuk
diff --git a/‎NAMESPACE‎
Lines changed: 5 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎R/classes.R‎
Lines changed: 90 additions & 1 deletion b/‎R/classes.R‎
Lines changed: 90 additions & 1 deletion
diff --git a/‎R/data_desc.R‎
Lines changed: 2 additions & 54 deletions b/‎R/data_desc.R‎
Lines changed: 2 additions & 54 deletions
diff --git a/‎R/extraction_funs.R‎
Lines changed: 199 additions & 0 deletions b/‎R/extraction_funs.R‎
Lines changed: 199 additions & 0 deletions
@@ -1,8 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
+export(as_reactDB)
+export(extract_genes)
 export(geneSBML)
 export(is_geneSBML)
 export(is_memoSaver)
+export(is_reactDB)
+export(reactDB)
 importFrom(Rcpp,sourceCpp)
 importFrom(dplyr,all_of)
 importFrom(dplyr,arrange)
@@ -56,6 +60,7 @@ importFrom(rlang,`.env`)
 importFrom(rlang,set_names)
 importFrom(stats,complete.cases)
 importFrom(stats,median)
+importFrom(stats,na.omit)
 importFrom(stats,p.adjust)
 importFrom(stats,pnorm)
 importFrom(stats,qnorm)
 
@@ -1,6 +1,6 @@
 # S3 class definitions
 
-# geneSBML class ------
+# `geneSBML` class ------
 
 #' Create a `geneSBML` object.
 #'
@@ -106,4 +106,93 @@
 
   is_memoSaver <- function(x) inherits(x, "geneSBML") & inherits(x, "memoSaver")
 
+# `reactDB` class --------
+
+#' Database of reaction annotation: `reactDB` class.
+#'
+#' @description
+#' Constructs an instance of a reaction annotation database data frame with
+#' processed gene - reaction association rules and gene - reaction rule evaluation
+#' expressions.
+#'
+#' @details
+#' The function conducts basic validation for conformity with reaction
+#' evaluation tools: it checks that `x` is a data frame, that all obligatory
+#' columns are present, and that they have the expected types.
+#' An error is raised if any of these checks fails.
+#' Objects which already inherit from the `reactDB` class are returned
+#' unchanged.
+#' The input and output data frames have the following columns:
+#' * __id__: reaction identifier beginning with `"R_"` string.
+#' * __name__: character strings with reaction names.
+#' * __subsystem__: character strings with names of Recon subsystems.
+#' * __gene_association__: character strings with gene - reaction association rules.
+#' * __entrez_id__: a column with lists of Entrez IDs of genes associated with
+#' the reactions.
+#' * __exprs__: a column with `NULL`, R symbols or R language expressions used
+#' to evaluate the gene - reaction association rules.
+#'
+#' @return a data frame of class `reactDB`.
+#'
+#' @param x a data frame with columns specified in Details.
+#'
+#' @md
+#' @export
+
+  reactDB <- function(x) {
+
+    ## input controls --------
+
+    ### instances of the class are passed through without re-validation
+
+    if(is_reactDB(x)) return(x)
+
+    if(!is.data.frame(x)) stop("`x` has to be a data frame.", call. = FALSE)
+
+    fix_cols <-
+      c("id", "name", "subsystem", "gene_association", "entrez_id", "exprs")
+
+    missing_cols <- setdiff(fix_cols, names(x))
+
+    if(length(missing_cols) > 0) {
+
+      stop(paste("The following obligatory columns are missing from `x`:",
+                 paste(missing_cols, collapse = ", ")),
+           call. = FALSE)
+
+    }
+
+    char_cols <- c("id", "name", "subsystem", "gene_association")
+
+    class_check <- map_lgl(x[char_cols], is.character)
+
+    if(any(!class_check)) {
+
+      ### `collapse` has to be an argument of the inner `paste()`, so that
+      ### all offending columns are listed in one comma-separated message
+
+      stop(paste("The following obligatory column in `x` must be of character type:",
+                 paste(char_cols[!class_check], collapse = ", ")),
+           call. = FALSE)
+
+    }
+
+    if(!is.list(x[["entrez_id"]])) {
+
+      stop("Column `entrez_id` must be a list of Entrez IDs.", call. = FALSE)
+
+    }
+
+    if(!is.list(x[["exprs"]])) stop("Column `exprs` must be a list.", call. = FALSE)
+
+    ### each element of `exprs` is tested separately: scalar `||` applies
+
+    class_check <-
+      map_lgl(x[["exprs"]],
+              function(expr) is.call(expr) || is.name(expr) || is.null(expr))
+
+    if(any(!class_check)) {
+
+      stop(paste("Unrecognized objects in `exprs` columns.",
+                 "The allowed formats are NULL, R calls, or names."),
+           call. = FALSE)
+
+    }
+
+    ## the structure -------
+
+    ### the new class is prepended, the original data frame classes are kept
+
+    structure(x, class = c("reactDB", class(x)))
+
+  }
+
+#' @rdname reactDB
+#' @export
+
+  # class predicate: `TRUE` only if "reactDB" is among the classes of `x`
+  is_reactDB <- function(x) inherits(x, what = "reactDB", which = FALSE)
+
+
 # END ------
@@ -33,58 +33,6 @@
 
 # Reaction annotation data -------
 
-#' BiGG reaction annotation data.
-#'
-#' @description
-#' A data frame with BiGG database annotation of metabolic
-#' reactions.
-#'
-#' @format
-#' A data frame with 28301 rows and 6 variables:
-#' * __bigg_id__: BiGG reaction identifier
-#' * __name__: reaction name
-#' * __reaction_string__: reaction equation
-#' * __model_list__: list of BiGG models containing the reaction
-#' * __database_links__: links to the BiGG on-line database
-#' * __old_bigg_ids__: legacy BiGG ID
-#'
-#' @source BiGG: http://bigg.ucsd.edu/data_access
-#'
-#' @docType data
-#'
-#' @name reactions
-#'
-#' @usage data(reactions)
-
-  NULL
-
-# Metabolite annotation data ------
-
-#' BiGG metabolite annotation data.
-#'
-#' @description
-#' A data frame with BiGG database annotation of metabolites.
-#'
-#' @format A data frame with 15724 rows and 6 variables:
-#' * __bigg_id__: BiGG metabolite identifier
-#' * __universal_bigg_id__: universal BiGG metabolite identifier
-#' * __name__: metabolite name
-#' * __model_list__: list of models with containing the metabolite
-#' * __database_links__: links to the BiGG on-line database
-#' * __old_bigg_ids__: legacy BiGG ID
-#'
-#' @source BiGG: http://bigg.ucsd.edu/data_access
-#'
-#' @docType data
-#'
-#' @name metabolites
-#'
-#' @usage data(metabolites)
-
-  NULL
-
-# Reaction annotation data -------
-
 #' Reaction association rules for the Recon2 model.
 #'
 #' @description
@@ -141,10 +89,10 @@
 #'
 #' @docType data
 #'
-#' @name Recon2D
+#' @name Recon2_2D
 #'
 #' @md
-#' @usage data(Recon2D)
+#' @usage data(Recon2_2D)
 
 NULL
 
 
@@ -0,0 +1,199 @@
+# Extraction of features from Recon data frames with reaction annotation
+
+# Mapping of reactions to gene identifiers -------
+
+#' Extract gene identifiers from reaction annotations.
+#'
+#' @description
+#' The functions extract Entrez ID gene identifiers mapped to all or
+#' user-specified reactions present in a data frame with reaction annotation
+#' and gene - reaction association rules.
+#' Entrez IDs of genes associated with reactions are listed, and character
+#' strings with the gene - reaction association rules are translated to R
+#' expressions.
+#' `as_reactDB()` offers a simplified tool for generation of
+#' \code{\link{reactDB}} data frames,
+#' while `extract_genes()` allows for selection of reactions of interest and
+#' parsing error diagnostics.
+#'
+#' @details
+#' Association rules which cannot be parsed as R expressions get no evaluation
+#' expression in the output and a parsing warning is raised.
+#' Reactions with missing, empty or space-only identifiers or association
+#' rules are kept in the output, but without gene identifiers and evaluation
+#' expressions.
+#' The gene association rules have to operate with "bare" Entrez IDs without
+#' the version information (i.e. numbers after a dot); an error is raised
+#' otherwise.
+#'
+#' @return a data frame of class \code{\link{reactDB}} containing reaction
+#' IDs (`id`), reaction names (`name`), subsystem information (`subsystem`),
+#' gene - reaction association rules (`gene_association`),
+#' list of gene identifiers (`entrez_id`) and a list of expressions (`exprs`)
+#' to be evaluated when calculating reaction activity estimates.
+#' If `inspect_errors = TRUE`, a list of parsing errors named with
+#' reaction identifiers is returned instead.
+#'
+#' @param x a data frame with the following obligatory columns: `id` with reaction
+#' identifiers, `name` with reaction names, `subsystem` with subsystem assignment,
+#' and `gene_association` with character strings with gene association rules.
+#' @param react_id BiGG reaction ID, with or without the leading 'R_' string.
+#' Defaults to `NULL`, which means that all reactions are mapped to genes.
+#' @param inspect_errors logical. If `TRUE`, the function returns parsing errors.
+#' @param ... additional arguments passed to `extract_genes()`.
+#'
+#' @export
+
+  extract_genes <- function(x,
+                            react_id = NULL,
+                            inspect_errors = FALSE) {
+
+    ## entry control ----------
+
+    if(!is.data.frame(x)) stop("`x` has to be a data frame.", call. = FALSE)
+
+    fix_cols <- c("id", "name", "subsystem", "gene_association")
+
+    missing_cols <- setdiff(fix_cols, names(x))
+
+    if(length(missing_cols) > 0) {
+
+      stop(paste("The following obligatory columns are missing from `x`:",
+                 paste(missing_cols, collapse = ", ")),
+           call. = FALSE)
+
+    }
+
+    x <- x[, fix_cols]
+
+    class_check <- map_lgl(x, is.character)
+
+    if(any(!class_check)) {
+
+      ### `collapse` has to be an argument of the inner `paste()`, so that
+      ### all offending columns are listed in one comma-separated message
+
+      stop(paste("The following obligatory column in `x` must be of character type:",
+                 paste(fix_cols[!class_check], collapse = ", ")),
+           call. = FALSE)
+
+    }
+
+    ### versioned identifiers (digits, dot, digits) are not supported
+
+    if(any(stri_detect(na.omit(x[["gene_association"]]),
+                       regex = "\\d+\\.\\d+"))) {
+
+      stop(paste("Gene association rules contain identifiers with",
+                 "version information, i.e. number after a dot.",
+                 "Please remove it to proceed."),
+           call. = FALSE)
+
+    }
+
+    if(!is.null(react_id)) {
+
+      stopifnot(is.character(react_id))
+
+      ### the leading 'R_' is added only to identifiers lacking it
+
+      react_id <-
+        ifelse(!stri_detect(react_id, regex = '^R_'),
+               paste0('R_', react_id), react_id)
+
+      x <- filter(x, .data[["id"]] %in% react_id)
+
+      if(nrow(x) == 0) {
+
+        stop("No reactions to process after filtering with `react_id`.",
+             call. = FALSE)
+
+      }
+
+    }
+
+    stopifnot(is.logical(inspect_errors))
+    inspect_errors <- inspect_errors[1]
+
+    ### `NA` and zero-length input would fail later in `if()` with
+    ### an uninformative message
+
+    if(is.na(inspect_errors)) {
+
+      stop("`inspect_errors` has to be either TRUE or FALSE.", call. = FALSE)
+
+    }
+
+    ## filtering the reaction annotation df ---------
+
+    ### getting rid of empty rules and space-only rules in the annotation
+    ### data frame
+
+    proc_df <-
+      filter(x[, c("id", "gene_association")],
+             !is.na(.data[["id"]]),
+             .data[["id"]] != "",
+             !is.na(.data[["gene_association"]]),
+             .data[["gene_association"]] != "",
+             !stri_detect(.data[["gene_association"]],
+                          regex = "^\\s+$"))
+
+    ### removal of leading and trailing spaces in the gene association rules
+
+    proc_df[["gene_association"]] <-
+      stri_replace_all(proc_df[["gene_association"]],
+                       regex = "^\\s+|\\s+$",
+                       replacement = "")
+
+    ## extraction of the Entrez IDs --------
+
+    ### all runs of digits in a rule, as a vector of unique identifiers
+
+    proc_df[["entrez_id"]] <-
+      map(proc_df[["gene_association"]],
+          function(rule) unique(unlist(stri_extract_all(rule, regex = "\\d+"))))
+
+    ## translation of gene assignment rules to R expressions ------
+
+    exp_lst <-
+      map(set_names(proc_df[["gene_association"]],
+                    proc_df[["id"]]),
+          escape_numbers)
+
+    exp_lst <-
+      map(exp_lst,
+          stri_replace_all,
+          regex = "and|AND",
+          replacement = "%AND%")
+
+    exp_lst <-
+      map(exp_lst,
+          stri_replace_all,
+          regex = "or|OR",
+          replacement = "%OR%")
+
+    ### `safely()` captures parsing failures instead of aborting the run
+
+    exp_lst <- map(exp_lst, safely(str2lang))
+
+    parse_errors <- compact(map(exp_lst, ~.x$error))
+
+    if(length(parse_errors) > 0) {
+
+      warning(paste('There were', length(parse_errors), 'parsing errors.'),
+              call. = FALSE)
+
+    }
+
+    if(inspect_errors) return(parse_errors)
+
+    ### only successfully parsed rules are kept
+
+    exp_lst <- compact(map(exp_lst, ~.x$result))
+
+    ## output -------
+
+    map_df <- tibble(id = names(exp_lst),
+                     exprs = exp_lst)
+
+    out_df <- left_join(proc_df, map_df, by = "id")
+
+    ### joining with the complete annotation: reactions without valid rules
+    ### stay in the output
+
+    out_tbl <- left_join(x[, c("id", "name", "subsystem")],
+                         out_df,
+                         by = "id")
+
+    reactDB(out_tbl)
+
+  }
+
+#' @rdname extract_genes
+#' @export
+
+  # thin alias: `x` and all further arguments are forwarded to `extract_genes()`
+  as_reactDB <- function(x, ...) extract_genes(x = x, ...)
+
+# END ------