multimeric
diff --git a/DESCRIPTION b/DESCRIPTION
Lines changed: 3 additions & 3 deletions
diff --git a/NEWS.md b/NEWS.md
Lines changed: 2 additions & 1 deletion
diff --git a/R/internal_utils.R b/R/internal_utils.R
Lines changed: 1 addition & 1 deletion
diff --git a/R/multiqc.R b/R/multiqc.R
Lines changed: 18 additions & 13 deletions
diff --git a/R/plot_parsers.R b/R/plot_parsers.R
Lines changed: 70 additions & 0 deletions
diff --git a/R/plots.R b/R/plots.R
Lines changed: 9 additions & 71 deletions
diff --git a/_pkgdown.yml b/_pkgdown.yml
Lines changed: 5 additions & 1 deletion
diff --git a/man/load_multiqc.Rd b/man/load_multiqc.Rd
Lines changed: 11 additions & 8 deletions
diff --git a/man/parse_bar_graph.Rd b/man/parse_bar_graph.Rd
Lines changed: 22 additions & 0 deletions
@@ -1,7 +1,7 @@
 Package: TidyMultiqc
 Type: Package
 Title: Converts 'MultiQC' Reports into Tidy Data Frames
-Version: 0.1.1
+Version: 1.0.0
 Author: Michael Milton
 Maintainer: Michael Milton <[email protected]>
 Description: Provides the means to convert 'multiqc_data.json' files,
@@ -14,7 +14,6 @@ Encoding: UTF-8
 Imports: 
     assertthat,
     dplyr,
-    HistDat (>= 0.2.0),
     jsonlite,
     magrittr,
     purrr,
@@ -26,7 +25,8 @@ Suggests:
     testthat (>= 3.0.0),
     knitr,
     rmarkdown,
-    ggplot2
+    ggplot2,
+    HistDat
 Config/testthat/edition: 3
 RoxygenNote: 7.1.2
 Roxygen: list(markdown = TRUE)
 
@@ -2,8 +2,9 @@
 
 ## Breaking Changes
 
-* Removed the `plot_opts` key from the `load_multiqc` function. Instead, the plots are returns as list columns ie nested data frames inside the returned data frame. Users are then able to parse out summary statistics using normal `dplyr` and `tidyr` functions. Refer to the vignettes for examples. [[#1]](https://github.com/multimeric/TidyMultiqc/issues/1)
+* Removed the `plot_opts` key from the `load_multiqc` function. Instead, the plots are returned as list columns containing nested data frames inside the returned data frame. Users are then able to parse out summary statistics using normal `dplyr` and `tidyr` functions. Refer to the vignettes for examples. [[#1]](https://github.com/multimeric/TidyMultiqc/issues/1)
 * Renamed "plots" to "plot" in the `sections` argument. This ensures consistency with the data frame column names for plots, which are "plot.XX"
+* `metadata.sample_id` is now always the first column in the data frame, even if you have provided a metadata function
 
 ## New Features
 
 
@@ -34,4 +34,4 @@ sanitise_column_name <- function(name) {
     stringr::str_to_lower()
 }
 
-ROW_IDENTIFIER = "metadata.sample_id"
+ROW_IDENTIFIER <- "metadata.sample_id"
@@ -95,36 +95,39 @@ parse_metadata <- function(parsed, samples, find_metadata) {
 #' @param find_metadata A single function that will be called with a sample name and the
 #' parsed JSON for the entire report and returns a named list of metadata fields for the sample.
 #' Refer to the vignette for an example.
-#' @param sections A string vector of zero or more sections to include in the output. 
+#' @param sections A string vector of zero or more sections to include in the output.
 #' Each section can be:
 #' \describe{
 #' \item{"plot"}{Parse plot data. Note that you should also provide a list of plots via the `plots` argument}
 #' \item{"general"}{parse the general stat section}
 #' \item{"raw"}{Parse the raw data section}
 #' }
 #' This defaults to 'general', which tends to contain the most useful statistics
-#' @param plots A string vector, each of which contains the ID of a plot you 
+#' @param plots A string vector, each of which contains the ID of a plot you
 #' want to include in the output. You can use [TidyMultiqc::list_plots()] to help here.
-#' @param plot_parsers. [Advanced] A named list of custom parser functions. 
+#' @param plot_parsers **Advanced**. A named list of custom parser functions.
 #' The names of the list should correspond to plotly plot types, such as "xy_line", and the values should be functions
 #' that return a named list of named lists. For the return value, the outer list is named by the sample ID, and the inner list
 #' is named by the name of the column. Refer to the source code for some examples.
 #' @export
 #' @return A tibble (data.frame subclass) with QC data and metadata as columns, and samples as rows.
-#' Columns are named according to the respective section they belong to, 
+#' Columns are named according to the respective section they belong to,
 #' and will always be listed in the following order:
-#' \item{`metadata.X`}{This column contains metadata for this sample. 
+#' \item{`metadata.X`}{This column contains metadata for this sample.
 #' By default this is only the sample ID, but if you have provided the
-#'`find_metadata` argument, there may be more columns.}
+#' `find_metadata` argument, there may be more columns.}
 #' \item{`general.X`}{This column contains a generally useful summary statistic for each sample}
-#' \item{`plot.X`}{This column contains a data frame of plot data for each sample}
-#' \item{`raw.X`}{This column contains a raw summary statistic or value relating to each sample}
-#' }
+#' \item{`plot.X`}{This column contains a data frame of plot data for each sample.
+#' Refer to the plot parser documentation (i.e. the `parse_X` functions) for more information on the output format.}
+#' \item{`raw.X`}{This column contains a raw summary statistic or value relating to each sample }
+#' @seealso [TidyMultiqc::parse_xyline_plot()] [TidyMultiqc::parse_bar_graph()]
 #' @examples
 #' load_multiqc(system.file("extdata", "wgs/multiqc_data.json", package = "TidyMultiqc"))
 load_multiqc <- function(paths,
                          plots = NULL,
-                         find_metadata = function(...) { list() },
+                         find_metadata = function(...) {
+                           list()
+                         },
                          plot_parsers = list(),
                          sections = "general") {
   assertthat::assert_that(all(sections %in% c(
@@ -141,7 +144,7 @@ load_multiqc <- function(paths,
         purrr::map(~ switch(.,
           general = parse_general(parsed),
           raw = parse_raw(parsed),
-          plot = parse_plots(parsed, plots = plots, plot_parsers=plot_parsers)
+          plot = parse_plots(parsed, plots = plots, plot_parsers = plot_parsers)
         )) %>%
         purrr::reduce(~ purrr::list_merge(.x, !!!.y), .init = list()) %>%
         purrr::imap(~ purrr::list_merge(.x, metadata.sample_id = .y))
@@ -152,14 +155,16 @@ load_multiqc <- function(paths,
         dplyr::bind_rows()
     }) %>%
     # Only arrange the columns if we have at least 1 column
-      `if`(
+    `if`(
       # Move the columns into the order: metadata, general, plot, raw
       ncol(.) > 0,
       (.) %>%
         dplyr::relocate(dplyr::starts_with("raw")) %>%
         dplyr::relocate(dplyr::starts_with("plot")) %>%
         dplyr::relocate(dplyr::starts_with("general")) %>%
-        dplyr::relocate(dplyr::starts_with("metadata")),
+        dplyr::relocate(dplyr::starts_with("metadata")) %>%
+        # Always put the sample ID at the start
+        dplyr::relocate(metadata.sample_id),
       .
     )
 }
@@ -0,0 +1,70 @@
+#' Takes the JSON dictionary for an xyline plot, and returns a named list of
+#' data frames, one for each sample.
+#' @keywords internal
+#' @import rlang
+#' @keywords plot_parser
+#' @return A list of data frames, one for each sample.
+#' Each data frame will have two columns: x, and y.
+#' These correspond to the x and y coordinates in the plot.
+#' For example, for histogram data, the x values are values of the random
+#' variable, and the y values are the number of counts for that value.
+parse_xyline_plot <- function(plot_data, name) {
+  # This only works on xyline plots
+  assertthat::assert_that(plot_data$plot_type == "xy_line")
+
+  plot_data$datasets %>%
+    purrr::map(function(dataset) {
+      # For some reason there are two levels of nesting here
+      dataset %>%
+        kv_map(function(subdataset) {
+          name <- stringr::str_c("plot", name, sep = ".")
+          list(
+            key = subdataset$name,
+            value = subdataset$data %>%
+              purrr::map_dfr(~ list(x = .[[1]], y = .[[2]])) %>%
+              # Chop the multi-row data frame into one row
+              tidyr::nest({{ name }} := tidyr::everything()) # %>%
+          )
+        })
+    }) %>%
+    purrr::reduce(~ purrr::list_merge(.x, !!!.y))
+}
+
+#' Takes the JSON dictionary for a bar graph, and returns a named list of
+#' data frames, one for each sample.
+#' @keywords internal
+#' @import rlang
+#' @keywords plot_parser
+#' @return A list of data frames, one for each sample.
+#' Each data frame will have one column corresponding to each category in the bar chart.
+#' For example, for the plot "SnpEff: Counts by Genomic Region", we will have
+#' one column for the number of intron variants, one column for the number of exon variants, etc.
+#' This means that the number of columns will be fairly variable for different plots.
+parse_bar_graph <- function(plot_data, name) {
+  # This only works on bar_graphs
+  assertthat::assert_that(plot_data$plot_type == "bar_graph")
+
+  # Make a list of samples
+  samples <- plot_data$samples[[1]] %>% purrr::flatten_chr()
+
+  colname <- stringr::str_c("plot", sanitise_column_name(name), sep = ".")
+
+  plot_data$datasets[[1]] %>%
+    # First, build up a dictionary of samples -> dictionary of quality metrics
+    purrr::map(function(dataset) {
+      segment_name <- dataset$name
+      dataset$data %>%
+        # For this segment, each sample has a value
+        kv_map(function(value, idx) {
+          list(
+            key = samples[[idx]],
+            value = list(value) %>% purrr::set_names(sanitise_column_name(segment_name))
+          )
+        }, map_keys = TRUE)
+    }) %>%
+    purrr::reduce(utils::modifyList) %>%
+    # Then, convert each inner dictionary to a tibble row
+    purrr::map(tibble::as_tibble_row) %>%
+    # And nest each df so that we only have 1 cell of output per sample
+    purrr::map(~ tidyr::nest(., {{ colname }} := tidyr::everything()))
+}
@@ -1,67 +1,6 @@
-# Internal plot parsing functions
+# Plot parsing functions
 
-#' Takes the JSON dictionary for an xyline plot, and returns a named list of 
-#' data frames, one for each sample. 
-#' @keywords internal
-#' @import rlang
-#' @noRd
-parse_xyline_plot <- function(plot_data, name) {
-  # This only works on xyline plots
-  assertthat::assert_that(plot_data$plot_type == "xy_line")
-  
-  plot_data$datasets %>%
-    purrr::map(function(dataset) {
-      # For some reason there are two levels of nesting here
-      dataset %>%
-        kv_map(function(subdataset) {
-          name = stringr::str_c("plot", name, sep=".")
-          list(
-            key = subdataset$name,
-            value = subdataset$data %>%
-              purrr::map_dfr(~list(x=.[[1]], y=.[[2]])) %>%
-              # Chop the multi-row data frame into one row
-              tidyr::nest({{name}} := tidyr::everything()) #%>%
-          )
-        })
-    }) %>%
-    purrr::reduce(~ purrr::list_merge(.x, !!!.y))
-}
-
-#' Takes the JSON dictionary for a bar graph, and returns a named list of 
-#' data frames, one for each sample. 
-#' @keywords internal
-#' @import rlang
-#' @noRd
-parse_bar_graph <- function(plot_data, name) {
-  # This only works on bar_graphs
-  assertthat::assert_that(plot_data$plot_type == "bar_graph")
-
-  # Make a list of samples
-  samples <- plot_data$samples[[1]] %>% purrr::flatten_chr()
-  
-  colname = stringr::str_c("plot", sanitise_column_name(name), sep = ".") 
-  
-  plot_data$datasets[[1]] %>%
-    # First, build up a dictionary of samples -> dictionary of quality metrics
-    purrr::map(function(dataset) {
-      segment_name <- dataset$name
-      dataset$data %>%
-        # For this segment, each sample has a value
-        kv_map(function(value, idx) {
-          list(
-            key = samples[[idx]],
-            value = list(value) %>% purrr::set_names(sanitise_column_name(segment_name))
-          )
-        }, map_keys = TRUE)
-    }) %>%
-    purrr::reduce(utils::modifyList) %>%
-    # Then, convert each inner dictionary to a tibble row
-    purrr::map(tibble::as_tibble_row) %>%
-    # And nest each df so that we only have 1 cell of output per sample
-    purrr::map(~tidyr::nest(., {{colname}} := tidyr::everything()))
-}
-
-DEFAULT_PLOT_PARSERS = list(
+DEFAULT_PLOT_PARSERS <- list(
   xy_line = parse_xyline_plot,
   bar_graph = parse_bar_graph
 )
@@ -75,19 +14,18 @@ DEFAULT_PLOT_PARSERS = list(
 #' @noRd
 parse_plots <- function(parsed, plots, plot_parsers) {
   # Merge the default parsers with the user provided ones
-  parsers = purrr::list_modify(DEFAULT_PLOT_PARSERS, !!!plot_parsers)
+  parsers <- purrr::list_modify(DEFAULT_PLOT_PARSERS, !!!plot_parsers)
 
   # Plot data is more complex
   parsed$report_plot_data %>%
     purrr::imap(function(plot_data, plot_name) {
       # Skip any plot not explicitly in this list, it's impossible to infer
       # what type of plot each is
       if (plot_name %in% plots || is.null(plots)) {
-        parser = parsers[[plot_data$plot_type]]
-        if (!is.null(parser)){
+        parser <- parsers[[plot_data$plot_type]]
+        if (!is.null(parser)) {
           parser(plot_data = plot_data, name = plot_name)
-        }
-        else {
+        } else {
           warning(paste("No known (or provided) parser for a plot of type \"", plot_data$plot_type, "\""))
         }
       }
@@ -97,7 +35,7 @@ parse_plots <- function(parsed, plots, plot_parsers) {
 }
 
 #' List the plot identifiers of all the plots in a given multiqc report
-#' 
+#'
 #' @details The main use for this function is finding the plot identifiers
 #' that you will then pass into the `plots` argument of the [TidyMultiqc::load_multiqc()]
 #' function.
@@ -116,10 +54,10 @@ parse_plots <- function(parsed, plots, plot_parsers) {
 #' filepath <- system.file("extdata", "HG00096/multiqc_data.json", package = "TidyMultiqc")
 #' # This is the actual invocation
 #' list_plots(filepath)
-list_plots <- function(path){
+list_plots <- function(path) {
   jsonlite::read_json(path) %>%
     `$`("report_plot_data") %>%
-    purrr::imap_dfr(function(plot, id){
+    purrr::imap_dfr(function(plot, id) {
       list(
         id = id,
         title = plot$config$title
 
@@ -4,4 +4,8 @@ reference:
     desc: The public API to this package
   - contents:
     - load_multiqc
-    - list_plots
+    - list_plots
+  - title: Plot Parsers
+    desc: These are internal functions that are not exported, so you will never need to call them yourself. However, their documentation describes the format of the nested data frames returned for each type of plot.
+  - contents:
+    - has_keyword("plot_parser")
Original file line number	Diff line number	Diff line change
`@@ -34,4 +34,4 @@ sanitise_column_name <- function(name) {`
`34`	`34`	`stringr::str_to_lower()`
`35`	`35`	`}`
`36`	`36`
`37`		`-ROW_IDENTIFIER = "metadata.sample_id"`
	`37`	`+ROW_IDENTIFIER <- "metadata.sample_id"`