Include another vignette in downsample, and reduce size further

multimeric · multimeric · commit 73a00ab625b3 · 2023-04-20T14:08:43.000+10:00
diff --git a/R/dev.R b/R/dev.R
@@ -199,45 +199,49 @@ downsample_metadata <- function(output = "sample_meta.parquet"){
         summarise(n = dplyr::n()) |> 
         dplyr::collect()
     
-    # For each of the 3 examples, we select the minimal file_id_db that will
-    # satisfy the corresponding filters
-    example_a_all <- metadata |> 
-        dplyr::filter(
-            .data$ethnicity == "African" &
+    # This is a table of all the datasets we need for the vignette. The
+    # # "datasets" column is how many datasets we need from that filter
+    # filters <- tibble::tibble(
+    #     filter = rlang::exprs(
+    #         .data$ethnicity == "African" &
+    #         stringr::str_like(.data$assay, "%10x%") &
+    #         .data$tissue == "lung parenchyma" &
+    #         stringr::str_like(.data$cell_type, "%CD4%"),
+    #         .data$cell_type_harmonised == "nk",
+    #         .data$cell_type_harmonised == "cd14 mono",
+    #         .data$tissue == "kidney blood vessel"
+    #     ),
+    #     dataset = c(
+    #         4,
+    #         1,
+    #         1,
+    #         1
+    #     )
+    # )
+    
+    # Find a minimal set of file_id_dbs we need
+    minimal_file_ids <- rlang::exprs(
+        .data$ethnicity == "African" &
             stringr::str_like(.data$assay, "%10x%") &
             .data$tissue == "lung parenchyma" &
-            stringr::str_like(.data$cell_type, "%CD4%")
-        ) |>
-        dplyr::pull(.data$file_id_db)
-    example_a_minimal <- dataset_sizes |>
-        dplyr::filter(.data$file_id_db %in% example_a_all) |>
-        dplyr::slice_head(n=5) |>
-        dplyr::pull(.data$file_id_db)
-    
-    example_b_all <- metadata |> 
-        dplyr::filter(.data$cell_type_harmonised == "cd14 mono") |>
-        dplyr::pull(.data$file_id_db)
-    example_b_minimal <- dataset_sizes |>
-        dplyr::filter(.data$file_id_db %in% example_b_all) |>
-        dplyr::slice_head(n=1) |>
-        dplyr::pull(.data$file_id_db)
-    
-    example_c_all <- metadata |> 
-        dplyr::filter(.data$cell_type_harmonised == "nk") |>
-        dplyr::pull(.data$file_id_db)
-    example_c_minimal <- dataset_sizes |>
-        dplyr::filter(.data$file_id_db %in% example_c_all) |>
-        dplyr::slice_head(n=1) |>
-        dplyr::pull(.data$file_id_db)
-    
-    # The final dataset is the union of all the selected file IDs
-    minimal_file_ids <- union(
-        example_a_minimal,
-        example_b_minimal
+            stringr::str_like(.data$cell_type, "%CD4%"),
+        .data$cell_type_harmonised == "nk",
+        .data$cell_type_harmonised == "cd14 mono",
+        .data$tissue == "kidney blood vessel"
     ) |>
-        union(example_c_minimal)
-
-    metadata |>
+        purrr::map(function(filter){
+            all_ids <- metadata |> 
+                dplyr::filter(!!filter) |>
+                dplyr::pull(.data$file_id_db)
+            
+            dataset_sizes |>
+                dplyr::filter(.data$file_id_db %in% all_ids) |>
+                dplyr::slice_min(n=1, order_by = .data$n, with_ties = FALSE) |>
+                dplyr::pull(.data$file_id_db)
+        }) |>
+        purrr::reduce(union)
+    
+    metadata |>    
         dplyr::filter(.data$file_id_db %in% minimal_file_ids) |>
         dplyr::arrange(.data$file_id_db, .data$sample_) |>
         dplyr::collect() |>
diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd
@@ -14,7 +14,7 @@ knit: >
             x,
             output_format = "html_document",
             params = list(
-                metadata_url = CuratedAtlasQueryR::SAMPLE_DATABASE_URL
+                demo_metadata = TRUE
             )
         )
         rmarkdown::render(
@@ -24,7 +24,7 @@ knit: >
             output_dir = proj_root,
             knit_root_dir = proj_root,
             params = list(
-                metadata_url = CuratedAtlasQueryR::DATABASE_URL
+                demo_metadata = FALSE
             )
         )
     })
@@ -310,7 +310,7 @@ returns a data frame with one row per dataset, including the `unharmonised`
 column which contains unharmonised metadata as a nested data frame.
 
 ```{r}
-harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
+harmonised <- metadata |> dplyr::filter(tissue == "kidney blood vessel")
 unharmonised <- get_unharmonised_metadata(harmonised)
 unharmonised
 ```

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ knit: >`
`14`	`14`	`x,`
`15`	`15`	`output_format = "html_document",`
`16`	`16`	`params = list(`
`17`		`- metadata_url = CuratedAtlasQueryR::SAMPLE_DATABASE_URL`
	`17`	`+ demo_metadata = TRUE`
`18`	`18`	`)`
`19`	`19`	`)`
`20`	`20`	`rmarkdown::render(`
`@@ -24,7 +24,7 @@ knit: >`
`24`	`24`	`output_dir = proj_root,`
`25`	`25`	`knit_root_dir = proj_root,`
`26`	`26`	`params = list(`
`27`		`- metadata_url = CuratedAtlasQueryR::DATABASE_URL`
	`27`	`+ demo_metadata = FALSE`
`28`	`28`	`)`
`29`	`29`	`)`
`30`	`30`	`})`
@@ -310,7 +310,7 @@ returns a data frame with one row per dataset, including the `unharmonised`
`310`	`310`	`column which contains unharmonised metadata as a nested data frame.`
`311`	`311`
`312`	`312`	```{r}
`313`		`-harmonised <- get_metadata() \|> dplyr::filter(tissue == "kidney blood vessel")`
	`313`	`+harmonised <- metadata \|> dplyr::filter(tissue == "kidney blood vessel")`
`314`	`314`	`unharmonised <- get_unharmonised_metadata(harmonised)`
`315`	`315`	`unharmonised`
`316`	`316`	```