@@ -199,45 +199,49 @@ downsample_metadata <- function(output = "sample_meta.parquet"){
199199 summarise(n = dplyr :: n()) | >
200200 dplyr :: collect()
201201
202- # For each of the 3 examples, we select the minimal file_id_db that will
203- # satisfy the corresponding filters
204- example_a_all <- metadata | >
205- dplyr :: filter(
206- .data $ ethnicity == " African" &
202+ # This is a table of all the datasets we need for the vignette. The
203+ # "dataset" column is how many datasets we need from that filter
204+ # filters <- tibble::tibble(
205+ # filter = rlang::exprs(
206+ # .data$ethnicity == "African" &
207+ # stringr::str_like(.data$assay, "%10x%") &
208+ # .data$tissue == "lung parenchyma" &
209+ # stringr::str_like(.data$cell_type, "%CD4%"),
210+ # .data$cell_type_harmonised == "nk",
211+ # .data$cell_type_harmonised == "cd14 mono",
212+ # .data$tissue == "kidney blood vessel"
213+ # ),
214+ # dataset = c(
215+ # 4,
216+ # 1,
217+ # 1,
218+ # 1
219+ # )
220+ # )
221+
222+ # Find a minimal set of file_id_dbs we need
223+ minimal_file_ids <- rlang :: exprs(
224+ .data $ ethnicity == " African" &
207225 stringr :: str_like(.data $ assay , " %10x%" ) &
208226 .data $ tissue == " lung parenchyma" &
209- stringr :: str_like(.data $ cell_type , " %CD4%" )
210- ) | >
211- dplyr :: pull(.data $ file_id_db )
212- example_a_minimal <- dataset_sizes | >
213- dplyr :: filter(.data $ file_id_db %in% example_a_all ) | >
214- dplyr :: slice_head(n = 5 ) | >
215- dplyr :: pull(.data $ file_id_db )
216-
217- example_b_all <- metadata | >
218- dplyr :: filter(.data $ cell_type_harmonised == " cd14 mono" ) | >
219- dplyr :: pull(.data $ file_id_db )
220- example_b_minimal <- dataset_sizes | >
221- dplyr :: filter(.data $ file_id_db %in% example_b_all ) | >
222- dplyr :: slice_head(n = 1 ) | >
223- dplyr :: pull(.data $ file_id_db )
224-
225- example_c_all <- metadata | >
226- dplyr :: filter(.data $ cell_type_harmonised == " nk" ) | >
227- dplyr :: pull(.data $ file_id_db )
228- example_c_minimal <- dataset_sizes | >
229- dplyr :: filter(.data $ file_id_db %in% example_c_all ) | >
230- dplyr :: slice_head(n = 1 ) | >
231- dplyr :: pull(.data $ file_id_db )
232-
233- # The final dataset is the union of all the selected file IDs
234- minimal_file_ids <- union(
235- example_a_minimal ,
236- example_b_minimal
227+ stringr :: str_like(.data $ cell_type , " %CD4%" ),
228+ .data $ cell_type_harmonised == " nk" ,
229+ .data $ cell_type_harmonised == " cd14 mono" ,
230+ .data $ tissue == " kidney blood vessel"
237231 ) | >
238- union(example_c_minimal )
239-
240- metadata | >
232+ purrr :: map(function (filter ){
233+ all_ids <- metadata | >
234+ dplyr :: filter(!! filter ) | >
235+ dplyr :: pull(.data $ file_id_db )
236+
237+ dataset_sizes | >
238+ dplyr :: filter(.data $ file_id_db %in% all_ids ) | >
239+ dplyr :: slice_min(n = 1 , order_by = .data $ n , with_ties = FALSE ) | >
240+ dplyr :: pull(.data $ file_id_db )
241+ }) | >
242+ purrr :: reduce(union )
243+
244+ metadata | >
241245 dplyr :: filter(.data $ file_id_db %in% minimal_file_ids ) | >
242246 dplyr :: arrange(.data $ file_id_db , .data $ sample_ ) | >
243247 dplyr :: collect() | >
0 commit comments