Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 73a00ab

Browse files
committed
Include another vignette in downsample, and reduce size further
1 parent b6a1f1e commit 73a00ab

File tree

2 files changed

+43
-39
lines changed

2 files changed

+43
-39
lines changed

R/dev.R

Lines changed: 40 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -199,45 +199,49 @@ downsample_metadata <- function(output = "sample_meta.parquet"){
199199
summarise(n = dplyr::n()) |>
200200
dplyr::collect()
201201

202-
# For each of the 3 examples, we select the minimal file_id_db that will
203-
# satisfy the corresponding filters
204-
example_a_all <- metadata |>
205-
dplyr::filter(
206-
.data$ethnicity == "African" &
202+
# This is a table of all the datasets we need for the vignette. The
203+
# "dataset" column is how many datasets we need from that filter
204+
# filters <- tibble::tibble(
205+
# filter = rlang::exprs(
206+
# .data$ethnicity == "African" &
207+
# stringr::str_like(.data$assay, "%10x%") &
208+
# .data$tissue == "lung parenchyma" &
209+
# stringr::str_like(.data$cell_type, "%CD4%"),
210+
# .data$cell_type_harmonised == "nk",
211+
# .data$cell_type_harmonised == "cd14 mono",
212+
# .data$tissue == "kidney blood vessel"
213+
# ),
214+
# dataset = c(
215+
# 4,
216+
# 1,
217+
# 1,
218+
# 1
219+
# )
220+
# )
221+
222+
# Find a minimal set of file_id_dbs we need
223+
minimal_file_ids <- rlang::exprs(
224+
.data$ethnicity == "African" &
207225
stringr::str_like(.data$assay, "%10x%") &
208226
.data$tissue == "lung parenchyma" &
209-
stringr::str_like(.data$cell_type, "%CD4%")
210-
) |>
211-
dplyr::pull(.data$file_id_db)
212-
example_a_minimal <- dataset_sizes |>
213-
dplyr::filter(.data$file_id_db %in% example_a_all) |>
214-
dplyr::slice_head(n=5) |>
215-
dplyr::pull(.data$file_id_db)
216-
217-
example_b_all <- metadata |>
218-
dplyr::filter(.data$cell_type_harmonised == "cd14 mono") |>
219-
dplyr::pull(.data$file_id_db)
220-
example_b_minimal <- dataset_sizes |>
221-
dplyr::filter(.data$file_id_db %in% example_b_all) |>
222-
dplyr::slice_head(n=1) |>
223-
dplyr::pull(.data$file_id_db)
224-
225-
example_c_all <- metadata |>
226-
dplyr::filter(.data$cell_type_harmonised == "nk") |>
227-
dplyr::pull(.data$file_id_db)
228-
example_c_minimal <- dataset_sizes |>
229-
dplyr::filter(.data$file_id_db %in% example_c_all) |>
230-
dplyr::slice_head(n=1) |>
231-
dplyr::pull(.data$file_id_db)
232-
233-
# The final dataset is the union of all the selected file IDs
234-
minimal_file_ids <- union(
235-
example_a_minimal,
236-
example_b_minimal
227+
stringr::str_like(.data$cell_type, "%CD4%"),
228+
.data$cell_type_harmonised == "nk",
229+
.data$cell_type_harmonised == "cd14 mono",
230+
.data$tissue == "kidney blood vessel"
237231
) |>
238-
union(example_c_minimal)
239-
240-
metadata |>
232+
purrr::map(function(filter){
233+
all_ids <- metadata |>
234+
dplyr::filter(!!filter) |>
235+
dplyr::pull(.data$file_id_db)
236+
237+
dataset_sizes |>
238+
dplyr::filter(.data$file_id_db %in% all_ids) |>
239+
dplyr::slice_min(n=1, order_by = .data$n, with_ties = FALSE) |>
240+
dplyr::pull(.data$file_id_db)
241+
}) |>
242+
purrr::reduce(union)
243+
244+
metadata |>
241245
dplyr::filter(.data$file_id_db %in% minimal_file_ids) |>
242246
dplyr::arrange(.data$file_id_db, .data$sample_) |>
243247
dplyr::collect() |>

vignettes/Introduction.Rmd

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ knit: >
1414
x,
1515
output_format = "html_document",
1616
params = list(
17-
metadata_url = CuratedAtlasQueryR::SAMPLE_DATABASE_URL
17+
demo_metadata = TRUE
1818
)
1919
)
2020
rmarkdown::render(
@@ -24,7 +24,7 @@ knit: >
2424
output_dir = proj_root,
2525
knit_root_dir = proj_root,
2626
params = list(
27-
metadata_url = CuratedAtlasQueryR::DATABASE_URL
27+
demo_metadata = FALSE
2828
)
2929
)
3030
})
@@ -310,7 +310,7 @@ returns a data frame with one row per dataset, including the `unharmonised`
310310
column which contains unharmonised metadata as a nested data frame.
311311

312312
```{r}
313-
harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
313+
harmonised <- metadata |> dplyr::filter(tissue == "kidney blood vessel")
314314
unharmonised <- get_unharmonised_metadata(harmonised)
315315
unharmonised
316316
```

0 commit comments

Comments
 (0)