Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 57cdd4f

Browse files
committed
Tidy up get_SingleCell, add genes argument, add tests
1 parent 6bfd0b5 commit 57cdd4f

File tree

4 files changed

+56
-42
lines changed

4 files changed

+56
-42
lines changed

NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,15 @@ importFrom(HDF5Array,loadHDF5SummarizedExperiment)
99
importFrom(RSQLite,SQLITE_RO)
1010
importFrom(RSQLite,SQLite)
1111
importFrom(SingleCellExperiment,SingleCellExperiment)
12+
importFrom(SummarizedExperiment,"assayNames<-")
1213
importFrom(SummarizedExperiment,colData)
1314
importFrom(dplyr,as_tibble)
15+
importFrom(dplyr,filter)
1416
importFrom(dplyr,pull)
1517
importFrom(dplyr,tbl)
1618
importFrom(glue,glue)
1719
importFrom(purrr,map)
20+
importFrom(purrr,map_int)
1821
importFrom(purrr,reduce)
1922
importFrom(stringr,str_remove)
2023
importFrom(tidySingleCellExperiment,inner_join)

R/query.R

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,27 @@
33
#' @param .data A data frame containing, at minimum, a `.sample` column, which corresponds to a single cell sample ID.
44
#' This can be obtained from the [get_metadata()] function.
55
#' @param repository A character vector of length one, giving the path to the directory where the single cell data is stored
6+
#' @param genes An optional character vector of genes to return the counts for. By default counts for all genes will be returned.
67
#'
7-
#' @importFrom dplyr pull
8-
#' @importFrom tidySingleCellExperiment inner_join
9-
#' @importFrom purrr reduce
10-
#' @importFrom purrr map
8+
#' @importFrom dplyr pull filter
9+
#' @importFrom tidySingleCellExperiment inner_join
10+
#' @importFrom purrr reduce map map_int
1111
#' @importFrom BiocGenerics cbind
1212
#' @importFrom glue glue
1313
#' @importFrom dplyr as_tibble
14-
#' @importFrom HDF5Array loadHDF5SummarizedExperiment
14+
#' @importFrom HDF5Array loadHDF5SummarizedExperiment HDF5RealizationSink
1515
#' @importFrom stringr str_remove
16-
#' @importFrom HDF5Array HDF5RealizationSink
17-
#' @importFrom HDF5Array loadHDF5SummarizedExperiment
1816
#' @importFrom SingleCellExperiment SingleCellExperiment
19-
#' @importFrom SummarizedExperiment colData
17+
#' @importFrom SummarizedExperiment colData assayNames<-
2018
#'
2119
#' @export
2220
#'
2321
#'
24-
get_SingleCellExperiment = function(.data, repository = "/vast/projects/RCP/human_cell_atlas/splitted_DB2_data"){
25-
22+
get_SingleCellExperiment = function(
23+
.data,
24+
repository = "/vast/projects/RCP/human_cell_atlas/splitted_DB2_data",
25+
genes = NULL
26+
){
2627
# We have to convert to an in-memory table here, or some of the dplyr operations will fail when passed a database connection
2728
raw_data = as_tibble(.data)
2829

@@ -32,47 +33,49 @@ get_SingleCellExperiment = function(.data, repository = "/vast/projects/RCP/huma
3233
unique() |>
3334
as.character()
3435

35-
message(glue("Reading {length(files_to_read)} files."))
36+
glue("Reading {length(files_to_read)} files.") |>
37+
message()
3638

37-
sce =
39+
# Load each file
40+
sces =
3841
files_to_read |>
3942
map(~ {
4043
cat(".")
41-
loadHDF5SummarizedExperiment(glue("{repository}/{.x}") ) |>
42-
inner_join(
43-
raw_data |>
44-
45-
# Needed because cell IDs are not unique outside the file_id or file_id_db
46-
filter(file_id_db == .x),
47-
by=".cell"
48-
)
49-
})
50-
51-
# Harmonise genes
52-
all_genes =
53-
sce |>
54-
map(rownames) |>
55-
unlist() |>
56-
unique()
44+
45+
sce = glue("{repository}/{.x}") |>
46+
loadHDF5SummarizedExperiment()
47+
48+
if (!is.null(genes)){
49+
# Optionally subset the genes
50+
sce = sce[
51+
intersect(genes, rownames(sce))
52+
]
53+
}
54+
55+
sce |>
56+
inner_join(
57+
# Needed because cell IDs are not unique outside the file_id or file_id_db
58+
filter(raw_data, file_id_db == .x),
59+
by=".cell"
60+
)
61+
})
5762

5863
# Drop files with one cell, which would otherwise cause the error
5964
# "the DFrame objects to combine must have the same column names"
60-
sce = sce[map_int(sce, ncol)>1]
65+
sces = sces[map_int(sces, ncol)>1]
6166

6267
cat("\n")
6368

64-
6569
# Combine
6670
sce =
67-
sce |>
71+
sces |>
6872
do.call(cbind, args=_)
6973

7074
# Rename assay
71-
names(sce@assays@data) = "counts"
75+
assayNames(sce) = "counts"
7276

7377
# Return
7478
sce
75-
7679
}
7780

7881

@@ -85,12 +88,10 @@ get_SingleCellExperiment = function(.data, repository = "/vast/projects/RCP/huma
8588
#' @export
8689
#'
8790
#' @importFrom DBI dbConnect
88-
#' @importFrom RSQLite SQLite
89-
#' @importFrom RSQLite SQLITE_RO
91+
#' @importFrom RSQLite SQLite SQLITE_RO
9092
#' @importFrom dplyr tbl
9193
#'
9294
get_metadata = function(sqlite_path = "/vast/projects/RCP/human_cell_atlas/metadata.sqlite"){
93-
9495
SQLite() |>
9596
dbConnect(drv=_, dbname=sqlite_path, flags=SQLITE_RO) |>
9697
tbl("metadata")

man/get_SingleCellExperiment.Rd

Lines changed: 4 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-query.R

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
1-
context("test query")
2-
31
library(HCAquery)
42

5-
6-
test_that("dummy", {
7-
0 |> expect_equal(0)
3+
test_that("The genes argument to get_SingleCellExperiment subsets genes", {
4+
meta = get_metadata() |> head(2)
5+
6+
# The un-subset dataset should have many genes
7+
sce_full = get_SingleCellExperiment(meta) |> row.names() |> length()
8+
expect_gt(sce_full, 1)
9+
10+
# The subset dataset should only have one gene
11+
sce_subset = get_SingleCellExperiment(meta, genes = "PUM1") |> row.names() |> length()
12+
expect_equal(sce_subset, 1)
13+
14+
expect_gt(sce_full, sce_subset)
815
})

0 commit comments

Comments
 (0)