33# ' @param .data A data frame containing, at minimum, a `.sample` column, which corresponds to a single cell sample ID.
44# ' This can be obtained from the [get_metadata()] function.
55# ' @param repository A character vector of length one, which is a file path to where the single cell data is stored
6+ # ' @param genes An optional character vector of genes to return the counts for. By default counts for all genes will be returned.
67# '
7- # ' @importFrom dplyr pull
8- # ' @importFrom tidySingleCellExperiment inner_join
9- # ' @importFrom purrr reduce
10- # ' @importFrom purrr map
8+ # ' @importFrom dplyr pull filter
9+ # ' @importFrom tidySingleCellExperiment inner_join
10+ # ' @importFrom purrr reduce map map_int
1111# ' @importFrom BiocGenerics cbind
1212# ' @importFrom glue glue
1313# ' @importFrom dplyr as_tibble
14- # ' @importFrom HDF5Array loadHDF5SummarizedExperiment
14+ # ' @importFrom HDF5Array loadHDF5SummarizedExperiment HDF5RealizationSink
1515# ' @importFrom stringr str_remove
16- # ' @importFrom HDF5Array HDF5RealizationSink
17- # ' @importFrom HDF5Array loadHDF5SummarizedExperiment
1816# ' @importFrom SingleCellExperiment SingleCellExperiment
19- # ' @importFrom SummarizedExperiment colData
17+ # ' @importFrom SummarizedExperiment colData assayNames<-
2018# '
2119# ' @export
2220# '
2321# '
24- get_SingleCellExperiment = function (.data , repository = " /vast/projects/RCP/human_cell_atlas/splitted_DB2_data" ){
25-
22+ get_SingleCellExperiment = function (
23+ .data ,
24+ repository = " /vast/projects/RCP/human_cell_atlas/splitted_DB2_data" ,
25+ genes = NULL
26+ ){
2627 # We have to convert to an in-memory table here, or some of the dplyr operations will fail when passed a database connection
2728 raw_data = as_tibble(.data )
2829
@@ -32,47 +33,49 @@ get_SingleCellExperiment = function(.data, repository = "/vast/projects/RCP/huma
3233 unique() | >
3334 as.character()
3435
35- message(glue(" Reading {length(files_to_read)} files." ))
36+ glue(" Reading {length(files_to_read)} files." ) | >
37+ message()
3638
37- sce =
39+ # Load each file
40+ sces =
3841 files_to_read | >
3942 map(~ {
4043 cat(" ." )
41- loadHDF5SummarizedExperiment(glue(" {repository}/{.x}" ) ) | >
42- inner_join(
43- raw_data | >
44-
45- # Needed because cell IDs are not unique outside the file_id or file_id_db
46- filter(file_id_db == .x ),
47- by = " .cell"
48- )
49- })
50-
51- # Harmonise genes
52- all_genes =
53- sce | >
54- map(rownames ) | >
55- unlist() | >
56- unique()
44+
45+ sce = glue(" {repository}/{.x}" ) | >
46+ loadHDF5SummarizedExperiment()
47+
48+ if (! is.null(genes )){
49+ # Optionally subset the genes
50+ sce = sce [
51+ intersect(genes , rownames(sce ))
52+ ]
53+ }
54+
55+ sce | >
56+ inner_join(
57+ # Needed because cell IDs are not unique outside the file_id or file_id_db
58+ filter(raw_data , file_id_db == .x ),
59+ by = " .cell"
60+ )
61+ })
5762
5863 # Drop files that contribute at most one cell, because combining them fails with the error
5964 # "the DFrame objects to combine must have the same column names"
60- sce = sce [map_int(sce , ncol )> 1 ]
65+ sces = sces [map_int(sces , ncol )> 1 ]
6166
6267 cat(" \n " )
6368
64-
6569 # Combine
6670 sce =
67- sce | >
71+ sces | >
6872 do.call(cbind , args = _)
6973
7074 # Rename assay
71- names (sce @ assays @ data ) = " counts"
75+ assayNames (sce ) = " counts"
7276
7377 # Return
7478 sce
75-
7679}
7780
7881
@@ -85,12 +88,10 @@ get_SingleCellExperiment = function(.data, repository = "/vast/projects/RCP/huma
8588# ' @export
8689# '
8790# ' @importFrom DBI dbConnect
88- # ' @importFrom RSQLite SQLite
89- # ' @importFrom RSQLite SQLITE_RO
91+ # ' @importFrom RSQLite SQLite SQLITE_RO
9092# ' @importFrom dplyr tbl
9193# '
9294get_metadata = function (sqlite_path = " /vast/projects/RCP/human_cell_atlas/metadata.sqlite" ){
93-
9495 SQLite() | >
9596 dbConnect(drv = _, dbname = sqlite_path , flags = SQLITE_RO ) | >
9697 tbl(" metadata" )
0 commit comments