Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 886c159

Browse files
authored
Merge pull request #28 from stemangiola/add_scaled_counts
add scale framework
2 parents 1b9407f + a1a8448 commit 886c159

File tree

7 files changed

+157
-22
lines changed

7 files changed

+157
-22
lines changed

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ importFrom(dplyr,filter)
2020
importFrom(dplyr,pull)
2121
importFrom(dplyr,tbl)
2222
importFrom(glue,glue)
23+
importFrom(magrittr,equals)
2324
importFrom(purrr,map)
2425
importFrom(purrr,map_int)
2526
importFrom(purrr,reduce)
27+
importFrom(purrr,when)
2628
importFrom(stringr,str_remove)
2729
importFrom(tidySingleCellExperiment,inner_join)

R/query.R

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#' @param genes An optional character vector of genes to return the counts for. By default counts for all genes will be returned.
77
#'
88
#' @importFrom dplyr pull filter
9-
#' @importFrom tidySingleCellExperiment inner_join
9+
#' @importFrom tidySingleCellExperiment inner_join
1010
#' @importFrom purrr reduce map map_int
1111
#' @importFrom BiocGenerics cbind
1212
#' @importFrom glue glue
@@ -15,13 +15,20 @@
1515
#' @importFrom stringr str_remove
1616
#' @importFrom SingleCellExperiment SingleCellExperiment
1717
#' @importFrom SummarizedExperiment colData assayNames<-
18+
#' @importFrom purrr when
19+
#' @importFrom magrittr equals
1820
#'
1921
#' @export
2022
#'
2123
#'
2224
get_SingleCellExperiment = function(
2325
.data,
24-
repository = "/vast/projects/RCP/human_cell_atlas/splitted_DB2_data",
26+
assay = "counts",
27+
repository = when(
28+
assay,
29+
equals(., "counts") ~ "/vast/projects/RCP/human_cell_atlas/splitted_DB2_data",
30+
equals(., "counts_per_million") ~ "/vast/projects/RCP/human_cell_atlas/splitted_DB2_data_scaled"
31+
),
2532
genes = NULL
2633
){
2734
# We have to convert to an in-memory table here, or some of the dplyr operations will fail when passed a database connection
@@ -41,17 +48,17 @@ get_SingleCellExperiment = function(
4148
files_to_read |>
4249
map(~ {
4350
cat(".")
44-
51+
4552
sce = glue("{repository}/{.x}") |>
4653
loadHDF5SummarizedExperiment()
47-
54+
4855
if (!is.null(genes)){
4956
# Optionally subset the genes
5057
sce = sce[
51-
intersect(genes, rownames(sce))
58+
intersect(genes, rownames(sce))
5259
]
5360
}
54-
61+
5562
sce |>
5663
inner_join(
5764
# Needed because cell IDs are not unique outside the file_id or file_id_db
@@ -71,15 +78,15 @@ get_SingleCellExperiment = function(
7178
sces |>
7279
do.call(cbind, args=_)
7380

74-
# Rename assay
75-
assayNames(sce) = "counts"
81+
# Rename assay THIS WILL NOT BE NEEDED EVENTUALLY
82+
assayNames(sce) = assay
7683

7784
# Return
7885
sce
7986
}
8087

8188
#' @importFrom SeuratObject as.sparse
82-
#' @exportS3Method
89+
#' @exportS3Method
8390
as.sparse.DelayedMatrix = function(x){
8491
# This is glue to ensure the SCE -> Seurat conversion works properly with
8592
# DelayedArray types
@@ -89,7 +96,7 @@ as.sparse.DelayedMatrix = function(x){
8996
#' Given a data frame of HCA metadata, returns a Seurat object corresponding to the samples in that data frame
9097
#'
9198
#' @inheritDotParams get_SingleCellExperiment
92-
#' @importFrom Seurat as.Seurat
99+
#' @importFrom Seurat as.Seurat
93100
#' @export
94101
get_seurat = function(
95102
...

README.Rmd

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ library(dplyr)
1212
library(dbplyr)
1313
library(SingleCellExperiment)
1414
library(tidySingleCellExperiment)
15+
options("restore_SingleCellExperiment_show" = TRUE)
16+
1517
```
1618

1719
Load the data
@@ -29,10 +31,9 @@ get_metadata() |>
2931
arrange(desc(n))
3032
```
3133

32-
Query
34+
Query raw counts
3335

3436
```{r}
35-
options("restore_SingleCellExperiment_show" = TRUE)
3637
sce =
3738
get_metadata() |>
3839
filter(
@@ -47,4 +48,21 @@ sce =
4748
sce
4849
```
4950

51+
Query counts scaled per million. This is helpful if just few genes are of interest
52+
53+
```{r}
54+
sce =
55+
get_metadata() |>
56+
filter(
57+
ethnicity == "African" &
58+
assay %LIKE% "%10x%" &
59+
tissue == "lung parenchyma" &
60+
cell_type %LIKE% "%CD4%"
61+
) |>
62+
63+
get_SingleCellExperiment(assay = "counts_per_million")
64+
65+
sce
66+
```
67+
5068

README.md

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ library(dplyr)
99
library(dbplyr)
1010
library(SingleCellExperiment)
1111
library(tidySingleCellExperiment)
12+
options("restore_SingleCellExperiment_show" = TRUE)
1213
```
1314

1415
Load the data
@@ -17,7 +18,7 @@ Load the data
1718
get_metadata()
1819
```
1920

20-
## # Source: table<metadata> [?? x 48]
21+
## # Source: table<metadata> [?? x 52]
2122
## # Database: sqlite 3.39.2 [/vast/projects/RCP/human_cell_atlas/metadata.sqlite]
2223
## .cell .sample .samp…¹ assay assay…² cell_…³ cell_…⁴ devel…⁵ devel…⁶ disease
2324
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
@@ -31,7 +32,7 @@ get_metadata()
3132
## 8 AAACGG… 5f20d7… D17PrP… 10x … EFO:00… basal … CL:000… 31-yea… HsapDv… normal
3233
## 9 AAACGG… 5f20d7… D17PrP… 10x … EFO:00… lumina… CL:000… 31-yea… HsapDv… normal
3334
## 10 AAACGG… 5f20d7… D17PrP… 10x … EFO:00… basal … CL:000… 31-yea… HsapDv… normal
34-
## # … with more rows, 38 more variables: disease_ontology_term_id <chr>,
35+
## # … with more rows, 42 more variables: disease_ontology_term_id <chr>,
3536
## # ethnicity <chr>, ethnicity_ontology_term_id <chr>, file_id <chr>,
3637
## # is_primary_data.x <chr>, organism <chr>, organism_ontology_term_id <chr>,
3738
## # sample_placeholder <chr>, sex <chr>, sex_ontology_term_id <chr>,
@@ -65,10 +66,9 @@ get_metadata() |>
6566
## 10 thymus 17
6667
## # … with more rows
6768

68-
Query
69+
Query raw counts
6970

7071
``` r
71-
options("restore_SingleCellExperiment_show" = TRUE)
7272
sce =
7373
get_metadata() |>
7474
filter(
@@ -90,15 +90,53 @@ sce
9090
```
9191

9292
## class: SingleCellExperiment
93-
## dim: 28024 1571
93+
## dim: 60661 1571
94+
## metadata(0):
95+
## assays(1): counts
96+
## rownames(60661): TSPAN6 TNMD ... RP11-175I6.6 PRSS43P
97+
## rowData names(0):
98+
## colnames(1571): ACAGCCGGTCCGTTAA_F02526 GGGAATGAGCCCAGCT_F02526 ...
99+
## TACAACGTCAGCATTG_SC84 CATTCGCTCAATACCG_F02526
100+
## colData names(51): .sample .sample_name ... cell_annotation_azimuth_l2
101+
## cell_annotation_blueprint_singler
102+
## reducedDimNames(0):
103+
## mainExpName: NULL
104+
## altExpNames(0):
105+
106+
Query counts scaled per million. This is helpful if just few genes are
107+
of interest
108+
109+
``` r
110+
sce =
111+
get_metadata() |>
112+
filter(
113+
ethnicity == "African" &
114+
assay %LIKE% "%10x%" &
115+
tissue == "lung parenchyma" &
116+
cell_type %LIKE% "%CD4%"
117+
) |>
118+
119+
get_SingleCellExperiment(assay = "counts_per_million")
120+
```
121+
122+
## Reading 1 files.
123+
124+
## .
125+
126+
``` r
127+
sce
128+
```
129+
130+
## class: SingleCellExperiment
131+
## dim: 60661 1571
94132
## metadata(0):
95-
## assays(1): X
96-
## rownames(28024): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
133+
## assays(1): counts_per_million
134+
## rownames(60661): TSPAN6 TNMD ... RP11-175I6.6 PRSS43P
97135
## rowData names(0):
98136
## colnames(1571): ACAGCCGGTCCGTTAA_F02526 GGGAATGAGCCCAGCT_F02526 ...
99137
## TACAACGTCAGCATTG_SC84 CATTCGCTCAATACCG_F02526
100-
## colData names(47): .sample .sample_name ... file_id_db
101-
## stringr..str_remove.stringr..str_remove..cell...sample...._...
138+
## colData names(51): .sample .sample_name ... cell_annotation_azimuth_l2
139+
## cell_annotation_blueprint_singler
102140
## reducedDimNames(0):
103141
## mainExpName: NULL
104142
## altExpNames(0):

dev/DB2_files.R

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,17 @@ transformation =
128128
pull(transformation) |>
129129
unique()
130130

131+
# Ad hoc transformations, not declared as log
132+
if(file_id %in% c(
133+
"1e81a742-e457-4fc6-9c39-c55189ec9dc2",
134+
"b51bfa2d-22b2-4c65-9803-d36d4de973fa",
135+
"4e4bbb2d-f341-4523-a5a0-5407d8b03e0e",
136+
"c48402e4-e7db-4c82-a9e9-51e285e5165c",
137+
"82ad3285-e5d4-46d1-89c0-3acf91a9e33f",
138+
"7addb561-c1bf-4fb5-ad10-16dd65b3643a",
139+
"575513b2-6e53-41e2-85a9-bc08a6233ce4"
140+
)) transformation = "log"
141+
131142
X =
132143
X |>
133144
when(

dev/scale_files.R

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
library(zellkonverter)
2+
library(Seurat)
3+
library(SingleCellExperiment) # load early to avoid masking dplyr::count()
4+
library(tidySingleCellExperiment)
5+
library(dplyr)
6+
library(cellxgenedp)
7+
library(tidyverse)
8+
#library(tidySingleCellExperiment)
9+
library(stringr)
10+
library(scMerge)
11+
library(glue)
12+
library(DelayedArray)
13+
library(HDF5Array)
14+
library(tidyseurat)
15+
library(celldex)
16+
library(SingleR)
17+
library(glmGamPoi)
18+
19+
20+
# # # CREATE MAKEFILE
21+
# tab = "\t"
22+
# root_directory = "/vast/projects/RCP/human_cell_atlas"
23+
# split_data_directory = glue("{root_directory}/splitted_DB2_data")
24+
# scaled_data_directory = glue("{root_directory}/splitted_DB2_data_scaled")
25+
#
26+
# dir(split_data_directory) |>
27+
# map( ~ glue("{scaled_data_directory}/{.x}:{split_data_directory}/{.x}\n{tab}Rscript scale_files.R {split_data_directory}/{.x} {scaled_data_directory}/{.x}")
28+
# ) |>
29+
# prepend(glue("CATEGORY=scale_data\nMEMORY=20000\nCORES=1\nWALL_TIME=30000")) |>
30+
# unlist() |>
31+
# write_lines(glue("dev/scale_files.makeflow"))
32+
33+
34+
35+
36+
37+
# Read arguments
38+
args = commandArgs(trailingOnly=TRUE)
39+
input_file = args[[1]]
40+
output_file = args[[2]]
41+
42+
# Create directory
43+
output_file |> dirname() |> dir.create( showWarnings = FALSE, recursive = TRUE)
44+
45+
# Read file_cell_types
46+
data = loadHDF5SummarizedExperiment(input_file )
47+
48+
sce = SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data, assay.type = "X")))
49+
rownames(sce) = rownames(data)
50+
colnames(sce) = colnames(data)
51+
52+
rm(data)
53+
gc()
54+
55+
sce |> saveHDF5SummarizedExperiment(output_file, replace=TRUE)

man/get_SingleCellExperiment.Rd

Lines changed: 5 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)