Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 2fed3a7

Browse files
authored
Merge pull request #152 from myushen/master
Add prostate atlas
2 parents edd72cf + dbc4bc8 commit 2fed3a7

File tree

7 files changed

+461
-6
lines changed

7 files changed

+461
-6
lines changed

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: CuratedAtlasQueryR
33
Title: Queries the Human Cell Atlas
4-
Version: 1.3.7
4+
Version: 1.4.7
55
Authors@R: c(
66
person(
77
"Stefano",
@@ -126,7 +126,7 @@ biocViews:
126126
Transcription,
127127
Transcriptomics
128128
Encoding: UTF-8
129-
RoxygenNote: 7.3.1
129+
RoxygenNote: 7.3.2
130130
LazyDataCompression: xz
131131
URL: https://github.com/stemangiola/CuratedAtlasQueryR
132132
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues

R/metadata.R

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ cache <- rlang::env(
1010
)
1111

1212
#' Returns the URLs for all metadata files
13-
#' @param databases A character vector specifying the names of the metadata files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
13+
#' @param databases A character vector specifying the names of the metadata files.
14+
#' Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", "prostate.0.1.0.parquet")
1415
#' @export
1516
#' @return A character vector of URLs to parquet files to download
1617
#' @examples
@@ -20,7 +21,8 @@ cache <- rlang::env(
2021
#' immune system across age, sex and ethnicity." bioRxiv (2023): 2023-06.
2122
#' doi:10.1101/2023.06.08.542671.
2223
#' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3)
23-
get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
24+
get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet",
25+
"prostate.0.1.0.parquet")) {
2426
glue::glue(
2527
"https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}")
2628
}

dev/census_accepted_assays.csv

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
id,assay
2+
EFO:0003755,FL-cDNA
3+
EFO:0008640,3'T-fill
4+
EFO:0008641,3’-end-seq
5+
EFO:0008643,3′-Seq
6+
EFO:0008661,Bru-Seq
7+
EFO:0008669,CAGEscan
8+
EFO:0008673,CapSeq
9+
EFO:0008675,CaptureSeq
10+
EFO:0008679,CEL-seq
11+
EFO:0008694,ClickSeq
12+
EFO:0008697,cP-RNA-Seq
13+
EFO:0008708,DeepCAGE
14+
EFO:0008710,Digital RNA
15+
EFO:0008718,DP-Seq
16+
EFO:0008720,DroNc-seq
17+
EFO:0008722,Drop-seq
18+
EFO:0008735,FACS-seq
19+
EFO:0008747,FRISCR
20+
EFO:0008748,FRT-Seq
21+
EFO:0008752,GMUCT 1.0
22+
EFO:0008753,GMUCT 2.0
23+
EFO:0008756,GRO-CAP
24+
EFO:0008763,Hi-SCL
25+
EFO:0008780,inDrop
26+
EFO:0008796,MARS-seq
27+
EFO:0008797,MATQ-seq
28+
EFO:0008824,NanoCAGE
29+
EFO:0008825,Nanogrid RNA-Seq
30+
EFO:0008826,NET-Seq
31+
EFO:0008850,PAS-Seq
32+
EFO:0008859,PEAT
33+
EFO:0008863,PLATE-Seq
34+
EFO:0008868,PRO-cap
35+
EFO:0008869,PRO-seq
36+
EFO:0008877,Quartz-seq
37+
EFO:0008896,RNA-Seq
38+
EFO:0008897,RNAtag-Seq
39+
EFO:0008898,RNET-seq
40+
EFO:0008903,SC3-seq
41+
EFO:0008908,SCI-seq
42+
EFO:0008919,Seq-Well
43+
EFO:0008929,SMA
44+
EFO:0008930,Smart-seq
45+
EFO:0008931,Smart-seq2
46+
EFO:0008937,snDrop-seq
47+
EFO:0008941,sNuc-Seq
48+
EFO:0008945,SPET-seq
49+
EFO:0008953,STRT-seq
50+
EFO:0008954,STRT-seq-2i
51+
EFO:0008956,SUPeR-seq
52+
EFO:0008962,TARDIS
53+
EFO:0008966,TCR Chain Paring
54+
EFO:0008967,TCR-LA-MC PCR
55+
EFO:0008972,TL-seq
56+
EFO:0008974,Tomo-Seq
57+
EFO:0008975,TRAP-Seq
58+
EFO:0008978,TSS Sequencing
59+
EFO:0008980,UMI Method
60+
EFO:0009309,Div-Seq
61+
EFO:0009899,10x 3' v2
62+
EFO:0009900,10x 5' v2
63+
EFO:0009901,10x 3' v1
64+
EFO:0009919,SPLiT-seq
65+
EFO:0009922,10x 3' v3
66+
EFO:0009991,Nuc-Seq
67+
EFO:0010003,RASL-seq
68+
EFO:0010004,SCRB-seq
69+
EFO:0010010,CEL-seq2
70+
EFO:0010022,Smart-3Seq
71+
EFO:0010034,Cappable-Seq
72+
EFO:0010041,Nascent-Seq
73+
EFO:0010058,Fluidigm C1-based library preparation
74+
EFO:0010184,Smart-like
75+
EFO:0010550,sci-RNA-seq
76+
EFO:0010713,10x immune profiling
77+
EFO:0010714,10x TCR enrichment
78+
EFO:0010715,10x Ig enrichment
79+
EFO:0010964,barcoded plate-based single cell RNA-seq
80+
EFO:0011025,10x 5' v1
81+
EFO:0022396,TruSeq
82+
EFO:0022488,Smart-seq3
83+
EFO:0022490,ScaleBio single cell RNA sequencing
84+
EFO:0030002,microwell-seq
85+
EFO:0030003,10x 3' transcription profiling
86+
EFO:0030004,10x 5' transcription profiling
87+
EFO:0030019,Seq-Well S3
88+
EFO:0030021,Nx1-seq
89+
EFO:0030028,sci-RNA-seq3
90+
EFO:0030030,Quant-seq
91+
EFO:0030031,SCOPE-chip
92+
EFO:0030061,mcSCRB-seq
93+
EFO:0030074,SORT-seq
94+
EFO:0030078,droplet-based single-cell RNA library preparation
95+
EFO:0700003,BD Rhapsody Whole Transcriptome Analysis
96+
EFO:0700004,BD Rhapsody Targeted mRNA
97+
EFO:0700010,TruDrop
98+
EFO:0700011,GEXSCOPE technology
99+
EFO:0700016,Smart-seq v4
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
library(glue)
2+
library(dplyr)
3+
library(arrow)
4+
library(SingleCellExperiment)
5+
library(tibble)
6+
library(SummarizedExperiment)
7+
library(glue)
8+
library(purrr)
9+
library(zellkonverter)
10+
library(tidyr)
11+
library(ggplot2)
12+
library(plotly)
13+
library(targets)
14+
library(stringr)
15+
library(CuratedAtlasQueryR)
16+
library(fs)
17+
library(HPCell)
18+
library(crew.cluster)
19+
directory = "/home/users/allstaff/shen.m/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
20+
sample_anndata <- dir(glue("{directory}"), full.names = T)
21+
downloaded_samples_tbl <- read_parquet("/home/users/allstaff/shen.m/scratch/Census_rerun/census_samples_to_download_groups.parquet")
22+
downloaded_samples_tbl <- downloaded_samples_tbl |>
23+
rename(cell_number = list_length) |>
24+
mutate(cell_number = cell_number |> as.integer(),
25+
file_name = glue("{directory}{sample_2}.h5ad") |> as.character(),
26+
tier = case_when(
27+
cell_number < 500 ~ "tier_1", cell_number >= 500 &
28+
cell_number < 1000 ~ "tier_2", cell_number >= 1000 &
29+
cell_number < 10000 ~ "tier_3", cell_number >= 10000 ~ "tier_4"
30+
))
31+
32+
result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024"
33+
sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
34+
sample_tbl = downloaded_samples_tbl |> left_join(get_metadata() |> select(dataset_id, contains("norm")) |>
35+
distinct() |> filter(!is.na(x_normalization)) |>
36+
as_tibble(), by = "dataset_id")
37+
38+
39+
sample_tbl <- sample_tbl |> left_join(sample_meta, by = "dataset_id") |> distinct(file_name, tier, cell_number, dataset_id, sample_2,
40+
x_normalization, x_approximate_distribution) |>
41+
mutate(transform_method = case_when(str_like(x_normalization, "C%") ~ "log",
42+
x_normalization == "none" ~ "log",
43+
x_normalization == "normalized" ~ "log",
44+
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "log",
45+
is.na(x_normalization) & x_approximate_distribution == "NORMAL" ~ "NORMAL",
46+
is.na(x_normalization) & x_approximate_distribution == "COUNT" ~ "COUNT",
47+
str_like(x_normalization, "%canpy%") ~ "log1p",
48+
TRUE ~ x_normalization)) |>
49+
50+
mutate(method_to_apply = case_when(transform_method %in% c("log","LogNormalization","LogNormalize","log-normalization") ~ "exp",
51+
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "exp",
52+
str_like(transform_method, "Counts%") ~ "exp",
53+
str_like(transform_method, "%log2%") ~ "exp",
54+
transform_method %in% c("log1p", "log1p, base e", "Scanpy",
55+
"scanpy.api.pp.normalize_per_cell method, scaling factor 10000") ~ "expm1",
56+
transform_method == "log1p, base 2" ~ "expm1",
57+
transform_method == "NORMAL" ~ "exp",
58+
transform_method == "COUNT" ~ "identity",
59+
is.na(transform_method) ~ "identity"
60+
) ) |>
61+
mutate(comment = case_when(str_like(x_normalization, "Counts%") ~ "a checkpoint for max value of Assay must <= 50",
62+
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "round negative value to 0",
63+
x_normalization == "normalized" ~ "round negative value to 0"
64+
))
65+
66+
sample_tbl <- sample_tbl |> mutate(transformation_function = map(
67+
method_to_apply,
68+
~ ( function(data) {
69+
assay_name <- data@assays |> names() |> magrittr::extract2(1)
70+
counts <- assay(data, assay_name)
71+
density_est <- counts |> as.matrix() |> density()
72+
mode_value <- density_est$x[which.max(density_est$y)]
73+
if (mode_value < 0 ) counts <- counts + abs(mode_value)
74+
75+
# Scale max counts to 20 to avoid any downstream failure
76+
if (.x != "identity" && (max(counts) > 20)) {
77+
scale_factor = 20 / max(counts)
78+
counts <- counts * scale_factor}
79+
80+
counts <- transform_method(counts)
81+
# round counts to avoid potential substraction error due to different digits print out
82+
counts <- counts |> round(5)
83+
majority_gene_counts = names(which.max(table(as.vector(counts)))) |> as.numeric()
84+
if (majority_gene_counts != 0) {
85+
counts <- counts - majority_gene_counts
86+
}
87+
88+
# Avoid downstream failures negative counts
89+
if((counts[,1:min(10000, ncol(counts))] |> min()) < 0)
90+
counts[counts < 0] <- 0
91+
92+
# Assign counts back to data
93+
assay(data, assay_name) <- counts
94+
95+
col_sums <- colSums(counts)
96+
# Drop all zero cells
97+
data <- data[, col_sums > 0]
98+
99+
# Avoid downstream binding error
100+
rowData(data) = NULL
101+
102+
data
103+
104+
}) |>
105+
# Meta programming, replacing the transformation programmatically
106+
substitute( env = list(transform_method = as.name(.x))) |>
107+
# Evaluate back to a working function
108+
eval()
109+
))
110+
111+
#sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
112+
sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
113+
114+
# Set the parent directory where the subdirectories will be created
115+
# parent_dir <- "~/scratch/Census_rerun/"
116+
#
117+
# # Directory names to create
118+
# dir_names <- paste0("run", 1:25)
119+
#
120+
# # Full paths of the directories
121+
# full_dir_paths <- file.path(parent_dir, dir_names)
122+
#
123+
# # Create each directory if it does not exist
124+
# for (dir_path in full_dir_paths) {
125+
# if (!dir_exists(dir_path)) {
126+
# dir_create(dir_path)
127+
# }
128+
# }
129+
130+
# Run 1000 samples per run. Save log and result in the corresponding store
131+
store = "~/scratch/Census_rerun/run3/"
132+
setwd(glue("{store}"))
133+
sliced_sample_tbl = sample_tbl |> slice(2001:3000) |> select(file_name, tier, cell_number, dataset_id,
134+
sample_2, transformation_function)
135+
136+
# Enable sample_names.rds to store sample names for the input
137+
sample_names <- sliced_sample_tbl |> pull(file_name) |> set_names(sliced_sample_tbl |> pull(sample_2))
138+
139+
sample_names |>
140+
initialise_hpc(
141+
gene_nomenclature = "ensembl",
142+
data_container_type = "anndata",
143+
store = store,
144+
tier = sliced_sample_tbl |> pull(tier),
145+
computing_resources = list(
146+
crew_controller_slurm(
147+
name = "tier_1",
148+
script_lines = "#SBATCH --mem 35G",
149+
slurm_cpus_per_task = 1,
150+
workers = 200,
151+
tasks_max = 1,
152+
verbose = T
153+
),
154+
155+
crew_controller_slurm(
156+
name = "tier_2",
157+
script_lines = "#SBATCH --mem 60G",
158+
slurm_cpus_per_task = 1,
159+
workers = 50,
160+
tasks_max = 1,
161+
verbose = T
162+
),
163+
crew_controller_slurm(
164+
name = "tier_3",
165+
script_lines = "#SBATCH --mem 90G",
166+
slurm_cpus_per_task = 1,
167+
workers = 25,
168+
tasks_max = 1,
169+
verbose = T
170+
),
171+
crew_controller_slurm(
172+
name = "tier_4",
173+
script_lines = "#SBATCH --mem 100G",
174+
slurm_cpus_per_task = 1,
175+
workers = 14,
176+
tasks_max = 1,
177+
verbose = T
178+
)
179+
)
180+
181+
) |>
182+
tranform_assay(fx = sliced_sample_tbl |>
183+
pull(transformation_function),
184+
target_output = "sce_transformed") |>
185+
186+
# Remove empty outliers based on RNA count threshold per cell
187+
remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>
188+
189+
# Remove dead cells
190+
remove_dead_scuttle(target_input = "sce_transformed") |>
191+
192+
# Score cell cycle
193+
score_cell_cycle_seurat(target_input = "sce_transformed") |>
194+
195+
# Remove doublets
196+
remove_doublets_scDblFinder(target_input = "sce_transformed") |>
197+
198+
# Annotation
199+
annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref") |>
200+
201+
normalise_abundance_seurat_SCT(
202+
factors_to_regress = c("subsets_Mito_percent", "subsets_Ribo_percent", "G2M.Score"),
203+
target_input = "sce_transformed"
204+
)
205+
206+

0 commit comments

Comments
 (0)