Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 1e068ea

Browse files
committed
census dev scripts
1 parent 7501f03 commit 1e068ea

File tree

3 files changed

+438
-0
lines changed

3 files changed

+438
-0
lines changed

dev/census_accepted_assays.csv

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
id,assay
2+
EFO:0003755,FL-cDNA
3+
EFO:0008640,3'T-fill
4+
EFO:0008641,3’-end-seq
5+
EFO:0008643,3′-Seq
6+
EFO:0008661,Bru-Seq
7+
EFO:0008669,CAGEscan
8+
EFO:0008673,CapSeq
9+
EFO:0008675,CaptureSeq
10+
EFO:0008679,CEL-seq
11+
EFO:0008694,ClickSeq
12+
EFO:0008697,cP-RNA-Seq
13+
EFO:0008708,DeepCAGE
14+
EFO:0008710,Digital RNA
15+
EFO:0008718,DP-Seq
16+
EFO:0008720,DroNc-seq
17+
EFO:0008722,Drop-seq
18+
EFO:0008735,FACS-seq
19+
EFO:0008747,FRISCR
20+
EFO:0008748,FRT-Seq
21+
EFO:0008752,GMUCT 1.0
22+
EFO:0008753,GMUCT 2.0
23+
EFO:0008756,GRO-CAP
24+
EFO:0008763,Hi-SCL
25+
EFO:0008780,inDrop
26+
EFO:0008796,MARS-seq
27+
EFO:0008797,MATQ-seq
28+
EFO:0008824,NanoCAGE
29+
EFO:0008825,Nanogrid RNA-Seq
30+
EFO:0008826,NET-Seq
31+
EFO:0008850,PAS-Seq
32+
EFO:0008859,PEAT
33+
EFO:0008863,PLATE-Seq
34+
EFO:0008868,PRO-cap
35+
EFO:0008869,PRO-seq
36+
EFO:0008877,Quartz-seq
37+
EFO:0008896,RNA-Seq
38+
EFO:0008897,RNAtag-Seq
39+
EFO:0008898,RNET-seq
40+
EFO:0008903,SC3-seq
41+
EFO:0008908,SCI-seq
42+
EFO:0008919,Seq-Well
43+
EFO:0008929,SMA
44+
EFO:0008930,Smart-seq
45+
EFO:0008931,Smart-seq2
46+
EFO:0008937,snDrop-seq
47+
EFO:0008941,sNuc-Seq
48+
EFO:0008945,SPET-seq
49+
EFO:0008953,STRT-seq
50+
EFO:0008954,STRT-seq-2i
51+
EFO:0008956,SUPeR-seq
52+
EFO:0008962,TARDIS
53+
EFO:0008966,TCR Chain Paring
54+
EFO:0008967,TCR-LA-MC PCR
55+
EFO:0008972,TL-seq
56+
EFO:0008974,Tomo-Seq
57+
EFO:0008975,TRAP-Seq
58+
EFO:0008978,TSS Sequencing
59+
EFO:0008980,UMI Method
60+
EFO:0009309,Div-Seq
61+
EFO:0009899,10x 3' v2
62+
EFO:0009900,10x 5' v2
63+
EFO:0009901,10x 3' v1
64+
EFO:0009919,SPLiT-seq
65+
EFO:0009922,10x 3' v3
66+
EFO:0009991,Nuc-Seq
67+
EFO:0010003,RASL-seq
68+
EFO:0010004,SCRB-seq
69+
EFO:0010010,CEL-seq2
70+
EFO:0010022,Smart-3Seq
71+
EFO:0010034,Cappable-Seq
72+
EFO:0010041,Nascent-Seq
73+
EFO:0010058,Fluidigm C1-based library preparation
74+
EFO:0010184,Smart-like
75+
EFO:0010550,sci-RNA-seq
76+
EFO:0010713,10x immune profiling
77+
EFO:0010714,10x TCR enrichment
78+
EFO:0010715,10x Ig enrichment
79+
EFO:0010964,barcoded plate-based single cell RNA-seq
80+
EFO:0011025,10x 5' v1
81+
EFO:0022396,TruSeq
82+
EFO:0022488,Smart-seq3
83+
EFO:0022490,ScaleBio single cell RNA sequencing
84+
EFO:0030002,microwell-seq
85+
EFO:0030003,10x 3' transcription profiling
86+
EFO:0030004,10x 5' transcription profiling
87+
EFO:0030019,Seq-Well S3
88+
EFO:0030021,Nx1-seq
89+
EFO:0030028,sci-RNA-seq3
90+
EFO:0030030,Quant-seq
91+
EFO:0030031,SCOPE-chip
92+
EFO:0030061,mcSCRB-seq
93+
EFO:0030074,SORT-seq
94+
EFO:0030078,droplet-based single-cell RNA library preparation
95+
EFO:0700003,BD Rhapsody Whole Transcriptome Analysis
96+
EFO:0700004,BD Rhapsody Targeted mRNA
97+
EFO:0700010,TruDrop
98+
EFO:0700011,GEXSCOPE technology
99+
EFO:0700016,Smart-seq v4
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
library(glue)
2+
library(dplyr)
3+
library(arrow)
4+
library(SingleCellExperiment)
5+
library(tibble)
6+
library(SummarizedExperiment)
7+
library(glue)
8+
library(purrr)
9+
library(zellkonverter)
10+
library(tidyr)
11+
library(ggplot2)
12+
library(plotly)
13+
library(targets)
14+
library(stringr)
15+
library(CuratedAtlasQueryR)
16+
library(fs)
17+
library(HPCell)
18+
library(crew.cluster)
19+
directory = "~/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
20+
sample_anndata <- dir(glue("{directory}"), full.names = T)
21+
downloaded_samples_tbl <- read_parquet("~/scratch/Census_rerun/census_samples_to_download_groups.parquet")
22+
downloaded_samples_tbl <- downloaded_samples_tbl |>
23+
rename(cell_number = list_length) |>
24+
mutate(file_name = glue("{directory}{sample_2}.h5ad") |> as.character(),
25+
tier = case_when(
26+
cell_number < 500 ~ "tier_1", cell_number >= 500 &
27+
cell_number < 1000 ~ "tier_2", cell_number >= 1000 &
28+
cell_number < 10000 ~ "tier_3", cell_number >= 10000 ~ "tier_4"
29+
))
30+
31+
result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024"
32+
sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
33+
sample_tbl = downloaded_samples_tbl |> left_join(get_metadata() |> select(dataset_id, contains("norm")) |>
34+
distinct() |> filter(!is.na(x_normalization)) |>
35+
as_tibble(), by = "dataset_id")
36+
37+
38+
sample_tbl <- sample_tbl |> left_join(sample_meta, by = "dataset_id") |> distinct(file_name, tier, cell_number, dataset_id, sample_2,
39+
observation_joinid, x_normalization, x_approximate_distribution) |>
40+
mutate(transform_method = case_when(str_like(x_normalization, "C%") ~ "log",
41+
x_normalization == "none" ~ "log",
42+
x_normalization == "normalized" ~ "log",
43+
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "log",
44+
is.na(x_normalization) & x_approximate_distribution == "NORMAL" ~ "NORMAL",
45+
is.na(x_normalization) & x_approximate_distribution == "COUNT" ~ "COUNT",
46+
str_like(x_normalization, "%canpy%") ~ "log1p",
47+
TRUE ~ x_normalization)) |>
48+
49+
mutate(method_to_apply = case_when(transform_method %in% c("log","LogNormalization","LogNormalize","log-normalization") ~ "exp",
50+
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "exp",
51+
str_like(transform_method, "Counts%") ~ "exp",
52+
str_like(transform_method, "%log2%") ~ "exp",
53+
transform_method %in% c("log1p", "log1p, base e", "Scanpy",
54+
"scanpy.api.pp.normalize_per_cell method, scaling factor 10000") ~ "expm1",
55+
transform_method == "log1p, base 2" ~ "expm1",
56+
transform_method == "NORMAL" ~ "exp",
57+
transform_method == "COUNT" ~ "identity",
58+
is.na(transform_method) ~ "identity"
59+
) ) |>
60+
mutate(comment = case_when(str_like(x_normalization, "Counts%") ~ "a checkpoint for max value of Assay must <= 50",
61+
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "round negative value to 0",
62+
x_normalization == "normalized" ~ "round negative value to 0"
63+
))
64+
65+
sample_tbl <- sample_tbl |> mutate(transformation_function = map(
66+
method_to_apply,
67+
~ ( function(data) {
68+
assay_name <- data@assays |> names() |> magrittr::extract2(1)
69+
counts <- assay(data, assay_name)
70+
density_est <- counts |> as.matrix() |> density()
71+
mode_value <- density_est$x[which.max(density_est$y)]
72+
if (mode_value < 0 ) counts <- counts + abs(mode_value)
73+
74+
# Scale max counts to 20 to avoid any downstream failure
75+
if (.x != "identity" && (max(counts) > 20)) {
76+
scale_factor = 20 / max(counts)
77+
counts <- counts * scale_factor}
78+
79+
counts <- transform_method(counts)
80+
# round counts to avoid potential substraction error due to different digits print out
81+
counts <- counts |> round(5)
82+
majority_gene_counts = names(which.max(table(as.vector(counts)))) |> as.numeric()
83+
if (majority_gene_counts != 0) {
84+
counts <- counts - majority_gene_counts
85+
}
86+
87+
# Avoid downstream failures negative counts
88+
if((counts[,1:min(10000, ncol(counts))] |> min()) < 0)
89+
counts[counts < 0] <- 0
90+
91+
# Assign counts back to data
92+
assay(data, assay_name) <- counts
93+
94+
col_sums <- colSums(counts)
95+
# Drop all zero cells
96+
data <- data[, col_sums > 0]
97+
98+
# Avoid downstream binding error
99+
rowData(data) = NULL
100+
101+
data
102+
103+
}) |>
104+
# Meta programming, replacing the transformation programmatically
105+
substitute( env = list(transform_method = as.name(.x))) |>
106+
# Evaluate back to a working function
107+
eval()
108+
))
109+
110+
#sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
111+
sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
112+
113+
# Set the parent directory where the subdirectories will be created
114+
# parent_dir <- "~/scratch/Census_rerun/"
115+
#
116+
# # Directory names to create
117+
# dir_names <- paste0("run", 1:25)
118+
#
119+
# # Full paths of the directories
120+
# full_dir_paths <- file.path(parent_dir, dir_names)
121+
#
122+
# # Create each directory if it does not exist
123+
# for (dir_path in full_dir_paths) {
124+
# if (!dir_exists(dir_path)) {
125+
# dir_create(dir_path)
126+
# }
127+
# }
128+
129+
# Run 1000 samples per run. Save log and result in the corresponding store
130+
store = "~/scratch/Census_rerun/run3/"
131+
setwd(glue("{store}"))
132+
sliced_sample_tbl = sample_tbl |> slice(2001:3000) |> select(file_name, tier, cell_number, dataset_id,
133+
sample_2, transformation_function)
134+
135+
# Enable sample_names.rds to store sample names for the input
136+
sample_names <- sliced_sample_tbl |> pull(file_name) |> set_names(sliced_sample_tbl |> pull(sample_2))
137+
138+
sample_names |>
139+
initialise_hpc(
140+
gene_nomenclature = "ensembl",
141+
data_container_type = "anndata",
142+
store = store,
143+
tier = sliced_sample_tbl |> pull(tier),
144+
computing_resources = list(
145+
crew_controller_slurm(
146+
name = "tier_1",
147+
script_lines = "#SBATCH --mem 35G",
148+
slurm_cpus_per_task = 1,
149+
workers = 200,
150+
tasks_max = 1,
151+
verbose = T
152+
),
153+
154+
crew_controller_slurm(
155+
name = "tier_2",
156+
script_lines = "#SBATCH --mem 60G",
157+
slurm_cpus_per_task = 1,
158+
workers = 50,
159+
tasks_max = 1,
160+
verbose = T
161+
),
162+
crew_controller_slurm(
163+
name = "tier_3",
164+
script_lines = "#SBATCH --mem 90G",
165+
slurm_cpus_per_task = 1,
166+
workers = 25,
167+
tasks_max = 1,
168+
verbose = T
169+
),
170+
crew_controller_slurm(
171+
name = "tier_4",
172+
script_lines = "#SBATCH --mem 100G",
173+
slurm_cpus_per_task = 1,
174+
workers = 14,
175+
tasks_max = 1,
176+
verbose = T
177+
)
178+
)
179+
180+
) |>
181+
tranform_assay(fx = sliced_sample_tbl |>
182+
pull(transformation_function),
183+
target_output = "sce_transformed") |>
184+
185+
# Remove empty outliers based on RNA count threshold per cell
186+
remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>
187+
188+
# Remove dead cells
189+
remove_dead_scuttle(target_input = "sce_transformed") |>
190+
191+
# Score cell cycle
192+
score_cell_cycle_seurat(target_input = "sce_transformed") |>
193+
194+
# Remove doublets
195+
remove_doublets_scDblFinder(target_input = "sce_transformed") |>
196+
197+
# Annotation
198+
annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref") |>
199+
200+
normalise_abundance_seurat_SCT(
201+
factors_to_regress = c("subsets_Mito_percent", "subsets_Ribo_percent", "G2M.Score"),
202+
target_input = "sce_transformed"
203+
)
204+
205+

0 commit comments

Comments
 (0)