Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 10531e7

Browse files
committed
update dev scripts
1 parent 8902d86 commit 10531e7

File tree

2 files changed

+116
-131
lines changed

2 files changed

+116
-131
lines changed

dev/HCA_cell_type_consensus.R

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,47 +9,41 @@ parquet_file = "/vast/projects/cellxgene_curated/census_samples/concensus_input.
99

1010
data_tbl <- tbl(con, sql(paste0("SELECT * FROM read_parquet('", parquet_file, "')")))
1111

12-
annotation_combination =
13-
data_tbl |>
14-
#select(azimuth_predicted.celltype.l2, monaco_first.labels.fine, blueprint_first.labels.fine) |>
15-
select(cell_, dataset_id, cell_type, cell_type_ontology_term_id, azimuth_predicted.celltype.l2, monaco_first.labels.fine, blueprint_first.labels.fine)
16-
#arrange(desc(n)) |>
17-
18-
19-
20-
12+
# annotation_combination =
13+
# data_tbl |>
14+
# #select(azimuth_predicted.celltype.l2, monaco_first.labels.fine, blueprint_first.labels.fine) |>
15+
# select(cell_, dataset_id, cell_type, cell_type_ontology_term_id, azimuth_predicted.celltype.l2, monaco_first.labels.fine, blueprint_first.labels.fine)
16+
# #arrange(desc(n)) |>
2117

2218
annotation_consensus =
23-
annotation_combination |>
19+
data_tbl |>
2420
distinct(azimuth_predicted.celltype.l2, monaco_first.labels.fine, blueprint_first.labels.fine) |>
2521
as_tibble() |>
26-
mutate(reannotation_consensus = reference_annotation_to_consensus(azimuth_input = azimuth_predicted.celltype.l2, monaco_input = monaco_first.labels.fine, blueprint_input = blueprint_first.labels.fine ))
22+
mutate(data_driven_consensus = reference_annotation_to_consensus(azimuth_input = azimuth_predicted.celltype.l2, monaco_input = monaco_first.labels.fine, blueprint_input = blueprint_first.labels.fine ))
2723

2824

29-
annotation_combination =
30-
annotation_combination |>
25+
data_tbl =
26+
data_tbl |>
3127
left_join(annotation_consensus, copy = TRUE)
3228

3329
output_parquet <- "/vast/projects/mangiola_immune_map/PostDoc/CuratedAtlasQueryR/dev/consensus_output.parquet"
3430

31+
con_write <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")
32+
3533
# Use DuckDB's COPY TO command to write the data back to Parquet
3634
# We need to execute a SQL command using dbExecute()
3735
copy_query <- paste0("
3836
COPY (
3937
SELECT *
4038
FROM (
41-
", dbplyr::sql_render(annotation_combination), "
39+
", dbplyr::sql_render(data_tbl), "
4240
)
4341
) TO '", output_parquet, "' (FORMAT PARQUET);
4442
")
4543

4644
# Execute the COPY command
47-
dbExecute(con, copy_query)
45+
dbExecute(con_write, copy_query)
4846

4947
# Disconnect from the database
50-
dbDisconnect(con, shutdown = TRUE)
51-
52-
# Read back
53-
con <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")
54-
data_consensus <- tbl(con, sql(paste0("SELECT * FROM read_parquet('", output_parquet, "')")))
48+
dbDisconnect(con_write, shutdown = TRUE)
5549

Lines changed: 102 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,14 @@
11
library(glue)
22
library(dplyr)
3-
library(arrow)
4-
library(SingleCellExperiment)
53
library(tibble)
6-
library(SummarizedExperiment)
74
library(glue)
85
library(purrr)
9-
library(zellkonverter)
10-
library(tidyr)
11-
library(ggplot2)
12-
library(plotly)
136
library(targets)
147
library(stringr)
15-
library(CuratedAtlasQueryR)
16-
library(fs)
178
library(HPCell)
189
library(crew.cluster)
10+
library(arrow)
11+
library(CuratedAtlasQueryR)
1912
directory = "/vast/scratch/users/shen.m/Census_rerun/split_h5ad_based_on_sample_id/"
2013
sample_anndata <- dir(glue("{directory}"), full.names = T)
2114
downloaded_samples_tbl <- read_parquet("/vast/scratch/users/shen.m/Census_rerun/census_samples_to_download_groups.parquet")
@@ -33,8 +26,8 @@ result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_20
3326

3427
sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
3528
sample_tbl = downloaded_samples_tbl |> left_join(get_metadata() |> select(dataset_id, contains("norm")) |>
36-
distinct() |> filter(!is.na(x_normalization)) |>
37-
as_tibble(), by = "dataset_id")
29+
distinct() |> filter(!is.na(x_normalization)) |>
30+
as_tibble(), by = "dataset_id")
3831

3932

4033
sample_tbl <- sample_tbl |> left_join(sample_meta, by = "dataset_id") |> distinct(file_name, tier, cell_number, dataset_id, sample_2,
@@ -64,52 +57,8 @@ sample_tbl <- sample_tbl |> left_join(sample_meta, by = "dataset_id") |> distinc
6457
x_normalization == "normalized" ~ "round negative value to 0"
6558
))
6659

67-
sample_tbl <- sample_tbl |> mutate(transformation_function = map(
68-
method_to_apply,
69-
~ ( function(data) {
70-
assay_name <- data@assays |> names() |> magrittr::extract2(1)
71-
counts <- assay(data, assay_name)
72-
density_est <- counts |> as.matrix() |> density()
73-
mode_value <- density_est$x[which.max(density_est$y)]
74-
if (mode_value < 0 ) counts <- counts + abs(mode_value)
75-
76-
# Scale max counts to 20 to avoid any downstream failure
77-
if (.x != "identity" && (max(counts) > 20)) {
78-
scale_factor = 20 / max(counts)
79-
counts <- counts * scale_factor}
80-
81-
counts <- transform_method(counts)
82-
# round counts to avoid potential subtraction errors due to differing printed digits
83-
counts <- counts |> round(5)
84-
majority_gene_counts = names(which.max(table(as.vector(counts)))) |> as.numeric()
85-
if (majority_gene_counts != 0) {
86-
counts <- counts - majority_gene_counts
87-
}
88-
89-
# Avoid downstream failures due to negative counts
90-
if((counts[,1:min(10000, ncol(counts))] |> min()) < 0)
91-
counts[counts < 0] <- 0
92-
93-
# Assign counts back to data
94-
assay(data, assay_name) <- counts
95-
96-
col_sums <- colSums(counts)
97-
# Drop all zero cells
98-
data <- data[, col_sums > 0]
99-
100-
# Avoid downstream binding error
101-
rowData(data) = NULL
10260

103-
data
104-
105-
}) |>
106-
# Meta programming, replacing the transformation programmatically
107-
substitute( env = list(transform_method = as.name(.x))) |>
108-
# Evaluate back to a working function
109-
eval()
110-
))
11161

112-
sample_tbl <- readRDS("/vast/scratch/users/shen.m/Census_rerun/sample_tbl_input_for_hpcell.rds")
11362

11463
# Set the parent directory where the subdirectories will be created
11564
# parent_dir <- "~/scratch/Census_rerun/"
@@ -128,68 +77,110 @@ sample_tbl <- readRDS("/vast/scratch/users/shen.m/Census_rerun/sample_tbl_input_
12877
# }
12978

13079
# Run 1000 samples per run. Save log and result in the corresponding store
131-
store = "/vast/projects/mangiola_immune_map/PostDoc/CuratedAtlasQueryR/dev/debug_hpcell/target_store"
132-
setwd(glue("{store}"))
133-
sliced_sample_tbl = sample_tbl |> slice(2001:3000) |> select(file_name, tier, cell_number, dataset_id,
134-
sample_2, transformation_function)
80+
81+
# setwd(glue("{store}"))
82+
sliced_sample_tbl =
83+
sample_tbl |>
84+
select(file_name, tier, cell_number, dataset_id, sample_2, method_to_apply)
85+
86+
# sliced_sample_tbl =
87+
# sliced_sample_tbl |>
88+
# slice(1:20)
13589

13690
# Enable sample_names.rds to store sample names for the input
137-
sample_names <- sliced_sample_tbl |> pull(file_name) |> set_names(sliced_sample_tbl |> pull(sample_2))
138-
sample_names = sample_names |> str_replace("/home/users/allstaff/shen.m/scratch", "/vast/scratch/users/shen.m")
139-
140-
sample_names |>
141-
initialise_hpc(
142-
gene_nomenclature = "ensembl",
143-
data_container_type = "anndata",
144-
store = store,
145-
tier = sliced_sample_tbl |> pull(tier),
146-
computing_resources = list(
147-
crew_controller_slurm(
148-
name = "tier_1",
149-
script_lines = "#SBATCH --mem 35G",
150-
slurm_cpus_per_task = 1,
151-
workers = 200,
152-
tasks_max = 1,
153-
verbose = T
91+
sample_names <-
92+
sliced_sample_tbl |>
93+
pull(file_name) |>
94+
str_replace("/home/users/allstaff/shen.m/scratch", "/vast/scratch/users/shen.m") |>
95+
set_names(sliced_sample_tbl |> pull(sample_2))
96+
tiers = sliced_sample_tbl |> pull(tier)
97+
functions = sliced_sample_tbl |> pull(method_to_apply)
98+
# rm(sliced_sample_tbl)
99+
# gc()
100+
101+
job::job({
102+
103+
library(HPCell)
104+
105+
sample_names |>
106+
initialise_hpc(
107+
gene_nomenclature = "ensembl",
108+
data_container_type = "anndata",
109+
store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store",
110+
# store = "/vast/projects/cellxgene_curated/census_hpcell_oct_2024/target_store_1_20",
111+
tier = tiers,
112+
computing_resources = list(
113+
crew_controller_slurm(
114+
name = "tier_1",
115+
script_lines = "#SBATCH --mem 8G",
116+
slurm_cpus_per_task = 1,
117+
workers = 300,
118+
tasks_max = 10,
119+
verbose = T,
120+
launch_max = 5
121+
),
122+
123+
crew_controller_slurm(
124+
name = "tier_2",
125+
script_lines = "#SBATCH --mem 10G",
126+
slurm_cpus_per_task = 1,
127+
workers = 200,
128+
tasks_max = 10,
129+
verbose = T,
130+
launch_max = 5
131+
),
132+
crew_controller_slurm(
133+
name = "tier_3",
134+
script_lines = "#SBATCH --mem 15G",
135+
slurm_cpus_per_task = 1,
136+
workers = 100,
137+
tasks_max = 10,
138+
verbose = T,
139+
launch_max = 5
140+
),
141+
crew_controller_slurm(
142+
name = "tier_4",
143+
script_lines = "#SBATCH --mem 70G",
144+
slurm_cpus_per_task = 1,
145+
workers = 100,
146+
tasks_max = 10,
147+
verbose = T,
148+
launch_max = 5
149+
)
154150
),
151+
verbosity = "summary",
152+
# debug_step = "annotation_tbl_tier_4",
153+
update = "never",
154+
error = "continue",
155+
garbage_collection = 100,
156+
workspace_on_error = TRUE
155157

156-
crew_controller_slurm(
157-
name = "tier_2",
158-
script_lines = "#SBATCH --mem 60G",
159-
slurm_cpus_per_task = 1,
160-
workers = 50,
161-
tasks_max = 1,
162-
verbose = T
163-
),
164-
crew_controller_slurm(
165-
name = "tier_3",
166-
script_lines = "#SBATCH --mem 90G",
167-
slurm_cpus_per_task = 1,
168-
workers = 25,
169-
tasks_max = 1,
170-
verbose = T
171-
),
172-
crew_controller_slurm(
173-
name = "tier_4",
174-
script_lines = "#SBATCH --mem 100G",
175-
slurm_cpus_per_task = 1,
176-
workers = 14,
177-
tasks_max = 1,
178-
verbose = T
179-
)
180-
)
158+
) |>
159+
tranform_assay(fx = functions, target_output = "sce_transformed") |>
181160

182-
) |>
183-
tranform_assay(fx = sliced_sample_tbl |>
184-
pull(transformation_function),
185-
target_output = "sce_transformed")
186-
187-
|>
161+
# # Remove empty outliers based on RNA count threshold per cell
162+
remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>
163+
164+
# Annotation
165+
annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref") |>
166+
167+
print()
188168

189-
# Remove empty outliers based on RNA count threshold per cell
190-
remove_empty_DropletUtils(target_input = "sce_transformed", RNA_feature_threshold = 200) |>
191169

192-
# Annotation
193-
annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref")
170+
})
171+
172+
173+
174+
tar_meta(starts_with("annotation_tbl"), store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store") |>
175+
filter(!error |> is.na()) |> arrange(desc(time)) |> select(error, name)
176+
177+
# I have to check the input of this NULL target
178+
# annotation_tbl_tier_1_ecac957542df0c20
179+
180+
tar_workspace(annotation_tbl_tier_4_8fcdb6edc543d3ea, store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store")
181+
annotation_label_transfer(sce_transformed_tier_4, empty_droplets_tbl = empty_tbl_tier_4, reference_azimuth = "pbmcref", feature_nomenclature = gene_nomenclature)
182+
183+
tar_read_raw("annotation_tbl_tier_1_ecac957542df0c20", store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store")
194184

195185

186+
# tar_invalidate(starts_with("annotation_tbl_tier_"), store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store")

0 commit comments

Comments
 (0)