11library(glue )
22library(dplyr )
3- library(arrow )
4- library(SingleCellExperiment )
53library(tibble )
6- library(SummarizedExperiment )
74library(glue )
85library(purrr )
9- library(zellkonverter )
10- library(tidyr )
11- library(ggplot2 )
12- library(plotly )
136library(targets )
147library(stringr )
15- library(CuratedAtlasQueryR )
16- library(fs )
178library(HPCell )
189library(crew.cluster )
10+ library(arrow )
11+ library(CuratedAtlasQueryR )
1912directory = " /vast/scratch/users/shen.m/Census_rerun/split_h5ad_based_on_sample_id/"
2013sample_anndata <- dir(glue(" {directory}" ), full.names = T )
2114downloaded_samples_tbl <- read_parquet(" /vast/scratch/users/shen.m/Census_rerun/census_samples_to_download_groups.parquet" )
@@ -33,8 +26,8 @@ result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_20
3326
3427sample_meta <- tar_read(metadata_dataset_id_common_sample_columns , store = glue(" {result_directory}/_targets" ))
3528sample_tbl = downloaded_samples_tbl | > left_join(get_metadata() | > select(dataset_id , contains(" norm" )) | >
36- distinct() | > filter(! is.na(x_normalization )) | >
37- as_tibble(), by = " dataset_id" )
29+ distinct() | > filter(! is.na(x_normalization )) | >
30+ as_tibble(), by = " dataset_id" )
3831
3932
4033sample_tbl <- sample_tbl | > left_join(sample_meta , by = " dataset_id" ) | > distinct(file_name , tier , cell_number , dataset_id , sample_2 ,
@@ -64,52 +57,8 @@ sample_tbl <- sample_tbl |> left_join(sample_meta, by = "dataset_id") |> distinc
6457 x_normalization == " normalized" ~ " round negative value to 0"
6558 ))
6659
67- sample_tbl <- sample_tbl | > mutate(transformation_function = map(
68- method_to_apply ,
69- ~ ( function (data ) {
70- assay_name <- data @ assays | > names() | > magrittr :: extract2(1 )
71- counts <- assay(data , assay_name )
72- density_est <- counts | > as.matrix() | > density()
73- mode_value <- density_est $ x [which.max(density_est $ y )]
74- if (mode_value < 0 ) counts <- counts + abs(mode_value )
75-
76- # Scale max counts to 20 to avoid any downstream failure
77- if (.x != " identity" && (max(counts ) > 20 )) {
78- scale_factor = 20 / max(counts )
79- counts <- counts * scale_factor }
80-
81- counts <- transform_method(counts )
82- # round counts to avoid potential substraction error due to different digits print out
83- counts <- counts | > round(5 )
84- majority_gene_counts = names(which.max(table(as.vector(counts )))) | > as.numeric()
85- if (majority_gene_counts != 0 ) {
86- counts <- counts - majority_gene_counts
87- }
88-
89- # Avoid downstream failures negative counts
90- if ((counts [,1 : min(10000 , ncol(counts ))] | > min()) < 0 )
91- counts [counts < 0 ] <- 0
92-
93- # Assign counts back to data
94- assay(data , assay_name ) <- counts
95-
96- col_sums <- colSums(counts )
97- # Drop all zero cells
98- data <- data [, col_sums > 0 ]
99-
100- # Avoid downstream binding error
101- rowData(data ) = NULL
10260
103- data
104-
105- }) | >
106- # Meta programming, replacing the transformation programmatically
107- substitute( env = list (transform_method = as.name(.x ))) | >
108- # Evaluate back to a working function
109- eval()
110- ))
11161
112- sample_tbl <- readRDS(" /vast/scratch/users/shen.m/Census_rerun/sample_tbl_input_for_hpcell.rds" )
11362
11463# Set the parent directory where the subdirectories will be created
11564# parent_dir <- "~/scratch/Census_rerun/"
@@ -128,68 +77,110 @@ sample_tbl <- readRDS("/vast/scratch/users/shen.m/Census_rerun/sample_tbl_input_
12877# }
12978
13079# Run 1000 samples per run. Save log and result in the corresponding store
131- store = " /vast/projects/mangiola_immune_map/PostDoc/CuratedAtlasQueryR/dev/debug_hpcell/target_store"
132- setwd(glue(" {store}" ))
133- sliced_sample_tbl = sample_tbl | > slice(2001 : 3000 ) | > select(file_name , tier , cell_number , dataset_id ,
134- sample_2 , transformation_function )
80+
81+ # setwd(glue("{store}"))
82+ sliced_sample_tbl =
83+ sample_tbl | >
84+ select(file_name , tier , cell_number , dataset_id , sample_2 , method_to_apply )
85+
86+ # sliced_sample_tbl =
87+ # sliced_sample_tbl |>
88+ # slice(1:20)
13589
13690# Enable sample_names.rds to store sample names for the input
137- sample_names <- sliced_sample_tbl | > pull(file_name ) | > set_names(sliced_sample_tbl | > pull(sample_2 ))
138- sample_names = sample_names | > str_replace(" /home/users/allstaff/shen.m/scratch" , " /vast/scratch/users/shen.m" )
139-
140- sample_names | >
141- initialise_hpc(
142- gene_nomenclature = " ensembl" ,
143- data_container_type = " anndata" ,
144- store = store ,
145- tier = sliced_sample_tbl | > pull(tier ),
146- computing_resources = list (
147- crew_controller_slurm(
148- name = " tier_1" ,
149- script_lines = " #SBATCH --mem 35G" ,
150- slurm_cpus_per_task = 1 ,
151- workers = 200 ,
152- tasks_max = 1 ,
153- verbose = T
91+ sample_names <-
92+ sliced_sample_tbl | >
93+ pull(file_name ) | >
94+ str_replace(" /home/users/allstaff/shen.m/scratch" , " /vast/scratch/users/shen.m" ) | >
95+ set_names(sliced_sample_tbl | > pull(sample_2 ))
96+ tiers = sliced_sample_tbl | > pull(tier )
97+ functions = sliced_sample_tbl | > pull(method_to_apply )
98+ # rm(sliced_sample_tbl)
99+ # gc()
100+
101+ job :: job({
102+
103+ library(HPCell )
104+
105+ sample_names | >
106+ initialise_hpc(
107+ gene_nomenclature = " ensembl" ,
108+ data_container_type = " anndata" ,
109+ store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" ,
110+ # store = "/vast/projects/cellxgene_curated/census_hpcell_oct_2024/target_store_1_20",
111+ tier = tiers ,
112+ computing_resources = list (
113+ crew_controller_slurm(
114+ name = " tier_1" ,
115+ script_lines = " #SBATCH --mem 8G" ,
116+ slurm_cpus_per_task = 1 ,
117+ workers = 300 ,
118+ tasks_max = 10 ,
119+ verbose = T ,
120+ launch_max = 5
121+ ),
122+
123+ crew_controller_slurm(
124+ name = " tier_2" ,
125+ script_lines = " #SBATCH --mem 10G" ,
126+ slurm_cpus_per_task = 1 ,
127+ workers = 200 ,
128+ tasks_max = 10 ,
129+ verbose = T ,
130+ launch_max = 5
131+ ),
132+ crew_controller_slurm(
133+ name = " tier_3" ,
134+ script_lines = " #SBATCH --mem 15G" ,
135+ slurm_cpus_per_task = 1 ,
136+ workers = 100 ,
137+ tasks_max = 10 ,
138+ verbose = T ,
139+ launch_max = 5
140+ ),
141+ crew_controller_slurm(
142+ name = " tier_4" ,
143+ script_lines = " #SBATCH --mem 70G" ,
144+ slurm_cpus_per_task = 1 ,
145+ workers = 100 ,
146+ tasks_max = 10 ,
147+ verbose = T ,
148+ launch_max = 5
149+ )
154150 ),
151+ verbosity = " summary" ,
152+ # debug_step = "annotation_tbl_tier_4",
153+ update = " never" ,
154+ error = " continue" ,
155+ garbage_collection = 100 ,
156+ workspace_on_error = TRUE
155157
156- crew_controller_slurm(
157- name = " tier_2" ,
158- script_lines = " #SBATCH --mem 60G" ,
159- slurm_cpus_per_task = 1 ,
160- workers = 50 ,
161- tasks_max = 1 ,
162- verbose = T
163- ),
164- crew_controller_slurm(
165- name = " tier_3" ,
166- script_lines = " #SBATCH --mem 90G" ,
167- slurm_cpus_per_task = 1 ,
168- workers = 25 ,
169- tasks_max = 1 ,
170- verbose = T
171- ),
172- crew_controller_slurm(
173- name = " tier_4" ,
174- script_lines = " #SBATCH --mem 100G" ,
175- slurm_cpus_per_task = 1 ,
176- workers = 14 ,
177- tasks_max = 1 ,
178- verbose = T
179- )
180- )
158+ ) | >
159+ tranform_assay(fx = function s , target_output = " sce_transformed" ) | >
181160
182- ) | >
183- tranform_assay(fx = sliced_sample_tbl | >
184- pull(transformation_function ),
185- target_output = " sce_transformed" )
186-
187- | >
161+ # # Remove empty outliers based on RNA count threshold per cell
162+ remove_empty_threshold(target_input = " sce_transformed" , RNA_feature_threshold = 200 ) | >
163+
164+ # Annotation
165+ annotate_cell_type(target_input = " sce_transformed" , azimuth_reference = " pbmcref" ) | >
166+
167+ print()
188168
189- # Remove empty outliers based on RNA count threshold per cell
190- remove_empty_DropletUtils(target_input = " sce_transformed" , RNA_feature_threshold = 200 ) | >
191169
192- # Annotation
193- annotate_cell_type(target_input = " sce_transformed" , azimuth_reference = " pbmcref" )
170+ })
171+
172+
173+
174+ tar_meta(starts_with(" annotation_tbl" ), store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" ) | >
175+ filter(! error | > is.na()) | > arrange(desc(time )) | > select(error , name )
176+
177+ # I have to check the input of this NULL target
178+ # annotation_tbl_tier_1_ecac957542df0c20
179+
180+ tar_workspace(annotation_tbl_tier_4_8fcdb6edc543d3ea , store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" )
181+ annotation_label_transfer(sce_transformed_tier_4 , empty_droplets_tbl = empty_tbl_tier_4 , reference_azimuth = " pbmcref" , feature_nomenclature = gene_nomenclature )
182+
183+ tar_read_raw(" annotation_tbl_tier_1_ecac957542df0c20" , store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" )
194184
195185
186+ # tar_invalidate(starts_with("annotation_tbl_tier_"), store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store")
0 commit comments