@@ -13,23 +13,24 @@ library(HDF5Array)
1313library(openssl )
1414library(stringr )
1515library(HCAquery )
16+ library(purrr )
1617# # source("utility.R")
1718options(scipen = 999 )
1819#
1920
20- # CREATE MAKEFILE
21- tab = " \t "
22- root_directory = " /vast/projects/RCP/human_cell_atlas"
23- raw_data_directory = glue(" {root_directory}/raw_data" )
24- splitted_DB2_data_directory = glue(" {root_directory}/splitted_DB2_data " )
25- file_cell_types_directory = glue(" {root_directory}/file_cell_types" )
26- input_files_path = dir(file_cell_types_directory , full.names = TRUE )
27- gene_names = glue(" {root_directory}/gene_names.rds" )
21+ # # CREATE MAKEFILE
22+ # tab = "\t"
23+ # root_directory = "/vast/projects/RCP/human_cell_atlas"
24+ # raw_data_directory = glue("{root_directory}/raw_data")
25+ # splitted_DB2_data_directory = glue("{root_directory}/splitted_DB2_data_0.2 ")
26+ # file_cell_types_directory = glue("{root_directory}/file_cell_types")
27+ # input_files_path = dir(file_cell_types_directory, full.names = TRUE)
28+ # gene_names = glue("{root_directory}/gene_names.rds")
2829#
2930#
3031# ## metadata = readRDS(metadata_path)
3132#
32- # get_metadata() |>
33+ # get_metadata("/vast/projects/RCP/human_cell_atlas/metadata_annotated_0.2.sqlite" ) |>
3334# distinct(file_id, cell_type) |>
3435# as_tibble() |>
3536#
@@ -45,23 +46,21 @@ gene_names = glue("{root_directory}/gene_names.rds")
4546# mutate(memory = pmax(Mb * 2, 30000)) |>
4647# mutate(
4748# memory = case_when(
48- # input_file_path %in% c(
49- # "/vast/scratch/users/mangiola.s/human_cell_atlas/raw_data/b50b15f1-bf19-4775-ab89-02512ec941a6.H5AD",
50- # "/vast/scratch/users/mangiola.s/human_cell_atlas/raw_data/56e0359f-ee8d-4ba5-a51d-159a183643e5.H5AD",
51- # "/vast/scratch/users/mangiola.s/human_cell_atlas/raw_data/51f114ae-232a-4550-a910-934e175db814.H5AD",
52- # "/vast/scratch/users/mangiola.s/human_cell_atlas/raw_data/21ca95b3-776b-4fa2-9956-09a07c0e5224.H5AD"
53- # ) ~ 160000,
54- # input_file_path == "/vast/scratch/users/mangiola.s/human_cell_atlas/raw_data/08247324-7ab7-45c5-8bd6-6c22676761ed.H5AD" ~ 200000,
55- #
49+ # file_id_db2 %in% c(
50+ # '643d1a28d35ed6d9f087ed8732d0fbf0', 'f7cb9ed27af8bd19b8b05feeaf84ec3c', 'abb6ff154b70c74e395add7dca444bbc', '6690e8e6b89123b1e11eef9d4a153e49', '5734fb9bd18f48c88f1bab6e1ea6e5f7', 'd9f28c596bb5813d7da2e690b0473570', 'fea1eba9397c3763e77dc321ac1c35b8', '6b312db0bbe47249bf6dbb382a0f25aa', 'a252f45beddcd62c5ccc0894461861e2', '2d7cc7e650275abf80b672febbdbb47f', '64b9770f1679dc9a6d8b7b12613391ef', '133f0de6e86d712c079af21344ae4501', '2d6fa434060cd4e16987245c35294fdb', '5754ab342ca2f630f442c6fb11f7977f', 'b7411ae6d2d5025fe8735c9f5635bed3', '5a7c941fcf37216ce181e5af9857a897', 'b48aab951c5ac53a9b9b4250c309cb04', '9fe44e0afee2d48bbe9b9922490c4fdf', 'c4cda7f034432e9f88588905e96b8878', 'cb0fa793cff47ff26de5ebcf9d75d8d8', 'ec2cf3480224c44c2f87875b8a383416', '6c3915e70f391f4cc6d8460b8237e686', 'ecc5c83172b38a4c2d703cef9fee6600', '19bc78159fda9d0795ff82201a493627', '5e0dba8e536570c43c0e79bb4695671b', '680201e2dae91ed66c4cb2da45590607', '94c522ee71123a3b115bdebce647fb88', 'b448edda111f93495eb9eb0488c95613', 'ad33c246db845178ffa1d6d1de15946e', '672f90e815461cf1fde0fed7d83528b3', '95b234bb577816c848af6bdbf4e5f639', '5bf8f63f1b8435f51db88701578c9944', 'e604de2b6508a0056cf187b1790e2a2e', '7df8990ca100ac47df3dc857a739a531', 'a983aca109e8ef8cdfa2befbfec8a85d', 'e6cf0c9d608465fe69fc477e5d24a4da', 'a224c2660000d6fbc79296b31ddb05dd', '321ab8ef8102d8e9817588539216fcb2', 'd8fe285941ee94128c466ffe2fc72214', 'f464369e48799c7225ce613d4fac64e4', '60981c2a4aba42bfb81d4d52eb7d91c4', '2b2ca0d6dce081902657f28e6942a546', 'd48585f19116758ac55715d36af29faf', '2763074a04eafc1590ec8e4195c2d30b', '894c26dfae23520d5b1ec13681b973d4', 'bb270e6b0aea2bba1f8f044402b50d0f', '3837b82ce7110024d6010eb668c4d65f', 'afd38219dfc9fa010b181a6f1dc167df', 'f29f36400dcaa5b550e055dee3a6bfab', '0031d2758f70b4602aa5980a96a96a12', '044fc974c7f546b2f61d15ae5c804ddf', '562594615b5d41f9ebe52c2489325045', '9f7827abf98794f8c37d87e5de01691b', '1994ae237838e3271488f98a7a5afc37', 'aa9b46796d3d0e7cf774c12e34cc872f', '2bc6ef5c0f153cdaba618788f75dff13', '124aa1104d8fed6fa9e98fa2ca5c00eb', 'c032b8738a8d06e9f261f76564de0ec8', 'c54f874f27a35cce97fcb3c67701c8c3', 'bc89346be5f9ffc76d8b920f117f244d', '136bcc485075f101e9d109d0222f7211', '9ed4da7076000ffac93a812b075812da', '90099dad83392f82d7cccaff14ab21a8', '2bcc0d793ed9db6ac939c86f4fa67044', 'e29a49f612b72c770276406500392790', '88eeb885ba02a8f5a30202a65f2314d2', '93cfef1122d69293a9f19c054e031b0a', 'cfe2deeeb735797b871828d9860eff41', '11b27ed03f34435b1ed9b513705a99cf', 'b72533ee50311aded927f2d4acc2d696', '63473d2b4aabefe0385fc0d8a0378115', '6909d30b9f20cbb1c4f22a596918bd69', '4c39fdd1e8eb9a146f6e0766a8fbe4c7', 'ff572a8430b657bfc79fb6c6c3760ac1', 'f01c20412cab27a5e8feacd170cfc900', '0ae84745b3261a021cc0cfe055ca8c46', 'b61dc9d6067dc0aae0db5007e4d7f876', '1b855ddca92afe1bed522bb07dc5f788', '8aff79b4657c0aff54c9016ac1eef46e', '1d85feaf473d55854639514693504c02', 'b2377c149a40b99ec331dbf5ab1cf4ce', '77a8cd32e8188e707ed83575fcbed904', '7e8e47f7eb2537368d06cec14ac3a458', '03ff63330952859dbf86f41a48df6a83', '96c252abaf78877e2f400d838bf69166', '42b08eb49a5390b9e66ddc98663eb269', '148e3d4c60692354e3d1948519e4a0ea', 'f92e17f4efd838108af21129becd7b19', '5b56fc4fb694c62d4de406c24c7ca5e7', '63fbfcf952e803f5c39473fafb599251',
51+ # 'b531a86d7188be31428a03603d7405c6', '768700aa330a2a5891ad22b67ea49dc1', '2f36daf2a3a8aedd48db06acd6727b0a', '01a6385ee03ac917f245d7e52faa9f66', '02ffa11eb4dbc3106eda5dbdec29e817', '542e6a445f3a4d9dad4e32acc525f2c4', '3513dd06f3363335e0561979a5e5f2d1', '3c88cfffed093f566b06783c7033948f', '45c95a5fb1f303c138b0f60b54a94c2c', '848c8bf283e6c39555ed5085b3eba735', '2b9f5b251111939dd8689ddf86309c7b', '93ce665d647aa0319fc444596eb2bb5a', 'e61d313adfe3604cafc773210f461fc1', 'c6ebf4689bb0b3fe66b0139ccaf83ac2', '0b934d2125cf7ca9564ccdab65797c11', '396a6963d3b402548b38676200b9a544', 'dcc9e9def75bc9bae1b893da13daf378', 'c4aafa1124d87da0606bc67bb125d702', '018d85cd9702db9339920f0cb91d7afc', 'b628ccb3d8721fc019ae2ef279375a08', 'ca1de9c53113407dc7373bd33cad2a57', '92852e108c7612c0d7ff4ceccd59bdca', '133f20d89ba49eb68a7e231e590a0ec7', '90679d805829232802b794baf6a09338', '255400de1eda3bec328dcb932a0ce70e', 'ea03efb1ec2466ff0c3925a2c25255bc', '69bd89af9e84e240de4ae1a9c4730b78', '4b92fdac8484757e1ed805f2ef485aff', 'cea187ba19982b64b651046d0c55d683', '09a1bc602a7ad508f10b1c83785e3f5f', 'e27843805c7e5e9743202ad132f0cb74', 'd8216db7dd67453b948025092631e8f0', '76cb634ba33e4438517fa388f472413f', '0e3b0626806414deb561d4c95edf6cc9', 'd6dde5273fdf62bf1776a24007d59ae2', 'f95b23918aa5bcab2c0cc9dcc85e3f2f', 'ba1017fcabbc3f4552554cac3e1e8d51', 'd7096504ae4361e995f97dbcdcdea94e', 'ee701a2474084b3477ce862f2cef54d0', 'b708a95d6bc07e1d79dab68972fde016', '495f8be32619aa791c71947b66a6e44c', 'e48af5a58cf363b65dc2c027616155c3', 'a837f0376b0af3ee2243a26f96003fb6', 'bf522b5a8a05ef06876b1166cf76cd76', 'f2e97e6eb83c6ae6315735c3a58475dd', 'db3dc0285cc2e4866c211e151559df40', '4d957fd55b43b527b17f82a2e8534337', 'c646e9fb94b3331944050967ea820d11', '051579deb85a5b7eabe7c6df3264d302', 'ebdee2282dfd6ea3cb60844087f4c923', '3795117167beb9768034b7443fe5b754', 'cf1cf8debfc581c217707fb09d1c0cc9', '0c34089dbd4db0d48af97873baa8cb10', '867797d630b0a67438a7b1b6ddb9f46f', '44600d9ff423b2f8a674e4fdf928c36f', 'd8036a66b91006fb87312ab73b6f2647', 'd7ab07844e32d4e609e3fae4f6e624b5', '37decab3aa35040d72af0eeff9e557c2', '98435ec4176e33164ac2dd092911518b', 'b957873f07b73bc9fe2858de7560d354', '52643bbd6767cf5904b0bec3eaafe283', '453b598c0fa37d3df507932d8b742eca', '25be440bc59f97942579d25419517781', '48730f8b9ea4a4eb67c4826cec75e6d7', '6d4ceccbec07d4b3f41b8d73a5cc54a4', '099e4f9788ea5646b01899a4565b449b', '3cd5853986f381dc40e02414f76ae0a4', '63f6d25357ac92d889d2a2e08a6d95f1', 'ccf04cce8134efe5e667e2c0caa57474', 'd0f11393c659cf2f70dc989d21a682fa', '5b845723652627151f4ff121333fede1', 'a3802f7db8829109ac98eb4552231d54', '558dd61fb7a6a47ef76b97384898de27', 'cc556cae149faa24f2d011806d65168e', 'c3d01f08f2eb4f769b0fa3497e8222ff', '64bcbbf198f7e191e06173b054cff02a', 'a50e764e1e49e7877bf91864665de354', '4269e17cd671c56a9865b1ed5e316a61', '5a8d3746fadcec538c3d70490cc888ed', '2e5d52ce7ee10ea5bb37769409449e01', 'd0ad238960be64ad4cd8f67fa58a946b', '40d764a54eae749b6852910655e7c35c', '749821212c25ca95abb1831c48764d2a', '1625d480e0c3160c5e34c53006cd85ee', '354f1780bb909fe141b16a5ce155caf8', '51ae4a30442db8f7fcc3f77899370100', '4bb27edd1f01bf191c77271b580a186c', 'e22f00a4abd2d8d9ebe2ca5a6055a0d5', '4d3eeb032b517f91b8ac75babbd8a13b', 'c5db9e3b4ecc205b7d5d09ca660a0c4b', '314dce2d49ecf8b3d83d97f854b471ef', 'd58447f6ceb1c959538846df532bf82a', '346c5e08ea5bafee68a471f0f04fe397', '2c162ef8a4bdfef1ab5dd2d45389b232', 'f55a91bc0057a859f94245b047087dbf', '24dad6be9f859b50fb4f00e1a7981f82', '59a53c1c0078ad9cb88bdbf161c11800', 'c5a05f23f9784a3be3bfa651198a48eb', '18d13cca5e607b917f3c60bd75f863cd', '521b9b73c5b032f8e2809a5032d18266', '18ecc71ecc0df104a1628bdd4e529f85', '7aec18c648d573f34641b8329a2afe3e'
52+ # ) ~ 60000,
53+ # TRUE ~ memory
54+ # )
55+ # ) |>
56+ # mutate(
57+ # memory = case_when(
58+ # file_id_db2 %in% c(
59+ # 'f7cb9ed27af8bd19b8b05feeaf84ec3c', 'abb6ff154b70c74e395add7dca444bbc', '2d6fa434060cd4e16987245c35294fdb', '5a7c941fcf37216ce181e5af9857a897', '2c162ef8a4bdfef1ab5dd2d45389b232', 'f55a91bc0057a859f94245b047087dbf', '24dad6be9f859b50fb4f00e1a7981f82', 'c5a05f23f9784a3be3bfa651198a48eb'
60+ # ) ~ 120000,
5661# TRUE ~ memory
5762# )
5863# ) |>
59- # mutate(memory = if_else(file_id_db2 %in% c(
60- # readRDS("dev/DB2_files_list_that_need_more_memory.rds"),
61- # readRDS("dev/DB2_files_list_that_need_more_memory_2.rds")
62- # ), 100000, memory)) |>
63- # mutate(memory = if_else(file_id_db2 %in% c('a0a56250b7b38ab3c30df2ded38d1c1f', '317b03f4caa6af8d147fd74d44c04adf', 'beda5a9dd9ab716431b0e2db194ffbe0', '7db83cd468bfcb5cf125996437412b96', '74e3dc029ccd9e5e1c82a64ece3c616e', '5e75192c96af82a0a715998bcb417cb1', '11585c1edaa11913c306477bb3d9d552', '8c0ee4326521ac2daeb411e161cbf6dd', 'aea9854f73ec194517b3eb525b203cf4', 'e37562a76ad784eca662bf5c7a3ca409'), 200000, memory)) |>
64- #
6564# rowid_to_column() |>
6665# mutate(commands = pmap(
6766# list(output_file_path, input_file_path, memory, rowid),
@@ -75,14 +74,14 @@ gene_names = glue("{root_directory}/gene_names.rds")
7574# )) |>
7675# pull(commands) |>
7776# unlist() |>
78- # write_lines(glue("dev/ DB2_files.makeflow"))
77+ # write_lines(glue("DB2_files.makeflow"))
7978
8079
8180
8281# Read arguments
8382args = commandArgs(trailingOnly = TRUE )
8483input_file = args [[1 ]]
85- all_gene_names = args [[2 ]]
84+ gene_names = args [[2 ]]
8685output_file = args [[3 ]]
8786
8887output_file | > dirname() | > dir.create( showWarnings = FALSE , recursive = TRUE )
@@ -92,29 +91,108 @@ file_id_db2 = basename(output_file) |> tools::file_path_sans_ext()
9291cells_to_keep =
9392 get_metadata() | >
9493 filter(file_id == !! file_id ) | >
95- select(.cell , .sample , file_id , cell_type ) | >
94+ dplyr :: select(.cell , .sample , file_id , cell_type ) | >
9695 as_tibble() | >
9796 unite(" file_id_db2" , c(file_id , cell_type ), remove = FALSE ) | >
9897 mutate(file_id_db2 = file_id_db2 | > md5() | > as.character()) | >
9998 filter(file_id_db2 == !! file_id_db2 ) | >
100- mutate(.cell_original = .cell | > str_remove(.sample ) | > str_remove(" _$" ))
99+ mutate(.cell_original = .cell | > str_remove(.sample ) | > str_remove(" _$" )) | >
100+ pull(.cell_original )
101+
102+ # Read
103+ data = readH5AD(input_file , use_hdf5 = TRUE , reader = " R" )
104+ data_for_rows_and_columns = readH5AD(input_file , use_hdf5 = TRUE , skip_assays = TRUE )
105+ colnames(data ) = data_for_rows_and_columns | > colnames()
106+ rownames(data ) = data_for_rows_and_columns | > rownames()
107+ rm(data_for_rows_and_columns )
108+ gc()
109+
110+ # Subset cells
111+ data = data [,cells_to_keep ] | > select(.cell )
112+ gc()
113+
114+ col_sums = colSums(data @ assays @ data $ X )
115+
116+ # If I have negative values
117+ if ((data @ assays @ data $ X [,1 : min(10000 , ncol(data @ assays @ data $ X ))] | > min()) < 0 )
118+ data @ assays @ data $ X [data @ assays @ data $ X < 0 ] <- 0
119+
120+ # If there are huge value cap
121+ if (max(col_sums ) > 1e100 ) {
122+ temp = data @ assays @ data $ X [,sample(1 : ncol(data ), size = 10000 , replace = T ), drop = T ]
123+ q = quantile(temp [temp > 0 ], 0.9 )
124+ data @ assays @ data $ X [data @ assays @ data $ X > 1e100 ] = q
125+ }
126+
127+ # Drop all 0 cells
128+ data = data [,col_sums > 0 ]
129+
130+ # Set gene names
131+ rownames(data ) = AnnotationDbi :: mapIds(org.Hs.eg.db :: org.Hs.eg.db ,
132+ keys = rownames(data ),
133+ keytype = " ENSEMBL" ,
134+ column = " SYMBOL" ,
135+ multiVals = " first"
136+ )
137+ data = data [! is.na(rownames(data )),]
138+ rownames(data @ assays @ data $ X ) = rownames(data )
139+
140+ # Merge duplicate genes
141+ split_matrix_by_column <- function (matrix , chunk_size ) {
142+ ncols <- ncol(matrix )
143+ chunks <- lapply(seq(1 , ncols , by = chunk_size ), function (i ) {
144+ if (i + chunk_size - 1 > ncols ){
145+ matrix [, i : ncols , drop = FALSE ]
146+ }else {
147+ matrix [, i : (i + chunk_size - 1 ), drop = FALSE ]
148+ }
149+ })
150+ return (chunks )
151+ }
152+
153+ duplicated_genes = data [duplicated(rownames(data )),] | > rownames()
154+ deduplicated_matrix =
155+
156+ data [rownames(data ) %in% duplicated_genes ,]@ assays @ data $ X %> %
157+ split_matrix_by_column(10000 ) | >
158+ map(~ rowsum(.x , rownames(.x ))) | >
159+ reduce(cbind ) %> %
160+ {
161+ .x = writeHDF5Array(. , as.sparse = T )
162+ rownames(.x ) = rownames(. )
163+ .x
164+ }
165+
166+ sce = SingleCellExperiment(list (X = rbind(
167+ data [! rownames(data ) %in% duplicated_genes ,]@ assays @ data $ X ,
168+ deduplicated_matrix
169+ )[rownames(data ),,drop = FALSE ]
170+ ))
171+ colnames(sce ) = colnames(data )
172+
173+ rm(data )
174+ rm(deduplicated_matrix )
175+ gc()
101176
102- gene_names =
103- (
104- readH5AD(
105- input_file , use_hdf5 = TRUE ,
106- layers = FALSE , uns = FALSE , obs = FALSE , varm = FALSE , obsm = FALSE , varp = FALSE , obsp = FALSE , raw = FALSE
107- ) | > rowData( )
108- ) $ feature_name
177+ # Complete gene set
178+ missing_genes = readRDS( gene_names ) | > setdiff(rownames( sce ))
179+
180+ missing_matrix =
181+ HDF5RealizationSink(c(length( missing_genes ),ncol( sce )), as.sparse = TRUE ) | >
182+ as( " DelayedArray " )
183+ rownames( missing_matrix ) = missing_genes
109184
110- X =
111- readH5AD(
112- input_file , use_hdf5 = TRUE ,
113- layers = FALSE ,uns = FALSE ,var = FALSE , obs = FALSE ,varm = FALSE ,obsm = FALSE ,varp = FALSE ,obsp = FALSE ,raw = FALSE
114- )[,cells_to_keep | > pull(.cell_original )]@ assays @ data $ X
185+ missing_sce = SingleCellExperiment(list (X = missing_matrix ))
186+ colnames(missing_sce ) = colnames(sce )
187+ missing_sce = missing_sce [! is.na(rownames(missing_sce )),]
115188
116- rownames(X ) = gene_names
189+ # Make cell name unique
190+ sce = sce | > rbind(missing_sce )
191+ sce = sce [sce | > rownames() | > sort(),]
192+ rm(missing_sce )
193+ gc()
117194
195+ # Tranform
118196transformation =
119197 get_metadata() | >
120198 filter(file_id == !! file_id ) | >
@@ -128,7 +206,6 @@ transformation =
128206 pull(transformation ) | >
129207 unique()
130208
131- # Ad hoc transformations, not declared as log
132209if (file_id %in% c(
133210 " 1e81a742-e457-4fc6-9c39-c55189ec9dc2" ,
134211 " b51bfa2d-22b2-4c65-9803-d36d4de973fa" ,
@@ -139,8 +216,10 @@ if(file_id %in% c(
139216 " 575513b2-6e53-41e2-85a9-bc08a6233ce4"
140217)) transformation = " log"
141218
142- X =
143- X | >
219+
220+
221+ sce @ assays @ data $ X =
222+ sce @ assays @ data $ X | >
144223 when(
145224 transformation == " log" ~ {
146225 .x = expm1(. )
159238 }
160239 )
161240
162- # If I have negative values
163- if (min(X ) < 0 )
164- X [X < 0 ] <- 0
165-
166- # Select just the X assay
167- sce = SingleCellExperiment(list (X = X ))
168- rownames(sce ) = rownames(X )
169- colnames(sce ) = colnames(X )
170-
171- rm(X )
172- gc()
173-
174- # Add missing genes
175- missing_genes = readRDS(all_gene_names ) | > setdiff(rownames(sce ))
176- missing_matrix =
177- HDF5RealizationSink(c(length(missing_genes ),ncol(sce )), as.sparse = TRUE ) | >
178- as(" DelayedArray" )
179-
180- rownames(missing_matrix ) = missing_genes
181- colnames(missing_matrix ) = colnames(sce )
182-
183- missing_sce = SingleCellExperiment(list (X = missing_matrix ), colData = colData(sce ))
184- missing_sce @ int_colData = sce @ int_colData
185-
186- # Make cell name unique
187- sce = sce | > rbind(missing_sce )
188241
189242sce | > saveHDF5SummarizedExperiment(output_file , replace = TRUE )
190243
0 commit comments