@@ -94,7 +94,7 @@ biomaRt_getBM_batch <- function(mart, attributes, filters, values, chunk_size) {
9494# wget https://github.com/SomaLogic/SomaLogic-Data/raw/main/example_data.adat # Retrieve just the 5k (v4.0) ADAT
9595# wget https://github.com/SomaLogic/SomaLogic-Data/raw/main/example_data_v4.1_plasma.adat # Retrieve just the 7k (v4.1) ADAT
9696# wget https://github.com/SomaLogic/SomaLogic-Data/raw/main/example_data_v5.0_plasma.adat # Retrieve just the 11k (v5.0) ADAT
# Read the v5.0 plasma example ADAT with SomaDataIO::read_adat().
# The "-" / "+" pair below is the old and new version of the same statement:
# the path moves from a user-specific Downloads folder to the repository's
# inst/data directory.
97- data <- SomaDataIO :: read_adat(" /Users/leem/Downloads /example_data_v5.0_plasma.adat" )
97+ data <- SomaDataIO :: read_adat(" inst/data /example_data_v5.0_plasma.adat" )
# Overwrite `data` with the result of getAnalyteInfo(); from here on `data`
# no longer refers to the object returned by read_adat().
# (presumably the per-analyte annotation table -- confirm against SomaDataIO docs)
9898data <- SomaDataIO :: getAnalyteInfo(data )
9999
100100# # format data
@@ -163,17 +163,44 @@ map_hgnc <- biomaRt_getBM_batch(
163163 chunk_size = 2
164164)
165165
166+ # Ensembl primary id ====
# Look up Ensembl gene IDs once per identifier type (UniProt, Entrez, HGNC
# symbol). Each query returns the identifier used as the filter together with
# ensembl_gene_id and chromosome_name.
# NOTE(review): these three queries call getBM() directly with the full value
# vector, whereas the map_hgnc lookup above goes through
# biomaRt_getBM_batch(..., chunk_size = 2) -- confirm that unbatched requests
# are intended here.
167+ ensembl_uniprot <- getBM(
168+ attributes = c(" uniprot_gn_id" , " ensembl_gene_id" , " chromosome_name" ),
169+ filters = " uniprot_gn_id" ,
170+ values = id_uniprot_id ,
171+ mart = mart
172+ )
173+
# Entrez IDs are coerced with as.integer() before being used as filter values;
# any entry that is not a valid integer becomes NA (with a warning).
174+ ensembl_entrez <- getBM(
175+ attributes = c(" entrezgene_id" , " ensembl_gene_id" , " chromosome_name" ),
176+ filters = " entrezgene_id" ,
177+ values = as.integer(id_entrez_id ),
178+ mart = mart
179+ )
180+
# NOTE(review): `id_hgnc_id` is passed as values for the hgnc_symbol filter --
# confirm it holds gene symbols rather than "HGNC:nnnn" style identifiers.
181+ ensembl_hgnc <- getBM(
182+ attributes = c(" hgnc_symbol" , " ensembl_gene_id" , " chromosome_name" ),
183+ filters = " hgnc_symbol" ,
184+ values = id_hgnc_id ,
185+ mart = mart
186+ )
187+
# Full-join the three lookups on (ensembl_gene_id, chromosome_name) so a gene
# found through any identifier type is kept, then keep only rows whose
# chromosome_name is 1-22, X or Y.
# NOTE(review): full_join()/filter() are called unqualified here while
# distinct() below is written as dplyr::distinct() -- check that no other
# attached package masks filter().
188+ map_ensembl <- full_join(ensembl_uniprot , ensembl_entrez , by = c(" ensembl_gene_id" , " chromosome_name" )) %> %
189+ full_join(ensembl_hgnc , by = c(" ensembl_gene_id" , " chromosome_name" )) %> %
190+ filter(chromosome_name %in% c(as.character(1 : 22 ), " X" , " Y" ))
191+
166192# combine maps ====
# Stack the UniProt, Entrez and HGNC maps row-wise and drop exact duplicate rows.
167193map <- rbind(map_uniprot ,
168194 map_entrez ,
169195 map_hgnc ) %> %
170196 dplyr :: distinct()
171197
# Add the Ensembl lookup to the combined map.
# NOTE(review): no `by` is supplied, so dplyr joins on every column name the
# two tables have in common (and prints a message listing them) -- confirm
# that this is the intended key set.
198+ map <- full_join(map , map_ensembl )
199+
# Per-column count of entries that are NA or equal to the string literal in the
# comparison below, reshaped to long format: one row per column of `map`, with
# the column name in names_to and the count in values_to.
172200counts <- map %> %
173201 summarise(across(everything(), ~ sum(is.na(. ) | . == " " ))) %> %
174202 tidyr :: pivot_longer(cols = everything(), names_to = " Column" , values_to = " MissingCount" )
175203
176-
177204# positions ====
178205# # hg19 ====
179206id_hg19 <- subset(genes(TxDb.Hsapiens.UCSC.hg19.knownGene ,
@@ -252,11 +279,15 @@ columns <- c(
252279 " SeqId" , " SomaId" , " UNIPROT" , " Target" , " TargetFullName" , " EntrezGeneID" , " EntrezGeneSymbol" ,
253280 " uniprot_gn_id" , " uniprot_gn_symbol" ,
254281 " entrezgene_id" ," entrezgene_accession" ,
255- " hgnc_id" , " hgnc_symbol" , " external_gene_name" ," gene_biotype" ,
282+ " hgnc_id" , " hgnc_symbol" , " ensembl_gene_id " , " external_gene_name" ," gene_biotype" ,
256283 " CHR" , " START_hg19" , " END_hg19" , " strand_hg19" ," START_hg38" , " END_hg38" , " strand_hg38"
257284)
# Keep only the columns named in `columns`, in that order.
# NOTE(review): in this capture the ensembl_gene_id entry of `columns` appears
# with whitespace before its closing quote; if that is in the real file, this
# subset would fail with an undefined-columns error -- verify the literal.
258285data_map <- data_map [, columns ]
259286
# Recompute the per-column missing-value summary on the final table.
# This overwrites the `counts` object computed earlier from `map`.
287+ counts <- data_map %> %
288+ summarise(across(everything(), ~ sum(is.na(. ) | . == " " ))) %> %
289+ tidyr :: pivot_longer(cols = everything(), names_to = " Column" , values_to = " MissingCount" )
290+
260291# # write
261292write.table(x = data_map ,
262293 file = paste0(" inst/data/mapping_" , VAR_build , " _somalogic.txt" ),
0 commit comments