forked from stemangiola/CuratedAtlasQueryR
-
Notifications
You must be signed in to change notification settings - Fork 2
Open
Description
- Add `empty_droplet` column (boolean)
- Add `doublet` column (boolean)
- Add `feature_non_zero_count` (it could be returned within the HPCell empty droplet table)
- Add `alive` column (boolean)
- [ON HOLD, NOT SURE IT IS A GOOD IDEA] Add curated labels (see below)
library(readr)
library(forcats)
library(glue)
# Assign a life-stage label to each age, using sex-specific age thresholds.
#
# Arguments:
#   age_days  Numeric vector of ages in days. NA ages yield NA labels.
#   sex       Character (or factor) vector with values "male", "female" or
#             "unknown"; either the same length as `age_days` or length 1
#             (recycled to every age).
#
# Returns a character vector the same length as `age_days`, with values
# "Infancy", "Childhood", "Adolescence", "Young Adulthood", "Middle Age",
# "Senior", or NA.
#
# Errors if any element of `sex` (including NA) is not one of the three
# accepted values, or if the lengths of `age_days` and `sex` are incompatible.
age_bin <- function(age_days, sex) {
  stage_labels <- c(
    "Infancy", "Childhood", "Adolescence",
    "Young Adulthood", "Middle Age", "Senior"
  )

  # Upper bounds (in years, exclusive) of the first five stages for each sex.
  # The "unknown" thresholds are the averages of the male and female ones.
  stage_thresholds <- list(
    male = c(3, 13, 21, 40, 55),
    female = c(3, 13, 19, 36, 50),
    unknown = c(3, 13, 20, 38, 52)
  )

  if (length(sex) == 1L) {
    sex <- rep(sex, length(age_days))
  }
  if (length(sex) != length(age_days)) {
    stop("'sex' must have length 1 or the same length as 'age_days'.", call. = FALSE)
  }

  # Validate up front so that NA or unexpected values give a clear message
  # instead of failing inside a comparison.
  if (!all(sex %in% names(stage_thresholds))) {
    stop("Each element of 'sex' must be either 'male', 'female', or 'unknown'.", call. = FALSE)
  }

  # Convert age in days to age in years
  age_years <- age_days / 365.25

  age_bins <- rep(NA_character_, length(age_years))
  for (sex_level in names(stage_thresholds)) {
    is_level <- sex == sex_level
    # findInterval() returns 0 below the first threshold and 5 at or above the
    # last one, so adding 1 indexes directly into the six stage labels.
    # NA ages propagate to NA labels.
    stage_index <- findInterval(age_years[is_level], stage_thresholds[[sex_level]]) + 1L
    age_bins[is_level] <- stage_labels[stage_index]
  }

  age_bins
}
# Attach curated grouping covariates (assay, disease, ethnicity, age bin) to a
# cell-level metadata table and return the modelling-ready subset of columns.
#
# Arguments:
#   tbl          Cell-level metadata table. The joins use `copy = TRUE` and the
#                result is collected with `as_tibble()`, so a remote/lazy table
#                is presumably expected (TODO confirm). It must provide the
#                columns selected below plus `assay`, `disease`, `tissue`,
#                `tissue_groups`, `sex`, `age_days` and `self_reported_ethnicity`.
#   disease_tbl  Lookup table that supplies `disease_groups`; it is joined onto
#                the curated list of disease names (presumably keyed on
#                `disease` - verify against the CSV schema).
#
# Returns a tibble restricted to cells with `age_days > 365`, with the grouping
# columns converted to factors (reference levels "European", "10x Genomics 3",
# "Normal", "Adolescence") and an extra `age_days_scaled` column.
#
# Relies on `age_bin()` being defined in the calling environment.
# NOTE(review): all joins are natural joins (no explicit `by`), so they match on
# every column name shared by the two tables.
edit_covariates <- function(tbl, disease_tbl) {
  ethnicity_grouped <- tribble(
    ~self_reported_ethnicity, ~ethnicity_groups,
    "unknown", "Other/Unknown",
    "European", "European",
    "Korean", "East Asian",
    "Asian", "East Asian",
    "Japanese", "East Asian",
    "African American", "African",
    "Hispanic or Latin American", "Hispanic/Latin American",
    "Singaporean Chinese", "East Asian",
    "Han Chinese", "East Asian",
    "Singaporean Indian", "South Asian",
    "Singaporean Malay", "Other/Unknown",
    "British", "European",
    "African", "African",
    "South Asian", "South Asian",
    "European American", "European",
    "East Asian", "East Asian",
    "American", "Other/Unknown",
    "African American or Afro-Caribbean", "African",
    "Oceanian", "Native American & Pacific Islander",
    "Jewish Israeli", "Middle Eastern & North African",
    "Chinese", "East Asian",
    "South East Asian", "Other/Unknown",
    "Greater Middle Eastern (Middle Eastern or North African or Persian)", "Middle Eastern & North African",
    "Native American", "Native American & Pacific Islander",
    "Pacific Islander", "Native American & Pacific Islander",
    "Finnish", "European",
    "Bangladeshi", "South Asian",
    "Native American,Hispanic or Latin American", "Hispanic/Latin American",
    "Irish", "European",
    "Iraqi", "Middle Eastern & North African",
    "European,Asian", "European"
  )

  assay_data_grouped <- tribble(
    ~assay, ~assay_groups,
    "10x 3' v2", "10x Genomics 3",
    "10x 3' v3", "10x Genomics 3",
    "10x 5' v2", "10x Genomics 5",
    "10x 5' v1", "10x Genomics 5",
    "MARS-seq", "Plate based Technologies",
    "10x 3' transcription profiling", "10x Genomics 3",
    "10x 5' transcription profiling", "10x Genomics 5",
    "Smart-seq2", "Smart seq",
    "microwell-seq", "Microwell Technologies",
    "TruDrop", "TruDrop",
    "Drop-seq", "Drop based Technologies",
    "Seq-Well S3", "Microwell Technologies",
    "GEXSCOPE technology", "Other Technologies",
    "Seq-Well", "Microwell Technologies",
    "sci-RNA-seq", "Other Technologies",
    "10x 3' v1", "10x Genomics 3",
    "BD Rhapsody Whole Transcriptome Analysis", "Other Technologies",
    "BD Rhapsody Targeted mRNA", "Other Technologies",
    "CEL-seq2", "Plate based Technologies",
    "SPLiT-seq", "Other Technologies",
    "STRT-seq", "Plate based Technologies",
    "inDrop", "Drop based Technologies",
    "Smart-seq v4", "Smart seq",
    "ScaleBio single cell RNA sequencing", "Other Technologies"
  )

  # Curated list of disease names. The `disease_groups` column written here is
  # discarded just below and replaced by the grouping found in `disease_tbl`.
  disease_data_grouped <- tribble(
    ~disease, ~disease_groups,
    # Normal control
    "normal", "Normal",
    # Isolated Diseases
    "COVID-19", "COVID-19 related",
    "post-COVID-19 disorder", "COVID-19 related",
    "long COVID-19", "COVID-19 related",
    "glioblastoma", "Glioblastoma",
    "lung adenocarcinoma", "Lung Adenocarcinoma",
    "systemic lupus erythematosus", "Systemic Lupus Erythematosus",
    # Infectious and Immune-related Diseases (other than COVID-19)
    "Crohn disease", "Infectious and Immune-related Diseases",
    "Crohn ileitis", "Infectious and Immune-related Diseases",
    "pneumonia", "Infectious and Immune-related Diseases",
    "common variable immunodeficiency", "Infectious and Immune-related Diseases",
    "toxoplasmosis", "Infectious and Immune-related Diseases",
    "Plasmodium malariae malaria", "Infectious and Immune-related Diseases",
    "type 1 diabetes mellitus", "Infectious and Immune-related Diseases",
    "influenza", "Infectious and Immune-related Diseases",
    "chronic rhinitis", "Infectious and Immune-related Diseases",
    "periodontitis", "Infectious and Immune-related Diseases",
    "localized scleroderma", "Infectious and Immune-related Diseases",
    "lymphangioleiomyomatosis", "Infectious and Immune-related Diseases",
    "listeriosis", "Infectious and Immune-related Diseases",
    # Cancer (other than isolated cancers)
    "squamous cell lung carcinoma", "Cancer",
    "small cell lung carcinoma", "Cancer",
    "non-small cell lung carcinoma", "Cancer",
    "breast carcinoma", "Cancer",
    "breast cancer", "Cancer",
    "luminal B breast carcinoma", "Cancer",
    "luminal A breast carcinoma", "Cancer",
    "triple-negative breast carcinoma", "Cancer",
    "gastric cancer", "Cancer",
    "colorectal cancer", "Cancer",
    "colon sessile serrated adenoma/polyp", "Cancer",
    "follicular lymphoma", "Cancer",
    "B-cell acute lymphoblastic leukemia", "Cancer",
    "B-cell non-Hodgkin lymphoma", "Cancer",
    "acute myeloid leukemia", "Cancer",
    "acute promyelocytic leukemia", "Cancer",
    "plasma cell myeloma", "Cancer",
    "clear cell renal carcinoma", "Cancer",
    "nonpapillary renal cell carcinoma", "Cancer",
    "basal cell carcinoma", "Cancer",
    "colorectal neoplasm", "Cancer",
    "adenocarcinoma", "Cancer",
    "chromophobe renal cell carcinoma", "Cancer",
    "neuroendocrine carcinoma", "Cancer",
    "lung large cell carcinoma", "Cancer",
    "tongue cancer", "Cancer",
    "Wilms tumor", "Cancer",
    "pleomorphic carcinoma", "Cancer",
    "blastoma", "Cancer",
    # Neurodegenerative and Neurological Disorders
    "dementia", "Neurodegenerative and Neurological Disorders",
    "Alzheimer disease", "Neurodegenerative and Neurological Disorders",
    "Parkinson disease", "Neurodegenerative and Neurological Disorders",
    "amyotrophic lateral sclerosis", "Neurodegenerative and Neurological Disorders",
    "multiple sclerosis", "Neurodegenerative and Neurological Disorders",
    "Down syndrome", "Neurodegenerative and Neurological Disorders",
    "trisomy 18", "Neurodegenerative and Neurological Disorders",
    "frontotemporal dementia", "Neurodegenerative and Neurological Disorders",
    "temporal lobe epilepsy", "Neurodegenerative and Neurological Disorders",
    "Lewy body dementia", "Neurodegenerative and Neurological Disorders",
    "amyotrophic lateral sclerosis 26 with or without frontotemporal dementia", "Neurodegenerative and Neurological Disorders",
    # Respiratory Conditions
    "pulmonary fibrosis", "Respiratory Conditions",
    "respiratory system disorder", "Respiratory Conditions",
    "chronic obstructive pulmonary disease", "Respiratory Conditions",
    "cystic fibrosis", "Respiratory Conditions",
    "interstitial lung disease", "Respiratory Conditions",
    "hypersensitivity pneumonitis", "Respiratory Conditions",
    "non-specific interstitial pneumonia", "Respiratory Conditions",
    "aspiration pneumonia", "Respiratory Conditions",
    "pulmonary emphysema", "Respiratory Conditions",
    "pulmonary sarcoidosis", "Respiratory Conditions",
    # Cardiovascular Diseases
    "myocardial infarction", "Cardiovascular Diseases",
    "acute myocardial infarction", "Cardiovascular Diseases",
    "dilated cardiomyopathy", "Cardiovascular Diseases",
    "heart failure", "Cardiovascular Diseases",
    "arrhythmogenic right ventricular cardiomyopathy", "Cardiovascular Diseases",
    "congenital heart disease", "Cardiovascular Diseases",
    "non-compaction cardiomyopathy", "Cardiovascular Diseases",
    "cardiomyopathy", "Cardiovascular Diseases",
    "heart disorder", "Cardiovascular Diseases",
    # Metabolic and Other Disorders
    "type 2 diabetes mellitus", "Metabolic and Other Disorders",
    "chronic kidney disease", "Metabolic and Other Disorders",
    "digestive system disorder", "Metabolic and Other Disorders",
    "primary sclerosing cholangitis", "Metabolic and Other Disorders",
    "gastritis", "Metabolic and Other Disorders",
    "acute kidney failure", "Metabolic and Other Disorders",
    "tubular adenoma", "Metabolic and Other Disorders",
    "benign prostatic hyperplasia", "Metabolic and Other Disorders",
    "opiate dependence", "Metabolic and Other Disorders",
    "gingivitis", "Metabolic and Other Disorders",
    "hyperplastic polyp", "Metabolic and Other Disorders",
    "clonal hematopoiesis", "Metabolic and Other Disorders",
    "epilepsy", "Metabolic and Other Disorders",
    "age related macular degeneration 7", "Metabolic and Other Disorders",
    "kidney benign neoplasm", "Metabolic and Other Disorders",
    "malignant pancreatic neoplasm", "Metabolic and Other Disorders",
    "cataract", "Metabolic and Other Disorders",
    "macular degeneration", "Metabolic and Other Disorders",
    "hydrosalpinx", "Metabolic and Other Disorders",
    "tubulovillous adenoma", "Metabolic and Other Disorders",
    "gastric intestinal metaplasia", "Metabolic and Other Disorders",
    "Barrett esophagus", "Metabolic and Other Disorders",
    # Other Diseases
    "injury", "Other Diseases",
    "anencephaly", "Other Diseases",
    "primary biliary cholangitis", "Other Diseases",
    "keloid", "Other Diseases",
    "kidney oncocytoma", "Other Diseases",
    "respiratory failure", "Other Diseases",
    "pilocytic astrocytoma", "Other Diseases"
  )

  # Replace the in-code grouping with the one supplied by `disease_tbl`;
  # curated diseases that `disease_tbl` does not cover are labelled "other".
  disease_data_grouped <-
    disease_data_grouped |>
    select(-disease_groups) |>
    left_join(disease_tbl) |>
    mutate(disease_groups = if_else(is.na(disease_groups), "other", disease_groups))

  # One row per (age_days, sex) pair with its life-stage label.
  # Missing sex is mapped to "unknown" BEFORE de-duplicating: otherwise
  # (age, NA) and (age, "unknown") would both survive `distinct()`, collapse to
  # the same key afterwards, and duplicate rows in the join further down.
  age_bin_table <-
    tbl |>
    filter(!is.na(age_days)) |>
    mutate(sex = if_else(is.na(sex), "unknown", sex)) |>
    distinct(age_days, sex) |>
    as_tibble() |>
    mutate(age_bin = age_bin(age_days, sex))

  tbl |>

    # TECH
    left_join(assay_data_grouped, copy = TRUE) |>

    # DISEASE
    left_join(disease_data_grouped, copy = TRUE) |>

    # TEMPORARY. de-group pancreas and liver
    mutate(tissue_groups = case_when(
      tissue %in% c("gallbladder") ~ "gallbladder",
      tissue %in% c("pancreas", "exocrine pancreas") ~ "pancreas",
      tissue %in% c("liver", "caudate lobe of liver", "hepatic cecum") ~ "liver",
      TRUE ~ tissue_groups
    )) |>

    # SEX edit
    mutate(sex = if_else(is.na(sex), "unknown", sex)) |>

    # Age: keep only cells older than 365 days (this also drops missing ages)
    filter(age_days > 365) |>
    left_join(age_bin_table, copy = TRUE) |>

    # ETHNICITY
    left_join(ethnicity_grouped, copy = TRUE) |>

    dplyr::select(
      cell_id, sample_id, donor_id, dataset_id, file_id_cellNexus_single_cell,
      title, collection_id, age_days, age_bin, sex, ethnicity_groups,
      tissue_groups, tissue, assay_groups, cell_type_unified_ensemble,
      cell_type, disease_groups
    ) |>
    as_tibble() |>

    # Set intercept: move the reference level of each factor to the front
    mutate(
      ethnicity_groups = fct_relevel(ethnicity_groups, "European"),
      assay_groups = fct_relevel(assay_groups, "10x Genomics 3"),
      disease_groups = fct_relevel(disease_groups, "Normal"),
      age_bin = fct_relevel(age_bin, "Adolescence")
    ) |>

    # Center based on adolescence (15 years expressed in days)
    mutate(age_days_scaled = age_days |> scale(center = 15 * 365) |> as.numeric())
}
# result_directory = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/sccomp_on_cellNexus_1_0_1")

# Copy the disease grouping CSV from the rclone remote into the working
# directory. system() returns the command's exit status, so stop here on
# failure rather than letting read_csv() fail later on a missing or stale file.
rclone_status <- system("~/bin/rclone copy box_adelaide:/minh_immune_map_disease/disease_data_grouped_further.csv ./")
if (rclone_status != 0L) {
  stop(
    "rclone copy of disease_data_grouped_further.csv failed with exit status ",
    rclone_status,
    call. = FALSE
  )
}

# REMOVE OLD CACHE which is here get_default_cache_dir()
get_metadata() |>
  edit_covariates(
    read_csv("./disease_data_grouped_further.csv")
  )
Metadata
Metadata
Assignees
Labels
No labels