|
| 1 | +from pathlib import Path |
| 2 | +from scipy.sparse import csr_matrix, hstack |
| 3 | +import pandas as pd |
| 4 | +import anndata as ad |
| 5 | +import urllib.request |
| 6 | +from datetime import datetime |
| 7 | + |
| 8 | +## VIASH START |
| 9 | +par = { |
| 10 | + "keep_files": True, # wether to delete the intermediate files |
| 11 | + "output": "./temp/datasets/2022Lu_human_liver_cancer_sc.h5ad", |
| 12 | + "dataset_id": "2022Lu_human_liver_cancer_sc", |
| 13 | + "dataset_name": "2022Lu_human_liver_cancer_sc", |
| 14 | + "dataset_url": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE149614", |
| 15 | + "dataset_reference": "https://doi.org/10.1038/s41467-022-32283-3", |
| 16 | + "dataset_summary": "This dataset contains scRNA-seq data from human liver cancer cells.", |
| 17 | + "dataset_description": "This dataset contains scRNA-seq data from human liver cancer cells.", |
| 18 | + "dataset_organism": "Homo sapiens" |
| 19 | +} |
| 20 | +meta = { |
| 21 | + "temp_dir": "./temp/datasets/2022Lu_human_liver_cancer_sc", |
| 22 | +} |
| 23 | +## VIASH END |
| 24 | + |
| 25 | +# helper variables |
| 26 | +TMP_DIR = Path(meta["temp_dir"] or "/tmp") |
| 27 | +TMP_DIR.mkdir(parents=True, exist_ok=True) |
| 28 | +FILE_URLS = { |
| 29 | + "counts": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE149614&format=file&file=GSE149614%5FHCC%2EscRNAseq%2ES71915%2Ecount%2Etxt%2Egz", |
| 30 | + "obs": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE149614&format=file&file=GSE149614%5FHCC%2Emetadata%2Eupdated%2Etxt%2Egz" |
| 31 | +} |
| 32 | +FILE_PATHS = { |
| 33 | + "counts": TMP_DIR / "GSE149614_HCC.scRNAseq.S71915.count.txt.gz", |
| 34 | + "obs": TMP_DIR / "GSE149614_HCC.metadata.updated.txt.gz", |
| 35 | +} |
| 36 | + |
| 37 | + |
| 38 | +# Download the data |
| 39 | +print("Downloading data", flush=True) |
| 40 | +for key, url in FILE_URLS.items(): |
| 41 | + urllib.request.urlretrieve(url, FILE_PATHS[key]) |
| 42 | + |
| 43 | +# Read data |
| 44 | +print("Reading count matrix", flush=True) |
| 45 | +chunk_size = 200 |
| 46 | + |
| 47 | +genes = [] |
| 48 | +X_sparse = [] |
| 49 | + |
| 50 | +t0 = datetime.now() |
| 51 | + |
| 52 | +for i,chunk in enumerate(pd.read_csv(FILE_PATHS["counts"], sep="\t", chunksize=chunk_size)): |
| 53 | + |
| 54 | + if i % 10 == 0: |
| 55 | + print("\t", datetime.now() - t0, " " , i, "/128") |
| 56 | + |
| 57 | + genes += chunk.index.tolist() |
| 58 | + |
| 59 | + if i == 0: |
| 60 | + X_sparse = csr_matrix(chunk.values.T) |
| 61 | + obs = chunk.columns.tolist() # it's the same in each chunk since all cells are loaded |
| 62 | + else: |
| 63 | + X_sparse = hstack([X_sparse, csr_matrix(chunk.values.T)]) |
| 64 | + |
| 65 | + del chunk |
| 66 | + |
| 67 | +print("Reading obs", flush=True) |
| 68 | +df_obs = pd.read_csv(FILE_PATHS["obs"], sep="\t", index_col=0) |
| 69 | + |
| 70 | +assert (obs == df_obs.index.tolist()) |
| 71 | + |
| 72 | +# Create adata |
| 73 | +print("Creating adata", flush=True) |
| 74 | +adata = ad.AnnData( |
| 75 | + X=None, |
| 76 | + obs=df_obs, |
| 77 | + var=pd.DataFrame(index=genes), |
| 78 | + layers={"counts": X_sparse} |
| 79 | +) |
| 80 | + |
| 81 | +# Rename fields |
| 82 | +rename_obs_keys = { |
| 83 | + "cell_type": "celltype", |
| 84 | + "batch": "sample", |
| 85 | + "donor_id": "patient", |
| 86 | + "cancer_stage": "stage", # TODO: this is currently not in the config yaml - how to handle this? |
| 87 | + # #TODO "cell_type_unified" (?), maybe call the unified one "cell_type" and the original one "cell_type_level1" |
| 88 | + # other keys: "batch", "assay_ontology_term_id", "cell_type_ontology_term_id", "development_stage_ontology_term_id" |
| 89 | + # "diseases_ontology_term_id", "is_primary_data", "organism_ontology_term_id", "self_reported_ethnicity", |
| 90 | + # "self_reported_ethnicity_ontology_term_id", "sex_ontology_term_id", "suspension_type", |
| 91 | + # "suspension_type_ontology_term_id", "tissue_ontology_term_id", "tissue_general_ontology_term_id", "soma_joinid" |
| 92 | +} |
| 93 | +adata.obs = adata.obs.rename(columns={old:new for new,old in rename_obs_keys.items()}) |
| 94 | + |
| 95 | +# Add additional information to obs |
| 96 | +#TODO: Finish up the terms according to the ontology |
| 97 | +#Ontology schema currently (13.03.2025) used in openproblems (CELLxGENE schema v4.0.0): |
| 98 | +#https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md |
| 99 | +#(mentioned here: https://openproblems.bio/documentation/reference/openproblems/src-datasets#file-format:-raw-dataset) |
| 100 | +store_info = { |
| 101 | + "dataset_id": "2022Lu_human_liver_cancer_sc", #"GSE176078", |
| 102 | + "assay": "Chromium Single-Cell v2 3’ Chemistry Library", # from metadata from GEO GSE176078 #TODO: ontology |
| 103 | + "tissue": "liver", |
| 104 | + "disease": "liver cancer", |
| 105 | + # "disease_ontology_term_id": "PATO:0000461", #TODO: ontology |
| 106 | + "organism": "Homo sapiens", |
| 107 | + # "organism_ontology_term_id": "NCBITaxon:10090", #TODO: ontology |
| 108 | + "tissue_general": "liver", |
| 109 | + # "tissue_general_ontology_term_id": "UBERON:0000955", #TODO: ontology |
| 110 | + "development_stage": "adult", |
| 111 | + # "development_stage_ontology_term_id": "MmusDv:0000110" #TODO: ontology |
| 112 | +} |
| 113 | +for key, value in store_info.items(): |
| 114 | + adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value]) |
| 115 | + |
| 116 | +# Remove undesired columns |
| 117 | +for key in adata.obs.columns: |
| 118 | + if (key not in rename_obs_keys.keys()) and (key not in store_info.keys()): |
| 119 | + print(f"Removing .obs['{key}']") |
| 120 | + del adata.obs[key] |
| 121 | + |
| 122 | +# Var |
| 123 | +adata.var["gene_symbol"] = adata.var_names |
| 124 | +# TODO: can we also get ensembl ids? (adata.var["feature_id"]) |
| 125 | + |
| 126 | +# Uns |
| 127 | +for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]: |
| 128 | + adata.uns[key] = par[key] |
| 129 | + |
| 130 | +# Delete files if requested |
| 131 | +if not par["keep_files"]: |
| 132 | + print("Removing files", flush=True) |
| 133 | + for file in FILE_PATHS.values(): |
| 134 | + if file.exists(): |
| 135 | + print("\t...", file, flush=True) |
| 136 | + file.unlink() |
| 137 | + |
| 138 | +print("Writing adata", flush=True) |
| 139 | +adata.write_h5ad(par["output"], compression="gzip") |
0 commit comments