|
| 1 | +from pathlib import Path |
| 2 | +import pandas as pd |
| 3 | +import anndata as ad |
| 4 | +import scanpy as sc |
| 5 | +import urllib.request |
| 6 | +import tarfile |
| 7 | + |
| 8 | + |
| 9 | +## VIASH START |
| 10 | +par = { |
| 11 | + "cancer_subtypes": ['HER2+', 'TNBC', 'ER+'], |
| 12 | + "keep_files": True, # wether to delete the intermediate files |
| 13 | + "output": "2021Wu_human_breast_cancer_sc.h5ad", |
| 14 | + "dataset_id": "2021Wu_human_breast_cancer_sc", |
| 15 | + "dataset_name": "2021Wu_human_breast_cancer_sc", |
| 16 | + "dataset_url": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176078", |
| 17 | + "dataset_reference": "https://doi.org/10.1038/s41588-021-00911-1", #TODO: bibtex not doi, also adjust config.vsh.yaml |
| 18 | + "dataset_summary": "This dataset contains scRNA-seq data from human breast cancer cells.", |
| 19 | + "dataset_description": "This dataset contains scRNA-seq data from human breast cancer cells.", |
| 20 | + "dataset_organism": "Homo sapiens" |
| 21 | +} |
| 22 | +meta = { |
| 23 | + "temp_dir": "./tmp/2021Wu_human_breast_cancer_sc", |
| 24 | +} |
| 25 | +## VIASH END |
| 26 | + |
| 27 | +# helper variables |
| 28 | +TMP_DIR = Path(meta["temp_dir"] or "/tmp") |
| 29 | +TMP_DIR.mkdir(parents=True, exist_ok=True) |
| 30 | +FILE_PATHS = { |
| 31 | + "tar_gz": TMP_DIR / "GSE176078_Wu_etal_2021_BRCA_scRNASeq.tar.gz", |
| 32 | + "counts_mtx": TMP_DIR / "Wu_etal_2021_BRCA_scRNASeq/count_matrix_sparse.mtx", |
| 33 | + "barcodes": TMP_DIR / "Wu_etal_2021_BRCA_scRNASeq/count_matrix_barcodes.tsv", |
| 34 | + "genes": TMP_DIR / 'Wu_etal_2021_BRCA_scRNASeq/count_matrix_genes.tsv', |
| 35 | + "obs": TMP_DIR / 'Wu_etal_2021_BRCA_scRNASeq/metadata.csv', |
| 36 | +} |
| 37 | + |
| 38 | + |
| 39 | +# Download the data |
| 40 | +print("Downloading data", flush=True) |
| 41 | +urllib.request.urlretrieve( |
| 42 | + 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE176nnn/GSE176078/suppl/GSE176078_Wu_etal_2021_BRCA_scRNASeq.tar.gz', |
| 43 | + FILE_PATHS["tar_gz"] |
| 44 | +) |
| 45 | + |
| 46 | +# Extract the data |
| 47 | +scrna_file = tarfile.open(FILE_PATHS["tar_gz"]) |
| 48 | +scrna_file.extractall(TMP_DIR) |
| 49 | +scrna_file.close() |
| 50 | + |
| 51 | +# Read data |
| 52 | +print("Reading count matrix", flush=True) |
| 53 | +counts = sc.read_mtx(FILE_PATHS["counts_mtx"], dtype='float32') |
| 54 | + |
| 55 | +print("Reading barcodes", flush=True) |
| 56 | +barcodes = pd.read_csv(FILE_PATHS["barcodes"], header = None) |
| 57 | +barcodes.columns = ['Barcode'] |
| 58 | +barcodes.set_index('Barcode', inplace=True) |
| 59 | +barcodes.rename_axis(None, axis=0, inplace=True) |
| 60 | + |
| 61 | +print("Reading var names", flush=True) |
| 62 | +genes = pd.read_csv(FILE_PATHS["genes"], header = None) |
| 63 | +genes.columns = ['Gene'] |
| 64 | +genes.set_index('Gene', inplace=True) |
| 65 | +genes.rename_axis(None, axis=0, inplace=True) |
| 66 | + |
| 67 | +print("Reading obs", flush=True) |
| 68 | +obs = pd.read_csv(FILE_PATHS["obs"]) |
| 69 | +obs = obs.iloc[0::,1::] |
| 70 | + |
| 71 | +print("Setting barcodes as obs index", flush=True) |
| 72 | +obs.index = barcodes.index |
| 73 | + |
| 74 | +# Create adata |
| 75 | +print("Creating adata", flush=True) |
| 76 | +adata = ad.AnnData(X = counts.X.T, var = genes, obs = obs ) |
| 77 | + |
| 78 | +adata.layers["counts"] = adata.X |
| 79 | +del adata.X |
| 80 | + |
| 81 | +# Rename fields |
| 82 | +rename_obs_keys = { |
| 83 | + "cell_type": "celltype_major", |
| 84 | + "cell_type_level2": "celltype_minor", |
| 85 | + "cell_type_level3": "celltype_subset", |
| 86 | + "donor_id": "orig.ident", |
| 87 | + "cancer_subtype": "subtype", # TODO: this is currently not in the config yaml - how to handle this? |
| 88 | + # #TODO "cell_type_unified" (?), maybe call the unified one "cell_type" and the original one "cell_type_level1" |
| 89 | + # other keys: "batch", "assay_ontology_term_id", "cell_type_ontology_term_id", "development_stage_ontology_term_id" |
| 90 | + # "diseases_ontology_term_id", "is_primary_data", "organism_ontology_term_id", "self_reported_ethnicity", |
| 91 | + # "self_reported_ethnicity_ontology_term_id", "sex_ontology_term_id", "suspension_type", |
| 92 | + # "suspension_type_ontology_term_id", "tissue_ontology_term_id", "tissue_general_ontology_term_id", "soma_joinid" |
| 93 | +} |
| 94 | +adata.obs = adata.obs.rename(columns={old:new for new,old in rename_obs_keys.items()}) |
| 95 | + |
| 96 | +# Add additional information to obs |
| 97 | +#TODO: Finish up the terms according to the ontology |
| 98 | +#Ontology schema currently (13.03.2025) used in openproblems (CELLxGENE schema v4.0.0): |
| 99 | +#https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md |
| 100 | +#(mentioned here: https://openproblems.bio/documentation/reference/openproblems/src-datasets#file-format:-raw-dataset) |
| 101 | +store_info = { |
| 102 | + "dataset_id": "2021Wu_human_breast_cancer_sc", #"GSE176078", |
| 103 | + "assay": "Chromium Single-Cell v2 3’ and 5’ Chemistry Library", # from metadata from GEO GSE176078 #TODO: ontology |
| 104 | + "sex": "female", #TODO: double check |
| 105 | + "tissue": "breast", |
| 106 | + "disease": "breast cancer", |
| 107 | + # "disease_ontology_term_id": "PATO:0000461", #TODO: ontology |
| 108 | + "organism": "Homo sapiens", |
| 109 | + # "organism_ontology_term_id": "NCBITaxon:10090", #TODO: ontology |
| 110 | + "tissue_general": "breast", |
| 111 | + # "tissue_general_ontology_term_id": "UBERON:0000955", #TODO: ontology |
| 112 | + "development_stage": "adult", |
| 113 | + # "development_stage_ontology_term_id": "MmusDv:0000110" #TODO: ontology |
| 114 | +} |
| 115 | +for key, value in store_info.items(): |
| 116 | + adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value]) |
| 117 | + |
| 118 | +# Remove undesired columns |
| 119 | +for key in adata.obs.columns: |
| 120 | + if (key not in rename_obs_keys.keys()) and (key not in store_info.keys()): |
| 121 | + print(f"Removing .obs['{key}']") |
| 122 | + del adata.obs[key] |
| 123 | + |
| 124 | +# Var |
| 125 | +adata.var["gene_symbol"] = adata.var_names |
| 126 | +# TODO: can we also get ensembl ids? (adata.var["feature_id"]) |
| 127 | + |
| 128 | +# Uns |
| 129 | +for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]: |
| 130 | + adata.uns[key] = par[key] |
| 131 | + |
| 132 | +# Filter for cancer subtypes |
| 133 | +adata = adata[adata.obs["cancer_subtype"].isin(par["cancer_subtypes"])] |
| 134 | + |
| 135 | +# Delete files if requested |
| 136 | +if not par["keep_files"]: |
| 137 | + print("Removing files", flush=True) |
| 138 | + for file in FILE_PATHS.values(): |
| 139 | + if file.exists(): |
| 140 | + print("\t...", file, flush=True) |
| 141 | + file.unlink() |
| 142 | + |
| 143 | +print("Writing adata", flush=True) |
| 144 | +adata.write_h5ad(par["output"], compression="gzip") |
0 commit comments