Skip to content

Commit 5b83d8e

Browse files
authored
Merge pull request #22 from openproblems-bio/2021Wu_human_breast_cancer_sc
2021 wu human breast cancer sc
2 parents 85284ec + c710d7c commit 5b83d8e

File tree

6 files changed

+501
-0
lines changed

6 files changed

+501
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/bash
2+
3+
# get the root of the directory
4+
REPO_ROOT=$(git rev-parse --show-toplevel)
5+
6+
# ensure that the command below is run from the root of the repository
7+
cd "$REPO_ROOT"
8+
9+
set -e
10+
11+
publish_dir="s3://openproblems-data/resources/datasets"
12+
13+
# Note that the current download script and processing workflow have a specific default parameter set for the given dataset.
14+
# No additional datasets are supported by that component/workflow. Therefore the default parameters are used and don't need
15+
# to be specified here.
16+
17+
cat > /tmp/params.yaml << HERE
18+
param_list:
19+
- id: wu_human_breast_cancer_sc/2021Wu_human_breast_cancer_sc
20+
cancer_subtypes:
21+
- HER2+
22+
- TNBC
23+
- ER+
24+
25+
keep_files: false
26+
27+
output_dataset: "\$id/dataset.h5ad"
28+
output_meta: "\$id/dataset_meta.yaml"
29+
output_state: "\$id/state.yaml"
30+
publish_dir: "$publish_dir"
31+
HERE
32+
33+
tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
34+
--revision build/main \
35+
--pull-latest \
36+
--main-script target/nextflow/datasets/workflows/process_wu_human_breast_cancer_sc/main.nf \
37+
--workspace 53907369739130 \
38+
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
39+
--params-file /tmp/params.yaml \
40+
--config common/nextflow_helpers/labels_tw.config \
41+
--labels datasets,wu_human_breast_cancer_sc
42+
43+
aws s3 sync \
44+
s3://openproblems-data/resources/datasets/wu_human_breast_cancer_sc/2021Wu_human_breast_cancer_sc \
45+
resources/datasets/wu_human_breast_cancer_sc/2021Wu_human_breast_cancer_sc
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
name: wu_human_breast_cancer_sc
2+
namespace: datasets/loaders
3+
4+
argument_groups:
5+
- name: Inputs
6+
arguments:
7+
- type: string
8+
name: --cancer_subtypes
9+
required: false
10+
description: The cancer subtypes to download data for.
11+
multiple: true
12+
default:
13+
- HER2+
14+
- TNBC
15+
- ER+
16+
- name: Caching settings
17+
arguments:
18+
- type: boolean
19+
name: --keep_files
20+
required: false
21+
description: Whether to remove the downloaded files after processing.
22+
default: false
23+
- name: Metadata
24+
arguments:
25+
- type: string
26+
name: --dataset_id
27+
description: "A unique identifier for the dataset"
28+
required: false
29+
default: "2021Wu_human_breast_cancer_sc"
30+
- name: --dataset_name
31+
type: string
32+
description: Nicely formatted name.
33+
required: false
34+
default: "2021Wu_human_breast_cancer_sc"
35+
- type: string
36+
name: --dataset_url
37+
description: Link to the original source of the dataset.
38+
required: false
39+
default: "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176078"
40+
- name: --dataset_reference
41+
type: string
42+
description: Bibtex reference of the paper in which the dataset was published.
43+
required: false
44+
default: "https://doi.org/10.1038/s41588-021-00911-1"
45+
- name: --dataset_summary
46+
type: string
47+
description: Short description of the dataset.
48+
required: false
49+
default: "This dataset contains scRNA-seq data from human breast cancer cells."
50+
- name: --dataset_description
51+
type: string
52+
description: Long description of the dataset.
53+
required: false
54+
default: "This dataset contains scRNA-seq data from human breast cancer cells."
55+
- name: --dataset_organism
56+
type: string
57+
description: The organism of the sample in the dataset.
58+
required: false
59+
default: "Homo sapiens"
60+
- name: Outputs
61+
arguments:
62+
- name: "--output"
63+
__merge__: /src/api/file_common_scrnaseq.yaml
64+
direction: output
65+
required: true
66+
67+
resources:
68+
- type: python_script
69+
path: script.py
70+
71+
engines:
72+
- type: docker
73+
image: openproblems/base_python:1.0.0
74+
__merge__:
75+
- /src/base/setup_txsim_partial.yaml
76+
- type: native
77+
78+
runners:
79+
- type: executable
80+
- type: nextflow
81+
directives:
82+
label: [veryhighmem, midcpu, midtime]
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
from pathlib import Path
2+
import pandas as pd
3+
import anndata as ad
4+
import scanpy as sc
5+
import urllib.request
6+
import tarfile
7+
8+
9+
## VIASH START
10+
par = {
11+
"cancer_subtypes": ['HER2+', 'TNBC', 'ER+'],
12+
"keep_files": True, # wether to delete the intermediate files
13+
"output": "2021Wu_human_breast_cancer_sc.h5ad",
14+
"dataset_id": "2021Wu_human_breast_cancer_sc",
15+
"dataset_name": "2021Wu_human_breast_cancer_sc",
16+
"dataset_url": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176078",
17+
"dataset_reference": "https://doi.org/10.1038/s41588-021-00911-1", #TODO: bibtex not doi, also adjust config.vsh.yaml
18+
"dataset_summary": "This dataset contains scRNA-seq data from human breast cancer cells.",
19+
"dataset_description": "This dataset contains scRNA-seq data from human breast cancer cells.",
20+
"dataset_organism": "Homo sapiens"
21+
}
22+
meta = {
23+
"temp_dir": "./tmp/2021Wu_human_breast_cancer_sc",
24+
}
25+
## VIASH END
26+
27+
# helper variables
28+
TMP_DIR = Path(meta["temp_dir"] or "/tmp")
29+
TMP_DIR.mkdir(parents=True, exist_ok=True)
30+
FILE_PATHS = {
31+
"tar_gz": TMP_DIR / "GSE176078_Wu_etal_2021_BRCA_scRNASeq.tar.gz",
32+
"counts_mtx": TMP_DIR / "Wu_etal_2021_BRCA_scRNASeq/count_matrix_sparse.mtx",
33+
"barcodes": TMP_DIR / "Wu_etal_2021_BRCA_scRNASeq/count_matrix_barcodes.tsv",
34+
"genes": TMP_DIR / 'Wu_etal_2021_BRCA_scRNASeq/count_matrix_genes.tsv',
35+
"obs": TMP_DIR / 'Wu_etal_2021_BRCA_scRNASeq/metadata.csv',
36+
}
37+
38+
39+
# Download the data
40+
print("Downloading data", flush=True)
41+
urllib.request.urlretrieve(
42+
'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE176nnn/GSE176078/suppl/GSE176078_Wu_etal_2021_BRCA_scRNASeq.tar.gz',
43+
FILE_PATHS["tar_gz"]
44+
)
45+
46+
# Extract the data
47+
scrna_file = tarfile.open(FILE_PATHS["tar_gz"])
48+
scrna_file.extractall(TMP_DIR)
49+
scrna_file.close()
50+
51+
# Read data
52+
print("Reading count matrix", flush=True)
53+
counts = sc.read_mtx(FILE_PATHS["counts_mtx"], dtype='float32')
54+
55+
print("Reading barcodes", flush=True)
56+
barcodes = pd.read_csv(FILE_PATHS["barcodes"], header = None)
57+
barcodes.columns = ['Barcode']
58+
barcodes.set_index('Barcode', inplace=True)
59+
barcodes.rename_axis(None, axis=0, inplace=True)
60+
61+
print("Reading var names", flush=True)
62+
genes = pd.read_csv(FILE_PATHS["genes"], header = None)
63+
genes.columns = ['Gene']
64+
genes.set_index('Gene', inplace=True)
65+
genes.rename_axis(None, axis=0, inplace=True)
66+
67+
print("Reading obs", flush=True)
68+
obs = pd.read_csv(FILE_PATHS["obs"])
69+
obs = obs.iloc[0::,1::]
70+
71+
print("Setting barcodes as obs index", flush=True)
72+
obs.index = barcodes.index
73+
74+
# Create adata
75+
print("Creating adata", flush=True)
76+
adata = ad.AnnData(X = counts.X.T, var = genes, obs = obs )
77+
78+
adata.layers["counts"] = adata.X
79+
del adata.X
80+
81+
# Rename fields
82+
rename_obs_keys = {
83+
"cell_type": "celltype_major",
84+
"cell_type_level2": "celltype_minor",
85+
"cell_type_level3": "celltype_subset",
86+
"donor_id": "orig.ident",
87+
"cancer_subtype": "subtype", # TODO: this is currently not in the config yaml - how to handle this?
88+
# #TODO "cell_type_unified" (?), maybe call the unified one "cell_type" and the original one "cell_type_level1"
89+
# other keys: "batch", "assay_ontology_term_id", "cell_type_ontology_term_id", "development_stage_ontology_term_id"
90+
# "diseases_ontology_term_id", "is_primary_data", "organism_ontology_term_id", "self_reported_ethnicity",
91+
# "self_reported_ethnicity_ontology_term_id", "sex_ontology_term_id", "suspension_type",
92+
# "suspension_type_ontology_term_id", "tissue_ontology_term_id", "tissue_general_ontology_term_id", "soma_joinid"
93+
}
94+
adata.obs = adata.obs.rename(columns={old:new for new,old in rename_obs_keys.items()})
95+
96+
# Add additional information to obs
97+
#TODO: Finish up the terms according to the ontology
98+
#Ontology schema currently (13.03.2025) used in openproblems (CELLxGENE schema v4.0.0):
99+
#https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md
100+
#(mentioned here: https://openproblems.bio/documentation/reference/openproblems/src-datasets#file-format:-raw-dataset)
101+
store_info = {
102+
"dataset_id": "2021Wu_human_breast_cancer_sc", #"GSE176078",
103+
"assay": "Chromium Single-Cell v2 3’ and 5’ Chemistry Library", # from metadata from GEO GSE176078 #TODO: ontology
104+
"sex": "female", #TODO: double check
105+
"tissue": "breast",
106+
"disease": "breast cancer",
107+
# "disease_ontology_term_id": "PATO:0000461", #TODO: ontology
108+
"organism": "Homo sapiens",
109+
# "organism_ontology_term_id": "NCBITaxon:10090", #TODO: ontology
110+
"tissue_general": "breast",
111+
# "tissue_general_ontology_term_id": "UBERON:0000955", #TODO: ontology
112+
"development_stage": "adult",
113+
# "development_stage_ontology_term_id": "MmusDv:0000110" #TODO: ontology
114+
}
115+
for key, value in store_info.items():
116+
adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value])
117+
118+
# Remove undesired columns
119+
for key in adata.obs.columns:
120+
if (key not in rename_obs_keys.keys()) and (key not in store_info.keys()):
121+
print(f"Removing .obs['{key}']")
122+
del adata.obs[key]
123+
124+
# Var
125+
adata.var["gene_symbol"] = adata.var_names
126+
# TODO: can we also get ensembl ids? (adata.var["feature_id"])
127+
128+
# Uns
129+
for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
130+
adata.uns[key] = par[key]
131+
132+
# Filter for cancer subtypes
133+
adata = adata[adata.obs["cancer_subtype"].isin(par["cancer_subtypes"])]
134+
135+
# Delete files if requested
136+
if not par["keep_files"]:
137+
print("Removing files", flush=True)
138+
for file in FILE_PATHS.values():
139+
if file.exists():
140+
print("\t...", file, flush=True)
141+
file.unlink()
142+
143+
print("Writing adata", flush=True)
144+
adata.write_h5ad(par["output"], compression="gzip")
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
name: process_wu_human_breast_cancer_sc
2+
namespace: datasets/workflows
3+
4+
argument_groups:
5+
- name: Inputs
6+
arguments:
7+
- type: string
8+
name: --cancer_subtypes
9+
required: false
10+
description: The cancer subtypes to download data for.
11+
multiple: true
12+
default:
13+
- HER2+
14+
- TNBC
15+
- ER+
16+
- name: Caching settings
17+
arguments:
18+
- type: boolean
19+
name: --keep_files
20+
required: false
21+
description: Whether to remove the downloaded files after processing.
22+
default: false
23+
- name: Metadata
24+
arguments:
25+
- type: string
26+
name: --dataset_id
27+
description: "A unique identifier for the dataset"
28+
required: false
29+
default: "2021Wu_human_breast_cancer_sc"
30+
- name: --dataset_name
31+
type: string
32+
description: Nicely formatted name.
33+
required: false
34+
default: "2021Wu_human_breast_cancer_sc"
35+
- type: string
36+
name: --dataset_url
37+
description: Link to the original source of the dataset.
38+
required: false
39+
default: "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176078"
40+
- name: --dataset_reference
41+
type: string
42+
description: Bibtex reference of the paper in which the dataset was published.
43+
required: false
44+
default: "https://doi.org/10.1038/s41588-021-00911-1"
45+
- name: --dataset_summary
46+
type: string
47+
description: Short description of the dataset.
48+
required: false
49+
default: "This dataset contains scRNA-seq data from human breast cancer cells."
50+
- name: --dataset_description
51+
type: string
52+
description: Long description of the dataset.
53+
required: false
54+
default: "This dataset contains scRNA-seq data from human breast cancer cells."
55+
- name: --dataset_organism
56+
type: string
57+
description: The organism of the sample in the dataset.
58+
required: false
59+
default: "Homo sapiens"
60+
- name: Outputs
61+
arguments:
62+
- name: "--output_dataset"
63+
__merge__: /src/api/file_common_scrnaseq.yaml
64+
direction: output
65+
required: true
66+
default: "$id/dataset.h5ad"
67+
- name: "--output_meta"
68+
direction: "output"
69+
type: file
70+
description: "Dataset metadata"
71+
default: "$id/dataset_meta.yaml"
72+
73+
resources:
74+
- type: nextflow_script
75+
path: main.nf
76+
entrypoint: run_wf
77+
- path: /common/nextflow_helpers/helper.nf
78+
79+
dependencies:
80+
- name: datasets/loaders/wu_human_breast_cancer_sc
81+
# - name: datasets/processors/subsample
82+
# repository: openproblems
83+
- name: datasets/normalization/log_cp
84+
repository: openproblems
85+
- name: datasets/processors/pca
86+
repository: openproblems
87+
- name: datasets/processors/hvg
88+
repository: openproblems
89+
- name: datasets/processors/knn
90+
repository: openproblems
91+
- name: h5ad/extract_uns_metadata
92+
repository: core
93+
94+
runners:
95+
- type: nextflow
96+
directives:
97+
label: [midcpu, midmem, hightime]

0 commit comments

Comments
 (0)