Skip to content

Commit c19b095

Browse files
authored
Merge pull request #27 from openproblems-bio/dataset_sc_lung
Dataset sc lung
2 parents cff6309 + debc3f0 commit c19b095

File tree

7 files changed

+464
-1
lines changed

7 files changed

+464
-1
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
#!/bin/bash

# Launch the processing workflow for the Zuani et al. (2024) human NSCLC
# single-cell dataset with `tw launch`, publishing results under $publish_dir.

# Abort on the first failing command. This is enabled before anything else so
# that a failing `git rev-parse` or `cd` also stops the script.
set -e

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

publish_dir="s3://openproblems-data/resources/datasets"

# Write the workflow parameters. The heredoc delimiter is unquoted, so
# $publish_dir is expanded by the shell, while \$id is escaped and ends up as a
# literal "$id" in the params file.
# The output keys are named after the arguments declared in the workflow
# config (--output, --output_meta).
# NOTE(review): output_state is not declared in that config; presumably it is
# handled by the Nextflow helper -- confirm.
cat > /tmp/params.yaml << HERE
param_list:
  - id: zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc
    input: "ftp://anonymous@ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad"
    keep_files: false

output: "\$id/dataset.h5ad"
output_meta: "\$id/dataset_meta.yaml"
output_state: "\$id/state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/process_zuani_human_nsclc_sc/main.nf \
  --workspace 53907369739130 \
  --params-file /tmp/params.yaml \
  --config common/nextflow_helpers/labels_tw.config \
  --labels datasets,zuani_human_nsclc_sc

# Optional: copy the published results to the local resources directory.
#aws s3 sync \
#  s3://openproblems-data/resources/datasets/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc \
#  resources/datasets/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
# Component config for the dataset loader of the Zuani et al. (2024) human
# NSCLC single-cell dataset. The loader logic lives in script.py.
name: zuani_human_nsclc_sc
namespace: datasets/loaders

argument_groups:
  - name: Inputs
    arguments:
      - type: file
        name: --input
        description: Path to the dataset
        required: true
        example: "ftp://anonymous@ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad"
  - name: Caching settings
    arguments:
      - type: boolean
        name: --keep_files
        required: false
        # script.py deletes the downloaded file only when this is false.
        description: Whether to keep the downloaded files after processing. If false, they are removed.
        default: false
  - name: Metadata
    arguments:
      - type: string
        name: --dataset_id
        description: "A unique identifier for the dataset"
        required: false
        default: "2024Zuani_human_nsclc_sc"
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: false
        default: "2024Zuani_human_nsclc_sc"
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        # Points at the publication (same DOI as script.py's debug parameters)
        # rather than repeating --dataset_url.
        default: "https://doi.org/10.1038/s41467-024-48700-8"
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: false
        default: "Single cell RNA sequencing atlas of non-small cell lung cancer (NSCLC)"
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: false
        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        # NOTE(review): script.py's debug parameters use "Homo sapiens" for this
        # field -- confirm which form downstream components expect.
        default: "human"
  - name: Outputs
    arguments:
      - name: "--output"
        __merge__: /src/api/file_common_scrnaseq.yaml
        direction: output
        required: true

resources:
  - type: python_script
    path: script.py

engines:
  - type: docker
    image: openproblems/base_python:1.0.0
    __merge__:
      - /src/base/setup_txsim_partial.yaml
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [veryhighmem, midcpu, midtime]
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
"""Load the Zuani et al. (2024) human NSCLC single-cell dataset.

Fetches the annotated h5ad file given by ``par["input"]`` (or reads it directly
when it is already a local file), keeps only cells mapped to NSCLC whose cell
type is not flagged "(to remove)", harmonises the obs/var/uns metadata, moves
``X`` into the ``counts`` layer and writes the result to ``par["output"]``.
"""
from pathlib import Path
import os
import subprocess
import pandas as pd
import anndata as ad

## VIASH START

par = {
    "input": "ftp://anonymous@ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad",
    "keep_files": True,  # whether to keep the downloaded file after processing
    "output": "./temp/datasets/2024Zuani_human_nsclc_sc.h5ad",
    "dataset_id": "2024Zuani_human_nsclc_sc",
    "dataset_name": "2024Zuani_human_nsclc_sc",
    "dataset_url": "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526",
    "dataset_reference": "https://doi.org/10.1038/s41467-024-48700-8",
    "dataset_summary": "This dataset contains scRNA-seq data from human lung cancer cells.",
    "dataset_description": "This dataset contains scRNA-seq data from human lung cancer cells.",
    "dataset_organism": "Homo sapiens"
}

meta = {
    "temp_dir": "./temp/datasets/2024Zuani_human_nsclc_sc",
}

## VIASH END


# Helper variables
TMP_DIR = Path(meta["temp_dir"] or "./tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)
DOWNLOAD_URL = par["input"]

# If the input already exists locally, read it in place; otherwise download it
# into TMP_DIR.
# NOTE(review): presumably the input can arrive as a local path when the runner
# stages the file itself -- confirm.
NEEDS_DOWNLOAD = not os.path.exists(DOWNLOAD_URL)
if NEEDS_DOWNLOAD:
    FILE_PATH = TMP_DIR / DOWNLOAD_URL.split("/")[-1]
else:
    FILE_PATH = Path(DOWNLOAD_URL)

# Download the data (55GB)
if NEEDS_DOWNLOAD:
    print("Downloading data", flush=True)
    # An argument list (no shell) avoids quoting problems in the URL or path.
    # --continue resumes a partial download and skips a complete one, instead
    # of saving a second copy next to the file that is read below.
    # check=True raises CalledProcessError when wget fails, so a failed
    # download is reported here rather than as a missing file later.
    subprocess.run(
        ["wget", "--continue", DOWNLOAD_URL, "-P", str(TMP_DIR)],
        check=True,
    )
adata = ad.read_h5ad(FILE_PATH)
# adata = adata[::100]

# Filter genes (not needed)
# sc.pp.filter_genes(adata, min_counts=1)

# Filter cells to NSCLC (~200k cells filtered out)
tumour_type_to_nsclc_status = {
    "NSCLC": "NSCLC",
    "Squamous cell carcinoma": "NSCLC",
    "Squamous dysplasia": "not NSCLC",  # pre-malignant lesion
    "Squamous cancer": "NSCLC",
    "Adenocarcinoma ": "NSCLC",
    "Adenocarcinoma": "NSCLC",
    "NA": "not NSCLC",  # unclear / missing data
    "Mucinouse\nadenocarcinoma": "NSCLC",
    "Presumed Lung cancer": "not NSCLC",  # not a confirmed subtype
    "Squamous carcinoma": "NSCLC",
    "Squamous cell lung cancer": "NSCLC",
    "lung adenocarcinoma": "NSCLC",
    "TTF1 +ve lung adenocarcinoma": "NSCLC",
    "Lung cancer": "not NSCLC"  # too generic #TODO: check from paper if this refers to NSCLC or not
}
# Tumour types missing from the mapping become NaN and therefore compare
# unequal to "NSCLC", i.e. those cells are dropped as well.
nsclc_status = adata.obs["tumour type"].map(tumour_type_to_nsclc_status)
is_nsclc = (nsclc_status == "NSCLC").to_numpy()

# Filter out cell types that should be removed
# na=False keeps the mask strictly boolean (missing cell types are not flagged)
# so that it can be negated safely.
to_remove = adata.obs["Cell types"].str.endswith("(to remove)", na=False).to_numpy(dtype=bool)

# Apply both filters in one step and take an explicit copy, so that the obs/var
# assignments below modify a real object rather than a view.
adata = adata[is_nsclc & ~to_remove].copy()


# Rename or copy obs columns
# Maps the harmonised column name (key) to the column name in the source file.
rename_obs_keys = {
    "cell_type": 'Cell types',
    "donor_id": "patient",
    "sex": "sex",
    "batch": "batch",
}
adata.obs = adata.obs.rename(columns={old: new for new, old in rename_obs_keys.items()})

# Store obs metadata with single values
store_info = {
    "dataset_id": par["dataset_id"],
    "tissue": "lung",
    "disease": "NSCLC",
    "organism": "Homo sapiens",
    "tissue_general": "lung",
    "development_stage": "adult",
    # #TODO other keys: "assay", "assay_ontology_term_id", "cell_type_ontology_term_id", "development_stage_ontology_term_id"
    # "diseases_ontology_term_id", "is_primary_data", "organism_ontology_term_id", "self_reported_ethnicity",
    # "self_reported_ethnicity_ontology_term_id", "sex_ontology_term_id", "suspension_type",
    # "suspension_type_ontology_term_id", "tissue_ontology_term_id", "tissue_general_ontology_term_id", "soma_joinid"
}
for key, value in store_info.items():
    # One-category categorical: the same value for every cell.
    adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value])

# Subset obs columns
obs_cols = list(rename_obs_keys.keys()) + list(store_info.keys())
adata.obs = adata.obs[obs_cols]

# Save uns metadata
for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
    adata.uns[key] = par[key]

# Add gene symbol column
adata.var["gene_symbol"] = adata.var_names

# Subset var columns
var_cols = ["gene_symbol"]
adata.var = adata.var[var_cols]

# Add layers
# NOTE(review): assumes X holds the raw counts -- confirm against the source file.
adata.layers['counts'] = adata.X
del adata.X

# Delete files if requested
# Only a file downloaded by this script is removed; a local input is never deleted.
if NEEDS_DOWNLOAD and not par["keep_files"]:
    print("Removing files", flush=True)
    if FILE_PATH.exists():
        print("\t...", FILE_PATH, flush=True)
        FILE_PATH.unlink()

# Write adata
print("Writing adata", flush=True)
adata.write_h5ad(par["output"], compression="gzip")
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
# Workflow config for processing the Zuani et al. (2024) human NSCLC
# single-cell dataset. The workflow itself is implemented in main.nf
# (entrypoint run_wf) and uses the components listed under `dependencies`.
name: process_zuani_human_nsclc_sc
namespace: datasets/workflows

argument_groups:
  - name: Inputs
    arguments:
      - type: file
        name: --input
        description: Path to the dataset
        required: true
        example: "ftp://anonymous@ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad"
  - name: Caching settings
    arguments:
      - type: boolean
        name: --keep_files
        required: false
        # The loader script deletes the downloaded file only when this is false.
        description: Whether to keep the downloaded files after processing. If false, they are removed.
        default: false
  - name: Metadata
    arguments:
      - type: string
        name: --dataset_id
        description: "A unique identifier for the dataset"
        required: false
        default: "2024Zuani_human_nsclc_sc"
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: false
        default: "2024Zuani_human_nsclc_sc"
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        # Points at the publication (same DOI as the loader script's debug
        # parameters) rather than repeating --dataset_url.
        default: "https://doi.org/10.1038/s41467-024-48700-8"
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        # Has a default, so it is optional like the other metadata arguments.
        required: false
        default: "Single cell RNA sequencing atlas of non-small cell lung cancer (NSCLC)"
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        # Has a default, so it is optional like the other metadata arguments.
        required: false
        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        # NOTE(review): the loader script's debug parameters use "Homo sapiens"
        # for this field -- confirm which form downstream components expect.
        default: "human"
  - name: Outputs
    arguments:
      - name: "--output"
        __merge__: /src/api/file_common_scrnaseq.yaml
        direction: output
        required: true
        default: "$id/dataset.h5ad"
      - name: "--output_meta"
        direction: "output"
        type: file
        description: "Dataset metadata"
        default: "$id/dataset_meta.yaml"

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf
  - path: /common/nextflow_helpers/helper.nf

dependencies:
  - name: datasets/loaders/zuani_human_nsclc_sc
  # - name: datasets/processors/subsample
  #   repository: openproblems
  - name: datasets/normalization/log_cp
    repository: openproblems
  - name: datasets/processors/pca
    repository: openproblems
  - name: datasets/processors/hvg
    repository: openproblems
  - name: datasets/processors/knn
    repository: openproblems
  - name: utils/extract_uns_metadata
    repository: openproblems

runners:
  - type: nextflow
    directives:
      label: [midcpu, midmem, hightime]

0 commit comments

Comments
 (0)