Merge pull request #27 from openproblems-bio/dataset_sc_lung

LouisK92 · web-flow · commit c19b095bc9dd · 2025-07-15T19:41:19.000+02:00
Dataset sc lung
diff --git a/common b/common
@@ -1 +1 @@
-Subproject commit 80321bf1b5e44330c5ceadfc1434908bb58e2aff
+Subproject commit 65e05af68a11ee87853fcf7a3c6b579001f21abe
diff --git a/scripts/create_resources/sc/process_zuani_human_nsclc_sc.sh b/scripts/create_resources/sc/process_zuani_human_nsclc_sc.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Writes the parameter file for the process_zuani_human_nsclc_sc dataset
+# workflow and submits it with `tw launch`.
+
+# Abort on the first failing command. This is enabled before anything else
+# runs so that a failing `git rev-parse` (e.g. when called outside the
+# repository) stops the script instead of leaving REPO_ROOT empty and
+# launching from the wrong directory.
+set -e
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+# (the --config path passed to `tw launch` is relative to it)
+cd "$REPO_ROOT"
+
+publish_dir="s3://openproblems-data/resources/datasets"
+
+# The heredoc delimiter is unquoted: $publish_dir is expanded by the shell,
+# while the escaped \$id is written to the file as a literal `$id`.
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc
+
+input: "ftp://anonymous@ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad"
+keep_files: false
+
+output_dataset: "\$id/dataset.h5ad"
+output_meta: "\$id/dataset_meta.yaml"
+output_state: "\$id/state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/datasets/workflows/process_zuani_human_nsclc_sc/main.nf \
+  --workspace 53907369739130 \
+  --params-file /tmp/params.yaml \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels datasets,zuani_human_nsclc_sc
+
+# Left disabled: copy the published dataset from S3 into the local
+# resources directory.
+#aws s3 sync \
+#  s3://openproblems-data/resources/datasets/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc \
+#  resources/datasets/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc
diff --git a/src/datasets/loaders/zuani_human_nsclc_sc/config.vsh.yaml b/src/datasets/loaders/zuani_human_nsclc_sc/config.vsh.yaml
@@ -0,0 +1,78 @@
+# Loader component for the 2024Zuani_human_nsclc_sc dataset; the processing
+# logic lives in script.py (see `resources` below).
+name: zuani_human_nsclc_sc
+namespace: datasets/loaders
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      # NOTE(review): declared as `file`, but script.py hands this value to
+      # wget as a URL - confirm the runners pass the URL through unchanged.
+      - type: file
+        name: --input
+        description: Path to the dataset
+        required: true
+        example: "ftp://anonymous@ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad"
+  - name: Caching settings
+    arguments:
+      - type: boolean
+        name: --keep_files
+        required: false
+        # script.py deletes the downloaded file unless this flag is true
+        description: Whether to keep the downloaded files after processing.
+        default: false
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: "A unique identifier for the dataset"
+        required: false
+        default: "2024Zuani_human_nsclc_sc"
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: false
+        default: "2024Zuani_human_nsclc_sc"
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
+      # NOTE(review): the default repeats --dataset_url although the
+      # description asks for a Bibtex reference - confirm the intended value.
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: false
+        default: "Single cell RNA sequencing atlas of non-small cell lung cancer (NSCLC)"
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: false
+        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        default: "human"
+  - name: Outputs
+    arguments:
+    # the remaining fields of --output come from the merged API file
+    - name: "--output"
+      __merge__: /src/api/file_common_scrnaseq.yaml
+      direction: output
+      required: true
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    __merge__:
+      - /src/base/setup_txsim_partial.yaml
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [veryhighmem, midcpu, midtime]
diff --git a/src/datasets/loaders/zuani_human_nsclc_sc/script.py b/src/datasets/loaders/zuani_human_nsclc_sc/script.py
@@ -0,0 +1,122 @@
+import os
+import subprocess
+from pathlib import Path
+
+import anndata as ad
+import pandas as pd
+
+
+## VIASH START
+# Placeholder parameters for running this script directly.
+# NOTE(review): presumably the section between the VIASH START / VIASH END
+# markers is overwritten by viash when the component is built - confirm.
+
+par = {
+    "input": "ftp://anonymous@ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad",
+    "keep_files": True, # whether to keep the downloaded file (it is deleted after loading when False)
+    "output": "./temp/datasets/2024Zuani_human_nsclc_sc.h5ad",
+    "dataset_id": "2024Zuani_human_nsclc_sc", 
+    "dataset_name": "2024Zuani_human_nsclc_sc", 
+    "dataset_url": "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526",
+    "dataset_reference": "https://doi.org/10.1038/s41467-024-48700-8", 
+    "dataset_summary": "This dataset contains scRNA-seq data from human lung cancer cells.",
+    "dataset_description": "This dataset contains scRNA-seq data from human lung cancer cells.",
+    "dataset_organism": "Homo sapiens"
+}
+
+# temp_dir is where the downloaded input file is stored
+meta = {
+    "temp_dir": "./temp/datasets/2024Zuani_human_nsclc_sc",
+}
+
+## VIASH END
+
+
+# Loader body: download the source .h5ad, keep only confirmed NSCLC cells,
+# reduce obs/var to the columns needed downstream, attach the dataset
+# metadata and write the result to par["output"].
+
+# Helper variables
+TMP_DIR = Path(meta["temp_dir"] or "./tmp")
+TMP_DIR.mkdir(parents=True, exist_ok=True)
+DOWNLOAD_URL = par["input"]
+# the local file name is the last path component of the download URL
+FILE_PATH = TMP_DIR / DOWNLOAD_URL.split("/")[-1]
+
+# Download the data (55GB).
+# wget is called with an argument list (no shell involved) and check=True, so
+# a failed download raises CalledProcessError here instead of surfacing later
+# as a missing or truncated file in read_h5ad.
+subprocess.run(["wget", DOWNLOAD_URL, "-P", str(TMP_DIR)], check=True)
+adata = ad.read_h5ad(FILE_PATH)
+# adata = adata[::100]
+
+# Filter genes (not needed)
+# sc.pp.filter_genes(adata, min_counts=1)
+
+# Filter cells to NSCLC (~200k cells filtered out).
+# The keys are matched verbatim against the "tumour type" column, which is
+# why spelling variants, a trailing space and an embedded newline are kept.
+# Labels that are not listed map to NaN and are dropped as well.
+tumour_type_to_nsclc_status = {
+    "NSCLC": "NSCLC",
+    "Squamous cell carcinoma": "NSCLC",
+    "Squamous dysplasia": "not NSCLC",  # pre-malignant lesion
+    "Squamous cancer": "NSCLC",
+    "Adenocarcinoma ": "NSCLC",
+    "Adenocarcinoma": "NSCLC",
+    "NA": "not NSCLC",  # unclear / missing data
+    "Mucinouse\nadenocarcinoma": "NSCLC",
+    "Presumed Lung cancer": "not NSCLC",  # not a confirmed subtype
+    "Squamous carcinoma": "NSCLC",
+    "Squamous cell lung cancer": "NSCLC",
+    "lung adenocarcinoma": "NSCLC",
+    "TTF1 +ve lung adenocarcinoma": "NSCLC",
+    "Lung cancer": "not NSCLC",  # too generic #TODO: check from paper if this refers to NSCLC or not
+}
+adata.obs["NSCLC"] = adata.obs["tumour type"].map(tumour_type_to_nsclc_status)
+adata = adata[adata.obs["NSCLC"] == "NSCLC"]
+
+# Filter out cell types that should be removed
+to_remove = adata.obs["Cell types"].str.endswith("(to remove)")
+# Subsetting an AnnData returns a view; materialise it once here because
+# .obs, .var, .layers and .X are all modified below.
+adata = adata[~to_remove].copy()
+
+# Rename or copy obs columns (new name -> name in the source file)
+rename_obs_keys = {
+    "cell_type": "Cell types",
+    "donor_id": "patient",
+    "sex": "sex",
+    "batch": "batch",
+}
+adata.obs = adata.obs.rename(
+    columns={old: new for new, old in rename_obs_keys.items()}
+)
+
+# Store obs metadata with single values
+store_info = {
+    "dataset_id": par["dataset_id"],
+    "tissue": "lung",
+    "disease": "NSCLC",
+    "organism": "Homo sapiens",
+    "tissue_general": "lung",
+    "development_stage": "adult",
+    # #TODO other keys: "assay", "assay_ontology_term_id", "cell_type_ontology_term_id", "development_stage_ontology_term_id"
+    # "diseases_ontology_term_id", "is_primary_data", "organism_ontology_term_id", "self_reported_ethnicity",
+    # "self_reported_ethnicity_ontology_term_id", "sex_ontology_term_id", "suspension_type",
+    # "suspension_type_ontology_term_id", "tissue_ontology_term_id", "tissue_general_ontology_term_id", "soma_joinid"
+}
+for key, value in store_info.items():
+    # one constant value per cell, stored as a single-category categorical
+    adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value])
+
+# Subset obs columns (this also drops the helper "NSCLC" column)
+obs_cols = list(rename_obs_keys) + list(store_info)
+adata.obs = adata.obs[obs_cols]
+
+# Save uns metadata
+uns_keys = [
+    "dataset_id",
+    "dataset_name",
+    "dataset_url",
+    "dataset_reference",
+    "dataset_summary",
+    "dataset_description",
+    "dataset_organism",
+]
+for key in uns_keys:
+    adata.uns[key] = par[key]
+
+# Add gene symbol column
+adata.var["gene_symbol"] = adata.var_names
+
+# Subset var columns
+var_cols = ["gene_symbol"]
+adata.var = adata.var[var_cols]
+
+# Add layers: move the matrix from X into the "counts" layer
+adata.layers["counts"] = adata.X
+del adata.X
+
+# Delete files if requested
+if not par["keep_files"]:
+    print("Removing files", flush=True)
+    if FILE_PATH.exists():
+        print("\t...", FILE_PATH, flush=True)
+        FILE_PATH.unlink()
+
+# Write adata
+print("Writing adata", flush=True)
+adata.write_h5ad(par["output"], compression="gzip")
diff --git a/src/datasets/workflows/process_zuani_human_nsclc_sc/config.vsh.yaml b/src/datasets/workflows/process_zuani_human_nsclc_sc/config.vsh.yaml
@@ -0,0 +1,93 @@
+# Workflow component for the 2024Zuani_human_nsclc_sc dataset: main.nf
+# (entrypoint run_wf) together with the components listed under
+# `dependencies`.
+name: process_zuani_human_nsclc_sc
+namespace: datasets/workflows
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: file
+        name: --input
+        description: Path to the dataset
+        required: true
+        example: "ftp://anonymous@ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad"
+  - name: Caching settings
+    arguments:
+      - type: boolean
+        name: --keep_files
+        required: false
+        # the loader's script.py removes the downloaded file only when
+        # keep_files is false
+        description: Whether to keep the downloaded files after processing.
+        default: false
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: "A unique identifier for the dataset"
+        required: false
+        default: "2024Zuani_human_nsclc_sc"
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: false
+        default: "2024Zuani_human_nsclc_sc"
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
+      # NOTE(review): the default repeats --dataset_url although the
+      # description asks for a Bibtex reference - confirm the intended value.
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
+      # NOTE(review): --dataset_summary and --dataset_description are marked
+      # required although a default is given; the loader config declares the
+      # same arguments as optional - confirm which is intended.
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+        default: "Single cell RNA sequencing atlas of non-small cell lung cancer (NSCLC)"
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        default: "human"
+  - name: Outputs
+    arguments:
+    # the remaining fields of --output come from the merged API file
+    - name: "--output"
+      __merge__: /src/api/file_common_scrnaseq.yaml
+      direction: output
+      required: true
+      default: "$id/dataset.h5ad"
+    - name: "--output_meta"
+      direction: "output"
+      type: file
+      description: "Dataset metadata"
+      default: "$id/dataset_meta.yaml"
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+  - path: /common/nextflow_helpers/helper.nf
+
+dependencies:
+  # the loader has no `repository` entry; the other components are taken
+  # from the `openproblems` repository
+  - name: datasets/loaders/zuani_human_nsclc_sc
+#  - name: datasets/processors/subsample
+#    repository: openproblems
+  - name: datasets/normalization/log_cp
+    repository: openproblems
+  - name: datasets/processors/pca
+    repository: openproblems
+  - name: datasets/processors/hvg
+    repository: openproblems
+  - name: datasets/processors/knn
+    repository: openproblems
+  - name: utils/extract_uns_metadata
+    repository: openproblems
+
+runners:
+  - type: nextflow
+    directives:
+      label: [midcpu, midmem, hightime]
diff --git a/src/datasets/workflows/process_zuani_human_nsclc_sc/main.nf b/src/datasets/workflows/process_zuani_human_nsclc_sc/main.nf
diff --git a/src/datasets/workflows/process_zuani_human_nsclc_sc/test.sh b/src/datasets/workflows/process_zuani_human_nsclc_sc/test.sh