
Commit 40d5934

Merge pull request #37 from openproblems-bio/ganier_skin_sc

Add sc human skin dataset

2 parents 992b231 + d399422, commit 40d5934

6 files changed: +412 −0 lines changed
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
#!/bin/bash

# get the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the commands below are run from the root of the repository
cd "$REPO_ROOT"

set -e

publish_dir="s3://openproblems-data/resources/datasets"

# Note: the download component and processing workflow carry dataset-specific
# default parameters for this dataset and support no other datasets, so the
# defaults are used and don't need to be specified here.

cat > /tmp/params.yaml << HERE
param_list:
  - id: scrnaseq_for_ist/2024Ganier_human_skin_sc

keep_files: false

output_dataset: "\$id/dataset.h5ad"
output_meta: "\$id/dataset_meta.yaml"
output_state: "\$id/state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/process_ganier_human_skin_sc/main.nf \
  --workspace 53907369739130 \
  --params-file /tmp/params.yaml \
  --config common/nextflow_helpers/labels_tw.config \
  --labels datasets,ganier_human_skin_sc

# To sync the published results locally afterwards:
#aws s3 sync \
#  s3://openproblems-data/resources/datasets/scrnaseq_for_ist/2024Ganier_human_skin_sc \
#  resources/datasets/scrnaseq_for_ist/2024Ganier_human_skin_sc
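The heredoc above renders /tmp/params.yaml with the literal value $id in the output paths (the backslash suppresses shell expansion); the workflow substitutes it per dataset. A minimal sketch of sanity-checking the rendered file before launching, assuming PyYAML is installed; the expected values mirror the heredoc above:

# sanity-check the rendered params file before launching (assumes PyYAML)
import yaml

with open("/tmp/params.yaml") as f:
    params = yaml.safe_load(f)

assert params["param_list"][0]["id"] == "scrnaseq_for_ist/2024Ganier_human_skin_sc"
assert params["output_dataset"] == "$id/dataset.h5ad"  # literal $id placeholder
assert params["publish_dir"] == "s3://openproblems-data/resources/datasets"
print("params.yaml looks consistent")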
Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
name: ganier_human_skin_sc
namespace: datasets/loaders

argument_groups:
  - name: Caching settings
    arguments:
      - type: boolean
        name: --keep_files
        required: false
        description: Whether to keep the downloaded files after processing.
        default: false
  - name: Metadata
    arguments:
      - type: string
        name: --dataset_id
        description: "A unique identifier for the dataset"
        required: false
        default: "2024Ganier_human_skin_sc"
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: false
        default: "2024Ganier_human_skin_sc"
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
        default: "https://spatial-skin-atlas.cellgeni.sanger.ac.uk"
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        default: "https://doi.org/10.1073/pnas.2313326120"
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from healthy human skin."
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from healthy human skin."
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        default: "Homo sapiens"
  - name: Outputs
    arguments:
      - name: "--output"
        __merge__: /src/api/file_common_scrnaseq.yaml
        direction: output
        required: true

resources:
  - type: python_script
    path: script.py

engines:
  - type: docker
    image: openproblems/base_python:1
    __merge__:
      - /src/base/setup_txsim_partial.yaml
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midmem, midcpu, midtime]
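With the executable runner declared above, the component can also be run locally through viash for debugging. A minimal sketch, assuming viash is on the PATH and that this config lives at src/datasets/loaders/ganier_human_skin_sc/config.vsh.yaml (the path is not shown in this diff):

# hypothetical local invocation of the loader via the viash CLI;
# the config path is an assumption based on the name/namespace above
import subprocess

subprocess.run(
    [
        "viash", "run",
        "src/datasets/loaders/ganier_human_skin_sc/config.vsh.yaml",
        "--",
        "--keep_files", "true",
        "--output", "temp/2024Ganier_human_skin_sc.h5ad",
    ],
    check=True,
)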
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
from pathlib import Path
import pandas as pd
import anndata as ad
import urllib.request


## VIASH START
par = {
    "keep_files": True,  # whether to keep the downloaded intermediate files
    "output": "./temp/datasets/2024Ganier_human_skin_sc.h5ad",
    "dataset_id": "2024Ganier_human_skin_sc",
    "dataset_name": "2024Ganier_human_skin_sc",
    "dataset_url": "https://spatial-skin-atlas.cellgeni.sanger.ac.uk",
    "dataset_reference": "https://doi.org/10.1073/pnas.2313326120",  # TODO: bibtex, not doi; also adjust config.vsh.yaml
    "dataset_summary": "This dataset contains scRNA-seq data from healthy human skin.",
    "dataset_description": "This dataset contains scRNA-seq data from healthy human skin.",
    "dataset_organism": "Homo sapiens",
}
meta = {
    "temp_dir": "./temp/datasets/2024Ganier_human_skin_sc",
}
## VIASH END

# helper variables
TMP_DIR = Path(meta["temp_dir"] or "/tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)
FILE_URL = "https://cellgeni.cog.sanger.ac.uk/spatial-skin-atlas/scrnaseq/bcc_and_normal-CG_portal_fat.h5ad"
# cell x gene alternative: https://datasets.cellxgene.cziscience.com/995fa8f9-8ed4-46d3-8d75-115fa06cc787.h5ad
FILE_PATH = TMP_DIR / "bcc_and_normal-CG_portal_fat.h5ad"


# Download the data
print("Downloading data (~1GB)", flush=True)
urllib.request.urlretrieve(FILE_URL, FILE_PATH)

# Read data
print("Reading data", flush=True)
adata = ad.read_h5ad(FILE_PATH)

# Filter out bcc (basal cell carcinoma) samples, keeping only healthy skin;
# copy to materialize the view before modifying layers below
adata = adata[~adata.obs["01_sample"].str.startswith("bcc")].copy()

# NOTE: only log-normalized counts are available, so the workflow will skip the
# log-normalization step. This is not optimal: the "counts" layer should hold the raw counts.
adata.layers["counts"] = adata.X
adata.layers["normalized"] = adata.X
del adata.X

# Rename fields (maps new obs key -> original column name)
rename_obs_keys = {
    "cell_type": "04_celltypes",
    "cell_type_level2": "05_subcelltypes",
    "batch": "01_sample",
    # TODO: other fields; note that the cell x gene download source might be the better alternative
}
# DataFrame.rename expects old -> new, so invert the mapping
adata.obs = adata.obs.rename(columns={old: new for new, old in rename_obs_keys.items()})

# Add additional information to obs
# TODO: finish up the terms according to the ontology.
# Ontology schema currently (13.03.2025) used in openproblems (CELLxGENE schema v4.0.0):
# https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md
# (mentioned here: https://openproblems.bio/documentation/reference/openproblems/src-datasets#file-format:-raw-dataset)
store_info = {
    "dataset_id": "2024Ganier_human_skin_sc",
    # TODO: other fields; note that the cell x gene download source might be the better alternative.
    # The placeholder values below appear to be copied from the Wu breast cancer loader
    # and must be replaced with the values for this skin dataset.
    #"assay": "Chromium Single-Cell v2 3’ and 5’ Chemistry Library", # from metadata from GEO GSE176078 #TODO: ontology
    #"sex": "female", #TODO: double check
    #"tissue": "breast",
    #"disease": "breast cancer",
    ## "disease_ontology_term_id": "PATO:0000461", #TODO: ontology
    #"organism": "Homo sapiens",
    ## "organism_ontology_term_id": "NCBITaxon:10090", #TODO: ontology
    #"tissue_general": "breast",
    ## "tissue_general_ontology_term_id": "UBERON:0000955", #TODO: ontology
    #"development_stage": "adult",
    ## "development_stage_ontology_term_id": "MmusDv:0000110" #TODO: ontology
}
for key, value in store_info.items():
    adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value])

# Remove undesired columns
for key in list(adata.obs.columns):
    if (key not in rename_obs_keys.keys()) and (key not in store_info.keys()):
        print(f"Removing .obs['{key}']")
        del adata.obs[key]

# Var
adata.var["gene_symbol"] = adata.var_names
# TODO: can we also get ensembl ids? (adata.var["feature_id"])

# Uns
for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
    adata.uns[key] = par[key]


# Delete downloaded files if requested
if not par["keep_files"]:
    print("Removing files", flush=True)
    if FILE_PATH.exists():
        print("\t...", FILE_PATH, flush=True)
        FILE_PATH.unlink()

print("Writing adata", flush=True)
adata.write_h5ad(par["output"], compression="gzip")
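A minimal read-back check of the written dataset, assuming the default output path from the par dict above; the asserted layers, obs columns, and uns keys follow directly from the script:

# read the dataset back and verify the fields written by the script above
import anndata as ad

adata = ad.read_h5ad("./temp/datasets/2024Ganier_human_skin_sc.h5ad")
assert {"counts", "normalized"} <= set(adata.layers.keys())
assert {"cell_type", "cell_type_level2", "batch", "dataset_id"} <= set(adata.obs.columns)
assert adata.uns["dataset_id"] == "2024Ganier_human_skin_sc"
assert "gene_symbol" in adata.var.columns
print(adata)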
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
name: process_ganier_human_skin_sc
namespace: datasets/workflows

argument_groups:
  - name: Caching settings
    arguments:
      - type: boolean
        name: --keep_files
        required: false
        description: Whether to keep the downloaded files after processing.
        default: false
  - name: Metadata
    arguments:
      - type: string
        name: --dataset_id
        description: "A unique identifier for the dataset"
        required: false
        default: "2024Ganier_human_skin_sc"
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: false
        default: "2024Ganier_human_skin_sc"
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
        default: "https://spatial-skin-atlas.cellgeni.sanger.ac.uk"
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        default: "https://doi.org/10.1073/pnas.2313326120"
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from healthy human skin."
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from healthy human skin."
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        default: "Homo sapiens"
  - name: Outputs
    arguments:
      - name: "--output_dataset"
        __merge__: /src/api/file_common_scrnaseq.yaml
        direction: output
        required: true
        default: "$id/dataset.h5ad"
      - name: "--output_meta"
        direction: "output"
        type: file
        description: "Dataset metadata"
        default: "$id/dataset_meta.yaml"

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf
  - path: /common/nextflow_helpers/helper.nf

dependencies:
  - name: datasets/loaders/ganier_human_skin_sc
  - name: datasets/processors/pca
    repository: openproblems
  - name: datasets/processors/hvg
    repository: openproblems
  - name: datasets/processors/knn
    repository: openproblems
  - name: utils/extract_uns_metadata
    repository: openproblems

runners:
  - type: nextflow
    directives:
      label: [midcpu, midmem, hightime]
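Since the launch script writes /tmp/params.yaml by hand, a quick cross-check against the arguments declared above can catch drift between the two. A sketch assuming PyYAML and a hypothetical config path; output_state and publish_dir are handled by the Nextflow runner rather than declared here:

# cross-check the hand-written params file against the declared arguments;
# the config path is an assumption, not shown in this diff
import yaml

with open("src/datasets/workflows/process_ganier_human_skin_sc/config.vsh.yaml") as f:
    config = yaml.safe_load(f)

declared = {
    arg["name"].lstrip("-")
    for group in config["argument_groups"]
    for arg in group["arguments"]
}

with open("/tmp/params.yaml") as f:
    params = yaml.safe_load(f)

# param_list, output_state and publish_dir are runner-level settings
extra = set(params) - declared - {"param_list", "output_state", "publish_dir"}
print("undeclared params:", extra or "none")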
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
include { findArgumentSchema } from "${meta.resources_dir}/helper.nf"

workflow auto {
  findStates(params, meta.config)
    | meta.workflow.run(
      auto: [publish: "state"]
    )
}

workflow run_wf {
  take:
  input_ch

  main:
  output_ch = input_ch

    // copy id to the state
    | map{ id, state ->
      def new_state = state + [dataset_id: id]
      [id, new_state]
    }

    | ganier_human_skin_sc.run(
      fromState: [
        "dataset_id",
        "dataset_name",
        "dataset_url",
        "dataset_reference",
        "dataset_summary",
        "dataset_description",
        "dataset_organism",
      ],
      toState: [
        "output_normalized": "output"
      ]
    )

    | hvg.run(
      fromState: ["input": "output_normalized"],
      toState: ["output_hvg": "output"]
    )

    | pca.run(
      fromState: ["input": "output_hvg"],
      toState: ["output_pca": "output"]
    )

    | knn.run(
      fromState: ["input": "output_pca"],
      toState: ["output_knn": "output"]
    )

    // add synonym
    | map{ id, state ->
      [id, state + [output_dataset: state.output_knn]]
    }

    | extract_uns_metadata.run(
      fromState: { id, state ->
        def schema = findArgumentSchema(meta.config, "output_dataset")
        // workaround: convert GString to String
        schema = iterateMap(schema, { it instanceof GString ? it.toString() : it })
        def schemaYaml = tempFile("schema.yaml")
        writeYaml(schema, schemaYaml)
        [
          "input": state.output_dataset,
          "schema": schemaYaml
        ]
      },
      toState: ["output_meta": "output"]
    )

    | setState([
      "output_dataset": "output_dataset",
      "output_meta": "output_meta"
    ])

  emit:
  output_ch
}
80+
