Commit 9b75094

Add Lu human liver cancer sc dataset
1 parent 31a37a6 commit 9b75094

6 files changed (+462, -0 lines changed)
Lines changed: 40 additions & 0 deletions
#!/bin/bash

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

set -e

publish_dir="s3://openproblems-data/resources/datasets"

# Note that the current download script and processing workflow have a specific default parameter set for the given dataset.
# No additional datasets are supported by that component/workflow. Therefore the default parameters are used and don't need
# to be specified here.

cat > /tmp/params.yaml << HERE
param_list:
  - id: scrnaseq_for_ist/2022Lu_human_liver_cancer_sc

keep_files: false

output_dataset: "\$id/dataset.h5ad"
output_meta: "\$id/dataset_meta.yaml"
output_state: "\$id/state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/process_lu_human_liver_cancer_sc/main.nf \
  --workspace 53907369739130 \
  --params-file /tmp/params.yaml \
  --config common/nextflow_helpers/labels_tw.config \
  --labels datasets,lu_human_liver_cancer_sc

#aws s3 sync \
#  s3://openproblems-data/resources/datasets/wu_human_breast_cancer_sc/2021Wu_human_breast_cancer_sc \
#  resources/datasets/wu_human_breast_cancer_sc/2021Wu_human_breast_cancer_sc
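
Note that the commented-out sync above still points at the Wu breast cancer dataset. A minimal sketch of the equivalent command for this dataset, assuming the workflow publishes results under "$publish_dir/<id>" as the params file above suggests (the exact output prefix is an assumption, not a verified location):

# hypothetical: sync the published Lu liver cancer results back into the local repo
aws s3 sync \
  s3://openproblems-data/resources/datasets/scrnaseq_for_ist/2022Lu_human_liver_cancer_sc \
  resources/datasets/scrnaseq_for_ist/2022Lu_human_liver_cancer_sc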
Lines changed: 71 additions & 0 deletions
name: lu_human_liver_cancer_sc
namespace: datasets/loaders

argument_groups:
  - name: Caching settings
    arguments:
      - type: boolean
        name: --keep_files
        required: false
        description: Whether to keep the downloaded files after processing.
        default: false
  - name: Metadata
    arguments:
      - type: string
        name: --dataset_id
        description: "A unique identifier for the dataset"
        required: false
        default: "2022Lu_human_liver_cancer_sc"
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: false
        default: "2022Lu_human_liver_cancer_sc"
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
        default: "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE149614"
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        default: "https://doi.org/10.1038/s41467-022-32283-3"
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from human liver cancer cells."
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from human liver cancer cells."
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        default: "Homo sapiens"
  - name: Outputs
    arguments:
      - name: "--output"
        __merge__: /src/api/file_common_scrnaseq.yaml
        direction: output
        required: true

resources:
  - type: python_script
    path: script.py

engines:
  - type: docker
    image: openproblems/base_python:1
    __merge__:
      - /src/base/setup_txsim_partial.yaml
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midmem, midcpu, midtime]
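
A minimal sketch of running this loader on its own, assuming the repository is built with viash and that the executable ends up under a target/executable/ layout parallel to the target/nextflow/ path used in the launch script (both the build step and the paths here are assumptions):

# hypothetical: build the components, then run the loader standalone
viash ns build
target/executable/datasets/loaders/lu_human_liver_cancer_sc/lu_human_liver_cancer_sc \
  --keep_files false \
  --output temp/datasets/2022Lu_human_liver_cancer_sc/dataset.h5ad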
Lines changed: 139 additions & 0 deletions
from pathlib import Path
from scipy.sparse import csr_matrix, hstack
import pandas as pd
import anndata as ad
import urllib.request
from datetime import datetime

## VIASH START
par = {
    "keep_files": True,  # whether to keep the downloaded files after processing
    "output": "./temp/datasets/2022Lu_human_liver_cancer_sc.h5ad",
    "dataset_id": "2022Lu_human_liver_cancer_sc",
    "dataset_name": "2022Lu_human_liver_cancer_sc",
    "dataset_url": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE149614",
    "dataset_reference": "https://doi.org/10.1038/s41467-022-32283-3",
    "dataset_summary": "This dataset contains scRNA-seq data from human liver cancer cells.",
    "dataset_description": "This dataset contains scRNA-seq data from human liver cancer cells.",
    "dataset_organism": "Homo sapiens"
}
meta = {
    "temp_dir": "./temp/datasets/2022Lu_human_liver_cancer_sc",
}
## VIASH END

# helper variables
TMP_DIR = Path(meta["temp_dir"] or "/tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)
FILE_URLS = {
    "counts": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE149614&format=file&file=GSE149614%5FHCC%2EscRNAseq%2ES71915%2Ecount%2Etxt%2Egz",
    "obs": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE149614&format=file&file=GSE149614%5FHCC%2Emetadata%2Eupdated%2Etxt%2Egz"
}
FILE_PATHS = {
    "counts": TMP_DIR / "GSE149614_HCC.scRNAseq.S71915.count.txt.gz",
    "obs": TMP_DIR / "GSE149614_HCC.metadata.updated.txt.gz",
}


# Download the data
print("Downloading data", flush=True)
for key, url in FILE_URLS.items():
    urllib.request.urlretrieve(url, FILE_PATHS[key])

# Read the count matrix in chunks of genes and convert each chunk to a sparse matrix
print("Reading count matrix", flush=True)
chunk_size = 200

genes = []
X_sparse = []

t0 = datetime.now()

for i, chunk in enumerate(pd.read_csv(FILE_PATHS["counts"], sep="\t", chunksize=chunk_size)):

    if i % 10 == 0:
        print("\t", datetime.now() - t0, " ", i, "/128")

    genes += chunk.index.tolist()

    if i == 0:
        X_sparse = csr_matrix(chunk.values.T)
        obs = chunk.columns.tolist()  # it's the same in each chunk since all cells are loaded
    else:
        X_sparse = hstack([X_sparse, csr_matrix(chunk.values.T)])

del chunk

print("Reading obs", flush=True)
df_obs = pd.read_csv(FILE_PATHS["obs"], sep="\t", index_col=0)

assert (obs == df_obs.index.tolist())

# Create adata
print("Creating adata", flush=True)
adata = ad.AnnData(
    X=None,
    obs=df_obs,
    var=pd.DataFrame(index=genes),
    layers={"counts": X_sparse}
)

# Rename fields (keys are the standardized names, values the original column names)
rename_obs_keys = {
    "cell_type": "celltype",
    "batch": "sample",
    "donor_id": "patient",
    "cancer_stage": "stage",  # TODO: this is currently not in the config yaml - how to handle this?
    # TODO: "cell_type_unified" (?), maybe call the unified one "cell_type" and the original one "cell_type_level1"
    # other keys: "batch", "assay_ontology_term_id", "cell_type_ontology_term_id", "development_stage_ontology_term_id",
    # "diseases_ontology_term_id", "is_primary_data", "organism_ontology_term_id", "self_reported_ethnicity",
    # "self_reported_ethnicity_ontology_term_id", "sex_ontology_term_id", "suspension_type",
    # "suspension_type_ontology_term_id", "tissue_ontology_term_id", "tissue_general_ontology_term_id", "soma_joinid"
}
adata.obs = adata.obs.rename(columns={old: new for new, old in rename_obs_keys.items()})

# Add additional information to obs
# TODO: Finish up the terms according to the ontology
# Ontology schema currently (13.03.2025) used in openproblems (CELLxGENE schema v4.0.0):
# https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md
# (mentioned here: https://openproblems.bio/documentation/reference/openproblems/src-datasets#file-format:-raw-dataset)
store_info = {
    "dataset_id": "2022Lu_human_liver_cancer_sc",  # "GSE176078",
    "assay": "Chromium Single-Cell v2 3’ Chemistry Library",  # from metadata from GEO GSE176078 #TODO: ontology
    "tissue": "liver",
    "disease": "liver cancer",
    # "disease_ontology_term_id": "PATO:0000461",  # TODO: ontology
    "organism": "Homo sapiens",
    # "organism_ontology_term_id": "NCBITaxon:10090",  # TODO: ontology
    "tissue_general": "liver",
    # "tissue_general_ontology_term_id": "UBERON:0000955",  # TODO: ontology
    "development_stage": "adult",
    # "development_stage_ontology_term_id": "MmusDv:0000110"  # TODO: ontology
}
for key, value in store_info.items():
    adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value])

# Remove undesired columns
for key in adata.obs.columns:
    if (key not in rename_obs_keys.keys()) and (key not in store_info.keys()):
        print(f"Removing .obs['{key}']")
        del adata.obs[key]

# Var
adata.var["gene_symbol"] = adata.var_names
# TODO: can we also get ensembl ids? (adata.var["feature_id"])

# Uns
for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
    adata.uns[key] = par[key]

# Delete files if requested
if not par["keep_files"]:
    print("Removing files", flush=True)
    for file in FILE_PATHS.values():
        if file.exists():
            print("\t...", file, flush=True)
            file.unlink()

print("Writing adata", flush=True)
adata.write_h5ad(par["output"], compression="gzip")
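
A quick way to sanity-check the written file, assuming the default output path from the VIASH START block above (a hypothetical local check, not part of the component itself):

# hypothetical: load the h5ad and print its structure and stored metadata
python -c "
import anndata as ad
adata = ad.read_h5ad('./temp/datasets/2022Lu_human_liver_cancer_sc.h5ad')
print(adata)
print(adata.uns['dataset_id'], adata.uns['dataset_organism'])
"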
Lines changed: 84 additions & 0 deletions
name: process_lu_human_liver_cancer_sc
namespace: datasets/workflows

argument_groups:
  - name: Caching settings
    arguments:
      - type: boolean
        name: --keep_files
        required: false
        description: Whether to keep the downloaded files after processing.
        default: false
  - name: Metadata
    arguments:
      - type: string
        name: --dataset_id
        description: "A unique identifier for the dataset"
        required: false
        default: "2022Lu_human_liver_cancer_sc"
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: false
        default: "2022Lu_human_liver_cancer_sc"
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
        default: "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE149614"
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        default: "https://doi.org/10.1038/s41467-022-32283-3"
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from human liver cancer cells."
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: false
        default: "This dataset contains scRNA-seq data from human liver cancer cells."
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        default: "Homo sapiens"
  - name: Outputs
    arguments:
      - name: "--output_dataset"
        __merge__: /src/api/file_common_scrnaseq.yaml
        direction: output
        required: true
        default: "$id/dataset.h5ad"
      - name: "--output_meta"
        direction: "output"
        type: file
        description: "Dataset metadata"
        default: "$id/dataset_meta.yaml"

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf
  - path: /common/nextflow_helpers/helper.nf

dependencies:
  - name: datasets/loaders/lu_human_liver_cancer_sc
  - name: datasets/normalization/log_cp
    repository: openproblems
  - name: datasets/processors/pca
    repository: openproblems
  - name: datasets/processors/hvg
    repository: openproblems
  - name: datasets/processors/knn
    repository: openproblems
  - name: utils/extract_uns_metadata
    repository: openproblems

runners:
  - type: nextflow
    directives:
      label: [midcpu, midmem, hightime]
