
Commit fcf16f1

Olga013, Paulos2411, and rcannood authored
Added op3 loader (#926)
* Added op3 loader
* Minor Changes
* Updated Code with Output Data
* Minor Updates
* Minor Updates
* Minor Update
* Added Workflow
* Update script.py
* workflow for op3_loader
* Implemented Suggestions
* removed space
* No changes
* move input file to a separate argument
* add initial script
* added small fixes for op3 pipeline
* Removed two obsolete changes
* removed scanpy cell and gene filtering
* an example script for running the data processing pipeline is added
* changed the name of an input file
* removed .r filtration script
* unnecessary import is deleted in script.py
* unnecessary state is deleted in process_op3/main.nf
* duplicated rows are deleted in process_op3/config.vsh.yaml
* minor fixes in .bash scripts are added
* fixing splits is added
* publish_dir and main-script dir were changed
* the name of dir in s3 is changed
* update common
* let nextflow take care of the download
* fix test (make sure viashpy runs it)
* rename component
* finish renaming more scripts
* fix script
* temporarily disable test

---------

Co-authored-by: Paulos2411 <[email protected]>
Co-authored-by: Paul <[email protected]>
Co-authored-by: Robrecht Cannoodt <[email protected]>
Co-authored-by: Olga Novitskaia <[email protected]>
1 parent 952c0a9 commit fcf16f1

File tree

8 files changed: +787 additions, -1 deletion

src/datasets/loaders/scrnaseq/op3/config.vsh.yaml
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
name: op3
namespace: datasets/loaders/scrnaseq
description: |
  Loads and preprocesses the OP3 dataset from GEO accession GSE279945.

argument_groups:
  - name: Input
    arguments:
      - name: "--input"
        type: file
        description: "Input URL to the .h5ad file."
        direction: input
        required: false
        default: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad
        example: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad
      - name: "--var_feature_name"
        type: string
        description: "Location of the feature names in .var. Set to 'index' if the feature names are stored in the var index."
        default: index

  - name: Data Filtering
    description: "Arguments for filtering the dataset."
    arguments:
      - name: "--donor_id"
        type: string
        description: "Donor ID to filter for (1, 2, or 3). If not specified, all donors are included."
        required: false
      - name: "--cell_type"
        type: string
        description: "Cell type to filter for (T cells, B cells, NK cells, or Myeloid). If not specified, all cell types are included."
        required: false
      - name: "--perturbation"
        type: string
        description: "Perturbation to filter for. If not specified, all perturbations are included."
        required: false

  - name: Dataset Metadata
    description: "Metadata about the dataset."
    arguments:
      - name: "--dataset_id"
        type: string
        description: "Unique identifier for the dataset."
        default: "op3"
      - name: "--dataset_name"
        type: string
        description: "Human-readable name for the dataset."
        default: "OP3: single-cell multimodal dataset in PBMCs for perturbation prediction benchmarking"
      - name: "--dataset_summary"
        type: string
        description: "Short summary of the dataset."
        default: "The Open Problems Perturbation Prediction (OP3) dataset with small molecule perturbations in PBMCs"
      - name: "--dataset_description"
        type: string
        description: "Detailed description of the dataset."
        default: "The OP3 dataset is, to date, the largest single-cell small molecule perturbation dataset in primary tissue with multiple donor replicates."
      - name: "--dataset_reference"
        type: string
        description: "Bibtex reference of the paper in which the dataset was published."
        required: false
        default: GSE279945
      - name: "--dataset_url"
        type: string
        description: "Link to the original source of the dataset."
        required: false
        default: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad

  - name: Output
    description: "Output parameters."
    arguments:
      - name: "--output"
        type: file
        description: "Output h5ad file."
        direction: output
        required: true
      - name: "--output_compression"
        type: string
        choices: [gzip, lzf]
        required: false
        default: "gzip"

resources:
  - type: python_script
    path: script.py

# test_resources:
#   - type: python_script
#     path: test.py

engines:
  - type: docker
    image: openproblems/base_python:1
    test_setup:
      - type: python
        packages:
          - viashpy

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [highmem, midcpu, midtime]
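
For context, a quick way to sanity-check the source data against the filtering arguments declared above. This is a hedged sketch, not part of the component: it assumes anndata is installed and a network connection is available, the local filename is arbitrary, the URL is the config's default --input, and the obs columns checked are the ones the --donor_id, --cell_type, and --perturbation arguments and the loader's sm_name-based filters rely on.

# Hypothetical pre-flight check (not part of the component): fetch the default
# --input file and confirm the obs columns the filtering logic relies on exist.
import urllib.request

import anndata as ad

URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad"
LOCAL = "GSE279945_sc_counts_processed.h5ad"  # arbitrary local cache name

urllib.request.urlretrieve(URL, LOCAL)  # large download; keep the local copy for reuse
adata = ad.read_h5ad(LOCAL)

# Columns assumed from the filtering arguments above and the loader's filter code
for column in ["donor_id", "cell_type", "perturbation", "sm_name"]:
    status = "present" if column in adata.obs.columns else "missing"
    print(f"obs['{column}']: {status}")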
src/datasets/loaders/scrnaseq/op3/script.py
Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
import anndata as ad
import numpy as np
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

## VIASH START
par = {
    "input": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad",
    "var_feature_name": "index",
    "data_type": "sc",
    "donor_id": None,
    "cell_type": None,
    "perturbation": None,
    "dataset_id": "op3",
    "dataset_name": "OP3: single-cell multimodal dataset in PBMCs for perturbation prediction benchmarking",
    "dataset_summary": "The Open Problems Perturbation Prediction (OP3) dataset with small molecule perturbations in PBMCs",
    "dataset_description": "The OP3 dataset is, to date, the largest single-cell small molecule perturbation dataset in primary tissue with multiple donor replicates.",
    "dataset_url": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad",
    "output": "output.h5ad",
    "output_compression": "gzip"
}
meta = {}
## VIASH END

def filter_op3_data(adata):
    """
    Filter the OP3 dataset based on specific criteria for each small molecule and cell type.
    """
    logger.info("Applying OP3-specific filtering criteria")

    # Create a boolean mask for filtering observations
    obs_filt = np.ones(adata.n_obs, dtype=bool)

    # Alvocidib: only T cells in only 2 donors, remove
    obs_filt = obs_filt & (adata.obs['sm_name'] != "Alvocidib")

    # BMS-387032: one donor with only T cells, two others consistent but with only 2 cell types.
    # Keep the 2 cell types, remove donor 2 with only T cells
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "BMS-387032") & (adata.obs['donor_id'] == "Donor 2"))

    # BMS-387032: remove Myeloid cells and B cells
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "BMS-387032") &
                            adata.obs['cell_type'].isin(["B cells", "Myeloid cells"]))

    # CGP 60474: has only T cells left, remove
    obs_filt = obs_filt & (adata.obs['sm_name'] != "CGP 60474")

    # Canertinib: the variation in Myeloid cell proportions is very large, skip Myeloid
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Canertinib") &
                            (adata.obs['cell_type'] == "Myeloid cells"))

    # Foretinib: large variation in Myeloid cell proportions (some in T cells), skip Myeloid
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Foretinib") &
                            (adata.obs['cell_type'] == "Myeloid cells"))

    # Ganetespib (STA-9090): donor 2 has no Myeloid and small NK cell proportions.
    # Skip Myeloid, remove donor 2
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Ganetespib (STA-9090)") &
                            (adata.obs['donor_id'] == "Donor 2"))

    # IN1451: donor 2 has no NK or B cells, remove donor 2
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "IN1451") &
                            (adata.obs['donor_id'] == "Donor 2"))

    # Navitoclax: donor 3 has no B cells and different T and Myeloid proportions, remove donor 3
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Navitoclax") &
                            (adata.obs['donor_id'] == "Donor 3"))

    # PF-04691502: remove Myeloid (only present in donor 3)
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "PF-04691502") &
                            (adata.obs['cell_type'] == "Myeloid cells"))

    # Proscillaridin A;Proscillaridin-A: remove Myeloid, since the variation is very high (4x)
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Proscillaridin A;Proscillaridin-A") &
                            (adata.obs['cell_type'] == "Myeloid cells"))

    # R428: skip NK due to high variation (close to 3x)
    obs_filt = obs_filt & ~((adata.obs['sm_name'] == "R428") &
                            (adata.obs['cell_type'] == "NK cells"))

    # UNII-BXU45ZH6LI: remove due to large variation across all cell types and missing cell types
    obs_filt = obs_filt & (adata.obs['sm_name'] != "UNII-BXU45ZH6LI")

    # Apply the filter
    filtered_adata = adata[obs_filt].copy()

    logger.info(f"Filtered data from {adata.n_obs} to {filtered_adata.n_obs} cells")

    return filtered_adata

def fix_splits(adata):
    """Fix splits after reannotation."""
    logger.info("Fix splits after reannotation")
    adata.obs["cell_type_orig_updated"] = adata.obs["cell_type_orig"].apply(lambda x: "T cells" if x.startswith("T ") else x)
    adata.obs["sm_cell_type_orig"] = adata.obs["sm_name"].astype(str) + "_" + adata.obs["cell_type_orig_updated"].astype(str)
    mapping_to_split = adata.obs.groupby("sm_cell_type_orig")["split"].apply(lambda x: x.unique()[0]).to_dict()
    adata.obs["sm_cell_type"] = adata.obs["sm_name"].astype(str) + "_" + adata.obs["cell_type"].astype(str)
    adata.obs["split"] = adata.obs["sm_cell_type"].map(mapping_to_split)
    adata.obs['control'] = adata.obs['split'].eq("control")
    return adata


def move_x_to_layers(adata):
    """Move .X to .layers['counts'] and set .X to None."""
    logger.info("Moving .X to .layers['counts']")
    adata.layers["counts"] = adata.X.copy()
    adata.X = None

def add_metadata_to_uns(adata, par):
    """Add standardized metadata to .uns."""
    logger.info("Adding metadata to .uns")
    adata.uns['dataset_id'] = par["dataset_id"]
    adata.uns['dataset_name'] = par["dataset_name"]
    adata.uns['dataset_summary'] = par["dataset_summary"]
    adata.uns['dataset_description'] = par["dataset_description"]
    adata.uns['dataset_organism'] = 'Homo sapiens'
    adata.uns['dataset_url'] = par["dataset_url"]
    adata.uns['dataset_reference'] = 'GSE279945'
    adata.uns['dataset_version'] = '1.0.0'
    adata.uns['processing_status'] = 'processed'

def print_unique(adata, column):
    """Log unique values in an obs column."""
    if column in adata.obs:
        values = adata.obs[column].unique()
        if len(values) <= 10:
            formatted = "', '".join(values)
            logger.info(f"Unique {column}: ['{formatted}']")
        else:
            logger.info(f"Unique {column}: {len(values)} values")

def print_summary(adata):
    """Log a summary of the dataset."""
    logger.info(f"Dataset shape: {adata.shape}")
    logger.info(f"Number of cells: {adata.n_obs}")
    logger.info(f"Number of genes: {adata.n_vars}")

    # Print unique values for key columns
    for column in ['donor_id', 'cell_type', 'perturbation']:
        print_unique(adata, column)

    logger.info(f"Layers: {list(adata.layers.keys())}")
    logger.info(f"Metadata: {list(adata.uns.keys())}")

def write_anndata(adata, par):
    """Write the AnnData object to file."""
    logger.info(f"Writing AnnData object to '{par['output']}'")
    adata.write_h5ad(par["output"], compression=par["output_compression"])

# Instead of defining main() and calling it at the end, run the steps directly
logger.info("Starting OP3 loader")

# Load the data
logger.info(f"Loading data from {par['input']}")
adata = ad.read_h5ad(par["input"])

# Apply OP3-specific filtering
adata = filter_op3_data(adata)

# Filter by parameters
if par["donor_id"] is not None:
    logger.info(f"Filtering for donor_id: {par['donor_id']}")
    adata = adata[adata.obs["donor_id"] == par["donor_id"]]

if par["cell_type"] is not None:
    logger.info(f"Filtering for cell_type: {par['cell_type']}")
    adata = adata[adata.obs["cell_type"] == par["cell_type"]]

if par["perturbation"] is not None:
    logger.info(f"Filtering for perturbation: {par['perturbation']}")
    adata = adata[adata.obs["perturbation"] == par["perturbation"]]

# Fix splits after reannotation
adata = fix_splits(adata)

# Move X to .layers['counts']
move_x_to_layers(adata)

# Add dataset metadata
add_metadata_to_uns(adata, par)

# Add a feature name
logger.info("Setting .var['feature_name']")

if par["var_feature_name"] == "index":
    adata.var["feature_name"] = adata.var.index
else:
    if par["var_feature_name"] in adata.var:
        adata.var["feature_name"] = adata.var[par["var_feature_name"]]
        del adata.var[par["var_feature_name"]]
    else:
        logger.warning(f"Key '{par['var_feature_name']}' could not be found in adata.var.")

# Print summary and save
print_summary(adata)
write_anndata(adata, par)

logger.info("Done")
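
To make the script's output contract explicit, here is a hedged sketch of how a downstream script might read what the loader writes. It relies only on structures the code above actually produces (counts in .layers, metadata in .uns, feature_name in .var, split/control in .obs); output.h5ad is simply the default output name from the par block, not a fixed path.

# Hypothetical downstream consumer of the loader's output; relies only on
# fields written by the script above.
import anndata as ad

adata = ad.read_h5ad("output.h5ad")

# Raw counts were moved from .X into .layers["counts"] by move_x_to_layers()
counts = adata.layers["counts"]
print(f"counts matrix: {counts.shape}")

# Standardized metadata written by add_metadata_to_uns()
print(adata.uns["dataset_id"], "|", adata.uns["dataset_name"])
print(adata.uns["dataset_reference"], adata.uns["dataset_version"])

# Feature names set from the var index (or the column named by --var_feature_name)
print(adata.var["feature_name"].head())

# Split and control annotations produced by fix_splits()
print(adata.obs["split"].value_counts())
print(int(adata.obs["control"].sum()), "control cells")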
src/datasets/loaders/scrnaseq/op3/test.py
Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
import os
import pytest
import anndata as ad
import numpy as np
import requests
import sys

## VIASH START
meta = {
    'resources_dir': './resources_test/',
    'executable': './target/docker/datasets/loaders/scrnaseq/op3',
    'config': os.path.abspath('./src/datasets/loaders/scrnaseq/op3/config.vsh.yaml')
}
## VIASH END

def test_op3(run_component, tmp_path):
    """Test the OP3 loader."""
    input_file = tmp_path / "input.h5ad"
    output_file = tmp_path / "output.h5ad"

    # download the input file
    url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad"
    response = requests.get(url)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to download file from {url}, status code: {response.status_code}")
    input_file.write_bytes(response.content)

    # run loader
    run_component([
        "--input", str(input_file),
        "--var_feature_name", "index",
        "--donor_id", "1",
        "--cell_type", "T cells",
        "--dataset_id", "test_op3",
        "--dataset_name", "OP3 Test Dataset",
        "--dataset_summary", "Test summary for OP3 dataset",
        "--dataset_description", "Test description for OP3 dataset",
        "--dataset_url", "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad",
        "--output", str(output_file),
        "--output_compression", "gzip"
    ])

    # check whether the output file exists
    assert output_file.exists(), f"Output file {output_file} does not exist"

    adata = ad.read_h5ad(output_file)

    # check obs
    assert not adata.obs.empty, ".obs should not be empty"
    assert "donor_id" in adata.obs.columns
    assert np.all(adata.obs["donor_id"] == "1"), "Not all cells are from donor 1"
    assert "cell_type" in adata.obs.columns
    assert np.all(adata.obs["cell_type"] == "T cells"), "Not all cells are T cells"
    assert "perturbation" in adata.obs.columns
    assert adata.n_obs > 0

    # check var
    assert not adata.var.empty, ".var should not be empty"
    assert adata.n_vars > 0

    # check layers: the loader moves .X into .layers['counts'] and clears .X
    assert "counts" in adata.layers
    assert adata.X is None, ".X should have been moved to .layers['counts']"

    # check uns
    assert adata.uns["dataset_id"] == "test_op3", "Incorrect .uns['dataset_id']"
    assert adata.uns["dataset_name"] == "OP3 Test Dataset", "Incorrect .uns['dataset_name']"
    assert adata.uns["dataset_summary"] == "Test summary for OP3 dataset", "Incorrect .uns['dataset_summary']"
    assert adata.uns["dataset_description"] == "Test description for OP3 dataset", "Incorrect .uns['dataset_description']"

if __name__ == "__main__":
    sys.exit(pytest.main([__file__]))
