Skip to content

Commit 3521aa5

Browse files
rcannoodPaulos2411Olga013
authored
Port OP3 data loader (#2)
Co-authored-by: Paulos2411 <paul.ortulidispflanz@gmail.com> Co-authored-by: Paul <88028027+Paulos2411@users.noreply.github.com> Co-authored-by: Olga Novitskaia <novitskaia.olga.s@gmail.com>
1 parent 4b3a2fc commit 3521aa5

File tree

5 files changed

+682
-0
lines changed

5 files changed

+682
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
__merge__: /src/api/comp_dataset_loader.yaml
2+
name: openproblems_op3
3+
namespace: loaders/scrnaseq
4+
description: |
5+
"Loads and preprocesses the OP3 dataset from GEO accession GSE279945."
6+
7+
argument_groups:
8+
- name: Input
9+
arguments:
10+
- name: "--input"
11+
type: file
12+
description: "Input url to the .h5ad file."
13+
direction: input
14+
required: false
15+
default: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad
16+
example: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad
17+
- name: "--var_feature_name"
18+
type: string
19+
description: "Location of where to find the feature names. Can be set to index if the feature names are the index."
20+
default: index
21+
- name: Data Filtering
22+
description: "Arguments for filtering the dataset"
23+
arguments:
24+
- name: "--donor_id"
25+
type: string
26+
description: "Donor ID to filter for (1, 2, or 3). If not specified, all donors are included."
27+
required: false
28+
- name: "--cell_type"
29+
type: string
30+
description: "Cell type to filter for (T cells, B cells, NK cells, or Myeloid). If not specified, all cell types are included."
31+
required: false
32+
- name: "--perturbation"
33+
type: string
34+
description: "Perturbation to filter for. If not specified, all perturbations are included."
35+
required: false
36+
37+
- name: Dataset Metadata
38+
description: "Metadata about the dataset"
39+
arguments:
40+
- name: "--dataset_id"
41+
type: string
42+
description: "Unique identifier for the dataset"
43+
default: "openproblems_op3"
44+
- name: "--dataset_name"
45+
type: string
46+
description: "Human-readable name for the dataset"
47+
default: "OP3: single-cell multimodal dataset in PBMCs for perturbation prediction benchmarking"
48+
- name: "--dataset_summary"
49+
type: string
50+
description: "Short summary of the dataset"
51+
default: "The Open Problems Perurbation Prediction (OP3) dataset with small molecule perturbations in PBMCs"
52+
- name: "--dataset_description"
53+
type: string
54+
description: "Detailed description of the dataset"
55+
default: "The OP3 dataset is to-date the largest single-cell small molecule perturbation dataset in primary tissue with multiple donor replicates."
56+
- name: "dataset_reference"
57+
type: string
58+
description: "Bibtex reference of the paper in which the dataset was published."
59+
required: false
60+
default: GSE279945
61+
- name: "--dataset_url"
62+
type: string
63+
description: "Link to the original source of the dataset."
64+
required: false
65+
default: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad
66+
67+
resources:
68+
- type: python_script
69+
path: script.py
70+
71+
# test_resources:
72+
# - type: python_script
73+
# path: test.py
74+
75+
engines:
76+
- type: docker
77+
image: openproblems/base_python:1
78+
test_setup:
79+
- type: python
80+
packages:
81+
- viashpy
82+
83+
runners:
84+
- type: executable
85+
- type: nextflow
86+
directives:
87+
label: [highmem, midcpu, midtime]
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
import anndata as ad
2+
import numpy as np
3+
import logging
4+
5+
logger = logging.getLogger(__name__)
6+
logging.basicConfig(level=logging.INFO)
7+
8+
## VIASH START
9+
par = {
10+
"input": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad",
11+
"var_feature_name": "index",
12+
"data_type": "sc",
13+
"donor_id": None,
14+
"cell_type": None,
15+
"perturbation": None,
16+
"dataset_id": "op3",
17+
"dataset_name": "OP3: single-cell multimodal dataset in PBMCs for perturbation prediction benchmarking",
18+
"dataset_summary": "The Open Problems Perurbation Prediction (OP3) dataset with small molecule perturbations in PBMCs",
19+
"dataset_description": "The OP3 dataset is to-date the largest single-cell small molecule perturbation dataset in primary tissue with multiple donor replicates.",
20+
"output": "output.h5ad",
21+
"output_compression": "gzip"
22+
}
23+
meta = {}
24+
## VIASH END
25+
26+
def filter_op3_data(adata):
27+
"""
28+
Filter the OP3 dataset based on specific criteria for each small molecule and cell type.
29+
"""
30+
logger.info("Applying OP3-specific filtering criteria")
31+
32+
# Original filter code follows...
33+
34+
# Create a boolean mask for filtering observations
35+
obs_filt = np.ones(adata.n_obs, dtype=bool)
36+
37+
# Alvocidib only T cells in only 2 donors, remove
38+
obs_filt = obs_filt & (adata.obs['sm_name'] != "Alvocidib")
39+
40+
# BMS-387032 - one donor with only T cells, two other consistent, but only 2 cell types
41+
# Leave the 2 cell types in, remove donor 2 with only T cells
42+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "BMS-387032") & (adata.obs['donor_id'] == "Donor 2"))
43+
44+
# BMS-387032 remove myeloid cells and B cells
45+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "BMS-387032") &
46+
adata.obs['cell_type'].isin(["B cells", "Myeloid cells"]))
47+
48+
# CGP 60474 has only T cells left, remove
49+
obs_filt = obs_filt & (adata.obs['sm_name'] != "CGP 60474")
50+
51+
# Canertinib - the variation of Myeloid cell proportions is very large, skip Myeloid
52+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Canertinib") &
53+
(adata.obs['cell_type'] == "Myeloid cells"))
54+
55+
# Foretinib - large variation in Myeloid cell proportions (some in T cells), skip Myeloid
56+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Foretinib") &
57+
(adata.obs['cell_type'] == "Myeloid cells"))
58+
59+
# Ganetespib (STA-9090) - donor 2 has no Myeloid and small NK cells proportions
60+
# Skip Myeloid, remove donor 2
61+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Ganetespib (STA-9090)") &
62+
(adata.obs['donor_id'] == "Donor 2"))
63+
64+
# IN1451 - donor 2 has no NK or B, remove Donor 2
65+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "IN1451") &
66+
(adata.obs['donor_id'] == "Donor 2"))
67+
68+
# Navitoclax - donor 3 doesn't have B cells and has different T and Myeloid proportions
69+
# Remove donor 3
70+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Navitoclax") &
71+
(adata.obs['donor_id'] == "Donor 3"))
72+
73+
# PF-04691502 remove Myeloid (only present in donor 3)
74+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "PF-04691502") &
75+
(adata.obs['cell_type'] == "Myeloid cells"))
76+
77+
# Proscillaridin A;Proscillaridin-A remove Myeloid, since the variation is very high (4x)
78+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Proscillaridin A;Proscillaridin-A") &
79+
(adata.obs['cell_type'] == "Myeloid cells"))
80+
81+
# R428 - skip NK due to high variation (close to 3x)
82+
obs_filt = obs_filt & ~((adata.obs['sm_name'] == "R428") &
83+
(adata.obs['cell_type'] == "NK cells"))
84+
85+
# UNII-BXU45ZH6LI - remove due to large variation across all cell types and missing cell types
86+
obs_filt = obs_filt & (adata.obs['sm_name'] != "UNII-BXU45ZH6LI")
87+
88+
# Apply the filter
89+
filtered_adata = adata[obs_filt].copy()
90+
91+
logger.info(f"Filtered data from {adata.n_obs} to {filtered_adata.n_obs} cells")
92+
93+
return filtered_adata
94+
95+
def fix_splits(adata):
96+
"""Fix splits after reannotation."""
97+
logger.info("Fix splits after reannotation")
98+
adata.obs["cell_type_orig_updated"] = adata.obs["cell_type_orig"].apply(lambda x: "T cells" if x.startswith("T ") else x)
99+
adata.obs["sm_cell_type_orig"] = adata.obs["sm_name"].astype(str) + "_" + adata.obs["cell_type_orig_updated"].astype(str)
100+
mapping_to_split = adata.obs.groupby("sm_cell_type_orig")["split"].apply(lambda x: x.unique()[0]).to_dict()
101+
adata.obs["sm_cell_type"] = adata.obs["sm_name"].astype(str) + "_" + adata.obs["cell_type"].astype(str)
102+
adata.obs["split"] = adata.obs["sm_cell_type"].map(mapping_to_split)
103+
adata.obs['control'] = adata.obs['split'].eq("control")
104+
return adata
105+
106+
107+
def move_x_to_layers(adata):
108+
"""Move .X to .layers['counts'] and set X to None."""
109+
logger.info("Moving .X to .layers['counts']")
110+
adata.layers["counts"] = adata.X.copy()
111+
adata.X = None
112+
113+
def add_metadata_to_uns(adata, par):
114+
"""Add standardized metadata to .uns."""
115+
logger.info("Adding metadata to .uns")
116+
adata.uns['dataset_id'] = par["dataset_id"]
117+
adata.uns['dataset_name'] = par["dataset_name"]
118+
adata.uns['dataset_summary'] = par["dataset_summary"]
119+
adata.uns['dataset_description'] = par["dataset_description"]
120+
adata.uns['dataset_organism'] = 'Homo sapiens'
121+
adata.uns['dataset_url'] = par["dataset_url"]
122+
adata.uns['dataset_reference'] = 'GSE279945'
123+
adata.uns['dataset_version'] = '1.0.0'
124+
adata.uns['processing_status'] = 'processed'
125+
126+
def print_unique(adata, column):
127+
"""Print unique values in a column."""
128+
if column in adata.obs:
129+
values = adata.obs[column].unique()
130+
if len(values) <= 10:
131+
formatted = "', '".join(values)
132+
logger.info(f"Unique {column}: ['{formatted}']")
133+
else:
134+
logger.info(f"Unique {column}: {len(values)} values")
135+
136+
def print_summary(adata):
137+
"""Print a summary of the dataset."""
138+
logger.info(f"Dataset shape: {adata.shape}")
139+
logger.info(f"Number of cells: {adata.n_obs}")
140+
logger.info(f"Number of genes: {adata.n_vars}")
141+
142+
# Print unique values for key columns
143+
for column in ['donor_id', 'cell_type', 'perturbation']:
144+
print_unique(adata, column)
145+
146+
logger.info(f"Layers: {list(adata.layers.keys())}")
147+
logger.info(f"Metadata: {list(adata.uns.keys())}")
148+
149+
def write_anndata(adata, par):
150+
"""Write AnnData object to file."""
151+
logger.info(f"Writing AnnData object to '{par['output']}'")
152+
adata.write_h5ad(par["output"], compression=par["output_compression"])
153+
154+
# Instead of defining main() and calling it at the end, write the code directly
155+
logger.info("Starting OP3 loader")
156+
# Load the data
157+
logger.info(f"Loading data at {par['input']}")
158+
adata = ad.read_h5ad(par["input"])
159+
160+
# Apply OP3-specific filtering
161+
adata = filter_op3_data(adata)
162+
# Filter by parameters
163+
if par["donor_id"] is not None:
164+
logger.info(f"Filtering for donor_id: {par['donor_id']}")
165+
adata = adata[adata.obs["donor_id"] == par["donor_id"]]
166+
167+
if par["cell_type"] is not None:
168+
logger.info(f"Filtering for cell_type: {par['cell_type']}")
169+
adata = adata[adata.obs["cell_type"] == par["cell_type"]]
170+
171+
if par["perturbation"] is not None:
172+
logger.info(f"Filtering for perturbation: {par['perturbation']}")
173+
adata = adata[adata.obs["perturbation"] == par["perturbation"]]
174+
175+
# Fix splits after reannotation
176+
fix_splits(adata)
177+
178+
# Move X to layers and normalize
179+
move_x_to_layers(adata)
180+
181+
# Add dataset metadata
182+
add_metadata_to_uns(adata, par)
183+
184+
# Add a feature name
185+
logger.info("Setting .var['feature_name']")
186+
187+
if par["var_feature_name"] == "index":
188+
adata.var["feature_name"] = adata.var.index
189+
else:
190+
if par["var_feature_name"] in adata.var:
191+
adata.var["feature_name"] = adata.var[par["feature_name"]]
192+
del adata.var[par["feature_name"]]
193+
else:
194+
logger.info(f"Warning: key '{par['var_feature_name']}' could not be found in adata.var.")
195+
196+
# Print summary and save
197+
print_summary(adata)
198+
write_anndata(adata, par)
199+
200+
logger.info("Done")
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
2+
import os
3+
import pytest
4+
import anndata as ad
5+
import numpy as np
6+
import requests
7+
import sys
8+
9+
## VIASH START
10+
meta = {
11+
'resources_dir': './resources_test/',
12+
'executable': './target/docker/datasets/loaders/scrnaseq/op3',
13+
'config': os.path.abspath('./src/datasets/loaders/scrnaseq/op3/config.vsh.yaml')
14+
}
15+
## VIASH END
16+
17+
def test_op3(run_component, tmp_path):
18+
"""Test the OP3 loader."""
19+
input_file = tmp_path / "input.h5ad" # Convert to string to be safe
20+
output_file = tmp_path / "output.h5ad" # Convert to string to be safe
21+
22+
# download file
23+
url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad"
24+
response = requests.get(url)
25+
if response.status_code != 200:
26+
raise RuntimeError(f"Failed to download file from {url}, status code: {response.status_code}")
27+
28+
# run loader
29+
run_component([
30+
"--input", str(input_file),
31+
"--var_feature_name", "index",
32+
"--donor_id", "1",
33+
"--cell_type", "T cells",
34+
"--dataset_id", "test_op3",
35+
"--dataset_name", "OP3 Test Dataset",
36+
"--dataset_summary", "Test summary for OP3 dataset",
37+
"--dataset_description", "Test description for OP3 dataset",
38+
"--dataset_url", "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad",
39+
"--output", str(output_file),
40+
"--output_compression", "gzip"
41+
])
42+
43+
# check whether file exists
44+
assert output_file.exists(), f"Output file {output_file} does not exist"
45+
46+
adata = ad.read_h5ad(output_file)
47+
48+
# check obs
49+
assert not adata.obs.empty, ".obs should not be empty"
50+
assert "donor_id" in adata.obs.columns
51+
assert np.all(adata.obs["donor_id"] == "1"), "Not all cells are from donor 1"
52+
assert "cell_type" in adata.obs.columns
53+
assert np.all(adata.obs["cell_type"] == "T cells"), "Not all cells are T cells"
54+
assert "perturbation" in adata.obs.columns
55+
assert adata.n_obs > 0
56+
57+
# check var
58+
assert not adata.var.empty, ".var should not be empty"
59+
assert adata.n_vars > 0
60+
61+
# check layers
62+
assert "counts" in adata.layers
63+
assert adata.X is not None, "X matrix should not be None"
64+
65+
# check uns
66+
assert adata.uns["dataset_id"] == "test_op3", "Incorrect .uns['dataset_id']"
67+
assert adata.uns["dataset_name"] == "OP3 Test Dataset", "Incorrect .uns['dataset_name']"
68+
assert adata.uns["dataset_summary"] == "Test summary for OP3 dataset", "Incorrect .uns['dataset_summary']"
69+
assert adata.uns["dataset_description"] == "Test description for OP3 dataset", "Incorrect .uns['dataset_description']"
70+
71+
if __name__ == "__main__":
72+
sys.exit(pytest.main([__file__]))

0 commit comments

Comments
 (0)