
Commit c315f4a

automated tests no longer dependent on Zenodo

1 parent 4c0ae3e

7 files changed: +101 additions, −36 deletions

drevalpy/datasets/loader.py

Lines changed: 41 additions & 3 deletions

@@ -8,7 +8,15 @@
 
 from .curvecurator import fit_curves
 from .dataset import DrugResponseDataset
-from .utils import ALLOWED_MEASURES, CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER, TISSUE_IDENTIFIER, download_dataset
+from .utils import (
+    ALLOWED_MEASURES,
+    CELL_LINE_IDENTIFIER,
+    DRUG_IDENTIFIER,
+    TISSUE_IDENTIFIER,
+    download_dataset,
+    download_from_url,
+    unzip_data,
+)
 
 
 def check_measure(measure_queried: str, measures_data: list[str], dataset_name: str) -> None:

@@ -46,7 +54,8 @@ def _load_zenodo_dataset(
     path = os.path.join(path_data, dataset_name, file_name)
     if not os.path.exists(path):
         download_dataset(dataset_name, path_data, redownload=True)
-    meta_path = os.path.join(path_data, "meta")
+    # tissue mapping is not in TOY play dataset
+    meta_path = os.path.join(path_data, "meta", "tissue_mapping.csv")
     if not os.path.exists(meta_path):
         download_dataset("meta", path_data, redownload=True)

@@ -112,6 +121,35 @@ def load_ccle(
     return _load_zenodo_dataset(path_data=path_data, measure=measure, file_name="CCLE.csv", dataset_name="CCLE")
 
 
+def _load_test_data(
+    path_data: str = "data", measure: str = "LN_IC50_curvecurator", dataset_name: str = "TOYv1"
+) -> DrugResponseDataset:
+    test_data_path = "https://github.com/JudithBernett/test-datasets/raw/drugresponseeval/test_data"
+    # first get meta
+    meta_path = os.path.join(path_data, "meta")
+    if not os.path.exists(meta_path):
+        file_url = f"{test_data_path}/meta.zip"
+        file_path = Path(path_data) / "meta.zip"
+        response_meta = download_from_url(dataset_name="meta", file_url=file_url)
+        unzip_data(path_to_zip=file_path, response=response_meta, data_path=path_data)
+    file_url = f"{test_data_path}/{dataset_name}.zip"
+    file_path = Path(path_data) / f"{dataset_name}.zip"
+    response = download_from_url(dataset_name=dataset_name, file_url=file_url)
+    unzip_data(path_to_zip=file_path, response=response, data_path=path_data)
+
+    file_name = Path(path_data) / dataset_name / f"{dataset_name}.csv"
+    response_data = pd.read_csv(file_name, dtype={"pubchem_id": str, "cell_line_name": str})
+    response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "")
+    check_measure(measure, list(response_data.columns), dataset_name)
+    return DrugResponseDataset(
+        response=response_data[measure].values,
+        cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
+        drug_ids=response_data[DRUG_IDENTIFIER].values,
+        tissues=response_data[TISSUE_IDENTIFIER].values,
+        dataset_name=dataset_name,
+    )
+
+
 def load_toyv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
     """
     Loads small Toy dataset, subsampled from CTRPv2.

@@ -121,7 +159,7 @@ def load_toyv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
-    return _load_zenodo_dataset(path_data=path_data, measure=measure, file_name="TOYv1.csv", dataset_name="TOYv1")
+    return _load_test_data(path_data=path_data, measure=measure, dataset_name="TOYv1")
 
 
 def load_toyv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
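
Taken together, these changes reroute the toy datasets from Zenodo to the GitHub test-datasets repository. A minimal usage sketch, assuming only the public loader signature shown in this diff:

from drevalpy.datasets.loader import load_toyv1

# The first call downloads meta.zip and TOYv1.zip from the test-datasets
# repository, unzips them under "data", and parses TOYv1.csv.
dataset = load_toyv1(path_data="data", measure="LN_IC50_curvecurator")
print(dataset)  # DrugResponseDataset built from the downloaded TOYv1.csv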

drevalpy/datasets/map_tissues.py

Lines changed: 1 addition & 2 deletions

@@ -407,8 +407,7 @@ def main():
 
     tissue_map = _apply_manual_cell_line_corrections(tissue_map)
 
-    final.loc[:, "tissue"] = final.loc[:, "cellosaurus_id"].map(tissue_map)
-    final = final.copy()
+    final = final.assign(tissue=final["cellosaurus_id"].map(tissue_map))
     if save_tissue_mapping:
         final.drop_duplicates(subset="cellosaurus_id", inplace=True)
         tissue_mapping_path = os.path.join(data_path, "meta", "tissue_mapping.csv")
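
For context, `assign` returns a new DataFrame, so it replaces the `.loc` write plus the defensive `.copy()` in one step and avoids pandas' SettingWithCopyWarning when `final` is a slice of another frame. A standalone sketch with toy values, not from the repository:

import pandas as pd

tissue_map = {"CVCL_0031": "breast", "CVCL_0023": "lung"}  # toy mapping
final = pd.DataFrame({"cellosaurus_id": ["CVCL_0031", "CVCL_0023"]})

# assign builds a fresh frame with the new "tissue" column attached
final = final.assign(tissue=final["cellosaurus_id"].map(tissue_map))
print(final)  # two rows, columns: cellosaurus_id, tissue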

drevalpy/datasets/utils.py

Lines changed: 39 additions & 20 deletions

@@ -8,6 +8,7 @@
 import networkx as nx
 import numpy as np
 import requests
+from requests import Response
 
 # DRUG_IDENTIFIER, CELL_LINE_IDENTIFIER, and TISSUE_IDENTIFIER are used in pipeline
 DRUG_IDENTIFIER = "pubchem_id"

@@ -17,6 +18,40 @@
 ALLOWED_MEASURES.extend([f"{m}_curvecurator" for m in ALLOWED_MEASURES])
 
 
+def unzip_data(path_to_zip: Path, response: Response, data_path: str):
+    """
+    Unzips the downloaded data.
+
+    :param path_to_zip: Path to the zip file to be unzipped.
+    :param response: HTTP response containing response.content
+    :param data_path: Where the unzipped directory should be stored
+    """
+    with open(path_to_zip, "wb") as f:
+        f.write(response.content)
+
+    with zipfile.ZipFile(path_to_zip, "r") as z:
+        for member in z.infolist():
+            if not member.filename.startswith("__MACOSX/"):
+                z.extract(member, os.path.join(data_path))
+    path_to_zip.unlink()  # Remove zip file after extraction
+
+
+def download_from_url(dataset_name: str, file_url: str) -> Response:
+    """
+    Download a file from a given URL.
+
+    :param dataset_name: how the dataset is called
+    :param file_url: exact URL to the zip file
+    :return: HTTP response containing response.content
+    :raises HTTPError: if the download fails
+    """
+    print(f"Downloading {dataset_name} from {file_url}...")
+    response = requests.get(file_url, timeout=120)
+    if response.status_code != 200:
+        raise requests.exceptions.HTTPError(f"Error downloading file: {response.status_code}")
+    return response
+
+
 def download_dataset(
     dataset_name: str,
     data_path: str | Path = "data",

@@ -40,15 +75,11 @@ def download_dataset(
     else:
         url = "https://zenodo.org/doi/10.5281/zenodo.12633909"
         # Fetch the latest record
-        headers = {
-            "User-Agent": "curl/8.5.0",
-            "Accept": "application/json",
-        }
-        response = requests.get(url, timeout=timeout, headers=headers)
+        response = requests.get(url, timeout=timeout)
         if response.status_code != 200:
             raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
         latest_url = response.links["linkset"]["url"]
-        response = requests.get(latest_url, timeout=timeout, headers=headers)
+        response = requests.get(latest_url, timeout=timeout)
         if response.status_code != 200:
             raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
         data = response.json()

@@ -59,21 +90,9 @@ def download_dataset(
         # Download each file
         name_to_url = {file["key"]: file["links"]["self"] for file in data["files"]}
         file_url = name_to_url[file_name]
-        # Download the file
-        print(f"Downloading {dataset_name} from {file_url}...")
-        response = requests.get(file_url, timeout=timeout)
-        if response.status_code != 200:
-            raise requests.exceptions.HTTPError(f"Error downloading file {dataset_name}: {response.status_code}")
-
-        # Save the file
-        with open(file_path, "wb") as f:
-            f.write(response.content)
 
-        with zipfile.ZipFile(file_path, "r") as z:
-            for member in z.infolist():
-                if not member.filename.startswith("__MACOSX/"):
-                    z.extract(member, os.path.join(data_path))
-        file_path.unlink()  # Remove zip file after extraction
+        response = download_from_url(dataset_name=dataset_name, file_url=file_url)
+        unzip_data(path_to_zip=file_path, response=response, data_path=data_path)
 
         print(f"{dataset_name} data downloaded and extracted to {data_path}")

drevalpy/models/SimpleNeuralNetwork/multiomics_neural_network.py

Lines changed: 9 additions & 1 deletion

@@ -40,6 +40,7 @@ def __init__(self):
         self.hyperparameters = None
         self.methylation_scaler = StandardScaler()
         self.methylation_pca = None
+        self.pca_ncomp = 100
         self.gene_expression_scaler = StandardScaler()
 
     @classmethod

@@ -62,7 +63,7 @@ def build_model(self, hyperparameters: dict):
             methylation_pca_components.
         """
         self.hyperparameters = hyperparameters
-        self.methylation_pca = PCA(n_components=hyperparameters["methylation_pca_components"])
+        self.pca_ncomp = hyperparameters["methylation_pca_components"]
 
     def train(
         self,

@@ -84,6 +85,13 @@ def train(
         """
         if drug_input is None:
             raise ValueError("Drug input (fingerprints) is needed for the MultiOmicsNeuralNetwork model.")
+        first_feature = next(iter(cell_line_input.features.values()))
+        n_met_features = first_feature["methylation"].shape[0]
+        if n_met_features > self.pca_ncomp:
+            self.methylation_pca = PCA(n_components=self.pca_ncomp)
+        else:
+            self.methylation_pca = PCA(n_components=n_met_features)
+
         cell_line_input = prepare_expression_and_methylation(
             cell_line_input=cell_line_input,
             cell_line_ids=np.unique(output.cell_line_ids),
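
The new guard delays PCA construction until the methylation dimensionality is known, because scikit-learn's PCA raises a ValueError when n_components exceeds the number of available features or samples, and the small toy datasets can carry fewer than the default 100 methylation features. A standalone sketch of the guard on synthetic data, not the model code:

import numpy as np
from sklearn.decomposition import PCA

pca_ncomp = 100
methylation = np.random.rand(50, 20)  # toy matrix: only 20 features

# Same clamp as in the diff: never request more components than features
n_features = methylation.shape[1]
pca = PCA(n_components=min(pca_ncomp, n_features))
reduced = pca.fit_transform(methylation)
print(reduced.shape)  # (50, 20)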

drevalpy/models/baselines/multi_omics_random_forest.py

Lines changed: 6 additions & 1 deletion

@@ -30,6 +30,7 @@ def __init__(self):
        """
        super().__init__()
        self.pca = None
+       self.pca_ncomp = 100
 
    @classmethod
    def get_model_name(cls) -> str:

@@ -47,7 +48,7 @@ def build_model(self, hyperparameters: dict):
        :param hyperparameters: Hyperparameters for the model.
        """
        super().build_model(hyperparameters)
-       self.pca = PCA(n_components=hyperparameters["n_components"])
+       self.pca_ncomp = hyperparameters["n_components"]
 
    def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureDataset:
        """

@@ -104,6 +105,10 @@ def train(
            inputs["fingerprints"],
        )
 
+       if methylation.shape[1] > self.pca_ncomp:
+           self.pca = PCA(n_components=self.pca_ncomp)
+       else:
+           self.pca = PCA(n_components=methylation.shape[1])
        methylation = self.pca.fit_transform(methylation)
 
        x = np.concatenate(
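
The random forest applies the same clamp at training time, once the methylation matrix is known; the if/else is equivalent to a `min()` one-liner, sketched here as a hypothetical helper (`make_pca` is not in the repository):

from sklearn.decomposition import PCA

def make_pca(pca_ncomp: int, n_features: int) -> PCA:
    """Hypothetical helper: cap the PCA components at the feature count."""
    return PCA(n_components=min(pca_ncomp, n_features))

# e.g. make_pca(100, methylation.shape[1]) before fit_transform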

tests/models/conftest.py

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ def cross_study_dataset() -> DrugResponseDataset:
 
     :returns: drug_response, cell_line_input, drug_input
     """
-    path_data = "../data"
+    path_data = os.path.join("..", "data")
    drug_response = load_toyv2(path_data)
    drug_response.remove_nan_responses()
    return drug_response
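
os.path.join builds the relative path with the platform's separator, so the fixture also resolves on Windows; this assumes `os` is imported at the top of conftest.py. A quick illustration:

import os

# "../data" hard-codes "/"; os.path.join adapts to the platform:
# "..\\data" on Windows, "../data" elsewhere.
path_data = os.path.join("..", "data")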

tests/test_available_data.py

Lines changed: 4 additions & 8 deletions

@@ -1,7 +1,5 @@
 """Tests for the available datasets."""
 
-import requests
-
 from drevalpy.datasets import AVAILABLE_DATASETS
 
 

@@ -19,6 +17,7 @@ def test_factory() -> None:
     assert len(AVAILABLE_DATASETS) == 9
 
 
+'''
 def test_datasets():
     """Test whether the datasets exist on Zenodo."""
     zenodo_doi_url = "https://zenodo.org/doi/10.5281/zenodo.12633909"

@@ -34,20 +33,17 @@ def test_datasets():
         "PDX_Bruna.zip",
         "meta.zip",
     }
-    headers = {
-        "User-Agent": "curl/8.5.0",
-        "Accept": "application/json",
-    }
 
-    response = requests.get(zenodo_doi_url, headers=headers, timeout=30)
+    response = requests.get(zenodo_doi_url, timeout=30)
     response.raise_for_status()
 
     latest_url = response.links["linkset"]["url"]
-    response = requests.get(latest_url, headers=headers, timeout=30)
+    response = requests.get(latest_url, timeout=30)
     response.raise_for_status()
 
     data = response.json()
     zenodo_files = {f["key"] for f in data["files"]}
 
     missing = expected_files - zenodo_files
     assert not missing, f"Missing files on Zenodo: {missing}"
+'''
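
Wrapping the test in a triple-quoted string removes it from pytest collection entirely. A common alternative, sketched below under the assumption the body stays unchanged, is a skip marker that keeps the test visible in reports:

import pytest

@pytest.mark.skip(reason="automated tests no longer depend on Zenodo")
def test_datasets():
    """Test whether the datasets exist on Zenodo."""
    ...  # body as above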
