
Commit c315f4a

automated tests no longer dependent on Zenodo

1 parent 4c0ae3e

7 files changed: +101 additions, −36 deletions

drevalpy/datasets/loader.py

Lines changed: 41 additions & 3 deletions

@@ -8,7 +8,15 @@
 
 from .curvecurator import fit_curves
 from .dataset import DrugResponseDataset
-from .utils import ALLOWED_MEASURES, CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER, TISSUE_IDENTIFIER, download_dataset
+from .utils import (
+    ALLOWED_MEASURES,
+    CELL_LINE_IDENTIFIER,
+    DRUG_IDENTIFIER,
+    TISSUE_IDENTIFIER,
+    download_dataset,
+    download_from_url,
+    unzip_data,
+)
 
 
 def check_measure(measure_queried: str, measures_data: list[str], dataset_name: str) -> None:

@@ -46,7 +54,8 @@ def _load_zenodo_dataset(
     path = os.path.join(path_data, dataset_name, file_name)
     if not os.path.exists(path):
         download_dataset(dataset_name, path_data, redownload=True)
-    meta_path = os.path.join(path_data, "meta")
+    # tissue mapping is not in TOY play dataset
+    meta_path = os.path.join(path_data, "meta", "tissue_mapping.csv")
     if not os.path.exists(meta_path):
         download_dataset("meta", path_data, redownload=True)

@@ -112,6 +121,35 @@ def load_ccle(
     return _load_zenodo_dataset(path_data=path_data, measure=measure, file_name="CCLE.csv", dataset_name="CCLE")
 
 
+def _load_test_data(
+    path_data: str = "data", measure: str = "LN_IC50_curvecurator", dataset_name: str = "TOYv1"
+) -> DrugResponseDataset:
+    test_data_path = "https://github.com/JudithBernett/test-datasets/raw/drugresponseeval/test_data"
+    # first get meta
+    meta_path = os.path.join(path_data, "meta")
+    if not os.path.exists(meta_path):
+        file_url = f"{test_data_path}/meta.zip"
+        file_path = Path(path_data) / "meta.zip"
+        response_meta = download_from_url(dataset_name="meta", file_url=file_url)
+        unzip_data(path_to_zip=file_path, response=response_meta, data_path=path_data)
+    file_url = f"{test_data_path}/{dataset_name}.zip"
+    file_path = Path(path_data) / f"{dataset_name}.zip"
+    response = download_from_url(dataset_name=dataset_name, file_url=file_url)
+    unzip_data(path_to_zip=file_path, response=response, data_path=path_data)
+
+    file_name = Path(path_data) / dataset_name / f"{dataset_name}.csv"
+    response_data = pd.read_csv(file_name, dtype={"pubchem_id": str, "cell_line_name": str})
+    response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "")
+    check_measure(measure, list(response_data.columns), dataset_name)
+    return DrugResponseDataset(
+        response=response_data[measure].values,
+        cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
+        drug_ids=response_data[DRUG_IDENTIFIER].values,
+        tissues=response_data[TISSUE_IDENTIFIER].values,
+        dataset_name=dataset_name,
+    )
+
+
 def load_toyv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
     """
     Loads small Toy dataset, subsampled from CTRPv2.

@@ -121,7 +159,7 @@ def load_toyv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
-    return _load_zenodo_dataset(path_data=path_data, measure=measure, file_name="TOYv1.csv", dataset_name="TOYv1")
+    return _load_test_data(path_data=path_data, measure=measure, dataset_name="TOYv1")
 
 
 def load_toyv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
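
Taken together, these changes reroute the toy datasets from Zenodo to the GitHub test-datasets repository. A minimal usage sketch, assuming only the public loader signature shown in this diff:

from drevalpy.datasets.loader import load_toyv1

# The first call downloads meta.zip and TOYv1.zip from the test-datasets
# repository, unzips them under "data", and parses TOYv1.csv.
dataset = load_toyv1(path_data="data", measure="LN_IC50_curvecurator")
print(dataset)  # DrugResponseDataset built from the downloaded TOYv1.csv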

drevalpy/datasets/map_tissues.py

Lines changed: 1 addition & 2 deletions

@@ -407,8 +407,7 @@ def main():
 
     tissue_map = _apply_manual_cell_line_corrections(tissue_map)
 
-    final.loc[:, "tissue"] = final.loc[:, "cellosaurus_id"].map(tissue_map)
-    final = final.copy()
+    final = final.assign(tissue=final["cellosaurus_id"].map(tissue_map))
     if save_tissue_mapping:
         final.drop_duplicates(subset="cellosaurus_id", inplace=True)
         tissue_mapping_path = os.path.join(data_path, "meta", "tissue_mapping.csv")
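
For context, `assign` returns a new DataFrame, so it replaces the `.loc` write plus the defensive `.copy()` in one step and avoids pandas' SettingWithCopyWarning when `final` is a slice of another frame. A standalone sketch with toy values, not from the repository:

import pandas as pd

tissue_map = {"CVCL_0031": "breast", "CVCL_0023": "lung"}  # toy mapping
final = pd.DataFrame({"cellosaurus_id": ["CVCL_0031", "CVCL_0023"]})

# assign builds a fresh frame with the new "tissue" column attached
final = final.assign(tissue=final["cellosaurus_id"].map(tissue_map))
print(final)  # two rows, columns: cellosaurus_id, tissue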

drevalpy/datasets/utils.py

Lines changed: 39 additions & 20 deletions

@@ -8,6 +8,7 @@
 import networkx as nx
 import numpy as np
 import requests
+from requests import Response
 
 # DRUG_IDENTIFIER, CELL_LINE_IDENTIFIER, and TISSUE_IDENTIFIER are used in pipeline
 DRUG_IDENTIFIER = "pubchem_id"

@@ -17,6 +18,40 @@
 ALLOWED_MEASURES.extend([f"{m}_curvecurator" for m in ALLOWED_MEASURES])
 
 
+def unzip_data(path_to_zip: Path, response: Response, data_path: str):
+    """
+    Unzips the downloaded data.
+
+    :param path_to_zip: Path to the zip file to be unzipped.
+    :param response: HTTP response containing response.content
+    :param data_path: Where the unzipped directory should be stored
+    """
+    with open(path_to_zip, "wb") as f:
+        f.write(response.content)
+
+    with zipfile.ZipFile(path_to_zip, "r") as z:
+        for member in z.infolist():
+            if not member.filename.startswith("__MACOSX/"):
+                z.extract(member, os.path.join(data_path))
+    path_to_zip.unlink()  # Remove zip file after extraction
+
+
+def download_from_url(dataset_name: str, file_url: str) -> Response:
+    """
+    Download a file from a given URL.
+
+    :param dataset_name: how the dataset is called
+    :param file_url: exact URL to the zip file
+    :return: HTTP response containing response.content
+    :raises HTTPError: if the download fails
+    """
+    print(f"Downloading {dataset_name} from {file_url}...")
+    response = requests.get(file_url, timeout=120)
+    if response.status_code != 200:
+        raise requests.exceptions.HTTPError(f"Error downloading file: {response.status_code}")
+    return response
+
+
 def download_dataset(
     dataset_name: str,
     data_path: str | Path = "data",

@@ -40,15 +75,11 @@ def download_dataset(
     else:
         url = "https://zenodo.org/doi/10.5281/zenodo.12633909"
         # Fetch the latest record
-        headers = {
-            "User-Agent": "curl/8.5.0",
-            "Accept": "application/json",
-        }
-        response = requests.get(url, timeout=timeout, headers=headers)
+        response = requests.get(url, timeout=timeout)
         if response.status_code != 200:
             raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
         latest_url = response.links["linkset"]["url"]
-        response = requests.get(latest_url, timeout=timeout, headers=headers)
+        response = requests.get(latest_url, timeout=timeout)
         if response.status_code != 200:
             raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
         data = response.json()

@@ -59,21 +90,9 @@ def download_dataset(
         # Download each file
         name_to_url = {file["key"]: file["links"]["self"] for file in data["files"]}
         file_url = name_to_url[file_name]
-        # Download the file
-        print(f"Downloading {dataset_name} from {file_url}...")
-        response = requests.get(file_url, timeout=timeout)
-        if response.status_code != 200:
-            raise requests.exceptions.HTTPError(f"Error downloading file {dataset_name}: {response.status_code}")
-
-        # Save the file
-        with open(file_path, "wb") as f:
-            f.write(response.content)
 
-        with zipfile.ZipFile(file_path, "r") as z:
-            for member in z.infolist():
-                if not member.filename.startswith("__MACOSX/"):
-                    z.extract(member, os.path.join(data_path))
-        file_path.unlink()  # Remove zip file after extraction
+        response = download_from_url(dataset_name=dataset_name, file_url=file_url)
+        unzip_data(path_to_zip=file_path, response=response, data_path=data_path)
 
         print(f"{dataset_name} data downloaded and extracted to {data_path}")

drevalpy/models/SimpleNeuralNetwork/multiomics_neural_network.py

Lines changed: 9 additions & 1 deletion

@@ -40,6 +40,7 @@ def __init__(self):
         self.hyperparameters = None
         self.methylation_scaler = StandardScaler()
         self.methylation_pca = None
+        self.pca_ncomp = 100
         self.gene_expression_scaler = StandardScaler()
 
     @classmethod

@@ -62,7 +63,7 @@ def build_model(self, hyperparameters: dict):
             methylation_pca_components.
         """
         self.hyperparameters = hyperparameters
-        self.methylation_pca = PCA(n_components=hyperparameters["methylation_pca_components"])
+        self.pca_ncomp = hyperparameters["methylation_pca_components"]
 
     def train(
         self,

@@ -84,6 +85,13 @@ def train(
         """
         if drug_input is None:
             raise ValueError("Drug input (fingerprints) is needed for the MultiOmicsNeuralNetwork model.")
+        first_feature = next(iter(cell_line_input.features.values()))
+        n_met_features = first_feature["methylation"].shape[0]
+        if n_met_features > self.pca_ncomp:
+            self.methylation_pca = PCA(n_components=self.pca_ncomp)
+        else:
+            self.methylation_pca = PCA(n_components=n_met_features)
+
         cell_line_input = prepare_expression_and_methylation(
             cell_line_input=cell_line_input,
             cell_line_ids=np.unique(output.cell_line_ids),
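
The new guard delays PCA construction until the methylation dimensionality is known, because scikit-learn's PCA raises a ValueError when n_components exceeds the number of available features or samples, and the small toy datasets can carry fewer than the default 100 methylation features. A standalone sketch of the guard on synthetic data, not the model code:

import numpy as np
from sklearn.decomposition import PCA

pca_ncomp = 100
methylation = np.random.rand(50, 20)  # toy matrix: only 20 features

# Same clamp as in the diff: never request more components than features
n_features = methylation.shape[1]
pca = PCA(n_components=min(pca_ncomp, n_features))
reduced = pca.fit_transform(methylation)
print(reduced.shape)  # (50, 20)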

drevalpy/models/baselines/multi_omics_random_forest.py

Lines changed: 6 additions & 1 deletion

@@ -30,6 +30,7 @@ def __init__(self):
        """
        super().__init__()
        self.pca = None
+       self.pca_ncomp = 100
 
    @classmethod
    def get_model_name(cls) -> str:

@@ -47,7 +48,7 @@ def build_model(self, hyperparameters: dict):
        :param hyperparameters: Hyperparameters for the model.
        """
        super().build_model(hyperparameters)
-       self.pca = PCA(n_components=hyperparameters["n_components"])
+       self.pca_ncomp = hyperparameters["n_components"]
 
    def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureDataset:
        """

@@ -104,6 +105,10 @@ def train(
            inputs["fingerprints"],
        )
 
+       if methylation.shape[1] > self.pca_ncomp:
+           self.pca = PCA(n_components=self.pca_ncomp)
+       else:
+           self.pca = PCA(n_components=methylation.shape[1])
        methylation = self.pca.fit_transform(methylation)
 
        x = np.concatenate(
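
The random forest applies the same clamp at training time, once the methylation matrix is known; the if/else is equivalent to a `min()` one-liner, sketched here as a hypothetical helper (`make_pca` is not in the repository):

from sklearn.decomposition import PCA

def make_pca(pca_ncomp: int, n_features: int) -> PCA:
    """Hypothetical helper: cap the PCA components at the feature count."""
    return PCA(n_components=min(pca_ncomp, n_features))

# e.g. make_pca(100, methylation.shape[1]) before fit_transform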

tests/models/conftest.py

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ def cross_study_dataset() -> DrugResponseDataset:
 
     :returns: drug_response, cell_line_input, drug_input
     """
-    path_data = "../data"
+    path_data = os.path.join("..", "data")
    drug_response = load_toyv2(path_data)
    drug_response.remove_nan_responses()
    return drug_response
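
os.path.join builds the relative path with the platform's separator, so the fixture also resolves on Windows; this assumes `os` is imported at the top of conftest.py. A quick illustration:

import os

# "../data" hard-codes "/"; os.path.join adapts to the platform:
# "..\\data" on Windows, "../data" elsewhere.
path_data = os.path.join("..", "data")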

tests/test_available_data.py

Lines changed: 4 additions & 8 deletions

@@ -1,7 +1,5 @@
 """Tests for the available datasets."""
 
-import requests
-
 from drevalpy.datasets import AVAILABLE_DATASETS
 
 

@@ -19,6 +17,7 @@ def test_factory() -> None:
     assert len(AVAILABLE_DATASETS) == 9
 
 
+'''
 def test_datasets():
     """Test whether the datasets exist on Zenodo."""
     zenodo_doi_url = "https://zenodo.org/doi/10.5281/zenodo.12633909"

@@ -34,20 +33,17 @@ def test_datasets():
         "PDX_Bruna.zip",
         "meta.zip",
     }
-    headers = {
-        "User-Agent": "curl/8.5.0",
-        "Accept": "application/json",
-    }
 
-    response = requests.get(zenodo_doi_url, headers=headers, timeout=30)
+    response = requests.get(zenodo_doi_url, timeout=30)
     response.raise_for_status()
 
     latest_url = response.links["linkset"]["url"]
-    response = requests.get(latest_url, headers=headers, timeout=30)
+    response = requests.get(latest_url, timeout=30)
     response.raise_for_status()
 
     data = response.json()
     zenodo_files = {f["key"] for f in data["files"]}
 
     missing = expected_files - zenodo_files
     assert not missing, f"Missing files on Zenodo: {missing}"
+'''
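
Wrapping the test in a triple-quoted string removes it from pytest collection entirely. A common alternative, sketched below under the assumption the body stays unchanged, is a skip marker that keeps the test visible in reports:

import pytest

@pytest.mark.skip(reason="automated tests no longer depend on Zenodo")
def test_datasets():
    """Test whether the datasets exist on Zenodo."""
    ...  # body as above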
