diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index 528e8f67..bef2fa06 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -15,7 +15,6 @@

 import copy
 import os
-from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Callable

@@ -31,30 +30,7 @@
 np.set_printoptions(threshold=6)


-class Dataset(ABC):
-    """Abstract wrapper class for datasets."""
-
-    @classmethod
-    @abstractmethod
-    def from_csv(cls: type["Dataset"], input_file: str | Path, dataset_name: str = "unknown") -> "Dataset":
-        """
-        Loads the dataset from data.
-
-        :param input_file: Path to the csv file containing the data to be loaded
-        :param dataset_name: Optional name to associate the dataset with, default = "unknown"
-        :returns: Dataset object containing data from provided csv file.
-        """
-
-    @abstractmethod
-    def save(self, path: str):
-        """
-        Saves the dataset to data.
-
-        :param path: path to the dataset
-        """
-
-
-class DrugResponseDataset(Dataset):
+class DrugResponseDataset:
     """Drug response dataset."""

     _response: np.ndarray
@@ -226,7 +202,7 @@ def to_dataframe(self) -> pd.DataFrame:
         data["predictions"] = self.predictions
         return pd.DataFrame(data)

-    def save(self, path: str | Path):
+    def to_csv(self, path: str | Path):
         """
         Stores the drug response dataset on disk.

@@ -412,7 +388,7 @@ def save_splits(self, path: str):
         ]:
             if mode in split:
                 split_path = os.path.join(path, f"cv_split_{i}_{mode}.csv")
-                split[mode].save(path=split_path)
+                split[mode].to_csv(path=split_path)

     def load_splits(self, path: str) -> None:
         """
@@ -720,7 +696,7 @@ def _leave_group_out_cv(
     return cv_sets


-class FeatureDataset(Dataset):
+class FeatureDataset:
     """Class for feature datasets."""

     _meta_info: dict[str, Any] = {}
@@ -728,17 +704,62 @@ class FeatureDataset(Dataset):

     @classmethod
     def from_csv(
-        cls: type["FeatureDataset"], input_file: str | Path, dataset_name: str = "unknown"
-    ) -> "FeatureDataset":
+        cls: type["FeatureDataset"],
+        path_to_csv: str | Path,
+        id_column: str,
+        view_name: str,
+        drop_columns: list[str] | None = None,
+    ) -> "FeatureDataset":
+        """Load a one-view feature dataset from a csv file.
+
+        The rows of the csv file represent the instances (cell lines or drugs),
+        the columns represent the features. A column named id_column contains the identifiers of the instances.
+        All unrelated columns (e.g. other identifier columns) should be passed as drop_columns;
+        they are removed from the dataset before the features are extracted.
+
+        :param path_to_csv: path to the csv file containing the data to be loaded
+        :param id_column: name of the column containing the identifiers
+        :param view_name: name of the view (e.g. gene_expression)
+        :param drop_columns: list of columns to drop (e.g. other identifier columns)
+        :returns: FeatureDataset object containing data from provided csv file.
+        """
+        data = pd.read_csv(path_to_csv)
+        ids = data[id_column].values
+        data_features = data.drop(columns=(drop_columns or []))
+        data_features = data_features.set_index(id_column)
+        # remove duplicate feature rows (rows with the same index)
+        data_features = data_features[~data_features.index.duplicated(keep="first")]
+        features = {}
+
+        for identifier in ids:
+            features_for_instance = data_features.loc[identifier].values
+            features[identifier] = {view_name: features_for_instance}
+
+        return cls(features=features)
+
+    def to_csv(self, path: str | Path, id_column: str, view_name: str):
         """
-        Load a feature dataset from a csv file.
+        Save the feature dataset to a CSV file.

-        This function creates a FeatureDataset from a provided input file in csv format.
-        :param input_file: Path to the csv file containing the data to be loaded
-        :param dataset_name: Optional name to associate the dataset with, default = "unknown"
-        :raises NotImplementedError: This method is currently not implemented.
+        :param path: Path to the CSV file.
+        :param id_column: Name of the column containing the identifiers.
+        :param view_name: Name of the view (e.g., gene_expression).
+
+        :raises ValueError: If the view is not found for an identifier.
         """
-        raise NotImplementedError
+        data = []
+        for identifier, feature_dict in self.features.items():
+            # Get the feature vector for the specified view
+            if view_name in feature_dict:
+                row = {id_column: identifier}
+                row.update({f"feature_{i}": value for i, value in enumerate(feature_dict[view_name])})
+                data.append(row)
+            else:
+                raise ValueError(f"View {view_name!r} not found for identifier {identifier!r}.")
+
+        # Convert to DataFrame and save to CSV
+        df = pd.DataFrame(data)
+        df.to_csv(path, index=False)

     @property
     def meta_info(self) -> dict[str, Any]:
@@ -798,15 +819,6 @@ def __init__(
             raise AssertionError(f"Meta keys {meta_info.keys()} not in view names {self.view_names}")
         self._meta_info = meta_info

-    def save(self, path: str):
-        """
-        Saves the feature dataset to data.
-
-        :param path: path to the dataset
-        :raises NotImplementedError: if method is not implemented
-        """
-        raise NotImplementedError("save method not implemented")
-
     def randomize_features(self, views_to_randomize: str | list[str], randomization_type: str) -> None:
         """
         Randomizes the feature vectors.
diff --git a/drevalpy/experiment.py b/drevalpy/experiment.py
index 98a2e0f1..dcf0bcf6 100644
--- a/drevalpy/experiment.py
+++ b/drevalpy/experiment.py
@@ -4,7 +4,7 @@
 import os
 import shutil
 import warnings
-from typing import Any, Optional
+from typing import Any

 import numpy as np
 import pandas as pd
@@ -23,16 +23,16 @@
 def drug_response_experiment(
     models: list[type[DRPModel]],
     response_data: DrugResponseDataset,
-    baselines: Optional[list[type[DRPModel]]] = None,
-    response_transformation: Optional[TransformerMixin] = None,
+    baselines: list[type[DRPModel]] | None = None,
+    response_transformation: TransformerMixin | None = None,
     run_id: str = "",
     test_mode: str = "LPO",
     metric: str = "RMSE",
     n_cv_splits: int = 5,
     multiprocessing: bool = False,
-    randomization_mode: Optional[list[str]] = None,
+    randomization_mode: list[str] | None = None,
     randomization_type: str = "permutation",
-    cross_study_datasets: Optional[list[DrugResponseDataset]] = None,
+    cross_study_datasets: list[DrugResponseDataset] | None = None,
     n_trials_robustness: int = 0,
     path_out: str = "results/",
     overwrite: bool = False,
@@ -150,7 +150,7 @@ def drug_response_experiment(
         raise ValueError("No cv splits found.")

     for split_index, split in enumerate(response_data.cv_splits):
-        print(f"################# FOLD {split_index+1}/{len(response_data.cv_splits)} " f"#################")
+        print(f"################# FOLD {split_index + 1}/{len(response_data.cv_splits)} " f"#################")

         prediction_file = os.path.join(predictions_path, f"predictions_split_{split_index}.csv")

@@ -226,7 +226,7 @@ def drug_response_experiment(
                     single_drug_id=(drug_id if model_name in SINGLE_DRUG_MODEL_FACTORY else None),
                 )

-                test_dataset.save(prediction_file)
+                test_dataset.to_csv(prediction_file)
             else:
                 print(f"Split {split_index} already exists. Skipping.")
Skipping.") with open( @@ -287,7 +287,7 @@ def consolidate_single_drug_model_predictions( n_cv_splits: int, results_path: str, cross_study_datasets: list[DrugResponseDataset], - randomization_mode: Optional[list[str]] = None, + randomization_mode: list[str] | None = None, n_trials_robustness: int = 0, out_path: str = "", ) -> None: @@ -363,7 +363,7 @@ def consolidate_single_drug_model_predictions( # Robustness predictions for trial in range(n_trials_robustness): robustness_path = os.path.join(single_drug_prediction_path, "robustness") - f = f"robustness_{trial+1}_split_{split}.csv" + f = f"robustness_{trial + 1}_split_{split}.csv" if trial not in predictions["robustness"]: predictions["robustness"][trial] = [] predictions["robustness"][trial].append( @@ -411,7 +411,7 @@ def consolidate_single_drug_model_predictions( os.path.join( out_path, "robustness", - f"robustness_{trial+1}_split_{split}.csv", + f"robustness_{trial + 1}_split_{split}.csv", ) ) @@ -427,7 +427,7 @@ def consolidate_single_drug_model_predictions( def load_features( model: DRPModel, path_data: str, dataset: DrugResponseDataset -) -> tuple[FeatureDataset, Optional[FeatureDataset]]: +) -> tuple[FeatureDataset, FeatureDataset | None]: """ Load and reduce cell line and drug features for a given dataset. @@ -448,11 +448,11 @@ def cross_study_prediction( test_mode: str, train_dataset: DrugResponseDataset, path_data: str, - early_stopping_dataset: Optional[DrugResponseDataset], - response_transformation: Optional[TransformerMixin], + early_stopping_dataset: DrugResponseDataset | None, + response_transformation: TransformerMixin | None, path_out: str, split_index: int, - single_drug_id: Optional[str] = None, + single_drug_id: str | None = None, ) -> None: """ Run the drug response prediction experiment on a cross-study dataset to assess the generalizability of the model. @@ -484,7 +484,7 @@ def cross_study_prediction( cell_lines_to_keep = cl_features.identifiers if cl_features is not None else None - drugs_to_keep: Optional[np.ndarray] = None + drugs_to_keep: np.ndarray | None = None if single_drug_id is not None: drugs_to_keep = np.array([single_drug_id]) elif drug_features is not None: @@ -535,7 +535,7 @@ def cross_study_prediction( dataset._response = response_transformation.inverse_transform(dataset.response) else: dataset._predictions = np.array([]) - dataset.save( + dataset.to_csv( os.path.join( path_out, "cross_study", @@ -584,10 +584,10 @@ def robustness_test( path_data: str, train_dataset: DrugResponseDataset, test_dataset: DrugResponseDataset, - early_stopping_dataset: Optional[DrugResponseDataset], + early_stopping_dataset: DrugResponseDataset | None, path_out: str, split_index: int, - response_transformation: Optional[TransformerMixin] = None, + response_transformation: TransformerMixin | None = None, ): """ Run robustness tests for the given model and dataset. 
@@ -609,10 +609,10 @@ def robustness_test(
     robustness_test_path = os.path.join(path_out, "robustness")
     os.makedirs(robustness_test_path, exist_ok=True)
     for trial in range(n_trials):
-        print(f"Running robustness test trial {trial+1}/{n_trials}")
+        print(f"Running robustness test trial {trial + 1}/{n_trials}")
         trial_file = os.path.join(
             robustness_test_path,
-            f"robustness_{trial+1}_split_{split_index}.csv",
+            f"robustness_{trial + 1}_split_{split_index}.csv",
         )
         if not os.path.isfile(trial_file):
             robustness_train_predict(
@@ -634,11 +634,11 @@ def robustness_train_predict(
     trial_file: str,
     train_dataset: DrugResponseDataset,
     test_dataset: DrugResponseDataset,
-    early_stopping_dataset: Optional[DrugResponseDataset],
+    early_stopping_dataset: DrugResponseDataset | None,
     model: DRPModel,
     hpam_set: dict,
     path_data: str,
-    response_transformation: Optional[TransformerMixin] = None,
+    response_transformation: TransformerMixin | None = None,
 ):
     """
     Train and predict for the robustness test.
@@ -666,7 +666,7 @@ def robustness_train_predict(
         early_stopping_dataset=early_stopping_dataset,
         response_transformation=response_transformation,
     )
-    test_dataset.save(trial_file)
+    test_dataset.to_csv(trial_file)


 def randomization_test(
@@ -676,11 +676,11 @@ def randomization_test(
     path_data: str,
     train_dataset: DrugResponseDataset,
     test_dataset: DrugResponseDataset,
-    early_stopping_dataset: Optional[DrugResponseDataset],
+    early_stopping_dataset: DrugResponseDataset | None,
     path_out: str,
     split_index: int,
     randomization_type: str = "permutation",
-    response_transformation=Optional[TransformerMixin],
+    response_transformation: TransformerMixin | None = None,
 ) -> None:
     """
     Run randomization tests for the given model and dataset.
@@ -745,8 +745,8 @@ def randomize_train_predict(
     path_data: str,
     train_dataset: DrugResponseDataset,
     test_dataset: DrugResponseDataset,
-    early_stopping_dataset: Optional[DrugResponseDataset],
-    response_transformation: Optional[TransformerMixin],
+    early_stopping_dataset: DrugResponseDataset | None,
+    response_transformation: TransformerMixin | None,
 ) -> None:
     """
     Randomize the features for a given view and run the model.
@@ -783,12 +783,12 @@ def randomize_train_predict(
         )
         return

-    cl_features_rand: Optional[FeatureDataset] = None
+    cl_features_rand: FeatureDataset | None = None
     if cl_features is not None:
         cl_features_rand = cl_features.copy()
         cl_features_rand.randomize_features(view, randomization_type=randomization_type)  # type: ignore[union-attr]

-    drug_features_rand: Optional[FeatureDataset] = None
+    drug_features_rand: FeatureDataset | None = None
     if drug_features is not None:
         drug_features_rand = drug_features.copy()
         drug_features_rand.randomize_features(view, randomization_type=randomization_type)  # type: ignore[union-attr]
@@ -804,7 +804,7 @@ def randomize_train_predict(
         cl_features=cl_features_rand,
         drug_features=drug_features_rand,
     )
-    test_dataset_rand.save(randomization_test_file)
+    test_dataset_rand.to_csv(randomization_test_file)


 def split_early_stopping(
@@ -837,10 +837,10 @@ def train_and_predict(
     path_data: str,
     train_dataset: DrugResponseDataset,
     prediction_dataset: DrugResponseDataset,
-    early_stopping_dataset: Optional[DrugResponseDataset] = None,
-    response_transformation: Optional[TransformerMixin] = None,
-    cl_features: Optional[FeatureDataset] = None,
-    drug_features: Optional[FeatureDataset] = None,
+    early_stopping_dataset: DrugResponseDataset | None = None,
+    response_transformation: TransformerMixin | None = None,
+    cl_features: FeatureDataset | None = None,
+    drug_features: FeatureDataset | None = None,
 ) -> DrugResponseDataset:
     """
     Train the model and predict the response for the prediction dataset.
@@ -926,8 +926,8 @@ def train_and_evaluate(
     path_data: str,
     train_dataset: DrugResponseDataset,
     validation_dataset: DrugResponseDataset,
-    early_stopping_dataset: Optional[DrugResponseDataset] = None,
-    response_transformation: Optional[TransformerMixin] = None,
+    early_stopping_dataset: DrugResponseDataset | None = None,
+    response_transformation: TransformerMixin | None = None,
     metric: str = "rmse",
 ) -> dict[str, float]:
     """
@@ -960,8 +960,8 @@ def hpam_tune(
     train_dataset: DrugResponseDataset,
     validation_dataset: DrugResponseDataset,
     hpam_set: list[dict],
-    early_stopping_dataset: Optional[DrugResponseDataset] = None,
-    response_transformation: Optional[TransformerMixin] = None,
+    early_stopping_dataset: DrugResponseDataset | None = None,
+    response_transformation: TransformerMixin | None = None,
     metric: str = "RMSE",
     path_data: str = "data",
 ) -> dict:
@@ -1019,9 +1019,9 @@ def hpam_tune_raytune(
     model: DRPModel,
     train_dataset: DrugResponseDataset,
     validation_dataset: DrugResponseDataset,
-    early_stopping_dataset: Optional[DrugResponseDataset],
+    early_stopping_dataset: DrugResponseDataset | None,
     hpam_set: list[dict],
-    response_transformation: Optional[TransformerMixin] = None,
+    response_transformation: TransformerMixin | None = None,
     metric: str = "RMSE",
     ray_path: str = "raytune",
     path_data: str = "data",
@@ -1094,7 +1094,7 @@ def make_model_list(models: list[type[DRPModel]], response_data: DrugResponseDat


 @pipeline_function
-def get_model_name_and_drug_id(model_name: str) -> tuple[str, Optional[str]]:
+def get_model_name_and_drug_id(model_name: str) -> tuple[str, str | None]:
     """
     Get the model name and drug id from the model name.

@@ -1119,8 +1119,8 @@ def get_model_name_and_drug_id(model_name: str) -> tuple[str, Optional[str]]:

 @pipeline_function
 def get_datasets_from_cv_split(
-    split: dict[str, DrugResponseDataset], model_class: type[DRPModel], model_name: str, drug_id: Optional[str] = None
-) -> tuple[DrugResponseDataset, DrugResponseDataset, Optional[DrugResponseDataset], DrugResponseDataset]:
+    split: dict[str, DrugResponseDataset], model_class: type[DRPModel], model_name: str, drug_id: str | None = None
+) -> tuple[DrugResponseDataset, DrugResponseDataset, DrugResponseDataset | None, DrugResponseDataset]:
     """
     Get train, validation, (early stopping), and test datasets from the CV split.

diff --git a/drevalpy/models/MOLIR/utils.py b/drevalpy/models/MOLIR/utils.py
index c544ee6a..4f7d5c2e 100644
--- a/drevalpy/models/MOLIR/utils.py
+++ b/drevalpy/models/MOLIR/utils.py
@@ -9,7 +9,6 @@
 import os
 import random
 import secrets
-from typing import Optional

 import numpy as np
 import pytorch_lightning as pl
@@ -69,7 +68,7 @@ def generate_triplets_indices(
     y: np.ndarray,
     positive_range: float,
     negative_range: float,
-    random_seed: Optional[int] = None,
+    random_seed: int | None = None,
 ) -> tuple[np.ndarray, np.ndarray]:
     """
     Generates triplets for the MOLIR model.
@@ -155,8 +154,8 @@ def create_dataset_and_loaders(
     batch_size: int,
     output_train: DrugResponseDataset,
     cell_line_input: FeatureDataset,
-    output_earlystopping: Optional[DrugResponseDataset] = None,
-) -> tuple[DataLoader, Optional[DataLoader]]:
+    output_earlystopping: DrugResponseDataset | None = None,
+) -> tuple[DataLoader, DataLoader | None]:
     """
     Creates the RegressionDataset (torch Dataset) and the DataLoader for the training and validation data.

@@ -322,7 +321,7 @@ def fit(
         self,
         output_train: DrugResponseDataset,
         cell_line_input: FeatureDataset,
-        output_earlystopping: Optional[DrugResponseDataset] = None,
+        output_earlystopping: DrugResponseDataset | None = None,
         patience: int = 5,
     ) -> None:
         """
diff --git a/drevalpy/models/baselines/sklearn_models.py b/drevalpy/models/baselines/sklearn_models.py
index 84a67e02..09fd417c 100644
--- a/drevalpy/models/baselines/sklearn_models.py
+++ b/drevalpy/models/baselines/sklearn_models.py
@@ -1,7 +1,5 @@
 """Contains sklearn baseline models: ElasticNet, RandomForest, SVM."""

-from typing import Optional
-
 import numpy as np
 from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
 from sklearn.linear_model import ElasticNet, Lasso, Ridge
@@ -121,7 +119,7 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD
             dataset_name=dataset_name,
         )

-    def load_drug_features(self, data_path: str, dataset_name: str) -> Optional[FeatureDataset]:
+    def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDataset | None:
         """
         Load the drug features, in this case the fingerprints.

diff --git a/drevalpy/models/drp_model.py b/drevalpy/models/drp_model.py
index ad642384..3b4b6798 100644
--- a/drevalpy/models/drp_model.py
+++ b/drevalpy/models/drp_model.py
@@ -9,7 +9,7 @@
 import inspect
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any

 import numpy as np
 import yaml
@@ -154,7 +154,7 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD
         """

     @abstractmethod
-    def load_drug_features(self, data_path: str, dataset_name: str) -> Optional[FeatureDataset]:
+    def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDataset | None:
         """
         Load the drug features before the train/predict method is called.

@@ -170,12 +170,12 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> Optional[Feat

     def get_concatenated_features(
         self,
-        cell_line_view: Optional[str],
-        drug_view: Optional[str],
+        cell_line_view: str | None,
+        drug_view: str | None,
         cell_line_ids_output: np.ndarray,
         drug_ids_output: np.ndarray,
-        cell_line_input: Optional[FeatureDataset],
-        drug_input: Optional[FeatureDataset],
+        cell_line_input: FeatureDataset | None,
+        drug_input: FeatureDataset | None,
     ) -> np.ndarray:
         """
         Concatenates the features to an input matrix X for the given cell line and drug views.
@@ -228,8 +228,8 @@ def get_feature_matrices(
         self,
         cell_line_ids: np.ndarray,
         drug_ids: np.ndarray,
-        cell_line_input: Optional[FeatureDataset],
-        drug_input: Optional[FeatureDataset],
+        cell_line_input: FeatureDataset | None,
+        drug_input: FeatureDataset | None,
     ) -> dict[str, np.ndarray]:
         """
         Returns the feature matrices for the given cell line and drug ids by retrieving the correct views.
diff --git a/drevalpy/models/utils.py b/drevalpy/models/utils.py
index 27d1e545..b7208361 100644
--- a/drevalpy/models/utils.py
+++ b/drevalpy/models/utils.py
@@ -2,7 +2,6 @@

 import os.path
 import warnings
-from typing import Optional

 import numpy as np
 import pandas as pd
@@ -24,7 +23,7 @@ def load_cl_ids_from_csv(path: str, dataset_name: str) -> FeatureDataset:

 def load_and_reduce_gene_features(
     feature_type: str,
-    gene_list: Optional[str],
+    gene_list: str | None,
     data_path: str,
     dataset_name: str,
 ) -> FeatureDataset:
@@ -138,8 +137,8 @@ def load_drug_fingerprint_features(data_path: str, dataset_name: str) -> Feature
 def get_multiomics_feature_dataset(
     data_path: str,
     dataset_name: str,
-    gene_list: Optional[str] = "drug_target_genes_all_drugs",
-    omics: Optional[list[str]] = None,
+    gene_list: str | None = "drug_target_genes_all_drugs",
+    omics: list[str] | None = None,
 ) -> FeatureDataset:
     """
     Get multiomics feature dataset for the given list of OMICs.
diff --git a/drevalpy/utils.py b/drevalpy/utils.py
index 8eb66e36..7093d4c6 100644
--- a/drevalpy/utils.py
+++ b/drevalpy/utils.py
@@ -2,7 +2,6 @@

 import argparse
 from pathlib import Path
-from typing import Optional

 from sklearn.base import TransformerMixin
 from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
@@ -331,7 +330,7 @@ def get_datasets(
     measure: str = "response",
     curve_curator: bool = False,
     cores: int = 1,
-) -> tuple[DrugResponseDataset, Optional[list[DrugResponseDataset]]]:
+) -> tuple[DrugResponseDataset, list[DrugResponseDataset] | None]:
     """
     Load the response data and cross-study datasets.

@@ -366,7 +365,7 @@ def get_datasets(


 @pipeline_function
-def get_response_transformation(response_transformation: str) -> Optional[TransformerMixin]:
+def get_response_transformation(response_transformation: str) -> TransformerMixin | None:
     """
     Get the sklearn response transformation object of choice.

diff --git a/drevalpy/visualization/utils.py b/drevalpy/visualization/utils.py
index bdd3cfae..a643b12e 100644
--- a/drevalpy/visualization/utils.py
+++ b/drevalpy/visualization/utils.py
@@ -4,7 +4,7 @@
 import pathlib
 import re
 import shutil
-from typing import Optional, TextIO
+from typing import TextIO

 import importlib_resources
 import pandas as pd
@@ -267,7 +267,7 @@ def _evaluate_per_group(
     df: pd.DataFrame,
     group_by: str,
     norm_group_eval_results: dict[str, dict[str, float]],
-    eval_results_per_group: Optional[pd.DataFrame],
+    eval_results_per_group: pd.DataFrame | None,
     model: str,
 ) -> tuple[dict[str, dict[str, float]], pd.DataFrame]:
     """
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 35ab3f54..d485c8d5 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -5,6 +5,7 @@

 import networkx as nx
 import numpy as np
+import pandas as pd
 import pytest
 from flaky import flaky

@@ -29,7 +30,7 @@ def test_response_dataset_load() -> None:
         response=data["response"],
     )
     dataset_path = Path("dataset.csv")
-    dataset.save(dataset_path)
+    dataset.to_csv(dataset_path)
     del dataset
     # Load the dataset
     dataset = DrugResponseDataset.from_csv(dataset_path)
@@ -526,20 +527,6 @@ def test_invariant_randomization_graph(graph_dataset: FeatureDataset) -> None:
     )


-def test_feature_dataset_save_and_load(sample_dataset: FeatureDataset) -> None:
-    """
-    Test if the save and load methods work correctly.
-
-    :param sample_dataset: sample FeatureDataset
-    """
-    tmp = tempfile.NamedTemporaryFile()
-    with pytest.raises(NotImplementedError):
-        sample_dataset.save(path=tmp.name)
-
-    with pytest.raises(NotImplementedError):
-        FeatureDataset.from_csv(tmp.name)
-
-
 def test_add_features(sample_dataset: FeatureDataset, graph_dataset: FeatureDataset) -> None:
     """
     Test if the add_features method works correctly.
@@ -551,3 +538,61 @@ def test_add_features(sample_dataset: FeatureDataset, graph_dataset: FeatureData
     assert sample_dataset.meta_info is not None
     assert "molecular_graph" in sample_dataset.meta_info
     assert "molecular_graph" in sample_dataset.view_names
+
+
+def test_feature_dataset_csv_methods():
+    """Test the `from_csv` and `to_csv` methods of the FeatureDataset class."""
+    # Create temporary directory for testing
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_dir = Path(temp_dir)
+
+        # Create test data
+        test_csv_path = temp_dir / "test_features.csv"
+        data = {
+            "id": ["A", "B", "C"],
+            "feature_1": [1.0, 2.0, 3.0],
+            "feature_2": [4.0, 5.0, 6.0],
+        }
+        df = pd.DataFrame(data)
+        df.to_csv(test_csv_path, index=False)
+
+        # Test `from_csv` method
+        view_name = "example_view"
+        feature_dataset = FeatureDataset.from_csv(
+            path_to_csv=test_csv_path, id_column="id", view_name=view_name, drop_columns=None
+        )
+
+        # Validate loaded data
+        assert set(feature_dataset.identifiers) == {"A", "B", "C"}, "Identifiers mismatch."
+        assert feature_dataset.view_names == [view_name], "View names mismatch."
+        expected_features = {
+            "A": {"example_view": np.array([1.0, 4.0])},
+            "B": {"example_view": np.array([2.0, 5.0])},
+            "C": {"example_view": np.array([3.0, 6.0])},
+        }
+        for identifier in expected_features:
+            np.testing.assert_array_equal(
+                feature_dataset.features[identifier][view_name],
+                expected_features[identifier][view_name],
+                f"Feature mismatch for identifier {identifier}.",
+            )
+
+        # Test `to_csv` method
+        output_csv_path = temp_dir / "output_features.csv"
+        feature_dataset.to_csv(path=output_csv_path, id_column="id", view_name=view_name)
+
+        # Validate saved data
+        saved_df = pd.read_csv(output_csv_path)
+        expected_saved_df = pd.DataFrame(
+            {
+                "id": ["A", "B", "C"],
+                "feature_0": [1.0, 2.0, 3.0],
+                "feature_1": [4.0, 5.0, 6.0],
+            }
+        )
+        pd.testing.assert_frame_equal(
+            saved_df,
+            expected_saved_df,
+            check_dtype=False,  # Relax dtype check for cross-platform compatibility
+            obj="Saved CSV data",
+        )
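
Usage sketch for the CSV API introduced by this patch. The file name, identifiers, and the "cell_line_id" column below are illustrative assumptions rather than fixtures shipped with drevalpy; the calls mirror the signatures added in drevalpy/datasets/dataset.py.

    # csv_roundtrip_sketch.py -- minimal sketch, assumes the patched drevalpy API
    import pandas as pd

    from drevalpy.datasets.dataset import FeatureDataset

    # A one-view feature table: rows are instances, columns are features,
    # plus one identifier column (here the hypothetical "cell_line_id").
    pd.DataFrame(
        {"cell_line_id": ["A", "B"], "gene_1": [1.0, 2.0], "gene_2": [3.0, 4.0]}
    ).to_csv("gene_expression.csv", index=False)

    # Load: each row becomes one instance carrying the view "gene_expression";
    # unrelated identifier columns could be removed via drop_columns=[...].
    fd = FeatureDataset.from_csv(
        path_to_csv="gene_expression.csv",
        id_column="cell_line_id",
        view_name="gene_expression",
    )
    print(fd.identifiers)  # the instance ids loaded from the id column
    print(fd.view_names)   # ["gene_expression"]

    # Save: feature columns are written back generically as feature_0, feature_1, ...
    fd.to_csv(path="gene_expression_out.csv", id_column="cell_line_id", view_name="gene_expression")

Note that DrugResponseDataset.save(...) is renamed to to_csv(...) in the same spirit, which is why the call sites in experiment.py and the tests change from .save(file) to .to_csv(file).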