Support More Data Types in the sklearn Dataset and Decoder (#43)

gonlairo · web-flow · commit c95bd5aed77b · 2023-07-15T11:26:15.000+02:00
diff --git a/cebra/data/helper.py b/cebra/data/helper.py
@@ -19,6 +19,44 @@
 import scipy.linalg
 import torch
 
+import cebra.data.base as cebra_data_base
+import cebra.data.multi_session as cebra_data_multisession
+import cebra.data.single_session as cebra_data_singlesession
+
+
+def get_loader_options(dataset: cebra_data_base.Dataset) -> List[str]:
+    """List the data loader classes that can be used with ``dataset``.
+
+    Args:
+        dataset: A single-session or multi-session dataset; the loaders are
+            selected from the index types (continuous, discrete) it provides.
+
+    Returns:
+        The compatible loader classes, possibly an empty list.
+
+    Raises:
+        TypeError: If ``dataset`` is neither a single- nor a multi-session dataset.
+    """
+    is_single = isinstance(dataset,
+                           cebra_data_singlesession.SingleSessionDataset)
+    if not is_single and not isinstance(
+            dataset, cebra_data_multisession.MultiSessionDataset):
+        raise TypeError(f"Invalid dataset type: {dataset}")
+
+    has_continuous = dataset.continuous_index is not None
+    has_discrete = dataset.discrete_index is not None
+    if not is_single:
+        # Discrete and mixed multi-session loaders are not implemented yet.
+        if has_continuous:
+            return [cebra_data_multisession.ContinuousMultiSessionDataLoader]
+        return []
+    candidates = [
+        (has_continuous, cebra_data_singlesession.ContinuousDataLoader),
+        (has_discrete, cebra_data_singlesession.DiscreteDataLoader),
+        (has_continuous and has_discrete, cebra_data_singlesession.MixedDataLoader),
+    ]
+    return [loader for available, loader in candidates if available]
+
 
 def _require_numpy_array(array: Union[npt.NDArray, torch.Tensor]):
     if not isinstance(array, np.ndarray):
diff --git a/cebra/distributions/discrete.py b/cebra/distributions/discrete.py
@@ -19,6 +19,7 @@
 import torch
 
 import cebra.distributions.base as abc_
+import cebra.helper
 
 
 class Discrete(abc_.ConditionalDistribution, abc_.HasGenerator):
@@ -38,7 +39,7 @@ def _to_numpy_int(self, samples: Union[torch.Tensor,
                                            npt.NDArray]) -> npt.NDArray:
         if isinstance(samples, torch.Tensor):
             samples = samples.cpu().numpy()
-        if samples.dtype not in (np.int32, np.int64):
+        if not cebra.helper._is_integer(samples):
             samples = samples.astype(int)
         return samples
 
diff --git a/cebra/helper.py b/cebra/helper.py
@@ -15,47 +15,18 @@
 import pathlib
 import tempfile
 import urllib
+import warnings
 import zipfile
-from typing import List
+from typing import List, Union
 
+import numpy as np
+import numpy.typing as npt
 import requests
+import torch
 
 import cebra.data
 
 
-def get_loader_options(dataset: cebra.data.Dataset) -> List[str]:
-    """Return all possible dataloaders for the given dataset."""
-
-    loader_options = []
-    if isinstance(dataset, cebra.data.SingleSessionDataset):
-        mixed = True
-        if dataset.continuous_index is not None:
-            loader_options.append(cebra.data.ContinuousDataLoader)
-        else:
-            mixed = False
-        if dataset.discrete_index is not None:
-            loader_options.append(cebra.data.DiscreteDataLoader)
-        else:
-            mixed = False
-        if mixed:
-            loader_options.append(cebra.data.MixedDataLoader)
-    elif isinstance(dataset, cebra.data.MultiSessionDataset):
-        mixed = True
-        if dataset.continuous_index is not None:
-            loader_options.append(cebra.data.ContinuousMultiSessionDataLoader)
-        else:
-            mixed = False
-        if dataset.discrete_index is not None:
-            pass  # not implemented yet
-        else:
-            mixed = False
-        if mixed:
-            pass  # not implemented yet
-    else:
-        raise TypeError(f"Invalid dataset type: {dataset}")
-    return loader_options
-
-
 def download_file_from_url(url: str) -> str:
     """Download a fole from ``url``.
 
@@ -88,3 +59,53 @@ def download_file_from_zip_url(url, file="montblanc_tracks.h5"):
             except zipfile.error:
                 pass
     return pathlib.Path(foldername) / "data" / file
+
+
+def _is_integer(y: Union[npt.NDArray, torch.Tensor]) -> bool:
+    """Tell whether ``y`` stores integer-typed values.
+
+    Args:
+        y: The array to inspect, a :py:func:`numpy.array` or a :py:class:`torch.Tensor`.
+
+    Returns:
+        ``True`` for integer numpy arrays and for tensors neither floating point nor complex.
+    """
+    if isinstance(y, torch.Tensor):
+        return not (torch.is_floating_point(y) or torch.is_complex(y))
+    return isinstance(y, np.ndarray) and np.issubdtype(y.dtype, np.integer)
+
+
+def _is_floating(y: Union[npt.NDArray, torch.Tensor]) -> bool:
+    """Tell whether ``y`` stores floating point values.
+
+    Note:
+        Arrays are checked with :py:func:`numpy.issubdtype` against
+        :py:class:`numpy.floating`, tensors with :py:func:`torch.is_floating_point`.
+        Any other input type yields ``False``.
+
+    Args:
+        y: The array to inspect, a :py:func:`numpy.array` or a :py:class:`torch.Tensor`.
+
+    Returns:
+        ``True`` if ``y`` is a numpy array or a tensor with a floating point ``dtype``.
+    """
+    if isinstance(y, torch.Tensor):
+        return torch.is_floating_point(y)
+    is_array = isinstance(y, np.ndarray)
+    return is_array and np.issubdtype(y.dtype, np.floating)
+
+
+def get_loader_options(dataset: "cebra.data.Dataset") -> List[str]:
+    """Return all possible dataloaders for the given dataset.
+
+    Notes:
+        This function is deprecated and will be removed in an upcoming version of CEBRA.
+        Please use :py:func:`cebra.data.helper.get_loader_options` instead; this wrapper
+        emits a :py:class:`DeprecationWarning` and forwards ``dataset`` to that function.
+    """
+    # Imported lazily; NOTE(review): presumably to avoid an import cycle -- confirm.
+    import cebra.data.helper
+    warnings.warn(
+        "The 'get_loader_options' function has been moved to 'cebra.data.helper' module. "
+        "Please update your imports.", DeprecationWarning, stacklevel=2)
+    return cebra.data.helper.get_loader_options(dataset)
diff --git a/cebra/integrations/sklearn/dataset.py b/cebra/integrations/sklearn/dataset.py
@@ -18,6 +18,7 @@
 import torch
 
 import cebra.data
+import cebra.helper
 import cebra.integrations.sklearn.utils as cebra_sklearn_utils
 import cebra.models
 import cebra.solver
@@ -134,12 +135,12 @@ def _parse_labels(self, labels: Optional[tuple]):
 
             # Define the index as either continuous or discrete indices, depending
             # on the dtype in the index array.
-            if y.dtype in (np.float32, np.float64):
+            if cebra.helper._is_floating(y):
                 y = torch.from_numpy(y).float()
                 if y.dim() == 1:
                     y = y.unsqueeze(1)
                 continuous_index.append(y)
-            elif y.dtype in (np.int32, np.int64):
+            elif cebra.helper._is_integer(y):
                 y = torch.from_numpy(y).long().squeeze()
                 if y.dim() > 1:
                     raise ValueError(
diff --git a/cebra/integrations/sklearn/decoder.py b/cebra/integrations/sklearn/decoder.py
@@ -21,39 +21,7 @@
 import sklearn.neighbors
 import torch
 
-
-def _is_integer(y: Union[npt.NDArray, torch.Tensor]) -> bool:
-    """Check if the values in ``y`` are :py:class:`int`.
-
-    Args:
-        y: An array, either as a :py:func:`numpy.array` or a :py:class:`torch.Tensor`.
-    
-    Returns:
-        ``True`` if ``y`` contains :py:class:`int`.
-    """
-    return (isinstance(y, np.ndarray) and np.issubdtype(y.dtype, np.integer)
-           ) or (isinstance(y, torch.Tensor) and
-                 (not torch.is_floating_point(y) and not torch.is_complex(y)))
-
-
-def _is_floating(y: Union[npt.NDArray, torch.Tensor]) -> bool:
-    """Check if the values in ``y`` are :py:class:`int`.
-    
-    Note: 
-        There is no ``torch`` method to check that the ``dtype`` of a :py:class:`torch.Tensor`
-        is a :py:class:`float`, consequently, we check that it is not :py:class:`int` nor
-        :py:class:`complex`.
-
-    Args:
-        y: An array, either as a :py:func:`numpy.array` or a :py:class:`torch.Tensor`.
-    
-    Returns:
-        ``True`` if ``y`` contains :py:class:`float`.
-    """
-
-    return (isinstance(y, np.ndarray) and
-            np.issubdtype(y.dtype, np.floating)) or (isinstance(
-                y, torch.Tensor) and torch.is_floating_point(y))
+import cebra.helper
 
 
 class Decoder(abc.ABC, sklearn.base.BaseEstimator):
@@ -152,10 +120,10 @@ def fit(
             )
 
         # Use regression or classification, based on if the targets are continuous or discrete
-        if _is_floating(y):
+        if cebra.helper._is_floating(y):
             self.knn = sklearn.neighbors.KNeighborsRegressor(
                 n_neighbors=self.n_neighbors, metric=self.metric)
-        elif _is_integer(y):
+        elif cebra.helper._is_integer(y):
             self.knn = sklearn.neighbors.KNeighborsClassifier(
                 n_neighbors=self.n_neighbors, metric=self.metric)
         else:
@@ -237,7 +205,7 @@ def fit(
                 f"Invalid shape: y and X must have the same number of samples, got y:{len(y)} and X:{len(X)}."
             )
 
-        if not (_is_integer(y) or _is_floating(y)):
+        if not (cebra.helper._is_integer(y) or cebra.helper._is_floating(y)):
             raise NotImplementedError(
                 f"Invalid type: targets must be numeric, got y:{y.dtype}")
 
diff --git a/tests/test_integration_train.py b/tests/test_integration_train.py
@@ -10,6 +10,7 @@
 # https://github.com/AdaptiveMotorControlLab/CEBRA/LICENSE.md
 #
 import itertools
+from typing import List
 
 import pytest
 import torch
@@ -18,6 +19,7 @@
 import cebra
 import cebra.config
 import cebra.data
+import cebra.data.helper as cebra_data_helper
 import cebra.datasets
 import cebra.helper
 import cebra.models
@@ -68,7 +70,7 @@ def _list_data_loaders():
     ]
     # TODO limit this to the valid combinations---however this
     # requires to adapt the dataset API slightly; it is currently
-    # required to initialize the dataset to run cebra.helper.get_loader_options.
+    # required to initialize the dataset to run cebra_data_helper.get_loader_options.
     prefixes = set()
     for dataset_name, loader in itertools.product(cebra.datasets.get_options(),
                                                   loaders):
@@ -86,7 +88,7 @@ def test_train(dataset_name, loader_type):
     args = cebra.config.Config(num_steps=1, device="cuda").as_namespace()
 
     dataset = cebra.datasets.init(dataset_name)
-    if loader_type not in cebra.helper.get_loader_options(dataset):
+    if loader_type not in cebra_data_helper.get_loader_options(dataset):
         # skip this test, since the data/loader combination is not valid.
         pytest.skip("Not a valid dataset/loader combination.")
     loader = loader_type(
diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py
@@ -117,6 +117,26 @@ def test_sklearn_dataset():
         cebra_data.datasets.DatasetCollection(*sessions)
 
 
+@pytest.mark.parametrize("int_type", [np.uint8, np.int8, np.int32])
+@pytest.mark.parametrize("float_type", [np.float16, np.float32, np.float64])
+def test_sklearn_dataset_type_index(int_type, float_type):
+    """Integer labels must yield a discrete index, float labels a continuous one."""
+    n = 100
+    X = np.random.uniform(0, 1, (n * 2, 2))
+    labels = np.concatenate([np.zeros(n), np.ones(n)])
+
+    # Integer-typed labels should populate only the discrete index.
+    y_int = labels.astype(int_type)
+    _, _, loader, _ = cebra.CEBRA(batch_size=512)._prepare_fit(X, y_int)
+    assert loader.dataset.discrete_index is not None
+    assert loader.dataset.continuous_index is None
+    # Floating-typed labels should populate only the continuous index.
+    y_float = y_int.astype(float_type)
+    _, _, loader, _ = cebra.CEBRA(batch_size=512)._prepare_fit(X, y_float)
+    assert loader.dataset.continuous_index is not None
+    assert loader.dataset.discrete_index is None
+
+
 @_util.parametrize_slow(
     arg_names="is_cont,is_disc,is_full,is_multi,is_hybrid",
     fast_arguments=list(
diff --git a/tests/test_sklearn_decoder.py b/tests/test_sklearn_decoder.py
@@ -13,6 +13,7 @@
 import pytest
 import torch
 
+import cebra.helper
 import cebra.integrations.sklearn.decoder as cebra_sklearn_decoder
 
 
@@ -104,7 +105,7 @@ def test_sklearn_decoder(decoder):
 
 
 def test_dtype_checker():
-    assert cebra_sklearn_decoder._is_floating(torch.Tensor([4.5]))
-    assert cebra_sklearn_decoder._is_integer(torch.LongTensor([4]))
-    assert cebra_sklearn_decoder._is_floating(np.array([4.5]))
-    assert cebra_sklearn_decoder._is_integer(np.array([4]))
+    assert cebra.helper._is_floating(torch.Tensor([4.5]))
+    assert cebra.helper._is_integer(torch.LongTensor([4]))
+    assert cebra.helper._is_floating(np.array([4.5]))
+    assert cebra.helper._is_integer(np.array([4]))