Skip to content

Commit 3f06b71

Browse files
authored
Enhance ModuleNotFoundError messages (#85)
* Enhance the module not found err messages in cebra.load * Moved the monkey reaching error message * Raise an error instead of a warning for nlb_tools * Seed the criterion tests
1 parent 8520b9b commit 3f06b71

File tree

4 files changed

+76
-31
lines changed

4 files changed

+76
-31
lines changed

cebra/data/load.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@
8181
)
8282

8383

84+
def _module_not_found_error(module_name):
85+
return ModuleNotFoundError(
86+
f"Could not load {module_name}. You can manually install {module_name} "
87+
"or install the [datasets] dependency in cebra: "
88+
"pip install 'cebra[datasets]'")
89+
90+
8491
class _BaseLoader(abc.ABC):
8592
"""Base loader."""
8693

@@ -186,7 +193,13 @@ def load(
186193
keypoints=columns)
187194
else:
188195
raise ModuleNotFoundError(
189-
"DLC integration could not be loaded.")
196+
"DLC integration could not be loaded. "
197+
"Most likely, this is because you do not have all "
198+
"integrations dependencies installed. Try installing "
199+
"cebra with the [integrations] and [datasets] dependency to fix this "
200+
"error. You might need to re-start your environment "
201+
"after installing: "
202+
"pip install 'cebra[integrations,datasets]'.")
190203
# if the provided key is valid
191204
elif key in df_keys:
192205
loaded_array = _PandasLoader.load_from_h5(
@@ -208,7 +221,7 @@ def load(
208221
raise AttributeError(
209222
"No valid data structure was found in your file.")
210223
else:
211-
raise ModuleNotFoundError()
224+
raise _module_not_found_error("h5py")
212225
return loaded_array
213226

214227
@staticmethod
@@ -385,7 +398,7 @@ def load(
385398
except pd.errors.EmptyDataError:
386399
raise AttributeError(".csv file is empty.")
387400
else:
388-
raise ModuleNotFoundError()
401+
raise _module_not_found_error("pandas")
389402
return loaded_array
390403

391404

@@ -420,7 +433,7 @@ def load(
420433
loaded_array = loaded_dict[key].values
421434
break
422435
else:
423-
raise ModuleNotFoundError()
436+
raise _module_not_found_error("pandas")
424437
return loaded_array
425438

426439
# def prepare_engine(extension: str):
@@ -486,7 +499,7 @@ def load(
486499
raise NotImplementedError(
487500
f"{type(loaded_data)} is not handled for .jl files.")
488501
else:
489-
raise ModuleNotFoundError()
502+
raise _module_not_found_error("joblib")
490503
return loaded_array
491504

492505

@@ -531,7 +544,7 @@ def load(
531544
raise NotImplementedError(
532545
f"{type(loaded_data)} is not handled for .pk files.")
533546
else:
534-
raise ModuleNotFoundError()
547+
raise _module_not_found_error("pickle")
535548
return loaded_array
536549

537550

@@ -572,7 +585,7 @@ def load(
572585
if _IS_H5PY_AVAILABLE:
573586
loaded_array = _H5pyLoader.load(file, key)
574587
else:
575-
raise ModuleNotFoundError()
588+
raise _module_not_found_error("h5py")
576589
return loaded_array
577590

578591

cebra/datasets/monkey_reaching.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,6 @@
2727
import scipy.io
2828
import torch
2929

30-
try:
31-
from nlb_tools.nwb_interface import NWBDataset
32-
except ImportError:
33-
import warnings
34-
35-
warnings.warn(
36-
("Could not import the nlb_tools package required for data loading "
37-
"of cebra.datasets.monkey_reaching. Dataset will not be available. "
38-
"If required, you can install the dataset by running "
39-
"pip install git+https://github.com/neurallatents/nlb_tools."))
40-
4130
import cebra.data
4231
from cebra.datasets import get_datapath
4332
from cebra.datasets import register
@@ -62,6 +51,16 @@ def _load_data(
6251
6352
"""
6453

54+
try:
55+
from nlb_tools.nwb_interface import NWBDataset
56+
except ImportError as e:
57+
raise ImportError(
58+
"Could not import the nlb_tools package required for loading "
59+
"the raw reaching datasets in NWB format. "
60+
"If required, you can install the dataset by running "
61+
"pip install nlb_tools or installing cebra with the [datasets] "
62+
"dependencies: pip install 'cebra[datasets]'") from e
63+
6564
def _get_info(trial_info, data):
6665
passive = []
6766
direction = []

tests/test_criterions.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -276,10 +276,16 @@ def _compute_grads(output, inputs):
276276
return [input_.grad for input_ in inputs]
277277

278278

279-
def test_infonce():
279+
def _sample_dist_matrices(seed):
280+
rng = torch.Generator().manual_seed(seed)
281+
pos_dist = torch.randn(100, generator=rng)
282+
neg_dist = torch.randn(100, 100, generator=rng)
283+
return pos_dist, neg_dist
284+
280285

281-
pos_dist = torch.randn(100,)
282-
neg_dist = torch.randn(100, 100)
286+
@pytest.mark.parametrize("seed", [42, 4242, 424242])
287+
def test_infonce(seed):
288+
pos_dist, neg_dist = _sample_dist_matrices(seed)
283289

284290
ref_loss, ref_align, ref_uniform = _reference_infonce(pos_dist, neg_dist)
285291
loss, align, uniform = cebra_criterions.infonce(pos_dist, neg_dist)
@@ -290,11 +296,9 @@ def test_infonce():
290296
assert torch.allclose(align + uniform, loss)
291297

292298

293-
def test_infonce_gradients():
294-
295-
rng = torch.Generator().manual_seed(42)
296-
pos_dist = torch.randn(100, generator=rng)
297-
neg_dist = torch.randn(100, 100, generator=rng)
299+
@pytest.mark.parametrize("seed", [42, 4242, 424242])
300+
def test_infonce_gradients(seed):
301+
pos_dist, neg_dist = _sample_dist_matrices(seed)
298302

299303
for i in range(3):
300304
pos_dist_ = pos_dist.clone()
@@ -312,7 +316,7 @@ def test_infonce_gradients():
312316
grad = _compute_grads(loss, [pos_dist_, neg_dist_])
313317

314318
# NOTE(stes) default relative tolerance is 1e-5
315-
assert torch.allclose(loss_ref, loss, rtol = 1e-4)
319+
assert torch.allclose(loss_ref, loss, rtol=1e-4)
316320

317321
if i == 0:
318322
assert grad[0] is not None

tests/test_load.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
import pathlib
1313
import pickle
1414
import tempfile
15+
import unittest
16+
from unittest.mock import patch
1517

1618
import h5py
1719
import hdf5storage
@@ -27,6 +29,7 @@
2729

2830
__test_functions = []
2931
__test_functions_error = []
32+
__test_functions_module_not_found = []
3033

3134

3235
def _skip_hdf5storage(*args, **kwargs):
@@ -42,7 +45,7 @@ def test_imports():
4245
assert hasattr(cebra, "load_data")
4346

4447

45-
def register(*file_endings):
48+
def register(*file_endings, requires=()):
4649
# for each file format
4750
def _register(f):
4851
# f is the filename
@@ -53,6 +56,12 @@ def _register(f):
5356
lambda filename: f(filename + "." + file_ending)
5457
for file_ending in file_endings
5558
])
59+
if len(requires) > 0:
60+
__test_functions_module_not_found.extend([
61+
(requires, lambda filename: filename + "." + file_ending,
62+
lambda filename: f(filename + "." + file_ending))
63+
for file_ending in file_endings
64+
])
5665
return f
5766

5867
return _register
@@ -152,7 +161,7 @@ def generate_numpy_no_array(filename):
152161
# TODO: test raise ModuleFoundError for h5py
153162

154163

155-
@register("h5", "hdf", "hdf5", "h")
164+
@register("h5", "hdf", "hdf5", "h", requires=("h5py",))
156165
def generate_h5(filename):
157166
A = np.arange(1000).reshape(10, 100)
158167
with h5py.File(filename, "w") as hf:
@@ -380,7 +389,7 @@ def generate_wrong_key(filename):
380389

381390

382391
#### .CSV ####
383-
@register("csv")
392+
@register("csv", requires=("pandas",))
384393
def generate_csv(filename):
385394
A = np.arange(1000).reshape(10, 100)
386395
pd.DataFrame(A).to_csv(filename, header=False, index=False, sep=",")
@@ -404,7 +413,7 @@ def generate_csv_empty_file(filename):
404413

405414

406415
#### EXCEL ####
407-
@register("xls", "xlsx", "xlsm")
416+
@register("xls", "xlsx", "xlsm", requires=("pandas",))
408417
# TODO(celia): add the following extension: "xlsb", "odf", "ods", "odt",
409418
# issue to create the files
410419
def generate_excel(filename):
@@ -777,3 +786,23 @@ def test_load_error(save_data):
777786

778787
with pytest.raises((AttributeError, TypeError)):
779788
save_data(filename)
789+
790+
791+
@pytest.mark.parametrize("module_names,get_path,save_data",
792+
__test_functions_module_not_found)
793+
def test_module_not_installed(module_names, get_path, save_data):
794+
795+
assert len(module_names) > 0
796+
assert isinstance(module_names, tuple)
797+
798+
with tempfile.NamedTemporaryFile() as tf:
799+
filename = tf.name
800+
801+
saved_array, loaded_array = save_data(filename)
802+
assert np.allclose(saved_array, loaded_array)
803+
804+
# TODO(stes): Sketch for a test --- needs additional work.
805+
# with patch.dict('sys.modules', {module: None for module in module_names}):
806+
# path = get_path(filename)
807+
#     with pytest.raises(ModuleNotFoundError, match=r"cebra\[datasets\]"):
808+
# cebra.data.load.load(path)

0 commit comments

Comments
 (0)