Utilities for validating data normalization and standardization (#223)

Balandat · facebook-github-bot · commit a107ff95a34c · 2019-08-09T14:36:41.000-07:00
Summary: Adds utilities that make it easy to check whether input data is free of NaNs, normalized (for inputs), and standardized (for targets). Addresses part of #209 - these utilities will need to be called in the various model constructors, but that will be a separate PR. We should add some `debug` setting to `settings.py` (on by default) that calls these checks on the input data with `raise_on_fail=False`. Pull Request resolved: #223 Reviewed By: sdaulton Differential Revision: D16685285 Pulled By: Balandat fbshipit-source-id: eb54f9f4d383a1b3078f253b74f707d9c8213d86
diff --git a/botorch/exceptions/__init__.py b/botorch/exceptions/__init__.py
@@ -2,10 +2,16 @@
 
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 
-from .errors import BotorchError, CandidateGenerationError, UnsupportedError
+from .errors import (
+    BotorchError,
+    CandidateGenerationError,
+    InputDataError,
+    UnsupportedError,
+)
 from .warnings import (
     BadInitialCandidatesWarning,
     BotorchWarning,
+    InputDataWarning,
     OptimizationWarning,
     SamplingWarning,
 )
@@ -16,6 +22,8 @@
     "CandidateGenerationError",
     "UnsupportedError",
     "BotorchWarning",
+    "InputDataWarning",
+    "InputDataError",
     "BadInitialCandidatesWarning",
     "OptimizationWarning",
     "SamplingWarning",
diff --git a/botorch/exceptions/errors.py b/botorch/exceptions/errors.py
@@ -19,6 +19,12 @@ class CandidateGenerationError(BotorchError):
     pass
 
 
+class InputDataError(BotorchError):
+    r"""Exception raised when input data does not comply with conventions."""
+
+    pass
+
+
 class UnsupportedError(BotorchError):
     r"""Currently unsupported feature."""
 
diff --git a/botorch/exceptions/warnings.py b/botorch/exceptions/warnings.py
@@ -13,14 +13,20 @@ class BotorchWarning(Warning):
     pass
 
 
-class OptimizationWarning(BotorchWarning):
-    r"""Optimization-releated warnings."""
+class BadInitialCandidatesWarning(BotorchWarning):
+    r"""Warning issued if set of initial candidates for optimization is bad."""
 
     pass
 
 
-class BadInitialCandidatesWarning(BotorchWarning):
-    r"""Warning issued if set of initial candidates for optimziation is bad."""
+class InputDataWarning(BotorchWarning):
+    r"""Warning raised when input data does not comply with conventions."""
+
+    pass
+
+
+class OptimizationWarning(BotorchWarning):
+    r"""Optimization-related warnings."""
 
     pass
 
diff --git a/botorch/models/utils.py b/botorch/models/utils.py
@@ -6,12 +6,15 @@
 Utiltiy functions for models.
 """
 
+import warnings
 from typing import List, Optional, Tuple
 
 import torch
 from gpytorch.utils.broadcasting import _mul_broadcast_shape
 from torch import Tensor
 
+from ..exceptions import InputDataError, InputDataWarning
+
 
 def _make_X_full(X: Tensor, output_indices: List[int], tf: int) -> Tensor:
     r"""Helper to construct input tensor with task indices.
@@ -107,3 +110,73 @@ def add_output_dim(X: Tensor, original_batch_shape: torch.Size) -> Tuple[Tensor,
     X = X.unsqueeze(-3)
     output_dim_idx = max(len(original_batch_shape), len(X_batch_shape))
     return X, output_dim_idx
+
+
+def check_no_nans(Z: Tensor) -> None:
+    r"""Check that tensor does not contain NaN values.
+
+    Raises an InputDataError if `Z` contains NaN values.
+
+    Args:
+        Z: The input tensor.
+    """
+    if torch.any(torch.isnan(Z)).item():
+        raise InputDataError("Input data contains NaN values.")
+
+
+def check_min_max_scaling(
+    X: Tensor, strict: bool = False, atol: float = 1e-2, raise_on_fail: bool = False
+) -> None:
+    r"""Check that tensor is normalized to the unit cube.
+
+    Args:
+        X: A `batch_shape x n x d` input tensor. Typically the training inputs
+            of a model.
+        strict: If True, require `X` to be scaled to the unit cube (rather than
+            just to be contained within the unit cube).
+        atol: The tolerance used for both the containment and the strict check.
+        raise_on_fail: If True, raise an exception instead of a warning.
+    """
+    with torch.no_grad():
+        Xmin, Xmax = torch.min(X, dim=-1)[0], torch.max(X, dim=-1)[0]
+        msg = None
+        if strict and max(torch.abs(Xmin).max(), torch.abs(Xmax - 1).max()) > atol:
+            msg = "scaled"
+        if torch.any(Xmin < -atol) or torch.any(Xmax > 1 + atol):
+            msg = "contained"
+        if msg is not None:
+            msg = (
+                f"Input data is not {msg} to the unit cube. "
+                "Please consider min-max scaling the input data."
+            )
+            if raise_on_fail:
+                raise InputDataError(msg)
+            warnings.warn(msg, InputDataWarning)
+
+
+def check_standardization(
+    Y: Tensor,
+    atol_mean: float = 1e-2,
+    atol_std: float = 1e-2,
+    raise_on_fail: bool = False,
+) -> None:
+    r"""Check that tensor is standardized (zero mean, unit variance).
+
+    Args:
+        Y: The input tensor of shape `batch_shape x n x m`. Typically the
+            train targets of a model. Standardization is checked across the
+            `n`-dimension.
+        atol_mean: The tolerance for the mean check.
+        atol_std: The tolerance for the std check.
+        raise_on_fail: If True, raise an exception instead of a warning.
+    """
+    with torch.no_grad():
+        Ymean, Ystd = torch.mean(Y, dim=-2), torch.std(Y, dim=-2)
+        if torch.abs(Ymean).max() > atol_mean or torch.abs(Ystd - 1).max() > atol_std:
+            msg = (
+                "Input data is not standardized. Please consider scaling the "
+                "input to zero mean and unit variance."
+            )
+            if raise_on_fail:
+                raise InputDataError(msg)
+            warnings.warn(msg, InputDataWarning)
diff --git a/test/exceptions/test_errors.py b/test/exceptions/test_errors.py
@@ -7,6 +7,7 @@
 from botorch.exceptions.errors import (
     BotorchError,
     CandidateGenerationError,
+    InputDataError,
     UnsupportedError,
 )
 
@@ -15,12 +16,15 @@ class TestBotorchExceptions(unittest.TestCase):
     def test_botorch_exception_hierarchy(self):
         self.assertIsInstance(BotorchError(), Exception)
         self.assertIsInstance(CandidateGenerationError(), BotorchError)
+        self.assertIsInstance(InputDataError(), BotorchError)
         self.assertIsInstance(UnsupportedError(), BotorchError)
 
     def test_raise_botorch_exceptions(self):
-        with self.assertRaises(BotorchError):
-            raise BotorchError("message")
-        with self.assertRaises(CandidateGenerationError):
-            raise CandidateGenerationError("message")
-        with self.assertRaises(UnsupportedError):
-            raise UnsupportedError("message")
+        for ErrorClass in (
+            BotorchError,
+            CandidateGenerationError,
+            InputDataError,
+            UnsupportedError,
+        ):
+            with self.assertRaises(ErrorClass):
+                raise ErrorClass("message")
diff --git a/test/exceptions/test_warnings.py b/test/exceptions/test_warnings.py
@@ -8,6 +8,7 @@
 from botorch.exceptions.warnings import (
     BadInitialCandidatesWarning,
     BotorchWarning,
+    InputDataWarning,
     OptimizationWarning,
     SamplingWarning,
 )
@@ -17,18 +18,20 @@ class TestBotorchWarnings(unittest.TestCase):
     def test_botorch_warnings_hierarchy(self):
         self.assertIsInstance(BotorchWarning(), Warning)
         self.assertIsInstance(BadInitialCandidatesWarning(), BotorchWarning)
+        self.assertIsInstance(InputDataWarning(), BotorchWarning)
         self.assertIsInstance(OptimizationWarning(), BotorchWarning)
         self.assertIsInstance(SamplingWarning(), BotorchWarning)
 
     def test_botorch_warnings(self):
         for WarningClass in (
             BotorchWarning,
             BadInitialCandidatesWarning,
+            InputDataWarning,
             OptimizationWarning,
             SamplingWarning,
         ):
-            with warnings.catch_warnings(record=True) as w:
+            with warnings.catch_warnings(record=True) as ws:
                 warnings.warn("message", WarningClass)
-                self.assertEqual(len(w), 1)
-                self.assertTrue(issubclass(w[-1].category, WarningClass))
-                self.assertTrue("message" in str(w[-1].message))
+                self.assertEqual(len(ws), 1)
+                self.assertTrue(issubclass(ws[-1].category, WarningClass))
+                self.assertTrue("message" in str(ws[-1].message))
diff --git a/test/models/test_utils.py b/test/models/test_utils.py
@@ -3,9 +3,17 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 
 import unittest
+import warnings
 
 import torch
-from botorch.models.utils import add_output_dim, multioutput_to_batch_mode_transform
+from botorch.exceptions import InputDataError, InputDataWarning
+from botorch.models.utils import (
+    add_output_dim,
+    check_min_max_scaling,
+    check_no_nans,
+    check_standardization,
+    multioutput_to_batch_mode_transform,
+)
 
 
 class TestMultiOutputToBatchModeTransform(unittest.TestCase):
@@ -80,3 +88,72 @@ def test_add_output_dim(self, cuda=False):
     def test_add_output_dim_cuda(self, cuda=False):
         if torch.cuda.is_available():
             self.test_add_output_dim(cuda=True)
+
+
+class TestInputDataChecks(unittest.TestCase):
+    def test_check_no_nans(self):
+        check_no_nans(torch.tensor([1.0, 2.0]))
+        with self.assertRaises(InputDataError):
+            check_no_nans(torch.tensor([1.0, float("nan")]))
+
+    def test_check_min_max_scaling(self):
+        # check unscaled input in unit cube
+        X = 0.1 + 0.8 * torch.rand(4, 2, 3)
+        with warnings.catch_warnings(record=True) as ws:
+            check_min_max_scaling(X=X)
+            self.assertFalse(any(issubclass(w.category, InputDataWarning) for w in ws))
+        check_min_max_scaling(X=X, raise_on_fail=True)
+        with warnings.catch_warnings(record=True) as ws:
+            check_min_max_scaling(X=X, strict=True)
+            self.assertTrue(any(issubclass(w.category, InputDataWarning) for w in ws))
+            self.assertTrue(any("not scaled" in str(w.message) for w in ws))
+        with self.assertRaises(InputDataError):
+            check_min_max_scaling(X=X, strict=True, raise_on_fail=True)
+        # check proper input
+        Xmin, Xmax = X.min(dim=-1, keepdim=True)[0], X.max(dim=-1, keepdim=True)[0]
+        Xstd = (X - Xmin) / (Xmax - Xmin)
+        with warnings.catch_warnings(record=True) as ws:
+            check_min_max_scaling(X=Xstd)
+            self.assertFalse(any(issubclass(w.category, InputDataWarning) for w in ws))
+        check_min_max_scaling(X=Xstd, raise_on_fail=True)
+        with warnings.catch_warnings(record=True) as ws:
+            check_min_max_scaling(X=Xstd, strict=True)
+            self.assertFalse(any(issubclass(w.category, InputDataWarning) for w in ws))
+        check_min_max_scaling(X=Xstd, strict=True, raise_on_fail=True)
+        # check violation
+        X[0, 0, 0] = 2
+        with warnings.catch_warnings(record=True) as ws:
+            check_min_max_scaling(X=X)
+            self.assertTrue(any(issubclass(w.category, InputDataWarning) for w in ws))
+            self.assertTrue(any("not contained" in str(w.message) for w in ws))
+        with self.assertRaises(InputDataError):
+            check_min_max_scaling(X=X, raise_on_fail=True)
+        with warnings.catch_warnings(record=True) as ws:
+            check_min_max_scaling(X=X, strict=True)
+            self.assertTrue(any(issubclass(w.category, InputDataWarning) for w in ws))
+            self.assertTrue(any("not contained" in str(w.message) for w in ws))
+        with self.assertRaises(InputDataError):
+            check_min_max_scaling(X=X, strict=True, raise_on_fail=True)
+
+    def test_check_standardization(self):
+        Y = torch.randn(3, 4, 2)
+        # check standardized input
+        Yst = (Y - Y.mean(dim=-2, keepdim=True)) / Y.std(dim=-2, keepdim=True)
+        with warnings.catch_warnings(record=True) as ws:
+            check_standardization(Y=Yst)
+            self.assertFalse(any(issubclass(w.category, InputDataWarning) for w in ws))
+        check_standardization(Y=Yst, raise_on_fail=True)
+        # check nonzero mean
+        with warnings.catch_warnings(record=True) as ws:
+            check_standardization(Y=Yst + 1)
+            self.assertTrue(any(issubclass(w.category, InputDataWarning) for w in ws))
+            self.assertTrue(any("not standardized" in str(w.message) for w in ws))
+        with self.assertRaises(InputDataError):
+            check_standardization(Y=Yst + 1, raise_on_fail=True)
+        # check non-unit variance
+        with warnings.catch_warnings(record=True) as ws:
+            check_standardization(Y=Yst * 2)
+            self.assertTrue(any(issubclass(w.category, InputDataWarning) for w in ws))
+            self.assertTrue(any("not standardized" in str(w.message) for w in ws))
+        with self.assertRaises(InputDataError):
+            check_standardization(Y=Yst * 2, raise_on_fail=True)