diff --git a/gpytorch/functions/__init__.py b/gpytorch/functions/__init__.py index e94f7e5aa..ad62644d9 100644 --- a/gpytorch/functions/__init__.py +++ b/gpytorch/functions/__init__.py @@ -64,7 +64,14 @@ def dsmm(sparse_mat, dense_mat): def exact_predictive_mean( - full_covar, full_mean, train_labels, num_train, likelihood, precomputed_cache=None, non_batch_train=False + full_covar, + full_mean, + train_inputs, + train_labels, + num_train, + likelihood, + precomputed_cache=None, + non_batch_train=False, ): """ Computes the posterior predictive mean of a GP @@ -73,6 +80,7 @@ def exact_predictive_mean( - full_covar ( (n+t) x (n+t) ) - the block prior covariance matrix of training and testing points [ K_XX, K_XX*; K_X*X, K_X*X* ] - full_mean (n + t) - the training and test prior means, stacked on top of each other + - train_inputs (:obj:`torch.tensor`) - The training data inputs - train_labels (n) - the training labels minus the training prior mean - noise (1) - the observed noise (from the likelihood) - precomputed_cache - speeds up subsequent computations (default: None) @@ -88,17 +96,20 @@ def exact_predictive_mean( full_covar = NonLazyTensor(full_covar) return full_covar.exact_predictive_mean( - full_mean, train_labels, num_train, likelihood, precomputed_cache, non_batch_train + full_mean, train_inputs, train_labels, num_train, likelihood, precomputed_cache, non_batch_train ) -def exact_predictive_covar(full_covar, num_train, likelihood, precomputed_cache=None, non_batch_train=False): +def exact_predictive_covar( + full_covar, train_inputs, num_train, likelihood, precomputed_cache=None, non_batch_train=False +): """ Computes the posterior predictive covariance of a GP Args: - full_covar ( (n+t) x (n+t) ) - the block prior covariance matrix of training and testing points [ K_XX, K_XX*; K_X*X, K_X*X* ] + - train_inputs (:obj:`torch.tensor`) - The training data inputs - num_train (int) - how many training points are there in the full covariance matrix - noise (1) - the observed noise (from the likelihood) - precomputed_cache - speeds up subsequent computations (default: None) @@ -113,7 +124,8 @@ def exact_predictive_covar(full_covar, num_train, likelihood, precomputed_cache= from ..lazy.non_lazy_tensor import NonLazyTensor full_covar = NonLazyTensor(full_covar) - return full_covar.exact_predictive_covar(num_train, likelihood, precomputed_cache, non_batch_train) + + return full_covar.exact_predictive_covar(train_inputs, num_train, likelihood, precomputed_cache, non_batch_train) def log_normal_cdf(x): diff --git a/gpytorch/kernels/__init__.py b/gpytorch/kernels/__init__.py index 0025bd8c8..591786d3e 100644 --- a/gpytorch/kernels/__init__.py +++ b/gpytorch/kernels/__init__.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 -from .kernel import Kernel, AdditiveKernel, ProductKernel from .additive_structure_kernel import AdditiveStructureKernel from .cosine_kernel import CosineKernel -from .grid_kernel import GridKernel from .grid_interpolation_kernel import GridInterpolationKernel +from .grid_kernel import GridKernel from .index_kernel import IndexKernel from .inducing_point_kernel import InducingPointKernel +from .kernel import AdditiveKernel, Kernel, ProductKernel from .lcm_kernel import LCMKernel from .linear_kernel import LinearKernel from .matern_kernel import MaternKernel @@ -18,6 +18,7 @@ from .spectral_mixture_kernel import SpectralMixtureKernel from .white_noise_kernel import WhiteNoiseKernel + __all__ = [ "Kernel", "AdditiveKernel", @@ -27,7 +28,6 @@ "GridInterpolationKernel", "IndexKernel", "InducingPointKernel", - 
"InducingPointKernelAddedLossTerm", "LCMKernel", "LinearKernel", "MaternKernel", diff --git a/gpytorch/kernels/white_noise_kernel.py b/gpytorch/kernels/white_noise_kernel.py index 8c453edd9..668382428 100644 --- a/gpytorch/kernels/white_noise_kernel.py +++ b/gpytorch/kernels/white_noise_kernel.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import torch + from . import Kernel from ..lazy import DiagLazyTensor, ZeroLazyTensor @@ -55,4 +56,4 @@ def forward(self, x1, x2, **params): elif x1.size(-2) == x2.size(-2) and x1.size(-2) == self.variances.size(1) and torch.equal(x1, x2): return DiagLazyTensor(self.variances.view(self.variances.size(0), -1)) else: - return ZeroLazyTensor(x1.size(-3), x1.size(-2), x2.size(-2)) + return ZeroLazyTensor(x1.size(-3), x1.size(-2), x2.size(-2), dtype=x1.dtype, device=x1.device) diff --git a/gpytorch/lazy/interpolated_lazy_tensor.py b/gpytorch/lazy/interpolated_lazy_tensor.py index c03e01692..f36e045e0 100644 --- a/gpytorch/lazy/interpolated_lazy_tensor.py +++ b/gpytorch/lazy/interpolated_lazy_tensor.py @@ -363,7 +363,14 @@ def diag(self): return res def exact_predictive_mean( - self, full_mean, train_labels, num_train, likelihood, precomputed_cache=None, non_batch_train=False + self, + full_mean, + train_inputs, + train_labels, + num_train, + likelihood, + precomputed_cache=None, + non_batch_train=False, ): from ..distributions import MultivariateNormal @@ -382,7 +389,7 @@ def exact_predictive_mean( train_mean = full_mean.narrow(-1, 0, train_train_covar.size(-1)) - mvn = likelihood(MultivariateNormal(train_mean, train_train_covar)) + mvn = likelihood(MultivariateNormal(train_mean, train_train_covar), train_inputs) train_mean, train_train_covar = mvn.mean, mvn.lazy_covariance_matrix train_train_covar_inv_labels = train_train_covar.inv_matmul((train_labels - train_mean).unsqueeze(-1)) @@ -422,11 +429,15 @@ def _exact_predictive_covar_inv_quad_form_root(self, precomputed_cache, test_tra res = left_interp(test_interp_indices, test_interp_values, precomputed_cache) return res - def exact_predictive_covar(self, num_train, likelihood, precomputed_cache=None, non_batch_train=False): + def exact_predictive_covar( + self, train_inputs, num_train, likelihood, precomputed_cache=None, non_batch_train=False + ): from ..distributions import MultivariateNormal if not beta_features.fast_pred_var.on() and not beta_features.fast_pred_samples.on(): - return super(InterpolatedLazyTensor, self).exact_predictive_covar(num_train, likelihood, precomputed_cache) + return super(InterpolatedLazyTensor, self).exact_predictive_covar( + train_inputs, num_train, likelihood, precomputed_cache + ) n_test = self.size(-2) - num_train train_interp_indices = self.left_interp_indices.narrow(-2, 0, num_train) @@ -452,7 +463,7 @@ def exact_predictive_covar(self, num_train, likelihood, precomputed_cache=None, ) grv = MultivariateNormal(torch.zeros(1), train_train_covar) - train_train_covar = likelihood(grv).lazy_covariance_matrix + train_train_covar = likelihood(grv, train_inputs).lazy_covariance_matrix # Get probe vectors for inverse root num_probe_vectors = beta_features.fast_pred_var.num_probe_vectors() diff --git a/gpytorch/lazy/lazy_evaluated_kernel_tensor.py b/gpytorch/lazy/lazy_evaluated_kernel_tensor.py index bd1dc4742..af2f63585 100644 --- a/gpytorch/lazy/lazy_evaluated_kernel_tensor.py +++ b/gpytorch/lazy/lazy_evaluated_kernel_tensor.py @@ -44,6 +44,17 @@ def _quad_form_derivative(self, left_vecs, right_vecs): def _transpose_nonbatch(self): return self.__class__(self.kernel, self.x2, self.x1, 
**self.params) + def _batch_get_indices(self, batch_indices, left_indices, right_indices): + from ..kernels import Kernel + + x1 = self.x1[batch_indices, left_indices, :].unsqueeze(0) + x2 = self.x2[batch_indices, right_indices, :].unsqueeze(0) + res = super(Kernel, self.kernel).__call__(x1.transpose(-1, -2), x2.transpose(-1, -2)) + if isinstance(res, LazyTensor): + res = res.evaluate() + res = res.view(-1) + return res + def _get_indices(self, left_indices, right_indices): from ..kernels import Kernel @@ -166,25 +177,34 @@ def evaluate(self): return self.evaluate_kernel().evaluate() def exact_predictive_mean( - self, full_mean, train_labels, num_train, likelihood, precomputed_cache=None, non_batch_train=False + self, + full_mean, + train_inputs, + train_labels, + num_train, + likelihood, + precomputed_cache=None, + non_batch_train=False, ): if self.kernel.has_custom_exact_predictions: return self.evaluate_kernel().exact_predictive_mean( - full_mean, train_labels, num_train, likelihood, precomputed_cache, non_batch_train + full_mean, train_inputs, train_labels, num_train, likelihood, precomputed_cache, non_batch_train ) else: return super(LazyEvaluatedKernelTensor, self).exact_predictive_mean( - full_mean, train_labels, num_train, likelihood, precomputed_cache, non_batch_train + full_mean, train_inputs, train_labels, num_train, likelihood, precomputed_cache, non_batch_train ) - def exact_predictive_covar(self, num_train, likelihood, precomputed_cache=None, non_batch_train=False): + def exact_predictive_covar( + self, train_inputs, num_train, likelihood, precomputed_cache=None, non_batch_train=False + ): if self.kernel.has_custom_exact_predictions: return self.evaluate_kernel().exact_predictive_covar( - num_train, likelihood, precomputed_cache, non_batch_train + train_inputs, num_train, likelihood, precomputed_cache, non_batch_train ) else: return super(LazyEvaluatedKernelTensor, self).exact_predictive_covar( - num_train, likelihood, precomputed_cache, non_batch_train + train_inputs, num_train, likelihood, precomputed_cache, non_batch_train ) def repeat(self, *sizes): diff --git a/gpytorch/lazy/lazy_tensor.py b/gpytorch/lazy/lazy_tensor.py index 5872c87b7..c503b5658 100644 --- a/gpytorch/lazy/lazy_tensor.py +++ b/gpytorch/lazy/lazy_tensor.py @@ -505,7 +505,14 @@ def evaluate_kernel(self): return self.representation_tree()(*self.representation()) def exact_predictive_mean( - self, full_mean, train_labels, num_train, likelihood, precomputed_cache=None, non_batch_train=False + self, + full_mean, + train_inputs, + train_labels, + num_train, + likelihood, + precomputed_cache=None, + non_batch_train=False, ): """ Computes the posterior predictive covariance of a GP @@ -514,6 +521,7 @@ def exact_predictive_mean( Args: full_mean (:obj:`torch.tensor`): the training and test prior means, stacked on top of each other + train_inputs (:obj:`torch.tensor`): The training data inputs train_labels (:obj:`torch.tensor`): the training labels minus the training prior mean noise (:obj:`torch.tensor`): the observed noise (from the likelihood) precomputed_cache (optional): speeds up subsequent computations (default: None) @@ -537,7 +545,8 @@ def exact_predictive_mean( if non_batch_train and train_mean.dim() == 2: train_mean = train_mean[0] train_labels = train_labels[0] - mvn = likelihood(MultivariateNormal(train_mean, train_train_covar)) + mvn = likelihood(MultivariateNormal(train_mean, train_train_covar), train_inputs) + train_mean, train_train_covar = mvn.mean, mvn.lazy_covariance_matrix train_labels_offset = 
train_labels - train_mean @@ -563,13 +572,16 @@ def exact_predictive_mean( return res, precomputed_cache.detach() - def exact_predictive_covar(self, num_train, likelihood, precomputed_cache=None, non_batch_train=False): + def exact_predictive_covar( + self, train_inputs, num_train, likelihood, precomputed_cache=None, non_batch_train=False + ): """ Computes the posterior predictive covariance of a GP Assumes that self is the block prior covariance matrix of training and testing points [ K_XX, K_XX*; K_X*X, K_X*X* ] Args: + train_inputs (:obj:`torch.tensor`): The training data inputs num_train (int): The number of training points in the full covariance matrix noise (scalar): The observed noise (from the likelihood) precomputed_cache (optional): speeds up subsequent computations (default: None) @@ -589,7 +601,9 @@ def exact_predictive_covar(self, num_train, likelihood, precomputed_cache=None, test_train_covar = self[num_train:, :num_train] test_test_covar = self[num_train:, num_train:] - train_train_covar = likelihood(MultivariateNormal(torch.zeros(1), train_train_covar)).lazy_covariance_matrix + train_train_covar = likelihood( + MultivariateNormal(torch.zeros(1), train_train_covar), train_inputs + ).lazy_covariance_matrix if not beta_features.fast_pred_var.on(): from .matmul_lazy_tensor import MatmulLazyTensor diff --git a/gpytorch/likelihoods/__init__.py b/gpytorch/likelihoods/__init__.py index da966d001..d0634be70 100644 --- a/gpytorch/likelihoods/__init__.py +++ b/gpytorch/likelihoods/__init__.py @@ -1,15 +1,25 @@ #!/usr/bin/env python3 from .likelihood import Likelihood -from .gaussian_likelihood import GaussianLikelihood -from .multitask_gaussian_likelihood import MultitaskGaussianLikelihood from .bernoulli_likelihood import BernoulliLikelihood +from .gaussian_likelihood import GaussianLikelihood, _GaussianLikelihoodBase +from .multitask_gaussian_likelihood import ( + MultitaskGaussianLikelihood, + MultitaskGaussianLikelihoodKronecker, + _MultitaskGaussianLikelihoodBase, +) +from .noise_models import HeteroskedasticNoise from .softmax_likelihood import SoftmaxLikelihood + __all__ = [ - "Likelihood", + "_GaussianLikelihoodBase", + "_MultitaskGaussianLikelihoodBase", + "BernoulliLikelihood", "GaussianLikelihood", + "HeteroskedasticNoise", + "Likelihood", "MultitaskGaussianLikelihood", - "BernoulliLikelihood", + "MultitaskGaussianLikelihoodKronecker", "SoftmaxLikelihood", ] diff --git a/gpytorch/likelihoods/gaussian_likelihood.py b/gpytorch/likelihoods/gaussian_likelihood.py index ff60f3dd2..cf1ae6ff8 100644 --- a/gpytorch/likelihoods/gaussian_likelihood.py +++ b/gpytorch/likelihoods/gaussian_likelihood.py @@ -1,64 +1,89 @@ #!/usr/bin/env python3 import math -import torch + +from torch.nn.functional import softplus + +from .. import settings from ..distributions import MultivariateNormal -from ..functions import add_diag -from ..likelihoods import Likelihood from ..lazy import DiagLazyTensor -from .. import settings +from ..likelihoods import Likelihood from ..utils.deprecation import _deprecate_kwarg -from ..utils.transforms import _get_inv_param_transform -from torch.nn.functional import softplus +from .noise_models import HomoskedasticNoise + + +class _GaussianLikelihoodBase(Likelihood): + """Base class for Gaussian Likelihoods, supporting general heteroskedastic noise models. 
""" + + def __init__(self, noise_covar): + super().__init__() + self.noise_covar = noise_covar + + def forward(self, input, *params): + if not isinstance(input, MultivariateNormal): + raise ValueError("Gaussian likelihoods require a MultivariateNormal input") + mean, covar = input.mean, input.lazy_covariance_matrix + if len(params) > 0: + # we can infer the shape from the params + shape = None + else: + # here shape[:-1] is the batch shape requested, and shape[-1] is `n`, the number of points + shape = mean.shape if len(mean.shape) == 1 else mean.shape[:-1] + noise_covar = self.noise_covar(*params, shape=shape) + full_covar = covar + noise_covar + return input.__class__(mean, full_covar) + def variational_log_probability(self, input, target): + raise NotImplementedError -class GaussianLikelihood(Likelihood): - r""" - """ +class GaussianLikelihood(_GaussianLikelihoodBase): def __init__(self, noise_prior=None, batch_size=1, param_transform=softplus, inv_param_transform=None, **kwargs): noise_prior = _deprecate_kwarg(kwargs, "log_noise_prior", "noise_prior", noise_prior) - super(GaussianLikelihood, self).__init__() - self._param_transform = param_transform - self._inv_param_transform = _get_inv_param_transform(param_transform, inv_param_transform) - self.register_parameter(name="raw_noise", parameter=torch.nn.Parameter(torch.zeros(batch_size, 1))) - if noise_prior is not None: - self.register_prior("noise_prior", noise_prior, lambda: self.noise, lambda v: self._set_noise(v)) + noise_covar = HomoskedasticNoise( + noise_prior=noise_prior, + batch_size=batch_size, + param_transform=param_transform, + inv_param_transform=inv_param_transform, + ) + super().__init__(noise_covar=noise_covar) + + def _param_transform(self, value): + return self.noise_covar._param_transform(value) + + def _inv_param_transform(self, value): + return self.noise_covar._inv_param_transform(value) @property def noise(self): - return self._param_transform(self.raw_noise) + return self.noise_covar.noise @noise.setter def noise(self, value): - self._set_noise(value) - - def _set_noise(self, value): - self.initialize(raw_noise=self._inv_param_transform(value)) + self.noise_covar.noise = value - def forward(self, input): - if not isinstance(input, MultivariateNormal): - raise ValueError("GaussianLikelihood requires a MultivariateNormal input") - mean, covar = input.mean, input.lazy_covariance_matrix - noise = self.noise - if covar.ndimension() == 2: - if settings.debug.on() and noise.size(0) > 1: - raise RuntimeError("With batch_size > 1, expected a batched MultivariateNormal distribution.") - noise = noise.squeeze(0) + @property + def raw_noise(self): + return self.noise_covar.raw_noise - return input.__class__(mean, add_diag(covar, noise)) + @raw_noise.setter + def raw_noise(self, value): + self.noise_covar.raw_noise = value def variational_log_probability(self, input, target): mean, variance = input.mean, input.variance - log_noise = self.log_noise + noise = self.noise_covar.noise + + if mean.dim() > target.dim(): + target = target.unsqueeze(-1) if variance.ndimension() == 1: - if settings.debug.on() and log_noise.size(0) > 1: + if settings.debug.on() and noise.size(0) > 1: raise RuntimeError("With batch_size > 1, expected a batched MultivariateNormal distribution.") - log_noise = log_noise.squeeze(0) + noise = noise.squeeze(0) - res = -0.5 * ((target - mean) ** 2 + variance) / self.noise - res += -0.5 * log_noise - 0.5 * math.log(2 * math.pi) + res = -0.5 * ((target - mean) ** 2 + variance) / noise + res += -0.5 * 
noise.log() - 0.5 * math.log(2 * math.pi) return res.sum(-1) def pyro_sample_y(self, variational_dist_f, y_obs, sample_shape, name_prefix=""): diff --git a/gpytorch/likelihoods/multitask_gaussian_likelihood.py b/gpytorch/likelihoods/multitask_gaussian_likelihood.py index afb3d4245..1651b48a4 100644 --- a/gpytorch/likelihoods/multitask_gaussian_likelihood.py +++ b/gpytorch/likelihoods/multitask_gaussian_likelihood.py @@ -1,29 +1,137 @@ #!/usr/bin/env python3 import torch -from ..functions import add_diag -from ..lazy import DiagLazyTensor, KroneckerProductLazyTensor, RootLazyTensor -from ..likelihoods import GaussianLikelihood +from torch.nn.functional import softplus + from .. import settings +from ..functions import add_diag +from ..lazy import ( + BlockDiagLazyTensor, + DiagLazyTensor, + KroneckerProductLazyTensor, + MatmulLazyTensor, + NonLazyTensor, + RootLazyTensor, +) +from ..likelihoods import Likelihood, _GaussianLikelihoodBase from ..utils.deprecation import _deprecate_kwarg -from torch.nn.functional import softplus +from ..utils.transforms import _get_inv_param_transform +from .noise_models import MultitaskHomoskedasticNoise + + +class _MultitaskGaussianLikelihoodBase(_GaussianLikelihoodBase): + """Base class for multi-task Gaussian Likelihoods, supporting general heteroskedastic noise models. """ + + def __init__(self, num_tasks, noise_covar, rank=0, task_correlation_prior=None, batch_size=1): + """ + Args: + num_tasks (int): + Number of tasks. + noise_covar (:obj:`gpytorch.module.Module`): + A model for the noise covariance. This can be a simple homoskedastic noise model, or a GP + that is to be fitted on the observed measurement errors. + rank (int): + The rank of the task noise covariance matrix to fit. If `rank` is set to 0, then a diagonal covariance + matrix is fit. + task_correlation_prior (:obj:`gpytorch.priors.Prior`): + Prior to use over the task noise correlation matrix. Only used when `rank` > 0. + batch_size (int): + Number of batches. + """ + super().__init__(noise_covar=noise_covar) + if rank != 0: + self.register_parameter( + name="task_noise_corr_factor", parameter=torch.nn.Parameter(torch.randn(batch_size, num_tasks, rank)) + ) + self.register_parameter( + name="task_noise_corr_diag", parameter=torch.nn.Parameter(torch.ones(batch_size, num_tasks)) + ) + if task_correlation_prior is not None: + self.register_prior( + "MultitaskErrorCorrelationPrior", task_correlation_prior, lambda: self._eval_corr_matrix + ) + elif task_correlation_prior is not None: + raise ValueError("Can only specify task_correlation_prior if rank>0") + self.num_tasks = num_tasks + self.rank = rank + + def _eval_corr_matrix(self): + corr_factor = self.task_noise_corr_factor.squeeze(0) + corr_diag = self.task_noise_corr_diag.squeeze(0) + M = corr_factor.matmul(corr_factor.transpose(-1, -2)) + idx = torch.arange(M.shape[-1], dtype=torch.long, device=M.device) + M[..., idx, idx] += corr_diag + sem_inv = 1 / torch.diagonal(M, dim1=-2, dim2=-1).sqrt().unsqueeze(-1) + return M * sem_inv.matmul(sem_inv.transpose(-1, -2)) + + def forward(self, input, *params): + """ + Adds the task noises to the diagonal of the covariance matrix of the supplied + :obj:`gpytorch.distributions.MultivariateNormal` or :obj:`gpytorch.distributions.MultitaskMultivariateNormal`, + in case of `rank` == 0. Otherwise, adds a rank `rank` covariance matrix to it. 
+ + This scales the task correlations appropriately by the variances at the different points provided + by the noise variance model (evaluated at the provided params) + """ + mean, covar = input.mean, input.lazy_covariance_matrix + batch_shape, n = covar.shape[:-2], covar.shape[-1] // self.num_tasks + + if len(batch_shape) > 1: + raise NotImplementedError("Batch shapes with dim > 1 not yet supported for MultiTask Likelihoods") + + # compute the noise covariance + if len(params) > 0: + shape = None + else: + shape = mean.shape if len(mean.shape) == 1 else mean.shape[:-1] + noise_covar = self.noise_covar(*params, shape=shape) + + if hasattr(self, "task_noise_corr_factor"): + # if rank > 0, compute the task correlation matrix + # TODO: This is inefficient, change repeat so it can repeat LazyTensors w/ multiple batch dimensions + task_corr = self._eval_corr_matrix() + exp_shape = batch_shape + torch.Size([n]) + task_corr.shape[-2:] + if len(batch_shape) == 1: + task_corr = task_corr.unsqueeze(-3) + task_corr_exp = NonLazyTensor(task_corr.expand(exp_shape)) + noise_sem = noise_covar.sqrt() + task_covar_blocks = MatmulLazyTensor(MatmulLazyTensor(noise_sem, task_corr_exp), noise_sem) + else: + # otherwise tasks are uncorrelated + task_covar_blocks = noise_covar + + if len(batch_shape) == 1: + # TODO: Properly support general batch shapes in BlockDiagLazyTensor (no shape arithmetic) + tcb_eval = task_covar_blocks.evaluate() + task_covar = BlockDiagLazyTensor( + NonLazyTensor(tcb_eval.view(-1, *tcb_eval.shape[-2:])), num_blocks=tcb_eval.shape[0] + ) + else: + task_covar = BlockDiagLazyTensor(task_covar_blocks) + return input.__class__(mean, covar + task_covar) + + def variational_log_probability(self, input, target): + raise NotImplementedError("Variational inference with Multitask Gaussian likelihood is not yet supported") -class MultitaskGaussianLikelihood(GaussianLikelihood): +class MultitaskGaussianLikelihood(_MultitaskGaussianLikelihoodBase): """ A convenient extension of the :class:`gpytorch.likelihoods.GaussianLikelihood` to the multitask setting that allows for a full cross-task covariance structure for the noise. The fitted covariance matrix has rank `rank`. If a strictly diagonal task noise covariance matrix is desired, then rank=0 should be set. (This option still - allows for a different `log_noise` parameter for each task.) + allows for a different `log_noise` parameter for each task.) This likelihood assumes homoskedastic noise. Like the Gaussian likelihood, this object can be used with exact inference. + + Note: This does not yet support batched training and evaluation. If you need support for this, + use MultitaskGaussianLikelihoodKronecker for the time being. """ def __init__( self, num_tasks, rank=0, - task_prior=None, + task_correlation_prior=None, batch_size=1, noise_prior=None, param_transform=softplus, @@ -37,18 +145,94 @@ def __init__( rank (int): The rank of the task noise covariance matrix to fit. If `rank` is set to 0, then a diagonal covariance matrix is fit. - task_prior (:obj:`gpytorch.priors.Prior`): Prior to use over the task noise covariance matrix if - `rank` > 0, or a prior over the log of just the diagonal elements, if `rank` == 0. + task_correlation_prior (:obj:`gpytorch.priors.Prior`): Prior to use over the task noise correlation matrix. + Only used when `rank` > 0. 
""" - noise_prior = _deprecate_kwarg(kwargs, "log_noise_prior", "noise_prior", noise_prior) - super(MultitaskGaussianLikelihood, self).__init__( - batch_size=batch_size, + task_correlation_prior = _deprecate_kwarg( + kwargs, "task_prior", "task_correlation_prior", task_correlation_prior + ) + noise_covar = MultitaskHomoskedasticNoise( + num_tasks=num_tasks, noise_prior=noise_prior, + batch_size=batch_size, param_transform=param_transform, inv_param_transform=inv_param_transform, ) + super().__init__( + num_tasks=num_tasks, + noise_covar=noise_covar, + rank=rank, + task_correlation_prior=task_correlation_prior, + batch_size=batch_size, + ) + self._param_transform = param_transform + self._inv_param_transform = _get_inv_param_transform(param_transform, inv_param_transform) + self.register_parameter(name="raw_noise", parameter=torch.nn.Parameter(torch.zeros(batch_size, 1))) + + @property + def noise(self): + return self._param_transform(self.raw_noise) + + @noise.setter + def noise(self, value): + self._set_noise(value) + def _set_noise(self, value): + self.initialize(raw_noise=self._inv_param_transform(value)) + + def forward(self, input, *params): + mvn = super().forward(input, *params) + mean, covar = mvn.mean, mvn.lazy_covariance_matrix + noise = self.noise + if covar.ndimension() == 2: + if settings.debug.on() and noise.size(0) > 1: + raise RuntimeError("With batch_size > 1, expected a batched MultitaskMultivariateNormal distribution.") + noise = noise.squeeze(0) + covar = add_diag(covar, noise) + return input.__class__(mean, covar) + + +class MultitaskGaussianLikelihoodKronecker(_MultitaskGaussianLikelihoodBase): + """ + A convenient extension of the :class:`gpytorch.likelihoods.GaussianLikelihood` to the multitask setting that allows + for a full cross-task covariance structure for the noise. The fitted covariance matrix has rank `rank`. + If a strictly diagonal task noise covariance matrix is desired, then rank=0 should be set. (This option still + allows for a different `noise` parameter for each task.) + + Like the Gaussian likelihood, this object can be used with exact inference. + + Note: This Likelihood is scheduled to be deprecated and replaced by an improved version of + `MultitaskGaussianLikelihood`. Use this only for compatibility with batched Multitask models. + """ + + def __init__( + self, + num_tasks, + rank=0, + task_prior=None, + batch_size=1, + noise_prior=None, + param_transform=softplus, + inv_param_transform=None, + **kwargs + ): + """ + Args: + num_tasks (int): Number of tasks. + + rank (int): The rank of the task noise covariance matrix to fit. If `rank` is set to 0, + then a diagonal covariance matrix is fit. + + task_prior (:obj:`gpytorch.priors.Prior`): Prior to use over the task noise covariance matrix if + `rank` > 0, or a prior over the log of just the diagonal elements, if `rank` == 0. 
+ + """ + noise_prior = _deprecate_kwarg(kwargs, "log_noise_prior", "noise_prior", noise_prior) + super(Likelihood, self).__init__() + self._param_transform = param_transform + self._inv_param_transform = _get_inv_param_transform(param_transform, inv_param_transform) + self.register_parameter(name="raw_noise", parameter=torch.nn.Parameter(torch.zeros(batch_size, 1))) if rank == 0: self.register_parameter( name="raw_task_noises", parameter=torch.nn.Parameter(torch.zeros(batch_size, num_tasks)) @@ -62,25 +246,36 @@ def __init__( if task_prior is not None: self.register_prior("MultitaskErrorCovariancePrior", task_prior, self._eval_covar_matrix) self.num_tasks = num_tasks + self.rank = rank - def _eval_covar_matrix(self, task_noise_covar_factor, raw_noise): - num_tasks = task_noise_covar_factor.size(0) - noise = self._param_transform(raw_noise) - D = noise * torch.eye(num_tasks, dtype=noise.dtype, device=noise.device) - return task_noise_covar_factor.matmul(task_noise_covar_factor.transpose(-1, -2)) + D + @property + def noise(self): + return self._param_transform(self.raw_noise) - def forward(self, input): + @noise.setter + def noise(self, value): + self._set_noise(value) + + def _set_noise(self, value): + self.initialize(raw_noise=self._inv_param_transform(value)) + + def _eval_covar_matrix(self): + covar_factor = self.task_noise_covar_factor + noise = self.noise + D = noise * torch.eye(self.num_tasks, dtype=noise.dtype, device=noise.device) + return covar_factor.matmul(covar_factor.transpose(-1, -2)) + D + + def forward(self, input, *params): """ - Adds the log task noises to the diagonal of the covariance matrix of the supplied - :obj:`gpytorch.distributions.MultivariateNormal` or - :obj:`gpytorch.distributions.MultitaskMultivariateNormal`, in case of - `rank` == 0. Otherwise, adds a rank `rank` covariance matrix to it. + Adds the task noises to the diagonal of the covariance matrix of the supplied + :obj:`gpytorch.distributions.MultivariateNormal` or :obj:`gpytorch.distributions.MultitaskMultivariateNormal`, + in case of `rank` == 0. Otherwise, adds a rank `rank` covariance matrix to it. To accomplish this, we form a new :obj:`gpytorch.lazy.KroneckerProductLazyTensor` between :math:`I_{n}`, an identity matrix with size equal to the data and a (not necessarily diagonal) matrix containing the task noises :math:`D_{t}`. - We also incorporate a shared `raw_noise` parameter from the base + We also incorporate a shared `noise` parameter from the base :class:`gpytorch.likelihoods.GaussianLikelihood` that we extend. The final covariance matrix after this method is then :math:`K + D_{t} \otimes I_{n} + \sigma^{2}I_{nt}`. @@ -95,15 +290,16 @@ def forward(self, input): """ mean, covar = input.mean, input.lazy_covariance_matrix - if hasattr(self, "raw_task_noises"): - noises = self._param_transform(self.raw_task_noises) + if self.rank == 0: + task_noises = self._param_transform(self.raw_task_noises) if covar.ndimension() == 2: - if settings.debug.on() and noises.size(0) > 1: + if settings.debug.on() and task_noises.size(0) > 1: raise RuntimeError( "With batch_size > 1, expected a batched MultitaskMultivariateNormal distribution." 
) - noises = noises.squeeze(0) - task_var_lt = DiagLazyTensor(noises) + task_noises = task_noises.squeeze(0) + task_var_lt = DiagLazyTensor(task_noises) + device = task_noises.device else: task_noise_covar_factor = self.task_noise_covar_factor if covar.ndimension() == 2: @@ -113,13 +309,12 @@ def forward(self, input): ) task_noise_covar_factor = task_noise_covar_factor.squeeze(0) task_var_lt = RootLazyTensor(task_noise_covar_factor) + device = task_noise_covar_factor.device if covar.ndimension() == 2: - eye_lt = DiagLazyTensor(torch.ones(covar.size(-1) // self.num_tasks, device=self.log_noise.device)) + eye_lt = DiagLazyTensor(torch.ones(covar.size(-1) // self.num_tasks, device=device)) else: - eye_lt = DiagLazyTensor( - torch.ones(covar.size(0), covar.size(-1) // self.num_tasks, device=self.log_noise.device) - ) + eye_lt = DiagLazyTensor(torch.ones(covar.size(0), covar.size(-1) // self.num_tasks, device=device)) # Make sure the batch sizes are going to match if task_var_lt.size(0) == 1: task_var_lt = task_var_lt.repeat(eye_lt.size(0), 1, 1) @@ -135,6 +330,3 @@ def forward(self, input): covar = add_diag(covar, noise) return input.__class__(mean, covar) - - def variational_log_probability(self, input, target): - raise NotImplementedError("Variational inference with Multitask Gaussian likelihood is not yet supported") diff --git a/gpytorch/likelihoods/noise_models.py b/gpytorch/likelihoods/noise_models.py new file mode 100644 index 000000000..805570bba --- /dev/null +++ b/gpytorch/likelihoods/noise_models.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +import torch +from torch.nn import Parameter +from torch.nn.functional import softplus + +from ..distributions import MultivariateNormal +from ..lazy import DiagLazyTensor +from ..module import Module +from ..utils.broadcasting import _mul_broadcast_shape +from ..utils.transforms import _get_inv_param_transform + + +class _HomoskedasticNoiseBase(Module): + def __init__(self, noise_prior=None, batch_size=1, param_transform=softplus, inv_param_transform=None, num_tasks=1): + super().__init__() + self._param_transform = param_transform + self._inv_param_transform = _get_inv_param_transform(param_transform, inv_param_transform) + self.register_parameter(name="raw_noise", parameter=Parameter(torch.zeros(batch_size, num_tasks))) + if noise_prior is not None: + self.register_prior("noise_prior", noise_prior, lambda: self.noise, lambda v: self._set_noise(v)) + + @property + def noise(self): + return self._param_transform(self.raw_noise) + + @noise.setter + def noise(self, value): + self._set_noise(value) + + def _set_noise(self, value): + self.initialize(raw_noise=self._inv_param_transform(value)) + + def forward(self, *params, shape=None): + """In the homoskedastic case, the parameters are only used to infer the required shape. 
+ Here are the possible scenarios: + - non-batched noise, non-batched input, non-MT -> noise_diag shape is `n` + - non-batched noise, non-batched input, MT -> noise_diag shape is `nt` + - non-batched noise, batched input, non-MT -> noise_diag shape is `b x n` with b' the broadcasted batch shape + - non-batched noise, batched input, MT -> noise_diag shape is `b x nt` + - batched noise, non-batched input, non-MT -> noise_diag shape is `b x n` + - batched noise, non-batched input, MT -> noise_diag shape is `b x nt` + - batched noise, batched input, non-MT -> noise_diag shape is `b' x n` + - batched noise, batched input, MT -> noise_diag shape is `b' x nt` + where `n` is the number of evaluation points and `t` is the number of tasks (i.e. `num_tasks` of self.noise). + So basically the shape is always `b' x nt`, with `b'` appropriately broadcast from the noise parameter and + input batch shapes. `n` and the input batch shape are determined either from the shape arg or from the params + input. For this it is sufficient to take in a single `shape` arg, with the convention that shape[:-1] is the + batch shape of the input, and shape[-1] is `n`. + """ + if shape is None: + p = params[0] if torch.is_tensor(params[0]) else params[0][0] + shape = p.shape if len(p.shape) == 1 else p.shape[:-1] + noise = self.noise + batch_shape, n = shape[:-1], shape[-1] + noise_batch_shape = noise.shape[:-1] if noise.shape[-2] > 1 else torch.Size() + num_tasks = noise.shape[-1] + batch_shape = _mul_broadcast_shape(noise_batch_shape, batch_shape) + noise = noise.unsqueeze(-2) + if len(batch_shape) == 0: + noise = noise.squeeze(0) + noise_diag = noise.expand(batch_shape + torch.Size([n, num_tasks])).contiguous() + if num_tasks == 1: + noise_diag = noise_diag.view(*batch_shape, n) + return DiagLazyTensor(noise_diag) + + +class HomoskedasticNoise(_HomoskedasticNoiseBase): + def __init__(self, noise_prior=None, batch_size=1, param_transform=softplus, inv_param_transform=None): + super().__init__( + noise_prior=noise_prior, + batch_size=batch_size, + param_transform=param_transform, + inv_param_transform=inv_param_transform, + num_tasks=1, + ) + + +class MultitaskHomoskedasticNoise(_HomoskedasticNoiseBase): + def __init__(self, num_tasks, noise_prior=None, batch_size=1, param_transform=softplus, inv_param_transform=None): + super().__init__( + noise_prior=noise_prior, + batch_size=batch_size, + param_transform=param_transform, + inv_param_transform=inv_param_transform, + num_tasks=num_tasks, + ) + + +class HeteroskedasticNoise(Module): + def __init__(self, noise_model, noise_indices=None, noise_transform=torch.exp): + super().__init__() + self.noise_model = noise_model + self._noise_transform = noise_transform + self._noise_indices = noise_indices + self._noise_transform = noise_transform + + def forward(self, *params, batch_shape=None, shape=None): + if len(params) == 1 and not torch.is_tensor(params[0]): + output = self.noise_model(*params[0]) + else: + output = self.noise_model(*params) + if not isinstance(output, MultivariateNormal): + raise NotImplementedError("Currently only noise models that return a MultivariateNormal are supported") + # note: this also works with MultitaskMultivariateNormal, where this + # will return a batched DiagLazyTensor of size n x num_tasks x num_tasks + noise_diag = output.mean if self._noise_indices is None else output.mean[..., self._noise_indices] + return DiagLazyTensor(self._noise_transform(noise_diag)) diff --git a/gpytorch/mlls/exact_marginal_log_likelihood.py 
b/gpytorch/mlls/exact_marginal_log_likelihood.py index 4f3faa4f7..36dfd15d3 100644 --- a/gpytorch/mlls/exact_marginal_log_likelihood.py +++ b/gpytorch/mlls/exact_marginal_log_likelihood.py @@ -2,7 +2,7 @@ import torch from .marginal_log_likelihood import MarginalLogLikelihood -from ..likelihoods import GaussianLikelihood +from ..likelihoods import _GaussianLikelihoodBase from ..distributions import MultivariateNormal @@ -15,16 +15,16 @@ def __init__(self, likelihood, model): - likelihood: (Likelihood) - the likelihood for the model - model: (Module) - the exact GP model """ - if not isinstance(likelihood, GaussianLikelihood): + if not isinstance(likelihood, _GaussianLikelihoodBase): raise RuntimeError("Likelihood must be Gaussian for exact inference") super(ExactMarginalLogLikelihood, self).__init__(likelihood, model) - def forward(self, output, target): + def forward(self, output, target, *params): if not isinstance(output, MultivariateNormal): raise RuntimeError("ExactMarginalLogLikelihood can only operate on Gaussian random variables") # Get the log prob of the marginal distribution - output = self.likelihood(output) + output = self.likelihood(output, *params) res = output.log_prob(target) # Add terms for SGPR / when inducing points are learned diff --git a/gpytorch/models/exact_gp.py b/gpytorch/models/exact_gp.py index d8ac46819..c91726467 100644 --- a/gpytorch/models/exact_gp.py +++ b/gpytorch/models/exact_gp.py @@ -4,7 +4,7 @@ import torch from ..functions import exact_predictive_mean, exact_predictive_covar from ..distributions import MultivariateNormal, MultitaskMultivariateNormal -from ..likelihoods import GaussianLikelihood +from ..likelihoods import _GaussianLikelihoodBase from .. import settings from .gp import GP @@ -15,8 +15,8 @@ def __init__(self, train_inputs, train_targets, likelihood): train_inputs = (train_inputs,) if train_inputs is not None and not all(torch.is_tensor(train_input) for train_input in train_inputs): raise RuntimeError("Train inputs must be a tensor, or a list/tuple of tensors") - if not isinstance(likelihood, GaussianLikelihood): - raise RuntimeError("ExactGP can only handle GaussianLikelihood") + if not isinstance(likelihood, _GaussianLikelihoodBase): + raise RuntimeError("ExactGP can only handle Gaussian likelihoods") super(ExactGP, self).__init__() if train_inputs is not None: @@ -72,7 +72,7 @@ def __call__(self, *args, **kwargs): "train_inputs, train_targets cannot be None in training mode. " "Call .eval() for prior predictions, or call .set_train_data() to add training data." 
) - if settings.debug.on(): + if settings.check_training_data.on(): if not all(torch.equal(train_input, input) for train_input, input in zip(train_inputs, inputs)): raise RuntimeError("You must train on the training inputs!") res = super(ExactGP, self).__call__(*inputs, **kwargs) @@ -150,6 +150,7 @@ def __call__(self, *args, **kwargs): predictive_mean, mean_cache = exact_predictive_mean( full_covar=full_covar, full_mean=full_mean, + train_inputs=train_inputs, train_labels=train_targets, num_train=num_train, likelihood=self.likelihood, @@ -158,6 +159,7 @@ def __call__(self, *args, **kwargs): ) predictive_covar, covar_cache = exact_predictive_covar( full_covar=full_covar, + train_inputs=train_inputs, num_train=num_train, likelihood=self.likelihood, precomputed_cache=self.covar_cache, diff --git a/gpytorch/module.py b/gpytorch/module.py index f603e319c..3f0461c36 100644 --- a/gpytorch/module.py +++ b/gpytorch/module.py @@ -1,19 +1,20 @@ #!/usr/bin/env python3 +import itertools +import warnings from collections import OrderedDict import torch from torch import nn from torch.distributions import Distribution -import itertools + from .lazy import LazyTensor from .utils.deprecation import DeprecationError -import warnings class Module(nn.Module): def __init__(self): - super(Module, self).__init__() + super().__init__() self._added_loss_terms = OrderedDict() self._priors = OrderedDict() @@ -69,49 +70,27 @@ def initialize(self, **kwargs): kwargs: (param_name, value) - parameter to initialize Value can take the form of a tensor, a float, or an int """ - from .kernels import ( - CosineKernel, - IndexKernel, - MaternKernel, - PeriodicKernel, - RBFKernel, - ScaleKernel, - SpectralMixtureKernel, - ) - - from .likelihoods import GaussianLikelihood, MultitaskGaussianLikelihood - - modules_with_log_params = [ - CosineKernel, - IndexKernel, - MaternKernel, - PeriodicKernel, - RBFKernel, - ScaleKernel, - SpectralMixtureKernel, - GaussianLikelihood, - MultitaskGaussianLikelihood, - ] + from .utils.log_deprecation import MODULES_WITH_LOG_PARAMS for name, val in kwargs.items(): if isinstance(val, int): val = float(val) - if any([isinstance(self, mod_type) for mod_type in modules_with_log_params]) and 'log_' in name: - base_name = name.split('log_')[1] - name = 'raw_' + base_name + if any(isinstance(self, mod_type) for mod_type in MODULES_WITH_LOG_PARAMS) and "log_" in name: + base_name = name.split("log_")[1] + name = "raw_" + base_name if not torch.is_tensor(val): val = self._inv_param_transform(torch.tensor(val).exp()).item() else: val = self._inv_param_transform(val.exp()) - if name not in self._parameters: + if not hasattr(self, name): raise AttributeError("Unknown parameter {p} for {c}".format(p=name, c=self.__class__.__name__)) if torch.is_tensor(val): self.__getattr__(name).data.copy_(val) elif isinstance(val, float): self.__getattr__(name).data.fill_(val) else: - raise AttributeError("Type {t} not valid to initialize parameter {p}".format(t=type(val), p=name)) + raise AttributeError("Type {t} not valid for initializing parameter {p}".format(t=type(val), p=name)) # Ensure value is contained in support of prior (if present) prior_name = "_".join([name, "prior"]) @@ -178,7 +157,7 @@ def register_parameter(self, name, parameter, prior=None): ) if "_parameters" not in self.__dict__: raise AttributeError("Cannot assign parameter before Module.__init__() call") - super(Module, self).register_parameter(name, parameter) + super().register_parameter(name, parameter) def register_prior(self, name, prior, 
param_or_closure, setting_closure=None): """ @@ -252,56 +231,25 @@ def variational_parameters(self): def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): - from .kernels import ( - CosineKernel, - IndexKernel, - MaternKernel, - PeriodicKernel, - RBFKernel, - ScaleKernel, - SpectralMixtureKernel, - ) - - from .likelihoods import GaussianLikelihood, MultitaskGaussianLikelihood + from .utils.log_deprecation import LOG_DEPRECATION_MSG, MODULES_WITH_LOG_PARAMS local_name_params = itertools.chain(self._parameters.items(), self._buffers.items()) local_state = {k: v.data for k, v in local_name_params if v is not None} - modules_with_log_params = [ - CosineKernel, - IndexKernel, - MaternKernel, - PeriodicKernel, - RBFKernel, - ScaleKernel, - SpectralMixtureKernel, - GaussianLikelihood, - MultitaskGaussianLikelihood, - ] - - if not any([isinstance(self, mod_type) for mod_type in modules_with_log_params]): - return super(Module, self)._load_from_state_dict( - state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs - ) - - super(Module, self)._load_from_state_dict( + super()._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) + if not any(isinstance(self, mod_type) for mod_type in MODULES_WITH_LOG_PARAMS): + return # Load log space parameters and throw deprecation warnings. for name, param in local_state.items(): - if 'raw_' in name: - base_name = name.split('raw_')[1] + if "raw_" in name: + base_name = name.split("raw_")[1] log_name = "log_" + base_name log_key = prefix + log_name if log_key in state_dict and log_key not in local_state: - warnings.warn( - "The '{log_name}' parameter is deprecated in favor of '{name}' because we no longer ensure " - "positiveness with torch.exp for improved stability reasons and will be removed in a future " - "release. To solve this issue, just save this model " - "again.".format(log_name=log_name, name=name), - DeprecationWarning, - ) + warnings.warn(LOG_DEPRECATION_MSG.format(log_name=log_name, name=name), DeprecationWarning) input_param = state_dict[log_key] if isinstance(input_param, nn.Parameter): input_param = input_param.data @@ -315,42 +263,21 @@ def _load_from_state_dict( unexpected_keys.remove(prefix + log_name) def __getattr__(self, name): - from .kernels import ( - CosineKernel, - IndexKernel, - MaternKernel, - PeriodicKernel, - RBFKernel, - ScaleKernel, - SpectralMixtureKernel, - ) - - from .likelihoods import GaussianLikelihood, MultitaskGaussianLikelihood - - modules_with_log_params = [ - CosineKernel, - IndexKernel, - MaternKernel, - PeriodicKernel, - RBFKernel, - ScaleKernel, - SpectralMixtureKernel, - GaussianLikelihood, - MultitaskGaussianLikelihood, - ] - - if not any([isinstance(self, mod_type) for mod_type in modules_with_log_params]) or 'log_' not in name: - return super(Module, self).__getattr__(name) - else: - base_name = name.split('log_')[1] # e.g. 
log_lengthscale -> lengthscale - raw_name = 'raw_' + base_name - warnings.warn( - "The '{log_name}' parameter is deprecated in favor of '{name}' because we no longer ensure " - "positiveness with torch.exp for improved stability reasons and will be removed in a future " - "release.".format(log_name=name, name=raw_name), - DeprecationWarning, - ) - return super(Module, self).__getattribute__(base_name).log() # Get real param value and transform to log + try: + return super().__getattr__(name) + except AttributeError as e: + from .utils.log_deprecation import LOG_DEPRECATION_MSG, MODULES_WITH_LOG_PARAMS + + if any(isinstance(self, mod_type) for mod_type in MODULES_WITH_LOG_PARAMS) and "log_" in name: + base_name = name.split("log_")[1] # e.g. log_lengthscale -> lengthscale + raw_name = "raw_" + base_name + warnings.warn(LOG_DEPRECATION_MSG.format(log_name=name, name=raw_name), DeprecationWarning) + return super().__getattribute__(base_name).log() # Get real param value and transform to log + else: + try: + return super().__getattribute__(name) + except AttributeError: + raise e def _extract_named_added_loss_terms(module, memo=None, prefix=""): diff --git a/gpytorch/settings.py b/gpytorch/settings.py index 3c2bcd328..2de721c3b 100644 --- a/gpytorch/settings.py +++ b/gpytorch/settings.py @@ -47,6 +47,18 @@ def __exit__(self, *args): return False +class check_training_data(_feature_flag): + """ + Check whether the correct training data is supplied in Exact GP training mode + Pros: fewer data checks, fewer warning messages + Cons: possibility of supplying incorrect data, model accidentally in wrong mode + + Note: If using a Heteroskedastic Noise model, this will need to be disabled + """ + + _state = True + + class debug(_feature_flag): """ Whether or not to perform "safety" checks on the supplied data. diff --git a/gpytorch/utils/grid.py b/gpytorch/utils/grid.py index 6c37c788b..e10f374a1 100644 --- a/gpytorch/utils/grid.py +++ b/gpytorch/utils/grid.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 - import math import torch @@ -48,7 +47,7 @@ def choose_grid_size(train_inputs, ratio=1.0): def create_data_from_grid(grid): grid_size = grid.size(-2) grid_dim = grid.size(-1) - grid_data = torch.zeros(int(pow(grid_size, grid_dim)), grid_dim) + grid_data = torch.zeros(int(pow(grid_size, grid_dim)), grid_dim, device=grid.device) prev_points = None for i in range(grid_dim): for j in range(grid_size): diff --git a/gpytorch/utils/log_deprecation.py b/gpytorch/utils/log_deprecation.py new file mode 100644 index 000000000..b7f786f89 --- /dev/null +++ b/gpytorch/utils/log_deprecation.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +from ..kernels import ( + CosineKernel, + IndexKernel, + MaternKernel, + PeriodicKernel, + RBFKernel, + ScaleKernel, + SpectralMixtureKernel, +) +from ..likelihoods import GaussianLikelihood, MultitaskGaussianLikelihood + + +MODULES_WITH_LOG_PARAMS = [ + CosineKernel, + IndexKernel, + MaternKernel, + PeriodicKernel, + RBFKernel, + ScaleKernel, + SpectralMixtureKernel, + GaussianLikelihood, + MultitaskGaussianLikelihood, +] + +LOG_DEPRECATION_MSG = ( + "The '{log_name}' parameter is deprecated in favor of '{name}' because we no longer ensure " + "positiveness with torch.exp for improved stability reasons and will be removed in a future " + "release." 
+) diff --git a/test/examples/test_batch_gp_regression.py b/test/examples/test_batch_gp_regression.py index 804773591..49cf27baf 100644 --- a/test/examples/test_batch_gp_regression.py +++ b/test/examples/test_batch_gp_regression.py @@ -1,16 +1,17 @@ #!/usr/bin/env python3 +import math import os import random -import math -import torch import unittest + import gpytorch -from torch import optim +import torch +from gpytorch.distributions import MultivariateNormal from gpytorch.kernels import RBFKernel, ScaleKernel -from gpytorch.means import ConstantMean from gpytorch.likelihoods import GaussianLikelihood -from gpytorch.distributions import MultivariateNormal +from gpytorch.means import ConstantMean +from torch import optim # Batch training test: Let's learn hyperparameters on a sine dataset, but test on a sine dataset and a cosine dataset @@ -120,7 +121,7 @@ def test_train_on_batch_test_on_batch(self): for _ in range(50): optimizer.zero_grad() output = gp_model(train_x12) - loss = -mll(output, train_y12).sum() + loss = -mll(output, train_y12, train_x12).sum() loss.backward() optimizer.step() @@ -159,7 +160,7 @@ def test_train_on_batch_test_on_batch_shared_hypers_over_batch(self): for _ in range(50): optimizer.zero_grad() output = gp_model(train_x12) - loss = -mll(output, train_y12).sum() + loss = -mll(output, train_y12, train_x12).sum() loss.backward() optimizer.step() diff --git a/test/examples/test_batch_multitask_gp_regression.py b/test/examples/test_batch_multitask_gp_regression.py index a51425de4..89223bcf7 100644 --- a/test/examples/test_batch_multitask_gp_regression.py +++ b/test/examples/test_batch_multitask_gp_regression.py @@ -9,7 +9,7 @@ from torch import optim from gpytorch.kernels import RBFKernel, MultitaskKernel from gpytorch.means import ConstantMean, MultitaskMean -from gpytorch.likelihoods import MultitaskGaussianLikelihood +from gpytorch.likelihoods import MultitaskGaussianLikelihoodKronecker from gpytorch.distributions import MultitaskMultivariateNormal @@ -69,7 +69,7 @@ def tearDown(self): def test_train_on_single_set_test_on_batch(self): # We're manually going to set the hyperparameters to something they shouldn't be - likelihood = MultitaskGaussianLikelihood( + likelihood = MultitaskGaussianLikelihoodKronecker( noise_prior=gpytorch.priors.NormalPrior(loc=torch.zeros(1), scale=torch.ones(1)), num_tasks=2 ) gp_model = ExactGPModel(train_x1, train_y1, likelihood) @@ -112,7 +112,7 @@ def test_train_on_single_set_test_on_batch(self): def test_train_on_batch_test_on_batch(self): # We're manually going to set the hyperparameters to something they shouldn't be - likelihood = MultitaskGaussianLikelihood( + likelihood = MultitaskGaussianLikelihoodKronecker( noise_prior=gpytorch.priors.NormalPrior(loc=torch.zeros(2), scale=torch.ones(2)), batch_size=2, num_tasks=2 ) gp_model = ExactGPModel(train_x12, train_y12, likelihood, batch_size=2) @@ -151,7 +151,7 @@ def test_train_on_batch_test_on_batch(self): def test_train_on_batch_test_on_batch_shared_hypers_over_batch(self): # We're manually going to set the hyperparameters to something they shouldn't be - likelihood = MultitaskGaussianLikelihood( + likelihood = MultitaskGaussianLikelihoodKronecker( noise_prior=gpytorch.priors.NormalPrior(loc=torch.zeros(2), scale=torch.ones(2)), batch_size=1, num_tasks=2 ) gp_model = ExactGPModel(train_x12, train_y12, likelihood, batch_size=1) diff --git a/test/examples/test_grid_gp_regression.py b/test/examples/test_grid_gp_regression.py index ca8e9c9e6..6e53e1b8d 100644 --- 
a/test/examples/test_grid_gp_regression.py +++ b/test/examples/test_grid_gp_regression.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 -import gpytorch -import torch import math -import unittest import os import random +import unittest + +import gpytorch +import torch from torch import optim @@ -52,19 +53,26 @@ def tearDown(self): if hasattr(self, "rng_state"): torch.set_rng_state(self.rng_state) - def test_grid_gp_mean_abs_error(self): + def test_grid_gp_mean_abs_error(self, cuda=False): + device = torch.device("cuda") if cuda else torch.device("cpu") grid_bounds = [(0, 1), (0, 2)] grid_size = 25 - grid = torch.zeros(grid_size, len(grid_bounds)) + grid = torch.zeros(grid_size, len(grid_bounds), device=device) for i in range(len(grid_bounds)): grid_diff = float(grid_bounds[i][1] - grid_bounds[i][0]) / (grid_size - 2) - grid[:, i] = torch.linspace(grid_bounds[i][0] - grid_diff, grid_bounds[i][1] + grid_diff, grid_size) + grid[:, i] = torch.linspace( + grid_bounds[i][0] - grid_diff, grid_bounds[i][1] + grid_diff, grid_size, device=device + ) - train_x, train_y, test_x, test_y = make_data(grid) + train_x, train_y, test_x, test_y = make_data(grid, cuda=cuda) likelihood = gpytorch.likelihoods.GaussianLikelihood() gp_model = GridGPRegressionModel(grid, train_x, train_y, likelihood) mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model) + if cuda: + gp_model.cuda() + likelihood.cuda() + # Optimize the model gp_model.train() likelihood.train() @@ -72,7 +80,7 @@ def test_grid_gp_mean_abs_error(self): optimizer = optim.Adam(list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.1) optimizer.n_iter = 0 with gpytorch.settings.debug(False): - for _ in range(25): + for _ in range(20): optimizer.zero_grad() output = gp_model(train_x) loss = -mll(output, train_y) @@ -96,6 +104,10 @@ def test_grid_gp_mean_abs_error(self): self.assertLess(mean_abs_error.squeeze().item(), 0.3) + def test_grid_gp_mean_abs_error_cuda(self): + if torch.cuda.is_available(): + self.test_grid_gp_mean_abs_error(cuda=True) + if __name__ == "__main__": unittest.main() diff --git a/test/examples/test_kronecker_multitask_ski_gp_regression.py b/test/examples/test_kronecker_multitask_ski_gp_regression.py index 151c43429..6e5d611e1 100644 --- a/test/examples/test_kronecker_multitask_ski_gp_regression.py +++ b/test/examples/test_kronecker_multitask_ski_gp_regression.py @@ -1,28 +1,16 @@ #!/usr/bin/env python3 -from math import pi - import os import random -import torch import unittest +from math import pi + import gpytorch -from gpytorch.kernels import RBFKernel, MultitaskKernel, GridInterpolationKernel -from gpytorch.means import ConstantMean, MultitaskMean -from gpytorch.likelihoods import MultitaskGaussianLikelihood +import torch from gpytorch.distributions import MultitaskMultivariateNormal - - -# Simple training data: let's try to learn a sine function -train_x = torch.linspace(0, 1, 100) - -# y1 function is sin(2*pi*x) with noise N(0, 0.04) -train_y1 = torch.sin(train_x * (2 * pi)) + torch.randn(train_x.size()) * 0.1 -# y2 function is cos(2*pi*x) with noise N(0, 0.04) -train_y2 = torch.cos(train_x * (2 * pi)) + torch.randn(train_x.size()) * 0.1 - -# Create a train_y which interleaves the two -train_y = torch.stack([train_y1, train_y2], -1) +from gpytorch.kernels import GridInterpolationKernel, MultitaskKernel, RBFKernel +from gpytorch.likelihoods import MultitaskGaussianLikelihood +from gpytorch.means import ConstantMean, MultitaskMean class MultitaskGPModel(gpytorch.models.ExactGP): @@ -51,9 +39,25 @@ def 
tearDown(self): if hasattr(self, "rng_state"): torch.set_rng_state(self.rng_state) - def test_multitask_gp_mean_abs_error(self): + def _get_data(self, cuda=False): + # Simple training data: let's try to learn a sine function + train_x = torch.linspace(0, 1, 100, device=torch.device("cuda") if cuda else torch.device("cpu")) + # y1 function is sin(2*pi*x) with noise N(0, 0.04) + train_y1 = torch.sin(train_x * (2 * pi)) + torch.randn_like(train_x) * 0.1 + # y2 function is cos(2*pi*x) with noise N(0, 0.04) + train_y2 = torch.cos(train_x * (2 * pi)) + torch.randn_like(train_x) * 0.1 + # Create a train_y which interleaves the two + train_y = torch.stack([train_y1, train_y2], -1) + return train_x, train_y + + def test_multitask_gp_mean_abs_error(self, cuda=False): + train_x, train_y = self._get_data(cuda=cuda) likelihood = MultitaskGaussianLikelihood(num_tasks=2) model = MultitaskGPModel(train_x, train_y, likelihood) + + if cuda: + model.cuda() + # Find optimal model hyperparameters model.train() likelihood.train() @@ -79,7 +83,8 @@ def test_multitask_gp_mean_abs_error(self): # Test the model model.eval() likelihood.eval() - test_x = torch.linspace(0, 1, 51) + + test_x = torch.linspace(0, 1, 51, device=torch.device("cuda") if cuda else torch.device("cpu")) test_y1 = torch.sin(test_x * (2 * pi)) test_y2 = torch.cos(test_x * (2 * pi)) test_preds = likelihood(model(test_x)).mean @@ -89,6 +94,10 @@ def test_multitask_gp_mean_abs_error(self): self.assertLess(mean_abs_error_task_1.squeeze().item(), 0.05) self.assertLess(mean_abs_error_task_2.squeeze().item(), 0.05) + def test_multitask_gp_mean_abs_error_cuda(self): + if torch.cuda.is_available(): + self.test_multitask_gp_mean_abs_error(cuda=True) + if __name__ == "__main__": unittest.main() diff --git a/test/examples/test_simple_gp_regression.py b/test/examples/test_simple_gp_regression.py index 8e2784173..d7b56b043 100644 --- a/test/examples/test_simple_gp_regression.py +++ b/test/examples/test_simple_gp_regression.py @@ -1,26 +1,18 @@ #!/usr/bin/env python3 -from math import exp, pi - import os import random -import torch import unittest +from math import exp, pi + import gpytorch -from torch import optim +import torch +from gpytorch.distributions import MultivariateNormal from gpytorch.kernels import RBFKernel, ScaleKernel from gpytorch.likelihoods import GaussianLikelihood from gpytorch.means import ConstantMean from gpytorch.priors import SmoothedBoxPrior -from gpytorch.distributions import MultivariateNormal - - -# Simple training data: let's try to learn a sine function -train_x = torch.linspace(0, 1, 11) -train_y = torch.sin(train_x * (2 * pi)) - -test_x = torch.linspace(0, 1, 51) -test_y = torch.sin(test_x * (2 * pi)) +from torch import optim class ExactGPModel(gpytorch.models.ExactGP): @@ -48,7 +40,17 @@ def tearDown(self): if hasattr(self, "rng_state"): torch.set_rng_state(self.rng_state) - def test_prior(self): + def _get_data(self, cuda=False): + device = torch.device("cuda") if cuda else torch.device("cpu") + # Simple training data: let's try to learn a sine function + train_x = torch.linspace(0, 1, 11, device=device) + train_y = torch.sin(train_x * (2 * pi)) + test_x = torch.linspace(0, 1, 51, device=device) + test_y = torch.sin(test_x * (2 * pi)) + return train_x, test_x, train_y, test_y + + def test_prior(self, cuda=False): + train_x, test_x, train_y, test_y = self._get_data(cuda=cuda) # We're manually going to set the hyperparameters to be ridiculous likelihood = GaussianLikelihood(noise_prior=SmoothedBoxPrior(exp(-3), exp(3), 
sigma=0.1)) gp_model = ExactGPModel(None, None, likelihood) @@ -60,6 +62,10 @@ def test_prior(self): gp_model.covar_module.base_kernel.initialize(log_lengthscale=0) likelihood.initialize(log_noise=0) + if cuda: + gp_model.cuda() + likelihood.cuda() + # Compute posterior distribution gp_model.eval() likelihood.eval() @@ -71,12 +77,21 @@ def test_prior(self): self.assertLess(torch.norm(function_predictions.mean - 1.5), 1e-3) self.assertLess(torch.norm(function_predictions.variance - correct_variance), 1e-3) - def test_posterior_latent_gp_and_likelihood_without_optimization(self): + def test_prior_cuda(self): + if torch.cuda.is_available(): + self.test_prior(cuda=True) + + def test_posterior_latent_gp_and_likelihood_without_optimization(self, cuda=False): + train_x, test_x, train_y, test_y = self._get_data(cuda=cuda) # We're manually going to set the hyperparameters to be ridiculous likelihood = GaussianLikelihood() gp_model = ExactGPModel(train_x, train_y, likelihood) gp_model.covar_module.base_kernel.initialize(raw_lengthscale=-15) - likelihood.initialize(raw_noise=-15) + likelihood.initialize(log_noise=-15) + + if cuda: + gp_model.cuda() + likelihood.cuda() # Compute posterior distribution gp_model.eval() @@ -91,12 +106,17 @@ def test_posterior_latent_gp_and_likelihood_without_optimization(self): self.assertLess(torch.norm(function_predictions.variance), 1e-3) # It shouldn't fit much else though - test_function_predictions = gp_model(torch.tensor([1.1])) + test_function_predictions = gp_model(torch.tensor([1.1]).type_as(test_x)) self.assertLess(torch.norm(test_function_predictions.mean - 0), 1e-4) self.assertLess(torch.norm(test_function_predictions.variance - gp_model.covar_module.outputscale), 1e-4) - def test_posterior_latent_gp_and_likelihood_with_optimization(self): + def test_posterior_latent_gp_and_likelihood_without_optimization_cuda(self): + if torch.cuda.is_available(): + self.test_posterior_latent_gp_and_likelihood_without_optimization(cuda=True) + + def test_posterior_latent_gp_and_likelihood_with_optimization(self, cuda=False): + train_x, test_x, train_y, test_y = self._get_data(cuda=cuda) # We're manually going to set the hyperparameters to something they shouldn't be likelihood = GaussianLikelihood(noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1)) gp_model = ExactGPModel(train_x, train_y, likelihood) @@ -105,6 +125,10 @@ def test_posterior_latent_gp_and_likelihood_with_optimization(self): gp_model.mean_module.initialize(constant=0) likelihood.initialize(log_noise=1) + if cuda: + gp_model.cuda() + likelihood.cuda() + # Find optimal model hyperparameters gp_model.train() likelihood.train() @@ -135,7 +159,12 @@ def test_posterior_latent_gp_and_likelihood_with_optimization(self): self.assertLess(mean_abs_error.item(), 0.05) - def test_posterior_latent_gp_and_likelihood_fast_pred_var(self): + def test_posterior_latent_gp_and_likelihood_with_optimization_cuda(self): + if torch.cuda.is_available(): + self.test_posterior_latent_gp_and_likelihood_with_optimization(cuda=True) + + def test_posterior_latent_gp_and_likelihood_fast_pred_var(self, cuda=False): + train_x, test_x, train_y, test_y = self._get_data(cuda=cuda) with gpytorch.fast_pred_var(), gpytorch.settings.debug(False): # We're manually going to set the hyperparameters to # something they shouldn't be @@ -146,6 +175,10 @@ def test_posterior_latent_gp_and_likelihood_fast_pred_var(self): gp_model.mean_module.initialize(constant=0) likelihood.initialize(log_noise=1) + if cuda: + gp_model.cuda() + likelihood.cuda() + # 
Find optimal model hyperparameters gp_model.train() likelihood.train() @@ -175,53 +208,17 @@ def test_posterior_latent_gp_and_likelihood_fast_pred_var(self): # Now bump up the likelihood to something huge # This will make it easy to calculate the variance - likelihood.raw_noise.data.fill_(3) + likelihood.noise_covar.raw_noise.data.fill_(3) test_function_predictions = likelihood(gp_model(train_x)) - noise = likelihood.noise + noise = likelihood.noise_covar.noise var_diff = (test_function_predictions.variance - noise).abs() self.assertLess(torch.max(var_diff / noise), 0.05) - def test_posterior_latent_gp_and_likelihood_with_optimization_cuda(self): + def test_posterior_latent_gp_and_likelihood_fast_pred_var_cuda(self): if torch.cuda.is_available(): - # We're manually going to set the hyperparameters to - # something they shouldn't be - likelihood = GaussianLikelihood(noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1)).cuda() - gp_model = ExactGPModel(train_x.cuda(), train_y.cuda(), likelihood).cuda() - mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model) - gp_model.covar_module.base_kernel.initialize(log_lengthscale=1) - gp_model.mean_module.initialize(constant=0) - likelihood.initialize(log_noise=1) - - # Find optimal model hyperparameters - gp_model.train() - likelihood.train() - optimizer = optim.Adam(gp_model.parameters(), lr=0.1) - optimizer.n_iter = 0 - for _ in range(50): - optimizer.zero_grad() - output = gp_model(train_x.cuda()) - loss = -mll(output, train_y.cuda()) - loss.backward() - optimizer.n_iter += 1 - optimizer.step() - - for param in gp_model.parameters(): - self.assertTrue(param.grad is not None) - self.assertGreater(param.grad.norm().item(), 0) - for param in likelihood.parameters(): - self.assertTrue(param.grad is not None) - self.assertGreater(param.grad.norm().item(), 0) - optimizer.step() - - # Test the model - gp_model.eval() - likelihood.eval() - test_function_predictions = likelihood(gp_model(test_x.cuda())) - mean_abs_error = torch.mean(torch.abs(test_y.cuda() - test_function_predictions.mean)) - - self.assertLess(mean_abs_error.item(), 0.05) + self.test_posterior_latent_gp_and_likelihood_fast_pred_var(cuda=True) if __name__ == "__main__": diff --git a/test/examples/test_white_noise_regression.py b/test/examples/test_white_noise_regression.py index 4421a7af3..333b150ff 100644 --- a/test/examples/test_white_noise_regression.py +++ b/test/examples/test_white_noise_regression.py @@ -1,26 +1,18 @@ #!/usr/bin/env python3 -from math import exp, pi - import os import random -import torch import unittest +from math import exp, pi + import gpytorch -from torch import optim -from gpytorch.kernels import RBFKernel, WhiteNoiseKernel, ScaleKernel +import torch +from gpytorch.distributions import MultivariateNormal +from gpytorch.kernels import RBFKernel, ScaleKernel, WhiteNoiseKernel from gpytorch.likelihoods import GaussianLikelihood from gpytorch.means import ConstantMean from gpytorch.priors import SmoothedBoxPrior -from gpytorch.distributions import MultivariateNormal - - -# Simple training data: let's try to learn a sine function -train_x = torch.linspace(0, 1, 11) -train_y = torch.sin(train_x * (2 * pi)) - -test_x = torch.linspace(0, 1, 51) -test_y = torch.sin(test_x * (2 * pi)) +from torch import optim class ExactGPModel(gpytorch.models.ExactGP): @@ -50,7 +42,17 @@ def tearDown(self): if hasattr(self, "rng_state"): torch.set_rng_state(self.rng_state) - def test_posterior_latent_gp_and_likelihood_without_optimization(self): + def _get_data(self, 
cuda=False): + device = torch.device("cuda") if cuda else torch.device("cpu") + # Simple training data: let's try to learn a sine function + train_x = torch.linspace(0, 1, 11, device=device) + train_y = torch.sin(train_x * (2 * pi)) + test_x = torch.linspace(0, 1, 51, device=device) + test_y = torch.sin(test_x * (2 * pi)) + return train_x, test_x, train_y, test_y + + def test_posterior_latent_gp_and_likelihood_without_optimization(self, cuda=False): + train_x, test_x, train_y, test_y = self._get_data(cuda=cuda) with gpytorch.settings.debug(False): # We're manually going to set the hyperparameters to be ridiculous likelihood = GaussianLikelihood(noise_prior=SmoothedBoxPrior(exp(-10), exp(10), sigma=0.25)) @@ -63,6 +65,10 @@ def test_posterior_latent_gp_and_likelihood_without_optimization(self): gp_model.mean_module.initialize(constant=0) likelihood.initialize(log_noise=-10) + if cuda: + gp_model.cuda() + likelihood.cuda() + # Compute posterior distribution gp_model.eval() likelihood.eval() @@ -75,12 +81,17 @@ def test_posterior_latent_gp_and_likelihood_without_optimization(self): self.assertLess(torch.norm(function_predictions.variance), 5e-3) # It shouldn't fit much else though - test_function_predictions = gp_model(torch.tensor([1.1], dtype=torch.float)) + test_function_predictions = gp_model(torch.tensor([1.1]).type_as(test_x)) self.assertLess(torch.norm(test_function_predictions.mean - 0), 1e-4) self.assertLess(torch.norm(test_function_predictions.variance - gp_model.covar_module.outputscale), 1e-4) - def test_posterior_latent_gp_and_likelihood_with_optimization(self): + def test_posterior_latent_gp_and_likelihood_without_optimization_cuda(self): + if torch.cuda.is_available(): + self.test_posterior_latent_gp_and_likelihood_without_optimization(cuda=True) + + def test_posterior_latent_gp_and_likelihood_with_optimization(self, cuda=False): + train_x, test_x, train_y, test_y = self._get_data(cuda=cuda) # We're manually going to set the hyperparameters to something they shouldn't be likelihood = GaussianLikelihood(noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1)) gp_model = ExactGPModel(train_x, train_y, likelihood) @@ -89,9 +100,14 @@ def test_posterior_latent_gp_and_likelihood_with_optimization(self): gp_model.mean_module.initialize(constant=0) likelihood.initialize(log_noise=1) + if cuda: + gp_model.cuda() + likelihood.cuda() + # Find optimal model hyperparameters gp_model.train() likelihood.train() + optimizer = optim.Adam(list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.1) optimizer.n_iter = 0 with gpytorch.settings.debug(False): @@ -119,10 +135,14 @@ def test_posterior_latent_gp_and_likelihood_with_optimization(self): self.assertLess(mean_abs_error.squeeze().item(), 0.05) - def test_posterior_latent_gp_and_likelihood_fast_pred_var(self): + def test_posterior_latent_gp_and_likelihood_with_optimization_cuda(self): + if torch.cuda.is_available(): + self.test_posterior_latent_gp_and_likelihood_with_optimization(cuda=True) + + def test_posterior_latent_gp_and_likelihood_fast_pred_var(self, cuda=False): + train_x, test_x, train_y, test_y = self._get_data(cuda=cuda) with gpytorch.fast_pred_var(), gpytorch.settings.debug(False): - # We're manually going to set the hyperparameters to - # something they shouldn't be + # We're manually going to set the hyperparameters to something they shouldn't be likelihood = GaussianLikelihood(noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1)) gp_model = ExactGPModel(train_x, train_y, likelihood) mll = 
gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model) @@ -130,6 +150,10 @@ def test_posterior_latent_gp_and_likelihood_fast_pred_var(self): gp_model.mean_module.initialize(constant=0) likelihood.initialize(log_noise=1) + if cuda: + gp_model.cuda() + likelihood.cuda() + # Find optimal model hyperparameters gp_model.train() likelihood.train() @@ -167,46 +191,9 @@ def test_posterior_latent_gp_and_likelihood_fast_pred_var(self): self.assertLess(torch.max(var_diff / noise), 0.05) - def test_posterior_latent_gp_and_likelihood_with_optimization_cuda(self): + def test_posterior_latent_gp_and_likelihood_fast_pred_var_cuda(self): if torch.cuda.is_available(): - # We're manually going to set the hyperparameters to - # something they shouldn't be - likelihood = GaussianLikelihood(noise_prior=SmoothedBoxPrior(exp(-3), exp(3), sigma=0.1)).cuda() - gp_model = ExactGPModel(train_x.cuda(), train_y.cuda(), likelihood).cuda() - mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model) - gp_model.rbf_covar_module.initialize(log_lengthscale=1) - gp_model.mean_module.initialize(constant=0) - likelihood.initialize(log_noise=1) - - # Find optimal model hyperparameters - gp_model.train() - likelihood.train() - optimizer = optim.Adam(gp_model.parameters(), lr=0.1) - optimizer.n_iter = 0 - with gpytorch.settings.debug(False): - for _ in range(50): - optimizer.zero_grad() - output = gp_model(train_x.cuda()) - loss = -mll(output, train_y.cuda()) - loss.backward() - optimizer.n_iter += 1 - optimizer.step() - - for param in gp_model.parameters(): - self.assertTrue(param.grad is not None) - self.assertGreater(param.grad.norm().item(), 0) - for param in likelihood.parameters(): - self.assertTrue(param.grad is not None) - self.assertGreater(param.grad.norm().item(), 0) - optimizer.step() - - # Test the model - gp_model.eval() - likelihood.eval() - test_function_predictions = likelihood(gp_model(test_x.cuda())) - mean_abs_error = torch.mean(torch.abs(test_y.cuda() - test_function_predictions.mean)) - - self.assertLess(mean_abs_error.squeeze().item(), 0.05) + self.test_posterior_latent_gp_and_likelihood_fast_pred_var(cuda=True) if __name__ == "__main__": diff --git a/test/likelihoods/test_general_multitask_gaussian_likelihood.py b/test/likelihoods/test_general_multitask_gaussian_likelihood.py index 05ded52a3..ef265bf51 100644 --- a/test/likelihoods/test_general_multitask_gaussian_likelihood.py +++ b/test/likelihoods/test_general_multitask_gaussian_likelihood.py @@ -8,7 +8,7 @@ import gpytorch import torch from gpytorch.kernels import MultitaskKernel, RBFKernel -from gpytorch.likelihoods import MultitaskGaussianLikelihood +from gpytorch.likelihoods import MultitaskGaussianLikelihoodKronecker from gpytorch.means import ConstantMean, MultitaskMean from gpytorch.distributions import MultitaskMultivariateNormal @@ -53,7 +53,7 @@ def tearDown(self): torch.set_rng_state(self.rng_state) def test_multitask_low_rank_noise_covar(self): - likelihood = MultitaskGaussianLikelihood(num_tasks=2, rank=1) + likelihood = MultitaskGaussianLikelihoodKronecker(num_tasks=2, rank=1) model = MultitaskGPModel(train_x, train_y, likelihood) # Find optimal model hyperparameters model.train() @@ -83,10 +83,10 @@ def test_multitask_low_rank_noise_covar(self): num_tasks = 2 task_noise_covar_factor = likelihood.task_noise_covar_factor - log_noise = likelihood.log_noise + noise = likelihood.noise task_noise_covar = task_noise_covar_factor.matmul( task_noise_covar_factor.transpose(-1, -2) - ) + log_noise.exp() * torch.eye(num_tasks) + ) + 
noise * torch.eye(num_tasks) self.assertGreater(task_noise_covar[0, 0, 1].item(), 0.05)
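
A recurring pattern in the rewritten tests above: the original CPU test body takes a cuda=False keyword, builds all data on the chosen device, moves the model and likelihood with .cuda() when requested, and a thin *_cuda wrapper re-runs the same body on the GPU only when one is available. A minimal sketch of that pattern (the data and assertions here are placeholders, not part of the diff):

import unittest

import torch


class ExamplePatternTest(unittest.TestCase):
    def test_mean_abs_error(self, cuda=False):
        device = torch.device("cuda") if cuda else torch.device("cpu")
        # All data is created directly on the target device ...
        train_x = torch.linspace(0, 1, 11, device=device)
        train_y = torch.sin(train_x * 6.28)
        # ... and the model/likelihood would be moved with .cuda() when cuda=True.
        # Here we only check the device plumbing itself.
        self.assertEqual(train_x.device.type, "cuda" if cuda else "cpu")
        self.assertEqual(train_y.device, train_x.device)

    def test_mean_abs_error_cuda(self):
        # The GPU variant just re-runs the CPU body; it is a silent no-op
        # on machines without CUDA.
        if torch.cuda.is_available():
            self.test_mean_abs_error(cuda=True)


if __name__ == "__main__":
    unittest.main()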
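The module-level training data in the simple-regression and multitask tests is replaced by a per-test _get_data(cuda=...) helper so tensors are built on the requested device; torch.randn_like inherits device and dtype from train_x, and ad-hoc probe points are cast with .type_as(test_x) instead of being constructed as bare CPU tensors. A sketch of both idioms, assuming nothing beyond plain PyTorch:

from math import pi

import torch


def get_data(cuda=False):
    device = torch.device("cuda") if cuda else torch.device("cpu")
    # Simple training data: a noisy sine, generated directly on `device`.
    train_x = torch.linspace(0, 1, 100, device=device)
    train_y = torch.sin(train_x * (2 * pi)) + torch.randn_like(train_x) * 0.1
    test_x = torch.linspace(0, 1, 51, device=device)
    return train_x, train_y, test_x


train_x, train_y, test_x = get_data(cuda=False)
# A probe point built this way follows test_x's dtype and device, so the same
# line works unchanged in the CUDA variant of the test.
probe = torch.tensor([1.1]).type_as(test_x)
assert probe.dtype == test_x.dtype and probe.device == test_x.device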
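Several of the rewritten tests optimize the model and likelihood hyperparameters jointly by handing both parameter lists to a single Adam optimizer and minimizing the negative exact marginal log likelihood. A sketch of that training loop; fit is a hypothetical helper name, and the model and likelihood are assumed to be an already-constructed ExactGP and GaussianLikelihood on the right device:

import gpytorch
from torch import optim


def fit(gp_model, likelihood, train_x, train_y, n_iter=20, lr=0.1):
    # Put both modules in training mode and optimize their parameters together.
    gp_model.train()
    likelihood.train()
    optimizer = optim.Adam(list(gp_model.parameters()) + list(likelihood.parameters()), lr=lr)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
    for _ in range(n_iter):
        optimizer.zero_grad()
        output = gp_model(train_x)
        loss = -mll(output, train_y)  # maximize the marginal log likelihood
        loss.backward()
        optimizer.step()
    return gp_model, likelihood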
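The fast-pred-var assertions now reach the Gaussian noise through the likelihood's noise_covar submodule (noise_covar.raw_noise for the unconstrained parameter, noise_covar.noise for the transformed, positive value) rather than through attributes on the likelihood itself. A sketch of that access path, assuming a stock GaussianLikelihood that stores its noise this way, as the updated test does:

from gpytorch.likelihoods import GaussianLikelihood

likelihood = GaussianLikelihood()
# Fill the raw (pre-transform) noise parameter directly, as the test does ...
likelihood.noise_covar.raw_noise.data.fill_(3)
# ... then read the transformed, strictly positive noise back out.
noise = likelihood.noise_covar.noise
print(noise)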
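In the general multitask likelihood test, the low-rank-noise assertions now target MultitaskGaussianLikelihoodKronecker instead of MultitaskGaussianLikelihood. A sketch of constructing it as the test does; the class name and the task_noise_covar_factor attribute are taken from the diff and assumed to be importable/available:

from gpytorch.likelihoods import MultitaskGaussianLikelihoodKronecker

# Two tasks whose noise is modeled as a rank-1 inter-task factor plus a
# shared homoskedastic term.
likelihood = MultitaskGaussianLikelihoodKronecker(num_tasks=2, rank=1)
print(likelihood.task_noise_covar_factor.shape)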
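The final assertion rebuilds the learned inter-task noise covariance as a low-rank factor times its transpose plus the shared noise on the diagonal, and checks that the learned off-diagonal entry is positive. A small self-contained sketch of the same arithmetic, with stand-in values for the learned quantities:

import torch

num_tasks = 2
# Stand-ins for likelihood.task_noise_covar_factor and likelihood.noise.
task_noise_covar_factor = torch.randn(1, num_tasks, 1)
noise = torch.tensor(0.1)
# Low-rank-plus-diagonal reconstruction: F F^T + noise * I.
task_noise_covar = task_noise_covar_factor.matmul(
    task_noise_covar_factor.transpose(-1, -2)
) + noise * torch.eye(num_tasks)
# The test asserts task_noise_covar[0, 0, 1].item() > 0.05 on the trained model.
print(task_noise_covar[0, 0, 1].item())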