diff --git a/pySDC/helpers/spectral_helper.py b/pySDC/helpers/spectral_helper.py
index b1b147a26d..105add7374 100644
--- a/pySDC/helpers/spectral_helper.py
+++ b/pySDC/helpers/spectral_helper.py
@@ -2,7 +2,8 @@
 import scipy
 from pySDC.implementations.datatype_classes.mesh import mesh
 from scipy.special import factorial
-from functools import wraps
+from functools import partial, wraps
+import logging
 
 
 def cache(func):
@@ -73,8 +74,9 @@ class SpectralHelper1D:
     sparse_lib = scipy.sparse
     linalg = scipy.sparse.linalg
     xp = np
+    distributable = False
 
-    def __init__(self, N, x0=None, x1=None, useGPU=False):
+    def __init__(self, N, x0=None, x1=None, useGPU=False, useFFTW=False):
         """
         Constructor
 
@@ -83,15 +85,23 @@ def __init__(self, N, x0=None, x1=None, useGPU=False):
             x0 (float): Coordinate of left boundary
             x1 (float): Coordinate of right boundary
             useGPU (bool): Whether to use GPUs
+            useFFTW (bool): Whether to use FFTW for the transforms
         """
         self.N = N
         self.x0 = x0
         self.x1 = x1
         self.L = x1 - x0
         self.useGPU = useGPU
+        self.plans = {}
+        self.logger = logging.getLogger(name=type(self).__name__)
 
         if useGPU:
             self.setup_GPU()
+        else:
+            self.setup_CPU(useFFTW=useFFTW)
+
+        if useGPU and useFFTW:
+            raise ValueError('Please run either on GPUs or with FFTW, not both!')
 
     @classmethod
     def setup_GPU(cls):
@@ -107,6 +117,26 @@ def setup_GPU(cls):
         cls.linalg = linalg
         cls.fft_lib = fft_lib
 
+    @classmethod
+    def setup_CPU(cls, useFFTW=False):
+        """switch to CPU modules"""
+
+        cls.xp = np
+        cls.sparse_lib = scipy.sparse
+        cls.linalg = scipy.sparse.linalg
+
+        if useFFTW:
+            from mpi4py_fft import fftw
+
+            cls.fft_backend = 'fftw'
+            cls.fft_lib = fftw
+        else:
+            cls.fft_backend = 'scipy'
+            cls.fft_lib = scipy.fft
+
+        cls.fft_comm_backend = 'MPI'
+        cls.dtype = mesh
+
     def get_Id(self):
         """
         Get identity matrix
@@ -227,18 +257,16 @@ class ChebychevHelper(SpectralHelper1D):
     that moves to Chebychev U basis during differentiation, which is sparse. When using this technique, problems need to
     be formulated in first order formulation.
 
-    This implementation is largely based on the Dedalus paper (arXiv:1905.10388).
+    This implementation is largely based on the Dedalus paper (https://doi.org/10.1103/PhysRevResearch.2.023068).
     """
 
-    def __init__(self, *args, transform_type='fft', x0=-1, x1=1, **kwargs):
+    def __init__(self, *args, x0=-1, x1=1, **kwargs):
         """
         Constructor.
         Please refer to the parent class for additional arguments. Notably, you have to supply a resolution `N` and you
         may choose to run on GPUs via the `useGPU` argument.
 
         Args:
-            transform_type ('fft' or 'dct'): Either use DCT functions directly implemented in the transform library or
-                                             use the FFT from the library to compute the DCT
             x0 (float): Coordinate of left boundary. Note that only -1 is currently implented
             x1 (float): Coordinate of right boundary. Note that only +1 is currently implented
         """
@@ -246,10 +274,6 @@ def __init__(self, *args, transform_type='fft', x0=-1, x1=1, **kwargs):
         self.lin_trf_fac = (x1 - x0) / 2
         self.lin_trf_off = (x1 + x0) / 2
         super().__init__(*args, x0=x0, x1=x1, **kwargs)
-        self.transform_type = transform_type
-
-        if self.transform_type == 'fft':
-            self.get_fft_utils()
 
         self.norm = self.get_norm()
 
@@ -374,6 +398,7 @@ def get_differentiation_matrix(self, p=1):
         D[0, :] /= 2
         return self.sparse_lib.csc_matrix(self.xp.linalg.matrix_power(D, p)) / self.lin_trf_fac**p
 
+    @cache
     def get_norm(self, N=None):
         '''
         Get normalization for converting Chebychev coefficients and DCT
@@ -389,134 +414,114 @@ def get_norm(self, N=None):
         norm[0] /= 2
         return norm
 
-    def get_fft_shuffle(self, forward, N):
-        """
-        In order to more easily parallelize using distributed FFT libraries, we express the DCT via an FFT following
-        doi.org/10.1109/TASSP.1980.1163351. The idea is based on reshuffling the data to be periodic and rotating it
-        in the complex plane. This function returns a mask to do the shuffling.
-
-        Args:
-            forward (bool): Whether you want the shuffle for forward transform or backward transform
-            N (int): size of the grid
-
-        Returns:
-            self.xp.array: Use as mask
-        """
-        xp = self.xp
-        if forward:
-            return xp.append(xp.arange((N + 1) // 2) * 2, -xp.arange(N // 2) * 2 - 1 - N % 2)
-        else:
-            mask = xp.zeros(N, dtype=int)
-            mask[: N - N % 2 : 2] = xp.arange(N // 2)
-            mask[1::2] = N - xp.arange(N // 2) - 1
-            mask[-1] = N // 2
-            return mask
-
-    def get_fft_shift(self, forward, N):
-        """
-        As described in the docstring for `get_fft_shuffle`, we need to rotate in the complex plane in order to use FFT for DCT.
-
-        Args:
-            forward (bool): Whether you want the rotation for forward transform or backward transform
-            N (int): size of the grid
-
-        Returns:
-            self.xp.array: Rotation
-        """
-        k = self.get_wavenumbers()
-        norm = self.get_norm()
-        xp = self.xp
-        if forward:
-            return 2 * xp.exp(-1j * np.pi * k / (2 * N) + 0j * np.pi / 4) * norm
-        else:
-            shift = xp.exp(1j * np.pi * k / (2 * N))
-            shift[0] = 0.5
-            return shift / norm
-
-    def get_fft_utils(self):
-        """
-        Get the required utilities for using FFT to do DCT as described in the docstring for `get_fft_shuffle` and keep
-        them cached.
-        """
-        self.fft_utils = {
-            'fwd': {},
-            'bck': {},
-        }
-
-        # forwards transform
-        self.fft_utils['fwd']['shuffle'] = self.get_fft_shuffle(True, self.N)
-        self.fft_utils['fwd']['shift'] = self.get_fft_shift(True, self.N)
-
-        # backwards transform
-        self.fft_utils['bck']['shuffle'] = self.get_fft_shuffle(False, self.N)
-        self.fft_utils['bck']['shift'] = self.get_fft_shift(False, self.N)
-
-        return self.fft_utils
-
-    def transform(self, u, axis=-1, **kwargs):
+    def transform(self, u, *args, axes=None, shape=None, **kwargs):
         """
-        1D DCT along axis. `kwargs` will be passed on to the FFT library.
+        DCT along axes. `kwargs` will be passed on to the FFT library.
 
         Args:
             u: Data you want to transform
-            axis (int): Axis you want to transform along
+            axes (tuple): Axes you want to transform along
 
         Returns:
             Data in spectral space
         """
-        if self.transform_type.lower() == 'dct':
-            return self.fft_lib.dct(u, axis=axis, **kwargs) * self.norm
-        elif self.transform_type.lower() == 'fft':
-            result = u.copy()
-
-            shuffle = [slice(0, s, 1) for s in u.shape]
-            shuffle[axis] = self.fft_utils['fwd']['shuffle']
-
-            v = u[(*shuffle,)]
-
-            V = self.fft_lib.fft(v, axis=axis, **kwargs)
+        axes = axes if axes else tuple(i for i in range(u.ndim))
+        kwargs['s'] = shape
+        kwargs['norm'] = kwargs.get('norm', 'backward')
+
+        trf = self.fft_lib.dctn(u, *args, axes=axes, type=2, **kwargs)
+        for axis in axes:
+
+            if self.N < trf.shape[axis]:
+                # mpi4py-fft implements padding only for FFT, where the frequencies are sorted such that the zeros are
+                # removed in the middle rather than the end. We need to resort this here and put the highest frequencies
+                # in the middle.
+                _trf = self.xp.zeros_like(trf)
+                N = self.N
+                N_pad = _trf.shape[axis] - N
+                end_first_half = N // 2 + 1
+
+                # copy first "half"
+                su = [slice(None)] * trf.ndim
+                su[axis] = slice(0, end_first_half)
+                _trf[tuple(su)] = trf[tuple(su)]
+
+                # copy second "half"
+                su = [slice(None)] * u.ndim
+                su[axis] = slice(end_first_half + N_pad, None)
+                s_u = [slice(None)] * u.ndim
+                s_u[axis] = slice(end_first_half, N)
+                _trf[tuple(su)] = trf[tuple(s_u)]
+
+                # # copy values to be cut
+                # su = [slice(None)] * u.ndim
+                # su[axis] = slice(end_first_half, end_first_half + N_pad)
+                # s_u = [slice(None)] * u.ndim
+                # s_u[axis] = slice(-N_pad, None)
+                # _trf[tuple(su)] = trf[tuple(s_u)]
+
+                trf = _trf
 
             expansion = [np.newaxis for _ in u.shape]
             expansion[axis] = slice(0, u.shape[axis], 1)
+            norm = self.xp.ones(trf.shape[axis]) * self.norm[-1]
+            norm[: self.N] = self.norm
+            trf *= norm[(*expansion,)]
+        return trf
 
-            V *= self.fft_utils['fwd']['shift'][(*expansion,)]
-
-            result.real[...] = V.real[...]
-            return result
-        else:
-            raise NotImplementedError(f'Please choose a transform type from fft and dct, not {self.transform_type=}')
-
-    def itransform(self, u, axis=-1):
+    def itransform(self, u, *args, axes=None, shape=None, **kwargs):
         """
-        1D inverse DCT along axis.
+        Inverse DCT along axis.
 
         Args:
             u: Data you want to transform
-            axis (int): Axis you want to transform along
+            axes (tuple): Axes you want to transform along
 
         Returns:
             Data in physical space
         """
-        assert self.norm.shape[0] == u.shape[axis]
+        axes = axes if axes else tuple(i for i in range(u.ndim))
+        kwargs['s'] = shape
+        kwargs['norm'] = kwargs.get('norm', 'backward')
+        kwargs['overwrite_x'] = kwargs.get('overwrite_x', False)
 
-        if self.transform_type == 'dct':
-            return self.fft_lib.idct(u / self.norm, axis=axis)
-        elif self.transform_type == 'fft':
-            result = u.copy()
+        for axis in axes:
 
+            if self.N == u.shape[axis]:
+                _u = u.copy()
+            else:
+                # mpi4py-fft implements padding only for FFT, where the frequencies are sorted such that the zeros are
+                # added in the middle rather than the end. We need to resort this here and put the padding in the end.
+                N = self.N
+                _u = self.xp.zeros_like(u)
+
+                # copy first half
+                su = [slice(None)] * u.ndim
+                su[axis] = slice(0, N // 2 + 1)
+                _u[tuple(su)] = u[tuple(su)]
+
+                # copy second half
+                su = [slice(None)] * u.ndim
+                su[axis] = slice(-(N // 2), None)
+                s_u = [slice(None)] * u.ndim
+                s_u[axis] = slice(N // 2, N // 2 + (N // 2))
+                _u[tuple(s_u)] = u[tuple(su)]
+
+                if N % 2 == 0:
+                    su = [slice(None)] * u.ndim
+                    su[axis] = N // 2
+                    _u[tuple(su)] *= 2
+
+            # generate norm
             expansion = [np.newaxis for _ in u.shape]
             expansion[axis] = slice(0, u.shape[axis], 1)
+            norm = self.xp.ones(_u.shape[axis])
+            norm[: self.N] = self.norm
+            norm = self.get_norm(u.shape[axis]) * _u.shape[axis] / self.N
 
-            v = self.fft_lib.ifft(u * self.fft_utils['bck']['shift'][(*expansion,)], axis=axis)
+            _u /= norm[(*expansion,)]
 
-            shuffle = [slice(0, s, 1) for s in u.shape]
-            shuffle[axis] = self.fft_utils['bck']['shuffle']
-            V = v[(*shuffle,)]
-
-            result.real[...] = V.real[...]
-            return result
-        else:
-            raise NotImplementedError
+        return self.fft_lib.idctn(_u, *args, axes=axes, type=2, **kwargs)
 
     def get_BC(self, kind, **kwargs):
         """
@@ -717,6 +722,8 @@ def get_integration_constant(self, u_hat, axis):
 
 
 class FFTHelper(SpectralHelper1D):
+    distributable = True
+
     def __init__(self, *args, x0=0, x1=2 * np.pi, **kwargs):
         """
         Constructor.
@@ -724,8 +731,6 @@ def __init__(self, *args, x0=0, x1=2 * np.pi, **kwargs):
         may choose to run on GPUs via the `useGPU` argument.
 
         Args:
-            transform_type ('fft' or 'dct'): Either use DCT functions directly implemented in the transform library or
-                                             use the FFT from the library to compute the DCT
             x0 (float, optional): Coordinate of left boundary
             x1 (float, optional): Coordinate of right boundary
         """
@@ -757,11 +762,14 @@ def get_differentiation_matrix(self, p=1):
         k = self.get_wavenumbers()
 
         if self.useGPU:
-            # Have to raise the matrix to power p on CPU because the GPU equivalent is not implemented in CuPy at the time of writing.
-            import scipy.sparse as sp
+            if p > 1:
+                # Have to raise the matrix to power p on CPU because the GPU equivalent is not implemented in CuPy at the time of writing.
+                from scipy.sparse.linalg import matrix_power
 
-            D = self.sparse_lib.diags(1j * k).get()
-            return self.sparse_lib.csc_matrix(sp.linalg.matrix_power(D, p))
+                D = self.sparse_lib.diags(1j * k).get()
+                return self.sparse_lib.csc_matrix(matrix_power(D, p))
+            else:
+                return self.sparse_lib.diags(1j * k)
         else:
             return self.linalg.matrix_power(self.sparse_lib.diags(1j * k), p)
 
@@ -779,31 +787,56 @@ def get_integration_matrix(self, p=1):
         k[0] = 1j * self.L
         return self.linalg.matrix_power(self.sparse_lib.diags(1 / (1j * k)), p)
 
-    def transform(self, u, axis=-1, **kwargs):
+    def get_plan(self, u, forward, *args, **kwargs):
+        if self.fft_lib.__name__ == 'mpi4py_fft.fftw':
+            if 'axes' in kwargs.keys():
+                kwargs['axes'] = tuple(kwargs['axes'])
+            key = (forward, u.shape, args, *(me for me in kwargs.values()))
+            if key in self.plans.keys():
+                return self.plans[key]
+            else:
+                self.logger.debug(f'Generating FFT plan for {key=}')
+                transform = self.fft_lib.fftn(u, *args, **kwargs) if forward else self.fft_lib.ifftn(u, *args, **kwargs)
+                self.plans[key] = transform
+
+            return self.plans[key]
+        else:
+            if forward:
+                return partial(self.fft_lib.fftn, norm=kwargs.get('norm', 'backward'))
+            else:
+                return partial(self.fft_lib.ifftn, norm=kwargs.get('norm', 'forward'))
+
+    def transform(self, u, *args, axes=None, shape=None, **kwargs):
         """
-        1D FFT along axis. `kwargs` are passed on to the FFT library.
+        FFT along axes. `kwargs` are passed on to the FFT library.
 
         Args:
             u: Data you want to transform
-            axis (int): Axis you want to transform along
+            axes (tuple): Axes you want to transform over
 
         Returns:
             transformed data
         """
-        return self.fft_lib.fft(u, axis=axis, **kwargs)
+        axes = axes if axes else tuple(i for i in range(u.ndim))
+        kwargs['s'] = shape
+        plan = self.get_plan(u, *args, forward=True, axes=axes, **kwargs)
+        return plan(u, *args, axes=axes, **kwargs)
 
-    def itransform(self, u, axis=-1):
+    def itransform(self, u, *args, axes=None, shape=None, **kwargs):
         """
-        Inverse 1D FFT.
+        Inverse FFT.
 
         Args:
             u: Data you want to transform
-            axis (int): Axis you want to transform along
+            axes (tuple): Axes over which to transform
 
         Returns:
             transformed data
         """
-        return self.fft_lib.ifft(u, axis=axis)
+        axes = axes if axes else tuple(i for i in range(u.ndim))
+        kwargs['s'] = shape
+        plan = self.get_plan(u, *args, forward=False, axes=axes, **kwargs)
+        return plan(u, *args, axes=axes, **kwargs) / np.prod([u.shape[axis] for axis in axes])
 
     def get_BC(self, kind):
         """
@@ -874,7 +907,6 @@ class SpectralHelper:
         BC_line_zero_matrix (sparse matrix): Matrix that zeros rows where we can then add the BCs in using `BCs`
         rhs_BCs_hat (self.xp.ndarray): Boundary conditions in spectral space
         global_shape (tuple): Global shape of the solution as in `mpi4py-fft`
-        local_slice (slice): Local slice of the solution as in `mpi4py-fft`
         fft_obj: When using distributed FFTs, this will be a parallel transform object from `mpi4py-fft`
         init (tuple): This is the same `init` that is used throughout the problem classes
         init_forward (tuple): This is the equivalent of `init` in spectral space
@@ -885,7 +917,7 @@ class SpectralHelper:
     sparse_lib = scipy.sparse
     linalg = scipy.sparse.linalg
     dtype = mesh
-    fft_backend = 'fftw'
+    fft_backend = 'scipy'
     fft_comm_backend = 'MPI'
 
     @classmethod
@@ -894,17 +926,39 @@ def setup_GPU(cls):
         import cupy as cp
         import cupyx.scipy.sparse as sparse_lib
         import cupyx.scipy.sparse.linalg as linalg
+        import cupyx.scipy.fft as fft_lib
         from pySDC.implementations.datatype_classes.cupy_mesh import cupy_mesh
 
         cls.xp = cp
         cls.sparse_lib = sparse_lib
         cls.linalg = linalg
 
-        cls.fft_backend = 'cupy'
+        cls.fft_lib = fft_lib
+        cls.fft_backend = 'cupyx-scipy'
         cls.fft_comm_backend = 'NCCL'
 
         cls.dtype = cupy_mesh
 
+    @classmethod
+    def setup_CPU(cls, useFFTW=False):
+        """switch to CPU modules"""
+
+        cls.xp = np
+        cls.sparse_lib = scipy.sparse
+        cls.linalg = scipy.sparse.linalg
+
+        if useFFTW:
+            from mpi4py_fft import fftw
+
+            cls.fft_backend = 'fftw'
+            cls.fft_lib = fftw
+        else:
+            cls.fft_backend = 'scipy'
+            cls.fft_lib = scipy.fft
+
+        cls.fft_comm_backend = 'MPI'
+        cls.dtype = mesh
+
     def __init__(self, comm=None, useGPU=False, debug=False):
         """
         Constructor
@@ -920,6 +974,8 @@ def __init__(self, comm=None, useGPU=False, debug=False):
 
         if useGPU:
             self.setup_GPU()
+        else:
+            self.setup_CPU()
 
         self.axes = []
         self.components = []
@@ -931,6 +987,10 @@ def __init__(self, comm=None, useGPU=False, debug=False):
         self.fft_cache = {}
         self.fft_dealias_shape_cache = {}
 
+        self.logger = logging.getLogger(name='Spectral Discretization')
+        if debug:
+            self.logger.setLevel(logging.DEBUG)
+
     @property
     def u_init(self):
         """
@@ -945,6 +1005,13 @@ def u_init_forward(self):
         """
         return self.dtype(self.init_forward)
 
+    @property
+    def u_init_physical(self):
+        """
+        Get empty data container in physical space
+        """
+        return self.dtype(self.init_physical)
+
     @property
     def shape(self):
         """
@@ -979,7 +1046,6 @@ def add_axis(self, base, *args, **kwargs):
         kwargs['useGPU'] = self.useGPU
 
         if base.lower() in ['chebychov', 'chebychev', 'cheby', 'chebychovhelper']:
-            kwargs['transform_type'] = kwargs.get('transform_type', 'fft')
             self.axes.append(ChebychevHelper(*args, **kwargs))
         elif base.lower() in ['fft', 'fourier', 'ffthelper']:
             self.axes.append(FFTHelper(*args, **kwargs))
@@ -1080,15 +1146,35 @@ def get_BC(self, axis, kind, line=-1, scalar=False, **kwargs):
 
             Id = self.get_local_slice_of_1D_matrix(self.axes[axis2].get_Id() @ _Id, axis=axis2)
 
-            if self.useGPU:
-                Id = Id.get()
-
             mats = [
                 None,
             ] * ndim
             mats[axis] = self.get_local_slice_of_1D_matrix(BC, axis=axis)
             mats[axis2] = Id
-            return self.sparse_lib.csc_matrix(sp.kron(*mats))
+            return self.sparse_lib.csc_matrix(self.sparse_lib.kron(*mats))
+        if ndim == 3:
+            mats = [
+                None,
+            ] * ndim
+
+            for ax in range(ndim):
+                if ax == axis:
+                    continue
+
+                if scalar:
+                    _Id = self.sparse_lib.diags(self.xp.append([1], self.xp.zeros(self.axes[ax].N - 1)))
+                else:
+                    _Id = self.axes[ax].get_Id()
+
+                mats[ax] = self.get_local_slice_of_1D_matrix(self.axes[ax].get_Id() @ _Id, axis=ax)
+
+            mats[axis] = self.get_local_slice_of_1D_matrix(BC, axis=axis)
+
+            return self.sparse_lib.csc_matrix(self.sparse_lib.kron(mats[0], self.sparse_lib.kron(*mats[1:])))
+        else:
+            raise NotImplementedError(
+                f'Matrix expansion for boundary conditions not implemented for {ndim} dimensions!'
+            )
 
     def remove_BC(self, component, equation, axis, kind, line=-1, scalar=False, **kwargs):
         """
@@ -1121,7 +1207,7 @@ def remove_BC(self, component, equation, axis, kind, line=-1, scalar=False, **kw
                 + [slice(0, self.init[0][i + 1]) for i in range(axis + 1, len(self.axes))]
             )
         N = self.axes[axis].N
-        if (N + line) % N in self.xp.arange(N)[self.local_slice[axis]]:
+        if (N + line) % N in self.xp.arange(N)[self.local_slice()[axis]]:
             self.BC_rhs_mask[(*slices,)] = False
 
     def add_BC(self, component, equation, axis, kind, v, line=-1, scalar=False, **kwargs):
@@ -1165,15 +1251,10 @@ def add_BC(self, component, equation, axis, kind, v, line=-1, scalar=False, **kw
             else:
                 self.BC_rhs_mask[(*slices,)] = True
         else:
-            slices = (
-                [self.index(equation)]
-                + [slice(0, self.init[0][i + 1]) for i in range(axis)]
-                + [line]
-                + [slice(0, self.init[0][i + 1]) for i in range(axis + 1, len(self.axes))]
-            )
+            slices = [self.index(equation), *self.global_slice(True)]
             N = self.axes[axis].N
-            if (N + line) % N in self.xp.arange(N)[self.local_slice[axis]]:
-                slices[axis + 1] -= self.local_slice[axis].start
+            if (N + line) % N in self.get_indices(True)[axis]:
+                slices[axis + 1] = (N + line) % N - self.local_slice()[axis].start
                 self.BC_rhs_mask[(*slices,)] = True
 
     def setup_BCs(self):
@@ -1245,21 +1326,17 @@ def put_BCs_in_rhs_hat(self, rhs_hat):
             Generate a mask where we need to set values in the rhs in spectral space to zero, such that can replace them
             by the boundary conditions. The mask is then cached.
             """
-            self._rhs_hat_zero_mask = self.xp.zeros(shape=rhs_hat.shape, dtype=bool)
+            self._rhs_hat_zero_mask = self.newDistArray().astype(bool)
 
             for axis in range(self.ndim):
                 for bc in self.full_BCs:
-                    slices = (
-                        [slice(0, self.init[0][i + 1]) for i in range(axis)]
-                        + [bc['line']]
-                        + [slice(0, self.init[0][i + 1]) for i in range(axis + 1, len(self.axes))]
-                    )
                     if axis == bc['axis']:
-                        _slice = [self.index(bc['equation'])] + slices
+                        slices = [self.index(bc['equation']), *self.global_slice(True)]
                         N = self.axes[axis].N
-                        if (N + bc['line']) % N in self.xp.arange(N)[self.local_slice[axis]]:
-                            _slice[axis + 1] -= self.local_slice[axis].start
-                            self._rhs_hat_zero_mask[(*_slice,)] = True
+                        line = bc['line']
+                        if (N + line) % N in self.get_indices(True)[axis]:
+                            slices[axis + 1] = (N + line) % N - self.local_slice()[axis].start
+                            self._rhs_hat_zero_mask[(*slices,)] = True
 
         rhs_hat[self._rhs_hat_zero_mask] = 0
         return rhs_hat + self.rhs_BCs_hat
@@ -1284,18 +1361,14 @@ def put_BCs_in_rhs(self, rhs):
             _rhs_hat = self.transform(rhs, axes=(axis - ndim,))
 
             for bc in self.full_BCs:
-                slices = (
-                    [slice(0, self.init[0][i + 1]) for i in range(axis)]
-                    + [bc['line']]
-                    + [slice(0, self.init[0][i + 1]) for i in range(axis + 1, len(self.axes))]
-                )
+
                 if axis == bc['axis']:
-                    _slice = [self.index(bc['equation'])] + slices
+                    _slice = [self.index(bc['equation']), *self.global_slice(True)]
 
                     N = self.axes[axis].N
-                    if (N + bc['line']) % N in self.xp.arange(N)[self.local_slice[axis]]:
-                        _slice[axis + 1] -= self.local_slice[axis].start
-
+                    line = bc['line']
+                    if (N + line) % N in self.get_indices(True)[axis]:
+                        _slice[axis + 1] = (N + line) % N - self.local_slice()[axis].start
                         _rhs_hat[(*_slice,)] = bc['v']
 
             rhs = self.itransform(_rhs_hat, axes=(axis - ndim,))
@@ -1371,16 +1444,57 @@ def get_wavenumbers(self):
         """
         Get grid in spectral space
         """
-        grids = [self.axes[i].get_wavenumbers()[self.local_slice[i]] for i in range(len(self.axes))]
+        grids = [self.axes[i].get_wavenumbers()[self.local_slice(True)[i]] for i in range(len(self.axes))]
         return self.xp.meshgrid(*grids, indexing='ij')
 
-    def get_grid(self):
+    def get_grid(self, forward_output=False):
         """
         Get grid in physical space
         """
-        grids = [self.axes[i].get_1dgrid()[self.local_slice[i]] for i in range(len(self.axes))]
+        grids = [self.axes[i].get_1dgrid()[self.local_slice(forward_output)[i]] for i in range(len(self.axes))]
         return self.xp.meshgrid(*grids, indexing='ij')
 
+    def get_indices(self, forward_output=True):
+        return [self.xp.arange(self.axes[i].N)[self.local_slice(forward_output)[i]] for i in range(len(self.axes))]
+
+    @cache
+    def get_pfft(self, axes=None, padding=None, grid=None):
+        if self.ndim == 1 or self.comm is None:
+            return None
+        from mpi4py_fft import PFFT
+
+        axes = tuple(i for i in range(self.ndim)) if axes is None else axes
+        padding = list(padding if padding else [1.0 for _ in range(self.ndim)])
+
+        def no_transform(u, *args, **kwargs):
+            return u
+
+        transforms = {(i,): (no_transform, no_transform) for i in range(self.ndim)}
+        for i in axes:
+            transforms[((i + self.ndim) % self.ndim,)] = (self.axes[i].transform, self.axes[i].itransform)
+
+        # "transform" all axes to ensure consistent shapes.
+        # Transform non-distributable axes last to ensure they are aligned
+        _axes = tuple(sorted((axis + self.ndim) % self.ndim for axis in axes))
+        _axes = [axis for axis in _axes if not self.axes[axis].distributable] + sorted(
+            [axis for axis in _axes if self.axes[axis].distributable]
+            + [axis for axis in range(self.ndim) if axis not in _axes]
+        )
+
+        pfft = PFFT(
+            comm=self.comm,
+            shape=self.global_shape[1:],
+            axes=_axes,  # TODO: control the order of the transforms better
+            dtype='D',
+            collapse=False,
+            backend=self.fft_backend,
+            comm_backend=self.fft_comm_backend,
+            padding=padding,
+            transforms=transforms,
+            grid=grid,
+        )
+        return pfft
+
     def get_fft(self, axes=None, direction='object', padding=None, shape=None):
         """
         When using MPI, we use `PFFT` objects generated by mpi4py-fft
@@ -1442,6 +1556,18 @@ def get_fft(self, axes=None, direction='object', padding=None, shape=None):
 
         return self.fft_cache[key]
 
+    def local_slice(self, forward_output=True):
+        if self.fft_obj:
+            return self.get_pfft().local_slice(forward_output=forward_output)
+        else:
+            return [slice(0, me.N) for me in self.axes]
+
+    def global_slice(self, forward_output=True):
+        if self.fft_obj:
+            return [slice(0, me) for me in self.fft_obj.global_shape(forward_output=forward_output)]
+        else:
+            return self.local_slice(forward_output=forward_output)
+
     def setup_fft(self, real_spectral_coefficients=False):
         """
         This function must be called after all axes have been setup in order to prepare the local shapes of the data.
@@ -1454,18 +1580,25 @@ def setup_fft(self, real_spectral_coefficients=False):
             self.add_component('u')
 
         self.global_shape = (len(self.components),) + tuple(me.N for me in self.axes)
-        self.local_slice = [slice(0, me.N) for me in self.axes]
 
         axes = tuple(i for i in range(len(self.axes)))
-        self.fft_obj = self.get_fft(axes=axes, direction='object')
-        if self.fft_obj is not None:
-            self.local_slice = self.fft_obj.local_slice(False)
+        self.fft_obj = self.get_pfft(axes=axes)
 
         self.init = (
             np.empty(shape=self.global_shape)[
                 (
                     ...,
-                    *self.local_slice,
+                    *self.local_slice(False),
+                )
+            ].shape,
+            self.comm,
+            np.dtype('float'),
+        )
+        self.init_physical = (
+            np.empty(shape=self.global_shape)[
+                (
+                    ...,
+                    *self.local_slice(False),
                 )
             ].shape,
             self.comm,
@@ -1475,7 +1608,7 @@ def setup_fft(self, real_spectral_coefficients=False):
             np.empty(shape=self.global_shape)[
                 (
                     ...,
-                    *self.local_slice,
+                    *self.local_slice(True),
                 )
             ].shape,
             self.comm,
@@ -1483,435 +1616,175 @@ def setup_fft(self, real_spectral_coefficients=False):
         )
 
         self.BC_mat = self.get_empty_operator_matrix()
-        self.BC_rhs_mask = self.xp.zeros(
-            shape=self.init[0],
-            dtype=bool,
-        )
+        self.BC_rhs_mask = self.newDistArray().astype(bool)
 
-    def _transform_fft(self, u, axes, **kwargs):
+    def newDistArray(self, pfft=None, forward_output=True, val=0, rank=1, view=False):
         """
-        FFT along `axes`
-
-        Args:
-            u: The solution
-            axes (tuple): Axes you want to transform over
-
-        Returns:
-            transformed solution
+        Get an empty distributed array. This is almost a copy of the function of the same name from mpi4py-fft, but
+        takes care of all the solution components in the tensor.
         """
-        # TODO: clean up and try putting more of this in the 1D bases
-        fft = self.get_fft(axes, 'forward', **kwargs)
-        return fft(u, axes=axes)
-
-    def _transform_dct(self, u, axes, padding=None, **kwargs):
-        '''
-        DCT along `axes`.
-        This will only return real values!
-        When padding the solution, we cannot just use the mpi4py-fft implementation, because of the unusual ordering of
-        wavenumbers in FFTs.
-
-        Args:
-            u: The solution
-            axes (tuple): Axes you want to transform over
-
-        Returns:
-            transformed solution
-        '''
-        # TODO: clean up and try putting more of this in the 1D bases
-        if self.debug:
-            assert self.xp.allclose(u.imag, 0), 'This function can only handle real input.'
-
-        if len(axes) > 1:
-            v = self._transform_dct(self._transform_dct(u, axes[1:], **kwargs), (axes[0],), **kwargs)
-        else:
-            v = u.copy().astype(complex)
-            axis = axes[0]
-            base = self.axes[axis]
-
-            shuffle = [slice(0, s, 1) for s in u.shape]
-            shuffle[axis] = base.get_fft_shuffle(True, N=v.shape[axis])
-            v = v[(*shuffle,)]
-
-            if padding is not None:
-                shape = list(v.shape)
-                if ('forward', *padding) in self.fft_dealias_shape_cache.keys():
-                    shape[0] = self.fft_dealias_shape_cache[('forward', *padding)]
-                elif self.comm:
-                    send_buf = np.array(v.shape[0])
-                    recv_buf = np.array(v.shape[0])
-                    self.comm.Allreduce(send_buf, recv_buf)
-                    shape[0] = int(recv_buf)
-                fft = self.get_fft(axes, 'forward', shape=shape)
-            else:
-                fft = self.get_fft(axes, 'forward', **kwargs)
-
-            v = fft(v, axes=axes)
-
-            expansion = [np.newaxis for _ in u.shape]
-            expansion[axis] = slice(0, v.shape[axis], 1)
-
-            if padding is not None:
-                shift = base.get_fft_shift(True, v.shape[axis])
+        if self.comm is None:
+            return self.xp.zeros(self.init[0], dtype=self.init[2])
+        from mpi4py_fft.distarray import DistArray
 
-                if padding[axis] != 1:
-                    N = int(np.ceil(v.shape[axis] / padding[axis]))
-                    _expansion = [slice(0, n) for n in v.shape]
-                    _expansion[axis] = slice(0, N, 1)
-                    v = v[(*_expansion,)]
+        pfft = pfft if pfft else self.get_pfft()
+        if pfft is None:
+            if forward_output:
+                return self.u_init_forward
             else:
-                shift = base.fft_utils['fwd']['shift']
+                return self.u_init
 
-            v *= shift[(*expansion,)]
-
-        return v.real
-
-    def transform_single_component(self, u, axes=None, padding=None):
-        """
-        Transform a single component of the solution
-
-        Args:
-            u data to transform:
-            axes (tuple): Axes over which to transform
-            padding (list): Padding factor for transform. E.g. a padding factor of 2 will discard the upper half of modes after transforming
-
-        Returns:
-            Transformed data
-        """
-        # TODO: clean up and try putting more of this in the 1D bases
-        trfs = {
-            ChebychevHelper: self._transform_dct,
-            UltrasphericalHelper: self._transform_dct,
-            FFTHelper: self._transform_fft,
-        }
-
-        axes = tuple(-i - 1 for i in range(self.ndim)[::-1]) if axes is None else axes
-        padding = (
-            [
-                1,
-            ]
-            * self.ndim
-            if padding is None
-            else padding
-        )  # You know, sometimes I feel very strongly about Black still. This atrocious formatting is readable by Sauron only.
-
-        result = u.copy().astype(complex)
-        alignment = self.ndim - 1
-
-        axes_collapsed = [tuple(sorted(me for me in axes if type(self.axes[me]) == base)) for base in trfs.keys()]
-        bases = [list(trfs.keys())[i] for i in range(len(axes_collapsed)) if len(axes_collapsed[i]) > 0]
-        axes_collapsed = [me for me in axes_collapsed if len(me) > 0]
-        shape = [max(u.shape[i], self.global_shape[1 + i]) for i in range(self.ndim)]
-
-        fft = self.get_fft(axes=axes, padding=padding, direction='object')
-        if fft is not None:
-            shape = list(fft.global_shape(False))
-
-        for trf in range(len(axes_collapsed)):
-            _axes = axes_collapsed[trf]
-            base = bases[trf]
-
-            if len(_axes) == 0:
-                continue
-
-            for _ax in _axes:
-                shape[_ax] = self.global_shape[1 + self.ndim + _ax]
-
-            fft = self.get_fft(_axes, 'object', padding=padding, shape=shape)
-
-            _in = self.get_aligned(
-                result, axis_in=alignment, axis_out=self.ndim + _axes[-1], forward=False, fft=fft, shape=shape
-            )
-
-            alignment = self.ndim + _axes[-1]
-
-            _out = trfs[base](_in, axes=_axes, padding=padding, shape=shape)
-
-            if self.comm is not None:
-                _out *= np.prod([self.axes[i].N for i in _axes])
-
-            axes_next_base = (axes_collapsed + [(-1,)])[trf + 1]
-            alignment = alignment if len(axes_next_base) == 0 else self.ndim + axes_next_base[-1]
-            result = self.get_aligned(
-                _out, axis_in=self.ndim + _axes[0], axis_out=alignment, fft=fft, forward=True, shape=shape
-            )
-
-        return result
-
-    def transform(self, u, axes=None, padding=None):
-        """
-        Transform all components from physical space to spectral space
-
-        Args:
-            u data to transform:
-            axes (tuple): Axes over which to transform
-            padding (list): Padding factor for transform. E.g. a padding factor of 2 will discard the upper half of modes after transforming
-
-        Returns:
-            Transformed data
-        """
-        axes = tuple(-i - 1 for i in range(self.ndim)[::-1]) if axes is None else axes
-        padding = (
-            [
-                1,
-            ]
-            * self.ndim
-            if padding is None
-            else padding
-        )
-
-        result = [
-            None,
-        ] * self.ncomponents
-        for comp in self.components:
-            i = self.index(comp)
-
-            result[i] = self.transform_single_component(u[i], axes=axes, padding=padding)
+        global_shape = pfft.global_shape(forward_output)
+        p0 = pfft.pencil[forward_output]
+        if forward_output is True:
+            dtype = pfft.forward.output_array.dtype
+        else:
+            dtype = pfft.forward.input_array.dtype
+        global_shape = (self.ncomponents,) * rank + global_shape
 
-        return self.xp.stack(result)
+        if pfft.xfftn[0].backend in ["cupy", "cupyx-scipy"]:
+            from mpi4py_fft.distarrayCuPy import DistArrayCuPy as darraycls
+        else:
+            darraycls = DistArray
 
-    def _transform_ifft(self, u, axes, **kwargs):
-        # TODO: clean up and try putting more of this in the 1D bases
-        ifft = self.get_fft(axes, 'backward', **kwargs)
-        return ifft(u, axes=axes)
+        z = darraycls(global_shape, subcomm=p0.subcomm, val=val, dtype=dtype, alignment=p0.axis, rank=rank)
+        return z.v if view else z
 
-    def _transform_idct(self, u, axes, padding=None, **kwargs):
-        '''
-        This will only ever return real values!
-        '''
-        # TODO: clean up and try putting more of this in the 1D bases
-        if self.debug:
-            assert self.xp.allclose(u.imag, 0), 'This function can only handle real input.'
+    def infer_alignment(self, u, forward_output, padding=None, **kwargs):
+        if self.comm is None:
+            return [0]
 
-        v = u.copy().astype(complex)
+        def _alignment(pfft):
+            _arr = self.newDistArray(pfft, forward_output=forward_output)
+            _aligned_axes = [i for i in range(self.ndim) if _arr.global_shape[i + 1] == u.shape[i + 1]]
+            return _aligned_axes
 
-        if len(axes) > 1:
-            v = self._transform_idct(self._transform_idct(u, axes[1:]), (axes[0],))
+        if padding is None:
+            pfft = self.get_pfft(**kwargs)
+            aligned_axes = _alignment(pfft)
         else:
-            axis = axes[0]
-            base = self.axes[axis]
-
-            if padding is not None:
-                if padding[axis] != 1:
-                    N_pad = int(np.ceil(v.shape[axis] * padding[axis]))
-                    _pad = [[0, 0] for _ in v.shape]
-                    _pad[axis] = [0, N_pad - base.N]
-                    v = self.xp.pad(v, _pad, 'constant')
-
-                    shift = self.xp.exp(1j * np.pi * self.xp.arange(N_pad) / (2 * N_pad)) * base.N
-                else:
-                    shift = base.fft_utils['bck']['shift']
+            if self.ndim == 2:
+                padding_options = [(1.0, padding[1]), (padding[0], 1.0), padding, (1.0, 1.0)]
+            elif self.ndim == 3:
+                padding_options = [
+                    (1.0, 1.0, padding[2]),
+                    (1.0, padding[1], 1.0),
+                    (padding[0], 1.0, 1.0),
+                    (1.0, padding[1], padding[2]),
+                    (padding[0], 1.0, padding[2]),
+                    (padding[0], padding[1], 1.0),
+                    padding,
+                    (1.0, 1.0, 1.0),
+                ]
             else:
-                shift = base.fft_utils['bck']['shift']
-
-            expansion = [np.newaxis for _ in u.shape]
-            expansion[axis] = slice(0, v.shape[axis], 1)
-
-            v *= shift[(*expansion,)]
-
-            if padding is not None:
-                if padding[axis] != 1:
-                    shape = list(v.shape)
-                    if ('backward', *padding) in self.fft_dealias_shape_cache.keys():
-                        shape[0] = self.fft_dealias_shape_cache[('backward', *padding)]
-                    elif self.comm:
-                        send_buf = np.array(v.shape[0])
-                        recv_buf = np.array(v.shape[0])
-                        self.comm.Allreduce(send_buf, recv_buf)
-                        shape[0] = int(recv_buf)
-                    ifft = self.get_fft(axes, 'backward', shape=shape)
-                else:
-                    ifft = self.get_fft(axes, 'backward', padding=padding, **kwargs)
-            else:
-                ifft = self.get_fft(axes, 'backward', padding=padding, **kwargs)
-            v = ifft(v, axes=axes)
-
-            shuffle = [slice(0, s, 1) for s in v.shape]
-            shuffle[axis] = base.get_fft_shuffle(False, N=v.shape[axis])
-            v = v[(*shuffle,)]
-
-        return v.real
-
-    def itransform_single_component(self, u, axes=None, padding=None):
-        """
-        Inverse transform over single component of the solution
-
-        Args:
-            u data to transform:
-            axes (tuple): Axes over which to transform
-            padding (list): Padding factor for transform. E.g. a padding factor of 2 will add as many zeros as there were modes before before transforming
-
-        Returns:
-            Transformed data
-        """
-        # TODO: clean up and try putting more of this in the 1D bases
-        trfs = {
-            FFTHelper: self._transform_ifft,
-            ChebychevHelper: self._transform_idct,
-            UltrasphericalHelper: self._transform_idct,
-        }
-
-        axes = tuple(-i - 1 for i in range(self.ndim)[::-1]) if axes is None else axes
-        padding = (
-            [
-                1,
-            ]
-            * self.ndim
-            if padding is None
-            else padding
-        )
-
-        result = u.copy().astype(complex)
-        alignment = self.ndim - 1
-
-        axes_collapsed = [tuple(sorted(me for me in axes if type(self.axes[me]) == base)) for base in trfs.keys()]
-        bases = [list(trfs.keys())[i] for i in range(len(axes_collapsed)) if len(axes_collapsed[i]) > 0]
-        axes_collapsed = [me for me in axes_collapsed if len(me) > 0]
-        shape = list(self.global_shape[1:])
-
-        for trf in range(len(axes_collapsed)):
-            _axes = axes_collapsed[trf]
-            base = bases[trf]
-
-            if len(_axes) == 0:
-                continue
-
-            fft = self.get_fft(_axes, 'object', padding=padding, shape=shape)
-
-            _in = self.get_aligned(
-                result, axis_in=alignment, axis_out=self.ndim + _axes[0], forward=True, fft=fft, shape=shape
-            )
-            if self.comm is not None:
-                _in /= np.prod([self.axes[i].N for i in _axes])
-
-            alignment = self.ndim + _axes[0]
-
-            _out = trfs[base](_in, axes=_axes, padding=padding, shape=shape)
-
-            for _ax in _axes:
-                if fft:
-                    shape[_ax] = fft._input_shape[_ax]
-                else:
-                    shape[_ax] = _out.shape[_ax]
-
-            axes_next_base = (axes_collapsed + [(-1,)])[trf + 1]
-            alignment = alignment if len(axes_next_base) == 0 else self.ndim + axes_next_base[0]
-            result = self.get_aligned(
-                _out, axis_in=self.ndim + _axes[-1], axis_out=alignment, fft=fft, forward=False, shape=shape
-            )
-
-        return result
-
-    def get_aligned(self, u, axis_in, axis_out, fft=None, forward=False, **kwargs):
-        """
-        Realign the data along the axis when using distributed FFTs. `kwargs` will be used to get the correct PFFT
-        object from `mpi4py-fft`, which has suitable transfer classes for the shape of data. Hence, they should include
-        shape especially, if applicable.
-
-        Args:
-            u: The solution
-            axis_in (int): Current alignment
-            axis_out (int): New alignment
-            fft (mpi4py_fft.PFFT), optional: parallel FFT object
-            forward (bool): Whether the input is in spectral space or not
-
-        Returns:
-            solution aligned on `axis_in`
-        """
-        if self.comm is None or axis_in == axis_out:
-            return u.copy()
-        if self.comm.size == 1:
-            return u.copy()
-
-        global_fft = self.get_fft(**kwargs)
-        axisA = [me.axisA for me in global_fft.transfer]
-        axisB = [me.axisB for me in global_fft.transfer]
-
-        current_axis = axis_in
-
-        if axis_in in axisA and axis_out in axisB:
-            while current_axis != axis_out:
-                transfer = global_fft.transfer[axisA.index(current_axis)]
-
-                arrayB = self.xp.empty(shape=transfer.subshapeB, dtype=transfer.dtype)
-                arrayA = self.xp.empty(shape=transfer.subshapeA, dtype=transfer.dtype)
-                arrayA[:] = u[:]
-
-                transfer.forward(arrayA=arrayA, arrayB=arrayB)
+                raise NotImplementedError(f'Don\'t know how to infer alignment in {self.ndim}D!')
+            for _padding in padding_options:
+                pfft = self.get_pfft(padding=_padding, **kwargs)
+                aligned_axes = _alignment(pfft)
+                if len(aligned_axes) > 0:
+                    self.logger.debug(
+                        f'Found alignment of array with size {u.shape}: {aligned_axes} using padding {_padding}'
+                    )
+                    break
 
-                current_axis = transfer.axisB
-                u = arrayB
+        assert len(aligned_axes) > 0, f'Found no aligned axes for array of size {u.shape}!'
+        return aligned_axes
 
+    def redistribute(self, u, axis, forward_output, **kwargs):
+        if self.comm is None:
             return u
-        elif axis_in in axisB and axis_out in axisA:
-            while current_axis != axis_out:
-                transfer = global_fft.transfer[axisB.index(current_axis)]
 
-                arrayB = self.xp.empty(shape=transfer.subshapeB, dtype=transfer.dtype)
-                arrayA = self.xp.empty(shape=transfer.subshapeA, dtype=transfer.dtype)
-                arrayB[:] = u[:]
+        pfft = self.get_pfft(**kwargs)
+        _arr = self.newDistArray(pfft, forward_output=forward_output)
 
-                transfer.backward(arrayA=arrayA, arrayB=arrayB)
+        if 'Dist' in type(u).__name__ and False:
+            try:
+                u.redistribute(out=_arr)
+                return _arr
+            except AssertionError:
+                pass
+
+        u_alignment = self.infer_alignment(u, forward_output=False, **kwargs)
+        for alignment in u_alignment:
+            _arr = _arr.redistribute(alignment)
+            if _arr.shape == u.shape:
+                _arr[...] = u
+                return _arr.redistribute(axis)
+
+        raise Exception(
+            f'Don\'t know how to align array of local shape {u.shape} and global shape {self.global_shape}, aligned in axes {u_alignment}, to axis {axis}'
+        )
 
-                current_axis = transfer.axisA
-                u = arrayA
+    def transform(self, u, *args, axes=None, padding=None, **kwargs):
+        pfft = self.get_pfft(*args, axes=axes, padding=padding, **kwargs)
 
-            return u
-        else:  # go the potentially slower route of not reusing transfer classes
-            from mpi4py_fft import newDistArray
+        if pfft is None:
+            axes = axes if axes else tuple(i for i in range(self.ndim))
+            u_hat = u.copy()
+            for i in axes:
+                _axis = 1 + i if i >= 0 else i
+                u_hat = self.axes[i].transform(u_hat, axes=(_axis,))
+            return u_hat
 
-            fft = self.get_fft(**kwargs) if fft is None else fft
+        _in = self.newDistArray(pfft, forward_output=False, rank=1)
+        _out = self.newDistArray(pfft, forward_output=True, rank=1)
 
-            _in = newDistArray(fft, forward).redistribute(axis_in)
+        if _in.shape == u.shape:
             _in[...] = u
+        else:
+            _in[...] = self.redistribute(u, axis=_in.alignment, forward_output=False, padding=padding, **kwargs)
+
+        for i in range(self.ncomponents):
+            pfft.forward(_in[i], _out[i], normalize=False)
+
+        if padding is not None:
+            _out /= np.prod(padding)
+        return _out
+
+    def itransform(self, u, *args, axes=None, padding=None, **kwargs):
+        if padding is not None:
+            assert all(
+                (self.axes[i].N * padding[i]) % 1 == 0 for i in range(self.ndim)
+            ), 'Cannot do this padding with this resolution. Resulting resolution must be integer'
+
+        pfft = self.get_pfft(*args, axes=axes, padding=padding, **kwargs)
+        if pfft is None:
+            axes = axes if axes else tuple(i for i in range(self.ndim))
+            u_hat = u.copy()
+            for i in axes:
+                _axis = 1 + i if i >= 0 else i
+                u_hat = self.axes[i].itransform(u_hat, axes=(_axis,))
+            return u_hat
+
+        _in = self.newDistArray(pfft, forward_output=True, rank=1)
+        _out = self.newDistArray(pfft, forward_output=False, rank=1)
+
+        if _in.shape == u.shape:
+            _in[...] = u
+        else:
+            _in[...] = self.redistribute(u, axis=_in.alignment, forward_output=True, padding=padding, **kwargs)
 
-            return _in.redistribute(axis_out)
-
-    def itransform(self, u, axes=None, padding=None):
-        """
-        Inverse transform over all components of the solution
-
-        Args:
-            u data to transform:
-            axes (tuple): Axes over which to transform
-            padding (list): Padding factor for transform. E.g. a padding factor of 2 will add as many zeros as there were modes before before transforming
-
-        Returns:
-            Transformed data
-        """
-        axes = tuple(-i - 1 for i in range(self.ndim)[::-1]) if axes is None else axes
-        padding = (
-            [
-                1,
-            ]
-            * self.ndim
-            if padding is None
-            else padding
-        )
-
-        result = [
-            None,
-        ] * self.ncomponents
-        for comp in self.components:
-            i = self.index(comp)
-
-            result[i] = self.itransform_single_component(u[i], axes=axes, padding=padding)
+        for i in range(self.ncomponents):
+            pfft.backward(_in[i], _out[i], normalize=True)
 
-        return self.xp.stack(result)
+        if padding is not None:
+            _out *= np.prod(padding)
+        return _out
 
     def get_local_slice_of_1D_matrix(self, M, axis):
         """
         Get the local version of a 1D matrix. When using distributed FFTs, each rank will carry only a subset of modes,
-        which you can sort out via the `SpectralHelper.local_slice` attribute. When constructing a 1D matrix, you can
+        which you can sort out via the `SpectralHelper.local_slice()` attribute. When constructing a 1D matrix, you can
         use this method to get the part corresponding to the modes carried by this rank.
 
         Args:
             M (sparse matrix): Global 1D matrix you want to get the local version of
-            axis (int): Direction in which you want the local version. You will get the global matrix in other directions. This means slab decomposition only.
+            axis (int): Direction in which you want the local version. You will get the global matrix in other directions.
 
         Returns:
             sparse local matrix
         """
-        return M.tocsc()[self.local_slice[axis], self.local_slice[axis]]
+        return M.tocsc()[self.local_slice(True)[axis], self.local_slice(True)[axis]]
 
     def expand_matrix_ND(self, matrix, aligned):
         sp = self.sparse_lib
@@ -1929,6 +1802,15 @@ def expand_matrix_ND(self, matrix, aligned):
             mats[axis] = self.get_local_slice_of_1D_matrix(I1D, axis)
 
             return sp.kron(*mats)
+        elif ndim == 3:
+
+            mats = [None] * ndim
+            mats[aligned] = self.get_local_slice_of_1D_matrix(matrix, aligned)
+            for axis in axes:
+                I1D = sp.eye(self.axes[axis].N)
+                mats[axis] = self.get_local_slice_of_1D_matrix(I1D, axis)
+
+            return sp.kron(mats[0], sp.kron(*mats[1:]))
 
         else:
             raise NotImplementedError(f'Matrix expansion not implemented for {ndim} dimensions!')
diff --git a/pySDC/implementations/problem_classes/RayleighBenard.py b/pySDC/implementations/problem_classes/RayleighBenard.py
index 509bf3ca14..504c036576 100644
--- a/pySDC/implementations/problem_classes/RayleighBenard.py
+++ b/pySDC/implementations/problem_classes/RayleighBenard.py
@@ -222,7 +222,7 @@ def eval_f(self, u, *args, **kwargs):
         Dx_u_hat = (self._Dx_expanded @ u_hat.flatten()).reshape(u_hat.shape)
         Dz_u_hat = (self._Dz_expanded @ u_hat.flatten()).reshape(u_hat.shape)
 
-        padding = [self.dealiasing, self.dealiasing]
+        padding = (self.dealiasing, self.dealiasing)
         Dx_u_pad = self.itransform(Dx_u_hat, padding=padding).real
         Dz_u_pad = self.itransform(Dz_u_hat, padding=padding).real
         u_pad = self.itransform(u_hat, padding=padding).real
@@ -407,7 +407,7 @@ def compute_Nusselt_numbers(self, u):
         DzT_hat[iT] = (self.Dz @ u_hat[iT].flatten()).reshape(DzT_hat[iT].shape)
 
         # compute vT with dealiasing
-        padding = [self.dealiasing, self.dealiasing]
+        padding = (self.dealiasing, self.dealiasing)
         u_pad = self.itransform(u_hat, padding=padding).real
         _me = self.xp.zeros_like(u_pad)
         _me[0] = u_pad[iv] * u_pad[iT]
diff --git a/pySDC/implementations/problem_classes/RayleighBenard3D.py b/pySDC/implementations/problem_classes/RayleighBenard3D.py
new file mode 100644
index 0000000000..a49784dd81
--- /dev/null
+++ b/pySDC/implementations/problem_classes/RayleighBenard3D.py
@@ -0,0 +1,359 @@
+import numpy as np
+from mpi4py import MPI
+
+from pySDC.implementations.problem_classes.generic_spectral import GenericSpectralLinear
+from pySDC.implementations.datatype_classes.mesh import mesh, imex_mesh
+from pySDC.core.convergence_controller import ConvergenceController
+from pySDC.core.hooks import Hooks
+from pySDC.implementations.convergence_controller_classes.check_convergence import CheckConvergence
+from pySDC.core.problem import WorkCounter
+
+
+class RayleighBenard3D(GenericSpectralLinear):
+    """
+    Rayleigh-Benard Convection is a variation of incompressible Navier-Stokes.
+
+    The equations we solve are
+
+        u_x + v_y + w_z = 0
+        T_t - kappa (T_xx + T_yy + T_zz) = -uT_x - vT_y - wT_z
+        u_t - nu (u_xx + u_yy + u_zz) + p_x = -uu_x - vu_y - wu_z
+        v_t - nu (v_xx + v_yy + v_zz) + p_y = -uv_x - vv_y - wv_z
+        w_t - nu (w_xx + w_yy + w_zz) + p_z - T = -uw_x - vw_y - ww_z
+
+    with u the horizontal velocity, v the vertical velocity (in z-direction), T the temperature, p the pressure, indices
+    denoting derivatives, kappa=(Rayleigh * Prandtl)**(-1/2) and nu = (Rayleigh / Prandtl)**(-1/2). Everything on the left
+    hand side, that is the viscous part, the pressure gradient and the buoyancy due to temperature are treated
+    implicitly, while the non-linear convection part on the right hand side is integrated explicitly.
+
+    The domain, vertical boundary conditions and pressure gauge are
+
+        Omega = [0, 8) x (-1, 1)
+        T(z=+1) = 0
+        T(z=-1) = 2
+        u(z=+-1) = v(z=+-1) = 0
+        integral over p = 0
+
+    The spectral discretization uses FFT horizontally, implying periodic BCs, and an ultraspherical method vertically to
+    facilitate the Dirichlet BCs.
+
+    Parameters:
+        Prandtl (float): Prandtl number
+        Rayleigh (float): Rayleigh number
+        nx (int): Horizontal resolution
+        nz (int): Vertical resolution
+        BCs (dict): Can specify boundary conditions here
+        dealiasing (float): Dealiasing factor for evaluating the non-linear part
+        comm (mpi4py.Intracomm): Space communicator
+    """
+
+    dtype_u = mesh
+    dtype_f = imex_mesh
+
+    def __init__(
+        self,
+        Prandtl=1,
+        Rayleigh=2e6,
+        nx=256,
+        ny=256,
+        nz=64,
+        BCs=None,
+        dealiasing=1.5,
+        comm=None,
+        Lz=1,
+        Lx=1,
+        Ly=1,
+        useGPU=False,
+        **kwargs,
+    ):
+        """
+        Constructor. `kwargs` are forwarded to parent class constructor.
+
+        Args:
+            Prandtl (float): Prandtl number
+            Rayleigh (float): Rayleigh number
+            nx (int): Resolution in x-direction
+            nz (int): Resolution in z direction
+            BCs (dict): Vertical boundary conditions
+            dealiasing (float): Dealiasing for evaluating the non-linear part in real space
+            comm (mpi4py.Intracomm): Space communicator
+            Lx (float): Horizontal length of the domain
+        """
+        # TODO: documentation
+        BCs = {} if BCs is None else BCs
+        BCs = {
+            'T_top': 0,
+            'T_bottom': Lz,
+            'w_top': 0,
+            'w_bottom': 0,
+            'v_top': 0,
+            'v_bottom': 0,
+            'u_top': 0,
+            'u_bottom': 0,
+            'p_integral': 0,
+            **BCs,
+        }
+        if comm is None:
+            try:
+                from mpi4py import MPI
+
+                comm = MPI.COMM_WORLD
+            except ModuleNotFoundError:
+                pass
+        self._makeAttributeAndRegister(
+            'Prandtl',
+            'Rayleigh',
+            'nx',
+            'ny',
+            'nz',
+            'BCs',
+            'dealiasing',
+            'comm',
+            'Lx',
+            'Ly',
+            'Lz',
+            localVars=locals(),
+            readOnly=True,
+        )
+
+        bases = [
+            {'base': 'fft', 'N': nx, 'x0': 0, 'x1': self.Lx, 'useFFTW': not useGPU},
+            {'base': 'fft', 'N': ny, 'x0': 0, 'x1': self.Ly, 'useFFTW': not useGPU},
+            {'base': 'ultraspherical', 'N': nz, 'x0': 0, 'x1': self.Lz},
+        ]
+        components = ['u', 'v', 'w', 'T', 'p']
+        super().__init__(bases, components, comm=comm, useGPU=useGPU, **kwargs)
+
+        self.X, self.Y, self.Z = self.get_grid()
+        self.Kx, self.Ky, self.Kz = self.get_wavenumbers()
+
+        # construct 3D matrices
+        Dzz = self.get_differentiation_matrix(axes=(2,), p=2)
+        Dz = self.get_differentiation_matrix(axes=(2,))
+        Dy = self.get_differentiation_matrix(axes=(1,))
+        Dyy = self.get_differentiation_matrix(axes=(1,), p=2)
+        Dx = self.get_differentiation_matrix(axes=(0,))
+        Dxx = self.get_differentiation_matrix(axes=(0,), p=2)
+        Id = self.get_Id()
+
+        S1 = self.get_basis_change_matrix(p_out=0, p_in=1)
+        S2 = self.get_basis_change_matrix(p_out=0, p_in=2)
+
+        U01 = self.get_basis_change_matrix(p_in=0, p_out=1)
+        U12 = self.get_basis_change_matrix(p_in=1, p_out=2)
+        U02 = self.get_basis_change_matrix(p_in=0, p_out=2)
+
+        self.Dx = Dx
+        self.Dxx = Dxx
+        self.Dy = Dy
+        self.Dyy = Dyy
+        self.Dz = S1 @ Dz
+        self.Dzz = S2 @ Dzz
+        self.S2 = S2
+        self.S1 = S1
+
+        # compute rescaled Rayleigh number to extract viscosity and thermal diffusivity
+        Ra = Rayleigh / (max([abs(BCs['T_top'] - BCs['T_bottom']), np.finfo(float).eps]) * self.axes[2].L ** 3)
+        self.kappa = (Ra * Prandtl) ** (-1 / 2.0)
+        self.nu = (Ra / Prandtl) ** (-1 / 2.0)
+
+        # construct operators
+        _D = U02 @ (Dxx + Dyy) + Dzz
+        L_lhs = {
+            'p': {'u': U01 @ Dx, 'v': U01 @ Dy, 'w': Dz},  # divergence free constraint
+            'u': {'p': U02 @ Dx, 'u': -self.nu * _D},
+            'v': {'p': U02 @ Dy, 'v': -self.nu * _D},
+            'w': {'p': U12 @ Dz, 'w': -self.nu * _D, 'T': -U02 @ Id},
+            'T': {'T': -self.kappa * _D},
+        }
+        self.setup_L(L_lhs)
+
+        # mass matrix
+        _U02 = U02 @ Id
+        M_lhs = {i: {i: _U02} for i in ['u', 'v', 'w', 'T']}
+        self.setup_M(M_lhs)
+
+        # BCs
+        self.add_BC(
+            component='p', equation='p', axis=2, v=self.BCs['p_integral'], kind='integral', line=-1, scalar=True
+        )
+        self.add_BC(component='T', equation='T', axis=2, x=-1, v=self.BCs['T_bottom'], kind='Dirichlet', line=-1)
+        self.add_BC(component='T', equation='T', axis=2, x=1, v=self.BCs['T_top'], kind='Dirichlet', line=-2)
+        self.add_BC(component='w', equation='w', axis=2, x=1, v=self.BCs['w_top'], kind='Dirichlet', line=-1)
+        self.add_BC(component='w', equation='w', axis=2, x=-1, v=self.BCs['w_bottom'], kind='Dirichlet', line=-2)
+        self.remove_BC(component='w', equation='w', axis=2, x=-1, kind='Dirichlet', line=-2, scalar=True)
+        for comp in ['u', 'v']:
+            self.add_BC(
+                component=comp, equation=comp, axis=2, v=self.BCs[f'{comp}_top'], x=1, kind='Dirichlet', line=-2
+            )
+            self.add_BC(
+                component=comp,
+                equation=comp,
+                axis=2,
+                v=self.BCs[f'{comp}_bottom'],
+                x=-1,
+                kind='Dirichlet',
+                line=-1,
+            )
+
+        # eliminate Nyquist mode if needed
+        if nx % 2 == 0:
+            Nyquist_mode_index = self.axes[0].get_Nyquist_mode_index()
+            for component in self.components:
+                self.add_BC(
+                    component=component, equation=component, axis=0, kind='Nyquist', line=int(Nyquist_mode_index), v=0
+                )
+        if ny % 2 == 0:
+            Nyquist_mode_index = self.axes[0].get_Nyquist_mode_index()
+            for component in self.components:
+                self.add_BC(
+                    component=component, equation=component, axis=1, kind='Nyquist', line=int(Nyquist_mode_index), v=0
+                )
+        self.setup_BCs()
+
+        self.work_counters['rhs'] = WorkCounter()
+
+    def eval_f(self, u, *args, **kwargs):
+        f = self.f_init
+
+        if self.spectral_space:
+            u_hat = u.copy()
+        else:
+            u_hat = self.transform(u)
+
+        f_impl_hat = self.u_init_forward
+
+        iu, iv, iw, iT, ip = self.index(['u', 'v', 'w', 'T', 'p'])
+        derivative_indices = [iu, iv, iw, iT]
+
+        # evaluate implicit terms
+        f_impl_hat = -(self.L @ u_hat.flatten()).reshape(u_hat.shape)
+        for i in derivative_indices:
+            f_impl_hat[i] = (self.S2 @ f_impl_hat[i].flatten()).reshape(f_impl_hat[i].shape)
+        f_impl_hat[ip] = (self.S1 @ f_impl_hat[ip].flatten()).reshape(f_impl_hat[ip].shape)
+
+        if self.spectral_space:
+            self.xp.copyto(f.impl, f_impl_hat)
+        else:
+            f.impl[:] = self.itransform(f_impl_hat).real
+
+        # -------------------------------------------
+        # treat convection explicitly with dealiasing
+
+        # start by computing derivatives
+        padding = (self.dealiasing,) * self.ndim
+        derivatives = []
+        u_hat_flat = [u_hat[i].flatten() for i in derivative_indices]
+
+        _D_u_hat = self.u_init_forward
+        for D in [self.Dx, self.Dy, self.Dz]:
+            _D_u_hat *= 0
+            for i in derivative_indices:
+                self.xp.copyto(_D_u_hat[i], (D @ u_hat_flat[i]).reshape(_D_u_hat[i].shape))
+            derivatives.append(self.itransform(_D_u_hat, padding=padding).real)
+
+        u_pad = self.itransform(u_hat, padding=padding).real
+
+        fexpl_pad = self.xp.zeros_like(u_pad)
+        for i in derivative_indices:
+            for i_vel, iD in zip([iu, iv, iw], range(self.ndim)):
+                fexpl_pad[i] -= u_pad[i_vel] * derivatives[iD][i]
+
+        if self.spectral_space:
+            self.xp.copyto(f.expl, self.transform(fexpl_pad, padding=padding))
+        else:
+            f.expl[:] = self.itransform(self.transform(fexpl_pad, padding=padding)).real
+
+        self.work_counters['rhs']()
+        return f
+
+    def u_exact(self, t=0, noise_level=1e-3, seed=99):
+        assert t == 0
+        assert (
+            self.BCs['v_top'] == self.BCs['v_bottom']
+        ), 'Initial conditions are only implemented for zero velocity gradient'
+
+        me = self.spectral.u_init
+        iu, iw, iT, ip = self.index(['u', 'w', 'T', 'p'])
+
+        # linear temperature gradient
+        assert self.Lz == 1
+        for comp in ['T', 'u', 'v', 'w']:
+            a = self.BCs[f'{comp}_top'] - self.BCs[f'{comp}_bottom']
+            b = self.BCs[f'{comp}_bottom']
+            me[self.index(comp)] = a * self.Z + b
+
+        # perturb slightly
+        rng = self.xp.random.default_rng(seed=seed)
+
+        noise = self.spectral.u_init
+        noise[iT] = rng.random(size=me[iT].shape)
+
+        me[iT] += noise[iT].real * noise_level * (self.Z - 1) * (self.Z + 1)
+
+        if self.spectral_space:
+            me_hat = self.spectral.u_init_forward
+            me_hat[:] = self.transform(me)
+            return me_hat
+        else:
+            return me
+
+    def get_fig(self):  # pragma: no cover
+        """
+        Get a figure suitable to plot the solution of this problem
+
+        Returns
+        -------
+        self.fig : matplotlib.pyplot.figure.Figure
+        """
+        import matplotlib.pyplot as plt
+        from mpl_toolkits.axes_grid1 import make_axes_locatable
+
+        plt.rcParams['figure.constrained_layout.use'] = True
+        self.fig, axs = plt.subplots(2, 1, sharex=True, sharey=True, figsize=((10, 5)))
+        self.cax = []
+        divider = make_axes_locatable(axs[0])
+        self.cax += [divider.append_axes('right', size='3%', pad=0.03)]
+        divider2 = make_axes_locatable(axs[1])
+        self.cax += [divider2.append_axes('right', size='3%', pad=0.03)]
+        return self.fig
+
+    def plot(self, u, t=None, fig=None, quantity='T'):  # pragma: no cover
+        r"""
+        Plot the solution.
+
+        Parameters
+        ----------
+        u : dtype_u
+            Solution to be plotted
+        t : float
+            Time to display at the top of the figure
+        fig : matplotlib.pyplot.figure.Figure
+            Figure with the same structure as a figure generated by `self.get_fig`. If none is supplied, a new figure will be generated.
+        quantity : (str)
+            quantity you want to plot
+
+        Returns
+        -------
+        None
+        """
+        fig = self.get_fig() if fig is None else fig
+        axs = fig.axes
+
+        imV = axs[1].pcolormesh(self.X, self.Z, self.compute_vorticity(u).real)
+
+        if self.spectral_space:
+            u = self.itransform(u)
+
+        imT = axs[0].pcolormesh(self.X, self.Z, u[self.index(quantity)].real)
+
+        for i, label in zip([0, 1], [rf'${quantity}$', 'vorticity']):
+            axs[i].set_aspect(1)
+            axs[i].set_title(label)
+
+        if t is not None:
+            fig.suptitle(f't = {t:.2f}')
+        axs[1].set_xlabel(r'$x$')
+        axs[1].set_ylabel(r'$z$')
+        fig.colorbar(imT, self.cax[0])
+        fig.colorbar(imV, self.cax[1])
diff --git a/pySDC/implementations/problem_classes/generic_spectral.py b/pySDC/implementations/problem_classes/generic_spectral.py
index c14e60a22a..306b6f731b 100644
--- a/pySDC/implementations/problem_classes/generic_spectral.py
+++ b/pySDC/implementations/problem_classes/generic_spectral.py
@@ -347,8 +347,8 @@ def solve_system(self, rhs, dt, u0=None, *args, skip_itransform=False, **kwargs)
     def setUpFieldsIO(self):
         Rectilinear.setupMPI(
             comm=self.comm,
-            iLoc=[me.start for me in self.local_slice],
-            nLoc=[me.stop - me.start for me in self.local_slice],
+            iLoc=[me.start for me in self.local_slice(False)],
+            nLoc=[me.stop - me.start for me in self.local_slice(False)],
         )
 
     def getOutputFile(self, fileName):
diff --git a/pySDC/tests/test_helpers/test_spectral_helper.py b/pySDC/tests/test_helpers/test_spectral_helper.py
index 7df1eff3ac..a56fab01c2 100644
--- a/pySDC/tests/test_helpers/test_spectral_helper.py
+++ b/pySDC/tests/test_helpers/test_spectral_helper.py
@@ -4,9 +4,8 @@
 @pytest.mark.base
 @pytest.mark.parametrize('nx', [16])
 @pytest.mark.parametrize('nz', [16])
-@pytest.mark.parametrize('variant', ['T2U', 'T2T'])
 @pytest.mark.parametrize('axes', [(-2,), (-1,), (-2, -1)])
-def test_integration_matrix2D(nx, nz, variant, axes, useMPI=False, **kwargs):
+def test_integration_matrix2D(nx, nz, axes, useMPI=False, **kwargs):
     import numpy as np
     from pySDC.helpers.spectral_helper import SpectralHelper
 
@@ -45,29 +44,27 @@ def test_integration_matrix2D(nx, nz, variant, axes, useMPI=False, **kwargs):
     assert np.allclose(S_u, expect, atol=1e-12)
 
 
-@pytest.mark.base
+@pytest.mark.mpi4py
 @pytest.mark.parametrize('nx', [32])
 @pytest.mark.parametrize('nz', [16])
-@pytest.mark.parametrize('variant', ['T2U', 'T2T'])
 @pytest.mark.parametrize('axes', [(-2,), (-1,), (-2, -1)])
 @pytest.mark.parametrize('bx', ['cheby', 'fft'])
 @pytest.mark.parametrize('bz', ['cheby', 'fft'])
-def test_differentiation_matrix2D(nx, nz, variant, axes, bx, bz, useMPI=False, **kwargs):
-    import numpy as np
+def test_differentiation_matrix2D(nx, nz, axes, bx, bz, useGPU=False, **kwargs):
     from pySDC.helpers.spectral_helper import SpectralHelper
 
-    if useMPI:
-        from mpi4py import MPI
-
-        comm = MPI.COMM_WORLD
-    else:
-        comm = None
-
-    helper = SpectralHelper(comm=comm, debug=True)
+    helper = SpectralHelper(debug=True, useGPU=useGPU)
     helper.add_axis(base=bx, N=nx)
     helper.add_axis(base=bz, N=nz)
     helper.setup_fft()
 
+    np = helper.xp
+
+    if useGPU:
+        import cupy
+
+        assert np == cupy
+
     X, Z = helper.get_grid()
     conv = helper.get_basis_change_matrix()
     D = helper.get_differentiation_matrix(axes)
@@ -95,30 +92,30 @@ def test_differentiation_matrix2D(nx, nz, variant, axes, bx, bz, useMPI=False, *
         else:
             raise NotImplementedError
 
-    u_hat = helper.transform(u, axes=(-2, -1))
+    u_hat = helper.transform(u)
     D_u_hat = (conv @ D @ u_hat.flatten()).reshape(u_hat.shape)
-    D_u = helper.itransform(D_u_hat, axes=(-1, -2)).real
+    D_u = helper.itransform(D_u_hat).real
 
-    assert np.allclose(D_u, expect, atol=1e-12)
+    assert np.allclose(D_u, expect, atol=1e-11)
+
+
+@pytest.mark.cupy
+@pytest.mark.parametrize('axes', [(-2,), (-1,), (-2, -1)])
+@pytest.mark.parametrize('bx', ['cheby', 'fft'])
+@pytest.mark.parametrize('bz', ['cheby', 'fft'])
+def test_differentiation_matrix2D_GPU(bx, bz, axes):
+    test_differentiation_matrix2D(32, 16, bx=bx, bz=bz, axes=axes, useGPU=True)
 
 
 @pytest.mark.base
 @pytest.mark.parametrize('nx', [32])
 @pytest.mark.parametrize('nz', [16])
-@pytest.mark.parametrize('variant', ['T2U', 'T2T'])
 @pytest.mark.parametrize('bx', ['cheby', 'fft'])
-def test_identity_matrix2D(nx, nz, variant, bx, useMPI=False, **kwargs):
+def test_identity_matrix2D(nx, nz, bx, **kwargs):
     import numpy as np
     from pySDC.helpers.spectral_helper import SpectralHelper
 
-    if useMPI:
-        from mpi4py import MPI
-
-        comm = MPI.COMM_WORLD
-    else:
-        comm = None
-
-    helper = SpectralHelper(comm=comm, debug=True)
+    helper = SpectralHelper(debug=True)
     helper.add_axis(base=bx, N=nx)
     helper.add_axis(base='cheby', N=nz)
     helper.setup_fft()
@@ -162,7 +159,7 @@ def test_matrix1D(N, base, type):
 
     u = helper.u_init
     u[0] = C @ D @ coeffs
-    du = helper.itransform(u, axes=(-1,))
+    du = helper.itransform(u)
 
     if type == 'diff':
         exact = np.polynomial.Chebyshev(coeffs).deriv(1)(x)
@@ -176,18 +173,15 @@ def _test_transform_dealias(
     bx,
     bz,
     axis,
-    nx=2**4 + 1,
-    nz=2**2 + 1,
+    nx=2**4,
+    nz=2**2,
     padding=3 / 2,
-    axes=(
-        -2,
-        -1,
-    ),
     useMPI=True,
+    useGPU=False,
     **kwargs,
 ):
-    import numpy as np
     from pySDC.helpers.spectral_helper import SpectralHelper
+    import numpy as np
 
     if useMPI:
         from mpi4py import MPI
@@ -196,17 +190,25 @@ def _test_transform_dealias(
     else:
         comm = None
 
-    helper = SpectralHelper(comm=comm, debug=True)
+    helper = SpectralHelper(comm=comm, debug=True, useGPU=useGPU)
     helper.add_axis(base=bx, N=nx)
     helper.add_axis(base=bz, N=nz)
     helper.setup_fft()
     xp = helper.xp
 
-    _padding = [
-        padding,
-    ] * helper.ndim
+    if useGPU:
+        import cupy
+
+        assert xp == cupy
 
-    helper_pad = SpectralHelper(comm=comm, debug=True)
+    _padding = tuple(
+        [
+            padding,
+        ]
+        * helper.ndim
+    )
+
+    helper_pad = SpectralHelper(comm=comm, debug=True, useGPU=useGPU)
     helper_pad.add_axis(base=bx, N=int(_padding[0] * nx))
     helper_pad.add_axis(base=bz, N=int(_padding[1] * nz))
     helper_pad.setup_fft()
@@ -219,14 +221,26 @@ def _test_transform_dealias(
     X, Z = helper.get_grid()
     X_pad, Z_pad = helper_pad.get_grid()
 
+    if useGPU:
+        X_CPU = X.get()
+        Z_CPU = Z.get()
+        X_pad_CPU = X_pad.get()
+        Z_pad_CPU = Z_pad.get()
+    else:
+        X_CPU = X
+        Z_CPU = Z
+        X_pad_CPU = X_pad
+        Z_pad_CPU = Z_pad
+
     if axis == -2:
         f = nx // 3
         u_hat[0][xp.logical_and(xp.abs(Kx) == f, Kz == 0)] += 1
         u2_hat_expect[0][xp.logical_and(xp.abs(Kx) == 2 * f, Kz == 0)] += 1 / nx
         u2_hat_expect[0][xp.logical_and(xp.abs(Kx) == 0, Kz == 0)] += 2 / nx
-        u_expect[0] += np.cos(f * X) * 2 / nx
-        u_expect_pad[0] += np.cos(f * X_pad) * 2 / nx
+        u_expect[0] += xp.cos(f * X) * 2 / nx
+        u_expect_pad[0] += xp.cos(f * X_pad) * 2 / nx
     elif axis == -1:
+
         f = nz // 2 + 1
         u_hat[0][xp.logical_and(xp.abs(Kz) == f, Kx == 0)] += 1
         u2_hat_expect[0][xp.logical_and(Kz == 2 * f, Kx == 0)] += 1 / (2 * nx)
@@ -234,8 +248,8 @@ def _test_transform_dealias(
 
         coef = np.zeros(nz)
         coef[f] = 1 / nx
-        u_expect[0] = np.polynomial.Chebyshev(coef)(Z)
-        u_expect_pad[0] = np.polynomial.Chebyshev(coef)(Z_pad)
+        u_expect[0] = xp.array(np.polynomial.Chebyshev(coef)(Z_CPU))
+        u_expect_pad[0] = xp.array(np.polynomial.Chebyshev(coef)(Z_pad_CPU))
     elif axis in [(-1, -2), (-2, -1)]:
         fx = nx // 3
         fz = nz // 2 + 1
@@ -252,35 +266,47 @@ def _test_transform_dealias(
         coef = np.zeros(nz)
         coef[fz] = 1 / nx
 
-        u_expect[0] = np.cos(fx * X) * 2 / nx + np.polynomial.Chebyshev(coef)(Z)
-        u_expect_pad[0] = np.cos(fx * X_pad) * 2 / nx + np.polynomial.Chebyshev(coef)(Z_pad)
+        u_expect[0] = xp.cos(fx * X) * 2 / nx + xp.array(np.polynomial.Chebyshev(coef)(Z_CPU))
+        u_expect_pad[0] = xp.cos(fx * X_pad) * 2 / nx + xp.array(np.polynomial.Chebyshev(coef)(Z_pad_CPU))
     else:
         raise NotImplementedError
 
     assert bx == 'fft' and bz == 'cheby', 'This test is not implemented for the bases you are looking for'
 
-    u_pad = helper.itransform(u_hat, padding=_padding, axes=axes)
-    u = helper.itransform(u_hat, axes=axes).real
+    u_pad = helper.itransform(u_hat, padding=_padding)
+    u = helper.itransform(u_hat).real
 
-    assert not np.allclose(u_pad.shape, u.shape)
+    assert not xp.allclose(u_pad.shape, u.shape) or padding == 1
 
     u2 = u**2
     u2_pad = u_pad**2
 
-    assert np.allclose(u, u_expect)
-    assert np.allclose(u_pad, u_expect_pad)
+    assert xp.allclose(u, u_expect)
+    assert xp.allclose(u_pad, u_expect_pad)
+
+    u2_hat = helper.transform(u2_pad, padding=_padding)
+    assert xp.allclose(u2_hat_expect, u2_hat)
+    assert not xp.allclose(u2_hat_expect, helper.transform(u2)), 'Test is too boring, no dealiasing needed'
+
 
-    assert np.allclose(u2_hat_expect, helper.transform(u2_pad, padding=_padding))
-    assert not np.allclose(u2_hat_expect, helper.transform(u2)), 'Test is too boring, no dealiasing needed'
+@pytest.mark.cupy
+@pytest.mark.parametrize('axis', [-1, -2])
+@pytest.mark.parametrize('bx', ['fft'])
+@pytest.mark.parametrize('bz', ['cheby'])
+def test_dealias_GPU(axis, bx, bz, **kwargs):
+    _test_transform_dealias(axis=axis, bx=bx, bz=bz, **kwargs, useGPU=True)
 
 
 @pytest.mark.base
 @pytest.mark.parametrize('nx', [3, 8])
-@pytest.mark.parametrize('nz', [3, 8])
+@pytest.mark.parametrize('ny', [3, 8])
+@pytest.mark.parametrize('nz', [0, 3, 8])
+@pytest.mark.parametrize('by', ['fft', 'cheby'])
 @pytest.mark.parametrize('bz', ['fft', 'cheby'])
 @pytest.mark.parametrize('bx', ['fft', 'cheby'])
-@pytest.mark.parametrize('axes', [(-1,), (-2,), (-1, -2), (-2, -1)])
-def test_transform(nx, nz, bx, bz, axes, useMPI=False, **kwargs):
+@pytest.mark.parametrize('axes', [(-1,), (-2,), (-3,), (-1, -2), (-2, -1), (-1, -2, -3)])
+@pytest.mark.parametrize('padding', [1])
+def test_transform(nx, ny, nz, bx, by, bz, axes, padding, useMPI=False, **kwargs):
     import numpy as np
     from pySDC.helpers.spectral_helper import SpectralHelper
 
@@ -288,49 +314,105 @@ def test_transform(nx, nz, bx, bz, axes, useMPI=False, **kwargs):
         from mpi4py import MPI
 
         comm = MPI.COMM_WORLD
-        rank = comm.rank
     else:
         comm = None
 
     helper = SpectralHelper(comm=comm, debug=True)
     helper.add_axis(base=bx, N=nx)
-    helper.add_axis(base=bz, N=nz)
-    helper.setup_fft()
+    helper.add_axis(base=by, N=ny)
+    if nz > 0:
+        helper.add_axis(base=bz, N=nz)
+    elif -3 in axes:
+        return None
 
+    helper.setup_fft()
     u = helper.u_init
-    u[...] = np.random.random(u.shape)
 
-    u_all = np.empty(shape=(1, nx, nz), dtype=u.dtype)
+    if nz > 0:
+        u_all = np.random.random((1, nx, ny, nz)).astype(u.dtype)
+    else:
+        u_all = np.random.random((1, nx, ny)).astype(u.dtype)
 
     if useMPI:
-        rank = comm.rank
-        u_all[...] = (np.array(comm.allgather(u[0]))).reshape(u_all.shape)
-        if comm.size == 1:
-            assert np.allclose(u_all, u)
-    else:
-        rank = 0
-        u_all[...] = u
+        u_all = comm.bcast(u_all, root=0)
+
+    u[...] = u_all[
+        (
+            0,
+            *helper.local_slice(False),
+        )
+    ]
+
+    axes_ordered = []
+    for ax in axes:
+        if 'FFT' in type(helper.axes[ax]).__name__:
+            axes_ordered = axes_ordered + [ax]
+        else:
+            axes_ordered = [ax] + axes_ordered
 
     expect_trf = u_all.copy()
-
-    if bx == 'fft' and bz == 'cheby':
-        axes_1d = sorted(axes)[::-1]
-    elif bx == 'cheby' and bz == 'fft':
-        axes_1d = sorted(axes)
-    else:
-        axes_1d = axes
-
-    for i in axes_1d:
+    for i in axes_ordered:
         base = helper.axes[i]
-        expect_trf = base.transform(expect_trf, axis=i)
+        expect_trf = base.transform(expect_trf, axes=(i,))
 
     trf = helper.transform(u, axes=axes)
     itrf = helper.itransform(trf, axes=axes)
 
-    expect_local = expect_trf[:, trf.shape[1] * rank : trf.shape[1] * (rank + 1), :]
+    expect_local = expect_trf[
+        (
+            ...,
+            *helper.local_slice(True),
+        )
+    ]
+    if expect_local.shape != trf.shape:
+        expect_local = expect_trf[
+            (
+                ...,
+                *helper.local_slice(True),
+            )
+        ]
 
     assert np.allclose(expect_local, trf), 'Forward transform is unexpected'
-    assert np.allclose(itrf, u), 'Backward transform is unexpected'
+    assert np.allclose(
+        itrf,
+        u_all[
+            (
+                0,
+                *helper.local_slice(False),
+            )
+        ],
+    ), 'Backward transform is unexpected'
+
+    if padding > 1:
+        _padding = (padding,) * helper.ndim
+        u_pad = helper.itransform(trf, axes=axes, padding=_padding)
+        trf2 = helper.transform(u_pad, axes=axes, padding=_padding)
+        assert np.allclose(trf2, trf)
+        assert sum(u_pad.shape) > sum(u.shape), f'{u_pad.shape}, {u.shape}'
+
+
+@pytest.mark.mpi4py
+@pytest.mark.mpi(ranks=[1, 2])
+@pytest.mark.parametrize('nx', [4, 8])
+@pytest.mark.parametrize('ny', [4, 8])
+@pytest.mark.parametrize('nz', [0, 8])
+@pytest.mark.parametrize(
+    'by',
+    [
+        'fft',
+    ],
+)
+@pytest.mark.parametrize(
+    'bx',
+    [
+        'fft',
+    ],
+)
+@pytest.mark.parametrize('bz', ['fft', 'cheby'])
+@pytest.mark.parametrize('axes', [(-1,), (-1, -2), (-2, -1, -3)])
+@pytest.mark.parametrize('padding', [1, 1.5])
+def test_transform_MPI(mpi_ranks, nx, ny, nz, bx, by, bz, axes, padding, **kwargs):
+    test_transform(nx=nx, ny=ny, nz=nz, bx=bx, by=by, bz=bz, axes=axes, padding=padding, useMPI=True, **kwargs)
 
 
 def run_MPI_test(num_procs, **kwargs):
@@ -355,17 +437,6 @@ def run_MPI_test(num_procs, **kwargs):
     )
 
 
-@pytest.mark.mpi4py
-@pytest.mark.parametrize('nx', [4, 8])
-@pytest.mark.parametrize('nz', [4, 8])
-@pytest.mark.parametrize('bz', ['fft', 'cheby'])
-@pytest.mark.parametrize('bx', ['fft'])
-@pytest.mark.parametrize('num_procs', [2, 1])
-@pytest.mark.parametrize('axes', ["-1", "-2", "-1,-2"])
-def test_transform_MPI(nx, nz, bx, bz, num_procs, axes):
-    run_MPI_test(num_procs=num_procs, test='transform', nx=nx, nz=nz, bx=bx, bz=bz, axes=axes)
-
-
 @pytest.mark.mpi4py
 @pytest.mark.parametrize('nx', [8])
 @pytest.mark.parametrize('nz', [16])
@@ -396,7 +467,7 @@ def test_tau_method_integral(N, bc_val):
 @pytest.mark.parametrize('bc', [-1, 0, 1])
 @pytest.mark.parametrize('N', [8, 32])
 @pytest.mark.parametrize('bc_val', [-99, 3.1415])
-def test_tau_method(bc, N, bc_val, kind='Dirichlet'):
+def test_tau_method(bc, N, bc_val, kind='Dirichlet', useGPU=False):
     '''
     solve u_x - u + tau P = 0, u(bc) = bc_val
 
@@ -406,14 +477,22 @@ def test_tau_method(bc, N, bc_val, kind='Dirichlet'):
     The test verifies that the solution satisfies the perturbed equation and the boundary condition.
     '''
     from pySDC.helpers.spectral_helper import SpectralHelper
-    import numpy as np
     import scipy.sparse as sp
+    import numpy as np
 
-    helper = SpectralHelper(debug=True)
+    helper = SpectralHelper(debug=True, useGPU=useGPU)
     helper.add_component('u')
     helper.add_axis(base='cheby', N=N)
     helper.setup_fft()
 
+    xp = helper.xp
+    linalg = helper.linalg
+
+    if useGPU:
+        import cupy
+
+        assert xp == cupy
+
     if kind == 'integral':
         helper.add_BC('u', 'u', 0, v=bc_val, kind='integral')
     else:
@@ -429,20 +508,24 @@ def test_tau_method(bc, N, bc_val, kind='Dirichlet'):
     A = helper.convert_operator_matrix_to_operator(_A)
     A = helper.put_BCs_in_matrix(A)
 
-    rhs = helper.put_BCs_in_rhs(np.zeros((1, N)))
+    rhs = helper.put_BCs_in_rhs(xp.zeros((1, N)))
     rhs_hat = helper.transform(rhs, axes=(-1,))
 
-    sol_hat = sp.linalg.spsolve(A, rhs_hat.flatten())
+    sol_hat = linalg.spsolve(A, rhs_hat.flatten())
+
+    if useGPU:
+        C = C.get()
+        sol_hat = sol_hat.get()
 
     sol_poly = np.polynomial.Chebyshev(sol_hat)
     d_sol_poly = sol_poly.deriv(1)
-    x = np.linspace(-1, 1, 100)
+    x = xp.linspace(-1, 1, 100)
 
     if kind == 'integral':
         integral = sol_poly.integ(1, lbnd=-1)
-        assert np.isclose(integral(1), bc_val), 'Solution does not satisfy boundary condition'
+        assert xp.isclose(integral(1), bc_val), 'Solution does not satisfy boundary condition'
     else:
-        assert np.isclose(sol_poly(bc), bc_val), 'Solution does not satisfy boundary condition'
+        assert xp.isclose(sol_poly(bc), bc_val), 'Solution does not satisfy boundary condition'
 
     coef = np.append(np.zeros(N - 1), [1])
     P = np.polynomial.Chebyshev(C @ coef)
@@ -450,12 +533,16 @@ def test_tau_method(bc, N, bc_val, kind='Dirichlet'):
     assert np.allclose(tau, tau[0]), 'Solution does not satisfy perturbed equation'
 
 
+@pytest.mark.cupy
+def test_tau_method_GPU():
+    test_tau_method(-1, 8, 2.77, useGPU=True)
+
+
 @pytest.mark.base
-@pytest.mark.parametrize('variant', ['T2T', 'T2U'])
 @pytest.mark.parametrize('nx', [4, 8])
 @pytest.mark.parametrize('nz', [4, 8])
 @pytest.mark.parametrize('bc_val', [-2, 1.0])
-def test_tau_method2D(variant, nz, nx, bc_val, bc=-1, useMPI=False, plotting=False, **kwargs):
+def test_tau_method2D(nz, nx, bc_val, bc=-1, plotting=False, useMPI=False, **kwargs):
     '''
     solve u_z - 0.1u_xx -u_x + tau P = 0, u(bc) = sin(bc_val*x) -> space-time discretization of advection-diffusion
     problem. We do FFT in x-direction and Chebychov in z-direction.
@@ -476,10 +563,9 @@ def test_tau_method2D(variant, nz, nx, bc_val, bc=-1, useMPI=False, plotting=Fal
     helper.add_component(['u'])
     helper.setup_fft()
 
-    X, Z = helper.get_grid()
+    X, Z = helper.get_grid(forward_output=True)
     x = X[:, 0]
     z = Z[0, :]
-    shape = helper.init[0][1:]
 
     bcs = np.sin(bc_val * x)
     helper.add_BC('u', 'u', 1, kind='dirichlet', x=bc, v=bcs)
@@ -503,26 +589,27 @@ def test_tau_method2D(variant, nz, nx, bc_val, bc=-1, useMPI=False, plotting=Fal
     # solve the system
     sol_hat = helper.u_init_forward
     sol_hat[0] = (helper.sparse_lib.linalg.spsolve(A, rhs_hat.flatten())).reshape(X.shape)
-    sol = helper.itransform(sol_hat, axes=(-2, -1)).real
+    sol = helper.redistribute(helper.itransform(sol_hat), True, 1).real
 
     # construct polynomials for testing
     sol_cheby = helper.transform(sol, axes=(-1,))
-    polys = [np.polynomial.Chebyshev(sol_cheby[0, i, :]) for i in range(shape[0])]
+    polys = [np.polynomial.Chebyshev(sol_cheby[0, i, :]) for i in range(sol_cheby.shape[1])]
 
     Pz = np.polynomial.Chebyshev(np.append(np.zeros(nz - 1), [1]))
-    tau_term, _ = np.meshgrid(Pz(z), np.ones(shape[0]))
+    tau_term, _ = np.meshgrid(Pz(z), np.ones(sol_hat.shape[1]))
     error = ((A @ sol_hat.flatten()).reshape(X.shape) / tau_term).real
 
     if plotting:
         import matplotlib.pyplot as plt
 
-        im = plt.pcolormesh(X, Z, sol[0])
+        _X, _Z = helper.get_grid(forward_output=False)
+        im = plt.pcolormesh(_X, _Z, sol[0])
         plt.colorbar(im)
         plt.xlabel('x')
         plt.ylabel('t')
         plt.show()
 
-    for i in range(shape[0]):
+    for i in range(sol.shape[1]):
 
         assert np.isclose(polys[i](bc), bcs[i]), f'Solution does not satisfy boundary condition x={x[i]}'
 
@@ -534,13 +621,13 @@ def test_tau_method2D(variant, nz, nx, bc_val, bc=-1, useMPI=False, plotting=Fal
 
 
 @pytest.mark.mpi4py
-@pytest.mark.parametrize('variant', ['T2T', 'T2U'])
+@pytest.mark.mpi(ranks=[2])
 @pytest.mark.parametrize('nx', [4, 8])
 @pytest.mark.parametrize('nz', [4, 8])
 @pytest.mark.parametrize('bc_val', [-2])
 @pytest.mark.parametrize('num_procs', [2, 1])
-def test_tau_method2D_MPI(variant, nz, nx, bc_val, num_procs, **kwargs):
-    run_MPI_test(variant=variant, nz=nz, nx=nx, bc_val=bc_val, num_procs=num_procs, test='tau')
+def test_tau_method2D_MPI(mpi_ranks, nz, nx, bc_val, num_procs, **kwargs):
+    test_tau_method2D(nz=nz, nx=nx, bc_val=bc_val, num_procs=num_procs, test='tau', useMPI=True)
 
 
 @pytest.mark.mpi4py
@@ -552,6 +639,151 @@ def test_dealias_MPI(num_procs, axis, bx, bz, nx=32, nz=64, **kwargs):
     run_MPI_test(num_procs=num_procs, axis=axis, nx=nx, nz=nz, bx=bx, bz=bz, test='dealias')
 
 
+@pytest.mark.base
+@pytest.mark.parametrize('nx', [8])
+@pytest.mark.parametrize('ny', [16])
+@pytest.mark.parametrize('nz', [32])
+@pytest.mark.parametrize('p', [1, 2])
+@pytest.mark.parametrize('bz', ['fft', 'cheby', 'ultraspherical'])
+@pytest.mark.parametrize('axes', [(-1,), (-2,), (-3,), (-1, -2), (-2, -3), (-1, -3), (-1, -2, -3)])
+def test_differentiation_matrix3D(nx, ny, nz, bz, axes, p, useMPI=False, **kwargs):
+    import numpy as np
+    from pySDC.helpers.spectral_helper import SpectralHelper
+
+    if useMPI:
+        from mpi4py import MPI
+
+        comm = MPI.COMM_WORLD
+    else:
+        comm = None
+
+    helper = SpectralHelper(comm=comm, debug=True)
+    helper.add_axis(base='fft', N=nx)
+    helper.add_axis(base='fft', N=ny)
+    helper.add_axis(base=bz, N=nz)
+    helper.setup_fft()
+
+    X, Y, Z = helper.get_grid()
+
+    if bz == 'cheby' and p > 1:
+        return None
+    elif bz == 'ultraspherical' and -1 in axes:
+        conv = helper.get_basis_change_matrix(p_out=0, p_in=p)
+    else:
+        conv = helper.get_basis_change_matrix()
+
+    D = helper.get_differentiation_matrix(axes, p=p)
+
+    u = helper.u_init
+
+    z_part = np.sin(3 * Z) if bz == 'fft' else Z**3
+    z_part_z = 3 * np.cos(3 * Z) if bz == 'fft' else 3 * Z**2
+    z_part_zz = -9 * np.sin(3 * Z) if bz == 'fft' else 6 * Z
+
+    u[0, ...] = np.sin(X) * np.cos(2 * Y) * z_part + np.cos(2 * X) + np.sin(Y) + np.sin(4 * Z)
+    if axes == (-3,):
+        if p == 1:
+            expect = np.cos(X) * np.cos(2 * Y) * z_part - 2 * np.sin(2 * X)
+        else:
+            expect = -np.sin(X) * np.cos(2 * Y) * z_part - 4 * np.cos(2 * X)
+    elif axes == (-2,):
+        if p == 1:
+            expect = np.sin(X) * (-2) * np.sin(2 * Y) * z_part + np.cos(Y)
+        else:
+            expect = np.sin(X) * (-4) * np.cos(2 * Y) * z_part - np.sin(Y)
+    elif axes == (-1,):
+        if p == 1:
+            expect = np.sin(X) * np.cos(2 * Y) * z_part_z + 4 * np.cos(4 * Z)
+        elif p == 2:
+            expect = np.sin(X) * np.cos(2 * Y) * z_part_zz - 16 * np.sin(4 * Z)
+    elif sorted(axes) == [-3, -2]:
+        if p == 1:
+            expect = -2 * np.cos(X) * np.sin(2 * Y) * z_part
+        elif p == 2:
+            expect = 4 * np.sin(X) * np.cos(2 * Y) * z_part
+    elif sorted(axes) == [-2, -1]:
+        if p == 1:
+            expect = np.sin(X) * (-2) * np.sin(2 * Y) * z_part_z
+        elif p == 2:
+            expect = np.sin(X) * (-4) * np.cos(2 * Y) * z_part_zz
+    elif sorted(axes) == [-3, -1]:
+        if p == 1:
+            expect = np.cos(X) * np.cos(2 * Y) * z_part_z
+        elif p == 2:
+            expect = -np.sin(X) * np.cos(2 * Y) * z_part_zz
+    elif axes == (-1, -2, -3):
+        if p == 1:
+            expect = np.cos(X) * (-2) * np.sin(2 * Y) * z_part_z
+        elif p == 2:
+            expect = -np.sin(X) * (-4) * np.cos(2 * Y) * z_part_zz
+    else:
+        raise NotImplementedError
+
+    u_hat = helper.transform(u)
+    D_u_hat = (conv @ D @ u_hat.flatten()).reshape(u_hat.shape)
+    D_u = helper.itransform(D_u_hat).real
+
+    error = np.linalg.norm(D_u - expect)
+    assert np.isclose(error, 0, atol=6e-8), f'Got {error=:.2e}'
+
+    if useMPI:
+        if comm.size == 2:
+            assert u_hat.shape[1] < nx or u_hat.shape[2] < ny, 'Not distributed'
+        elif comm.size > 2:
+            assert u_hat.shape[1] < nx and u_hat.shape[2] < ny, 'Not distributed in pencils'
+
+
+@pytest.mark.mpi4py
+@pytest.mark.mpi(ranks=[2, 4])
+@pytest.mark.parametrize('nx', [8])
+@pytest.mark.parametrize('ny', [16])
+@pytest.mark.parametrize('nz', [32])
+@pytest.mark.parametrize('bz', ['fft', 'cheby', 'ultraspherical'])
+@pytest.mark.parametrize('axes', [(-1,), (-2,), (-3,), (-1, -2), (-2, -3), (-1, -3), (-1, -2, -3)])
+def test_differentiation_matrix3DMPI(mpi_ranks, nx, ny, nz, bz, axes, useMPI=True, **kwargs):
+    test_differentiation_matrix3D(nx, ny, nz, bz, axes, p=1, **kwargs)
+
+
+@pytest.mark.base
+@pytest.mark.parametrize('nx', [32])
+@pytest.mark.parametrize('ny', [0, 16])
+@pytest.mark.parametrize('nz', [8])
+@pytest.mark.parametrize('bx', ['cheby', 'fft'])
+def test_identity_matrix_ND(nx, ny, nz, bx, useMPI=False, **kwargs):
+    import numpy as np
+    from pySDC.helpers.spectral_helper import SpectralHelper
+
+    if useMPI:
+        from mpi4py import MPI
+
+        comm = MPI.COMM_WORLD
+    else:
+        comm = None
+
+    helper = SpectralHelper(comm=comm, debug=True)
+    helper.add_axis(base=bx, N=nx)
+    if ny > 0:
+        helper.add_axis(base=bx, N=ny)
+    helper.add_axis(base='cheby', N=nz)
+    helper.setup_fft()
+
+    grid = helper.get_grid()
+    conv = helper.get_basis_change_matrix()
+    I = helper.get_Id()
+
+    u = helper.u_init
+    u[0, ...] = np.sin(grid[-2]) * grid[-1] ** 2 + grid[-1] ** 3 + np.cos(2 * grid[-2])
+
+    if ny > 0:
+        u[0, ...] += np.cos(grid[-3])
+
+    u_hat = helper.transform(u, axes=(-2, -1))
+    I_u_hat = (conv @ I @ u_hat.flatten()).reshape(u_hat.shape)
+    I_u = helper.itransform(I_u_hat, axes=(-1, -2))
+
+    assert np.allclose(I_u, u, atol=1e-12)
+
+
 @pytest.mark.base
 def test_cache_decorator():
     from pySDC.helpers.spectral_helper import cache
@@ -652,7 +884,6 @@ def get_operator(diag):
     parser.add_argument('--bx', type=str, help='Base in x direction')
     parser.add_argument('--bc_val', type=int, help='Value of boundary condition')
     parser.add_argument('--test', type=str, help='type of test', choices=['transform', 'diff', 'int', 'tau', 'dealias'])
-    parser.add_argument('--variant', type=str, help='Chebychov mode', choices=['T2T', 'T2U'], default='T2U')
     parser.add_argument('--useMPI', type=str_to_bool, help='use MPI or not', choices=[True, False], default=True)
     args = parser.parse_args()
 
@@ -667,14 +898,19 @@ def get_operator(diag):
     elif args.test == 'dealias':
         _test_transform_dealias(**vars(args))
     elif args.test is None:
-        # test_transform(8, 3, 'fft', 'cheby', (-1,))
-        # test_differentiation_matrix2D(2**5, 2**5, 'T2U', bx='fft', bz='fft', axes=(-2, -1))
-        # test_matrix1D(4, 'cheby', 'int')
+        # test_differentiation_matrix3D(2, 2, 4, 'cheby', p=1, axes=(-1, -2, -3), useMPI=True)
+        # test_differentiation_matrix3D(2, 2, 4, 'ultraspherical', p=1, axes=(-1, -2, -3), useMPI=True)
+        # test_differentiation_matrix3D(32, 32, 32, 'fft', p=2, axes=(-1, -2), useMPI=True)
+        # test_transform(4, 4, 8, 'fft', 'fft', 'cheby', axes=(-1,), padding=1.5, useMPI=True)
+        # test_dealias_GPU(axis=(-1, -2), bx='fft', bz='cheby', padding=1.5)
+        # test_differentiation_matrix2D(2**5, 2**5, 'T2U', bx='cheby', bz='fft', axes=(-2, -1))
+        # test_matrix1D(4, 'cheby', 'diff')
         # test_tau_method(-1, 8, 99, kind='Dirichlet')
-        # test_tau_method2D('T2U', 2**8, 2**8, -2, plotting=True)
+        # test_tau_method2D('T2U', 2**8, 2**8, -2, plotting=True, useMPI=True)
+        # test_tau_method2D('T2U', 2**1, 2**2, -2, plotting=False, useMPI=True)
         # test_filter(6, 6, (0,))
-        # _test_transform_dealias('fft', 'cheby', (-1, -2))
-        test_block_diagonal_operators()
+        # _test_transform_dealias('fft', 'cheby', -1, nx=2**2, nz=5, padding=1.5)
+        test_tau_method_GPU()
     else:
         raise NotImplementedError
     print('done')
diff --git a/pySDC/tests/test_helpers/test_spectral_helper_1d_chebychev.py b/pySDC/tests/test_helpers/test_spectral_helper_1d_chebychev.py
index 5052e5908c..9bdac95d15 100644
--- a/pySDC/tests/test_helpers/test_spectral_helper_1d_chebychev.py
+++ b/pySDC/tests/test_helpers/test_spectral_helper_1d_chebychev.py
@@ -115,7 +115,7 @@ def test_differentiation_non_standard_domain_size(N, x0, x1, p):
     D = cheby.get_differentiation_matrix(p)
 
     du_hat = D @ u_hat
-    du = cheby.itransform(du_hat)
+    du = cheby.itransform(du_hat, axes=(-1,))
 
     assert np.allclose(du_hat_exact, du_hat), np.linalg.norm(du_hat_exact - du_hat)
     assert np.allclose(du, du_exact), np.linalg.norm(du_exact - du)
@@ -142,27 +142,43 @@ def test_integration_matrix(N):
 @pytest.mark.base
 @pytest.mark.parametrize('N', [4])
 @pytest.mark.parametrize('d', [1, 2, 3])
-@pytest.mark.parametrize('transform_type', ['dct', 'fft'])
-def test_transform(N, d, transform_type):
+def test_transform(N, d):
     import scipy
     import numpy as np
     from pySDC.helpers.spectral_helper import ChebychevHelper
 
-    cheby = ChebychevHelper(N, transform_type=transform_type)
+    cheby = ChebychevHelper(N)
     u = np.random.random((d, N))
     norm = cheby.get_norm()
-    x = (cheby.get_1dgrid() * cheby.lin_trf_fac + cheby.lin_trf_off) * cheby.lin_trf_fac + cheby.lin_trf_off
     x = (cheby.get_1dgrid() - cheby.lin_trf_off) / cheby.lin_trf_fac
 
-    itransform = cheby.itransform(u, axis=-1).real
+    itransform = cheby.itransform(u, axes=(-1,)).real
 
     for i in range(d):
         assert np.allclose(np.polynomial.Chebyshev(u[i])(x), itransform[i])
     assert np.allclose(u.shape, itransform.shape)
-    assert np.allclose(scipy.fft.dct(u, axis=-1) * norm, cheby.transform(u, axis=-1).real)
+    assert np.allclose(scipy.fft.dct(u, axis=-1) * norm, cheby.transform(u, axes=(-1,)).real)
     assert np.allclose(scipy.fft.idct(u / norm, axis=-1), itransform)
-    assert np.allclose(cheby.transform(cheby.itransform(u)), u)
-    assert np.allclose(cheby.itransform(cheby.transform(u)), u)
+    assert np.allclose(cheby.transform(cheby.itransform(u, axes=(-1,)), axes=(-1,)), u)
+    assert np.allclose(cheby.itransform(cheby.transform(u, axes=(-1,)), axes=(-1,)), u)
+
+
+@pytest.mark.cupy
+def test_transform_cupy(N=8):
+    import numpy as np
+    import cupy as cp
+    from pySDC.helpers.spectral_helper import ChebychevHelper
+
+    u = cp.random.random(N).astype('D')
+
+    helper_CPU = ChebychevHelper(N=N, useGPU=False)
+    u_hat_CPU = helper_CPU.transform(u.get())
+
+    helper = ChebychevHelper(N=N, useGPU=True)
+    u_hat = helper.transform(u)
+
+    assert cp.allclose(u, helper.itransform(u_hat))
+    assert np.allclose(u_hat.get(), u_hat_CPU)
 
 
 @pytest.mark.base
@@ -270,7 +286,7 @@ def test_tau_method2D(bc, nz, nx, bc_val, plotting=False):
     bcs = np.sin(bc_val * x)
     rhs = np.zeros_like(X)
     rhs[:, -1] = bcs
-    rhs_hat = fft.transform(rhs, axis=-2)  # the rhs is already in Chebychev spectral space
+    rhs_hat = fft.transform(rhs, axes=(-2,))  # the rhs is already in Chebychev spectral space
 
     # generate matrices
     Dx = fft.get_differentiation_matrix(p=2) * 1e-1 + fft.get_differentiation_matrix()
@@ -289,8 +305,8 @@ def test_tau_method2D(bc, nz, nx, bc_val, plotting=False):
     sol_hat = (sp.linalg.spsolve(A, rhs_hat.flatten())).reshape(rhs.shape)
 
     # transform back to real space
-    _sol = fft.itransform(sol_hat, axis=-2).real
-    sol = cheby.itransform(_sol, axis=-1)
+    _sol = fft.itransform(sol_hat, axes=(-2,)).real
+    sol = cheby.itransform(_sol, axes=(-1,))
 
     # construct polynomials for testing
     polys = [np.polynomial.Chebyshev(_sol[i, :]) for i in range(nx)]
@@ -346,7 +362,7 @@ def test_tau_method2D_diffusion(nz, nx, bc_val, plotting=False):
     rhs = np.zeros((2, nx, nz))  # components u and u_x
     rhs[0, :, -1] = np.sin(bc_val * x) + 1
     rhs[1, :, -1] = 3 * np.exp(-((x - 3.6) ** 2)) + np.cos(x)
-    rhs_hat = fft.transform(rhs, axis=-2)  # the rhs is already in Chebychev spectral space
+    rhs_hat = fft.transform(rhs, axes=(-2,))  # the rhs is already in Chebychev spectral space
 
     # generate 1D matrices
     Dx = fft.get_differentiation_matrix()
@@ -379,8 +395,8 @@ def test_tau_method2D_diffusion(nz, nx, bc_val, plotting=False):
     sol_hat = (sp.linalg.spsolve(A, rhs_hat.flatten())).reshape(rhs.shape)
 
     # transform back to real space
-    _sol = fft.itransform(sol_hat, axis=-2).real
-    sol = cheby.itransform(_sol, axis=-1)
+    _sol = fft.itransform(sol_hat, axes=(-2,)).real
+    sol = cheby.itransform(_sol, axes=(-1,))
 
     polys = [np.polynomial.Chebyshev(_sol[0, i, :]) for i in range(nx)]
 
@@ -401,3 +417,7 @@ def test_tau_method2D_diffusion(nz, nx, bc_val, plotting=False):
         assert np.allclose(
             polys[i](z), sol[0, i, :]
         ), f'Solution is incorrectly transformed back to real space at x={x[i]}'
+
+
+if __name__ == '__main__':
+    test_transform_cupy()
diff --git a/pySDC/tests/test_helpers/test_spectral_helper_1d_fft.py b/pySDC/tests/test_helpers/test_spectral_helper_1d_fft.py
index 15fd93464e..4d456036c0 100644
--- a/pySDC/tests/test_helpers/test_spectral_helper_1d_fft.py
+++ b/pySDC/tests/test_helpers/test_spectral_helper_1d_fft.py
@@ -1,7 +1,7 @@
 import pytest
 
 
-@pytest.mark.base
+@pytest.mark.mpi4py
 @pytest.mark.parametrize('N', [9, 64])
 @pytest.mark.parametrize('x0', [-4, 0, 1])
 @pytest.mark.parametrize('x1', [None, 4, 8])
@@ -15,7 +15,7 @@ def test_differentiation_matrix(N, x0, x1, plot=False):
     x = helper.get_1dgrid()
     D = helper.get_differentiation_matrix()
 
-    u = np.zeros_like(x)
+    u = np.zeros_like(x).astype('D')
     expect = np.zeros_like(u)
 
     num_coef = N // 2
@@ -42,17 +42,39 @@ def test_differentiation_matrix(N, x0, x1, plot=False):
     assert np.allclose(expect, Du)
 
 
-@pytest.mark.base
-def test_transform(N=8):
+@pytest.mark.mpi4py
+@pytest.mark.parametrize('useFFTW', [False, True])
+def test_transform(useFFTW, N=8):
     import numpy as np
     from pySDC.helpers.spectral_helper import FFTHelper
 
-    u = np.random.random(N)
-    helper = FFTHelper(N=N)
+    u = np.random.random(N).astype('D')
+    helper = FFTHelper(N=N, useFFTW=useFFTW)
     u_hat = helper.transform(u)
+
+    assert np.allclose(u_hat, np.fft.fft(u))
     assert np.allclose(u, helper.itransform(u_hat))
 
 
+@pytest.mark.cupy
+@pytest.mark.parametrize('d', [1, 2, 3])
+def test_transform_cupy(d, N=8):
+    import numpy as np
+    import cupy as cp
+    from pySDC.helpers.spectral_helper import FFTHelper
+
+    u = cp.random.random((d, N)).astype('D')
+
+    helper_CPU = FFTHelper(N=N, useGPU=False)
+    u_hat_CPU = helper_CPU.transform(u.get(), axes=(-1,))
+
+    helper = FFTHelper(N=N, useGPU=True)
+    u_hat = helper.transform(u, axes=(-1,))
+
+    assert np.allclose(u_hat.get(), u_hat_CPU)
+    assert cp.allclose(u, helper.itransform(u_hat, axes=(-1,)))
+
+
 @pytest.mark.base
 @pytest.mark.parametrize('N', [8, 64])
 def test_integration_matrix(N, plot=False):
@@ -114,6 +136,9 @@ def test_tau_method(N, v):
 
 
 if __name__ == '__main__':
-    # test_differentiation_matrix(64, 4, True)
+    # test_differentiation_matrix(64, 4, 8, True, True)
     # test_integration_matrix(8, True)
-    test_tau_method(6, 1)
+    # test_tau_method(6, 1)
+    # test_transform(True)
+    # test_transform(False)
+    test_transform_cupy(4)
diff --git a/pySDC/tests/test_helpers/test_spectral_helper_1d_ultraspherical.py b/pySDC/tests/test_helpers/test_spectral_helper_1d_ultraspherical.py
index 22d5fb5001..9a677d8282 100644
--- a/pySDC/tests/test_helpers/test_spectral_helper_1d_ultraspherical.py
+++ b/pySDC/tests/test_helpers/test_spectral_helper_1d_ultraspherical.py
@@ -15,7 +15,7 @@ def test_differentiation_matrix(N, p):
     D = helper.get_differentiation_matrix(p=p)
     Q = helper.get_basis_change_matrix(p_in=p, p_out=0)
 
-    du = helper.itransform(Q @ D @ coeffs)
+    du = helper.itransform(Q @ D @ coeffs, axes=(-1,))
     exact = np.polynomial.Chebyshev(coeffs).deriv(p)(x)
 
     assert np.allclose(exact, du)
@@ -46,7 +46,7 @@ def test_differentiation_non_standard_domain_size(N, x0, x1, p):
     Q = helper.get_basis_change_matrix(p_in=p, p_out=0)
 
     du_hat = Q @ D @ u_hat
-    du = helper.itransform(du_hat)
+    du = helper.itransform(du_hat, axes=(-1,))
 
     assert np.allclose(du_hat_exact, du_hat), np.linalg.norm(du_hat_exact - du_hat)
     assert np.allclose(du, du_exact), np.linalg.norm(du_exact - du)
@@ -119,7 +119,7 @@ def test_poisson_problem(N, deg, Dirichlet_recombination):
 
     u_hat = Pr @ sp.linalg.spsolve(A @ Pr, rhs)
 
-    u = helper.itransform(u_hat)
+    u = helper.itransform(u_hat, axes=(-1,))
 
     u_exact = (a - b) / 2 * x ** (deg + 2) + (b + a) / 2
 
diff --git a/pySDC/tests/test_problems/test_RayleighBenard.py b/pySDC/tests/test_problems/test_RayleighBenard.py
index 1cc1d85a9e..c9cbe693c8 100644
--- a/pySDC/tests/test_problems/test_RayleighBenard.py
+++ b/pySDC/tests/test_problems/test_RayleighBenard.py
@@ -105,7 +105,7 @@ def test_vorticity(nx, nz, direction):
 
 @pytest.mark.mpi4py
 @pytest.mark.parametrize('v', [0, 3.14])
-def test_Nusselt_numbers(v, nx=5, nz=4):
+def test_Nusselt_numbers(v, nx=6, nz=4):
     import numpy as np
     from pySDC.implementations.problem_classes.RayleighBenard import RayleighBenard
 
@@ -114,14 +114,14 @@ def test_Nusselt_numbers(v, nx=5, nz=4):
         'v_bottom': v,
     }
 
-    P = RayleighBenard(nx=nx, nz=nz, BCs=BCs)
+    P = RayleighBenard(nx=nx, nz=nz, BCs=BCs, dealiasing=1.5)
 
     u = P.u_exact(noise_level=0)
 
     nusselt = P.compute_Nusselt_numbers(u)
     expect = {'V': 1 + v, 't': 1, 'b': +1 + 2 * v, 'b_no_v': 1, 't_no_v': 1}
     for key in nusselt.keys():
-        assert np.isclose(nusselt[key], expect[key])
+        assert np.isclose(nusselt[key], expect[key]), key
 
 
 def test_viscous_dissipation(nx=2**5 + 1, nz=2**3 + 1):
@@ -317,8 +317,8 @@ def test_apply_BCs():
     # test_eval_f(2**0, 2**2, 'z', True)
     # test_Poisson_problem(1, 'T')
     # test_Poisson_problem_v()
-    test_apply_BCs()
-    # test_Nusselt_numbers(1)
+    # test_apply_BCs()
+    test_Nusselt_numbers(3.14)
     # test_buoyancy_computation()
     # test_viscous_dissipation()
     # test_CFL()
diff --git a/pySDC/tests/test_problems/test_RayleighBenard3D.py b/pySDC/tests/test_problems/test_RayleighBenard3D.py
new file mode 100644
index 0000000000..4595b57547
--- /dev/null
+++ b/pySDC/tests/test_problems/test_RayleighBenard3D.py
@@ -0,0 +1,209 @@
+import pytest
+
+
+@pytest.mark.mpi4py
+@pytest.mark.parametrize('direction', ['x', 'y', 'z', 'mixed'])
+@pytest.mark.parametrize('nx', [16])
+@pytest.mark.parametrize('nz', [8])
+@pytest.mark.parametrize('spectral_space', [True, False])
+def test_eval_f(nx, nz, direction, spectral_space):
+    import numpy as np
+    from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
+
+    P = RayleighBenard3D(nx=nx, ny=nx, nz=nz, Rayleigh=1, spectral_space=spectral_space)
+    iu, iv, iw, ip, iT = P.index(['u', 'v', 'w', 'p', 'T'])
+    X, Y, Z = P.X, P.Y, P.Z
+    cos, sin = np.cos, np.sin
+
+    kappa = P.kappa
+    nu = P.nu
+
+    k = 2
+    if direction == 'x':
+        y = sin(X * k * np.pi)
+        y_x = cos(X * k * np.pi) * k * np.pi
+        y_xx = -sin(X * k * np.pi) * k * k * np.pi**2
+        y_y = 0
+        y_yy = 0
+        y_z = 0
+        y_zz = 0
+    elif direction == 'y':
+        y = sin(Y * k * np.pi)
+        y_y = cos(Y * k * np.pi) * k * np.pi
+        y_yy = -sin(Y * k * np.pi) * k * k * np.pi**2
+        y_x = 0
+        y_xx = 0
+        y_z = 0
+        y_zz = 0
+    elif direction == 'z':
+        y = Z**2
+        y_x = 0
+        y_xx = 0
+        y_y = 0
+        y_yy = 0
+        y_z = 2 * Z
+        y_zz = 2.0
+    elif direction == 'mixed':
+        y = sin(X * k * np.pi) * sin(Y * k * np.pi) * Z**2
+        y_x = cos(X * k * np.pi) * k * np.pi * sin(k * Y * np.pi) * Z**2
+        y_xx = -sin(X * k * np.pi) * k * k * np.pi**2 * sin(Y * k * np.pi) * Z**2
+        y_y = cos(Y * k * np.pi) * k * np.pi * sin(X * k * np.pi) * Z**2
+        y_yy = -sin(Y * k * np.pi) * k * k * np.pi**2 * sin(X * k * np.pi) * Z**2
+        y_z = sin(X * k * np.pi) * sin(Y * k * np.pi) * 2 * Z
+        y_zz = sin(X * k * np.pi) * sin(Y * k * np.pi) * 2
+    else:
+        raise NotImplementedError
+
+    u = P.u_init_physical
+    assert np.allclose(P.eval_f(P.u_init), 0), 'Non-zero time derivative in static 0 configuration'
+
+    for i in [iu, iv, iw, iT, ip]:
+        u[i][:] = y
+
+    if spectral_space:
+        u = P.transform(u)
+
+    f = P.eval_f(u)
+
+    f_expect = P.f_init
+    for i in [iT, iu, iv, iw]:
+        f_expect.expl[i] = -y * (y_x + y_y + y_z)
+    f_expect.impl[iT] = kappa * (y_xx + y_yy + y_zz)
+    f_expect.impl[iu] = -y_x + nu * (y_xx + y_yy + y_zz)
+    f_expect.impl[iv] = -y_y + nu * (y_xx + y_yy + y_zz)
+    f_expect.impl[iw] = -y_z + nu * (y_xx + y_yy + y_zz) + y
+    f_expect.impl[ip] = -(y_x + y_y + y_z)
+
+    if spectral_space:
+        f.impl = P.itransform(f.impl)
+        f.expl = P.itransform(f.expl)
+
+    for comp in ['u', 'v', 'w', 'T', 'p'][::-1]:
+        i = P.spectral.index(comp)
+        assert np.allclose(f.expl[i], f_expect.expl[i]), f'Unexpected explicit function evaluation in component {comp}'
+        assert np.allclose(f.impl[i], f_expect.impl[i]), f'Unexpected implicit function evaluation in component {comp}'
+
+
+@pytest.mark.mpi4py
+@pytest.mark.parametrize('direction', ['x', 'y', 'z', 'mixed'])
+@pytest.mark.mpi(ranks=[2, 4])
+def test_eval_f_parallel(mpi_ranks, direction):
+    test_eval_f(nx=4, nz=4, direction=direction, spectral_space=False)
+
+
+@pytest.mark.mpi4py
+@pytest.mark.parametrize('nx', [1, 8])
+@pytest.mark.parametrize('component', ['u', 'v', 'T'])
+def test_Poisson_problems(nx, component):
+    """
+    When forgetting about convection and the time-dependent part, you get Poisson problems in u and T that are easy to solve. We check that we get the exact solution in a simple test here.
+    """
+    import numpy as np
+    from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
+
+    BCs = {
+        'u_top': 0,
+        'u_bottom': 0,
+        'v_top': 0,
+        'v_bottom': 0,
+        'w_top': 0,
+        'w_bottom': 0,
+        'T_top': 0,
+        'T_bottom': 0,
+    }
+    P = RayleighBenard3D(
+        nx=nx, ny=nx, nz=6, BCs=BCs, Rayleigh=(max([abs(BCs['T_top'] - BCs['T_bottom']), np.finfo(float).eps]) * 2**4)
+    )
+    rhs = P.u_init
+
+    idx = P.index(f'{component}')
+
+    A = P.put_BCs_in_matrix(-P.L)
+    rhs[idx][0, 0, 2] = 6
+    rhs[idx][0, 0, 0] = 6
+    rhs = P.put_BCs_in_rhs_hat(rhs)
+    u = P.sparse_lib.linalg.spsolve(A, P.M @ rhs.flatten()).reshape(rhs.shape).real
+
+    u_exact = P.u_init_forward.astype('d')
+    if P.comm.rank == 0:
+        u_exact[idx][0, 0, 4] = 1 / 8
+        u_exact[idx][0, 0, 2] = 1 / 2
+        u_exact[idx][0, 0, 0] = -5 / 8
+
+        if component == 'T':
+            ip = P.index('p')
+            u_exact[ip][0, 0, 5] = 1 / (16 * 5) / 2
+            u_exact[ip][0, 0, 3] = 5 / (16 * 5) / 2
+            u_exact[ip][0, 0, 1] = -70 / (16 * 5) / 2
+
+    for comp in ['u', 'v', 'w', 'T', 'p']:
+        i = P.spectral.index(comp)
+        assert np.allclose(u_exact[i], u[i]), f'Unexpected solution in component {comp}'
+
+
+@pytest.mark.mpi4py
+def test_Poisson_problem_w():
+    """
+    Here we don't really solve a Poisson problem. w can only be constant due to the incompressibility, then we have a Possion problem in T with a linear solution and p absorbs all the rest. This is therefore mainly a test for the pressure computation. We don't test that the boundary condition is enforced because the constant pressure offset is entirely irrelevant to anything.
+    """
+    import numpy as np
+    from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
+
+    BCs = {
+        'u_top': 0,
+        'u_bottom': 0,
+        'v_top': 0,
+        'v_bottom': 0,
+        'w_top': 0,
+        'w_bottom': 0,
+        'T_top': 0,
+        'T_bottom': 1,
+    }
+    P = RayleighBenard3D(nx=2, ny=2, nz=2**3, BCs=BCs, Rayleigh=1.0)
+    iw = P.index('w')
+
+    rhs_real = P.u_init
+    rhs_real[iw] = 32 * 6 * P.Z**5
+
+    rhs = P.transform(rhs_real)
+    rhs = (P.M @ rhs.flatten()).reshape(rhs.shape)
+    rhs_real = P.itransform(rhs)
+
+    rhs_real = P.put_BCs_in_rhs(rhs_real)
+    rhs = P.transform(rhs_real)
+
+    A = P.put_BCs_in_matrix(-P.L)
+    u = P.sparse_lib.linalg.spsolve(A, rhs.flatten()).reshape(rhs.shape).real
+
+    u_exact_real = P.u_init
+    iT = P.index('T')
+    u_exact_real[iT] = 1 - P.Z
+
+    ip = P.index('p')
+    u_exact_real[ip] = P.Z - 1 / 2 * P.Z**2 - 32 * P.Z**6
+
+    u_exact = P.transform(u_exact_real)
+    u_exact[ip, 0, 0] = u[ip, 0, 0]  # nobody cares about the constant offset
+
+    for comp in ['u', 'v', 'w', 'T', 'p']:
+        i = P.spectral.index(comp)
+        assert np.allclose(u_exact[i], u[i]), f'Unexpected solution in component {comp}'
+
+
+@pytest.mark.mpi4py
+def test_libraries():
+    from pySDC.implementations.problem_classes.RayleighBenard3D import RayleighBenard3D
+    from mpi4py_fft import fftw
+    from scipy import fft
+
+    P = RayleighBenard3D(nx=2, ny=2, nz=2)
+    assert P.axes[0].fft_lib == fftw
+    assert P.axes[1].fft_lib == fftw
+    assert P.axes[2].fft_lib == fft
+
+
+if __name__ == '__main__':
+    # test_eval_f(2**2, 2**1, 'x', False)
+    # test_libraries()
+    # test_Poisson_problems(4, 'u')
+    test_Poisson_problem_w()