From 675e173eedde3a6931574990a4b966a829983605 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 14 Oct 2024 12:18:54 +0200
Subject: [PATCH 001/123] Add new Kokkos backend to the backend_types
 enumeration.

---
 include/plssvm/backend_types.hpp             | 14 +++++++++++++-
 include/plssvm/detail/cmd/parser_predict.hpp |  2 +-
 include/plssvm/detail/cmd/parser_train.hpp   |  2 +-
 src/plssvm/backend_types.cpp                 | 15 +++++++++++----
 4 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/include/plssvm/backend_types.hpp b/include/plssvm/backend_types.hpp
index 779062ab5..76b23914b 100644
--- a/include/plssvm/backend_types.hpp
+++ b/include/plssvm/backend_types.hpp
@@ -44,7 +44,9 @@ enum class backend_type {
     /** [OpenCL](https://www.khronos.org/opencl/) to target CPUs and GPUs from different vendors. */
     opencl,
     /** [SYCL](https://www.khronos.org/sycl/) to target CPUs and GPUs from different vendors. Currently tested SYCL implementations are [DPC++](https://github.com/intel/llvm) and [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp) (formerly known as hipSYCL). */
-    sycl
+    sycl,
+    /** [Kokkos](https://github.com/kokkos/kokkos) to target CPUs and GPUs from different vendors. */
+    kokkos
 };
 
 /**
@@ -89,6 +91,7 @@ namespace hip { class csvm; }
 namespace opencl { class csvm; }
 namespace adaptivecpp { class csvm; }
 namespace dpcpp { class csvm; }
+namespace kokkos { class csvm; }
 
 // clang-format on
 
@@ -169,6 +172,15 @@ struct csvm_to_backend_type<dpcpp::csvm> {
     constexpr static sycl::implementation_type impl = sycl::implementation_type::dpcpp;
 };
 
+/**
+ * @brief Sets the `value` to `plssvm::backend_type::kokkos` for the Kokkos C-SVM.
+ */
+template <>
+struct csvm_to_backend_type<kokkos::csvm> {
+    /// The enum value representing the Kokkos backend.
+    constexpr static backend_type value = backend_type::kokkos;
+};
+
 }  // namespace detail
 
 /// @endcond
diff --git a/include/plssvm/detail/cmd/parser_predict.hpp b/include/plssvm/detail/cmd/parser_predict.hpp
index 23ba69866..6c0a37c01 100644
--- a/include/plssvm/detail/cmd/parser_predict.hpp
+++ b/include/plssvm/detail/cmd/parser_predict.hpp
@@ -37,7 +37,7 @@ struct parser_predict {
      */
     parser_predict(int argc, char **argv);
 
-    /// The used backend: automatic (depending on the specified target_platforms), OpenMP, CUDA, HIP, OpenCL, or SYCL.
+    /// The used backend: automatic (depending on the specified target_platforms), OpenMP, CUDA, HIP, OpenCL, SYCL, or Kokkos.
     backend_type backend{ backend_type::automatic };
     /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD, or Intel.
     target_platform target{ target_platform::automatic };
diff --git a/include/plssvm/detail/cmd/parser_train.hpp b/include/plssvm/detail/cmd/parser_train.hpp
index 6a48c8d91..008466863 100644
--- a/include/plssvm/detail/cmd/parser_train.hpp
+++ b/include/plssvm/detail/cmd/parser_train.hpp
@@ -53,7 +53,7 @@ struct parser_train {
     /// The multi-class classification strategy used.
     classification_type classification{ classification_type::oaa };
 
-    /// The used backend: automatic (depending on the specified target_platforms), OpenMP, CUDA, HIP, OpenCL, or SYCL.
+    /// The used backend: automatic (depending on the specified target_platforms), OpenMP, CUDA, HIP, OpenCL, SYCL, or Kokkos.
     backend_type backend{ backend_type::automatic };
     /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD, or Intel.
     target_platform target{ target_platform::automatic };
diff --git a/src/plssvm/backend_types.cpp b/src/plssvm/backend_types.cpp
index 0d01bb837..8c7f4095c 100644
--- a/src/plssvm/backend_types.cpp
+++ b/src/plssvm/backend_types.cpp
@@ -47,6 +47,9 @@ std::vector<backend_type> list_available_backends() {
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
     available_backends.push_back(backend_type::sycl);
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    available_backends.push_back(backend_type::kokkos);
+#endif
 
     // automatic is ALWAYS available but AT LEAST ONE other backend must be available in addition
     PLSSVM_ASSERT(available_backends.size() > 1, "Besides \"automatic\" at least one other backend must be available!");
@@ -58,10 +61,10 @@ backend_type determine_default_backend(const std::vector<backend_type> &availabl
     // the decision order based on empiric findings
     using decision_order_type = std::pair<target_platform, std::vector<backend_type>>;
     const std::array decision_order = {
-        decision_order_type{ target_platform::gpu_nvidia, { backend_type::cuda, backend_type::hip, backend_type::opencl, backend_type::sycl, backend_type::stdpar } },
-        decision_order_type{ target_platform::gpu_amd, { backend_type::hip, backend_type::opencl, backend_type::sycl, backend_type::stdpar } },
-        decision_order_type{ target_platform::gpu_intel, { backend_type::sycl, backend_type::opencl, backend_type::stdpar } },
-        decision_order_type{ target_platform::cpu, { backend_type::sycl, backend_type::opencl, backend_type::openmp, backend_type::stdpar } }
+        decision_order_type{ target_platform::gpu_nvidia, { backend_type::cuda, backend_type::hip, backend_type::opencl, backend_type::sycl, backend_type::kokkos, backend_type::stdpar } },
+        decision_order_type{ target_platform::gpu_amd, { backend_type::hip, backend_type::opencl, backend_type::sycl, backend_type::kokkos, backend_type::stdpar } },
+        decision_order_type{ target_platform::gpu_intel, { backend_type::sycl, backend_type::opencl, backend_type::kokkos, backend_type::stdpar } },
+        decision_order_type{ target_platform::cpu, { backend_type::sycl, backend_type::kokkos, backend_type::opencl, backend_type::openmp, backend_type::stdpar } }
     };
 
     // return the default backend based on the previously defined decision order
@@ -95,6 +98,8 @@ std::ostream &operator<<(std::ostream &out, const backend_type backend) {
             return out << "opencl";
         case backend_type::sycl:
             return out << "sycl";
+        case backend_type::kokkos:
+            return out << "kokkos";
     }
     return out << "unknown";
 }
@@ -118,6 +123,8 @@ std::istream &operator>>(std::istream &in, backend_type &backend) {
         backend = backend_type::opencl;
     } else if (str == "sycl") {
         backend = backend_type::sycl;
+    } else if (str == "kokkos") {
+        backend = backend_type::kokkos;
     } else {
         in.setstate(std::ios::failbit);
     }

From 4781224b22743dc8363ba01d302c90b11897fd6c Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 14 Oct 2024 12:20:29 +0200
Subject: [PATCH 002/123] Add specialized exception type.

---
 include/plssvm/backends/Kokkos/exceptions.hpp | 38 +++++++++++++++++++
 src/plssvm/backends/Kokkos/exceptions.cpp     | 21 ++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 include/plssvm/backends/Kokkos/exceptions.hpp
 create mode 100644 src/plssvm/backends/Kokkos/exceptions.cpp

diff --git a/include/plssvm/backends/Kokkos/exceptions.hpp b/include/plssvm/backends/Kokkos/exceptions.hpp
new file mode 100644
index 000000000..047b7cad8
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/exceptions.hpp
@@ -0,0 +1,38 @@
+/**
+* @file
+* @author Alexander Van Craen
+* @author Marcel Breyer
+* @copyright 2018-today The PLSSVM project - All Rights Reserved
+* @license This file is part of the PLSSVM project which is released under the MIT license.
+*          See the LICENSE.md file in the project root for full license information.
+*
+* @brief Implements custom exception classes specific to the Kokkos backend.
+*/
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_EXCEPTIONS_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_EXCEPTIONS_HPP_
+#pragma once
+
+#include "plssvm/exceptions/exceptions.hpp"       // plssvm::exception
+#include "plssvm/exceptions/source_location.hpp"  // plssvm::source_location
+
+#include <string>  // std::string
+
+namespace plssvm::kokkos {
+
+/**
+* @brief Exception type thrown if a problem with the Kokkos backend occurs.
+*/
+class backend_exception : public exception {
+ public:
+   /**
+    * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception.
+    * @param[in] msg the exception's `what()` message
+    * @param[in] loc the exception's call side information
+    */
+   explicit backend_exception(const std::string &msg, source_location loc = source_location::current());
+};
+
+}  // namespace plssvm::kokkos
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_EXCEPTIONS_HPP_
diff --git a/src/plssvm/backends/Kokkos/exceptions.cpp b/src/plssvm/backends/Kokkos/exceptions.cpp
new file mode 100644
index 000000000..4186e4008
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/exceptions.cpp
@@ -0,0 +1,21 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/exceptions.hpp"
+
+#include "plssvm/exceptions/exceptions.hpp"       // plssvm::exception
+#include "plssvm/exceptions/source_location.hpp"  // plssvm::source_location
+
+#include <string>  // std::string
+
+namespace plssvm::kokkos {
+
+backend_exception::backend_exception(const std::string &msg, source_location loc) :
+    ::plssvm::exception{ msg, "kokkos::backend_exception", loc } { }
+
+}  // namespace plssvm::kokkos

From 40ec682995aa16d2a32de6498c8d5cc9b099b564 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 14 Oct 2024 12:20:46 +0200
Subject: [PATCH 003/123] Stub implementation of the pinned_memory class.

---
 .../backends/Kokkos/detail/pinned_memory.hpp  | 93 +++++++++++++++++++
 .../backends/Kokkos/detail/pinned_memory.cpp  | 46 +++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 include/plssvm/backends/Kokkos/detail/pinned_memory.hpp
 create mode 100644 src/plssvm/backends/Kokkos/detail/pinned_memory.cpp

diff --git a/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp b/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp
new file mode 100644
index 000000000..dffb0d1c7
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp
@@ -0,0 +1,93 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Small wrapper around RAII enabled TODO.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_PINNED_MEMORY_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_PINNED_MEMORY_HPP_
+#pragma once
+
+#include "plssvm/backends/host_pinned_memory.hpp"  // plssvm::detail::host_pinned_memory
+#include "plssvm/matrix.hpp"                       // plssvm::matrix, plssvm::layout_type
+
+#include <cstddef>  // std::size_t
+#include <vector>   // std::vector
+
+namespace plssvm::kokkos::detail {
+
+/**
+ * @brief A small RAII wrapper class to register/unregister pinned memory.
+ * @tparam T the type of the data array that should be pinned
+ */
+template <typename T>
+class [[nodiscard]] pinned_memory final : public ::plssvm::detail::host_pinned_memory<T> {
+    /// The template base type of the CUDA pinned_memory class.
+    using base_type = ::plssvm::detail::host_pinned_memory<T>;
+
+    using base_type::is_pinned_;
+    using base_type::ptr_;
+
+  public:
+    using typename base_type::value_type;
+
+    /**
+     * @brief Register the memory managed by the matrix @p matr to use pinned memory.
+     * @tparam layout the layout type of the matrix
+     * @param[in] matr the memory to pin
+     */
+    template <layout_type layout>
+    explicit pinned_memory(const matrix<T, layout> &matr) :
+        pinned_memory{ matr.data(), matr.size_padded() } { }
+
+    /**
+     * @brief Register the memory managed by the vector @p vec to use pinned memory.
+     * @param[in] vec the memory to pin
+     */
+    explicit pinned_memory(const std::vector<T> &vec);
+    /**
+     * @brief Register the memory managed by the pointer @p ptr with @p size to use pinned memory.
+     * @param[in] ptr the memory to pin
+     * @param[in] size the number of elements in the memory region to pin (**not** bytes!)
+     */
+    pinned_memory(const T *ptr, std::size_t size);
+    /**
+     * @brief Unregister the memory managed by this object.
+     */
+    ~pinned_memory() override;
+
+    /**
+     * @brief Must provide a memory that should be pinned.
+     */
+    pinned_memory() = delete;
+    /**
+     * @brief Delete the copy-constructor.
+     */
+    pinned_memory(const pinned_memory &) = delete;
+    /**
+     * @brief Delete the move-constructor.
+     */
+    pinned_memory(pinned_memory &&) noexcept = delete;
+    /**
+     * @brief Delete the copy-assignment operator.
+     * @return `*this`
+     */
+    pinned_memory &operator=(const pinned_memory &) = delete;
+    /**
+     * @brief Delete the move-assignment operator.
+     * @return `*this`
+     */
+    pinned_memory &operator=(pinned_memory &&) noexcept = delete;
+};
+
+extern template class pinned_memory<float>;
+extern template class pinned_memory<double>;
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_PINNED_MEMORY_HPP_
diff --git a/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp b/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp
new file mode 100644
index 000000000..919cbdaa1
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp
@@ -0,0 +1,46 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp"
+
+#include "plssvm/backends/host_pinned_memory.hpp"  // plssvm::detail::host_pinned_memory
+#include "plssvm/exceptions/exceptions.hpp"        // plssvm::exception
+
+#include <cstddef>    // std::size_t
+#include <exception>  // std::terminate
+#include <iostream>   // std::cerr, std::endl
+#include <vector>     // std::vector
+
+namespace plssvm::kokkos::detail {
+
+template <typename T>
+pinned_memory<T>::pinned_memory(const std::vector<T> &vec) :
+    pinned_memory{ vec.data(), vec.size() } { }
+
+template <typename T>
+pinned_memory<T>::pinned_memory(const T *ptr, const std::size_t size) :
+    ::plssvm::detail::host_pinned_memory<T>{ ptr } {
+    this->pin_memory(size * sizeof(T));
+}
+
+template <typename T>
+pinned_memory<T>::~pinned_memory() {
+    try {
+        if (is_pinned_ && ptr_ != nullptr) {
+            this->unpin_memory();
+        }
+    } catch (const plssvm::exception &e) {
+        std::cerr << e.what_with_loc() << std::endl;
+        std::terminate();
+    }
+}
+
+template class pinned_memory<float>;
+template class pinned_memory<double>;
+
+}  // namespace plssvm::kokkos::detail

From dd6999086844851a382713af213b9297dc042270 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 14 Oct 2024 12:21:05 +0200
Subject: [PATCH 004/123] Stub implementation of the Kokkos csvm class.

---
 include/plssvm/backends/Kokkos/csvm.hpp       | 198 ++++++++++++++++++
 .../Kokkos/detail/execution_space.hpp         |  42 ++++
 .../plssvm/backends/Kokkos/detail/utility.hpp |  89 ++++++++
 src/plssvm/backends/Kokkos/csvm.cpp           | 179 ++++++++++++++++
 .../Kokkos/detail/execution_space.cpp         |  39 ++++
 src/plssvm/backends/Kokkos/detail/utility.cpp | 138 ++++++++++++
 6 files changed, 685 insertions(+)
 create mode 100644 include/plssvm/backends/Kokkos/csvm.hpp
 create mode 100644 include/plssvm/backends/Kokkos/detail/execution_space.hpp
 create mode 100644 include/plssvm/backends/Kokkos/detail/utility.hpp
 create mode 100644 src/plssvm/backends/Kokkos/csvm.cpp
 create mode 100644 src/plssvm/backends/Kokkos/detail/execution_space.cpp
 create mode 100644 src/plssvm/backends/Kokkos/detail/utility.cpp

diff --git a/include/plssvm/backends/Kokkos/csvm.hpp b/include/plssvm/backends/Kokkos/csvm.hpp
new file mode 100644
index 000000000..524f1bd4b
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/csvm.hpp
@@ -0,0 +1,198 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines a C-SVM using the Kokkos backend.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_CSVM_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_CSVM_HPP_
+#pragma once
+
+#include "plssvm/backends/execution_range.hpp"                // plssvm::detail::{dim_type, execution_range}
+#include "plssvm/backends/gpu_csvm.hpp"                       // plssvm::detail::gpu_csvm
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"       // plssvm::kokkos::detail::device_ptr
+#include "plssvm/backends/Kokkos/detail/execution_space.hpp"  // plssvm::kokkos::detail::execution_space
+#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp"    // plssvm::kokkos::detail::pinned_memory
+#include "plssvm/csvm.hpp"                                    // plssvm::detail::csvm_backend_exists
+#include "plssvm/detail/memory_size.hpp"                      // plssvm::detail::memory_size
+#include "plssvm/detail/type_traits.hpp"                      // PLSSVM_REQUIRES
+#include "plssvm/parameter.hpp"                               // plssvm::parameter, plssvm::detail::parameter
+#include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
+
+#include <cstddef>      // std::size_t
+#include <type_traits>  // std::true_type
+#include <utility>      // std::forward
+#include <vector>       // std::vector
+
+namespace plssvm {
+
+namespace kokkos {
+
+/**
+ * @brief A C-SVM implementation using Kokkos as backend.
+ */
+class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::pinned_memory> {
+  protected:
+    // protected for the test mock class
+    /// The template base type of the Kokkos C-SVM class.
+    using base_type = ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::pinned_memory>;
+
+    using base_type::data_distribution_;
+    using base_type::devices_;
+
+  public:
+    using base_type::device_ptr_type;
+    using typename base_type::pinned_memory_type;
+    using typename base_type::queue_type;
+
+    /**
+     * @brief Construct a new C-SVM using the Kokkos backend with the parameters given through @p params.
+     * @param[in] params struct encapsulating all possible parameters
+     * @throws plssvm::exception all exceptions thrown in the base class constructor
+     * @throws plssvm::kokkos::backend_exception if the requested target is not available
+     * @throws plssvm::kokkos::backend_exception if no device for the requested target was found
+     */
+    explicit csvm(parameter params = {});
+    /**
+     * @brief Construct a new C-SVM using the Kokkos backend on the @p target platform with the parameters given through @p params.
+     * @param[in] target the target platform used for this C-SVM
+     * @param[in] params struct encapsulating all possible SVM parameters
+     * @throws plssvm::exception all exceptions thrown in the base class constructor
+     * @throws plssvm::kokkos::backend_exception if the requested target is not available
+     * @throws plssvm::kokkos::backend_exception if no device for the requested target was found
+     */
+    explicit csvm(target_platform target, parameter params = {});
+
+    /**
+     * @brief Construct a new C-SVM using the Kokkos backend and the optionally provided @p named_args.
+     * @param[in] named_args the additional optional named arguments
+     * @throws plssvm::exception all exceptions thrown in the base class constructor
+     * @throws plssvm::kokkos::backend_exception if the requested target is not available
+     * @throws plssvm::kokkos::backend_exception if no device for the requested target was found
+     */
+    template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_parameter_named_args_v<Args...>)>
+    explicit csvm(Args &&...named_args) :
+        csvm{ plssvm::target_platform::automatic, std::forward<Args>(named_args)... } { }
+
+    /**
+     * @brief Construct a new C-SVM using the Kokkos backend on the @p target platform and the optionally provided @p named_args.
+     * @param[in] target the target platform used for this C-SVM
+     * @param[in] named_args the additional optional named-parameters
+     * @throws plssvm::exception all exceptions thrown in the base class constructor
+     * @throws plssvm::kokkos::backend_exception if the requested target is not available
+     * @throws plssvm::kokkos::backend_exception if no device for the requested target was found
+     */
+    template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_parameter_named_args_v<Args...>)>
+    explicit csvm(const target_platform target, Args &&...named_args) :
+        base_type{ std::forward<Args>(named_args)... } {
+        this->init(target);
+    }
+
+    /**
+     * @copydoc plssvm::csvm::csvm(const plssvm::csvm &)
+     */
+    csvm(const csvm &) = delete;
+    /**
+     * @copydoc plssvm::csvm::csvm(plssvm::csvm &&) noexcept
+     */
+    csvm(csvm &&) noexcept = default;
+    /**
+     * @copydoc plssvm::csvm::operator=(const plssvm::csvm &)
+     */
+    csvm &operator=(const csvm &) = delete;
+    /**
+     * @copydoc plssvm::csvm::operator=(plssvm::csvm &&) noexcept
+     */
+    csvm &operator=(csvm &&) noexcept = default;
+    /**
+     * @brief Wait for all operations on all Kokkos devices to finish.
+     * @details Terminates the program, if any exception is thrown.
+     */
+    ~csvm() override;
+
+  protected:
+    /**
+     * @brief Initialize all important states related to the Kokkos backend.
+     * @param[in] target the target platform to use
+     * @throws plssvm::kokkos::backend_exception if the requested target is not available
+     * @throws plssvm::kokkos::backend_exception if no device for the requested target was found
+     */
+    void init(target_platform target);
+
+    /**
+     * @copydoc plssvm::csvm::get_device_memory
+     */
+    [[nodiscard]] std::vector<::plssvm::detail::memory_size> get_device_memory() const final;
+    /**
+     * @copydoc plssvm::csvm::get_max_mem_alloc_size
+     */
+    [[nodiscard]] std::vector<::plssvm::detail::memory_size> get_max_mem_alloc_size() const final;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::get_max_work_group_size
+     */
+    [[nodiscard]] std::size_t get_max_work_group_size(std::size_t device_id) const final;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::get_max_grid_size
+     */
+    [[nodiscard]] ::plssvm::detail::dim_type get_max_grid_size(std::size_t device_id) const override;
+
+    //***************************************************//
+    //                        fit                        //
+    //***************************************************//
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_explicit
+     */
+    [[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_blas_level_3_kernel_explicit
+     */
+    void run_blas_level_3_kernel_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, real_type beta, device_ptr_type &C_d) const final;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_implicit_blas_level_3
+     */
+    void run_assemble_kernel_matrix_implicit_blas_level_3(std::size_t device_id, const ::plssvm::detail::execution_range &exec, real_type alpha, const device_ptr_type &A_d, const parameter &params, const device_ptr_type &q_red_d, real_type QA_cost, const device_ptr_type &B_d, device_ptr_type &C_d) const final;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_inplace_matrix_addition
+     */
+    void run_inplace_matrix_addition(std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const device_ptr_type &rhs_d) const override;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_inplace_matrix_scale
+     */
+    void run_inplace_matrix_scale(std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, real_type scale) const override;
+
+    //***************************************************//
+    //                   predict, score                  //
+    //***************************************************//
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_w_kernel
+     */
+    [[nodiscard]] device_ptr_type run_w_kernel(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const device_ptr_type &alpha_d, const device_ptr_type &sv_d) const final;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_predict_kernel
+     */
+    [[nodiscard]] device_ptr_type run_predict_kernel(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &alpha_d, const device_ptr_type &rho_d, const device_ptr_type &sv_or_w_d, const device_ptr_type &predict_points_d) const final;
+
+    /// The used Kokkos execution space.
+    detail::execution_space space_;
+};
+
+}  // namespace kokkos
+
+namespace detail {
+
+/**
+ * @brief Sets the `value` to `true` since C-SVMs using the Kokkos backend are available.
+ */
+template <>
+struct csvm_backend_exists<kokkos::csvm> : std::true_type { };
+
+}  // namespace detail
+
+}  // namespace plssvm
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_CSVM_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/execution_space.hpp b/include/plssvm/backends/Kokkos/detail/execution_space.hpp
new file mode 100644
index 000000000..8e89975c3
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/execution_space.hpp
@@ -0,0 +1,42 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Execution space enumeration for the ExecutionSpaces in Kokkos.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_EXECUTION_SPACE_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_EXECUTION_SPACE_HPP_
+#pragma once
+
+#include "fmt/base.h"     // fmt::formatter
+#include "fmt/ostream.h"  // fmt::ostream_formatter
+
+#include <iosfwd>  // std::ostream forward declaration
+
+namespace plssvm::kokkos::detail {
+
+enum class execution_space {
+    cuda,
+    hip,
+    sycl,
+    hpx,
+    openmp,
+    openmp_target,
+    openacc,
+    threads,
+    serial
+};
+
+std::ostream &operator<<(std::ostream &out, execution_space space);
+
+}  // namespace plssvm::kokkos::detail
+
+template <>
+struct fmt::formatter<plssvm::kokkos::detail::execution_space> : fmt::ostream_formatter { };
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_EXECUTION_SPACE_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/utility.hpp b/include/plssvm/backends/Kokkos/detail/utility.hpp
new file mode 100644
index 000000000..3b7a9c706
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/utility.hpp
@@ -0,0 +1,89 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Utility functions for the Kokkos backend.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_
+#pragma once
+
+#include "plssvm/backends/Kokkos/detail/execution_space.hpp"  // plssvm::kokkos::detail::execution_space
+#include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
+
+#include "Kokkos_Core.hpp"  // TODO: ?
+
+#include <cstddef>      // std::size_t
+#include <string>       // std::string
+#include <type_traits>  // std::is_same_v
+
+namespace plssvm::kokkos::detail {
+
+template <typename ExecSpace>
+[[nodiscard]] execution_space determine_execution_space() noexcept {
+    // determine the execution_space enumeration value based on the provided Kokkos execution space
+#if defined(KOKKOS_ENABLE_CUDA)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Cuda>) {
+        return execution_space::cuda;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::HIP>) {
+        return execution_space::hip;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::SYCL>) {
+        return execution_space::sycl;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::HPX>) {
+        return execution_space::hpx;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMP>) {
+        return execution_space::openmp;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMPTarget>) {
+        return execution_space::openmp_target;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::OpenACC>) {
+        return execution_space::openacc;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Threads>) {
+        return execution_space::threads;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Serial>) {
+        return execution_space::serial;
+    }
+#endif
+}
+
+[[nodiscard]] target_platform determine_default_target_platform_from_execution_space(execution_space space);
+
+void check_execution_space_target_platform_combination(execution_space space, target_platform target);
+
+[[nodiscard]] std::string get_device_name(execution_space space, std::size_t device_id);
+
+void device_synchronize_all();
+
+[[nodiscard]] std::string get_kokkos_version();
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
new file mode 100644
index 000000000..3c3afb022
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -0,0 +1,179 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/csvm.hpp"
+
+#include "plssvm/backends/Kokkos/detail/execution_space.hpp"  // plssvm::kokkos::detail::execution_space
+#include "plssvm/backends/Kokkos/detail/utility.hpp"          // plssvm::kokkos::detail::get_runtime_version
+#include "plssvm/backends/Kokkos/exceptions.hpp"              // plssvm::kokkos::backend_exception
+#include "plssvm/detail/logging.hpp"                          // plssvm::detail::log
+#include "plssvm/detail/tracking/performance_tracker.hpp"     // plssvm::detail::tracking::tracking_entry
+#include "plssvm/exceptions/exceptions.hpp"                   // plssvm::exception
+#include "plssvm/parameter.hpp"                               // plssvm::parameter
+#include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
+#include "plssvm/verbosity_levels.hpp"                        // plssvm::verbosity_level
+
+#include "Kokkos_Core.hpp"  // TODO:
+
+#include "fmt/format.h"  // fmt::format
+
+#include <exception>  // std::terminate
+#include <iostream>   // std::cout, std::endl
+#include <numeric>    // std::iota
+#include <vector>     // std::vector
+
+namespace plssvm::kokkos {
+
+csvm::csvm(parameter params) :
+    csvm{ plssvm::target_platform::automatic, params } { }
+
+csvm::csvm(target_platform target, parameter params) :
+    base_type{ params } {
+    this->init(target);
+}
+
+void csvm::init(const target_platform target) {
+    // check whether the requested target platform has been enabled
+    switch (target) {
+        case target_platform::automatic:
+            break;
+        case target_platform::cpu:
+#if !defined(PLSSVM_HAS_CPU_TARGET)
+            throw backend_exception{ fmt::format("Requested target platform '{}' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!", target) };
+#endif
+            break;
+        case target_platform::gpu_nvidia:
+#if !defined(PLSSVM_HAS_NVIDIA_TARGET)
+            throw backend_exception{ fmt::format("Requested target platform '{}' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!", target) };
+#endif
+            break;
+        case target_platform::gpu_amd:
+#if !defined(PLSSVM_HAS_AMD_TARGET)
+            throw backend_exception{ fmt::format("Requested target platform '{}' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!", target) };
+#endif
+            break;
+        case target_platform::gpu_intel:
+#if !defined(PLSSVM_HAS_INTEL_TARGET)
+            throw backend_exception{ fmt::format("Requested target platform '{}' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!", target) };
+#endif
+            break;
+    }
+
+    // TODO: document: we ALWAYS use the default execution space
+
+    // set the execution space -> we always only use the Kokkos::DefaultExecutionSpace
+    space_ = detail::determine_execution_space<Kokkos::DefaultExecutionSpace>();
+
+    plssvm::detail::log(verbosity_level::full,
+                        "\nUsing Kokkos ({}) as backend with the Kokkos::DefaultExecutionSpace \"{}\".\n",
+                        plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_version", detail::get_kokkos_version() },
+                        plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_default_execution_space", space_ });
+
+    // check whether the provided target platform is compatible with the Kokkos execution space
+    if (target == target_platform::automatic) {
+        // determine the default target based on the provided Kokkos execution space
+        target_ = detail::determine_default_target_platform_from_execution_space(space_);
+        plssvm::detail::log(verbosity_level::full,
+                            "Using {} as automatic target platform.\n",
+                            target_);
+    } else {
+        // check whether the provided target platform is compatible with the execution space
+        // throws a backend exception if the combination is invalid
+        detail::check_execution_space_target_platform_combination(space_, target);
+        target_ = target;
+    }
+
+    // get all available devices wrt the requested target platform
+    devices_.resize(static_cast<std::vector<queue_type>::size_type>(Kokkos::num_devices()));
+    std::iota(devices_.begin(), devices_.end(), 0);
+
+    // throw exception if no CUDA devices could be found
+    if (devices_.empty()) {
+        throw backend_exception{ fmt::format("Not devices found for the Kokkos execution space {} with the target platform {}!", space_, target_) };
+    }
+
+    // print found Kokkos devices
+    plssvm::detail::log(verbosity_level::full,
+                        "Found {} Kokkos device(s) for the target platform {}:\n",
+                        plssvm::detail::tracking::tracking_entry{ "backend", "num_devices", devices_.size() },
+                        plssvm::detail::tracking::tracking_entry{ "backend", "target_platform", target_ });
+
+    std::vector<std::string> device_names;
+    device_names.reserve(devices_.size());
+    for (typename std::vector<queue_type>::size_type device = 0; device < devices_.size(); ++device) {
+        const std::string device_name = detail::get_device_name(space_, device);
+        plssvm::detail::log(verbosity_level::full,
+                            "  [{}, {}]\n",
+                            device,
+                            device_name);
+        device_names.emplace_back(device_name);
+    }
+    PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "backend", "device", device_names }));
+    plssvm::detail::log(verbosity_level::full | verbosity_level::timing,
+                        "\n");
+}
+
+csvm::~csvm() {
+    try {
+        // be sure that all operations on the Kokkos execution spaces have finished before destruction
+        detail::device_synchronize_all();
+    } catch (const plssvm::exception &e) {
+        std::cout << e.what_with_loc() << std::endl;
+        std::terminate();
+    }
+}
+
+std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
+    return {};
+}
+
+std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const {
+    return {};
+}
+
+std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
+    return {};
+}
+
+::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::size_t device_id) const {
+    return {};
+}
+
+//***************************************************//
+//                        fit                        //
+//***************************************************//
+
+auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type {
+    return {};
+}
+
+void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, const real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, const real_type beta, device_ptr_type &C_d) const {
+}
+
+void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const device_ptr_type &rhs_d) const {
+}
+
+void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const real_type scale) const {
+}
+
+void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const real_type alpha, const device_ptr_type &A_d, const parameter &params, const device_ptr_type &q_red, const real_type QA_cost, const device_ptr_type &B_d, device_ptr_type &C_d) const {
+}
+
+//***************************************************//
+//                   predict, score                  //
+//***************************************************//
+
+auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const device_ptr_type &alpha_d, const device_ptr_type &sv_d) const -> device_ptr_type {
+    return {};
+}
+
+auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &alpha_d, const device_ptr_type &rho_d, const device_ptr_type &sv_or_w_d, const device_ptr_type &predict_points_d) const -> device_ptr_type {
+    return {};
+}
+
+}  // namespace plssvm::kokkos
diff --git a/src/plssvm/backends/Kokkos/detail/execution_space.cpp b/src/plssvm/backends/Kokkos/detail/execution_space.cpp
new file mode 100644
index 000000000..65afa72b1
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/detail/execution_space.cpp
@@ -0,0 +1,39 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/detail/execution_space.hpp"
+
+#include <ostream>  // std::ostream
+
+namespace plssvm::kokkos::detail {
+
+std::ostream &operator<<(std::ostream &out, const execution_space space) {
+    switch (space) {
+        case execution_space::cuda:
+            return out << "Cuda";
+        case execution_space::hip:
+            return out << "HIP";
+        case execution_space::sycl:
+            return out << "SYCL";
+        case execution_space::hpx:
+            return out << "HPX";
+        case execution_space::openmp:
+            return out << "OpenMP";
+        case execution_space::openmp_target:
+            return out << "OpenMPTarget";
+        case execution_space::openacc:
+            return out << "OpenACC";
+        case execution_space::threads:
+            return out << "Threads";
+        case execution_space::serial:
+            return out << "Serial";
+    }
+    return out << "unknown";
+}
+
+}  // namespace plssvm::kokkos::detail
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
new file mode 100644
index 000000000..9458bb899
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -0,0 +1,138 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/detail/utility.hpp"
+
+#include "plssvm/backends/Kokkos/detail/execution_space.hpp"  // plssvm::kokkos::detail::execution_space
+#include "plssvm/backends/Kokkos/exceptions.hpp"              // plssvm::kokkos::backend_exception
+#include "plssvm/detail/assert.hpp"                           // PLSSVM_ASSERT
+#include "plssvm/detail/utility.hpp"                          // plssvm::detail::unreachable
+#include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
+
+#include "Kokkos_Macros.hpp"
+
+#if defined(KOKKOS_ENABLE_CUDA)
+    #include "cuda_runtime.h"  // cudaDeviceProp, cudaGetDeviceProperties
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    #include "hip/hip_runtime_api.h"  // HIP runtime functions
+#endif
+
+#include "fmt/core.h"  // fmt::format
+
+#include <cstddef>  // std::size_t
+#include <string>   // std::string
+
+namespace plssvm::kokkos::detail {
+
+target_platform determine_default_target_platform_from_execution_space(const execution_space space) {
+    switch (space) {
+        case execution_space::cuda:
+            return target_platform::gpu_nvidia;
+        case execution_space::hip:
+            return target_platform::gpu_amd;
+        case execution_space::sycl:
+        case execution_space::openmp_target:
+        case execution_space::openacc:
+            return target_platform::gpu_nvidia;  // TODO: what to return here?
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
+            return target_platform::cpu;
+    }
+    ::plssvm::detail::unreachable();
+}
+
+void check_execution_space_target_platform_combination(const execution_space space, const target_platform target) {
+    PLSSVM_ASSERT(target != target_platform::automatic, "The provided target platform may not be the automatic target platform!");
+
+    switch (space) {
+        case execution_space::cuda:
+            if (target != target_platform::gpu_nvidia) {
+                throw backend_exception{ fmt::format("The target platform {} is not supported for Kokkos {} execution space!", target, space) };
+            }
+            break;
+        case execution_space::hip:
+            if (target != target_platform::gpu_amd) {
+                throw backend_exception{ fmt::format("The target platform {} is not supported for Kokkos {} execution space!", target, space) };
+            }
+            break;
+        case execution_space::sycl:
+            // SYCL may support all target platforms!
+            // TODO: use SYCL specific functions to check?
+        case execution_space::openmp_target:
+            // OpenMP Target Offloading may support all target platforms!
+            // TODO: use OpenMP Target Offloading specific functions to check?
+        case execution_space::openacc:
+            // OpenACC may support all target platforms!
+            // TODO: use OpenACC Target Offloading specific functions to check?
+            break;
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
+            if (target != target_platform::cpu) {
+                throw backend_exception{ fmt::format("The target platform {} is not supported for Kokkos {} execution space!", target, space) };
+            }
+            break;
+    }
+}
+
+// TODO: error checks?
+
+std::string get_device_name(const execution_space space, const std::size_t device_id) {
+    // TODO: implement for other backends!
+    switch (space) {
+        case execution_space::cuda:
+#if defined(KOKKOS_ENABLE_CUDA)
+            {
+                cudaDeviceProp prop{};
+                cudaGetDeviceProperties(&prop, static_cast<int>(device_id));
+                return std::string{ prop.name };
+            }
+#else
+            throw backend_exception{ fmt::format("Unsupported Kokkos execution space \"{}\"!", space) };
+#endif
+        case execution_space::hip:
+#if defined(KOKKOS_ENABLE_HIP)
+            {
+                hipDeviceProp_t prop{};
+                hipGetDeviceProperties(&prop, static_cast<int>(device_id));
+                return std::string{ prop.name };
+            }
+#else
+            throw backend_exception{ fmt::format("Unsupported Kokkos execution space \"{}\"!", space) };
+#endif
+        case execution_space::openmp:
+#if defined(KOKKOS_ENABLE_HIP)
+            return "CPU host device";
+#else
+            throw backend_exception{ fmt::format("Unsupported Kokkos execution space \"{}\"!", space) };
+#endif
+        case execution_space::sycl:
+        case execution_space::hpx:
+        case execution_space::openmp_target:
+        case execution_space::openacc:
+        case execution_space::threads:
+        case execution_space::serial:
+            throw backend_exception{ fmt::format("Unsupported Kokkos execution space \"{}\"!", space) };
+    }
+    return "unknown";
+}
+
+void device_synchronize_all() {
+    Kokkos::DefaultExecutionSpace::impl_static_fence("synchronize all");
+}
+
+std::string get_kokkos_version() {
+    // get the Kokkos version
+    return fmt::format("{}.{}.{}", KOKKOS_VERSION_MAJOR, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH);
+}
+
+}  // namespace plssvm::kokkos::detail

From b1dcb5a6dcdedddfbe366757942378a62fca1601 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 14 Oct 2024 12:21:28 +0200
Subject: [PATCH 005/123] Add Kokkos to the build.

---
 .clang-format                             |  2 +-
 CMakeLists.txt                            | 11 ++++
 include/plssvm/core.hpp                   |  6 ++
 include/plssvm/csvm_factory.hpp           |  5 ++
 src/main_predict.cpp                      | 22 ++++++++
 src/main_train.cpp                        | 23 ++++++++
 src/plssvm/backends/Kokkos/CMakeLists.txt | 67 +++++++++++++++++++++++
 7 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 src/plssvm/backends/Kokkos/CMakeLists.txt

diff --git a/.clang-format b/.clang-format
index 30a5ef1db..533b9bcab 100644
--- a/.clang-format
+++ b/.clang-format
@@ -79,7 +79,7 @@ IncludeBlocks: Regroup
 IncludeCategories:
   - Regex: '^"plssvm/'
     Priority: 1
-  - Regex: '^"(cuda|hip|CL|sycl|omp)'
+  - Regex: '^"(cuda|hip|CL|sycl|omp|Kokkos)'
     Priority: 2
   - Regex: '^"(tests|bindings)/'
     Priority: 3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f0c01a9ec..e293f8e3e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -380,6 +380,13 @@ if (PLSSVM_ENABLE_SYCL_BACKEND MATCHES "AUTO" OR PLSSVM_ENABLE_SYCL_BACKEND)
     add_subdirectory(src/plssvm/backends/SYCL)
 endif ()
 
+## check for Kokkos backend
+set(PLSSVM_ENABLE_KOKKOS_BACKEND AUTO CACHE STRING "Enable SYCL Backend")
+set_property(CACHE PLSSVM_ENABLE_KOKKOS_BACKEND PROPERTY STRINGS AUTO ON OFF)
+if (PLSSVM_ENABLE_KOKKOS_BACKEND MATCHES "AUTO" OR PLSSVM_ENABLE_KOKKOS_BACKEND)
+    add_subdirectory(src/plssvm/backends/Kokkos)
+endif ()
+
 ## check if ANY backend is available/has been enabled
 get_target_property(PLSSVM_LINKED_BACKENDS ${PLSSVM_ALL_LIBRARY_NAME} INTERFACE_LINK_LIBRARIES)
 if (NOT PLSSVM_LINKED_BACKENDS)
@@ -690,6 +697,10 @@ if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME})
     endforeach ()
     list(APPEND PLSSVM_BACKEND_NAME_LIST "sycl")
 endif ()
+if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME})
+    message(STATUS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING}")
+    list(APPEND PLSSVM_BACKEND_NAME_LIST "kokkos")
+endif ()
 message(STATUS "")
 
 ########################################################################################################################
diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp
index a569d6915..1edf825c1 100644
--- a/include/plssvm/core.hpp
+++ b/include/plssvm/core.hpp
@@ -107,4 +107,10 @@ using namespace plssvm::PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION;
 /// Namespace containing the C-SVM using the SYCL backend with the preferred SYCL implementation. **Should not** directly be used by users.
 namespace plssvm::sycl::detail { }
 
+/// Namespace containing the C-SVM using the Kokkos backend.
+namespace plssvm::kokkos { }
+
+/// Namespace containing Kokkos backend specific implementation details. **Should not** directly be used by users.
+namespace plssvm::kokkos::detail { }
+
 #endif  // PLSSVM_CORE_HPP_
diff --git a/include/plssvm/csvm_factory.hpp b/include/plssvm/csvm_factory.hpp
index 0b923caa0..190ff8984 100644
--- a/include/plssvm/csvm_factory.hpp
+++ b/include/plssvm/csvm_factory.hpp
@@ -45,6 +45,9 @@
         #include "plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp"  // plssvm::adaptivecpp::csvm, plssvm::csvm_backend_exists_v
     #endif
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    #include "plssvm/backends/Kokkos/csvm.hpp"  // plssvm::kokkos::csvm, plssvm::csvm_backend_exists_v
+#endif
 
 #include "fmt/format.h"   // fmt::format
 #include "igor/igor.hpp"  // igor::parser, igor::has_unnamed_arguments
@@ -138,6 +141,8 @@ template <typename... Args>
             return make_csvm_default_impl<opencl::csvm>(std::forward<Args>(args)...);
         case backend_type::sycl:
             return make_csvm_sycl_impl(std::forward<Args>(args)...);
+        case backend_type::kokkos:
+            return make_csvm_default_impl<kokkos::csvm>(std::forward<Args>(args)...);
     }
     throw unsupported_backend_exception{ "Unrecognized backend provided!" };
 }
diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index 31585a758..b4d47cb05 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -17,6 +17,10 @@
                                                            // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SET_REFERENCE_TIME
 #include "plssvm/detail/utility.hpp"                       // PLSSVM_IS_DEFINED
 
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    #include "Kokkos_Core.hpp"  // Kokkos::initialize, Kokkos::is_initialized, Kokkos::finalize, Kokkos::is_finalized
+#endif
+
 #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED)
     #include "plssvm/detail/tracking/cpu/hardware_sampler.hpp"      // plssvm::detail::tracking::cpu_hardware_sampler
     #include "plssvm/detail/tracking/hardware_sampler.hpp"          // plssvm::detail::tracking::hardware_sampler
@@ -72,6 +76,16 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
+            // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
+            const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
+
+            // initialize Kokkos if necessary
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+            if (use_kokkos_as_backend) {
+                Kokkos::initialize(argc, argv);  // TODO: set device?
+                PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
+            }
+#endif
 
             // create default csvm
             const std::unique_ptr<plssvm::csvm> svm = use_sycl_as_backend ? plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type)
@@ -164,6 +178,14 @@ int main(int argc, char *argv[]) {
                 PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_HARDWARE_SAMPLER_ENTRY(*s);
             });
 #endif
+
+            // finalize Kokkos if necessary
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+            if (use_kokkos_as_backend) {  // TODO: what if an exception occurred?
+                Kokkos::finalize();
+                PLSSVM_ASSERT(Kokkos::is_finalized(), "Something went wrong finalizing the Kokkos environment!");
+            }
+#endif
         };
         std::visit(data_set_visitor, plssvm::detail::cmd::data_set_factory(cmd_parser));
 
diff --git a/src/main_train.cpp b/src/main_train.cpp
index 91958bf19..ff4365638 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -14,8 +14,13 @@
 #include "plssvm/detail/logging.hpp"                       // plssvm::detail::log
 #include "plssvm/detail/tracking/performance_tracker.hpp"  // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE,
                                                            // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_HARDWARE_SAMPLER_ENTRY, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SET_REFERENCE_TIME
+#include "plssvm/detail/assert.hpp"                        // PLSSVM_ASSERT
 #include "plssvm/detail/utility.hpp"                       // PLSSVM_IS_DEFINED
 
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    #include "Kokkos_Core.hpp"  // Kokkos::initialize, Kokkos::is_initialized, Kokkos::finalize, Kokkos::is_finalized
+#endif
+
 #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED)
     #include "plssvm/detail/tracking/cpu/hardware_sampler.hpp"      // plssvm::detail::tracking::cpu_hardware_sampler
     #include "plssvm/detail/tracking/hardware_sampler.hpp"          // plssvm::detail::tracking::hardware_sampler
@@ -69,6 +74,16 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
+            // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
+            const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
+
+            // initialize Kokkos if necessary
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+            if (use_kokkos_as_backend) {
+                Kokkos::initialize(argc, argv);  // TODO: set device?
+                PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
+            }
+#endif
 
             // create SVM
             const std::unique_ptr<plssvm::csvm> svm = use_sycl_as_backend ? plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type)
@@ -105,6 +120,14 @@ int main(int argc, char *argv[]) {
                 PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_HARDWARE_SAMPLER_ENTRY(*s);
             });
 #endif
+
+            // finalize Kokkos if necessary
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+            if (use_kokkos_as_backend) {  // TODO: what if an exception occurred
+                Kokkos::finalize();
+                PLSSVM_ASSERT(Kokkos::is_finalized(), "Something went wrong finalizing the Kokkos environment!");
+            }
+#endif
         };
         std::visit(data_set_visitor, plssvm::detail::cmd::data_set_factory(cmd_parser));
 
diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
new file mode 100644
index 000000000..d7d1037ce
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -0,0 +1,67 @@
+## Authors: Alexander Van Craen, Marcel Breyer
+## Copyright (C): 2018-today The PLSSVM project - All Rights Reserved
+## License: This file is part of the PLSSVM project which is released under the MIT license.
+##          See the LICENSE.md file in the project root for full license information.
+########################################################################################################################
+
+list(APPEND CMAKE_MESSAGE_INDENT "Kokkos:  ")
+
+# check if Kokkos can be enabled
+message(CHECK_START "Checking for Kokkos backend")
+
+find_package(Kokkos)
+
+if (NOT Kokkos_FOUND)
+    message(CHECK_FAIL "not found")
+    if (PLSSVM_ENABLE_KOKKOS_BACKEND MATCHES "ON")
+        message(SEND_ERROR "Cannot find requested backend: Kokkos!")
+    endif ()
+    return()
+endif ()
+message(CHECK_PASS "found")
+
+# explicitly set sources
+set(PLSSVM_KOKKOS_SOURCES
+    ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/execution_space.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp
+)
+
+# set target properties
+set_local_and_parent(PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME plssvm-Kokkos)
+add_library(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} SHARED ${PLSSVM_KOKKOS_SOURCES})
+target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC Kokkos::kokkos)
+
+# link base library against Kokkos library
+target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC ${PLSSVM_BASE_LIBRARY_NAME})
+
+# set whether the kernel source should be compiled with fast math enabled or not # TODO: enable fast-math
+#if (PLSSVM_ENABLE_FAST_MATH)
+#    target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_ENABLE_FAST_MATH)
+#endif ()
+
+# set compile definition that the Kokkos backend is available
+target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_HAS_KOKKOS_BACKEND)
+target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_HAS_KOKKOS_BACKEND)
+
+# link against interface library
+target_link_libraries(${PLSSVM_ALL_LIBRARY_NAME} INTERFACE ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME})
+
+# mark backend library as install target
+append_local_and_parent(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME})
+
+# generate summary string
+set(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_COMPILER " - Kokkos:")
+include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake)
+assemble_summary_string(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS)
+# do not print any special target architecture information
+string(REPLACE " (${PLSSVM_CPU_TARGET_ARCHS})" "" PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}")
+string(REPLACE " (${PLSSVM_NVIDIA_TARGET_ARCHS})" "" PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}")
+string(REPLACE " (${PLSSVM_AMD_TARGET_ARCHS})" "" PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}")
+string(REPLACE " (${PLSSVM_INTEL_TARGET_ARCHS})" "" PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}")
+set(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_COMPILER}${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}" PARENT_SCOPE)
+
+list(POP_BACK CMAKE_MESSAGE_INDENT)
\ No newline at end of file

From 0cf9e36e9cb15fdaf650c5b9e4900217db355eb5 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 14 Oct 2024 12:21:43 +0200
Subject: [PATCH 006/123] Kokkos device_ptr stub tests.

---
 .../backends/Kokkos/detail/device_ptr.hpp     | 147 ++++++++++++++++++
 .../backends/Kokkos/detail/device_ptr.cpp     |  79 ++++++++++
 2 files changed, 226 insertions(+)
 create mode 100644 include/plssvm/backends/Kokkos/detail/device_ptr.hpp
 create mode 100644 src/plssvm/backends/Kokkos/detail/device_ptr.cpp

diff --git a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
new file mode 100644
index 000000000..a12021efb
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
@@ -0,0 +1,147 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Small wrapper around a Kokkos view.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_
+#pragma once
+
+#include "plssvm/backends/gpu_device_ptr.hpp"  // plssvm::detail::gpu_device_ptr
+#include "plssvm/shape.hpp"                    // plssvm::shape
+
+#include "Kokkos_Core.hpp"  // TODO:
+
+#include <cstddef>  // std::size_t
+
+namespace plssvm::kokkos::detail {
+
+template <typename T>
+using device_view_type = Kokkos::View<T *, Kokkos::DefaultExecutionSpace>;
+
+template <typename T>
+using device_subview_type = Kokkos::Subview<T *, Kokkos::DefaultExecutionSpace>;
+
+template <typename T>
+using host_view_type = Kokkos::View<T *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
+
+/**
+ * @brief Small wrapper class around a Kokkos view together with commonly used device functions.
+ * @tparam T the type of the kernel view to wrap
+ */
+template <typename T>
+class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, int, device_view_type<T>, device_ptr<T>> {
+    /// The template base type of the Kokkos device_ptr class.
+    using base_type = ::plssvm::detail::gpu_device_ptr<T, int, device_view_type<T>, device_ptr<T>>;
+
+    using base_type::data_;
+    using base_type::queue_;
+    using base_type::shape_;
+
+  public:
+    // Be able to use overloaded base class functions.
+    using base_type::copy_to_device;
+    using base_type::copy_to_device_strided;
+    using base_type::copy_to_host;
+    using base_type::copy_to_other_device;
+    using base_type::fill;
+    using base_type::memset;
+
+    using typename base_type::const_host_pointer_type;
+    using typename base_type::device_pointer_type;
+    using typename base_type::host_pointer_type;
+    using typename base_type::queue_type;
+    using typename base_type::size_type;
+    using typename base_type::value_type;
+
+    // TODO: DOKU
+
+    /**
+     * @brief Default construct a CUDA device_ptr with a size of 0.
+     * @details Always associated with device 0.
+     */
+    device_ptr() = default;
+    /**
+     * @brief Allocates `size * sizeof(T)` bytes on the device with ID @p device.
+     * @param[in] size the number of elements represented by the device_ptr
+     * @param[in] device the associated CUDA device
+     * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
+     */
+    explicit device_ptr(size_type size, int device);
+    /**
+     * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes on the device with ID @p device.
+     * @param[in] shape the number of elements represented by the device_ptr
+     * @param[in] device the associated CUDA device
+     * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
+     */
+    explicit device_ptr(plssvm::shape shape, int device);
+    /**
+     * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes on the device with ID @p device.
+     * @param[in] shape the number of elements represented by the device_ptr
+     * @param[in] padding the number of padding elements added to the extent values
+     * @param[in] device the associated CUDA device
+     * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
+     */
+    device_ptr(plssvm::shape shape, plssvm::shape padding, int device);
+
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
+     */
+    device_ptr(const device_ptr &) = delete;
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(plssvm::detail::gpu_device_ptr &&)
+     */
+    device_ptr(device_ptr &&other) noexcept = default;
+
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::operator=(const plssvm::detail::gpu_device_ptr &)
+     */
+    device_ptr &operator=(const device_ptr &) = delete;
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::operator=(plssvm::detail::gpu_device_ptr &&)
+     */
+    device_ptr &operator=(device_ptr &&other) noexcept = default;
+
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::~gpu_device_ptr()
+     */
+    ~device_ptr() override;
+
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::memset(int, size_type, size_type)
+     */
+    void memset(int pattern, size_type pos, size_type num_bytes) override;
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::fill(value_type, size_type, size_type)
+     */
+    void fill(value_type value, size_type pos, size_type count) override;
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::copy_to_device(const_host_pointer_type, size_type, size_type)
+     */
+    void copy_to_device(const_host_pointer_type data_to_copy, size_type pos, size_type count) override;
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::copy_to_device_strided(const_host_pointer_type, std::size_t, std::size_t, std::size_t)
+     */
+    void copy_to_device_strided(const_host_pointer_type data_to_copy, std::size_t spitch, std::size_t width, std::size_t height) override;
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::copy_to_host(host_pointer_type, size_type, size_type) const
+     */
+    void copy_to_host(host_pointer_type buffer, size_type pos, size_type count) const override;
+    /**
+     * @copydoc plssvm::detail::gpu_device_ptr::copy_to_other_device(derived_gpu_device_ptr &, size_type, size_type) const
+     */
+    void copy_to_other_device(device_ptr &target, size_type pos, size_type count) const override;
+};
+
+extern template class device_ptr<float>;
+extern template class device_ptr<double>;
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_
diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
new file mode 100644
index 000000000..70af702b1
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -0,0 +1,79 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"
+
+namespace plssvm::kokkos::detail {
+
+template <typename T>
+device_ptr<T>::device_ptr(const size_type size, const int queue) :
+    device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, queue } { }
+
+template <typename T>
+device_ptr<T>::device_ptr(const plssvm::shape shape, const int queue) :
+    device_ptr{ shape, plssvm::shape{ 0, 0 }, queue } { }
+
+template <typename T>
+device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const int queue) :
+    base_type{ shape, padding, queue } {
+    static std::size_t count = 0;
+    // TODO: queue type, check range?
+    // TODO: how to assign a view to a GPU in a multi-GPU setting?
+    data_ = device_view_type<T>{ fmt::format("device_ptr_{}", count++), this->size_padded() };
+}
+
+template <typename T>
+device_ptr<T>::~device_ptr() {
+    // avoid compiler warnings
+    try {
+        // TODO:
+    } catch (const plssvm::exception &e) {
+        std::cout << e.what_with_loc() << std::endl;
+        std::terminate();
+    }
+}
+
+template <typename T>
+void device_ptr<T>::memset(const int pattern, const size_type pos, const size_type num_bytes) {
+}
+
+template <typename T>
+void device_ptr<T>::fill(const value_type value, const size_type pos, const size_type count) {
+}
+
+template <typename T>
+void device_ptr<T>::copy_to_device(const_host_pointer_type data_to_copy, const size_type pos, const size_type count) {
+    PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
+
+//    detail::set_device(queue_);  // TODO:
+    const size_type rcount = std::min(count, this->size_padded() - pos);
+
+    // create view of the host data
+    host_view_type<T> host_view{ data_to_copy, rcount };
+    // create subview of the device data
+    device_subview_type<T> data_subview{ data_, Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(pos, rcount) };  // TODO: view via typedef
+    Kokkos::deep_copy(data_subview, host_view);
+}
+
+template <typename T>
+void device_ptr<T>::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) {
+}
+
+template <typename T>
+void device_ptr<T>::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const {
+}
+
+template <typename T>
+void device_ptr<T>::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const {
+}
+
+template class device_ptr<float>;
+template class device_ptr<double>;
+
+}  // namespace plssvm::kokkos::detail

From 7559525756e22a314dc933679d9e1f1e10bb0dba Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 16 Oct 2024 09:24:18 +0200
Subject: [PATCH 007/123] First (maybe) functional Kokkos device_ptr
 implementation.

---
 .../backends/Kokkos/detail/device_ptr.cpp     | 54 +++++++++++++++++--
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
index 70af702b1..5758a4309 100644
--- a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -8,6 +8,20 @@
 
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"
 
+#include "plssvm/backends/Kokkos/exceptions.hpp"  // plssvm::kokkos::backend_exception
+#include "plssvm/detail/assert.hpp"               // PLSSVM_ASSERT
+#include "plssvm/exceptions/exceptions.hpp"       // plssvm::exception
+#include "plssvm/shape.hpp"                       // plssvm::shape
+
+#include "Kokkos_Core.hpp"
+
+#include "fmt/format.h"  // fmt::format
+
+#include <cstddef>    // std::size_t
+#include <exception>  // std::terminate
+#include <iostream>   // std::cout, std::endl
+#include <vector>     // std::vector
+
 namespace plssvm::kokkos::detail {
 
 template <typename T>
@@ -51,26 +65,60 @@ void device_ptr<T>::copy_to_device(const_host_pointer_type data_to_copy, const s
     PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
 
-//    detail::set_device(queue_);  // TODO:
+    // detail::set_device(queue_);  // TODO:
     const size_type rcount = std::min(count, this->size_padded() - pos);
 
     // create view of the host data
-    host_view_type<T> host_view{ data_to_copy, rcount };
+    const host_view_type<const T> host_view{ data_to_copy, rcount };
     // create subview of the device data
-    device_subview_type<T> data_subview{ data_, Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(pos, rcount) };  // TODO: view via typedef
+    auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + rcount));
+    // copy the data to the device subview
     Kokkos::deep_copy(data_subview, host_view);
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) {
+    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
+
+    if (width > spitch) {
+        throw backend_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) };
+    }
+
+    Kokkos::View<T **, Kokkos::LayoutRight> view_2d{ data_.data(), this->shape_padded().x, this->shape_padded().y };
+    // TODO: implement
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const {
+    PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!");
+
+    // detail::set_device(queue_);  // TODO:
+    const size_type rcount = std::min(count, this->size_padded() - pos);
+
+    // create view of the host data
+    const host_view_type<T> host_view{ buffer, rcount };
+    // create subview of the device data
+    auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + rcount));
+    // copy the data to the host
+    Kokkos::deep_copy(host_view, data_subview);
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const {
+    PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(target.get() != view_type<T>{}, "Invalid target pointer! Maybe target has been default constructed?");
+
+    const size_type rcount = std::min(count, this->size_padded() - pos);
+    if (target.size_padded() < rcount) {
+        throw backend_exception{ fmt::format("Buffer too small to perform copy (needed: {}, provided: {})!", rcount, target.size_padded()) };
+    }
+
+    // TODO: use Kokkos function?
+    std::vector<value_type> temp(rcount);
+    this->copy_to_host(temp, pos, rcount);
+    target.copy_to_device(temp);
 }
 
 template class device_ptr<float>;

From 0a318dccfaa0a9d585859a202465865776786524 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 17 Oct 2024 19:02:28 +0200
Subject: [PATCH 008/123] Add Kokkos CMake preset.

---
 CMakePresets.json         |   1 +
 cmake/presets/kokkos.json | 142 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 cmake/presets/kokkos.json

diff --git a/CMakePresets.json b/CMakePresets.json
index 8e4925dd0..bd33cac2f 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -13,6 +13,7 @@
     "cmake/presets/opencl.json",
     "cmake/presets/acpp.json",
     "cmake/presets/dpcpp.json",
+    "cmake/presets/kokkos.json",
     "cmake/presets/all.json"
   ]
 }
\ No newline at end of file
diff --git a/cmake/presets/kokkos.json b/cmake/presets/kokkos.json
new file mode 100644
index 000000000..620e940e5
--- /dev/null
+++ b/cmake/presets/kokkos.json
@@ -0,0 +1,142 @@
+{
+  "version": 6,
+  "include": [
+    "common.json"
+  ],
+  "configurePresets": [
+    {
+      "name": "kokkos",
+      "displayName": "Kokkos backend",
+      "inherits": "build",
+      "cacheVariables": {
+        "PLSSVM_ENABLE_KOKKOS_BACKEND": "ON"
+      }
+    },
+    {
+      "name": "kokkos_python",
+      "displayName": "Kokkos backend + Python bindings",
+      "inherits": "build",
+      "cacheVariables": {
+        "PLSSVM_ENABLE_KOKKOS_BACKEND": "ON",
+        "PLSSVM_ENABLE_LANGUAGE_BINDINGS": "ON",
+        "PLSSVM_ENABLE_PYTHON_BINDINGS": "ON"
+      }
+    },
+    {
+      "name": "kokkos_test",
+      "displayName": "Kokkos backend tests",
+      "inherits": "test",
+      "cacheVariables": {
+        "PLSSVM_ENABLE_KOKKOS_BACKEND": "ON"
+      }
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "kokkos",
+      "displayName": "Kokkos backend",
+      "configurePreset": "kokkos",
+      "configuration": "RelWithDebInfo",
+      "inherits": "common"
+    },
+    {
+      "name": "kokkos_python",
+      "displayName": "Kokkos backend + Python bindings",
+      "configurePreset": "kokkos_python",
+      "configuration": "RelWithDebInfo",
+      "inherits": "common"
+    },
+    {
+      "name": "kokkos_test",
+      "displayName": "Kokkos backend tests",
+      "configurePreset": "kokkos_test",
+      "configuration": "Debug",
+      "inherits": "common"
+    }
+  ],
+  "testPresets": [
+    {
+      "name": "kokkos_test",
+      "displayName": "Kokkos backend all tests",
+      "configurePreset": "kokkos_test",
+      "inherits": "common"
+    },
+    {
+      "name": "kokkos_backend_test",
+      "displayName": "Kokkos backend specific tests",
+      "configurePreset": "kokkos_test",
+      "inherits": "common",
+      "filter": {
+        "include": {
+          "name": "Kokkos.*"
+        }
+      }
+    }
+  ],
+  "workflowPresets": [
+    {
+      "name": "kokkos",
+      "displayName": "Kokkos backend workflow",
+      "steps": [
+        {
+          "name": "kokkos",
+          "type": "configure"
+        },
+        {
+          "name": "kokkos",
+          "type": "build"
+        }
+      ]
+    },
+    {
+      "name": "kokkos_python",
+      "displayName": "Kokkos backend + Python bindings workflow",
+      "steps": [
+        {
+          "name": "kokkos_python",
+          "type": "configure"
+        },
+        {
+          "name": "kokkos_python",
+          "type": "build"
+        }
+      ]
+    },
+    {
+      "name": "kokkos_test",
+      "displayName": "Kokkos test workflow",
+      "steps": [
+        {
+          "name": "kokkos_test",
+          "type": "configure"
+        },
+        {
+          "name": "kokkos_test",
+          "type": "build"
+        },
+        {
+          "name": "kokkos_test",
+          "type": "test"
+        }
+      ]
+    },
+    {
+      "name": "kokkos_backend_test",
+      "displayName": "Kokkos backend test workflow",
+      "steps": [
+        {
+          "name": "kokkos_test",
+          "type": "configure"
+        },
+        {
+          "name": "kokkos_test",
+          "type": "build"
+        },
+        {
+          "name": "kokkos_backend_test",
+          "type": "test"
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file

From d254a5b3eb900beade2a895b053ccdf770a289df Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 17 Oct 2024 19:02:56 +0200
Subject: [PATCH 009/123] Further improve implementation and add more
 placeholder functions.

---
 include/plssvm/backends/Kokkos/csvm.hpp       |   6 +-
 .../backends/Kokkos/detail/device_ptr.hpp     |  31 +-
 .../Kokkos/kernel/cg_explicit/blas.hpp        |  44 +++
 .../cg_explicit/kernel_matrix_assembly.hpp    |  32 ++
 .../kernel_matrix_assembly_blas.hpp           |  32 ++
 .../Kokkos/kernel/kernel_functions.hpp        | 111 ++++++
 .../backends/Kokkos/kernel/predict_kernel.hpp |  42 +++
 src/plssvm/backends/Kokkos/csvm.cpp           | 357 +++++++++++++++++-
 .../backends/Kokkos/detail/device_ptr.cpp     |  30 +-
 9 files changed, 625 insertions(+), 60 deletions(-)
 create mode 100644 include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
 create mode 100644 include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
 create mode 100644 include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
 create mode 100644 include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
 create mode 100644 include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp

diff --git a/include/plssvm/backends/Kokkos/csvm.hpp b/include/plssvm/backends/Kokkos/csvm.hpp
index 524f1bd4b..206d85a81 100644
--- a/include/plssvm/backends/Kokkos/csvm.hpp
+++ b/include/plssvm/backends/Kokkos/csvm.hpp
@@ -24,6 +24,8 @@
 #include "plssvm/parameter.hpp"                               // plssvm::parameter, plssvm::detail::parameter
 #include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
 
+#include "Kokkos_Core.hpp"  // TODO:
+
 #include <cstddef>      // std::size_t
 #include <type_traits>  // std::true_type
 #include <utility>      // std::forward
@@ -36,11 +38,11 @@ namespace kokkos {
 /**
  * @brief A C-SVM implementation using Kokkos as backend.
  */
-class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::pinned_memory> {
+class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, Kokkos::DefaultExecutionSpace, detail::pinned_memory> {
   protected:
     // protected for the test mock class
     /// The template base type of the Kokkos C-SVM class.
-    using base_type = ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::pinned_memory>;
+    using base_type = ::plssvm::detail::gpu_csvm<detail::device_ptr, Kokkos::DefaultExecutionSpace, detail::pinned_memory>;
 
     using base_type::data_distribution_;
     using base_type::devices_;
diff --git a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
index a12021efb..953faf3ed 100644
--- a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
@@ -16,7 +16,7 @@
 #include "plssvm/backends/gpu_device_ptr.hpp"  // plssvm::detail::gpu_device_ptr
 #include "plssvm/shape.hpp"                    // plssvm::shape
 
-#include "Kokkos_Core.hpp"  // TODO:
+#include "Kokkos_Core.hpp"  // TODO: Kokkos::DefaultExecutionSpace
 
 #include <cstddef>  // std::size_t
 
@@ -36,9 +36,9 @@ using host_view_type = Kokkos::View<T *, Kokkos::HostSpace, Kokkos::MemoryUnmana
  * @tparam T the type of the kernel view to wrap
  */
 template <typename T>
-class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, int, device_view_type<T>, device_ptr<T>> {
+class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, Kokkos::DefaultExecutionSpace, device_view_type<T>, device_ptr<T>> {
     /// The template base type of the Kokkos device_ptr class.
-    using base_type = ::plssvm::detail::gpu_device_ptr<T, int, device_view_type<T>, device_ptr<T>>;
+    using base_type = ::plssvm::detail::gpu_device_ptr<T, Kokkos::DefaultExecutionSpace, device_view_type<T>, device_ptr<T>>;
 
     using base_type::data_;
     using base_type::queue_;
@@ -60,35 +60,30 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, int, device_view_t
     using typename base_type::size_type;
     using typename base_type::value_type;
 
-    // TODO: DOKU
-
     /**
-     * @brief Default construct a CUDA device_ptr with a size of 0.
+     * @brief Default construct a Kokkos device_ptr with a size of 0.
      * @details Always associated with device 0.
      */
     device_ptr() = default;
     /**
-     * @brief Allocates `size * sizeof(T)` bytes on the device with ID @p device.
+     * @brief Allocates `size * sizeof(T)` bytes in the Kokkos execution space @p exec.
      * @param[in] size the number of elements represented by the device_ptr
-     * @param[in] device the associated CUDA device
-     * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
+     * @param[in] exec the associated Kokkos execution space
      */
-    explicit device_ptr(size_type size, int device);
+    explicit device_ptr(size_type size, Kokkos::DefaultExecutionSpace exec);
     /**
-     * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes on the device with ID @p device.
+     * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes in the Kokkos execution space @p exec.
      * @param[in] shape the number of elements represented by the device_ptr
-     * @param[in] device the associated CUDA device
-     * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
+     * @param[in] exec the associated Kokkos execution space
      */
-    explicit device_ptr(plssvm::shape shape, int device);
+    explicit device_ptr(plssvm::shape shape, Kokkos::DefaultExecutionSpace exec);
     /**
-     * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes on the device with ID @p device.
+     * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes in the Kokkos execution space @p exec.
      * @param[in] shape the number of elements represented by the device_ptr
      * @param[in] padding the number of padding elements added to the extent values
-     * @param[in] device the associated CUDA device
-     * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
+     * @param[in] exec the associated Kokkos execution space
      */
-    device_ptr(plssvm::shape shape, plssvm::shape padding, int device);
+    device_ptr(plssvm::shape shape, plssvm::shape padding, Kokkos::DefaultExecutionSpace exec);
 
     /**
      * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
new file mode 100644
index 000000000..79f96283e
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
@@ -0,0 +1,44 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the Kokkos backend.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_
+#pragma once
+
+#include "plssvm/constants.hpp"  // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+
+#include "Kokkos_Core.hpp"  // TODO:
+
+namespace plssvm::kokkos::detail {
+
+class device_kernel_symm {
+  public:
+  private:
+};
+
+class device_kernel_symm_mirror {
+  public:
+  private:
+};
+
+class device_kernel_inplace_matrix_add {
+  public:
+  private:
+};
+
+class device_kernel_inplace_matrix_scale {
+  public:
+  private:
+};
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
new file mode 100644
index 000000000..ff74257b9
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
@@ -0,0 +1,32 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Functions for explicitly assembling the kernel matrix using the Kokkos backend.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_
+#pragma once
+
+#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"  // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                                // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                    // plssvm::kernel_function_type
+
+#include "Kokkos_Core.hpp"  // TODO:
+
+namespace plssvm::kokkos::detail {
+
+template <kernel_function_type kernel_function, typename... Args>
+class device_kernel_assembly {
+  public:
+  private:
+};
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
new file mode 100644
index 000000000..2d9e855b2
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
@@ -0,0 +1,32 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Functions for implicitly assembling the kernel matrix using the Kokkos backend.
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#pragma once
+
+#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"  // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                                // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                    // plssvm::kernel_function_type
+
+#include "Kokkos_Core.hpp"  // TODO: Kokkos::atomic_add
+
+namespace plssvm::kokkos::detail {
+
+template <kernel_function_type kernel_function, typename... Args>
+class device_kernel_assembly_symm {
+  public:
+  private:
+};
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
new file mode 100644
index 000000000..f7f422659
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
@@ -0,0 +1,111 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Implement the different kernel functions on the GPU using Kokkos.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_KERNEL_KERNEL_FUNCTIONS_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_KERNEL_KERNEL_FUNCTIONS_HPP_
+
+#include "plssvm/constants.hpp"              // plssvm::real_type
+#include "plssvm/detail/utility.hpp"         // plssvm::detail::always_false_v
+#include "plssvm/kernel_function_types.hpp"  // plssvm::kernel_function_type
+
+#include "Kokkos_MathematicalFunctions.hpp"  // Kokkos::pow, Kokkos::exp, Kokkos::tanh, Kokkos::abs
+
+#include <limits>  // std::numeric_limits
+#include <tuple>   // std::tuple, std::get
+
+namespace plssvm::kokkos::detail {
+
+//***************************************************//
+//                 feature reductions                //
+//***************************************************//
+
+/**
+ * @brief Compute the default feature reduction, i.e., a simple dot-product.
+ * @tparam kernel_function the kernel function type
+ * @param[in] val1 the first feature value
+ * @param[in] val2 the second feature value
+ * @return the reduced value (`[[nodiscard]]`)
+ */
+template <kernel_function_type kernel_function>
+[[nodiscard]] inline real_type feature_reduce(const real_type val1, const real_type val2) {
+    return val1 * val2;
+}
+
+/**
+ * @brief Compute the feature reduction for the radial basis function kernel function, i.e., the squared euclidean distance.
+ * @param[in] val1 the first feature value
+ * @param[in] val2 the second feature value
+ * @return the reduced value (`[[nodiscard]]`)
+ */
+template <>
+[[nodiscard]] inline real_type feature_reduce<kernel_function_type::rbf>(const real_type val1, const real_type val2) {
+    const real_type d = val1 - val2;
+    return d * d;
+}
+
+/**
+ * @brief Compute the feature reduction for the laplacian kernel function, i.e., the Manhattan distance.
+ * @param[in] val1 the first feature value
+ * @param[in] val2 the second feature value
+ * @return the reduced value (`[[nodiscard]]`)
+ */
+template <>
+[[nodiscard]] inline real_type feature_reduce<kernel_function_type::laplacian>(const real_type val1, const real_type val2) {
+    return ::Kokkos::fabs(val1 - val2);
+}
+
+/**
+ * @brief Compute the feature reduction for the chi-squared kernel function.
+ * @note Be sure that the denominator isn't 0.0 which may be the case for padding values.
+ * @param[in] val1 the first feature value
+ * @param[in] val2 the second feature value
+ * @return the reduced value (`[[nodiscard]]`)
+ */
+template <>
+[[nodiscard]] inline real_type feature_reduce<kernel_function_type::chi_squared>(const real_type val1, const real_type val2) {
+    const real_type d = val1 - val2;
+    return (real_type{ 1.0 } / (val1 + val2 + std::numeric_limits<real_type>::min())) * d * d;
+}
+
+//***************************************************//
+//                  kernel functions                 //
+//***************************************************//
+
+/**
+ * @brief Compute the @p kernel_function using @p value and the @p params.
+ * @tparam kernel_function the kernel function type
+ * @tparam Args the types of the potential kernel function parameters
+ * @param[in] value the value to apply the kernel function to
+ * @param[in] params the potential kernel function parameters
+ * @return the result value (`[[nodiscard]]`)
+ */
+template <kernel_function_type kernel_function, typename... Args>
+[[nodiscard]] inline real_type apply_kernel_function(const real_type value, const std::tuple<Args...> params) {
+    if constexpr (kernel_function == kernel_function_type::linear) {
+        return value;
+    } else if constexpr (kernel_function == kernel_function_type::polynomial) {
+        return ::Kokkos::pow(std::get<1>(params) * value + std::get<2>(params), std::get<0>(params));
+    } else if constexpr (kernel_function == kernel_function_type::rbf) {
+        return ::Kokkos::exp(-std::get<0>(params) * value);
+    } else if constexpr (kernel_function == kernel_function_type::sigmoid) {
+        return ::Kokkos::tanh(std::get<0>(params) * value + std::get<1>(params));
+    } else if constexpr (kernel_function == kernel_function_type::laplacian) {
+        return ::Kokkos::exp(-std::get<0>(params) * value);
+    } else if constexpr (kernel_function == kernel_function_type::chi_squared) {
+        return ::Kokkos::exp(-std::get<0>(params) * value);
+    } else {
+        static_assert(::plssvm::detail::always_false_v<Args...>, "Unsupported kernel function!");
+    }
+}
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_KERNEL_KERNEL_FUNCTIONS_HPP_
diff --git a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
new file mode 100644
index 000000000..a203cb7e9
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
@@ -0,0 +1,42 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines the functions used for prediction for the C-SVM using the Kokkos backend.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_
+#pragma once
+
+#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"  // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                                // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                    // plssvm::kernel_function_type
+
+#include "Kokkos_Core.hpp"  // TODO: Kokkos::atomic_add
+
+namespace plssvm::kokkos::detail {
+
+class device_kernel_w_linear {
+  public:
+  private:
+};
+
+class device_kernel_predict_linear {
+  public:
+  private:
+};
+
+template <kernel_function_type kernel_function, typename... Args>
+class device_kernel_predict {
+  public:
+  private:
+};
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 3c3afb022..f459d55e5 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -8,23 +8,34 @@
 
 #include "plssvm/backends/Kokkos/csvm.hpp"
 
-#include "plssvm/backends/Kokkos/detail/execution_space.hpp"  // plssvm::kokkos::detail::execution_space
-#include "plssvm/backends/Kokkos/detail/utility.hpp"          // plssvm::kokkos::detail::get_runtime_version
-#include "plssvm/backends/Kokkos/exceptions.hpp"              // plssvm::kokkos::backend_exception
-#include "plssvm/detail/logging.hpp"                          // plssvm::detail::log
-#include "plssvm/detail/tracking/performance_tracker.hpp"     // plssvm::detail::tracking::tracking_entry
-#include "plssvm/exceptions/exceptions.hpp"                   // plssvm::exception
-#include "plssvm/parameter.hpp"                               // plssvm::parameter
-#include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
-#include "plssvm/verbosity_levels.hpp"                        // plssvm::verbosity_level
+#include "plssvm/backends/execution_range.hpp"                                        // plssvm::detail::dim_type
+#include "plssvm/backends/execution_range.hpp"                                        // plssvm::detail::execution_range
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"                               // plssvm::kokkos::detail::device_ptr
+#include "plssvm/backends/Kokkos/detail/execution_space.hpp"                          // plssvm::kokkos::detail::execution_space
+#include "plssvm/backends/Kokkos/detail/utility.hpp"                                  // plssvm::kokkos::detail::get_runtime_version
+#include "plssvm/backends/Kokkos/exceptions.hpp"                                      // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp"                         // plssvm::kokkos::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale}
+#include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp"       // plssvm::kokkos::detail::device_kernel_assembly
+#include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp"  // plssvm::kokkos::detail::device_kernel_assembly_symm
+#include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp"                           // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict}
+#include "plssvm/detail/data_distribution.hpp"                                        // plssvm::detail::triangular_data_distribution
+#include "plssvm/detail/logging.hpp"                                                  // plssvm::detail::log
+#include "plssvm/detail/memory_size.hpp"                                              // plssvm::detail::memory_size
+#include "plssvm/detail/tracking/performance_tracker.hpp"                             // plssvm::detail::tracking::tracking_entry
+#include "plssvm/exceptions/exceptions.hpp"                                           // plssvm::exception
+#include "plssvm/parameter.hpp"                                                       // plssvm::parameter
+#include "plssvm/target_platforms.hpp"                                                // plssvm::target_platform
+#include "plssvm/verbosity_levels.hpp"                                                // plssvm::verbosity_level
 
 #include "Kokkos_Core.hpp"  // TODO:
 
+#include "fmt/core.h"    // fmt::format
 #include "fmt/format.h"  // fmt::format
 
+#include <cstddef>    // std::size_t
 #include <exception>  // std::terminate
 #include <iostream>   // std::cout, std::endl
-#include <numeric>    // std::iota
+#include <string>     // std::string
 #include <vector>     // std::vector
 
 namespace plssvm::kokkos {
@@ -89,8 +100,18 @@ void csvm::init(const target_platform target) {
     }
 
     // get all available devices wrt the requested target platform
-    devices_.resize(static_cast<std::vector<queue_type>::size_type>(Kokkos::num_devices()));
-    std::iota(devices_.begin(), devices_.end(), 0);
+// TODO: HOW CAN ONE USE MULTIPLE KOKKOS DEVICES
+// TODO: implement for other Kokkos execution spaces
+#if defined(KOKKOS_ENABLE_CUDA)
+    for (int device = 0; device < Kokkos::num_devices(); ++device) {
+        // create CUDA stream using the CUDA specific functions
+        cudaSetDevice(device);
+        cudaStream_t stream{};
+        cudaStreamCreate(&stream);
+        // create Kokkos execution space for the specific device
+        devices_.emplace_back(Kokkos::Cuda(stream, true));
+    }
+#endif
 
     // throw exception if no CUDA devices could be found
     if (devices_.empty()) {
@@ -103,7 +124,7 @@ void csvm::init(const target_platform target) {
                         plssvm::detail::tracking::tracking_entry{ "backend", "num_devices", devices_.size() },
                         plssvm::detail::tracking::tracking_entry{ "backend", "target_platform", target_ });
 
-    std::vector<std::string> device_names;
+    std::vector<std::string> device_names{};
     device_names.reserve(devices_.size());
     for (typename std::vector<queue_type>::size_type device = 0; device < devices_.size(); ++device) {
         const std::string device_name = detail::get_device_name(space_, device);
@@ -129,19 +150,89 @@ csvm::~csvm() {
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
-    return {};
+    // TODO: implement for other execution spaces, guard behind ifdef
+    std::vector<::plssvm::detail::memory_size> res(this->num_available_devices());
+    switch (space_) {
+        case detail::execution_space::cuda:
+            {
+                cudaDeviceProp prop{};
+                for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
+                    cudaGetDeviceProperties(&prop, devices_[device_id].cuda_device());
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(prop.totalGlobalMem) };
+                }
+            }
+            break;
+        case detail::execution_space::hip:
+        case detail::execution_space::sycl:
+        case detail::execution_space::openmp_target:
+        case detail::execution_space::openacc:
+        case detail::execution_space::openmp:
+        case detail::execution_space::hpx:
+        case detail::execution_space::threads:
+        case detail::execution_space::serial:
+            throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
+    }
+
+    return res;
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const {
-    return {};
+    // TODO: implement for other execution spaces, guard behind ifdef
+    switch (space_) {
+        case detail::execution_space::cuda:
+            return this->get_device_memory();
+        case detail::execution_space::hip:
+        case detail::execution_space::sycl:
+        case detail::execution_space::openmp_target:
+        case detail::execution_space::openacc:
+        case detail::execution_space::openmp:
+        case detail::execution_space::hpx:
+        case detail::execution_space::threads:
+        case detail::execution_space::serial:
+            throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
+    }
 }
 
 std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
-    return {};
+    // TODO: implement for other execution spaces, guard behind ifdef
+    switch (space_) {
+        case detail::execution_space::cuda:
+            {
+                cudaDeviceProp prop{};
+                cudaGetDeviceProperties(&prop, devices_[device_id].cuda_device());
+                return static_cast<std::size_t>(prop.maxThreadsPerBlock);
+            }
+        case detail::execution_space::hip:
+        case detail::execution_space::sycl:
+        case detail::execution_space::openmp_target:
+        case detail::execution_space::openacc:
+        case detail::execution_space::openmp:
+        case detail::execution_space::hpx:
+        case detail::execution_space::threads:
+        case detail::execution_space::serial:
+            throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
+    }
 }
 
 ::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::size_t device_id) const {
-    return {};
+    // TODO: implement for other execution spaces, guard behind ifdef
+    switch (space_) {
+        case detail::execution_space::cuda:
+            {
+                cudaDeviceProp prop{};
+                cudaGetDeviceProperties(&prop, devices_[device_id].cuda_device());
+                return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
+            }
+        case detail::execution_space::hip:
+        case detail::execution_space::sycl:
+        case detail::execution_space::openmp_target:
+        case detail::execution_space::openacc:
+        case detail::execution_space::openmp:
+        case detail::execution_space::hpx:
+        case detail::execution_space::threads:
+        case detail::execution_space::serial:
+            throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
+    }
 }
 
 //***************************************************//
@@ -149,19 +240,179 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::s
 //***************************************************//
 
 auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type {
-    return {};
+    const unsigned long long num_rows_reduced = data_d.shape().x - 1;
+    const unsigned long long num_features = data_d.shape().y;
+    const queue_type &device = devices_[device_id];
+
+    // calculate the number of data points this device is responsible for
+    const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
+
+    // get the offset of the data points this device is responsible for
+    const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
+
+    // calculate the number of matrix entries
+    const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_);
+    const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id);
+
+    device_ptr_type kernel_matrix_d{ num_entries_padded, device };  // only explicitly store the upper triangular matrix
+    const real_type cost_factor = real_type{ 1.0 } / params.cost;
+
+    // TODO: implement
+    // // convert execution range block to CUDA's native dim3
+    // const dim3 native_block = detail::dim_type_to_native(exec.block);
+    //
+    // for (const auto &[partial_grid, offsets] : exec.grids) {
+    //     // convert execution range partial_grid to CUDA's native dim3
+    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
+    //
+    //     switch (params.kernel_type) {
+    //         case kernel_function_type::linear:
+    //             detail::device_kernel_assembly<kernel_function_type::linear><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y);
+    //             break;
+    //         case kernel_function_type::polynomial:
+    //             detail::device_kernel_assembly<kernel_function_type::polynomial><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, params.degree, std::get<real_type>(params.gamma), params.coef0);
+    //             break;
+    //         case kernel_function_type::rbf:
+    //             detail::device_kernel_assembly<kernel_function_type::rbf><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //         case kernel_function_type::sigmoid:
+    //             detail::device_kernel_assembly<kernel_function_type::sigmoid><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, std::get<real_type>(params.gamma), params.coef0);
+    //             break;
+    //         case kernel_function_type::laplacian:
+    //             detail::device_kernel_assembly<kernel_function_type::laplacian><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //         case kernel_function_type::chi_squared:
+    //             detail::device_kernel_assembly<kernel_function_type::chi_squared><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //     }
+    // }
+    detail::device_synchronize_all();
+
+    return kernel_matrix_d;
 }
 
 void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, const real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, const real_type beta, device_ptr_type &C_d) const {
+    const unsigned long long num_rhs = B_d.shape().x;
+    const unsigned long long num_rows = B_d.shape().y;
+    const queue_type &device = devices_[device_id];
+
+    // calculate the number of data points this device is responsible for
+    const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
+    // get the offset of the data points this device is responsible for
+    const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
+
+    // TODO: implement
+    // // convert execution range block to CUDA's native dim3
+    // const dim3 native_block = detail::dim_type_to_native(exec.block);
+    //
+    // detail::set_device(device);
+    // for (const auto &[partial_grid, offsets] : exec.grids) {
+    //     // convert execution range partial_grid to CUDA's native dim3
+    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
+    //
+    //     detail::device_kernel_symm<<<native_partial_grid, native_block>>>(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets.x, offsets.y);
+    // }
+    //
+    // // convert execution range block to CUDA's native dim3
+    // const dim3 native_mirror_block = detail::dim_type_to_native(mirror_exec.block);
+    //
+    // for (const auto &[partial_grid, offsets] : mirror_exec.grids) {
+    //     const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows;
+    //
+    //     if (num_mirror_rows > 0) {
+    //         // convert execution range partial_grid to CUDA's native dim3
+    //         const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
+    //
+    //         detail::device_kernel_symm_mirror<<<native_partial_grid, native_mirror_block>>>(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets.x, offsets.y);
+    //     }
+    // }
+    // detail::peek_at_last_error();
+    detail::device_synchronize_all();
 }
 
 void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const device_ptr_type &rhs_d) const {
+    const unsigned long long num_rhs = lhs_d.shape().x;
+    const queue_type &device = devices_[device_id];
+
+    // // TODO: implement
+    // // convert execution range block to CUDA's native dim3
+    // const dim3 native_block = detail::dim_type_to_native(exec.block);
+    //
+    // detail::set_device(device);
+    // for (const auto &[partial_grid, offsets] : exec.grids) {
+    //     // convert execution range partial_grid to CUDA's native dim3
+    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
+    //
+    //     detail::device_kernel_inplace_matrix_add<<<native_partial_grid, native_block>>>(num_rhs, lhs_d.get(), rhs_d.get(), offsets.x, offsets.y);
+    // }
+    // detail::peek_at_last_error();
+    detail::device_synchronize_all();
 }
 
 void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const real_type scale) const {
+    const unsigned long long num_rhs = lhs_d.shape().x;
+    const queue_type &device = devices_[device_id];
+
+    // TODO: implement
+    // // convert execution range block to CUDA's native dim3
+    // const dim3 native_block = detail::dim_type_to_native(exec.block);
+    //
+    // detail::set_device(device);
+    // for (const auto &[partial_grid, offsets] : exec.grids) {
+    //     // convert execution range partial_grid to CUDA's native dim3
+    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
+    //
+    //     detail::device_kernel_inplace_matrix_scale<<<native_partial_grid, native_block>>>(num_rhs, lhs_d.get(), scale, offsets.x, offsets.y);
+    // }
+    // detail::peek_at_last_error();
+    detail::device_synchronize_all();
 }
 
 void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const real_type alpha, const device_ptr_type &A_d, const parameter &params, const device_ptr_type &q_red, const real_type QA_cost, const device_ptr_type &B_d, device_ptr_type &C_d) const {
+    const unsigned long long num_rows_reduced = A_d.shape().x - 1;
+    const unsigned long long num_features = A_d.shape().y;
+    const unsigned long long num_classes = B_d.shape().x;
+    const queue_type &device = devices_[device_id];
+
+    // calculate the number of data points this device is responsible for
+    const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
+    // get the offset of the data points this device is responsible for
+    const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
+
+    const real_type cost_factor = real_type{ 1.0 } / params.cost;
+
+    // TODO: implement
+    // // convert general execution range's block to CUDA specific block
+    // const dim3 native_block = detail::dim_type_to_native(exec.block);
+    //
+    // detail::set_device(device);
+    // for (const auto &[partial_grid, offsets] : exec.grids) {
+    //     // convert execution range partial_grid to CUDA's native dim3
+    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
+    //
+    //     switch (params.kernel_type) {
+    //         case kernel_function_type::linear:
+    //             detail::device_kernel_assembly_symm<kernel_function_type::linear><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y);
+    //             break;
+    //         case kernel_function_type::polynomial:
+    //             detail::device_kernel_assembly_symm<kernel_function_type::polynomial><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, params.degree, std::get<real_type>(params.gamma), params.coef0);
+    //             break;
+    //         case kernel_function_type::rbf:
+    //             detail::device_kernel_assembly_symm<kernel_function_type::rbf><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //         case kernel_function_type::sigmoid:
+    //             detail::device_kernel_assembly_symm<kernel_function_type::sigmoid><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, std::get<real_type>(params.gamma), params.coef0);
+    //             break;
+    //         case kernel_function_type::laplacian:
+    //             detail::device_kernel_assembly_symm<kernel_function_type::laplacian><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //         case kernel_function_type::chi_squared:
+    //             detail::device_kernel_assembly_symm<kernel_function_type::chi_squared><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //     }
+    // }
+    // detail::peek_at_last_error();
+    detail::device_synchronize_all();
 }
 
 //***************************************************//
@@ -169,11 +420,77 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de
 //***************************************************//
 
 auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const device_ptr_type &alpha_d, const device_ptr_type &sv_d) const -> device_ptr_type {
-    return {};
+    const unsigned long long num_classes = alpha_d.shape().x;
+    const unsigned long long num_sv = alpha_d.shape().y;
+    const unsigned long long device_specific_num_sv = sv_d.shape().x;
+    const unsigned long long num_features = sv_d.shape().y;
+    const queue_type &device = devices_[device_id];
+
+    // get the offset of the data points this device is responsible for
+    const unsigned long long sv_offset = data_distribution_->place_row_offset(device_id);
+
+    device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device };
+
+    // TODO: implement
+    // // convert execution range block to CUDA's native dim3
+    // const dim3 native_block = detail::dim_type_to_native(exec.block);
+    //
+    // detail::set_device(device);
+    // for (const auto &[partial_grid, offsets] : exec.grids) {
+    //     // convert execution range partial_grid to CUDA's native dim3
+    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
+    //
+    //     detail::device_kernel_w_linear<<<native_partial_grid, native_block>>>(w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets.x, offsets.y);
+    // }
+    // detail::peek_at_last_error();
+    detail::device_synchronize_all();
+
+    return w_d;
 }
 
 auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &alpha_d, const device_ptr_type &rho_d, const device_ptr_type &sv_or_w_d, const device_ptr_type &predict_points_d) const -> device_ptr_type {
-    return {};
+    const unsigned long long num_classes = alpha_d.shape().x;
+    const unsigned long long num_predict_points = predict_points_d.shape().x;  // = device_specific_num_rows
+    const unsigned long long num_features = predict_points_d.shape().y;
+    const unsigned long long num_sv = sv_or_w_d.shape().x;
+    const queue_type &device = devices_[device_id];
+
+    device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, device };
+
+    // TODO: implement
+    // // convert execution range block to CUDA's native dim3
+    // const dim3 native_block = detail::dim_type_to_native(exec.block);
+    //
+    // detail::set_device(device);
+    // for (const auto &[partial_grid, offsets] : exec.grids) {
+    //     // convert execution range partial_grid to CUDA's native dim3
+    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
+    //
+    //     switch (params.kernel_type) {
+    //         case kernel_function_type::linear:
+    //             detail::device_kernel_predict_linear<<<native_partial_grid, native_block>>>(out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets.x, offsets.y);
+    //             break;
+    //         case kernel_function_type::polynomial:
+    //             detail::device_kernel_predict<kernel_function_type::polynomial><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, params.degree, std::get<real_type>(params.gamma), params.coef0);
+    //             break;
+    //         case kernel_function_type::rbf:
+    //             detail::device_kernel_predict<kernel_function_type::rbf><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //         case kernel_function_type::sigmoid:
+    //             detail::device_kernel_predict<kernel_function_type::sigmoid><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, std::get<real_type>(params.gamma), params.coef0);
+    //             break;
+    //         case kernel_function_type::laplacian:
+    //             detail::device_kernel_predict<kernel_function_type::laplacian><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //         case kernel_function_type::chi_squared:
+    //             detail::device_kernel_predict<kernel_function_type::chi_squared><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, std::get<real_type>(params.gamma));
+    //             break;
+    //     }
+    // }
+    // detail::peek_at_last_error();
+    detail::device_synchronize_all();
+
+    return out_d;
 }
 
 }  // namespace plssvm::kokkos
diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
index 5758a4309..22719482c 100644
--- a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -15,7 +15,7 @@
 
 #include "Kokkos_Core.hpp"
 
-#include "fmt/format.h"  // fmt::format
+#include "fmt/core.h"  // fmt::format
 
 #include <cstddef>    // std::size_t
 #include <exception>  // std::terminate
@@ -25,31 +25,23 @@
 namespace plssvm::kokkos::detail {
 
 template <typename T>
-device_ptr<T>::device_ptr(const size_type size, const int queue) :
-    device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, queue } { }
+device_ptr<T>::device_ptr(const size_type size, const Kokkos::DefaultExecutionSpace exec) :
+    device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, exec } { }
 
 template <typename T>
-device_ptr<T>::device_ptr(const plssvm::shape shape, const int queue) :
-    device_ptr{ shape, plssvm::shape{ 0, 0 }, queue } { }
+device_ptr<T>::device_ptr(const plssvm::shape shape, const Kokkos::DefaultExecutionSpace exec) :
+    device_ptr{ shape, plssvm::shape{ 0, 0 }, exec } { }
 
 template <typename T>
-device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const int queue) :
-    base_type{ shape, padding, queue } {
-    static std::size_t count = 0;
-    // TODO: queue type, check range?
-    // TODO: how to assign a view to a GPU in a multi-GPU setting?
-    data_ = device_view_type<T>{ fmt::format("device_ptr_{}", count++), this->size_padded() };
+device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const Kokkos::DefaultExecutionSpace exec) :
+    base_type{ shape, padding, exec } {
+    // TODO: GUARD behind ifdef!
+    data_ = device_view_type<T>{ fmt::format("device_{}_view", exec.cuda_device()), this->size_padded() };
 }
 
 template <typename T>
 device_ptr<T>::~device_ptr() {
-    // avoid compiler warnings
-    try {
-        // TODO:
-    } catch (const plssvm::exception &e) {
-        std::cout << e.what_with_loc() << std::endl;
-        std::terminate();
-    }
+    // Kokkos automatically frees the memory of a Kokkos::View if the View goes out of scope
 }
 
 template <typename T>
@@ -65,7 +57,6 @@ void device_ptr<T>::copy_to_device(const_host_pointer_type data_to_copy, const s
     PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
 
-    // detail::set_device(queue_);  // TODO:
     const size_type rcount = std::min(count, this->size_padded() - pos);
 
     // create view of the host data
@@ -94,7 +85,6 @@ void device_ptr<T>::copy_to_host(host_pointer_type buffer, const size_type pos,
     PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!");
 
-    // detail::set_device(queue_);  // TODO:
     const size_type rcount = std::min(count, this->size_padded() - pos);
 
     // create view of the host data

From 752afd48b8d87aa2a6aff1d6538e482eec214371 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 18 Oct 2024 15:05:52 +0200
Subject: [PATCH 010/123] Don't hardcoded test against nullptr, but the default
 constructed device_pointer_type.

---
 include/plssvm/backends/gpu_device_ptr.hpp | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp
index 7d364253b..55a3e18a9 100644
--- a/include/plssvm/backends/gpu_device_ptr.hpp
+++ b/include/plssvm/backends/gpu_device_ptr.hpp
@@ -416,14 +416,14 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::swap(
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::memset(const int pattern, const size_type pos) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     this->memset(pattern, pos, this->size_padded() * sizeof(value_type));
 }
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::fill(const value_type value, const size_type pos) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     this->fill(value, pos, this->size_padded());
 }
@@ -431,7 +431,7 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::fill(
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 template <layout_type layout>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_device(const matrix<value_type, layout> &data_to_copy) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     if (data_to_copy.size_padded() < this->size_padded()) {
         throw gpu_device_ptr_exception{ fmt::format("Too few data to perform copy (needed: {}, provided: {})!", this->size_padded(), data_to_copy.size_padded()) };
@@ -441,14 +441,14 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_device(const std::vector<value_type> &data_to_copy) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     this->copy_to_device(data_to_copy, 0, this->size_padded());
 }
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_device(const std::vector<value_type> &data_to_copy, const size_type pos, const size_type count) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     const size_type rcount = std::min(count, this->size_padded() - pos);
     if (data_to_copy.size() < rcount) {
@@ -459,7 +459,7 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_device(const_host_pointer_type data_to_copy) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
 
     this->copy_to_device(data_to_copy, 0, this->size_padded());
@@ -468,7 +468,7 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 template <layout_type layout>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_device_strided(const matrix<value_type, layout> &data_to_copy, const std::size_t start_row, const std::size_t num_rows) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     if (start_row + num_rows > data_to_copy.num_rows()) {
         throw gpu_device_ptr_exception{ fmt::format("Tried to copy lines {}-{} (zero-based index) to the device, but the matrix has only {} lines!", start_row, start_row + num_rows - 1, data_to_copy.num_rows()) };
@@ -494,7 +494,7 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_device_strided(const std::vector<value_type> &data_to_copy, std::size_t spitch, std::size_t width, std::size_t height) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     if (width > spitch) {
         throw gpu_device_ptr_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) };
@@ -509,7 +509,7 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 template <layout_type layout>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_host(matrix<value_type, layout> &buffer) const {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     if (buffer.size_padded() < this->size_padded()) {
         throw gpu_device_ptr_exception{ fmt::format("Buffer too small to perform copy (needed: {}, provided: {})!", this->size_padded(), buffer.size_padded()) };
@@ -519,14 +519,14 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_host(std::vector<value_type> &buffer) const {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     this->copy_to_host(buffer, 0, this->size_padded());
 }
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_host(std::vector<value_type> &buffer, const size_type pos, const size_type count) const {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     const size_type rcount = std::min(count, this->size_padded() - pos);
     if (buffer.size() < rcount) {
@@ -537,7 +537,7 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_host(host_pointer_type buffer) const {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!");
 
     this->copy_to_host(buffer, 0, this->size_padded());
@@ -545,8 +545,8 @@ void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_
 
 template <typename T, typename queue_t, typename device_pointer_t, typename derived_gpu_device_ptr>
 void gpu_device_ptr<T, queue_t, device_pointer_t, derived_gpu_device_ptr>::copy_to_other_device(derived_gpu_device_ptr &target) const {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
-    PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! Maybe target has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(target.get() != device_pointer_type{}, "Invalid target pointer! Maybe target has been default constructed?");
 
     this->copy_to_other_device(target, 0, this->size_padded());
 }

From 820c431778b42cfad2c2f289288f9402d62bbce3 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 21 Oct 2024 15:09:55 +0200
Subject: [PATCH 011/123] Implement missing device_ptr functionality.

---
 .../backends/Kokkos/detail/device_ptr.hpp     |  3 +-
 .../backends/Kokkos/detail/device_ptr.cpp     | 75 +++++++++++++++----
 tests/backends/generic_device_ptr_tests.hpp   | 40 +++++-----
 3 files changed, 81 insertions(+), 37 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
index 953faf3ed..8f587b667 100644
--- a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
@@ -105,8 +105,9 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, Kokkos::DefaultExe
 
     /**
      * @copydoc plssvm::detail::gpu_device_ptr::~gpu_device_ptr()
+     * @details Kokkos automatically frees the memory of a Kokkos::View if the View goes out of scope.
      */
-    ~device_ptr() override;
+    ~device_ptr() override = default;
 
     /**
      * @copydoc plssvm::detail::gpu_device_ptr::memset(int, size_type, size_type)
diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
index 22719482c..b176c1283 100644
--- a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -8,10 +8,11 @@
 
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"
 
-#include "plssvm/backends/Kokkos/exceptions.hpp"  // plssvm::kokkos::backend_exception
-#include "plssvm/detail/assert.hpp"               // PLSSVM_ASSERT
-#include "plssvm/exceptions/exceptions.hpp"       // plssvm::exception
-#include "plssvm/shape.hpp"                       // plssvm::shape
+#include "plssvm/backends/Kokkos/detail/utility.hpp"  // plssvm::detail::device_synchronize
+#include "plssvm/backends/Kokkos/exceptions.hpp"      // plssvm::kokkos::backend_exception
+#include "plssvm/detail/assert.hpp"                   // PLSSVM_ASSERT
+#include "plssvm/exceptions/exceptions.hpp"           // plssvm::exception
+#include "plssvm/shape.hpp"                           // plssvm::shape
 
 #include "Kokkos_Core.hpp"
 
@@ -39,22 +40,45 @@ device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding
     data_ = device_view_type<T>{ fmt::format("device_{}_view", exec.cuda_device()), this->size_padded() };
 }
 
-template <typename T>
-device_ptr<T>::~device_ptr() {
-    // Kokkos automatically frees the memory of a Kokkos::View if the View goes out of scope
-}
-
 template <typename T>
 void device_ptr<T>::memset(const int pattern, const size_type pos, const size_type num_bytes) {
+    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+
+    if (pos >= this->size_padded()) {
+        throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) };
+    }
+    const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type));
+
+    // create subview of the device data
+    auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + (rnum_bytes / sizeof(value_type))));
+    // fill subview with constant data
+    Kokkos::parallel_for("device_ptr_memset", num_bytes, KOKKOS_LAMBDA(const std::size_t idx) {
+        // Cast the view's data pointer to unsigned char* (byte access)
+        reinterpret_cast<unsigned char*>(data_subview.data())[idx] = pattern; });
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
 void device_ptr<T>::fill(const value_type value, const size_type pos, const size_type count) {
+    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+
+    if (pos >= this->size_padded()) {
+        throw backend_exception{ fmt::format("Illegal access in fill!: {} >= {}", pos, this->size_padded()) };
+    }
+    const size_type rcount = std::min(count, this->size_padded() - pos);
+
+    // create subview of the device data
+    auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + rcount));
+    // fill subview with constant data
+    Kokkos::deep_copy(data_subview, value);
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_device(const_host_pointer_type data_to_copy, const size_type pos, const size_type count) {
-    PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
 
     const size_type rcount = std::min(count, this->size_padded() - pos);
@@ -65,24 +89,39 @@ void device_ptr<T>::copy_to_device(const_host_pointer_type data_to_copy, const s
     auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + rcount));
     // copy the data to the device subview
     Kokkos::deep_copy(data_subview, host_view);
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) {
-    PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
 
     if (width > spitch) {
         throw backend_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) };
     }
 
-    Kokkos::View<T **, Kokkos::LayoutRight> view_2d{ data_.data(), this->shape_padded().x, this->shape_padded().y };
-    // TODO: implement
+    // TODO: strided copy to device in Kokkos currently not possible
+    if (spitch == width) {
+        // can use normal copy since we have no line strides
+        this->copy_to_device(data_to_copy, 0, width * height);
+    } else {
+        std::vector<value_type> temp(this->shape_padded().x * height, value_type{ 0.0 });
+        value_type *pos = temp.data();
+        for (std::size_t row = 0; row < height; ++row) {
+            std::memcpy(pos, data_to_copy + row * spitch, width * sizeof(value_type));
+            pos += this->shape_padded().x;
+        }
+        this->copy_to_device(temp);
+    }
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const {
-    PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!");
 
     const size_type rcount = std::min(count, this->size_padded() - pos);
@@ -93,12 +132,14 @@ void device_ptr<T>::copy_to_host(host_pointer_type buffer, const size_type pos,
     auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + rcount));
     // copy the data to the host
     Kokkos::deep_copy(host_view, data_subview);
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const {
-    PLSSVM_ASSERT(data_ != view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
-    PLSSVM_ASSERT(target.get() != view_type<T>{}, "Invalid target pointer! Maybe target has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(target.get() != device_view_type<T>{}, "Invalid target pointer! Maybe target has been default constructed?");
 
     const size_type rcount = std::min(count, this->size_padded() - pos);
     if (target.size_padded() < rcount) {
@@ -109,6 +150,8 @@ void device_ptr<T>::copy_to_other_device(device_ptr &target, const size_type pos
     std::vector<value_type> temp(rcount);
     this->copy_to_host(temp, pos, rcount);
     target.copy_to_device(temp);
+
+    detail::device_synchronize(queue_);
 }
 
 template class device_ptr<float>;
diff --git a/tests/backends/generic_device_ptr_tests.hpp b/tests/backends/generic_device_ptr_tests.hpp
index 6a8713dc7..3f2407005 100644
--- a/tests/backends/generic_device_ptr_tests.hpp
+++ b/tests/backends/generic_device_ptr_tests.hpp
@@ -46,7 +46,7 @@ TYPED_TEST_P(DevicePtr, default_construct) {
 
     // empty data
     EXPECT_FALSE(static_cast<bool>(ptr));
-    EXPECT_EQ(ptr.get(), nullptr);
+    EXPECT_EQ(ptr.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(ptr.size(), 0);
     EXPECT_EQ(ptr.shape(), (plssvm::shape{ 0, 0 }));
     EXPECT_TRUE(ptr.empty());
@@ -63,7 +63,7 @@ TYPED_TEST_P(DevicePtr, construct_size) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(ptr));
-    EXPECT_NE(ptr.get(), nullptr);
+    EXPECT_NE(ptr.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 1 }));
     // check padding
     EXPECT_EQ(ptr.padding(), (plssvm::shape{ 0, 0 }));
@@ -81,7 +81,7 @@ TYPED_TEST_P(DevicePtr, construct_shape) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(ptr));
-    EXPECT_NE(ptr.get(), nullptr);
+    EXPECT_NE(ptr.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 16 }));
     // check padding
     EXPECT_EQ(ptr.padding(), (plssvm::shape{ 0, 0 }));
@@ -99,7 +99,7 @@ TYPED_TEST_P(DevicePtr, construct_shape_and_padding) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(ptr));
-    EXPECT_NE(ptr.get(), nullptr);
+    EXPECT_NE(ptr.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 16 }));
     // check padding
     EXPECT_EQ(ptr.padding(), (plssvm::shape{ 4, 4 }));
@@ -119,7 +119,7 @@ TYPED_TEST_P(DevicePtr, move_construct) {
     // check data
     EXPECT_TRUE(static_cast<bool>(second));
     // EXPECT_EQ(second.queue(), queue);
-    EXPECT_NE(second.get(), nullptr);
+    EXPECT_NE(second.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 }));
     // check padding
     EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 }));
@@ -127,7 +127,7 @@ TYPED_TEST_P(DevicePtr, move_construct) {
 
     // check moved-from data
     EXPECT_FALSE(static_cast<bool>(first));
-    EXPECT_EQ(first.get(), nullptr);
+    EXPECT_EQ(first.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 }));
     // check padding
     EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 }));
@@ -147,7 +147,7 @@ TYPED_TEST_P(DevicePtr, move_construct_with_padding) {
     // check data
     EXPECT_TRUE(static_cast<bool>(second));
     // EXPECT_EQ(second.queue(), queue);
-    EXPECT_NE(second.get(), nullptr);
+    EXPECT_NE(second.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 }));
     // check padding
     EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 }));
@@ -155,7 +155,7 @@ TYPED_TEST_P(DevicePtr, move_construct_with_padding) {
 
     // check moved-from data
     EXPECT_FALSE(static_cast<bool>(first));
-    EXPECT_EQ(first.get(), nullptr);
+    EXPECT_EQ(first.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 }));
     // check padding
     EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 }));
@@ -177,7 +177,7 @@ TYPED_TEST_P(DevicePtr, move_assign) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(second));
-    EXPECT_NE(second.get(), nullptr);
+    EXPECT_NE(second.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 }));
     // check padding
     EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 }));
@@ -185,7 +185,7 @@ TYPED_TEST_P(DevicePtr, move_assign) {
 
     // check moved-from data
     EXPECT_FALSE(static_cast<bool>(first));
-    EXPECT_EQ(first.get(), nullptr);
+    EXPECT_EQ(first.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 }));
     // check padding
     EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 }));
@@ -207,7 +207,7 @@ TYPED_TEST_P(DevicePtr, move_assign_with_padding) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(second));
-    EXPECT_NE(second.get(), nullptr);
+    EXPECT_NE(second.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 }));
     // check padding
     EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 }));
@@ -215,7 +215,7 @@ TYPED_TEST_P(DevicePtr, move_assign_with_padding) {
 
     // check moved-from data
     EXPECT_FALSE(static_cast<bool>(first));
-    EXPECT_EQ(first.get(), nullptr);
+    EXPECT_EQ(first.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 }));
     // check padding
     EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 }));
@@ -237,14 +237,14 @@ TYPED_TEST_P(DevicePtr, swap_member_function) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(second));
-    EXPECT_NE(second.get(), nullptr);
+    EXPECT_NE(second.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 }));
     // check padding
     EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 }));
     EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 42, 1 }));
 
     EXPECT_FALSE(static_cast<bool>(first));
-    EXPECT_EQ(first.get(), nullptr);
+    EXPECT_EQ(first.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 }));
     // check padding
     EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 }));
@@ -266,14 +266,14 @@ TYPED_TEST_P(DevicePtr, swap_member_function_with_padding) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(second));
-    EXPECT_NE(second.get(), nullptr);
+    EXPECT_NE(second.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 }));
     // check padding
     EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 }));
     EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 46, 15 }));
 
     EXPECT_FALSE(static_cast<bool>(first));
-    EXPECT_EQ(first.get(), nullptr);
+    EXPECT_EQ(first.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 }));
     // check padding
     EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 }));
@@ -296,14 +296,14 @@ TYPED_TEST_P(DevicePtr, swap_free_function) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(second));
-    EXPECT_NE(second.get(), nullptr);
+    EXPECT_NE(second.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 }));
     // check padding
     EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 }));
     EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 42, 1 }));
 
     EXPECT_FALSE(static_cast<bool>(first));
-    EXPECT_EQ(first.get(), nullptr);
+    EXPECT_EQ(first.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 }));
     // check padding
     EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 }));
@@ -326,14 +326,14 @@ TYPED_TEST_P(DevicePtr, swap_free_function_with_padding) {
 
     // check data
     EXPECT_TRUE(static_cast<bool>(second));
-    EXPECT_NE(second.get(), nullptr);
+    EXPECT_NE(second.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 }));
     // check padding
     EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 }));
     EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 46, 15 }));
 
     EXPECT_FALSE(static_cast<bool>(first));
-    EXPECT_EQ(first.get(), nullptr);
+    EXPECT_EQ(first.get(), typename device_ptr_type::device_pointer_type{});
     EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 }));
     // check padding
     EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 }));

From 2fc2616318b39d56ada99f533450ce78a9076983 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 21 Oct 2024 15:18:48 +0200
Subject: [PATCH 012/123] Add total_size function to dim_type struct.

---
 include/plssvm/backends/execution_range.hpp |  9 +++++++++
 tests/backends/execution_range.cpp          | 14 ++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/include/plssvm/backends/execution_range.hpp b/include/plssvm/backends/execution_range.hpp
index 52c78f8e1..c44aab7b6 100644
--- a/include/plssvm/backends/execution_range.hpp
+++ b/include/plssvm/backends/execution_range.hpp
@@ -77,6 +77,15 @@ struct [[nodiscard]] dim_type {
         swap_ull(z, other.z);
     }
 
+    /**
+     * @brief Return the total number of elements in the dimensional type.
+     * @details Equal to: `x * y * z`.
+     * @return the total number of elements (`[[nodiscard]]`)
+     */
+    [[nodiscard]] constexpr unsigned long long total_size() const noexcept {
+        return x * y * z;
+    }
+
     /// The dimensional size in x direction.
     unsigned long long x{ 1 };
     /// The dimensional size in y direction.
diff --git a/tests/backends/execution_range.cpp b/tests/backends/execution_range.cpp
index 75fe16ef2..866dae83a 100644
--- a/tests/backends/execution_range.cpp
+++ b/tests/backends/execution_range.cpp
@@ -94,6 +94,20 @@ TEST(DimType, swap_free_function) {
     EXPECT_EQ(dim2.z, 1ull);
 }
 
+TEST(DimType, total_size) {
+    // create dim types
+    constexpr plssvm::detail::dim_type dim1{};
+    constexpr plssvm::detail::dim_type dim2{ 64ull };
+    constexpr plssvm::detail::dim_type dim3{ 64ull, 32ull };
+    constexpr plssvm::detail::dim_type dim4{ 64ull, 32ull, 16ull };
+
+    // test total_size function
+    EXPECT_EQ(dim1.total_size(), 1ull);
+    EXPECT_EQ(dim2.total_size(), 64ull);
+    EXPECT_EQ(dim3.total_size(), 2048ull);
+    EXPECT_EQ(dim4.total_size(), 32768ull);
+}
+
 TEST(DimType, equality) {
     // create dim types
     constexpr plssvm::detail::dim_type dim1{};

From 83dd160ad005039406658b08a67d6d191e231cfc Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 21 Oct 2024 15:20:33 +0200
Subject: [PATCH 013/123] Add first working (but WIP) Kokkos backend
 implementation.

---
 .../Kokkos/detail/standard_layout_tuple.hpp   | 129 ++++++
 .../plssvm/backends/Kokkos/detail/utility.hpp |   3 +-
 .../Kokkos/kernel/cg_explicit/blas.hpp        | 359 +++++++++++++++-
 .../cg_explicit/kernel_matrix_assembly.hpp    | 145 ++++++-
 .../kernel_matrix_assembly_blas.hpp           | 253 +++++++++++-
 .../Kokkos/kernel/kernel_functions.hpp        |  36 +-
 .../backends/Kokkos/kernel/predict_kernel.hpp | 371 +++++++++++++++++
 src/main_predict.cpp                          |   4 +-
 src/main_train.cpp                            |   5 +-
 src/plssvm/backends/Kokkos/csvm.cpp           | 384 ++++++++++--------
 src/plssvm/backends/Kokkos/detail/utility.cpp |   4 +-
 11 files changed, 1496 insertions(+), 197 deletions(-)
 create mode 100644 include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp

diff --git a/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp b/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp
new file mode 100644
index 000000000..3f5fddddd
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp
@@ -0,0 +1,129 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Implementation of a basic and minimalistic tuple class which is standard-layout conform.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_STANDARD_LAYOUT_TUPLE_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_STANDARD_LAYOUT_TUPLE_HPP_
+#pragma once
+
+#include "plssvm/constants.hpp"  // plssvm::real_type
+
+#include <cstddef>      // std::size_t
+#include <type_traits>  // std::is_standard_layout
+#include <utility>      // std::forward
+
+namespace plssvm::kokkos::detail {
+
+/*
+ * Empty base implementation.
+ */
+template <typename...>
+struct standard_layout_tuple;
+
+/**
+ * @brief Save the value of type @p T as scalar and the remaining values of type @p Rest recursively in another standard layout tuple.
+ * @tparam T the type of the value to save in this tuple
+ * @tparam Rest the remaining types saved in a recursive tuple
+ */
+template <typename T, typename... Rest>
+struct standard_layout_tuple<T, Rest...> {
+    /// The stored value.
+    T value;
+    /// The remaining values stored in their own tuple.
+    standard_layout_tuple<Rest...> remaining;
+};
+
+/**
+ * @brief Special case for an empty tuple (recursion termination criterion).
+ */
+template <>
+struct standard_layout_tuple<> { };
+
+namespace impl {
+
+/**
+ * @brief Recursively traverse (at compile time) the tuple @p t and retrieve the value at position @p I.
+ * @tparam I the index of the tuple value to get
+ */
+template <std::size_t I>
+struct get_impl {
+    /**
+     * @brief Recursively traverse (at compile time) the tuple @p t and retrieve the value at position @p I.
+     * @tparam Types the types in the tuple
+     * @param[in] t the tuple to traverse
+     * @return the requested value (`[[nodiscard]]`)
+     */
+    template <typename... Types>
+    KOKKOS_INLINE_FUNCTION constexpr static auto get(const standard_layout_tuple<Types...> &t) {
+        return get_impl<I - 1>::get(t.remaining);
+    }
+};
+
+/**
+ * @brief Special case to retrieve the currently held value (recursion termination criterion).
+ */
+template <>
+struct get_impl<0> {
+    /**
+     * @brief Get the held value from @p t.
+     * @tparam Types the types in the tuple
+     * @param[in] t the tuple to get the value from
+     * @return the requested value (`[[nodiscard]]`)
+     */
+    template <typename... Types>
+    KOKKOS_INLINE_FUNCTION constexpr static auto get(const standard_layout_tuple<Types...> &t) {
+        return t.value;
+    }
+};
+
+}  // namespace impl
+
+/**
+ * @brief Get the value at position @p I of the tuple @p t holding the @p Types.
+ * @tparam I the position of the element in the tuple to get
+ * @tparam Types the types stored in the tuple
+ * @param[in] t the tuple
+ * @return the value of the tuple @p t at position @p I (`[[nodiscard]]`)
+ */
+template <std::size_t I, typename... Types>
+KOKKOS_INLINE_FUNCTION constexpr auto get(const standard_layout_tuple<Types...> &t) {
+    static_assert(I < sizeof...(Types), "Invalid standard_layout_tuple index!");
+    return impl::get_impl<I>::get(t);
+}
+
+/**
+ * @brief Special case: return an empty tuple if no values have bee provided.
+ * @return an empty tuple (`[[nodiscard]]`)
+ */
+[[nodiscard]] inline constexpr standard_layout_tuple<> make_standard_layout_tuple() {
+    return standard_layout_tuple<>{};
+}
+
+/**
+ * @brief Create a new tuple storing the values @p arg and @p remaining.
+ * @tparam T the type of the first value
+ * @tparam Rest the types of the remaining values (if any)
+ * @param[in,out] arg the first value
+ * @param[in,out] remaining the remaining values (if any)
+ * @return the constructed tuple (`[[nodiscard]]`)
+ */
+template <typename T, typename... Rest>
+[[nodiscard]] inline constexpr standard_layout_tuple<T, Rest...> make_standard_layout_tuple(T &&arg, Rest &&...remaining) {
+    return standard_layout_tuple<T, Rest...>{ std::forward<T>(arg), make_standard_layout_tuple(std::forward<Rest>(remaining)...) };
+}
+
+// sanity checks: be sure that the important use cases are indeed standard layout types!
+static_assert(std::is_standard_layout_v<standard_layout_tuple<>>, "standard_layout_tuple<> has no standard layout!");
+static_assert(std::is_standard_layout_v<standard_layout_tuple<int, real_type, real_type>>, "standard_layout_tuple<int, real_type, real_type> has no standard layout!");
+static_assert(std::is_standard_layout_v<standard_layout_tuple<real_type>>, "standard_layout_tuple<real_type> has no standard layout!");
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_STANDARD_LAYOUT_TUPLE_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/utility.hpp b/include/plssvm/backends/Kokkos/detail/utility.hpp
index 3b7a9c706..523900aa9 100644
--- a/include/plssvm/backends/Kokkos/detail/utility.hpp
+++ b/include/plssvm/backends/Kokkos/detail/utility.hpp
@@ -80,7 +80,8 @@ void check_execution_space_target_platform_combination(execution_space space, ta
 
 [[nodiscard]] std::string get_device_name(execution_space space, std::size_t device_id);
 
-void device_synchronize_all();
+void device_synchronize(const Kokkos::DefaultExecutionSpace& exec);
+
 
 [[nodiscard]] std::string get_kokkos_version();
 
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
index 79f96283e..c12220b0b 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
@@ -13,30 +13,387 @@
 #define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_
 #pragma once
 
-#include "plssvm/constants.hpp"  // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"  // TODO: view type aliases
+#include "plssvm/constants.hpp"                          // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 
 #include "Kokkos_Core.hpp"  // TODO:
 
 namespace plssvm::kokkos::detail {
 
+/**
+ * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars.
+ */
 class device_kernel_symm {
   public:
+    /**
+     * @brief Initialize the Kokkos kernel function object.
+     * @param[in] num_rows the number of rows in @p A and @p C
+     * @param[in] num_rhs the number of columns in @p B and @p C
+     * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices
+     * @param[in] row_offset the first row this device is responsible for
+     * @param[in] alpha the scalar alpha value
+     * @param[in] A the matrix @p A
+     * @param[in] B the matrix @p B
+     * @param[in] beta the scalar beta value
+     * @param[in,out] C the matrix @p C, also used as result matrix
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     */
+    device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type<const real_type> A, device_view_type<const real_type> B, const real_type beta, device_view_type<real_type> C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
+        num_rows_{ num_rows },
+        num_rhs_{ num_rhs },
+        device_specific_num_rows_{ device_specific_num_rows },
+        row_offset_{ row_offset },
+        alpha_{ alpha },
+        A_{ A },
+        B_{ B },
+        beta_{ beta },
+        C_{ C },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x } { }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto FEATURE_BLOCK_SIZE_sz = static_cast<std::size_t>(FEATURE_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // calculate the indices used in the current thread
+        const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;  // # rhs -> num_rhs
+        const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+        const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz;  // # rows -> num_mirror_rows
+        const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+
+        // create the shared memory arrays used for caching data point features
+        constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE;
+        real_type *data_cache_ptr = static_cast<real_type *>(team.team_shmem().get_shmem(2 * shmem_size * sizeof(real_type)));
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> A_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> B_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+        // create a thread private array used for internal caching
+        real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{};
+
+        // iterate over all features using blocking to be able to cache them for faster memory accesses
+        for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_sz) {
+            // load data into shared memory
+            for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                const auto global_i = i_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+                const auto global_j = j_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+
+                // determine on which side of the diagonal we are located
+                if (dim + threadIdx_y < global_j) {
+                    A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }];
+                } else {
+                    A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + dim + threadIdx_y - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }];
+                }
+                // determine on which side of the diagonal we are located
+                if (dim + threadIdx_y + THREAD_BLOCK_SIZE < global_j) {
+                    A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz + std::size_t{ 1 }) / std::size_t{ 2 }];
+                } else {
+                    A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }];
+                }
+
+                B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(dim + row_offset_ + threadIdx_y) * (num_rhs_ + PADDING_SIZE_sz) + global_i];
+                B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(dim + row_offset_ + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rhs_ + PADDING_SIZE_sz) + global_i];
+            }
+            team.team_barrier();  // wait until all threads loaded their part of the data
+
+            // perform the dot product calculation
+            for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) {
+                for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                    for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                        temp[internal_i][internal_j] += A_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i);
+                    }
+                }
+            }
+            team.team_barrier();  // wait until all threads performed their part of the calculations
+        }
+
+        // apply the (partial) BLAS operation and update C
+        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                const auto global_i = i + static_cast<std::size_t>(internal_i);
+                const auto device_global_j = j + static_cast<std::size_t>(internal_j);
+                const auto global_j = row_offset_ + j + static_cast<std::size_t>(internal_j);
+
+                // be sure to not perform out of bounds accesses
+                if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) {
+                    C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i];
+                }
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    const std::size_t num_rows_;
+    const std::size_t num_rhs_;
+    const std::size_t device_specific_num_rows_;
+    const std::size_t row_offset_;
+    const real_type alpha_;
+    device_view_type<const real_type> A_;
+    device_view_type<const real_type> B_;
+    const real_type beta_;
+    device_view_type<real_type> C_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    /// @endcond
 };
 
+/**
+ * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars.
+ * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for!
+ */
 class device_kernel_symm_mirror {
   public:
+    /**
+     * @brief Initialize the Kokkos kernel function object.
+     * @param[in] num_rows the number of rows in @p A and @p C
+     * @param[in] num_rhs the number of columns in @p B and @p C
+     * @param[in] num_mirror_rows the number of rows to mirror down
+     * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices
+     * @param[in] row_offset the first row this device is responsible for
+     * @param[in] alpha the scalar alpha value
+     * @param[in] A the matrix @p A
+     * @param[in] B the matrix @p B
+     * @param[in] beta the scalar beta value
+     * @param[in,out] C the matrix @p C, also used as result matrix
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     */
+    device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type<const real_type> A, device_view_type<const real_type> B, const real_type beta, device_view_type<real_type> C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
+        num_rows_{ num_rows },
+        num_rhs_{ num_rhs },
+        num_mirror_rows_{ num_mirror_rows },
+        device_specific_num_rows_{ device_specific_num_rows },
+        row_offset_{ row_offset },
+        alpha_{ alpha },
+        A_{ A },
+        B_{ B },
+        beta_{ beta },
+        C_{ C },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x } { }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto FEATURE_BLOCK_SIZE_sz = static_cast<std::size_t>(FEATURE_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // calculate the indices used in the current thread
+        const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;  // # rhs -> num_rhs
+        const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+        const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz;  // # rows -> num_mirror_rows
+        const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+
+        // create the shared memory arrays used for caching data point features
+        constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE;
+        real_type *data_cache_ptr = static_cast<real_type *>(team.team_shmem().get_shmem(2 * shmem_size * sizeof(real_type)));
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> A_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> B_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+        // create a thread private array used for internal caching
+        real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{};
+
+        // iterate over the remaining features using blocking to be able to cache them for faster memory accesses
+        for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_sz) {
+            // load data into shared memory
+            for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                const auto global_i = i_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+                const auto global_j = j_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+
+                // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
+                A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_y) + global_j];
+                A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz - std::size_t{ 1 }) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) + global_j];
+                B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(row_offset_ + dim + threadIdx_y) * (num_rhs_ + PADDING_SIZE_sz) + global_i];
+                B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(row_offset_ + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rhs_ + PADDING_SIZE_sz) + global_i];
+            }
+            team.team_barrier();  // wait until all threads loaded their part of the data
+
+            // perform the feature reduction calculation
+            for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) {
+                for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                    for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                        temp[internal_i][internal_j] += A_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i);
+                    }
+                }
+            }
+            team.team_barrier();  // wait until all threads performed their part of the calculations
+        }
+
+        // apply the (remaining) BLAS operation and update C
+        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                const auto global_i = i + static_cast<std::size_t>(internal_i);
+                const auto partial_global_j = j + static_cast<std::size_t>(internal_j);
+                const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast<std::size_t>(internal_j);
+
+                // be sure to not perform out of bounds accesses
+                if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) {
+                    C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i];
+                }
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    const std::size_t num_rows_;
+    const std::size_t num_rhs_;
+    const std::size_t num_mirror_rows_;
+    const std::size_t device_specific_num_rows_;
+    const std::size_t row_offset_;
+    const real_type alpha_;
+    device_view_type<const real_type> A_;
+    device_view_type<const real_type> B_;
+    const real_type beta_;
+    device_view_type<real_type> C_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    /// @endcond
 };
 
+/**
+ * @brief Perform a simple inplace matrix addition: lhs += rhs.
+ */
 class device_kernel_inplace_matrix_add {
   public:
+    /**
+     * @brief Initialize the Kokkos kernel function object.
+     * @param[in] num_cols the number of columns in both matrices
+     * @param[in,out] lhs the first matrix (updated inplace)
+     * @param[in] rhs the second matrix
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     */
+    device_kernel_inplace_matrix_add(const std::size_t num_cols, device_view_type<real_type> lhs, device_view_type<const real_type> rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
+        num_cols_{ num_cols },
+        lhs_{ lhs },
+        rhs_{ rhs },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x } { }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // Calculate the indices used in the current thread
+        const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;  // num_rows
+        const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz;  // num_rhs
+
+        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                const auto global_i = i + static_cast<std::size_t>(internal_i);
+                const auto global_j = j + static_cast<std::size_t>(internal_j);
+
+                // if (global_i < lhs_.extent(0) && global_j < rhs_.extent(0)) {  // TODO:
+                lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j];
+                // }
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    const std::size_t num_cols_;
+    device_view_type<real_type> lhs_;
+    device_view_type<const real_type> rhs_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    /// @endcond
 };
 
+/**
+ * @brief Perform a simple inplace matrix scale: lhs *= scalar.
+ */
 class device_kernel_inplace_matrix_scale {
   public:
+    /**
+     * @brief Initialize the Kokkos kernel function object.
+     * @param[in] num_cols the number of columns in the matrix
+     * @param[in,out] lhs the first matrix (updated inplace)
+     * @param[in] scale the value to scale
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     */
+    device_kernel_inplace_matrix_scale(const std::size_t num_cols, device_view_type<real_type> lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
+        num_cols_{ num_cols },
+        lhs_{ lhs },
+        scale_{ scale },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x } { }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // Calculate the indices used in the current thread
+        const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;  // num_rows
+        const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz;  // num_rhs
+
+        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                const auto global_i = i + static_cast<std::size_t>(internal_i);
+                const auto global_j = j + static_cast<std::size_t>(internal_j);
+
+                // if (global_i < lhs_.extent(0)) {  // TODO:
+                lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] *= scale_;
+                // }
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    const std::size_t num_cols_;
+    device_view_type<real_type> lhs_;
+    const real_type scale_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    /// @endcond
 };
 
 }  // namespace plssvm::kokkos::detail
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
index ff74257b9..ad9397377 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
@@ -13,18 +13,157 @@
 #define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"  // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
-#include "plssvm/constants.hpp"                                // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
-#include "plssvm/kernel_function_types.hpp"                    // plssvm::kernel_function_type
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"             // TODO: view type aliases
+#include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp"  // plssvm::kokkos::detail::standard_layout_tuple
+#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"       // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                                     // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                         // plssvm::kernel_function_type
 
 #include "Kokkos_Core.hpp"  // TODO:
 
+#include <cstddef>  // std::size_t
+
 namespace plssvm::kokkos::detail {
 
+/**
+ * @brief Create the explicit kernel matrix using the @p kernel_function.
+ * @tparam kernel_function the type of the used kernel function
+ * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `standard_layout_tuple`
+ */
 template <kernel_function_type kernel_function, typename... Args>
 class device_kernel_assembly {
   public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
+     * @param[in] cgh the SYCL handler used to allocate the local memory
+     * @param[out] kernel_matrix_d the calculated kernel matrix
+     * @param[in] data_d the data points to calculate the kernel matrix from
+     * @param[in] num_rows the number of data points
+     * @param[in] device_num_rows the number of rows the current device is responsible for
+     * @param[in] row_offset the first row in @p data_d the current device is responsible for
+     * @param[in] num_features the number of features per data point
+     * @param[in] q the vector used in the dimensional reduction
+     * @param[in] QA_cost the scalar used in the dimensional reduction
+     * @param[in] cost the cost factor the diagonal is scaled with
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
+     */
+    device_kernel_assembly(device_view_type<real_type> kernel_matrix_d, device_view_type<real_type> data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, device_view_type<real_type> q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) :
+        kernel_matrix_d_{ kernel_matrix_d },
+        data_d_{ data_d },
+        num_rows_{ num_rows },
+        device_num_rows_{ device_num_rows },
+        row_offset_{ row_offset },
+        num_features_{ num_features },
+        q_{ q },
+        QA_cost_{ QA_cost },
+        cost_{ cost },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x },
+        kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward<Args>(kernel_function_parameter)...) } {
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto FEATURE_BLOCK_SIZE_sz = static_cast<std::size_t>(FEATURE_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // calculate the indices used in the current thread
+        const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;  // # rhs -> num_rhs
+        const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+        const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz;  // # rows -> num_mirror_rows
+        const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+
+        // create the shared memory arrays used for caching data point features
+        constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE;
+        real_type *data_cache_ptr = static_cast<real_type *>(team.team_shmem().get_shmem(2 * shmem_size));
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_j{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+        // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further
+        if (blockIdx_x >= blockIdx_y) {
+            // create a thread private array used for internal caching
+            real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{};
+
+            // iterate over all features using blocking to be able to cache them for faster memory accesses
+            for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) {
+                // load data into shared memory
+                for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                    const auto global_i = row_offset_ + i_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+                    const auto global_j = row_offset_ + j_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+
+                    // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
+                    data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + 1ull + PADDING_SIZE_sz) + global_i];
+                    data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + 1ull + PADDING_SIZE_sz) + global_i];
+                    data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + 1ull + PADDING_SIZE_sz) + global_j];
+                    data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + 1ull + PADDING_SIZE_sz) + global_j];
+                }
+                team.team_barrier();  // wait until all threads loaded their part of the data
+
+                // perform the feature reduction calculation
+                for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) {
+                    for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                        for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                            temp[internal_i][internal_j] += detail::feature_reduce<kernel_function>(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i),
+                                                                                                    data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j));
+                        }
+                    }
+                }
+                team.team_barrier();  // wait until all threads performed their part of the calculations
+            }
+
+            // apply the remaining part of the kernel function and store the value in the output kernel matrix
+            for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                    // calculate the indices to access the kernel matrix (the part stored on the current device)
+                    const auto device_global_i = i + static_cast<std::size_t>(internal_i);
+                    const auto global_i = row_offset_ + i + static_cast<std::size_t>(internal_i);
+                    const auto device_global_j = j + static_cast<std::size_t>(internal_j);
+                    const auto global_j = row_offset_ + j + static_cast<std::size_t>(internal_j);
+
+                    // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix)
+                    if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) {
+                        real_type temp_ij = temp[internal_i][internal_j];
+                        temp_ij = detail::apply_kernel_function<kernel_function>(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j];
+                        // apply the cost on the diagonal
+                        if (global_i == global_j) {
+                            temp_ij += cost_;
+                        }
+                        // update the kernel matrix
+                        kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij;
+                    }
+                }
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    device_view_type<real_type> kernel_matrix_d_;
+    device_view_type<const real_type> data_d_;
+    const std::size_t num_rows_;
+    const std::size_t device_num_rows_;
+    const std::size_t row_offset_;
+    const std::size_t num_features_;
+    device_view_type<const real_type> q_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    const detail::standard_layout_tuple<Args...> kernel_function_parameter_;
+    /// @endcond
 };
 
 }  // namespace plssvm::kokkos::detail
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
index 2d9e855b2..2f0f6619c 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
@@ -9,24 +9,265 @@
  * @brief Functions for implicitly assembling the kernel matrix using the Kokkos backend.
  */
 
-#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
-#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#ifndef PLSSVM_BACKENDS_KOKKOS_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"  // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
-#include "plssvm/constants.hpp"                                // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
-#include "plssvm/kernel_function_types.hpp"                    // plssvm::kernel_function_type
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"             // TODO: view type aliases
+#include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp"  // plssvm::kokkos::detail::standard_layout_tuple
+#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"       // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                                     // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                         // plssvm::kernel_function_type
 
 #include "Kokkos_Core.hpp"  // TODO: Kokkos::atomic_add
 
 namespace plssvm::kokkos::detail {
 
+/**
+ * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
+ * @tparam kernel_function the type of the used kernel function
+ * @tparam Args the types of the parameters necessary for the specific kernel function
+ */
 template <kernel_function_type kernel_function, typename... Args>
 class device_kernel_assembly_symm {
   public:
+    /**
+     * @brief Initialize the Kokkos kernel function object.
+     * @param[in] alpha the scalar alpha value
+     * @param[in] q the vector used in the dimensional reduction
+     * @param[in] data_d the data points to calculate the implicit kernel matrix from
+     * @param[in] num_rows the total number of data points (= total number of rows)
+     * @param[in] device_num_rows the number of rows the current device is responsible for
+     * @param[in] row_offset the first row in @p data_d the current device is responsible for
+     * @param[in] num_features the number of features per data point
+     * @param[in] QA_cost the scalar used in the dimensional reduction
+     * @param[in] cost the cost factor the diagonal is scaled with
+     * @param[in] B the matrix @p B
+     * @param[in,out] C the matrix @p C
+     * @param[in] num_classes the number of classes in the data set
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
+     */
+    device_kernel_assembly_symm(const real_type alpha, device_view_type<const real_type> q, device_view_type<const real_type> data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, device_view_type<const real_type> B, device_view_type<real_type> C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) :
+        alpha_{ alpha },
+        q_{ q },
+        data_d_{ data_d },
+        num_rows_{ num_rows },
+        device_num_rows_{ device_num_rows },
+        row_offset_{ row_offset },
+        num_features_{ num_features },
+        QA_cost_{ QA_cost },
+        cost_{ cost },
+        B_{ B },
+        C_{ C },
+        num_classes_{ num_classes },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x },
+        kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward<Args>(kernel_function_parameter)...) } { }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto FEATURE_BLOCK_SIZE_sz = static_cast<std::size_t>(FEATURE_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // calculate the indices used in the current thread
+        const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;  // # rhs -> num_rhs
+        const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+        const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz;  // # rows -> num_mirror_rows
+        const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+
+        // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further
+        if (blockIdx_x >= blockIdx_y) {
+            // create a thread private array used for internal caching
+            real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{};
+
+            // create the shared memory arrays used for caching data point features
+            constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE;
+            real_type *data_cache_ptr = static_cast<real_type *>(team.team_shmem().get_shmem(2 * shmem_size));
+
+            {
+                // create the shared memory arrays used for caching data point features
+                Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+                Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_j{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+                // iterate over all features using blocking to be able to cache them for faster memory accesses
+                for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) {
+                    // load data into shared memory
+                    for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                        const auto global_i = row_offset_ + i_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+                        const auto global_j = row_offset_ + j_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+
+                        // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
+                        data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i];
+                        data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i];
+                        data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j];
+                        data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j];
+                    }
+                    team.team_barrier();  // wait until all threads loaded their part of the data
+
+                    // perform the feature reduction calculation
+                    for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) {
+                        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                                temp[internal_i][internal_j] += detail::feature_reduce<kernel_function>(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i),
+                                                                                                        data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j));
+                            }
+                        }
+                    }
+                    team.team_barrier();  // wait until all threads performed their part of the calculations
+                }
+            }
+
+            // apply the remaining part of the kernel function and store the value in the output kernel matrix
+            for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                    const auto global_i = row_offset_ + i + static_cast<std::size_t>(internal_i);
+                    const auto device_global_i = i + static_cast<std::size_t>(internal_i);
+                    const auto global_j = row_offset_ + j + static_cast<std::size_t>(internal_j);
+                    const auto device_global_j = j + static_cast<std::size_t>(internal_j);
+
+                    // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix)
+                    if ((device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j)) {
+                        temp[internal_i][internal_j] = detail::apply_kernel_function<kernel_function>(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j];
+                        // apply the cost on the diagonal
+                        if (global_i == global_j) {
+                            temp[internal_i][internal_j] += cost_;
+                        }
+                    } else {
+                        // be sure to set the value to zero otherwise
+                        temp[internal_i][internal_j] = real_type{ 0.0 };
+                    }
+                }
+            }
+
+            // calculate C += alpha * temp * B for the UPPER triangular matrix
+            {
+                // same shared memory size but with different dimensions
+                Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> B_cache{ data_cache_ptr, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE, FEATURE_BLOCK_SIZE };
+                Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> C_out_cache{ data_cache_ptr + shmem_size, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE, FEATURE_BLOCK_SIZE };
+
+                // iterate over all classes using blocking to be able to cache them for faster memory accesses
+                for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) {
+                    // load data into shared memory
+                    for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                        const auto global_i = row_offset_ + i_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+
+                        // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
+                        B_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y) = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y];
+                        B_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y + THREAD_BLOCK_SIZE) = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz];
+                        C_out_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y) = real_type{ 0.0 };
+                        C_out_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y + THREAD_BLOCK_SIZE) = real_type{ 0.0 };
+                    }
+                    team.team_barrier();  // wait until all threads loaded their part of the data
+
+                    // calculate intermediate results and store them in shared memory
+                    for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) {
+                        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                                C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j, (class_idx + threadIdx_x) % FEATURE_BLOCK_SIZE) +=
+                                    temp[internal_i][internal_j] * B_cache(threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i, (class_idx + threadIdx_x) % FEATURE_BLOCK_SIZE);
+                            }
+                        }
+                        team.team_barrier();  // wait until all threads performed their part of the calculations
+                    }
+
+                    // add intermediate cached results to C
+                    for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                        const auto global_j = row_offset_ + j + static_cast<std::size_t>(internal);
+                        Kokkos::atomic_add(&C_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_x], C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal, threadIdx_x));
+                        Kokkos::atomic_add(&C_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_sz], C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal, threadIdx_x + THREAD_BLOCK_SIZE));
+                    }
+                    team.team_barrier();  // wai until all threads updated C with their values
+                }
+            }
+
+            // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C
+            for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                    const auto global_i = row_offset_ + i + static_cast<std::size_t>(internal_i);
+                    const auto global_j = row_offset_ + j + static_cast<std::size_t>(internal_j);
+
+                    if (global_i == global_j) {
+                        temp[internal_i][internal_j] = real_type{ 0.0 };
+                    }
+                }
+            }
+
+            // calculate C += alpha * temp * B for the LOWER triangular matrix
+            {
+                // same shared memory size but with different dimensions
+                Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> B_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+                Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> C_out_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+                // iterate over all classes using blocking to be able to cache them for faster memory accesses
+                for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) {
+                    // load data into shared memory
+                    for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                        const auto global_j = row_offset_ + j_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+
+                        // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
+                        B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y];
+                        B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz];
+                        C_out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 };
+                        C_out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 };
+                    }
+                    team.team_barrier();  // wait until all threads loaded their part of the data
+
+                    // calculate intermediate results and store them in shared memory
+                    for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) {
+                        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+                            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+                                C_out_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, internal_i * THREAD_BLOCK_SIZE + threadIdx_x) +=
+                                    temp[internal_i][internal_j] * B_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j);
+                            }
+                        }
+                        team.team_barrier();  // wait until all threads performed their part of the calculations
+                    }
+
+                    // add intermediate cached results to C
+                    for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                        const auto global_i = row_offset_ + i + static_cast<std::size_t>(internal);
+                        Kokkos::atomic_add(&C_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y], C_out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x));
+                        Kokkos::atomic_add(&C_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz], C_out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x));
+                    }
+                    team.team_barrier();  // wait until all threads updated C with their values
+                }
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    const real_type alpha_;
+    device_view_type<const real_type> q_;
+    device_view_type<const real_type> data_d_;
+    const std::size_t num_rows_;
+    const std::size_t device_num_rows_;
+    const std::size_t row_offset_;
+    const std::size_t num_features_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    device_view_type<const real_type> B_;
+    device_view_type<real_type> C_;
+    const std::size_t num_classes_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    const detail::standard_layout_tuple<Args...> kernel_function_parameter_;
+    /// @endcond
 };
 
 }  // namespace plssvm::kokkos::detail
 
-#endif  // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#endif  // PLSSVM_BACKENDS_KOKKOS_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
index f7f422659..952b1e99f 100644
--- a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
@@ -12,14 +12,14 @@
 #ifndef PLSSVM_BACKENDS_KOKKOS_KERNEL_KERNEL_FUNCTIONS_HPP_
 #define PLSSVM_BACKENDS_KOKKOS_KERNEL_KERNEL_FUNCTIONS_HPP_
 
-#include "plssvm/constants.hpp"              // plssvm::real_type
-#include "plssvm/detail/utility.hpp"         // plssvm::detail::always_false_v
-#include "plssvm/kernel_function_types.hpp"  // plssvm::kernel_function_type
+#include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp"  // plssvm::kokkos::detail::standard_layout_tuple
+#include "plssvm/constants.hpp"                                     // plssvm::real_type
+#include "plssvm/detail/utility.hpp"                                // plssvm::detail::always_false_v
+#include "plssvm/kernel_function_types.hpp"                         // plssvm::kernel_function_type
 
 #include "Kokkos_MathematicalFunctions.hpp"  // Kokkos::pow, Kokkos::exp, Kokkos::tanh, Kokkos::abs
 
-#include <limits>  // std::numeric_limits
-#include <tuple>   // std::tuple, std::get
+#include <type_traits>  // std::is_same_v
 
 namespace plssvm::kokkos::detail {
 
@@ -35,7 +35,7 @@ namespace plssvm::kokkos::detail {
  * @return the reduced value (`[[nodiscard]]`)
  */
 template <kernel_function_type kernel_function>
-[[nodiscard]] inline real_type feature_reduce(const real_type val1, const real_type val2) {
+KOKKOS_INLINE_FUNCTION real_type feature_reduce(const real_type val1, const real_type val2) {
     return val1 * val2;
 }
 
@@ -46,7 +46,7 @@ template <kernel_function_type kernel_function>
  * @return the reduced value (`[[nodiscard]]`)
  */
 template <>
-[[nodiscard]] inline real_type feature_reduce<kernel_function_type::rbf>(const real_type val1, const real_type val2) {
+KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::rbf>(const real_type val1, const real_type val2) {
     const real_type d = val1 - val2;
     return d * d;
 }
@@ -58,7 +58,7 @@ template <>
  * @return the reduced value (`[[nodiscard]]`)
  */
 template <>
-[[nodiscard]] inline real_type feature_reduce<kernel_function_type::laplacian>(const real_type val1, const real_type val2) {
+KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::laplacian>(const real_type val1, const real_type val2) {
     return ::Kokkos::fabs(val1 - val2);
 }
 
@@ -70,9 +70,13 @@ template <>
  * @return the reduced value (`[[nodiscard]]`)
  */
 template <>
-[[nodiscard]] inline real_type feature_reduce<kernel_function_type::chi_squared>(const real_type val1, const real_type val2) {
+KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::chi_squared>(const real_type val1, const real_type val2) {
     const real_type d = val1 - val2;
-    return (real_type{ 1.0 } / (val1 + val2 + std::numeric_limits<real_type>::min())) * d * d;
+    if constexpr (std::is_same_v<real_type, float>) {
+        return (real_type{ 1.0 } / (val1 + val2 + FLT_MIN)) * d * d;  // TODO: std::numeric_limits::min
+    } else {
+        return (real_type{ 1.0 } / (val1 + val2 + DBL_MIN)) * d * d;  // TODO: std::numeric_limits::min
+    }
 }
 
 //***************************************************//
@@ -88,19 +92,19 @@ template <>
  * @return the result value (`[[nodiscard]]`)
  */
 template <kernel_function_type kernel_function, typename... Args>
-[[nodiscard]] inline real_type apply_kernel_function(const real_type value, const std::tuple<Args...> params) {
+KOKKOS_INLINE_FUNCTION real_type apply_kernel_function(const real_type value, const detail::standard_layout_tuple<Args...> params) {
     if constexpr (kernel_function == kernel_function_type::linear) {
         return value;
     } else if constexpr (kernel_function == kernel_function_type::polynomial) {
-        return ::Kokkos::pow(std::get<1>(params) * value + std::get<2>(params), std::get<0>(params));
+        return ::Kokkos::pow(detail::get<1>(params) * value + detail::get<2>(params), detail::get<0>(params));
     } else if constexpr (kernel_function == kernel_function_type::rbf) {
-        return ::Kokkos::exp(-std::get<0>(params) * value);
+        return ::Kokkos::exp(-detail::get<0>(params) * value);
     } else if constexpr (kernel_function == kernel_function_type::sigmoid) {
-        return ::Kokkos::tanh(std::get<0>(params) * value + std::get<1>(params));
+        return ::Kokkos::tanh(detail::get<0>(params) * value + detail::get<1>(params));
     } else if constexpr (kernel_function == kernel_function_type::laplacian) {
-        return ::Kokkos::exp(-std::get<0>(params) * value);
+        return ::Kokkos::exp(-detail::get<0>(params) * value);
     } else if constexpr (kernel_function == kernel_function_type::chi_squared) {
-        return ::Kokkos::exp(-std::get<0>(params) * value);
+        return ::Kokkos::exp(-detail::get<0>(params) * value);
     } else {
         static_assert(::plssvm::detail::always_false_v<Args...>, "Unsupported kernel function!");
     }
diff --git a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
index a203cb7e9..629a0901f 100644
--- a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
@@ -13,6 +13,7 @@
 #define PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_
 #pragma once
 
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"        // TODO: view type aliases
 #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"  // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                                // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                    // plssvm::kernel_function_type
@@ -21,20 +22,390 @@
 
 namespace plssvm::kokkos::detail {
 
+/**
+ * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function.
+ */
 class device_kernel_w_linear {
   public:
+    /**
+     * @brief Initialize the Kokkos kernel function object.
+     * @param[in,out] w_d the vector to speedup the linear prediction
+     * @param[in] alpha_d the previously learned weights
+     * @param[in] sv_d the support vectors
+     * @param[in] num_classes the number of classes
+     * @param[in] num_sv the number of support vectors
+     * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for
+     * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     */
+    device_kernel_w_linear(device_view_type<real_type> w_d, device_view_type<const real_type> alpha_d, device_view_type<const real_type> sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
+        w_d_{ w_d },
+        alpha_d_{ alpha_d },
+        sv_d_{ sv_d },
+        num_classes_{ num_classes },
+        num_sv_{ num_sv },
+        device_specific_num_sv_{ device_specific_num_sv },
+        sv_offset_{ sv_offset },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x } { }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // calculate the indices used in the current thread
+        const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;
+        const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+        const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz;
+        const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+
+        // create the shared memory arrays used for caching data point features
+        constexpr std::size_t shmem_size = THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE;
+        real_type *data_cache_ptr = static_cast<real_type *>(team.team_shmem().get_shmem(2 * shmem_size));
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_feature{ data_cache_ptr, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_alpha{ data_cache_ptr + shmem_size, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+        // create a thread private array used for internal caching
+        real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{};
+
+        // iterate over all support vectors using blocking to be able to cache them for faster memory accesses
+        for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE_sz) {
+            // load data into shared memory
+            for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                const auto global_feature_idx = feature_idx_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+                const auto global_class_idx = class_idx_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+
+                data_cache_feature(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_sz) + sv + threadIdx_y];  // SoA
+                data_cache_alpha(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_sz) + sv + sv_offset_ + threadIdx_y];      // AoS
+            }
+            team.team_barrier();  // wait until all threads loaded their part of the data
+
+            // perform the dot product calculation
+            for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) {
+                for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) {
+                    for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) {
+                        temp[internal_feature][internal_class] += data_cache_alpha(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_class) * data_cache_feature(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_feature);
+                    }
+                }
+            }
+            team.team_barrier();  // wait until all threads performed their part of the calculations
+        }
+
+        // update global array with local one
+        for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) {
+            for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) {
+                const auto global_feature_idx = feature_idx + static_cast<std::size_t>(internal_feature);
+                const auto global_class_idx = class_idx + static_cast<std::size_t>(internal_class);
+
+                w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_sz) + global_class_idx] = temp[internal_feature][internal_class];
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    device_view_type<real_type> w_d_;
+    device_view_type<const real_type> alpha_d_;
+    device_view_type<const real_type> sv_d_;
+    const std::size_t num_classes_;
+    const std::size_t num_sv_;
+    const std::size_t device_specific_num_sv_;
+    const std::size_t sv_offset_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    /// @endcond
 };
 
+/**
+ * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector.
+ */
 class device_kernel_predict_linear {
   public:
+    /**
+     * @brief Initialize the Kokkos kernel function object.
+     * @param[out] prediction_d the predicted values
+     * @param[in] w_d the vector to speedup the calculations
+     * @param[in] rho_d the previously learned bias
+     * @param[in] predict_points_d the data points to predict
+     * @param[in] num_classes the number of classes
+     * @param[in] num_predict_points the number of data points to predict
+     * @param[in] num_features the number of features per data point
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     */
+    device_kernel_predict_linear(device_view_type<real_type> prediction_d, device_view_type<const real_type> w_d, device_view_type<const real_type> rho_d, device_view_type<const real_type> predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
+        prediction_d_{ prediction_d },
+        w_d_{ w_d },
+        rho_d_{ rho_d },
+        predict_points_d_{ predict_points_d },
+        num_classes_{ num_classes },
+        num_predict_points_{ num_predict_points },
+        num_features_{ num_features },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x } { }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto FEATURE_BLOCK_SIZE_sz = static_cast<std::size_t>(FEATURE_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // calculate the indices used in the current thread
+        const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;
+        const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+        const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz;
+        const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+
+        // create the shared memory arrays used for caching data point features
+        constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE;
+        real_type *data_cache_ptr = static_cast<real_type *>(team.team_shmem().get_shmem(2 * shmem_size));
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_pp{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+        Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_w{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+        // create a thread private array used for internal caching
+        real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{};
+
+        // iterate over all features using blocking to be able to cache them for faster memory accesses
+        for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) {
+            // load data into shared memory
+            for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                const auto global_pp_idx = pp_idx_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+                const auto global_class_idx = class_idx_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
+
+                // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
+                data_cache_pp(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx];
+                data_cache_pp(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx];
+                data_cache_w(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = w_d_[(dim + threadIdx_y) * (num_classes_ + PADDING_SIZE_sz) + global_class_idx];
+                data_cache_w(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = w_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_classes_ + PADDING_SIZE_sz) + global_class_idx];
+            }
+            team.team_barrier();  // wait until all threads loaded their part of the data
+
+            // perform the dot product calculation
+            for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) {
+                for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+                    for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) {
+                        temp[internal_pd][internal_class] += data_cache_w(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_class) * data_cache_pp(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_pd);
+                    }
+                }
+            }
+            team.team_barrier();  // wait until all threads performed their part of the calculations
+        }
+
+        // update global array with local one
+        for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+            for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) {
+                const auto global_pp_idx = pp_idx + static_cast<std::size_t>(internal_pd);
+                const auto global_class_idx = class_idx + static_cast<std::size_t>(internal_class);
+
+                prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx];
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    device_view_type<real_type> prediction_d_;
+    device_view_type<const real_type> w_d_;
+    device_view_type<const real_type> rho_d_;
+    device_view_type<const real_type> predict_points_d_;
+    const std::size_t num_classes_;
+    const std::size_t num_predict_points_;
+    const std::size_t num_features_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    /// @endcond
 };
 
+/**
+ * @brief Predict the @p predict_points_d using the @p kernel_function.
+ * @tparam kernel_function the type of the used kernel function
+ * @tparam Args the types of the parameters necessary for the specific kernel function
+ */
 template <kernel_function_type kernel_function, typename... Args>
 class device_kernel_predict {
   public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
+     * @param[in] prediction_d the predicted values
+     * @param[in] alpha_d the previously learned weights
+     * @param[in] rho_d the previously learned biases
+     * @param[in] sv_d the support vectors
+     * @param[in] predict_points_d the data points to predict
+     * @param[in] num_classes the number of classes
+     * @param[in] num_sv the number of support vectors
+     * @param[in] num_predict_points the number of data points to predict
+     * @param[in] num_features the number of features per data point
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
+     */
+    device_kernel_predict(device_view_type<real_type> prediction_d, device_view_type<const real_type> alpha_d, device_view_type<const real_type> rho_d, device_view_type<const real_type> sv_d, device_view_type<const real_type> predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) :
+        prediction_d_{ prediction_d },
+        alpha_d_{ alpha_d },
+        rho_d_{ rho_d },
+        sv_d_{ sv_d },
+        predict_points_d_{ predict_points_d },
+        num_classes_{ num_classes },
+        num_sv_{ num_sv },
+        num_predict_points_{ num_predict_points },
+        num_features_{ num_features },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        grid_size_x_{ grid_size_x },
+        kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward<Args>(kernel_function_parameter)...) } { }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+        // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+        const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        const auto FEATURE_BLOCK_SIZE_sz = static_cast<std::size_t>(FEATURE_BLOCK_SIZE);
+        const auto PADDING_SIZE_sz = static_cast<std::size_t>(PADDING_SIZE);
+        const auto threadIdx_x = static_cast<std::size_t>(team.team_rank()) / THREAD_BLOCK_SIZE_sz;            // current thread in block x-dimension
+        const auto threadIdx_y = static_cast<std::size_t>(team.team_rank()) % THREAD_BLOCK_SIZE_sz;            // current thread in block y-dimension
+        const auto blockDim_x = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block x-dimension
+        const auto blockDim_y = THREAD_BLOCK_SIZE_sz;                                                          // number of threads in block y-dimension
+        const auto blockIdx_x = static_cast<std::size_t>(team.league_rank()) % grid_size_x_ + grid_x_offset_;  // current block in grid x-dimension + offsets if the grid size would be too large
+        const auto blockIdx_y = static_cast<std::size_t>(team.league_rank()) / grid_size_x_ + grid_y_offset_;  // current block in grid y-dimension + offsets if the grid size would be too large
+
+        // calculate the indices used in the current thread
+        const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz;
+        const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+        const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x;
+
+        constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE;
+        real_type *data_cache_ptr = static_cast<real_type *>(team.team_shmem().get_shmem(2 * shmem_size));
+
+        // create a thread private array used for internal caching
+        real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{};
+
+        {
+            // create the shared memory arrays used for caching data point features
+            Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_pp{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+            Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> data_cache_sv{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+            // iterate over all features using blocking to be able to cache them for faster memory accesses
+            for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) {
+                // load data into shared memory
+                for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                    const auto global_pp_idx = pp_idx_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE;
+                    const auto global_sv_idx = sv_idx_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE;
+
+                    // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
+                    data_cache_pp(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx];
+                    data_cache_pp(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx];
+                    data_cache_sv(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[(dim + threadIdx_y) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx];
+                    data_cache_sv(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx];
+                }
+                team.team_barrier();  // wait until all threads loaded their part of the data
+
+                // perform the feature reduction calculation
+                for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) {
+                    for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+                        for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) {
+                            temp[internal_pd][internal_sv] += detail::feature_reduce<kernel_function>(data_cache_sv(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_sv),
+                                                                                                      data_cache_pp(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_pd));
+                        }
+                    }
+                }
+                team.team_barrier();  // wait until all threads performed their part of the calculations
+            }
+        }
+
+        // update temp using the respective kernel function
+        for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+            for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) {
+                temp[internal_pd][internal_sv] = detail::apply_kernel_function<kernel_function>(temp[internal_pd][internal_sv], kernel_function_parameter_);
+            }
+        }
+
+        {
+            // create the shared memory arrays used for caching data point features
+            Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> alpha_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+            Kokkos::mdspan<real_type, Kokkos::dextents<std::size_t, 2>> out_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE };
+
+            // iterate over all features using blocking to be able to cache them for faster memory accesses
+            for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) {
+                // load data into shared memory
+                for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                    const std::size_t global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE;
+
+                    // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
+                    alpha_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx];
+                    alpha_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx];
+
+                    // the bias (rho) must only be applied once for all support vectors
+                    if (blockIdx_y == 0ull) {
+                        out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y];
+                        out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y + THREAD_BLOCK_SIZE_sz];
+                    } else {
+                        out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 };
+                        out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 };
+                    }
+                }
+                team.team_barrier();  // wait until all threads loaded their part of the data
+
+                // calculate intermediate results and store them in shared memory
+                for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) {
+                    for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+                        for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) {
+                            out_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, internal_pd * THREAD_BLOCK_SIZE + threadIdx_x) +=
+                                temp[internal_pd][internal_sv] * alpha_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_sv);
+                        }
+                    }
+                    team.team_barrier();  // wait until all threads performed their part of the calculations
+                }
+
+                // add intermediate cached results to prediction_d
+                for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+                    const auto global_pp_idx = pp_idx + static_cast<std::size_t>(internal);
+
+                    Kokkos::atomic_add(&prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y], out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x));
+                    Kokkos::atomic_add(&prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz], out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x));
+                }
+                team.team_barrier();  // wait until all threads updated their part of the prediction
+            }
+        }
+    }
+
   private:
+    /// @cond Doxygen_suppress
+    device_view_type<real_type> prediction_d_;
+    device_view_type<const real_type> alpha_d_;
+    device_view_type<const real_type> rho_d_;
+    device_view_type<const real_type> sv_d_;
+    device_view_type<const real_type> predict_points_d_;
+    const std::size_t num_classes_;
+    const std::size_t num_sv_;
+    const std::size_t num_predict_points_;
+    const std::size_t num_features_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::size_t grid_size_x_;
+    const detail::standard_layout_tuple<Args...> kernel_function_parameter_;
+    /// @endcond
 };
 
 }  // namespace plssvm::kokkos::detail
diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index f9e70e1d5..1fe40d102 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -15,6 +15,7 @@
 #include "plssvm/detail/tracking/performance_tracker.hpp"  // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE,
                                                            // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_HWS_ENTRY
                                                            // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SET_REFERENCE_TIME
+#include "plssvm/detail/assert.hpp"                        // PLSSVM_ASSERT
 #include "plssvm/detail/utility.hpp"                       // PLSSVM_IS_DEFINED
 
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
@@ -74,11 +75,12 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
+
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
             const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
             // initialize Kokkos if necessary
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             if (use_kokkos_as_backend) {
                 Kokkos::initialize(argc, argv);  // TODO: set device?
                 PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
diff --git a/src/main_train.cpp b/src/main_train.cpp
index f7ed20d9c..1d18d2744 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -14,7 +14,7 @@
 #include "plssvm/detail/logging.hpp"                       // plssvm::detail::log
 #include "plssvm/detail/tracking/performance_tracker.hpp"  // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE,
                                                            // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_HWS_ENTRY, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SET_REFERENCE_TIME
-#include "plssvm/detail/assert.hpp"
+#include "plssvm/detail/assert.hpp"                        // PLSSVM_ASSERT
 #include "plssvm/detail/utility.hpp"                       // PLSSVM_IS_DEFINED
 
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
@@ -72,11 +72,12 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
+
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
             const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
             // initialize Kokkos if necessary
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             if (use_kokkos_as_backend) {
                 Kokkos::initialize(argc, argv);  // TODO: set device?
                 PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index f459d55e5..f113c7e2c 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -8,8 +8,7 @@
 
 #include "plssvm/backends/Kokkos/csvm.hpp"
 
-#include "plssvm/backends/execution_range.hpp"                                        // plssvm::detail::dim_type
-#include "plssvm/backends/execution_range.hpp"                                        // plssvm::detail::execution_range
+#include "plssvm/backends/execution_range.hpp"                                        // plssvm::detail::{execution_range, dim_type}
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"                               // plssvm::kokkos::detail::device_ptr
 #include "plssvm/backends/Kokkos/detail/execution_space.hpp"                          // plssvm::kokkos::detail::execution_space
 #include "plssvm/backends/Kokkos/detail/utility.hpp"                                  // plssvm::kokkos::detail::get_runtime_version
@@ -18,11 +17,14 @@
 #include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp"       // plssvm::kokkos::detail::device_kernel_assembly
 #include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp"  // plssvm::kokkos::detail::device_kernel_assembly_symm
 #include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp"                           // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict}
+#include "plssvm/constants.hpp"                                                       // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE
 #include "plssvm/detail/data_distribution.hpp"                                        // plssvm::detail::triangular_data_distribution
 #include "plssvm/detail/logging.hpp"                                                  // plssvm::detail::log
 #include "plssvm/detail/memory_size.hpp"                                              // plssvm::detail::memory_size
 #include "plssvm/detail/tracking/performance_tracker.hpp"                             // plssvm::detail::tracking::tracking_entry
+#include "plssvm/detail/utility.hpp"                                                  // plssvm::detail::unreachable // TODO: remove
 #include "plssvm/exceptions/exceptions.hpp"                                           // plssvm::exception
+#include "plssvm/kernel_function_types.hpp"                                           // plssvm::kernel_function_type
 #include "plssvm/parameter.hpp"                                                       // plssvm::parameter
 #include "plssvm/target_platforms.hpp"                                                // plssvm::target_platform
 #include "plssvm/verbosity_levels.hpp"                                                // plssvm::verbosity_level
@@ -141,8 +143,10 @@ void csvm::init(const target_platform target) {
 
 csvm::~csvm() {
     try {
-        // be sure that all operations on the Kokkos execution spaces have finished before destruction
-        detail::device_synchronize_all();
+        // be sure that all operations on the CUDA devices have finished before destruction
+        for (const queue_type &device : devices_) {
+            detail::device_synchronize(device);
+        }
     } catch (const plssvm::exception &e) {
         std::cout << e.what_with_loc() << std::endl;
         std::terminate();
@@ -191,9 +195,12 @@ std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const
         case detail::execution_space::serial:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
+    ::plssvm::detail::unreachable();
 }
 
 std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
+    PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
+
     // TODO: implement for other execution spaces, guard behind ifdef
     switch (space_) {
         case detail::execution_space::cuda:
@@ -212,9 +219,12 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
         case detail::execution_space::serial:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
+    ::plssvm::detail::unreachable();
 }
 
-::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::size_t device_id) const {
+::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) const {
+    PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
+
     // TODO: implement for other execution spaces, guard behind ifdef
     switch (space_) {
         case detail::execution_space::cuda:
@@ -233,6 +243,7 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::s
         case detail::execution_space::serial:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
+    ::plssvm::detail::unreachable();
 }
 
 //***************************************************//
@@ -256,37 +267,55 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons
 
     device_ptr_type kernel_matrix_d{ num_entries_padded, device };  // only explicitly store the upper triangular matrix
     const real_type cost_factor = real_type{ 1.0 } / params.cost;
+    const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
+
+    // save the team sizes
+    const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-    // TODO: implement
-    // // convert execution range block to CUDA's native dim3
-    // const dim3 native_block = detail::dim_type_to_native(exec.block);
-    //
-    // for (const auto &[partial_grid, offsets] : exec.grids) {
-    //     // convert execution range partial_grid to CUDA's native dim3
-    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
-    //
-    //     switch (params.kernel_type) {
-    //         case kernel_function_type::linear:
-    //             detail::device_kernel_assembly<kernel_function_type::linear><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y);
-    //             break;
-    //         case kernel_function_type::polynomial:
-    //             detail::device_kernel_assembly<kernel_function_type::polynomial><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, params.degree, std::get<real_type>(params.gamma), params.coef0);
-    //             break;
-    //         case kernel_function_type::rbf:
-    //             detail::device_kernel_assembly<kernel_function_type::rbf><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //         case kernel_function_type::sigmoid:
-    //             detail::device_kernel_assembly<kernel_function_type::sigmoid><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, std::get<real_type>(params.gamma), params.coef0);
-    //             break;
-    //         case kernel_function_type::laplacian:
-    //             detail::device_kernel_assembly<kernel_function_type::laplacian><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //         case kernel_function_type::chi_squared:
-    //             detail::device_kernel_assembly<kernel_function_type::chi_squared><<<native_partial_grid, native_block>>>(kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //     }
-    // }
-    detail::device_synchronize_all();
+    for (const auto &[partial_grid, offsets] : exec.grids) {
+        // create a Kokkos TeamPolicy
+        Kokkos::TeamPolicy<> team_policy(device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO);
+
+        switch (params.kernel_type) {
+            case kernel_function_type::linear:
+                {
+                    using functor_type = detail::device_kernel_assembly<kernel_function_type::linear>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x });
+                }
+                break;
+            case kernel_function_type::polynomial:
+                {
+                    using functor_type = detail::device_kernel_assembly<kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
+                }
+                break;
+            case kernel_function_type::rbf:
+                {
+                    using functor_type = detail::device_kernel_assembly<kernel_function_type::rbf, real_type>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+            case kernel_function_type::sigmoid:
+                {
+                    using functor_type = detail::device_kernel_assembly<kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
+                }
+                break;
+            case kernel_function_type::laplacian:
+                {
+                    using functor_type = detail::device_kernel_assembly<kernel_function_type::laplacian, real_type>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+            case kernel_function_type::chi_squared:
+                {
+                    using functor_type = detail::device_kernel_assembly<kernel_function_type::chi_squared, real_type>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+        }
+    }
+    detail::device_synchronize(device);
 
     return kernel_matrix_d;
 }
@@ -300,72 +329,65 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const :
     const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
     // get the offset of the data points this device is responsible for
     const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
+    // the necessary amount of scratch memory for the kernels
+    const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
+
+    // save the team sizes
+    const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-    // TODO: implement
-    // // convert execution range block to CUDA's native dim3
-    // const dim3 native_block = detail::dim_type_to_native(exec.block);
-    //
-    // detail::set_device(device);
-    // for (const auto &[partial_grid, offsets] : exec.grids) {
-    //     // convert execution range partial_grid to CUDA's native dim3
-    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
-    //
-    //     detail::device_kernel_symm<<<native_partial_grid, native_block>>>(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets.x, offsets.y);
-    // }
-    //
-    // // convert execution range block to CUDA's native dim3
-    // const dim3 native_mirror_block = detail::dim_type_to_native(mirror_exec.block);
-    //
-    // for (const auto &[partial_grid, offsets] : mirror_exec.grids) {
-    //     const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows;
-    //
-    //     if (num_mirror_rows > 0) {
-    //         // convert execution range partial_grid to CUDA's native dim3
-    //         const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
-    //
-    //         detail::device_kernel_symm_mirror<<<native_partial_grid, native_mirror_block>>>(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets.x, offsets.y);
-    //     }
-    // }
-    // detail::peek_at_last_error();
-    detail::device_synchronize_all();
+    for (const auto &[partial_grid, offsets] : exec.grids) {
+        // create a Kokkos TeamPolicy
+        Kokkos::TeamPolicy<> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+        Kokkos::parallel_for("blas_level_3_kernel_explicit", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets.x, offsets.y, partial_grid.x));
+    }
+
+    // save the mirror team sizes
+    const ::plssvm::detail::dim_type mirror_team_sizes = mirror_exec.block;
+
+    for (const auto &[partial_grid, offsets] : mirror_exec.grids) {
+        const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows;
+
+        if (num_mirror_rows > 0) {
+            // create a Kokkos TeamPolicy
+            Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(mirror_team_sizes.total_size()), Kokkos::AUTO };
+
+            Kokkos::parallel_for("blas_level_3_kernel_explicit_mirror", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets.x, offsets.y, partial_grid.x));
+        }
+    }
+    detail::device_synchronize(device);
 }
 
 void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const device_ptr_type &rhs_d) const {
     const unsigned long long num_rhs = lhs_d.shape().x;
     const queue_type &device = devices_[device_id];
 
-    // // TODO: implement
-    // // convert execution range block to CUDA's native dim3
-    // const dim3 native_block = detail::dim_type_to_native(exec.block);
-    //
-    // detail::set_device(device);
-    // for (const auto &[partial_grid, offsets] : exec.grids) {
-    //     // convert execution range partial_grid to CUDA's native dim3
-    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
-    //
-    //     detail::device_kernel_inplace_matrix_add<<<native_partial_grid, native_block>>>(num_rhs, lhs_d.get(), rhs_d.get(), offsets.x, offsets.y);
-    // }
-    // detail::peek_at_last_error();
-    detail::device_synchronize_all();
+    // save the team sizes
+    const ::plssvm::detail::dim_type team_sizes = exec.block;
+
+    for (const auto &[partial_grid, offsets] : exec.grids) {
+        // create a Kokkos TeamPolicy
+        Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+        Kokkos::parallel_for("inplace_matrix_addition", team_policy, detail::device_kernel_inplace_matrix_add(num_rhs, lhs_d.get(), rhs_d.get(), offsets.x, offsets.y, partial_grid.x));
+    }
+    detail::device_synchronize(device);
 }
 
 void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const real_type scale) const {
     const unsigned long long num_rhs = lhs_d.shape().x;
     const queue_type &device = devices_[device_id];
 
-    // TODO: implement
-    // // convert execution range block to CUDA's native dim3
-    // const dim3 native_block = detail::dim_type_to_native(exec.block);
-    //
-    // detail::set_device(device);
-    // for (const auto &[partial_grid, offsets] : exec.grids) {
-    //     // convert execution range partial_grid to CUDA's native dim3
-    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
-    //
-    //     detail::device_kernel_inplace_matrix_scale<<<native_partial_grid, native_block>>>(num_rhs, lhs_d.get(), scale, offsets.x, offsets.y);
-    // }
-    // detail::peek_at_last_error();
-    detail::device_synchronize_all();
+    // save the team sizes
+    const ::plssvm::detail::dim_type team_sizes = exec.block;
+
+    for (const auto &[partial_grid, offsets] : exec.grids) {
+        // create a Kokkos TeamPolicy
+        Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+        Kokkos::parallel_for("inplace_matrix_scale", team_policy, detail::device_kernel_inplace_matrix_scale(num_rhs, lhs_d.get(), scale, offsets.x, offsets.y, partial_grid.x));
+    }
+    detail::device_synchronize(device);
 }
 
 void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const real_type alpha, const device_ptr_type &A_d, const parameter &params, const device_ptr_type &q_red, const real_type QA_cost, const device_ptr_type &B_d, device_ptr_type &C_d) const {
@@ -380,39 +402,55 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de
     const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
 
     const real_type cost_factor = real_type{ 1.0 } / params.cost;
+    const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
+
+    // save the team sizes
+    const ::plssvm::detail::dim_type team_sizes = exec.block;
+
+    for (const auto &[partial_grid, offsets] : exec.grids) {
+        // create a Kokkos TeamPolicy
+        Kokkos::TeamPolicy<> team_policy(device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO);
 
-    // TODO: implement
-    // // convert general execution range's block to CUDA specific block
-    // const dim3 native_block = detail::dim_type_to_native(exec.block);
-    //
-    // detail::set_device(device);
-    // for (const auto &[partial_grid, offsets] : exec.grids) {
-    //     // convert execution range partial_grid to CUDA's native dim3
-    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
-    //
-    //     switch (params.kernel_type) {
-    //         case kernel_function_type::linear:
-    //             detail::device_kernel_assembly_symm<kernel_function_type::linear><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y);
-    //             break;
-    //         case kernel_function_type::polynomial:
-    //             detail::device_kernel_assembly_symm<kernel_function_type::polynomial><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, params.degree, std::get<real_type>(params.gamma), params.coef0);
-    //             break;
-    //         case kernel_function_type::rbf:
-    //             detail::device_kernel_assembly_symm<kernel_function_type::rbf><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //         case kernel_function_type::sigmoid:
-    //             detail::device_kernel_assembly_symm<kernel_function_type::sigmoid><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, std::get<real_type>(params.gamma), params.coef0);
-    //             break;
-    //         case kernel_function_type::laplacian:
-    //             detail::device_kernel_assembly_symm<kernel_function_type::laplacian><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //         case kernel_function_type::chi_squared:
-    //             detail::device_kernel_assembly_symm<kernel_function_type::chi_squared><<<native_partial_grid, native_block>>>(alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //     }
-    // }
-    // detail::peek_at_last_error();
-    detail::device_synchronize_all();
+        switch (params.kernel_type) {
+            case kernel_function_type::linear:
+                {
+                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::linear>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x });
+                }
+                break;
+            case kernel_function_type::polynomial:
+                {
+                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
+                }
+                break;
+            case kernel_function_type::rbf:
+                {
+                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::rbf, real_type>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+            case kernel_function_type::sigmoid:
+                {
+                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
+                }
+                break;
+            case kernel_function_type::laplacian:
+                {
+                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::laplacian, real_type>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+            case kernel_function_type::chi_squared:
+                {
+                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::chi_squared, real_type>;
+                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+        }
+    }
+    detail::device_synchronize(device);
 }
 
 //***************************************************//
@@ -431,19 +469,18 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe
 
     device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device };
 
-    // TODO: implement
-    // // convert execution range block to CUDA's native dim3
-    // const dim3 native_block = detail::dim_type_to_native(exec.block);
-    //
-    // detail::set_device(device);
-    // for (const auto &[partial_grid, offsets] : exec.grids) {
-    //     // convert execution range partial_grid to CUDA's native dim3
-    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
-    //
-    //     detail::device_kernel_w_linear<<<native_partial_grid, native_block>>>(w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets.x, offsets.y);
-    // }
-    // detail::peek_at_last_error();
-    detail::device_synchronize_all();
+    const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
+
+    // save the team sizes
+    const ::plssvm::detail::dim_type team_sizes = exec.block;
+
+    for (const auto &[partial_grid, offsets] : exec.grids) {
+        // create a Kokkos TeamPolicy
+        Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+        Kokkos::parallel_for("w_kernel", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_w_linear(w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets.x, offsets.y, partial_grid.x));
+    }
+    detail::device_synchronize(device);
 
     return w_d;
 }
@@ -457,38 +494,55 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai
 
     device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, device };
 
-    // TODO: implement
-    // // convert execution range block to CUDA's native dim3
-    // const dim3 native_block = detail::dim_type_to_native(exec.block);
-    //
-    // detail::set_device(device);
-    // for (const auto &[partial_grid, offsets] : exec.grids) {
-    //     // convert execution range partial_grid to CUDA's native dim3
-    //     const dim3 native_partial_grid = detail::dim_type_to_native(partial_grid);
-    //
-    //     switch (params.kernel_type) {
-    //         case kernel_function_type::linear:
-    //             detail::device_kernel_predict_linear<<<native_partial_grid, native_block>>>(out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets.x, offsets.y);
-    //             break;
-    //         case kernel_function_type::polynomial:
-    //             detail::device_kernel_predict<kernel_function_type::polynomial><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, params.degree, std::get<real_type>(params.gamma), params.coef0);
-    //             break;
-    //         case kernel_function_type::rbf:
-    //             detail::device_kernel_predict<kernel_function_type::rbf><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //         case kernel_function_type::sigmoid:
-    //             detail::device_kernel_predict<kernel_function_type::sigmoid><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, std::get<real_type>(params.gamma), params.coef0);
-    //             break;
-    //         case kernel_function_type::laplacian:
-    //             detail::device_kernel_predict<kernel_function_type::laplacian><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //         case kernel_function_type::chi_squared:
-    //             detail::device_kernel_predict<kernel_function_type::chi_squared><<<native_partial_grid, native_block>>>(out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, std::get<real_type>(params.gamma));
-    //             break;
-    //     }
-    // }
-    // detail::peek_at_last_error();
-    detail::device_synchronize_all();
+    const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
+
+    // save the team sizes
+    const ::plssvm::detail::dim_type team_sizes = exec.block;
+
+    for (const auto &[partial_grid, offsets] : exec.grids) {
+        // create a Kokkos TeamPolicy
+        Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+        switch (params.kernel_type) {
+            case kernel_function_type::linear:
+                {
+                    using functor_type = detail::device_kernel_predict_linear;
+                    Kokkos::parallel_for("predict_kernel_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x });
+                }
+                break;
+            case kernel_function_type::polynomial:
+                {
+                    using functor_type = detail::device_kernel_predict<kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
+                    Kokkos::parallel_for("predict_kernel_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
+                }
+                break;
+            case kernel_function_type::rbf:
+                {
+                    using functor_type = detail::device_kernel_predict<kernel_function_type::rbf, real_type>;
+                    Kokkos::parallel_for("predict_kernel_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+            case kernel_function_type::sigmoid:
+                {
+                    using functor_type = detail::device_kernel_predict<kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
+                    Kokkos::parallel_for("predict_kernel_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
+                }
+                break;
+            case kernel_function_type::laplacian:
+                {
+                    using functor_type = detail::device_kernel_predict<kernel_function_type::laplacian, real_type>;
+                    Kokkos::parallel_for("predict_kernel_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+            case kernel_function_type::chi_squared:
+                {
+                    using functor_type = detail::device_kernel_predict<kernel_function_type::chi_squared, real_type>;
+                    Kokkos::parallel_for("predict_kernel_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                }
+                break;
+        }
+    }
+    detail::device_synchronize(device);
 
     return out_d;
 }
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index 9458bb899..4505c0515 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -126,8 +126,8 @@ std::string get_device_name(const execution_space space, const std::size_t devic
     return "unknown";
 }
 
-void device_synchronize_all() {
-    Kokkos::DefaultExecutionSpace::impl_static_fence("synchronize all");
+void device_synchronize(const Kokkos::DefaultExecutionSpace& exec) {
+    exec.fence();
 }
 
 std::string get_kokkos_version() {

From bc47002ab1a48993b6a16318759ef07dc1f09f05 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 21 Oct 2024 15:21:04 +0200
Subject: [PATCH 014/123] Add missing parameterized test suites.

---
 tests/main.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/main.cpp b/tests/main.cpp
index 69570930a..d27eddd7d 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -13,18 +13,27 @@
 
 // silence GTest warnings/test errors
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVM);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolver);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolver);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunction);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionClassification);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionClassification);
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMDeathTest);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverDeathTest);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionDeathTest);
+
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVM);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMKernelFunction);
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMDeathTest);
+
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtr);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrLayout);
+
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
+
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
 
 int main(int argc, char **argv) {

From dfbb0cc44db0117231056bc4d18d2aef0ffc7e58 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 21 Oct 2024 15:21:18 +0200
Subject: [PATCH 015/123] Add first Kokkos backend tests.

---
 tests/backends/CMakeLists.txt               |   5 +
 tests/backends/Kokkos/CMakeLists.txt        |  35 +++++
 tests/backends/Kokkos/detail/device_ptr.cpp |  42 +++++
 tests/backends/Kokkos/exceptions.cpp        |  25 +++
 tests/backends/Kokkos/kokkos_csvm.cpp       | 162 ++++++++++++++++++++
 tests/backends/Kokkos/mock_kokkos_csvm.hpp  |  85 ++++++++++
 tests/kokkos_main.cpp                       |  63 ++++++++
 7 files changed, 417 insertions(+)
 create mode 100644 tests/backends/Kokkos/CMakeLists.txt
 create mode 100644 tests/backends/Kokkos/detail/device_ptr.cpp
 create mode 100644 tests/backends/Kokkos/exceptions.cpp
 create mode 100644 tests/backends/Kokkos/kokkos_csvm.cpp
 create mode 100644 tests/backends/Kokkos/mock_kokkos_csvm.hpp
 create mode 100644 tests/kokkos_main.cpp

diff --git a/tests/backends/CMakeLists.txt b/tests/backends/CMakeLists.txt
index 805e8bc1b..6acf4f638 100644
--- a/tests/backends/CMakeLists.txt
+++ b/tests/backends/CMakeLists.txt
@@ -32,4 +32,9 @@ endif ()
 # create SYCL tests if the SYCL backend is available
 if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME})
     add_subdirectory(SYCL)
+endif ()
+
+# create Kokkos tests if the Kokkos backend is available
+if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME})
+    add_subdirectory(Kokkos)
 endif ()
\ No newline at end of file
diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
new file mode 100644
index 000000000..1a4d3d089
--- /dev/null
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -0,0 +1,35 @@
+## Authors: Alexander Van Craen, Marcel Breyer
+## Copyright (C): 2018-today The PLSSVM project - All Rights Reserved
+## License: This file is part of the PLSSVM project which is released under the MIT license.
+##          See the LICENSE.md file in the project root for full license information.
+########################################################################################################################
+
+## create Kokkos tests
+set(PLSSVM_KOKKOS_TEST_NAME Kokkos_tests)
+
+# list all necessary sources
+set(PLSSVM_KOKKOS_TEST_SOURCES
+    ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
+#    ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
+#    ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cu
+    ${CMAKE_CURRENT_LIST_DIR}/kokkos_csvm.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp
+)
+
+find_package(Kokkos REQUIRED)
+
+# add test executable
+add_executable(${PLSSVM_KOKKOS_TEST_NAME} ${CMAKE_CURRENT_LIST_DIR}/../../kokkos_main.cpp ${PLSSVM_KOKKOS_TEST_SOURCES})
+
+# link against test library
+target_link_libraries(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE ${PLSSVM_BASE_TEST_LIBRARY_NAME})
+
+# add tests to google test
+include(GoogleTest)
+include(${PROJECT_SOURCE_DIR}/cmake/discover_tests_with_death_test_filter.cmake)
+discover_tests_with_death_test_filter(${PLSSVM_KOKKOS_TEST_NAME})
+
+# add test as coverage dependency
+if (TARGET coverage)
+    add_dependencies(coverage ${PLSSVM_KOKKOS_TEST_NAME})
+endif ()
\ No newline at end of file
diff --git a/tests/backends/Kokkos/detail/device_ptr.cpp b/tests/backends/Kokkos/detail/device_ptr.cpp
new file mode 100644
index 000000000..797bfef78
--- /dev/null
+++ b/tests/backends/Kokkos/detail/device_ptr.cpp
@@ -0,0 +1,42 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the Kokkos backend device pointer.
+ */
+
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"  // plssvm::kokkos::detail::device_ptr
+
+#include "tests/backends/generic_device_ptr_tests.hpp"  // generic device pointer tests to instantiate
+#include "tests/naming.hpp"                             // naming::test_parameter_to_name
+#include "tests/types_to_test.hpp"                      // util::{combine_test_parameters_gtest_t, cartesian_type_product_t, layout_type_list}
+
+#include "gtest/gtest.h"  // INSTANTIATE_TYPED_TEST_SUITE_P
+
+#include <tuple>  // std::tuple
+
+template <typename T>
+struct kokkos_device_ptr_test_type {
+    using device_ptr_type = plssvm::kokkos::detail::device_ptr<T>;
+    using queue_type = Kokkos::DefaultExecutionSpace;
+
+    static const queue_type &default_queue() {
+        static const queue_type queue{};
+        return queue;
+    }
+};
+
+using kokkos_device_ptr_tuple = std::tuple<kokkos_device_ptr_test_type<float>, kokkos_device_ptr_test_type<double>>;
+
+// the tests used in the instantiated GTest test suites
+using kokkos_device_ptr_type_gtest = util::combine_test_parameters_gtest_t<util::cartesian_type_product_t<kokkos_device_ptr_tuple>>;
+using kokkos_device_ptr_layout_type_gtest = util::combine_test_parameters_gtest_t<util::cartesian_type_product_t<kokkos_device_ptr_tuple>, util::layout_type_list>;
+
+// instantiate type-parameterized tests
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosDevicePtr, DevicePtr, kokkos_device_ptr_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosDevicePtr, DevicePtrLayout, kokkos_device_ptr_layout_type_gtest, naming::test_parameter_to_name);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosDevicePtrDeathTest, DevicePtrDeathTest, kokkos_device_ptr_type_gtest, naming::test_parameter_to_name);
diff --git a/tests/backends/Kokkos/exceptions.cpp b/tests/backends/Kokkos/exceptions.cpp
new file mode 100644
index 000000000..d78ac7801
--- /dev/null
+++ b/tests/backends/Kokkos/exceptions.cpp
@@ -0,0 +1,25 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the custom exception classes related to the Kokkos backend.
+ */
+
+#include "plssvm/backends/Kokkos/exceptions.hpp"  // plssvm::kokkos::backend_exception
+
+#include "tests/backends/generic_exceptions_tests.hpp"  // generic exception tests to instantiate
+
+#include "gtest/gtest.h"  // INSTANTIATE_TYPED_TEST_SUITE_P
+
+#include <string_view>  // std::string_view
+
+struct exception_test_type {
+    using exception_type = plssvm::kokkos::backend_exception;
+    constexpr static std::string_view name = "kokkos::backend_exception";
+};
+
+// instantiate type-parameterized tests
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosExceptions, Exception, exception_test_type);
diff --git a/tests/backends/Kokkos/kokkos_csvm.cpp b/tests/backends/Kokkos/kokkos_csvm.cpp
new file mode 100644
index 000000000..e7af88d5b
--- /dev/null
+++ b/tests/backends/Kokkos/kokkos_csvm.cpp
@@ -0,0 +1,162 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the functionality related to the Kokkos backend.
+ */
+
+#include "plssvm/backends/Kokkos/csvm.hpp"        // plssvm::kokkos::csvm
+#include "plssvm/backends/Kokkos/exceptions.hpp"  // plssvm::kokkos::backend_exception
+#include "plssvm/detail/type_list.hpp"          // plssvm::detail::label_type_list
+#include "plssvm/kernel_function_types.hpp"     // plssvm::kernel_function_type
+#include "plssvm/parameter.hpp"                 // plssvm::parameter
+#include "plssvm/target_platforms.hpp"          // plssvm::target_platform
+
+#include "tests/backends/Kokkos/mock_kokkos_csvm.hpp"
+#include "tests/backends/generic_csvm_tests.hpp"      // generic CSVM tests to instantiate
+#include "tests/backends/generic_gpu_csvm_tests.hpp"  // generic GPU CSVM tests to instantiate
+#include "tests/custom_test_macros.hpp"               // EXPECT_THROW_WHAT
+#include "tests/naming.hpp"                           // naming::test_parameter_to_name
+#include "tests/types_to_test.hpp"                    // util::{cartesian_type_product_t, combine_test_parameters_gtest_t}
+#include "tests/utility.hpp"                          // util::redirect_output
+
+#include "gtest/gtest.h"  // TEST_F, EXPECT_NO_THROW, INSTANTIATE_TYPED_TEST_SUITE_P, ::testing::Test
+
+#include <tuple>  // std::make_tuple, std::tuple
+
+class KokkosCSVM : public ::testing::Test,
+                 private util::redirect_output<> { };
+
+//// check whether the constructor correctly fails when using an incompatible target platform
+//TEST_F(CUDACSVM, construct_parameter) {
+//#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+//    // the automatic target platform must always be available
+//    EXPECT_NO_THROW(plssvm::cuda::csvm{ plssvm::parameter{} });
+//#else
+//    EXPECT_THROW_WHAT(plssvm::cuda::csvm{ plssvm::parameter{} },
+//                      plssvm::cuda::backend_exception,
+//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+//#endif
+//}
+//
+//TEST_F(CUDACSVM, construct_target_and_parameter) {
+//    // create parameter struct
+//    const plssvm::parameter params{};
+//
+//#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+//    // only automatic or gpu_nvidia are allowed as target platform for the CUDA backend
+//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::target_platform::automatic, params }));
+//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::target_platform::gpu_nvidia, params }));
+//#else
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::automatic, params }),
+//                      plssvm::cuda::backend_exception,
+//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_nvidia, params }),
+//                      plssvm::cuda::backend_exception,
+//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+//#endif
+//
+//    // all other target platforms must throw
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::cpu, params }),
+//                      plssvm::cuda::backend_exception,
+//                      "Invalid target platform 'cpu' for the CUDA backend!");
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_amd, params }),
+//                      plssvm::cuda::backend_exception,
+//                      "Invalid target platform 'gpu_amd' for the CUDA backend!");
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_intel, params }),
+//                      plssvm::cuda::backend_exception,
+//                      "Invalid target platform 'gpu_intel' for the CUDA backend!");
+//}
+//
+//TEST_F(CUDACSVM, construct_named_args) {
+//#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+//    // only automatic or gpu_nvidia are allowed as target platform for the CUDA backend
+//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
+//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::cost = 2.0 }));
+//#else
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+//                      plssvm::cuda::backend_exception,
+//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+//#endif
+//}
+//
+//TEST_F(CUDACSVM, construct_target_and_named_args) {
+//#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+//    // only automatic or gpu_nvidia are allowed as target platform for the CUDA backend
+//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::target_platform::automatic, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
+//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::cost = 2.0 }));
+//#else
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::automatic, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+//                      plssvm::cuda::backend_exception,
+//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::cost = 2.0 }),
+//                      plssvm::cuda::backend_exception,
+//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+//#endif
+//
+//    // all other target platforms must throw
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::cpu, plssvm::cost = 2.0 }),
+//                      plssvm::cuda::backend_exception,
+//                      "Invalid target platform 'cpu' for the CUDA backend!");
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_amd, plssvm::cost = 2.0 }),
+//                      plssvm::cuda::backend_exception,
+//                      "Invalid target platform 'gpu_amd' for the CUDA backend!");
+//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_intel, plssvm::cost = 2.0 }),
+//                      plssvm::cuda::backend_exception,
+//                      "Invalid target platform 'gpu_intel' for the CUDA backend!");
+//}
+
+template <bool mock_grid_size>
+struct kokkos_csvm_test_type {
+    using mock_csvm_type = mock_kokkos_csvm<mock_grid_size>;
+    using csvm_type = plssvm::kokkos::csvm;
+    using device_ptr_type = typename csvm_type::device_ptr_type;
+    inline constexpr static auto additional_arguments = std::make_tuple();
+};
+
+using kokkos_csvm_test_tuple = std::tuple<kokkos_csvm_test_type<false>>;
+using kokkos_csvm_test_label_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple, plssvm::detail::supported_label_types>;
+using kokkos_csvm_test_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple>;
+
+// the tests used in the instantiated GTest test suites
+using kokkos_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list>;
+using kokkos_solver_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_type_list>;
+using kokkos_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::kernel_function_type_list>;
+using kokkos_solver_and_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_and_kernel_function_type_list>;
+using kokkos_label_type_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::kernel_function_and_classification_type_list>;
+using kokkos_label_type_solver_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::solver_and_kernel_function_and_classification_type_list>;
+
+// instantiate type-parameterized tests
+// generic CSVM tests
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolver, kokkos_solver_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunction, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunctionClassification, kokkos_label_type_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+//INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunctionClassification, kokkos_label_type_solver_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+
+// generic CSVM DeathTests
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverDeathTest, kokkos_solver_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMKernelFunctionDeathTest, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverKernelFunctionDeathTest, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
+
+// generic GPU CSVM tests - correct grid sizes
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+
+// generic GPU CSVM DeathTests - correct grid sizes
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericGPUCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+
+using kokkos_mock_csvm_test_tuple = std::tuple<kokkos_csvm_test_type<true>>;
+using kokkos_mock_csvm_test_type_list = util::cartesian_type_product_t<kokkos_mock_csvm_test_tuple>;
+
+using kokkos_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list>;
+using kokkos_mock_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list, util::kernel_function_type_list>;
+
+// generic GPU CSVM tests - mocked grid sizes
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVM, kokkos_mock_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVMKernelFunction, kokkos_mock_kernel_function_type_gtest, naming::test_parameter_to_name);
diff --git a/tests/backends/Kokkos/mock_kokkos_csvm.hpp b/tests/backends/Kokkos/mock_kokkos_csvm.hpp
new file mode 100644
index 000000000..6fb35cd9c
--- /dev/null
+++ b/tests/backends/Kokkos/mock_kokkos_csvm.hpp
@@ -0,0 +1,85 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief MOCK class for the C-SVM class using the Kokkos backend.
+ */
+
+#ifndef PLSSVM_TESTS_BACKENDS_KOKKOS_MOCK_KOKKOS_CSVM_HPP_
+#define PLSSVM_TESTS_BACKENDS_KOKKOS_MOCK_KOKKOS_CSVM_HPP_
+#pragma once
+
+#include "plssvm/backends/execution_range.hpp"  // plssvm::detail::dim_type
+#include "plssvm/backends/Kokkos/csvm.hpp"      // plssvm::kokkos::csvm
+
+#include "gmock/gmock.h"  // MOCK_METHOD, ON_CALL, ::testing::Return
+
+#include <cstddef>  // std::size_t
+#include <utility>  // std::forward
+
+/**
+ * @brief GTest mock class for the Kokkos CSVM.
+ * @tparam mock_grid_size `true` if the `plssvm::kokkos::csvm::get_max_grid_size()` function should be mocked, otherwise `false`
+ */
+template <bool mock_grid_size>
+class mock_kokkos_csvm final : public plssvm::kokkos::csvm {
+    using base_type = plssvm::kokkos::csvm;
+
+  public:
+    using base_type::device_ptr_type;
+
+    template <typename... Args>
+    explicit mock_kokkos_csvm(Args &&...args) :
+        base_type{ std::forward<Args>(args)... } {
+        this->fake_functions();
+    }
+
+    MOCK_METHOD((plssvm::detail::dim_type), get_max_grid_size, (const std::size_t), (const, override));
+
+    // make protected member functions public
+    using base_type::assemble_kernel_matrix;
+    using base_type::blas_level_3;
+    using base_type::get_device_memory;
+    using base_type::get_max_work_group_size;
+    using base_type::num_available_devices;
+
+    using base_type::predict_values;
+
+    using base_type::conjugate_gradients;
+    using base_type::perform_dimensional_reduction;
+    using base_type::run_assemble_kernel_matrix_implicit_blas_level_3;
+    using base_type::run_blas_level_3;
+    using base_type::solve_lssvm_system_of_linear_equations;
+
+    using base_type::get_max_mem_alloc_size;
+
+    using base_type::run_assemble_kernel_matrix_explicit;
+    using base_type::run_blas_level_3_kernel_explicit;
+    using base_type::run_inplace_matrix_addition;
+    using base_type::run_inplace_matrix_scale;
+    using base_type::run_predict_kernel;
+    using base_type::run_w_kernel;
+
+    using base_type::data_distribution_;
+    using base_type::devices_;
+
+  private:
+    /*
+     * @brief Fake the plssvm::kokkos::csvm::get_max_grid_size() function if requested.
+     */
+    void fake_functions() const {
+        if constexpr (mock_grid_size) {
+            // mock the function using hardcoded maximum grid sizes
+            ON_CALL(*this, get_max_grid_size).WillByDefault(::testing::Return(plssvm::detail::dim_type{ std::size_t{ 4 }, std::size_t{ 4 }, std::size_t{ 4 } }));
+        } else {
+            // use the actual real implementation otherwise
+            ON_CALL(*this, get_max_grid_size).WillByDefault([this](const std::size_t device_id) { return base_type::get_max_grid_size(device_id); });
+        }
+    }
+};
+
+#endif  // PLSSVM_TESTS_BACKENDS_KOKKOS_MOCK_KOKKOS_CSVM_HPP_
diff --git a/tests/kokkos_main.cpp b/tests/kokkos_main.cpp
new file mode 100644
index 000000000..e53409bd4
--- /dev/null
+++ b/tests/kokkos_main.cpp
@@ -0,0 +1,63 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Contains the googletest main function. Sets the DeathTest to "threadsafe" execution instead of "fast".
+ */
+
+#include "gtest/gtest.h"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST, RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG}
+
+#include "Kokkos_Core.hpp"  // TODO:
+
+// TODO: reduce copy-paste
+
+// silence GTest warnings/test errors
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVM);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolver);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionClassification);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionClassification);
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionDeathTest);
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVM);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMKernelFunction);
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMDeathTest);
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtr);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrLayout);
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
+
+int main(int argc, char **argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+
+    // initialize Kokkos
+    Kokkos::initialize(argc, argv);
+
+    // prevent problems with fork() in the presence of multiple threads
+    // https://github.com/google/googletest/blob/main/docs/advanced.md#death-tests-and-threads
+    // NOTE: may reduce performance of the (death) tests
+#if !defined(_WIN32)
+    ::testing::GTEST_FLAG(death_test_style) = "threadsafe";
+#endif
+
+    // run all tests
+    const int return_code = RUN_ALL_TESTS();
+
+    // finalize Kokkos
+    Kokkos::finalize();
+
+    return return_code;
+}

From 5cba91dfb8543d939e55583b95bfff8e0135c402 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 14:08:50 +0200
Subject: [PATCH 016/123] Update and refactor implementation.

---
 include/plssvm/backends/Kokkos/csvm.hpp       |  34 ++--
 .../Kokkos/detail/conditional_execution.hpp   | 138 +++++++++++++++
 .../Kokkos/detail/execution_space.hpp         |  42 -----
 .../plssvm/backends/Kokkos/detail/utility.hpp |  87 ++++------
 .../backends/Kokkos/execution_space.hpp       | 134 ++++++++++++++
 src/plssvm/backends/Kokkos/CMakeLists.txt     |   2 +-
 src/plssvm/backends/Kokkos/csvm.cpp           | 164 +++++++++++-------
 .../Kokkos/detail/execution_space.cpp         |  39 -----
 src/plssvm/backends/Kokkos/detail/utility.cpp |  54 +-----
 .../backends/Kokkos/execution_space.cpp       |  74 ++++++++
 10 files changed, 505 insertions(+), 263 deletions(-)
 create mode 100644 include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
 delete mode 100644 include/plssvm/backends/Kokkos/detail/execution_space.hpp
 create mode 100644 include/plssvm/backends/Kokkos/execution_space.hpp
 delete mode 100644 src/plssvm/backends/Kokkos/detail/execution_space.cpp
 create mode 100644 src/plssvm/backends/Kokkos/execution_space.cpp

diff --git a/include/plssvm/backends/Kokkos/csvm.hpp b/include/plssvm/backends/Kokkos/csvm.hpp
index 206d85a81..859a9f43b 100644
--- a/include/plssvm/backends/Kokkos/csvm.hpp
+++ b/include/plssvm/backends/Kokkos/csvm.hpp
@@ -13,18 +13,19 @@
 #define PLSSVM_BACKENDS_KOKKOS_CSVM_HPP_
 #pragma once
 
-#include "plssvm/backends/execution_range.hpp"                // plssvm::detail::{dim_type, execution_range}
-#include "plssvm/backends/gpu_csvm.hpp"                       // plssvm::detail::gpu_csvm
-#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"       // plssvm::kokkos::detail::device_ptr
-#include "plssvm/backends/Kokkos/detail/execution_space.hpp"  // plssvm::kokkos::detail::execution_space
-#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp"    // plssvm::kokkos::detail::pinned_memory
-#include "plssvm/csvm.hpp"                                    // plssvm::detail::csvm_backend_exists
-#include "plssvm/detail/memory_size.hpp"                      // plssvm::detail::memory_size
-#include "plssvm/detail/type_traits.hpp"                      // PLSSVM_REQUIRES
-#include "plssvm/parameter.hpp"                               // plssvm::parameter, plssvm::detail::parameter
-#include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
-
-#include "Kokkos_Core.hpp"  // TODO:
+#include "plssvm/backends/execution_range.hpp"              // plssvm::detail::{dim_type, execution_range}
+#include "plssvm/backends/gpu_csvm.hpp"                     // plssvm::detail::gpu_csvm
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"     // plssvm::kokkos::detail::device_ptr
+#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp"  // plssvm::kokkos::detail::pinned_memory
+#include "plssvm/backends/Kokkos/execution_space.hpp"       // plssvm::kokkos::execution_space
+#include "plssvm/constants.hpp"                             // plssvm::real_type
+#include "plssvm/csvm.hpp"                                  // plssvm::detail::csvm_backend_exists
+#include "plssvm/detail/memory_size.hpp"                    // plssvm::detail::memory_size
+#include "plssvm/detail/type_traits.hpp"                    // PLSSVM_REQUIRES
+#include "plssvm/parameter.hpp"                             // plssvm::parameter, plssvm::detail::parameter
+#include "plssvm/target_platforms.hpp"                      // plssvm::target_platform
+
+#include "Kokkos_Core_fwd.hpp"  // Kokkos::DefaultExecutionSpace
 
 #include <cstddef>      // std::size_t
 #include <type_traits>  // std::true_type
@@ -37,6 +38,7 @@ namespace kokkos {
 
 /**
  * @brief A C-SVM implementation using Kokkos as backend.
+ * @details Internally, we always only use the `Kokkos::DefaultExecutionSpace`.
  */
 class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, Kokkos::DefaultExecutionSpace, detail::pinned_memory> {
   protected:
@@ -117,6 +119,12 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, Kokkos::Defau
      */
     ~csvm() override;
 
+    /**
+     * @brief Return the currently used `execution_space` determined using `Kokkos::ExecutionSpace`.
+     * @return the execution space (`[[nodiscard]]`)
+     */
+    [[nodiscard]] execution_space get_execution_space() const noexcept { return space_; }
+
   protected:
     /**
      * @brief Initialize all important states related to the Kokkos backend.
@@ -180,7 +188,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, Kokkos::Defau
     [[nodiscard]] device_ptr_type run_predict_kernel(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &alpha_d, const device_ptr_type &rho_d, const device_ptr_type &sv_or_w_d, const device_ptr_type &predict_points_d) const final;
 
     /// The used Kokkos execution space.
-    detail::execution_space space_;
+    execution_space space_{};
 };
 
 }  // namespace kokkos
diff --git a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
new file mode 100644
index 000000000..6ed8c3421
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
@@ -0,0 +1,138 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Conditionally defined macros for the different available Kokkos ExecutionSpaces.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_CONDITIONAL_EXECUTION_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_CONDITIONAL_EXECUTION_HPP_
+#pragma once
+
+#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
+
+#include "Kokkos_Core.hpp"  // Kokkos macros
+
+#include "fmt/core.h"  // fmt::format
+
+#include <functional>  // std::invoke
+
+namespace plssvm::kokkos::detail {
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA` macro if `KOKKOS_ENABLE_CUDA` is defined, i.e., the Kokkos CUDA ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_CUDA` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ */
+#if defined(KOKKOS_ENABLE_CUDA)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::cuda) }
+#endif
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP` macro if `KOKKOS_ENABLE_HIP` is defined, i.e., the Kokkos HIP ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_HIP` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ */
+#if defined(KOKKOS_ENABLE_HIP)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hip) }
+#endif
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL` macro if `KOKKOS_ENABLE_SYCL` is defined, i.e., the Kokkos SYCL ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_SYCL` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ */
+#if defined(KOKKOS_ENABLE_SYCL)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::sycl) }
+#endif
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX` macro if `KOKKOS_ENABLE_HPX` is defined, i.e., the Kokkos HPX ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_HPX` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ */
+#if defined(KOKKOS_ENABLE_HPX)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hpx) }
+#endif
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP` macro if `KOKKOS_ENABLE_OPENMP` is defined, i.e., the Kokkos OpenMP ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_OPENMP` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ */
+#if defined(KOKKOS_ENABLE_OPENMP)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp) }
+#endif
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET` macro if `KOKKOS_ENABLE_OPENMPTARGET` is defined, i.e., the Kokkos OpenMP target offloading ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_OPENMPTARGET` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ */
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp_target) }
+#endif
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC` macro if `KOKKOS_ENABLE_OPENACC` is defined, i.e., the Kokkos OpenACC ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_OPENACC` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ */
+#if defined(KOKKOS_ENABLE_OPENACC)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openacc) }
+#endif
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS` macro if `KOKKOS_ENABLE_THREADS` is defined, i.e., the Kokkos std::thread ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_THREADS` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ */
+#if defined(KOKKOS_ENABLE_THREADS)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::threads) }
+#endif
+
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL` macro if `KOKKOS_ENABLE_SERIAL` is defined, i.e., the Kokkos serial ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_SERIAL` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
+ * @note This ExecutionSpace *should* always be available!
+ */
+#if defined(KOKKOS_ENABLE_SERIAL)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(func) return std::invoke(func)
+#else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::serial) }
+#endif
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_CONDITIONAL_EXECUTION_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/execution_space.hpp b/include/plssvm/backends/Kokkos/detail/execution_space.hpp
deleted file mode 100644
index 8e89975c3..000000000
--- a/include/plssvm/backends/Kokkos/detail/execution_space.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * @file
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief Execution space enumeration for the ExecutionSpaces in Kokkos.
- */
-
-#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_EXECUTION_SPACE_HPP_
-#define PLSSVM_BACKENDS_KOKKOS_DETAIL_EXECUTION_SPACE_HPP_
-#pragma once
-
-#include "fmt/base.h"     // fmt::formatter
-#include "fmt/ostream.h"  // fmt::ostream_formatter
-
-#include <iosfwd>  // std::ostream forward declaration
-
-namespace plssvm::kokkos::detail {
-
-enum class execution_space {
-    cuda,
-    hip,
-    sycl,
-    hpx,
-    openmp,
-    openmp_target,
-    openacc,
-    threads,
-    serial
-};
-
-std::ostream &operator<<(std::ostream &out, execution_space space);
-
-}  // namespace plssvm::kokkos::detail
-
-template <>
-struct fmt::formatter<plssvm::kokkos::detail::execution_space> : fmt::ostream_formatter { };
-
-#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_EXECUTION_SPACE_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/utility.hpp b/include/plssvm/backends/Kokkos/detail/utility.hpp
index 523900aa9..b7e732aff 100644
--- a/include/plssvm/backends/Kokkos/detail/utility.hpp
+++ b/include/plssvm/backends/Kokkos/detail/utility.hpp
@@ -13,8 +13,9 @@
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/detail/execution_space.hpp"  // plssvm::kokkos::detail::execution_space
-#include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
+#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
+#include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
+#include "plssvm/target_platforms.hpp"                              // plssvm::target_platform
 
 #include "Kokkos_Core.hpp"  // TODO: ?
 
@@ -24,64 +25,40 @@
 
 namespace plssvm::kokkos::detail {
 
-template <typename ExecSpace>
-[[nodiscard]] execution_space determine_execution_space() noexcept {
-    // determine the execution_space enumeration value based on the provided Kokkos execution space
-#if defined(KOKKOS_ENABLE_CUDA)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Cuda>) {
-        return execution_space::cuda;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::HIP>) {
-        return execution_space::hip;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_SYCL)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::SYCL>) {
-        return execution_space::sycl;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_HPX)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::HPX>) {
-        return execution_space::hpx;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENMP)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMP>) {
-        return execution_space::openmp;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMPTarget>) {
-        return execution_space::openmp_target;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENACC)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::OpenACC>) {
-        return execution_space::openacc;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_THREADS)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Threads>) {
-        return execution_space::threads;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_SERIAL)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Serial>) {
-        return execution_space::serial;
-    }
-#endif
-}
-
 [[nodiscard]] target_platform determine_default_target_platform_from_execution_space(execution_space space);
 
 void check_execution_space_target_platform_combination(execution_space space, target_platform target);
 
-[[nodiscard]] std::string get_device_name(execution_space space, std::size_t device_id);
-
-void device_synchronize(const Kokkos::DefaultExecutionSpace& exec);
+template <typename ExecSpace>
+[[nodiscard]] inline std::string get_device_name(const execution_space space, [[maybe_unused]] const ExecSpace &exec) {
+    // TODO: implement for other backends!
+    switch (space) {
+        case execution_space::cuda:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
+                return std::string{ exec.cuda_device_prop().name };
+            });
+        case execution_space::hip:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+                return std::string{ exec.hip_device_prop().name };
+            });
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+                return exec.sycl_queue.get_device().get_info<sycl::info::device::name>();
+            });
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
+            return "CPU host device";
+        case execution_space::openmp_target:
+            return "OpenMP target device";
+        case execution_space::openacc:
+            return "OpenACC target device";
+    }
+    return "unknown";
+}
 
+void device_synchronize(const Kokkos::DefaultExecutionSpace &exec);
 
 [[nodiscard]] std::string get_kokkos_version();
 
diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
new file mode 100644
index 000000000..adde9892f
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -0,0 +1,134 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Execution space enumeration for the ExecutionSpaces in Kokkos.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_
+#pragma once
+
+#include "plssvm/detail/utility.hpp"  // plssvm::unreachable
+
+#include "Kokkos_Core.hpp"  // Kokkos macros, Kokkos ExecutionSpace types
+
+#include "fmt/base.h"     // fmt::formatter
+#include "fmt/ostream.h"  // fmt::ostream_formatter
+
+#include <iosfwd>       // std::ostream forward declaration
+#include <type_traits>  // std::is_same_v
+
+namespace plssvm::kokkos {
+
+/**
+ * @brief Enum class for all execution spaces supported by [Kokkos](https://github.com/kokkos/kokkos).
+ */
+enum class execution_space {
+    /** Execution space representing execution on a CUDA device. */
+    cuda,
+    /** Execution space representing execution on a device supported by HIP. */
+    hip,
+    /** Execution space representing execution on a device supported by SYCL. */
+    sycl,
+    /** Execution space representing execution with the HPX runtime system. */
+    hpx,
+    /** Execution space representing execution with the OpenMP runtime system. */
+    openmp,
+    /** Execution space representing execution using the target offloading feature of the OpenMP runtime system. */
+    openmp_target,
+    /** Execution space representing execution with the OpenACC runtime system. */
+    openacc,
+    /** Execution space representing parallel execution with std::threads. */
+    threads,
+    /** Execution space representing serial execution on the CPU. Always available. */
+    serial
+};
+
+/**
+ * @brief Create an `execution_space` from the provided Kokkos @p ExecSpace.
+ * @tparam ExecSpace the type of the provided Kokkos ExecutionSpace
+ * @return the enum value representing the provided Kokkos ExecutionSpace (`[[nodiscard]]`)
+ */
+template <typename ExecSpace>
+[[nodiscard]] inline execution_space determine_execution_space() noexcept {
+    // determine the execution_space enumeration value based on the provided Kokkos execution space
+#if defined(KOKKOS_ENABLE_CUDA)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Cuda>) {
+        return execution_space::cuda;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::HIP>) {
+        return execution_space::hip;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::SYCL>) {
+        return execution_space::sycl;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::HPX>) {
+        return execution_space::hpx;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMP>) {
+        return execution_space::openmp;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMPTarget>) {
+        return execution_space::openmp_target;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::OpenACC>) {
+        return execution_space::openacc;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Threads>) {
+        return execution_space::threads;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Serial>) {
+        return execution_space::serial;
+    }
+#endif
+    // at least one execution space must always be available!
+    ::plssvm::detail::unreachable();
+}
+
+/**
+ * @brief Output the execution @p space to the given output-stream @p out.
+ * @param[in,out] out the output-stream to write the execution space to
+ * @param[in] space the Kokkos execution space
+ * @return the output-stream
+ */
+std::ostream &operator<<(std::ostream &out, execution_space space);
+
+/**
+ * @brief Use the input-stream @p in to initialize the execution @p space.
+ * @param[in,out] in input-stream to extract the execution space from
+ * @param[in] space the Kokkos execution space
+ * @return the input-stream
+ */
+std::istream &operator>>(std::istream &in, execution_space &space);
+
+}  // namespace plssvm::kokkos
+
+/// @endcond
+
+template <>
+struct fmt::formatter<plssvm::kokkos::execution_space> : fmt::ostream_formatter { };
+
+/// @endcond
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_
diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
index d7d1037ce..89cf282ce 100644
--- a/src/plssvm/backends/Kokkos/CMakeLists.txt
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -23,11 +23,11 @@ message(CHECK_PASS "found")
 # explicitly set sources
 set(PLSSVM_KOKKOS_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
-    ${CMAKE_CURRENT_LIST_DIR}/detail/execution_space.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp
     ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp
     ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/execution_space.cpp
 )
 
 # set target properties
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index f113c7e2c..114c8738a 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -9,27 +9,29 @@
 #include "plssvm/backends/Kokkos/csvm.hpp"
 
 #include "plssvm/backends/execution_range.hpp"                                        // plssvm::detail::{execution_range, dim_type}
+#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"                    // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"                               // plssvm::kokkos::detail::device_ptr
-#include "plssvm/backends/Kokkos/detail/execution_space.hpp"                          // plssvm::kokkos::detail::execution_space
 #include "plssvm/backends/Kokkos/detail/utility.hpp"                                  // plssvm::kokkos::detail::get_runtime_version
 #include "plssvm/backends/Kokkos/exceptions.hpp"                                      // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/execution_space.hpp"                                 // plssvm::kokkos::execution_space
 #include "plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp"                         // plssvm::kokkos::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale}
 #include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp"       // plssvm::kokkos::detail::device_kernel_assembly
 #include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp"  // plssvm::kokkos::detail::device_kernel_assembly_symm
 #include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp"                           // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict}
 #include "plssvm/constants.hpp"                                                       // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE
+#include "plssvm/detail/assert.hpp"                                                   // PLSSVM_ASSERT
 #include "plssvm/detail/data_distribution.hpp"                                        // plssvm::detail::triangular_data_distribution
 #include "plssvm/detail/logging.hpp"                                                  // plssvm::detail::log
 #include "plssvm/detail/memory_size.hpp"                                              // plssvm::detail::memory_size
 #include "plssvm/detail/tracking/performance_tracker.hpp"                             // plssvm::detail::tracking::tracking_entry
-#include "plssvm/detail/utility.hpp"                                                  // plssvm::detail::unreachable // TODO: remove
+#include "plssvm/detail/utility.hpp"                                                  // plssvm::detail::{get_system_memory, unreachable}
 #include "plssvm/exceptions/exceptions.hpp"                                           // plssvm::exception
 #include "plssvm/kernel_function_types.hpp"                                           // plssvm::kernel_function_type
 #include "plssvm/parameter.hpp"                                                       // plssvm::parameter
 #include "plssvm/target_platforms.hpp"                                                // plssvm::target_platform
 #include "plssvm/verbosity_levels.hpp"                                                // plssvm::verbosity_level
 
-#include "Kokkos_Core.hpp"  // TODO:
+#include "Kokkos_Core.hpp"  // TODO: docu
 
 #include "fmt/core.h"    // fmt::format
 #include "fmt/format.h"  // fmt::format
@@ -46,7 +48,8 @@ csvm::csvm(parameter params) :
     csvm{ plssvm::target_platform::automatic, params } { }
 
 csvm::csvm(target_platform target, parameter params) :
-    base_type{ params } {
+    base_type{ params },
+    space_{ determine_execution_space<Kokkos::DefaultExecutionSpace>() } {
     this->init(target);
 }
 
@@ -77,11 +80,6 @@ void csvm::init(const target_platform target) {
             break;
     }
 
-    // TODO: document: we ALWAYS use the default execution space
-
-    // set the execution space -> we always only use the Kokkos::DefaultExecutionSpace
-    space_ = detail::determine_execution_space<Kokkos::DefaultExecutionSpace>();
-
     plssvm::detail::log(verbosity_level::full,
                         "\nUsing Kokkos ({}) as backend with the Kokkos::DefaultExecutionSpace \"{}\".\n",
                         plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_version", detail::get_kokkos_version() },
@@ -129,7 +127,7 @@ void csvm::init(const target_platform target) {
     std::vector<std::string> device_names{};
     device_names.reserve(devices_.size());
     for (typename std::vector<queue_type>::size_type device = 0; device < devices_.size(); ++device) {
-        const std::string device_name = detail::get_device_name(space_, device);
+        const std::string device_name = detail::get_device_name(space_, devices_[device]);
         plssvm::detail::log(verbosity_level::full,
                             "  [{}, {}]\n",
                             device,
@@ -154,95 +152,127 @@ csvm::~csvm() {
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
-    // TODO: implement for other execution spaces, guard behind ifdef
+    // TODO: implement for other execution spaces
     std::vector<::plssvm::detail::memory_size> res(this->num_available_devices());
     switch (space_) {
-        case detail::execution_space::cuda:
-            {
-                cudaDeviceProp prop{};
+        case execution_space::cuda:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    cudaGetDeviceProperties(&prop, devices_[device_id].cuda_device());
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(prop.totalGlobalMem) };
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].cuda_device_prop().totalGlobalMem) };
                 }
-            }
-            break;
-        case detail::execution_space::hip:
-        case detail::execution_space::sycl:
-        case detail::execution_space::openmp_target:
-        case detail::execution_space::openacc:
-        case detail::execution_space::openmp:
-        case detail::execution_space::hpx:
-        case detail::execution_space::threads:
-        case detail::execution_space::serial:
+                return res;
+            });
+        case execution_space::hip:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+                for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].hip_device_prop().totalGlobalMem) };
+                }
+                return res;
+            });
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+                for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].sycl_queue().get_device().get_info<::sycl::info::device::global_mem_size>()) };
+                }
+                return res;
+            });
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
+            return std::vector<::plssvm::detail::memory_size>(this->num_available_devices(), ::plssvm::detail::get_system_memory());
+        case execution_space::openmp_target:
+        case execution_space::openacc:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
-
-    return res;
+    // all possible cases should be handled by the previous switch
+    // -> silence missing return statement compiler warnings due to throw statement
+    ::plssvm::detail::unreachable();
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const {
-    // TODO: implement for other execution spaces, guard behind ifdef
+    // TODO: implement for other execution spaces
     switch (space_) {
-        case detail::execution_space::cuda:
+        case execution_space::cuda:
+        case execution_space::hip:
+            return this->get_device_memory();
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+                for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].sycl_queue().get_device().get_info<::sycl::info::device::max_mem_alloc_size>()) };
+                }
+                return res;
+            });
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
             return this->get_device_memory();
-        case detail::execution_space::hip:
-        case detail::execution_space::sycl:
-        case detail::execution_space::openmp_target:
-        case detail::execution_space::openacc:
-        case detail::execution_space::openmp:
-        case detail::execution_space::hpx:
-        case detail::execution_space::threads:
-        case detail::execution_space::serial:
+        case execution_space::openmp_target:
+        case execution_space::openacc:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
+    // all possible cases should be handled by the previous switch
+    // -> silence missing return statement compiler warnings due to throw statement
     ::plssvm::detail::unreachable();
 }
 
 std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
     PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
 
-    // TODO: implement for other execution spaces, guard behind ifdef
+    // TODO: implement for other execution spaces
     switch (space_) {
-        case detail::execution_space::cuda:
-            {
-                cudaDeviceProp prop{};
-                cudaGetDeviceProperties(&prop, devices_[device_id].cuda_device());
-                return static_cast<std::size_t>(prop.maxThreadsPerBlock);
-            }
-        case detail::execution_space::hip:
-        case detail::execution_space::sycl:
-        case detail::execution_space::openmp_target:
-        case detail::execution_space::openacc:
-        case detail::execution_space::openmp:
-        case detail::execution_space::hpx:
-        case detail::execution_space::threads:
-        case detail::execution_space::serial:
+        case execution_space::cuda:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
+                return static_cast<std::size_t>(devices_[device_id].cuda_device_prop().maxThreadsPerBlock);
+            });
+        case execution_space::hip:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+                return static_cast<std::size_t>(devices_[device_id].hip_device_prop().maxThreadsPerBlock);
+            });
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+                return devices_[device_id].sycl_queue().get_device().get_info<::sycl::info::device::max_work_group_size>();
+            });
+        case execution_space::openmp_target:
+        case execution_space::openacc:
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
+    // all possible cases should be handled by the previous switch
+    // -> silence missing return statement compiler warnings due to throw statement
     ::plssvm::detail::unreachable();
 }
 
 ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) const {
     PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
 
-    // TODO: implement for other execution spaces, guard behind ifdef
+    // TODO: implement for other execution spaces
     switch (space_) {
-        case detail::execution_space::cuda:
-            {
-                cudaDeviceProp prop{};
-                cudaGetDeviceProperties(&prop, devices_[device_id].cuda_device());
+        case execution_space::cuda:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(([&]() -> ::plssvm::detail::dim_type {
+                const cudaDeviceProp &prop = devices_[device_id].cuda_device_prop();
+                return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
+            }));
+        case execution_space::hip:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(([&]() -> ::plssvm::detail::dim_type {
+                const hipDeviceProp &prop = devices_[device_id].hip_device_prop();
                 return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
-            }
-        case detail::execution_space::hip:
-        case detail::execution_space::sycl:
-        case detail::execution_space::openmp_target:
-        case detail::execution_space::openacc:
-        case detail::execution_space::openmp:
-        case detail::execution_space::hpx:
-        case detail::execution_space::threads:
-        case detail::execution_space::serial:
+            }));
+        case execution_space::sycl:
+        case execution_space::openmp_target:
+        case execution_space::openacc:
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
+    // all possible cases should be handled by the previous switch
+    // -> silence missing return statement compiler warnings due to throw statement
     ::plssvm::detail::unreachable();
 }
 
diff --git a/src/plssvm/backends/Kokkos/detail/execution_space.cpp b/src/plssvm/backends/Kokkos/detail/execution_space.cpp
deleted file mode 100644
index 65afa72b1..000000000
--- a/src/plssvm/backends/Kokkos/detail/execution_space.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- */
-
-#include "plssvm/backends/Kokkos/detail/execution_space.hpp"
-
-#include <ostream>  // std::ostream
-
-namespace plssvm::kokkos::detail {
-
-std::ostream &operator<<(std::ostream &out, const execution_space space) {
-    switch (space) {
-        case execution_space::cuda:
-            return out << "Cuda";
-        case execution_space::hip:
-            return out << "HIP";
-        case execution_space::sycl:
-            return out << "SYCL";
-        case execution_space::hpx:
-            return out << "HPX";
-        case execution_space::openmp:
-            return out << "OpenMP";
-        case execution_space::openmp_target:
-            return out << "OpenMPTarget";
-        case execution_space::openacc:
-            return out << "OpenACC";
-        case execution_space::threads:
-            return out << "Threads";
-        case execution_space::serial:
-            return out << "Serial";
-    }
-    return out << "unknown";
-}
-
-}  // namespace plssvm::kokkos::detail
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index 4505c0515..ac53ffc48 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -8,11 +8,11 @@
 
 #include "plssvm/backends/Kokkos/detail/utility.hpp"
 
-#include "plssvm/backends/Kokkos/detail/execution_space.hpp"  // plssvm::kokkos::detail::execution_space
-#include "plssvm/backends/Kokkos/exceptions.hpp"              // plssvm::kokkos::backend_exception
-#include "plssvm/detail/assert.hpp"                           // PLSSVM_ASSERT
-#include "plssvm/detail/utility.hpp"                          // plssvm::detail::unreachable
-#include "plssvm/target_platforms.hpp"                        // plssvm::target_platform
+#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
+#include "plssvm/detail/assert.hpp"                    // PLSSVM_ASSERT
+#include "plssvm/detail/utility.hpp"                   // plssvm::detail::unreachable
+#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
 
 #include "Kokkos_Macros.hpp"
 
@@ -86,47 +86,7 @@ void check_execution_space_target_platform_combination(const execution_space spa
 
 // TODO: error checks?
 
-std::string get_device_name(const execution_space space, const std::size_t device_id) {
-    // TODO: implement for other backends!
-    switch (space) {
-        case execution_space::cuda:
-#if defined(KOKKOS_ENABLE_CUDA)
-            {
-                cudaDeviceProp prop{};
-                cudaGetDeviceProperties(&prop, static_cast<int>(device_id));
-                return std::string{ prop.name };
-            }
-#else
-            throw backend_exception{ fmt::format("Unsupported Kokkos execution space \"{}\"!", space) };
-#endif
-        case execution_space::hip:
-#if defined(KOKKOS_ENABLE_HIP)
-            {
-                hipDeviceProp_t prop{};
-                hipGetDeviceProperties(&prop, static_cast<int>(device_id));
-                return std::string{ prop.name };
-            }
-#else
-            throw backend_exception{ fmt::format("Unsupported Kokkos execution space \"{}\"!", space) };
-#endif
-        case execution_space::openmp:
-#if defined(KOKKOS_ENABLE_HIP)
-            return "CPU host device";
-#else
-            throw backend_exception{ fmt::format("Unsupported Kokkos execution space \"{}\"!", space) };
-#endif
-        case execution_space::sycl:
-        case execution_space::hpx:
-        case execution_space::openmp_target:
-        case execution_space::openacc:
-        case execution_space::threads:
-        case execution_space::serial:
-            throw backend_exception{ fmt::format("Unsupported Kokkos execution space \"{}\"!", space) };
-    }
-    return "unknown";
-}
-
-void device_synchronize(const Kokkos::DefaultExecutionSpace& exec) {
+void device_synchronize(const Kokkos::DefaultExecutionSpace &exec) {
     exec.fence();
 }
 
@@ -135,4 +95,6 @@ std::string get_kokkos_version() {
     return fmt::format("{}.{}.{}", KOKKOS_VERSION_MAJOR, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH);
 }
 
+// TODO: https://godbolt.org/z/eMYrbxsTj
+
 }  // namespace plssvm::kokkos::detail
diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp
new file mode 100644
index 000000000..5453e11d8
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/execution_space.cpp
@@ -0,0 +1,74 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/execution_space.hpp"
+
+#include "plssvm/detail/string_utility.hpp"  // plssvm::detail::to_lower_case
+#include "plssvm/detail/utility.hpp"         // plssvm::detail::contains
+
+#include <ios>      // std::ios::failbit
+#include <istream>  // std::istream
+#include <ostream>  // std::ostream
+#include <string>   // std::string
+
+namespace plssvm::kokkos {
+
+std::ostream &operator<<(std::ostream &out, const execution_space space) {
+    switch (space) {
+        case execution_space::cuda:
+            return out << "Cuda";
+        case execution_space::hip:
+            return out << "HIP";
+        case execution_space::sycl:
+            return out << "SYCL";
+        case execution_space::hpx:
+            return out << "HPX";
+        case execution_space::openmp:
+            return out << "OpenMP";
+        case execution_space::openmp_target:
+            return out << "OpenMPTarget";
+        case execution_space::openacc:
+            return out << "OpenACC";
+        case execution_space::threads:
+            return out << "Threads";
+        case execution_space::serial:
+            return out << "Serial";
+    }
+    return out << "unknown";
+}
+
+std::istream &operator>>(std::istream &in, execution_space &space) {
+    std::string str{};
+    in >> str;
+    ::plssvm::detail::to_lower_case(str);
+
+    if (str == "cuda") {
+        space = execution_space::cuda;
+    } else if (str == "hip") {
+        space = execution_space::hip;
+    } else if (str == "sycl") {
+        space = execution_space::sycl;
+    } else if (str == "hpx") {
+        space = execution_space::hpx;
+    } else if (str == "openmp") {
+        space = execution_space::openmp;
+    } else if (str == "openmp_target") {
+        space = execution_space::openmp_target;
+    } else if (str == "openacc") {
+        space = execution_space::openacc;
+    } else if (str == "threads") {
+        space = execution_space::threads;
+    } else if (str == "serial") {
+        space = execution_space::serial;
+    } else {
+        in.setstate(std::ios::failbit);
+    }
+    return in;
+}
+
+}  // namespace plssvm::kokkos

From 90ff2728f584795e8a3fd72798bcb931a14d457d Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 15:34:44 +0200
Subject: [PATCH 017/123] Create get_device_list function and add missing
 documentation.

---
 .../plssvm/backends/Kokkos/detail/utility.hpp |  70 +++++------
 src/plssvm/backends/Kokkos/csvm.cpp           |  15 +--
 src/plssvm/backends/Kokkos/detail/utility.cpp | 109 +++++++++++++++---
 3 files changed, 130 insertions(+), 64 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/utility.hpp b/include/plssvm/backends/Kokkos/detail/utility.hpp
index b7e732aff..e29468830 100644
--- a/include/plssvm/backends/Kokkos/detail/utility.hpp
+++ b/include/plssvm/backends/Kokkos/detail/utility.hpp
@@ -13,53 +13,57 @@
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
 #include "plssvm/target_platforms.hpp"                              // plssvm::target_platform
 
-#include "Kokkos_Core.hpp"  // TODO: ?
+#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
 
-#include <cstddef>      // std::size_t
-#include <string>       // std::string
-#include <type_traits>  // std::is_same_v
+#include <string>  // std::string
+#include <vector>  // std::vector
 
 namespace plssvm::kokkos::detail {
 
+/**
+ * @brief Given the execution @p space, determine the respective default target platform.
+ * @param[in] space the Kokkos::ExecutionSpace for which the default target platform should be determined
+ * @return the default target platform (`[[nodiscard]]`)
+ */
 [[nodiscard]] target_platform determine_default_target_platform_from_execution_space(execution_space space);
 
+/**
+ * @brief Check whether the execution @p space supports the @p target platform. Throws an `plssvm::kokkos::backend_exception` if that's not the case.
+ * @param[in] space the Kokkos::ExecutionSpace to investigate
+ * @param[in] target the target platform to check
+ * @throws plssvm::kokkos::backend_exception if @p space doesn't support the @p target platform
+ */
 void check_execution_space_target_platform_combination(execution_space space, target_platform target);
 
-template <typename ExecSpace>
-[[nodiscard]] inline std::string get_device_name(const execution_space space, [[maybe_unused]] const ExecSpace &exec) {
-    // TODO: implement for other backends!
-    switch (space) {
-        case execution_space::cuda:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
-                return std::string{ exec.cuda_device_prop().name };
-            });
-        case execution_space::hip:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
-                return std::string{ exec.hip_device_prop().name };
-            });
-        case execution_space::sycl:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
-                return exec.sycl_queue.get_device().get_info<sycl::info::device::name>();
-            });
-        case execution_space::openmp:
-        case execution_space::hpx:
-        case execution_space::threads:
-        case execution_space::serial:
-            return "CPU host device";
-        case execution_space::openmp_target:
-            return "OpenMP target device";
-        case execution_space::openacc:
-            return "OpenACC target device";
-    }
-    return "unknown";
-}
+/**
+ * @brief Get a list of all available devices in the execution @p space that are supported by the @p target platform.
+ * @param[in] space the Kokkos::ExecutionSpace to retrieve the devices from
+ * @param[in] target the target platform that must be supported
+ * @return all devices for the @p target in the Kokkos::ExecutionSpace @p space (`[[nodiscard]]`)
+ */
+[[nodiscard]] std::vector<Kokkos::DefaultExecutionSpace> get_device_list(execution_space space, target_platform target);
 
+/**
+ * @brief Get the name of the device represented by the Kokkos::ExecutionSpace @p exec in the execution @p space.
+ * @param[in] space the Kokkos::ExecutionSpace
+ * @param[in] exec the device
+ * @return the device name (`[[nodiscard]]`)
+ */
+[[nodiscard]] std::string get_device_name(execution_space space, const Kokkos::DefaultExecutionSpace &exec);
+
+/**
+ * @brief Wait for all kernel and/or other operations on the Kokkos::ExecutionSpace @p exec to finish
+ * @param[in] exec the Kokkos::ExecutionSpace to synchronize
+ */
 void device_synchronize(const Kokkos::DefaultExecutionSpace &exec);
 
+/**
+ * @brief Get the used Kokkos library version.
+ * @return the library version (`[[nodiscard]]`)
+ */
 [[nodiscard]] std::string get_kokkos_version();
 
 }  // namespace plssvm::kokkos::detail
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 114c8738a..f2a40050c 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -100,20 +100,9 @@ void csvm::init(const target_platform target) {
     }
 
     // get all available devices wrt the requested target platform
-// TODO: HOW CAN ONE USE MULTIPLE KOKKOS DEVICES
-// TODO: implement for other Kokkos execution spaces
-#if defined(KOKKOS_ENABLE_CUDA)
-    for (int device = 0; device < Kokkos::num_devices(); ++device) {
-        // create CUDA stream using the CUDA specific functions
-        cudaSetDevice(device);
-        cudaStream_t stream{};
-        cudaStreamCreate(&stream);
-        // create Kokkos execution space for the specific device
-        devices_.emplace_back(Kokkos::Cuda(stream, true));
-    }
-#endif
+    devices_ = detail::get_device_list(space_, target_);
 
-    // throw exception if no CUDA devices could be found
+    // throw exception if no devices in the current execution space could be found
     if (devices_.empty()) {
         throw backend_exception{ fmt::format("Not devices found for the Kokkos execution space {} with the target platform {}!", space_, target_) };
     }
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index ac53ffc48..b7d58be1d 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -8,25 +8,20 @@
 
 #include "plssvm/backends/Kokkos/detail/utility.hpp"
 
-#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
-#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
-#include "plssvm/detail/assert.hpp"                    // PLSSVM_ASSERT
-#include "plssvm/detail/utility.hpp"                   // plssvm::detail::unreachable
-#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
+#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
+#include "plssvm/backends/Kokkos/exceptions.hpp"                    // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
+#include "plssvm/detail/assert.hpp"                                 // PLSSVM_ASSERT
+#include "plssvm/detail/utility.hpp"                                // plssvm::detail::unreachable
+#include "plssvm/target_platforms.hpp"                              // plssvm::target_platform
 
-#include "Kokkos_Macros.hpp"
-
-#if defined(KOKKOS_ENABLE_CUDA)
-    #include "cuda_runtime.h"  // cudaDeviceProp, cudaGetDeviceProperties
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-    #include "hip/hip_runtime_api.h"  // HIP runtime functions
-#endif
+#include "Kokkos_Core.hpp"    // Kokkos::DefaultExecutionSpace, Kokkos::num_devices, Kokkos::Cuda, Kokkos::Hip, Kokkos::Sycl, Kokkos::Impl::ManageStream
+#include "Kokkos_Macros.hpp"  // Kokkos macros
 
 #include "fmt/core.h"  // fmt::format
 
-#include <cstddef>  // std::size_t
-#include <string>   // std::string
+#include <string>  // std::string
+#include <vector>  // std::vector
 
 namespace plssvm::kokkos::detail {
 
@@ -46,6 +41,8 @@ target_platform determine_default_target_platform_from_execution_space(const exe
         case execution_space::serial:
             return target_platform::cpu;
     }
+    // all possible cases should be handled by the previous switch
+    // -> silence missing return statement compiler warnings due to throw statement
     ::plssvm::detail::unreachable();
 }
 
@@ -84,7 +81,85 @@ void check_execution_space_target_platform_combination(const execution_space spa
     }
 }
 
-// TODO: error checks?
+std::vector<Kokkos::DefaultExecutionSpace> get_device_list(const execution_space space, [[maybe_unused]] const target_platform target) {
+    std::vector<Kokkos::DefaultExecutionSpace> devices{};
+    switch (space) {
+        case execution_space::cuda:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
+                for (int device = 0; device < Kokkos::num_devices(); ++device) {
+                    // create CUDA stream using the CUDA specific functions
+                    cudaSetDevice(device);
+                    cudaStream_t stream{};
+                    cudaStreamCreate(&stream);
+                    // create Kokkos execution space for the specific device
+                    // Note: it is important to pass the cudaStream_t lifetime to be managed by Kokkos
+                    devices.emplace_back(Kokkos::Cuda(stream, Kokkos::Impl::ManageStream::yes));
+                }
+                return devices;
+            });
+        case execution_space::hip:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+                for (int device = 0; device < Kokkos::num_devices(); ++device) {
+                    // HIP CUDA stream using the HIP specific functions
+                    hipSetDevice(device);
+                    hipStream_t stream{};
+                    hipStreamCreate(&stream);
+                    // create Kokkos execution space for the specific device
+                    // Note: it is important to pass the hipStream_t lifetime to be managed by Kokkos
+                    devices.emplace_back(Kokkos::Hip(stream, Kokkos::Impl::ManageStream::yes));
+                }
+                return devices;
+            });
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+                // TODO: use all available devices -> not that trivial
+                // TODO: handle target
+                devices.emplace_back(Kokkos::SYCL{});
+                return devices;
+            });
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
+            devices.emplace_back(Kokkos::DefaultExecutionSpace{});
+            return devices;
+        case execution_space::openmp_target:
+        case execution_space::openacc:
+            // TODO: implement
+            throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space) };
+    }
+    // all possible cases should be handled by the previous switch
+    // -> silence missing return statement compiler warnings due to throw statement
+    ::plssvm::detail::unreachable();
+}
+
+std::string get_device_name(const execution_space space, [[maybe_unused]] const Kokkos::DefaultExecutionSpace &exec) {
+    // TODO: implement for other backends!
+    switch (space) {
+        case execution_space::cuda:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
+                return std::string{ exec.cuda_device_prop().name };
+            });
+        case execution_space::hip:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+                return std::string{ exec.hip_device_prop().name };
+            });
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+                return exec.sycl_queue.get_device().get_info<sycl::info::device::name>();
+            });
+        case execution_space::openmp:
+        case execution_space::hpx:
+        case execution_space::threads:
+        case execution_space::serial:
+            return "CPU host device";
+        case execution_space::openmp_target:
+            return "OpenMP target device";
+        case execution_space::openacc:
+            return "OpenACC target device";
+    }
+    return "unknown";
+}
 
 void device_synchronize(const Kokkos::DefaultExecutionSpace &exec) {
     exec.fence();
@@ -95,6 +170,4 @@ std::string get_kokkos_version() {
     return fmt::format("{}.{}.{}", KOKKOS_VERSION_MAJOR, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH);
 }
 
-// TODO: https://godbolt.org/z/eMYrbxsTj
-
 }  // namespace plssvm::kokkos::detail

From 3f4bf8def6f704ff96c0fe127ca2de5dd9c64598 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 16:17:38 +0200
Subject: [PATCH 018/123] Move Kokkos::View typedefs to custom header.

---
 .../backends/Kokkos/detail/device_ptr.hpp     | 22 ++++--------
 .../backends/Kokkos/detail/typedefs.hpp       | 36 +++++++++++++++++++
 .../Kokkos/kernel/cg_explicit/blas.hpp        | 12 +++----
 .../cg_explicit/kernel_matrix_assembly.hpp    |  4 +--
 .../kernel_matrix_assembly_blas.hpp           |  6 ++--
 .../backends/Kokkos/kernel/predict_kernel.hpp |  6 ++--
 .../backends/Kokkos/detail/device_ptr.cpp     | 26 +++++++-------
 7 files changed, 71 insertions(+), 41 deletions(-)
 create mode 100644 include/plssvm/backends/Kokkos/detail/typedefs.hpp

diff --git a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
index 8f587b667..98194bcfb 100644
--- a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
@@ -13,24 +13,16 @@
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_
 #pragma once
 
-#include "plssvm/backends/gpu_device_ptr.hpp"  // plssvm::detail::gpu_device_ptr
-#include "plssvm/shape.hpp"                    // plssvm::shape
+#include "plssvm/backends/gpu_device_ptr.hpp"          // plssvm::detail::gpu_device_ptr
+#include "plssvm/backends/Kokkos/detail/typedefs.hpp"  // plssvm::kokkos::detail::device_view_type
+#include "plssvm/shape.hpp"                            // plssvm::shape
 
-#include "Kokkos_Core.hpp"  // TODO: Kokkos::DefaultExecutionSpace
+#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
 
 #include <cstddef>  // std::size_t
 
 namespace plssvm::kokkos::detail {
 
-template <typename T>
-using device_view_type = Kokkos::View<T *, Kokkos::DefaultExecutionSpace>;
-
-template <typename T>
-using device_subview_type = Kokkos::Subview<T *, Kokkos::DefaultExecutionSpace>;
-
-template <typename T>
-using host_view_type = Kokkos::View<T *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
-
 /**
  * @brief Small wrapper class around a Kokkos view together with commonly used device functions.
  * @tparam T the type of the kernel view to wrap
@@ -70,20 +62,20 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, Kokkos::DefaultExe
      * @param[in] size the number of elements represented by the device_ptr
      * @param[in] exec the associated Kokkos execution space
      */
-    explicit device_ptr(size_type size, Kokkos::DefaultExecutionSpace exec);
+    explicit device_ptr(size_type size, const Kokkos::DefaultExecutionSpace &exec);
     /**
      * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes in the Kokkos execution space @p exec.
      * @param[in] shape the number of elements represented by the device_ptr
      * @param[in] exec the associated Kokkos execution space
      */
-    explicit device_ptr(plssvm::shape shape, Kokkos::DefaultExecutionSpace exec);
+    explicit device_ptr(plssvm::shape shape, const Kokkos::DefaultExecutionSpace &exec);
     /**
      * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes in the Kokkos execution space @p exec.
      * @param[in] shape the number of elements represented by the device_ptr
      * @param[in] padding the number of padding elements added to the extent values
      * @param[in] exec the associated Kokkos execution space
      */
-    device_ptr(plssvm::shape shape, plssvm::shape padding, Kokkos::DefaultExecutionSpace exec);
+    device_ptr(plssvm::shape shape, plssvm::shape padding, const Kokkos::DefaultExecutionSpace &exec);
 
     /**
      * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
diff --git a/include/plssvm/backends/Kokkos/detail/typedefs.hpp b/include/plssvm/backends/Kokkos/detail/typedefs.hpp
new file mode 100644
index 000000000..61fffbb31
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/typedefs.hpp
@@ -0,0 +1,36 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief A few convenient Kokkos::View typedefs.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_TYPEDEFS_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_TYPEDEFS_HPP_
+#pragma once
+
+#include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::DefaultExecutionSpace, Kokkos::HostSpace, Kokkos::MemoryUnmanaged
+
+namespace plssvm::kokkos::detail {
+
+/**
+ * @brief Typedef for a simple Kokkos::View targeting the Kokkos::DefaultExecutionSpace.
+ * @tparam T the type of the view's data
+ */
+template <typename T>
+using device_view_type = Kokkos::View<T *, Kokkos::DefaultExecutionSpace>;
+
+/**
+ * @brief Typedef for a simple Kokkos::View always targeting the Kokkos::HostSpace.
+ * @tparam T the type of the view's data
+ */
+template <typename T>
+using host_view_type = Kokkos::View<T *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_TYPEDEFS_HPP_
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
index c12220b0b..85997c118 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
@@ -13,10 +13,12 @@
 #define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"  // TODO: view type aliases
-#include "plssvm/constants.hpp"                          // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/backends/Kokkos/detail/typedefs.hpp"  // plssvm::kokkos::detail::device_view_type
+#include "plssvm/constants.hpp"                        // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 
-#include "Kokkos_Core.hpp"  // TODO:
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents
+
+#include <cstddef>  // std::size_t
 
 namespace plssvm::kokkos::detail {
 
@@ -317,9 +319,7 @@ class device_kernel_inplace_matrix_add {
                 const auto global_i = i + static_cast<std::size_t>(internal_i);
                 const auto global_j = j + static_cast<std::size_t>(internal_j);
 
-                // if (global_i < lhs_.extent(0) && global_j < rhs_.extent(0)) {  // TODO:
                 lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j];
-                // }
             }
         }
     }
@@ -378,9 +378,7 @@ class device_kernel_inplace_matrix_scale {
                 const auto global_i = i + static_cast<std::size_t>(internal_i);
                 const auto global_j = j + static_cast<std::size_t>(internal_j);
 
-                // if (global_i < lhs_.extent(0)) {  // TODO:
                 lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] *= scale_;
-                // }
             }
         }
     }
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
index ad9397377..550dbfe0e 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
@@ -13,13 +13,13 @@
 #define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"             // TODO: view type aliases
 #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp"  // plssvm::kokkos::detail::standard_layout_tuple
+#include "plssvm/backends/Kokkos/detail/typedefs.hpp"               // plssvm::kokkos::detail::device_view_type
 #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"       // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                                     // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                         // plssvm::kernel_function_type
 
-#include "Kokkos_Core.hpp"  // TODO:
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents
 
 #include <cstddef>  // std::size_t
 
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
index 2f0f6619c..cf73cadb4 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
@@ -13,13 +13,15 @@
 #define PLSSVM_BACKENDS_KOKKOS_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"             // TODO: view type aliases
 #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp"  // plssvm::kokkos::detail::standard_layout_tuple
+#include "plssvm/backends/Kokkos/detail/typedefs.hpp"               // plssvm::kokkos::detail::device_view_type
 #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"       // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                                     // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                         // plssvm::kernel_function_type
 
-#include "Kokkos_Core.hpp"  // TODO: Kokkos::atomic_add
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add
+
+#include <cstddef>  // std::size_t
 
 namespace plssvm::kokkos::detail {
 
diff --git a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
index 629a0901f..c6a302d6d 100644
--- a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
@@ -13,12 +13,14 @@
 #define PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"        // TODO: view type aliases
+#include "plssvm/backends/Kokkos/detail/typedefs.hpp"          // plssvm::kokkos::detail::device_view_type
 #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"  // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                                // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                    // plssvm::kernel_function_type
 
-#include "Kokkos_Core.hpp"  // TODO: Kokkos::atomic_add
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add
+
+#include <cstddef>  // std::size_t
 
 namespace plssvm::kokkos::detail {
 
diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
index b176c1283..cbf973ca4 100644
--- a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -8,36 +8,36 @@
 
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"
 
-#include "plssvm/backends/Kokkos/detail/utility.hpp"  // plssvm::detail::device_synchronize
-#include "plssvm/backends/Kokkos/exceptions.hpp"      // plssvm::kokkos::backend_exception
-#include "plssvm/detail/assert.hpp"                   // PLSSVM_ASSERT
-#include "plssvm/exceptions/exceptions.hpp"           // plssvm::exception
-#include "plssvm/shape.hpp"                           // plssvm::shape
+#include "plssvm/backends/Kokkos/detail/typedefs.hpp"  // plssvm::kokkos::detail::{device_view_type, host_view_type}
+#include "plssvm/backends/Kokkos/detail/utility.hpp"   // plssvm::detail::device_synchronize
+#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
+#include "plssvm/detail/assert.hpp"                    // PLSSVM_ASSERT
+#include "plssvm/shape.hpp"                            // plssvm::shape
 
-#include "Kokkos_Core.hpp"
+#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace, Kokkos::subview, Kokkos::parallel_for, KOKKOS_LAMBDA, Kokkos::deep_copy
 
 #include "fmt/core.h"  // fmt::format
 
+#include <algorithm>  // std::min
 #include <cstddef>    // std::size_t
-#include <exception>  // std::terminate
-#include <iostream>   // std::cout, std::endl
+#include <cstring>    // std::memcpy
+#include <utility>    // std::make_pair
 #include <vector>     // std::vector
 
 namespace plssvm::kokkos::detail {
 
 template <typename T>
-device_ptr<T>::device_ptr(const size_type size, const Kokkos::DefaultExecutionSpace exec) :
+device_ptr<T>::device_ptr(const size_type size, const Kokkos::DefaultExecutionSpace &exec) :
     device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, exec } { }
 
 template <typename T>
-device_ptr<T>::device_ptr(const plssvm::shape shape, const Kokkos::DefaultExecutionSpace exec) :
+device_ptr<T>::device_ptr(const plssvm::shape shape, const Kokkos::DefaultExecutionSpace &exec) :
     device_ptr{ shape, plssvm::shape{ 0, 0 }, exec } { }
 
 template <typename T>
-device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const Kokkos::DefaultExecutionSpace exec) :
+device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const Kokkos::DefaultExecutionSpace &exec) :
     base_type{ shape, padding, exec } {
-    // TODO: GUARD behind ifdef!
-    data_ = device_view_type<T>{ fmt::format("device_{}_view", exec.cuda_device()), this->size_padded() };
+    data_ = device_view_type<T>{ "device_ptr_view", this->size_padded() };
 }
 
 template <typename T>

From 33ecfe9e0edc3393d5f8c0e61ddbde732f53fa97 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 16:56:25 +0200
Subject: [PATCH 019/123] Add additional execution_space query function and
 tests.

---
 .../backends/Kokkos/execution_space.hpp       |  72 ++----------
 src/plssvm/backends/Kokkos/csvm.cpp           |   2 +-
 .../backends/Kokkos/execution_space.cpp       | 106 ++++++++++++++++--
 tests/backends/Kokkos/CMakeLists.txt          |   1 +
 tests/backends/Kokkos/detail/device_ptr.cpp   |   2 +
 tests/backends/Kokkos/execution_space.cpp     |  79 +++++++++++++
 6 files changed, 194 insertions(+), 68 deletions(-)
 create mode 100644 tests/backends/Kokkos/execution_space.cpp

diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
index adde9892f..6d9d84e3f 100644
--- a/include/plssvm/backends/Kokkos/execution_space.hpp
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -13,15 +13,11 @@
 #define PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_
 #pragma once
 
-#include "plssvm/detail/utility.hpp"  // plssvm::unreachable
-
-#include "Kokkos_Core.hpp"  // Kokkos macros, Kokkos ExecutionSpace types
-
 #include "fmt/base.h"     // fmt::formatter
 #include "fmt/ostream.h"  // fmt::ostream_formatter
 
-#include <iosfwd>       // std::ostream forward declaration
-#include <type_traits>  // std::is_same_v
+#include <iosfwd>  // std::ostream forward declaration
+#include <vector>  // std::vector
 
 namespace plssvm::kokkos {
 
@@ -50,61 +46,17 @@ enum class execution_space {
 };
 
 /**
- * @brief Create an `execution_space` from the provided Kokkos @p ExecSpace.
- * @tparam ExecSpace the type of the provided Kokkos ExecutionSpace
- * @return the enum value representing the provided Kokkos ExecutionSpace (`[[nodiscard]]`)
+ * @brief Create an `execution_space` from the current `Kokkos::DefaultExecutionSpace`.
+ * @return the enum value representing the current `Kokkos::DefaultExecutionSpace` (`[[nodiscard]]`)
+ */
+[[nodiscard]] execution_space determine_execution_space() noexcept;
+
+/**
+ * @brief List all available Kokkos::ExecutionSpaces.
+ * @details At least one execution space must **always** be available!
+ * @return a vector containing all available execution spaces (`[[nodiscard]]`)
  */
-template <typename ExecSpace>
-[[nodiscard]] inline execution_space determine_execution_space() noexcept {
-    // determine the execution_space enumeration value based on the provided Kokkos execution space
-#if defined(KOKKOS_ENABLE_CUDA)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Cuda>) {
-        return execution_space::cuda;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::HIP>) {
-        return execution_space::hip;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_SYCL)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::SYCL>) {
-        return execution_space::sycl;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_HPX)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::HPX>) {
-        return execution_space::hpx;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENMP)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMP>) {
-        return execution_space::openmp;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::OpenMPTarget>) {
-        return execution_space::openmp_target;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENACC)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::OpenACC>) {
-        return execution_space::openacc;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_THREADS)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Threads>) {
-        return execution_space::threads;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_SERIAL)
-    if constexpr (std::is_same_v<ExecSpace, Kokkos::Serial>) {
-        return execution_space::serial;
-    }
-#endif
-    // at least one execution space must always be available!
-    ::plssvm::detail::unreachable();
-}
+[[nodiscard]] std::vector<execution_space> available_execution_spaces();
 
 /**
  * @brief Output the execution @p space to the given output-stream @p out.
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index f2a40050c..7d24b2a8b 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -49,7 +49,7 @@ csvm::csvm(parameter params) :
 
 csvm::csvm(target_platform target, parameter params) :
     base_type{ params },
-    space_{ determine_execution_space<Kokkos::DefaultExecutionSpace>() } {
+    space_{ determine_execution_space() } {
     this->init(target);
 }
 
diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp
index 5453e11d8..06a4c351e 100644
--- a/src/plssvm/backends/Kokkos/execution_space.cpp
+++ b/src/plssvm/backends/Kokkos/execution_space.cpp
@@ -8,13 +8,18 @@
 
 #include "plssvm/backends/Kokkos/execution_space.hpp"
 
+#include "plssvm/detail/assert.hpp"          // PLSSVM_ASSERT
 #include "plssvm/detail/string_utility.hpp"  // plssvm::detail::to_lower_case
-#include "plssvm/detail/utility.hpp"         // plssvm::detail::contains
+#include "plssvm/detail/utility.hpp"         // plssvm::detail::unreachable
 
-#include <ios>      // std::ios::failbit
-#include <istream>  // std::istream
-#include <ostream>  // std::ostream
-#include <string>   // std::string
+#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace, Kokkos macros, Kokkos ExecutionSpace types
+
+#include <ios>          // std::ios::failbit
+#include <istream>      // std::istream
+#include <ostream>      // std::ostream
+#include <string>       // std::string
+#include <type_traits>  // std::is_same_v
+#include <vector>       // std::vector
 
 namespace plssvm::kokkos {
 
@@ -57,11 +62,11 @@ std::istream &operator>>(std::istream &in, execution_space &space) {
         space = execution_space::hpx;
     } else if (str == "openmp") {
         space = execution_space::openmp;
-    } else if (str == "openmp_target") {
+    } else if (str == "openmp_target" || str == "openmptarget") {
         space = execution_space::openmp_target;
     } else if (str == "openacc") {
         space = execution_space::openacc;
-    } else if (str == "threads") {
+    } else if (str == "threads" || str == "std::threads") {
         space = execution_space::threads;
     } else if (str == "serial") {
         space = execution_space::serial;
@@ -71,4 +76,91 @@ std::istream &operator>>(std::istream &in, execution_space &space) {
     return in;
 }
 
+execution_space determine_execution_space() noexcept {
+    // determine the execution_space enumeration value based on the provided Kokkos execution space
+#if defined(KOKKOS_ENABLE_CUDA)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Cuda>) {
+        return execution_space::cuda;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::HIP>) {
+        return execution_space::hip;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::SYCL>) {
+        return execution_space::sycl;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Experimental::HPX>) {
+        return execution_space::hpx;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::OpenMP>) {
+        return execution_space::openmp;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::OpenMPTarget>) {
+        return execution_space::openmp_target;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Experimental::OpenACC>) {
+        return execution_space::openacc;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Threads>) {
+        return execution_space::threads;
+    }
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Serial>) {
+        return execution_space::serial;
+    }
+#endif
+    // at least one execution space must always be available!
+    ::plssvm::detail::unreachable();
+}
+
+[[nodiscard]] std::vector<execution_space> available_execution_spaces() {
+    std::vector<execution_space> available_spaces{};
+#if defined(KOKKOS_ENABLE_CUDA)
+    available_spaces.push_back(execution_space::cuda);
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    available_spaces.push_back(execution_space::hip);
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+    available_spaces.push_back(execution_space::sycl);
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+    available_spaces.push_back(execution_space::hpx);
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+    available_spaces.push_back(execution_space::openmp);
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    available_spaces.push_back(execution_space::openmp_target);
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+    available_spaces.push_back(execution_space::openacc);
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+    available_spaces.push_back(execution_space::threads);
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+    available_spaces.push_back(execution_space::serial);
+#endif
+
+    // AT LEAST ONE execution space must ALWAYS be available
+    PLSSVM_ASSERT(!available_spaces.empty(), "Aat least one execution space must always be available!");
+
+    return available_spaces;
+}
+
 }  // namespace plssvm::kokkos
diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index 1a4d3d089..7032ec2d8 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -14,6 +14,7 @@ set(PLSSVM_KOKKOS_TEST_SOURCES
 #    ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cu
     ${CMAKE_CURRENT_LIST_DIR}/kokkos_csvm.cpp
     ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/execution_space.cpp
 )
 
 find_package(Kokkos REQUIRED)
diff --git a/tests/backends/Kokkos/detail/device_ptr.cpp b/tests/backends/Kokkos/detail/device_ptr.cpp
index 797bfef78..34b83eefa 100644
--- a/tests/backends/Kokkos/detail/device_ptr.cpp
+++ b/tests/backends/Kokkos/detail/device_ptr.cpp
@@ -10,6 +10,8 @@
 
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"  // plssvm::kokkos::detail::device_ptr
 
+#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
+
 #include "tests/backends/generic_device_ptr_tests.hpp"  // generic device pointer tests to instantiate
 #include "tests/naming.hpp"                             // naming::test_parameter_to_name
 #include "tests/types_to_test.hpp"                      // util::{combine_test_parameters_gtest_t, cartesian_type_product_t, layout_type_list}
diff --git a/tests/backends/Kokkos/execution_space.cpp b/tests/backends/Kokkos/execution_space.cpp
new file mode 100644
index 000000000..5639f3802
--- /dev/null
+++ b/tests/backends/Kokkos/execution_space.cpp
@@ -0,0 +1,79 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for functions related to the different Kokkos execution spaces.
+ */
+
+#include "plssvm/backends/Kokkos/execution_space.hpp"
+
+#include "tests/custom_test_macros.hpp"  // EXPECT_CONVERSION_TO_STRING, EXPECT_CONVERSION_FROM_STRING
+
+#include "gtest/gtest-matchers.h"  // EXPECT_THAT; ::testing::AnyOf
+#include "gtest/gtest.h"           // TEST, EXPECT_TRUE
+
+#include <sstream>  // std::istringstream
+
+// check whether the plssvm::kokkos::execution_space -> std::string conversions are correct
+TEST(KokkosExecutionSpace, to_string) {
+    // check conversions to std::string
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::cuda, "Cuda");
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::hip, "HIP");
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::sycl, "SYCL");
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::hpx, "HPX");
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::openmp, "OpenMP");
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::openmp_target, "OpenMPTarget");
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::openacc, "OpenACC");
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::threads, "Threads");
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::serial, "Serial");
+}
+
+TEST(KokkosExecutionSpace, to_string_unknown) {
+    // check conversions to std::string from unknown execution_space
+    EXPECT_CONVERSION_TO_STRING(static_cast<plssvm::kokkos::execution_space>(9), "unknown");
+}
+
+// check whether the std::string -> plssvm::kokkos::execution_space conversions are correct
+TEST(KokkosExecutionSpace, from_string) {
+    // check conversion from std::string
+    EXPECT_CONVERSION_FROM_STRING("Cuda", plssvm::kokkos::execution_space::cuda);
+    EXPECT_CONVERSION_FROM_STRING("CUDA", plssvm::kokkos::execution_space::cuda);
+    EXPECT_CONVERSION_FROM_STRING("Hip", plssvm::kokkos::execution_space::hip);
+    EXPECT_CONVERSION_FROM_STRING("HIP", plssvm::kokkos::execution_space::hip);
+    EXPECT_CONVERSION_FROM_STRING("Sycl", plssvm::kokkos::execution_space::sycl);
+    EXPECT_CONVERSION_FROM_STRING("SYCL", plssvm::kokkos::execution_space::sycl);
+    EXPECT_CONVERSION_FROM_STRING("Hpx", plssvm::kokkos::execution_space::hpx);
+    EXPECT_CONVERSION_FROM_STRING("HPX", plssvm::kokkos::execution_space::hpx);
+    EXPECT_CONVERSION_FROM_STRING("OpenMP", plssvm::kokkos::execution_space::openmp);
+    EXPECT_CONVERSION_FROM_STRING("OPENMP", plssvm::kokkos::execution_space::openmp);
+    EXPECT_CONVERSION_FROM_STRING("OpenMP_Target", plssvm::kokkos::execution_space::openmp_target);
+    EXPECT_CONVERSION_FROM_STRING("OPENMPTARGET", plssvm::kokkos::execution_space::openmp_target);
+    EXPECT_CONVERSION_FROM_STRING("OpenACC", plssvm::kokkos::execution_space::openacc);
+    EXPECT_CONVERSION_FROM_STRING("OPENACC", plssvm::kokkos::execution_space::openacc);
+    EXPECT_CONVERSION_FROM_STRING("threads", plssvm::kokkos::execution_space::threads);
+    EXPECT_CONVERSION_FROM_STRING("THREADS", plssvm::kokkos::execution_space::threads);
+    EXPECT_CONVERSION_FROM_STRING("std::threads", plssvm::kokkos::execution_space::threads);
+    EXPECT_CONVERSION_FROM_STRING("Serial", plssvm::kokkos::execution_space::serial);
+    EXPECT_CONVERSION_FROM_STRING("SERIAL", plssvm::kokkos::execution_space::serial);
+}
+
+TEST(KokkosExecutionSpace, from_string_unknown) {
+    // foo isn't a valid execution_space
+    std::istringstream input{ "foo" };
+    plssvm::kokkos::execution_space space{};
+    input >> space;
+    EXPECT_TRUE(input.fail());
+}
+
+TEST(KokkosExecutionSpace, determine_execution_space) {
+    // check that "unreachable" is never reached
+    EXPECT_THAT(plssvm::kokkos::determine_execution_space(), ::testing::AnyOf(plssvm::kokkos::execution_space::cuda, plssvm::kokkos::execution_space::hip, plssvm::kokkos::execution_space::sycl, plssvm::kokkos::execution_space::hpx, plssvm::kokkos::execution_space::openmp, plssvm::kokkos::execution_space::openmp_target, plssvm::kokkos::execution_space::openacc, plssvm::kokkos::execution_space::threads, plssvm::kokkos::execution_space::serial));
+}
+
+TEST(KokkosExecutionSpace, available_execution_spaces) {
+    // at least one execution space must always be available
+    EXPECT_FALSE(plssvm::kokkos::available_execution_spaces().empty());
+}

From 4ed8baced441b762d554192ce87ff557531a6154 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 17:06:54 +0200
Subject: [PATCH 020/123] Add typedef tests.

---
 tests/backends/Kokkos/CMakeLists.txt      |  1 +
 tests/backends/Kokkos/detail/typedefs.cpp | 27 +++++++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 tests/backends/Kokkos/detail/typedefs.cpp

diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index 7032ec2d8..b8401c933 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -10,6 +10,7 @@ set(PLSSVM_KOKKOS_TEST_NAME Kokkos_tests)
 # list all necessary sources
 set(PLSSVM_KOKKOS_TEST_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/typedefs.cpp
 #    ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
 #    ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cu
     ${CMAKE_CURRENT_LIST_DIR}/kokkos_csvm.cpp
diff --git a/tests/backends/Kokkos/detail/typedefs.cpp b/tests/backends/Kokkos/detail/typedefs.cpp
new file mode 100644
index 000000000..4e25d4a6c
--- /dev/null
+++ b/tests/backends/Kokkos/detail/typedefs.cpp
@@ -0,0 +1,27 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the Kokkos::View typedefs.
+ */
+
+#include "plssvm/backends/Kokkos/detail/typedefs.hpp"  // plssvm::kokkos::detail::{device_view_type, host_view_type}
+
+#include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::DefaultExecutionSpace, Kokkos::HostSpace, Kokkos::MemoryUnmanaged
+
+#include "gtest/gtest.h"  // TEST, ::testing::StaticAssertTypeEq
+
+TEST(KokkosTypedefs, device_view_type) {
+    // test device view typedefs
+    ::testing::StaticAssertTypeEq<Kokkos::View<int *, Kokkos::DefaultExecutionSpace>, plssvm::kokkos::detail::device_view_type<int>>();
+    ::testing::StaticAssertTypeEq<Kokkos::View<const unsigned *, Kokkos::DefaultExecutionSpace>, plssvm::kokkos::detail::device_view_type<const unsigned>>();
+}
+
+TEST(KokkosTypedefs, host_view_type) {
+    // test host view typedefs
+    ::testing::StaticAssertTypeEq<Kokkos::View<double *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>, plssvm::kokkos::detail::host_view_type<double>>();
+    ::testing::StaticAssertTypeEq<Kokkos::View<const float *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>, plssvm::kokkos::detail::host_view_type<const float>>();
+}

From ee9705c83da8d16d0ded6821fb49011f957e7ada Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 17:17:19 +0200
Subject: [PATCH 021/123] Add standard layout tuple tests.

---
 .../Kokkos/detail/standard_layout_tuple.hpp   |  2 ++
 tests/backends/Kokkos/CMakeLists.txt          |  1 +
 .../Kokkos/detail/standard_layout_tuple.cpp   | 33 +++++++++++++++++++
 3 files changed, 36 insertions(+)
 create mode 100644 tests/backends/Kokkos/detail/standard_layout_tuple.cpp

diff --git a/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp b/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp
index 3f5fddddd..5b26f5e98 100644
--- a/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp
+++ b/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp
@@ -15,6 +15,8 @@
 
 #include "plssvm/constants.hpp"  // plssvm::real_type
 
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION
+
 #include <cstddef>      // std::size_t
 #include <type_traits>  // std::is_standard_layout
 #include <utility>      // std::forward
diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index b8401c933..c17034811 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -11,6 +11,7 @@ set(PLSSVM_KOKKOS_TEST_NAME Kokkos_tests)
 set(PLSSVM_KOKKOS_TEST_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/typedefs.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/standard_layout_tuple.cpp
 #    ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
 #    ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cu
     ${CMAKE_CURRENT_LIST_DIR}/kokkos_csvm.cpp
diff --git a/tests/backends/Kokkos/detail/standard_layout_tuple.cpp b/tests/backends/Kokkos/detail/standard_layout_tuple.cpp
new file mode 100644
index 000000000..7b4fb6cd8
--- /dev/null
+++ b/tests/backends/Kokkos/detail/standard_layout_tuple.cpp
@@ -0,0 +1,33 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the custom standard layout tuple implementation necessary for Kokkos.
+ */
+
+#include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp"  // plssvm::kokkos::detail::{standard_layout_tuple, make_standard_layout_tuple, get}
+
+#include "gtest/gtest.h"  // TEST, EXPECT_EQ, testing::StaticAssertTypeEq
+
+#include <type_traits>  // std::remove_const_t
+
+TEST(KokkosStandardLayoutTuple, make_standard_layout_tuple) {
+    // create a new standard layout tuple
+    [[maybe_unused]] const auto tuple = plssvm::kokkos::detail::make_standard_layout_tuple(true, 42, 3.1415);
+
+    // check the tuple type
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::detail::standard_layout_tuple<bool, int, double>, std::remove_const_t<decltype(tuple)>>();
+}
+
+TEST(KokkosStandardLayoutTuple, get) {
+    // create a new standard layout tuple
+    const auto tuple = plssvm::kokkos::detail::make_standard_layout_tuple(true, 42, 3.1415);
+
+    // check getter functions
+    EXPECT_EQ(plssvm::kokkos::detail::get<0>(tuple), true);
+    EXPECT_EQ(plssvm::kokkos::detail::get<1>(tuple), 42);
+    EXPECT_EQ(plssvm::kokkos::detail::get<2>(tuple), 3.1415);
+}

From 8a90949058de87b85b385cbfb74bdb36cd3d31a8 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 17:23:11 +0200
Subject: [PATCH 022/123] Add stub tests.

---
 .../backends/Kokkos/detail/pinned_memory.cpp  |  2 +
 tests/backends/Kokkos/CMakeLists.txt          |  4 +-
 .../backends/Kokkos/detail/pinned_memory.cpp  | 39 +++++++++++++++++++
 tests/backends/Kokkos/detail/utility.cpp      | 19 +++++++++
 4 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 tests/backends/Kokkos/detail/pinned_memory.cpp
 create mode 100644 tests/backends/Kokkos/detail/utility.cpp

diff --git a/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp b/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp
index 919cbdaa1..dfae19661 100644
--- a/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp
+++ b/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp
@@ -40,6 +40,8 @@ pinned_memory<T>::~pinned_memory() {
     }
 }
 
+// TODO: check if implementable via Kokkos?
+
 template class pinned_memory<float>;
 template class pinned_memory<double>;
 
diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index c17034811..f6925207f 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -12,8 +12,8 @@ set(PLSSVM_KOKKOS_TEST_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/typedefs.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/standard_layout_tuple.cpp
-#    ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
-#    ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cu
+    ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp
     ${CMAKE_CURRENT_LIST_DIR}/kokkos_csvm.cpp
     ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp
     ${CMAKE_CURRENT_LIST_DIR}/execution_space.cpp
diff --git a/tests/backends/Kokkos/detail/pinned_memory.cpp b/tests/backends/Kokkos/detail/pinned_memory.cpp
new file mode 100644
index 000000000..aa91612d7
--- /dev/null
+++ b/tests/backends/Kokkos/detail/pinned_memory.cpp
@@ -0,0 +1,39 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the Kokkos backend pinned memory.
+ */
+
+#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp"  // plssvm::kokkos::detail::pinned_memory
+
+#include "tests/backends/generic_pinned_memory_tests.hpp"  // generic pinned memory tests to instantiate
+#include "tests/naming.hpp"                                // naming::test_parameter_to_name
+#include "tests/types_to_test.hpp"                         // util::{combine_test_parameters_gtest_t, cartesian_type_product_t, layout_type_list}
+
+#include "gtest/gtest.h"  // INSTANTIATE_TYPED_TEST_SUITE_P
+
+#include <tuple>  // std::tuple
+
+template <typename T>
+struct kokkos_pinned_memory_test_type {
+    using pinned_memory_type = plssvm::kokkos::detail::pinned_memory<T>;
+
+    constexpr static bool can_pin = false;  // TODO: try implementing in Kokkos?
+};
+
+using kokkos_pinned_memory_tuple = std::tuple<kokkos_pinned_memory_test_type<float>, kokkos_pinned_memory_test_type<double>>;
+
+// the tests used in the instantiated GTest test suites
+using kokkos_pinned_memory_type_gtest = util::combine_test_parameters_gtest_t<util::cartesian_type_product_t<kokkos_pinned_memory_tuple>>;
+using kokkos_pinned_memory_layout_type_gtest = util::combine_test_parameters_gtest_t<util::cartesian_type_product_t<kokkos_pinned_memory_tuple>, util::layout_type_list>;
+
+// instantiate type-parameterized tests
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosPinnedMemory, PinnedMemory, kokkos_pinned_memory_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosPinnedMemory, PinnedMemoryLayout, kokkos_pinned_memory_layout_type_gtest, naming::test_parameter_to_name);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosPinnedMemoryDeathTest, PinnedMemoryDeathTest, kokkos_pinned_memory_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosPinnedMemoryDeathTest, PinnedMemoryLayoutDeathTest, kokkos_pinned_memory_layout_type_gtest, naming::test_parameter_to_name);
diff --git a/tests/backends/Kokkos/detail/utility.cpp b/tests/backends/Kokkos/detail/utility.cpp
new file mode 100644
index 000000000..26c4b1b56
--- /dev/null
+++ b/tests/backends/Kokkos/detail/utility.cpp
@@ -0,0 +1,19 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the custom utility functions related to the Kokkos backend.
+ */
+
+#include "plssvm/backends/Kokkos/detail/utility.hpp"
+
+#include "tests/custom_test_macros.hpp"  // EXPECT_THROW_WHAT, EXPECT_THROW_WHAT_MATCHER
+
+#include "fmt/format.h"   // fmt::format
+#include "gmock/gmock.h"  // ::testing::StartsWith
+#include "gtest/gtest.h"  // TEST, EXPECT_GE, EXPECT_NO_THROW
+
+#include <regex>  // std::regex, std::regex::extended, std::regex_match

From fc021ae44eb1d6f60f4f3e99b2eeabd90d60bacb Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 18:09:00 +0200
Subject: [PATCH 023/123] Rename function.

---
 include/plssvm/backends/Kokkos/execution_space.hpp | 2 +-
 src/plssvm/backends/Kokkos/csvm.cpp                | 2 +-
 src/plssvm/backends/Kokkos/execution_space.cpp     | 2 +-
 tests/backends/Kokkos/execution_space.cpp          | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
index 6d9d84e3f..fa1236d70 100644
--- a/include/plssvm/backends/Kokkos/execution_space.hpp
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -49,7 +49,7 @@ enum class execution_space {
  * @brief Create an `execution_space` from the current `Kokkos::DefaultExecutionSpace`.
  * @return the enum value representing the current `Kokkos::DefaultExecutionSpace` (`[[nodiscard]]`)
  */
-[[nodiscard]] execution_space determine_execution_space() noexcept;
+[[nodiscard]] execution_space determine_default_execution_space() noexcept;
 
 /**
  * @brief List all available Kokkos::ExecutionSpaces.
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 7d24b2a8b..1f4b0d8d5 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -49,7 +49,7 @@ csvm::csvm(parameter params) :
 
 csvm::csvm(target_platform target, parameter params) :
     base_type{ params },
-    space_{ determine_execution_space() } {
+    space_{ determine_default_execution_space() } {
     this->init(target);
 }
 
diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp
index 06a4c351e..2f3472aa8 100644
--- a/src/plssvm/backends/Kokkos/execution_space.cpp
+++ b/src/plssvm/backends/Kokkos/execution_space.cpp
@@ -76,7 +76,7 @@ std::istream &operator>>(std::istream &in, execution_space &space) {
     return in;
 }
 
-execution_space determine_execution_space() noexcept {
+execution_space determine_default_execution_space() noexcept {
     // determine the execution_space enumeration value based on the provided Kokkos execution space
 #if defined(KOKKOS_ENABLE_CUDA)
     if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Cuda>) {
diff --git a/tests/backends/Kokkos/execution_space.cpp b/tests/backends/Kokkos/execution_space.cpp
index 5639f3802..c0cec6f45 100644
--- a/tests/backends/Kokkos/execution_space.cpp
+++ b/tests/backends/Kokkos/execution_space.cpp
@@ -12,8 +12,8 @@
 
 #include "tests/custom_test_macros.hpp"  // EXPECT_CONVERSION_TO_STRING, EXPECT_CONVERSION_FROM_STRING
 
-#include "gtest/gtest-matchers.h"  // EXPECT_THAT; ::testing::AnyOf
-#include "gtest/gtest.h"           // TEST, EXPECT_TRUE
+#include "gmock/gmock.h"  // EXPECT_THAT; ::testing::AnyOf
+#include "gtest/gtest.h"  // TEST, EXPECT_TRUE
 
 #include <sstream>  // std::istringstream
 
@@ -70,7 +70,7 @@ TEST(KokkosExecutionSpace, from_string_unknown) {
 
 TEST(KokkosExecutionSpace, determine_execution_space) {
     // check that "unreachable" is never reached
-    EXPECT_THAT(plssvm::kokkos::determine_execution_space(), ::testing::AnyOf(plssvm::kokkos::execution_space::cuda, plssvm::kokkos::execution_space::hip, plssvm::kokkos::execution_space::sycl, plssvm::kokkos::execution_space::hpx, plssvm::kokkos::execution_space::openmp, plssvm::kokkos::execution_space::openmp_target, plssvm::kokkos::execution_space::openacc, plssvm::kokkos::execution_space::threads, plssvm::kokkos::execution_space::serial));
+    EXPECT_THAT(plssvm::kokkos::determine_default_execution_space(), ::testing::AnyOf(plssvm::kokkos::execution_space::cuda, plssvm::kokkos::execution_space::hip, plssvm::kokkos::execution_space::sycl, plssvm::kokkos::execution_space::hpx, plssvm::kokkos::execution_space::openmp, plssvm::kokkos::execution_space::openmp_target, plssvm::kokkos::execution_space::openacc, plssvm::kokkos::execution_space::threads, plssvm::kokkos::execution_space::serial));
 }
 
 TEST(KokkosExecutionSpace, available_execution_spaces) {

From a5ee4ef2933487e935c3a1a7a12b50fad8bc3936 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 18:09:21 +0200
Subject: [PATCH 024/123] Add TODOs and first utility tests.

---
 src/plssvm/backends/Kokkos/detail/utility.cpp |   4 +-
 tests/backends/Kokkos/detail/utility.cpp      | 105 +++++++++++++++++-
 2 files changed, 102 insertions(+), 7 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index b7d58be1d..ac49ef532 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -30,7 +30,7 @@ target_platform determine_default_target_platform_from_execution_space(const exe
         case execution_space::cuda:
             return target_platform::gpu_nvidia;
         case execution_space::hip:
-            return target_platform::gpu_amd;
+            return target_platform::gpu_amd;  // TODO: or gpu_nvidia :/
         case execution_space::sycl:
         case execution_space::openmp_target:
         case execution_space::openacc:
@@ -56,7 +56,7 @@ void check_execution_space_target_platform_combination(const execution_space spa
             }
             break;
         case execution_space::hip:
-            if (target != target_platform::gpu_amd) {
+            if (target != target_platform::gpu_amd && target != target_platform::gpu_nvidia) {
                 throw backend_exception{ fmt::format("The target platform {} is not supported for Kokkos {} execution space!", target, space) };
             }
             break;
diff --git a/tests/backends/Kokkos/detail/utility.cpp b/tests/backends/Kokkos/detail/utility.cpp
index 26c4b1b56..ab49f1034 100644
--- a/tests/backends/Kokkos/detail/utility.cpp
+++ b/tests/backends/Kokkos/detail/utility.cpp
@@ -10,10 +10,105 @@
 
 #include "plssvm/backends/Kokkos/detail/utility.hpp"
 
-#include "tests/custom_test_macros.hpp"  // EXPECT_THROW_WHAT, EXPECT_THROW_WHAT_MATCHER
+#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
+#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
 
-#include "fmt/format.h"   // fmt::format
-#include "gmock/gmock.h"  // ::testing::StartsWith
-#include "gtest/gtest.h"  // TEST, EXPECT_GE, EXPECT_NO_THROW
+#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
 
-#include <regex>  // std::regex, std::regex::extended, std::regex_match
+#include "tests/custom_test_macros.hpp"  // EXPECT_THROW_WHAT
+
+#include "fmt/core.h"     // fmt::format
+#include "gmock/gmock.h"  // EXPECT_THAT; ::testing::AnyOf
+#include "gtest/gtest.h"  // TEST, EXPECT_GE, EXPECT_NE
+
+#include <regex>   // std::regex, std::regex::extended, std::regex_match
+#include <string>  // std::string
+#include <vector>  // std::vector
+
+TEST(KokkosUtility, determine_default_target_platform_from_execution_space) {
+    // determine the potential default target platform
+    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::cuda), plssvm::target_platform::gpu_nvidia);
+    EXPECT_THAT(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::hip), ::testing::AnyOf(plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd));
+    EXPECT_NE(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::sycl), plssvm::target_platform::automatic);
+    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::hpx), plssvm::target_platform::cpu);
+    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::openmp), plssvm::target_platform::cpu);
+    EXPECT_NE(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::openmp_target), plssvm::target_platform::automatic);
+    EXPECT_NE(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::openacc), plssvm::target_platform::automatic);
+    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::threads), plssvm::target_platform::cpu);
+    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::serial), plssvm::target_platform::cpu);
+}
+
+TEST(KokkosUtility, check_execution_space_target_platform_combination) {
+    // check some execution_space <-> target_platform combinations
+    // the cuda execution space only supports the NVIDIA GPU
+    EXPECT_NO_THROW(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::cuda, plssvm::target_platform::gpu_nvidia));
+    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::cuda, plssvm::target_platform::gpu_amd),
+                      plssvm::kokkos::backend_exception,
+                      "The target platform gpu_amd is not supported for Kokkos Cuda execution space!");
+    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::cuda, plssvm::target_platform::gpu_intel),
+                      plssvm::kokkos::backend_exception,
+                      "The target platform gpu_intel is not supported for Kokkos Cuda execution space!");
+    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::cuda, plssvm::target_platform::cpu),
+                      plssvm::kokkos::backend_exception,
+                      "The target platform cpu is not supported for Kokkos Cuda execution space!");
+
+    // the hip execution space only supports the NVIDIA and AMD GPUs
+    EXPECT_NO_THROW(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::hip, plssvm::target_platform::gpu_nvidia));
+    EXPECT_NO_THROW(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::hip, plssvm::target_platform::gpu_amd));
+    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::hip, plssvm::target_platform::gpu_intel),
+                      plssvm::kokkos::backend_exception,
+                      "The target platform gpu_intel is not supported for Kokkos HIP execution space!");
+    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::hip, plssvm::target_platform::cpu),
+                      plssvm::kokkos::backend_exception,
+                      "The target platform cpu is not supported for Kokkos HIP execution space!");
+
+    // TODO: SYCL
+    // TODO: OpenMP target
+    // TODO: OpenACC
+
+    // the remaining execution spaces all only support CPUs!
+    for (const plssvm::kokkos::execution_space exec : { plssvm::kokkos::execution_space::hpx, plssvm::kokkos::execution_space::openmp, plssvm::kokkos::execution_space::threads, plssvm::kokkos::execution_space::serial }) {
+        EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(exec, plssvm::target_platform::gpu_nvidia),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("The target platform gpu_nvidia is not supported for Kokkos {} execution space!", exec));
+        EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(exec, plssvm::target_platform::gpu_amd),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("The target platform gpu_amd is not supported for Kokkos {} execution space!", exec));
+        EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(exec, plssvm::target_platform::gpu_intel),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("The target platform gpu_intel is not supported for Kokkos {} execution space!", exec));
+        EXPECT_NO_THROW(plssvm::kokkos::detail::check_execution_space_target_platform_combination(exec, plssvm::target_platform::cpu));
+    }
+}
+
+TEST(KokkosUtility, get_device_list) {
+    // get the default device list
+    const plssvm::kokkos::execution_space space = plssvm::kokkos::determine_default_execution_space();
+    const plssvm::target_platform target = plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(space);
+    const std::vector<Kokkos::DefaultExecutionSpace> devices = plssvm::kokkos::detail::get_device_list(space, target);
+
+    // check the number of returned devices
+    if (space == plssvm::kokkos::execution_space::cuda || space == plssvm::kokkos::execution_space::hip || space == plssvm::kokkos::execution_space::sycl) {
+        // for the device execution spaces AT LEAST ONE device must be found
+        EXPECT_GE(devices.size(), 1);
+    } else {
+        // for all other execution spaces EXACTLY ONE device must be found
+        EXPECT_EQ(devices.size(), 1);
+    }
+}
+
+TEST(KokkosUtility, get_device_name) {
+    // get the device name of the default Kokkos execution space
+    const plssvm::kokkos::execution_space space = plssvm::kokkos::determine_default_execution_space();
+    const std::string name = plssvm::kokkos::detail::get_device_name(space, Kokkos::DefaultExecutionSpace{});
+
+    // the returned device name may not be empty or unknown
+    EXPECT_FALSE(name.empty());
+    EXPECT_NE(name, std::string{ "unknown" });
+}
+
+TEST(KokkosUtility, get_kokkos_version) {
+    const std::regex reg{ "[0-9]+\\.[0-9]+\\.[0-9]+", std::regex::extended };
+    EXPECT_TRUE(std::regex_match(plssvm::kokkos::detail::get_kokkos_version(), reg));
+}

From 01f27938a7a46bb2fa0ffa11ba5c64e966fcf9ce Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 22 Oct 2024 18:17:41 +0200
Subject: [PATCH 025/123] Reduce code duplication be moving similar code to a
 common header file.

---
 tests/kokkos_main.cpp | 31 +++-------------------------
 tests/main.cpp        | 27 ++-----------------------
 tests/main.hpp        | 47 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 53 deletions(-)
 create mode 100644 tests/main.hpp

diff --git a/tests/kokkos_main.cpp b/tests/kokkos_main.cpp
index e53409bd4..1edfbb9fe 100644
--- a/tests/kokkos_main.cpp
+++ b/tests/kokkos_main.cpp
@@ -9,36 +9,11 @@
  * @brief Contains the googletest main function. Sets the DeathTest to "threadsafe" execution instead of "fast".
  */
 
-#include "gtest/gtest.h"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST, RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG}
-
-#include "Kokkos_Core.hpp"  // TODO:
-
-// TODO: reduce copy-paste
-
-// silence GTest warnings/test errors
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVM);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunction);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolver);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunction);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionClassification);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionClassification);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionDeathTest);
+#include "Kokkos_Core.hpp"  // Kokkos::initialize, Kokkos::finalize
 
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVM);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMKernelFunction);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMDeathTest);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtr);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrLayout);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
+#include "gtest/gtest.h"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST, RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG}
 
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
+#include "main.hpp"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST definitions
 
 int main(int argc, char **argv) {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/tests/main.cpp b/tests/main.cpp
index d27eddd7d..944ad9318 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -9,32 +9,9 @@
  * @brief Contains the googletest main function. Sets the DeathTest to "threadsafe" execution instead of "fast".
  */
 
-#include "gtest/gtest.h"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST, RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG}
+#include "main.hpp"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST definitions
 
-// silence GTest warnings/test errors
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVM);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunction);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolver);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunction);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionClassification);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionClassification);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionDeathTest);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVM);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMKernelFunction);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMDeathTest);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtr);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrLayout);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
-
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
+#include "gtest/gtest.h"  // RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG}
 
 int main(int argc, char **argv) {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/tests/main.hpp b/tests/main.hpp
new file mode 100644
index 000000000..ddb4ea590
--- /dev/null
+++ b/tests/main.hpp
@@ -0,0 +1,47 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Header file for the GoogleTest main files to reduce code duplication.
+ */
+
+#ifndef PLSSVM_TESTS_MAIN_HPP_
+#define PLSSVM_TESTS_MAIN_HPP_
+#pragma once
+
+#include "gtest/gtest.h"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST
+
+// silence GTest warnings/test errors
+
+// generic CSVM tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVM);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolver);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionClassification);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionClassification);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionDeathTest);
+// generic GPU CSVM tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVM);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMDeathTest);
+// pinned memory tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemory);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryLayout);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryLayoutDeathTest);
+// device pointer tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtr);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrLayout);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
+// exception tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
+
+#endif  // PLSSVM_TESTS_MAIN_HPP_

From fd6473ebd7218eb1a49917e35f2c2cb42eaed56e Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 28 Oct 2024 15:12:53 +0100
Subject: [PATCH 026/123] Support switching between different
 Kokkos::ExecutionSpaces at runtime. plssvm::kokkos::csvm now correctly honors
 the provided target platform.

---
 include/plssvm/backends/Kokkos/csvm.hpp       |  32 +-
 .../backends/Kokkos/detail/device_ptr.hpp     |  27 +-
 .../Kokkos/detail/device_view_wrapper.hpp     | 230 ++++++++
 .../backends/Kokkos/detail/device_wrapper.hpp | 197 +++++++
 .../backends/Kokkos/detail/typedefs.hpp       |  36 --
 .../plssvm/backends/Kokkos/detail/utility.hpp |  76 ++-
 .../backends/Kokkos/execution_space.hpp       | 286 +++++++++-
 .../Kokkos/kernel/cg_explicit/blas.hpp        |  65 ++-
 .../cg_explicit/kernel_matrix_assembly.hpp    |  32 +-
 .../kernel_matrix_assembly_blas.hpp           |  19 +-
 .../Kokkos/kernel/detail/memset_kernel.hpp    |  56 ++
 .../Kokkos/kernel/kernel_functions.hpp        |  20 +-
 .../backends/Kokkos/kernel/predict_kernel.hpp |  51 +-
 src/plssvm/backends/Kokkos/CMakeLists.txt     |   1 +
 src/plssvm/backends/Kokkos/csvm.cpp           | 492 ++++++++++--------
 .../backends/Kokkos/detail/device_ptr.cpp     | 115 ++--
 .../backends/Kokkos/detail/device_wrapper.cpp | 106 ++++
 src/plssvm/backends/Kokkos/detail/utility.cpp | 182 +++----
 .../backends/Kokkos/execution_space.cpp       | 104 +---
 tests/backends/Kokkos/CMakeLists.txt          |   3 +-
 tests/backends/Kokkos/detail/device_ptr.cpp   |   6 +-
 .../Kokkos/detail/device_view_wrapper.cpp     |  77 +++
 .../backends/Kokkos/detail/device_wrapper.cpp | 115 ++++
 tests/backends/Kokkos/detail/typedefs.cpp     |  27 -
 tests/backends/Kokkos/detail/utility.cpp      | 125 ++---
 tests/backends/Kokkos/execution_space.cpp     |  72 ++-
 tests/backends/generic_csvm_tests.hpp         |   5 +-
 tests/backends/generic_gpu_csvm_tests.hpp     |   2 +-
 tests/utility.hpp                             |  20 +-
 29 files changed, 1839 insertions(+), 740 deletions(-)
 create mode 100644 include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
 create mode 100644 include/plssvm/backends/Kokkos/detail/device_wrapper.hpp
 delete mode 100644 include/plssvm/backends/Kokkos/detail/typedefs.hpp
 create mode 100644 include/plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp
 create mode 100644 src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
 create mode 100644 tests/backends/Kokkos/detail/device_view_wrapper.cpp
 create mode 100644 tests/backends/Kokkos/detail/device_wrapper.cpp
 delete mode 100644 tests/backends/Kokkos/detail/typedefs.cpp

diff --git a/include/plssvm/backends/Kokkos/csvm.hpp b/include/plssvm/backends/Kokkos/csvm.hpp
index 859a9f43b..d8dcfaab8 100644
--- a/include/plssvm/backends/Kokkos/csvm.hpp
+++ b/include/plssvm/backends/Kokkos/csvm.hpp
@@ -13,19 +13,18 @@
 #define PLSSVM_BACKENDS_KOKKOS_CSVM_HPP_
 #pragma once
 
-#include "plssvm/backends/execution_range.hpp"              // plssvm::detail::{dim_type, execution_range}
-#include "plssvm/backends/gpu_csvm.hpp"                     // plssvm::detail::gpu_csvm
-#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"     // plssvm::kokkos::detail::device_ptr
-#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp"  // plssvm::kokkos::detail::pinned_memory
-#include "plssvm/backends/Kokkos/execution_space.hpp"       // plssvm::kokkos::execution_space
-#include "plssvm/constants.hpp"                             // plssvm::real_type
-#include "plssvm/csvm.hpp"                                  // plssvm::detail::csvm_backend_exists
-#include "plssvm/detail/memory_size.hpp"                    // plssvm::detail::memory_size
-#include "plssvm/detail/type_traits.hpp"                    // PLSSVM_REQUIRES
-#include "plssvm/parameter.hpp"                             // plssvm::parameter, plssvm::detail::parameter
-#include "plssvm/target_platforms.hpp"                      // plssvm::target_platform
-
-#include "Kokkos_Core_fwd.hpp"  // Kokkos::DefaultExecutionSpace
+#include "plssvm/backends/execution_range.hpp"               // plssvm::detail::{dim_type, execution_range}
+#include "plssvm/backends/gpu_csvm.hpp"                      // plssvm::detail::gpu_csvm
+#include "plssvm/backends/Kokkos/detail/device_ptr.hpp"      // plssvm::kokkos::detail::device_ptr
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp"   // plssvm::kokkos::detail::pinned_memory
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
+#include "plssvm/constants.hpp"                              // plssvm::real_type
+#include "plssvm/csvm.hpp"                                   // plssvm::detail::csvm_backend_exists
+#include "plssvm/detail/memory_size.hpp"                     // plssvm::detail::memory_size
+#include "plssvm/detail/type_traits.hpp"                     // PLSSVM_REQUIRES
+#include "plssvm/parameter.hpp"                              // plssvm::parameter, plssvm::detail::parameter
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
 
 #include <cstddef>      // std::size_t
 #include <type_traits>  // std::true_type
@@ -38,13 +37,12 @@ namespace kokkos {
 
 /**
  * @brief A C-SVM implementation using Kokkos as backend.
- * @details Internally, we always only use the `Kokkos::DefaultExecutionSpace`.
  */
-class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, Kokkos::DefaultExecutionSpace, detail::pinned_memory> {
+class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, detail::device_wrapper, detail::pinned_memory> {
   protected:
     // protected for the test mock class
     /// The template base type of the Kokkos C-SVM class.
-    using base_type = ::plssvm::detail::gpu_csvm<detail::device_ptr, Kokkos::DefaultExecutionSpace, detail::pinned_memory>;
+    using base_type = ::plssvm::detail::gpu_csvm<detail::device_ptr, detail::device_wrapper, detail::pinned_memory>;
 
     using base_type::data_distribution_;
     using base_type::devices_;
@@ -120,7 +118,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, Kokkos::Defau
     ~csvm() override;
 
     /**
-     * @brief Return the currently used `execution_space` determined using `Kokkos::ExecutionSpace`.
+     * @brief Return the currently used Kokkos `execution_space`.
      * @return the execution space (`[[nodiscard]]`)
      */
     [[nodiscard]] execution_space get_execution_space() const noexcept { return space_; }
diff --git a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
index 98194bcfb..ad067d00b 100644
--- a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp
@@ -6,18 +6,17 @@
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  *          See the LICENSE.md file in the project root for full license information.
  *
- * @brief Small wrapper around a Kokkos view.
+ * @brief Small wrapper around a Kokkos::View.
  */
 
 #ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_
 #pragma once
 
-#include "plssvm/backends/gpu_device_ptr.hpp"          // plssvm::detail::gpu_device_ptr
-#include "plssvm/backends/Kokkos/detail/typedefs.hpp"  // plssvm::kokkos::detail::device_view_type
-#include "plssvm/shape.hpp"                            // plssvm::shape
-
-#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
+#include "plssvm/backends/gpu_device_ptr.hpp"                     // plssvm::detail::gpu_device_ptr
+#include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp"  // plssvm::kokkos::detail::device_view_wrapper
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"       // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/shape.hpp"                                       // plssvm::shape
 
 #include <cstddef>  // std::size_t
 
@@ -28,9 +27,9 @@ namespace plssvm::kokkos::detail {
  * @tparam T the type of the kernel view to wrap
  */
 template <typename T>
-class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, Kokkos::DefaultExecutionSpace, device_view_type<T>, device_ptr<T>> {
+class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, device_wrapper, device_view_wrapper<T *>, device_ptr<T>> {
     /// The template base type of the Kokkos device_ptr class.
-    using base_type = ::plssvm::detail::gpu_device_ptr<T, Kokkos::DefaultExecutionSpace, device_view_type<T>, device_ptr<T>>;
+    using base_type = ::plssvm::detail::gpu_device_ptr<T, device_wrapper, device_view_wrapper<T *>, device_ptr<T>>;
 
     using base_type::data_;
     using base_type::queue_;
@@ -60,22 +59,22 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr<T, Kokkos::DefaultExe
     /**
      * @brief Allocates `size * sizeof(T)` bytes in the Kokkos execution space @p exec.
      * @param[in] size the number of elements represented by the device_ptr
-     * @param[in] exec the associated Kokkos execution space
+     * @param[in] device the device wrapper
      */
-    explicit device_ptr(size_type size, const Kokkos::DefaultExecutionSpace &exec);
+    explicit device_ptr(size_type size, const device_wrapper &device);
     /**
      * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes in the Kokkos execution space @p exec.
      * @param[in] shape the number of elements represented by the device_ptr
-     * @param[in] exec the associated Kokkos execution space
+     * @param[in] device the device wrapper
      */
-    explicit device_ptr(plssvm::shape shape, const Kokkos::DefaultExecutionSpace &exec);
+    explicit device_ptr(plssvm::shape shape, const device_wrapper &device);
     /**
      * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes in the Kokkos execution space @p exec.
      * @param[in] shape the number of elements represented by the device_ptr
      * @param[in] padding the number of padding elements added to the extent values
-     * @param[in] exec the associated Kokkos execution space
+     * @param[in] device the device wrapper
      */
-    device_ptr(plssvm::shape shape, plssvm::shape padding, const Kokkos::DefaultExecutionSpace &exec);
+    device_ptr(plssvm::shape shape, plssvm::shape padding, const device_wrapper &device);
 
     /**
      * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &)
diff --git a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
new file mode 100644
index 000000000..1baddcec6
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
@@ -0,0 +1,230 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief A wrapper around a Kokkos::View.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_
+
+#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
+#include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::{execution_space, execution_space_to_kokkos_type_t}, plssvm::kokkos::detail::constexpr_available_execution_spaces
+#include "plssvm/detail/utility.hpp"                                // plssvm::detail::unreachable
+
+#include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::ExecutionSpace
+
+#include <array>       // std::array
+#include <cstddef>     // std::size_t
+#include <functional>  // std::invoke
+#include <utility>     // std::make_index_sequence, std::index_sequence, std::move
+#include <variant>     // std::variant, std::get, std::visit
+
+namespace plssvm::kokkos::detail {
+
+namespace impl {
+
+/**
+ * @brief Uninstantiated base type to create a `std::variant` containing all available Kokkos::View types.
+ */
+template <typename, typename>
+struct create_view_variant_type_helper;
+
+/**
+ * @brief Helper struct to create a `std::variant` containing all available Kokkos::View types by iterating over the `std::array` of
+ *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
+ * @tparam T the value type of the underlying Kokkos::View
+ * @tparam Is the indices to index the `std::array`
+ */
+template <typename T, std::size_t... Is>
+struct create_view_variant_type_helper<T, std::index_sequence<Is...>> {
+    /// The array containing all available execution spaces.
+    constexpr static auto array = detail::constexpr_available_execution_spaces();
+    /// The resulting variant type.
+    using type = std::variant<Kokkos::View<T, execution_space_to_kokkos_type_t<array[Is]>>...>;
+};
+
+/**
+ * @brief Create a `std::variant` containing all available Kokkos::View types by iterating over the `std::array` of
+ *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
+ * @tparam T the value type of the underlying Kokkos::View
+ */
+template <typename T>
+struct create_view_variant_type {
+    /// The number of types in the final variant.
+    constexpr static std::size_t N = detail::constexpr_available_execution_spaces().size();
+    /// The final variant type.
+    using type = typename create_view_variant_type_helper<T, std::make_index_sequence<N>>::type;
+};
+
+}  // namespace impl
+
+/**
+ * @brief A wrapper class around a `std::variant` that contains all available Kokkos::View types.
+ * @tparam T the value type of the underlying Kokkos::View
+ */
+template <typename T>
+class device_view_wrapper {
+  public:
+    /// The `std::variant` type containing all Kokkos::View types.
+    using variant_type = typename impl::create_view_variant_type<T>::type;
+
+    /**
+     * @brief Default construct the `std::variant` wrapper.
+     */
+    device_view_wrapper() = default;
+
+    /**
+     * @brief Construct the wrapper using the provided Kokkos::View instance by forwarding its value to the underlying `std::variant`.
+     * @tparam ExecutionSpace the used Kokkos::ExecutionSpace type of the Kokkos::View
+     * @param[in] view the Kokkos::View instance
+     */
+    template <typename ExecutionSpace>
+    explicit device_view_wrapper(Kokkos::View<T, ExecutionSpace> &&view) :
+        v_{ std::move(view) } { }
+
+    /**
+     * @brief Given the provided `execution_space` enum value, tries to get the `std::variant` alternative for the corresponding Kokkos::ExecutionSpace type.
+     * @tparam space the `execution_space` enum value
+     * @return the Kokkos::View instance (`[[nodiscard]]`)
+     */
+    template <execution_space space>
+    [[nodiscard]] Kokkos::View<T, execution_space_to_kokkos_type_t<space>> &get() {
+        return std::get<Kokkos::View<T, execution_space_to_kokkos_type_t<space>>>(v_);
+    }
+
+    /**
+     * @copydoc plssvm::kokkos::detail::device_view_wrapper::get
+     */
+    template <execution_space space>
+    [[nodiscard]] const Kokkos::View<T, execution_space_to_kokkos_type_t<space>> &get() const {
+        return std::get<Kokkos::View<T, execution_space_to_kokkos_type_t<space>>>(v_);
+    }
+
+    /**
+     * @brief Return the `execution_space` enum value of the currently active `std::variant` Kokkos::View type.
+     * @return the `execution_space` enum value (`[[nodiscard]]`)
+     */
+    [[nodiscard]] execution_space get_execution_space() const noexcept {
+        return detail::constexpr_available_execution_spaces()[v_.index()];
+    }
+
+    /**
+     * @brief Invoke the function @p func on the active `std::variant` member using `std::visit` internally.
+     * @tparam Func the type of the function
+     * @param[in] func the function to invoke
+     */
+    template <typename Func>
+    void execute(const Func &func) {
+        // clang-format off
+        std::visit([&func](auto &view) {
+            std::invoke(func, view);
+        }, v_);
+        // clang-format on
+    }
+
+    /**
+     * @copydoc plssvm::kokkos::detail::device_view_wrapper::execute
+     */
+    template <typename Func>
+    void execute(const Func &func) const {
+        // clang-format off
+        std::visit([&func](const auto &view) {
+            std::invoke(func, view);
+        }, v_);
+        // clang-format on
+    }
+
+    /**
+     * @brief Compare two device view wrappers for equality by comparing the wrapped `std::variant`s.
+     * @param[in] lhs the first device view wrapper
+     * @param[in] rhs the second device view wrapper
+     * @return `true` if both underlying `std::variant`s are equal, otherwise `false` (`[[nodiscard]]`)
+     */
+    [[nodiscard]] friend bool operator==(const device_view_wrapper &lhs, const device_view_wrapper &rhs) noexcept {
+        return lhs.v_ == rhs.v_;
+    }
+
+    /**
+     * @brief Compare two device view wrappers for inequality by comparing the wrapped `std::variant`s.
+     * @param[in] lhs the first device view wrapper
+     * @param[in] rhs the second device view wrapper
+     * @return `true` if both underlying `std::variant`s are unequal, otherwise `false` (`[[nodiscard]]`)
+     */
+    [[nodiscard]] friend bool operator!=(const device_view_wrapper &lhs, const device_view_wrapper &rhs) noexcept {
+        return !(lhs == rhs);
+    }
+
+  private:
+    /// The wrapped `std::variant` type.
+    variant_type v_;
+};
+
+/**
+ * @brief Given a execution @p space and the number of elements @p size, creates a Kokkos::View in the respective memory space.
+ * @tparam T the value type of the underlying Kokkos::View
+ * @param[in] space the specific execution space
+ * @param[in] size the size of the Kokkos::View (number of elements **not** byte!)
+ * @return a Kokkos::View wrapper where the active member of the internal `std::variant` corresponds to the Kokkos::View in the Kokkos::ExecutionSpace specified by @p space (`[[nodiscard]]`)
+ */
+template <typename T>
+[[nodiscard]] device_view_wrapper<T> make_device_view_wrapper(const execution_space &space, const std::size_t size) {
+    switch (space) {
+        case execution_space::cuda:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(([&]() {
+                return device_view_wrapper{ Kokkos::View<T, Kokkos::Cuda>{ "cuda_device_ptr_view", size } };
+            }));
+            break;
+        case execution_space::hip:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(([&]() {
+                return device_view_wrapper{ Kokkos::View<T, Kokkos::HIP>{ "hip_device_ptr_view", size } };
+            }));
+            break;
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(([&]() {
+                return device_view_wrapper{ Kokkos::View<T, Kokkos::SYCL>{ "sycl_device_ptr_view", size } };
+            }));
+            break;
+        case execution_space::hpx:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(([&]() {
+                return device_view_wrapper{ Kokkos::View<T, Kokkos::Experimental::HPX>{ "hpx_device_ptr_view", size } };
+            }));
+            break;
+        case execution_space::openmp:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(([&]() {
+                return device_view_wrapper{ Kokkos::View<T, Kokkos::OpenMP>{ "openmp_device_ptr_view", size } };
+            }));
+            break;
+        case execution_space::openmp_target:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(([&]() {
+                return device_view_wrapper{ Kokkos::View<T Kokkos::OpenMPTarget>{ "openmptarget_device_ptr_view", size } };
+            }));
+            break;
+        case execution_space::openacc:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(([&]() {
+                return device_view_wrapper{ Kokkos::View<T, Kokkos::Experimental::OpenACC>{ "openacc_device_ptr_view", size } };
+            }));
+            break;
+        case execution_space::threads:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(([&]() {
+                return device_view_wrapper{ Kokkos::View<T, Kokkos::Threads>{ "threads_device_ptr_view", size } };
+            }));
+            break;
+        case execution_space::serial:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(([&]() {
+                return device_view_wrapper{ Kokkos::View<T, Kokkos::Serial>{ "serial_device_ptr_view", size } };
+            }));
+            break;
+    }
+    // all possible cases should be handled by the previous switch
+    // -> silence missing return statement compiler warnings due to throw statement
+    ::plssvm::detail::unreachable();
+}
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp
new file mode 100644
index 000000000..30b2a91be
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp
@@ -0,0 +1,197 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief A wrapper around a Kokkos::ExecutionSpace representing a single device.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_WRAPPER_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_WRAPPER_HPP_
+
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::{execution_space, execution_space_to_kokkos_type_t}, plssvm::kokkos::detail::constexpr_available_execution_spaces
+#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
+
+#include <array>       // std::array
+#include <cstddef>     // std::size_t
+#include <functional>  // std::invoke
+#include <utility>     // std::make_index_sequence, std::index_sequence, std::forward
+#include <variant>     // std::variant, std::get, std::visit
+#include <vector>      // std::vector
+
+namespace plssvm::kokkos::detail {
+
+namespace impl {
+
+/**
+ * @brief Uninstantiated base type to create a `std::variant` containing all available Kokkos::ExecutionSpace types.
+ */
+template <typename>
+struct create_device_variant_type_helper;
+
+/**
+ * @brief Helper struct to create a `std::variant` containing all available Kokkos::ExecutionSpace types by iterating over the `std::array` of
+ *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
+ * @tparam Is the indices to index the `std::array`
+ */
+template <std::size_t... Is>
+struct create_device_variant_type_helper<std::index_sequence<Is...>> {
+    /// The array containing all available execution spaces.
+    constexpr static auto array = detail::constexpr_available_execution_spaces();
+    /// The resulting variant type.
+    using type = std::variant<execution_space_to_kokkos_type_t<array[Is]>...>;
+};
+
+/**
+ * @brief Create a `std::variant` containing all available Kokkos::ExecutionSpace types by iterating over the `std::array` of
+ *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
+ */
+struct create_device_variant_type {
+    /// The number of types in the final variant.
+    constexpr static std::size_t N = detail::constexpr_available_execution_spaces().size();
+    /// The final variant type.
+    using type = typename create_device_variant_type_helper<std::make_index_sequence<N>>::type;
+};
+
+}  // namespace impl
+
+/**
+ * @brief A wrapper class around a `std::variant` that contains all available Kokkos::ExecutionSpace types.
+ */
+class device_wrapper {
+  public:
+    /// The `std::variant` type containing all Kokkos::ExecutionSpace types.
+    using variant_type = typename impl::create_device_variant_type::type;
+
+    /**
+     * @brief Default construct the `std::variant` wrapper.
+     */
+    device_wrapper() = default;
+
+    /**
+     * @brief Construct the wrapper using the provided Kokkos::ExecutionSpace instance by forwarding its value to the underlying `std::variant`.
+     * @tparam ExecutionSpace the used Kokkos::ExecutionSpace type
+     * @param[in] exec the Kokkos::ExecutionSpace instance
+     */
+    template <typename ExecutionSpace>
+    explicit device_wrapper(ExecutionSpace &&exec) :
+        v_{ std::forward<ExecutionSpace>(exec) } { }
+
+    /**
+     * @brief Given the provided `execution_space` enum value, tries to get the `std::variant` alternative for the corresponding Kokkos::ExecutionSpace type.
+     * @tparam space the `execution_space` enum value
+     * @return the Kokkos::ExecutionSpace instance (`[[nodiscard]]`)
+     */
+    template <execution_space space>
+    [[nodiscard]] execution_space_to_kokkos_type_t<space> &get() {
+        return std::get<execution_space_to_kokkos_type_t<space>>(v_);
+    }
+
+    /**
+     * @copydoc plssvm::kokkos::detail::device_wrapper::get
+     */
+    template <execution_space space>
+    const execution_space_to_kokkos_type_t<space> &get() const {
+        return std::get<execution_space_to_kokkos_type_t<space>>(v_);
+    }
+
+    /**
+     * @brief Return the `execution_space` enum value of the currently active `std::variant` Kokkos::ExecutionSpace type.
+     * @return the `execution_space` enum value (`[[nodiscard]]`)
+     */
+    [[nodiscard]] execution_space get_execution_space() const noexcept {
+        return detail::constexpr_available_execution_spaces()[v_.index()];
+    }
+
+    /**
+     * @brief Invoke the function @p func on the active `std::variant` member using `std::visit` internally.
+     * @tparam Func the type of the function
+     * @param[in] func the function to invoke
+     */
+    template <typename Func>
+    void execute(const Func &func) {
+        // clang-format off
+        std::visit([&func](auto &device) {
+            std::invoke(func, device);
+        }, v_);
+        // clang-format on
+    }
+
+    /**
+     * @copydoc plssvm::kokkos::detail::device_wrapper::execute
+     */
+    template <typename Func>
+    void execute(const Func &func) const {
+        // clang-format off
+        std::visit([&func](const auto &device) {
+            std::invoke(func, device);
+        }, v_);
+        // clang-format on
+    }
+
+    /**
+     * @brief Invoke the function @p func on the active `std::variant` member using `std::visit` internally returning the result value of the function invocation.
+     * @tparam Func the type of the function
+     * @param[in] func the function to invoke
+     * @return the return value of function @p func (`[[nodiscard]]`)
+     */
+    template <typename Func>
+    [[nodiscard]] auto execute_and_return(const Func &func) {
+        // clang-format off
+        return std::visit([&func](auto &device) {
+            return std::invoke(func, device);
+        }, v_);
+        // clang-format on
+    }
+
+    /**
+     * @copydoc plssvm::kokkos::detail::device_wrapper::execute_and_return
+     */
+    template <typename Func>
+    [[nodiscard]] auto execute_and_return(const Func &func) const {
+        // clang-format off
+        return std::visit([&func](const auto &device) {
+            return std::invoke(func, device);
+        }, v_);
+        // clang-format on
+    }
+
+    /**
+     * @brief Compare two device wrappers for equality by comparing the wrapped `std::variant`s.
+     * @param[in] lhs the first device wrapper
+     * @param[in] rhs the second device wrapper
+     * @return `true` if both underlying `std::variant`s are equal, otherwise `false` (`[[nodiscard]]`)
+     */
+    [[nodiscard]] friend bool operator==(const device_wrapper &lhs, const device_wrapper &rhs) noexcept {
+        return lhs.v_ == rhs.v_;
+    }
+
+    /**
+     * @brief Compare two device wrappers for inequality by comparing the wrapped `std::variant`s.
+     * @param[in] lhs the first device wrapper
+     * @param[in] rhs the second device wrapper
+     * @return `true` if both underlying `std::variant`s are unequal, otherwise `false` (`[[nodiscard]]`)
+     */
+    [[nodiscard]] friend bool operator!=(const device_wrapper &lhs, const device_wrapper &rhs) noexcept {
+        return !(lhs == rhs);
+    }
+
+  private:
+    /// The wrapped `std::variant` type.
+    variant_type v_{};
+};
+
+/**
+ * @brief Get a list of all available devices in the execution @p space that are supported by the @p target platform.
+ * @param[in] space the Kokkos::ExecutionSpace to retrieve the devices from
+ * @param[in] target the target platform that must be supported
+ * @return all devices for the @p target in the Kokkos::ExecutionSpace @p space (`[[nodiscard]]`)
+ */
+[[nodiscard]] std::vector<device_wrapper> get_device_list(execution_space space, target_platform target);
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_WRAPPER_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/typedefs.hpp b/include/plssvm/backends/Kokkos/detail/typedefs.hpp
deleted file mode 100644
index 61fffbb31..000000000
--- a/include/plssvm/backends/Kokkos/detail/typedefs.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * @file
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief A few convenient Kokkos::View typedefs.
- */
-
-#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_TYPEDEFS_HPP_
-#define PLSSVM_BACKENDS_KOKKOS_DETAIL_TYPEDEFS_HPP_
-#pragma once
-
-#include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::DefaultExecutionSpace, Kokkos::HostSpace, Kokkos::MemoryUnmanaged
-
-namespace plssvm::kokkos::detail {
-
-/**
- * @brief Typedef for a simple Kokkos::View targeting the Kokkos::DefaultExecutionSpace.
- * @tparam T the type of the view's data
- */
-template <typename T>
-using device_view_type = Kokkos::View<T *, Kokkos::DefaultExecutionSpace>;
-
-/**
- * @brief Typedef for a simple Kokkos::View always targeting the Kokkos::HostSpace.
- * @tparam T the type of the view's data
- */
-template <typename T>
-using host_view_type = Kokkos::View<T *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
-
-}  // namespace plssvm::kokkos::detail
-
-#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_TYPEDEFS_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/utility.hpp b/include/plssvm/backends/Kokkos/detail/utility.hpp
index e29468830..fe8b0367f 100644
--- a/include/plssvm/backends/Kokkos/detail/utility.hpp
+++ b/include/plssvm/backends/Kokkos/detail/utility.hpp
@@ -13,52 +13,76 @@
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
-#include "plssvm/target_platforms.hpp"                              // plssvm::target_platform
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
+#include "plssvm/detail/type_traits.hpp"                     // PLSSVM_REQUIRES
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
 
-#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
+#include "Kokkos_Core.hpp"  // Kokkos::ExecutionSpace::fence
 
-#include <string>  // std::string
-#include <vector>  // std::vector
+#include <map>          // std::map
+#include <string>       // std::string
+#include <type_traits>  // std::disjunction, std::is_same
+#include <variant>      // std::variant
+#include <vector>       // std::vector
 
 namespace plssvm::kokkos::detail {
 
+namespace impl {
+
+/**
+ * @brief Uninstantiated base type for the check whether a type @p appears in a std::variant @p Variant.
+ * @tparam T the type to check for inclusion
+ * @tparam Variant the std::variant that should include the type @p T
+ */
+template <typename T, typename Variant>
+struct is_type_in_variant;
+
 /**
- * @brief Given the execution @p space, determine the respective default target platform.
- * @param[in] space the Kokkos::ExecutionSpace for which the default target platform should be determined
- * @return the default target platform (`[[nodiscard]]`)
+ * @brief Implement the inclusion check using `std::disjunction`.
+ * @tparam T the type to check for inclusion
+ * @tparam Variant the std::variant that should include the type @p T
  */
-[[nodiscard]] target_platform determine_default_target_platform_from_execution_space(execution_space space);
+template <typename T, typename... Types>
+struct is_type_in_variant<T, std::variant<Types...>> : std::disjunction<std::is_same<T, Types>...> { };
 
 /**
- * @brief Check whether the execution @p space supports the @p target platform. Throws an `plssvm::kokkos::backend_exception` if that's not the case.
- * @param[in] space the Kokkos::ExecutionSpace to investigate
- * @param[in] target the target platform to check
- * @throws plssvm::kokkos::backend_exception if @p space doesn't support the @p target platform
+ * @copydoc plssvm::kokkos::detail::impl::is_type_in_variant
  */
-void check_execution_space_target_platform_combination(execution_space space, target_platform target);
+template <typename T, typename Variant>
+inline constexpr bool is_type_in_variant_v = is_type_in_variant<T, Variant>::value;
+
+}  // namespace impl
 
 /**
- * @brief Get a list of all available devices in the execution @p space that are supported by the @p target platform.
- * @param[in] space the Kokkos::ExecutionSpace to retrieve the devices from
- * @param[in] target the target platform that must be supported
- * @return all devices for the @p target in the Kokkos::ExecutionSpace @p space (`[[nodiscard]]`)
+ * @brief Return a `std::map` containing a mapping from all available target platforms to the available Kokkos::ExecutionSpace that supports said target platform.
+ * @details If a target platform is supported by multiple Kokkos::ExecutionSpace, the order is determined by the order as returned by `list_available_execution_spaces`.
+ * @return the mapping of all available target_platform <-> Kokkos::ExecutionSpace combinations (`[[nodiscard]]`)
  */
-[[nodiscard]] std::vector<Kokkos::DefaultExecutionSpace> get_device_list(execution_space space, target_platform target);
+[[nodiscard]] std::map<target_platform, std::vector<execution_space>> available_target_platform_to_execution_space_mapping();
 
 /**
- * @brief Get the name of the device represented by the Kokkos::ExecutionSpace @p exec in the execution @p space.
- * @param[in] space the Kokkos::ExecutionSpace
- * @param[in] exec the device
+ * @brief Get the name of the device represented by the `device_wrapper` @p dev.
+ * @param[in] dev the device wrapper
  * @return the device name (`[[nodiscard]]`)
  */
-[[nodiscard]] std::string get_device_name(execution_space space, const Kokkos::DefaultExecutionSpace &exec);
+[[nodiscard]] std::string get_device_name(const device_wrapper &dev);
+
+/**
+ * @brief Wait for all kernel and/or other operations on the device wrapper in the @p dev to finish.
+ * @param[in] dev the device wrapper
+ */
+void device_synchronize(const device_wrapper &dev);
 
 /**
- * @brief Wait for all kernel and/or other operations on the Kokkos::ExecutionSpace @p exec to finish
- * @param[in] exec the Kokkos::ExecutionSpace to synchronize
+ * @brief Wait for all kernel and/or other operations on the device represented by the Kokkos::ExecutionSpace @p exec to finish.
+ * @tparam ExecutionSpace the type of the Kokkos::ExecutionSpace
+ * @param[in] exec the device represented by a Kokkos::ExecutionSpace
  */
-void device_synchronize(const Kokkos::DefaultExecutionSpace &exec);
+template <typename ExecutionSpace, PLSSVM_REQUIRES(impl::is_type_in_variant_v<ExecutionSpace, typename impl::create_device_variant_type::type>)>
+void device_synchronize(const ExecutionSpace &exec) {
+    exec.fence();
+}
 
 /**
  * @brief Get the used Kokkos library version.
diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
index fa1236d70..bb37a39a7 100644
--- a/include/plssvm/backends/Kokkos/execution_space.hpp
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -13,9 +13,12 @@
 #define PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_
 #pragma once
 
+#include "Kokkos_Core.hpp"  // Kokkos macros, Kokkos ExecutionSpace types
+
 #include "fmt/base.h"     // fmt::formatter
 #include "fmt/ostream.h"  // fmt::ostream_formatter
 
+#include <array>   // std::array
 #include <iosfwd>  // std::ostream forward declaration
 #include <vector>  // std::vector
 
@@ -45,19 +48,6 @@ enum class execution_space {
     serial
 };
 
-/**
- * @brief Create an `execution_space` from the current `Kokkos::DefaultExecutionSpace`.
- * @return the enum value representing the current `Kokkos::DefaultExecutionSpace` (`[[nodiscard]]`)
- */
-[[nodiscard]] execution_space determine_default_execution_space() noexcept;
-
-/**
- * @brief List all available Kokkos::ExecutionSpaces.
- * @details At least one execution space must **always** be available!
- * @return a vector containing all available execution spaces (`[[nodiscard]]`)
- */
-[[nodiscard]] std::vector<execution_space> available_execution_spaces();
-
 /**
  * @brief Output the execution @p space to the given output-stream @p out.
  * @param[in,out] out the output-stream to write the execution space to
@@ -74,9 +64,277 @@ std::ostream &operator<<(std::ostream &out, execution_space space);
  */
 std::istream &operator>>(std::istream &in, execution_space &space);
 
+//***************************************************//
+//           execution_space_to_kokkos_type          //
+//***************************************************//
+
+/**
+ * @brief Uninstantiated base type to convert an `execution_space` enum value to a Kokkos::ExecutionSpace type.
+ */
+template <execution_space>
+struct execution_space_to_kokkos_type;
+
+#if defined(KOKKOS_ENABLE_CUDA)
+/**
+ * @brief Convert an `execution_space::cuda` enum value to a `Kokkos::Cuda` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::cuda> {
+    using type = Kokkos::Cuda;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+/**
+ * @brief Convert an `execution_space::hip` enum value to a `Kokkos::HIP` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::hip> {
+    using type = Kokkos::HIP;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_SYCL)
+/**
+ * @brief Convert an `execution_space::sycl` enum value to a `Kokkos::SYCL` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::sycl> {
+    using type = Kokkos::SYCL;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_HPX)
+/**
+ * @brief Convert an `execution_space::hpx` enum value to a `Kokkos::Experimental::HPX` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::hpx> {
+    using type = Kokkos::Experimental::HPX;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
+/**
+ * @brief Convert an `execution_space::openmp` enum value to a `Kokkos::OpenMP` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::openmp> {
+    using type = Kokkos::OpenMP;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+/**
+ * @brief Convert an `execution_space::openmp_target` enum value to a `Kokkos::OpenMPTarget` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::openmp_target> {
+    using type = Kokkos::OpenMPTarget;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENACC)
+/**
+ * @brief Convert an `execution_space::openacc` enum value to a `Kokkos::Experimental::OpenACC` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::openacc> {
+    using type = Kokkos::Experimental::OpenACC;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+/**
+ * @brief Convert an `execution_space::threads` enum value to a `Kokkos::Threads` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::threads> {
+    using type = Kokkos::Threads;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_SERIAL)
+/**
+ * @brief Convert an `execution_space::serial` enum value to a `Kokkos::Serial` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::serial> {
+    using type = Kokkos::Serial;
+};
+#endif
+
+/**
+ * @brief Convert the `execution_space` @p space to the corresponding Kokkos::ExecutionSpace type.
+ * @tparam space the enum value to convert
+ */
+template <execution_space space>
+using execution_space_to_kokkos_type_t = typename execution_space_to_kokkos_type<space>::type;
+
+//***************************************************//
+//           kokkos_type_to_execution_space          //
+//***************************************************//
+
+/**
+ * @brief Uninstantiated base type to convert a Kokkos::ExecutionSpace type to a `execution_space` enum value.
+ */
+template <typename>
+struct kokkos_type_to_execution_space;
+
+#if defined(KOKKOS_ENABLE_CUDA)
+/**
+ * @brief Convert a `Kokkos::Cuda` Kokkos::ExecutionSpace type to an `execution_space::cuda` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Cuda> {
+    constexpr static execution_space value = execution_space::cuda;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+/**
+ * @brief Convert a `Kokkos::HIP` Kokkos::ExecutionSpace type to an `execution_space::hip` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::HIP> {
+    constexpr static execution_space value = execution_space::hip;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_SYCL)
+/**
+ * @brief Convert a `Kokkos::SYCL` Kokkos::ExecutionSpace type to an `execution_space::sycl` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::SYCL> {
+    constexpr static execution_space value = execution_space::sycl;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_HPX)
+/**
+ * @brief Convert a `Kokkos::Experimental::HPX` Kokkos::ExecutionSpace type to an `execution_space::hpx` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Experimental::HPX> {
+    constexpr static execution_space value = execution_space::hpx;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
+/**
+ * @brief Convert a `Kokkos::OpenMP` Kokkos::ExecutionSpace type to an `execution_space::openmp` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::OpenMP> {
+    constexpr static execution_space value = execution_space::openmp;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+/**
+ * @brief Convert a `Kokkos::OpenMPTarget` Kokkos::ExecutionSpace type to an `execution_space::openmp_target` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::OpenMPTarget> {
+    constexpr static execution_space value = execution_space::openmp_target;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENACC)
+/**
+ * @brief Convert a `Kokkos::Experimental::OpenACC` Kokkos::ExecutionSpace type to an `execution_space::openacc` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Experimental::OpenACC> {
+    constexpr static execution_space value = execution_space::openacc;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+/**
+ * @brief Convert a `Kokkos::Threads` Kokkos::ExecutionSpace type to an `execution_space::threads` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Threads> {
+    constexpr static execution_space value = execution_space::threads;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_SERIAL)
+/**
+ * @brief Convert a `Kokkos::Serial` Kokkos::ExecutionSpace type to an `execution_space::serial` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Serial> {
+    constexpr static execution_space value = execution_space::serial;
+};
+#endif
+
+/**
+ * @brief Convert the Kokkos::ExecutionSpace type @p ExecutionSpace to the corresponding `execution_space` enum value.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace type to convert
+ */
+template <typename ExecutionSpace>
+inline constexpr execution_space kokkos_type_to_execution_space_v = kokkos_type_to_execution_space<ExecutionSpace>::value;
+
+//***************************************************//
+//                  other functions                  //
+//***************************************************//
+
+namespace detail {
+
+/**
+ * @brief List all available Kokkos::ExecutionSpaces at compile time.
+ * @details At least one execution space must **always** be available!
+ * @return a `std::array` containing all available execution spaces (`[[nodiscard]]`)
+ */
+[[nodiscard]] inline constexpr auto constexpr_available_execution_spaces() noexcept {
+    // Note: the trailing comma is explicitly allowed by the standard
+    // Note: the order is intentionally chosen this way -> the order of the entries determines the priority when using a backend to run our code
+    return std::array{
+#if defined(KOKKOS_ENABLE_CUDA)
+        execution_space::cuda,
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+        execution_space::hip,
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+        execution_space::sycl,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+        execution_space::openmp_target,
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+        execution_space::openacc,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+        execution_space::openmp,
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+        execution_space::threads,
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+        execution_space::hpx,
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+        execution_space::serial,
+#endif
+    };
+}
+
+}  // namespace detail
+
+/**
+ * @brief List all available Kokkos::ExecutionSpaces.
+ * @details Only Kokkos::ExecutionSpaces that where enabled during the CMake configuration are available.
+ * @return the available Kokkos::ExecutionSpaces (`[[nodiscard]]`)
+ */
+[[nodiscard]] std::vector<execution_space> list_available_execution_spaces();
+
 }  // namespace plssvm::kokkos
 
-/// @endcond
+/// @cond
 
 template <>
 struct fmt::formatter<plssvm::kokkos::execution_space> : fmt::ostream_formatter { };
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
index 85997c118..bddadac01 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp
@@ -13,10 +13,9 @@
 #define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/detail/typedefs.hpp"  // plssvm::kokkos::detail::device_view_type
-#include "plssvm/constants.hpp"                        // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/constants.hpp"  // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 
-#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents
 
 #include <cstddef>  // std::size_t
 
@@ -24,8 +23,16 @@ namespace plssvm::kokkos::detail {
 
 /**
  * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  */
+template <typename ExecutionSpace>
 class device_kernel_symm {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
      * @brief Initialize the Kokkos kernel function object.
@@ -40,6 +47,7 @@ class device_kernel_symm {
      * @param[in,out] C the matrix @p C, also used as result matrix
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      */
     device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type<const real_type> A, device_view_type<const real_type> B, const real_type beta, device_view_type<real_type> C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
         num_rows_{ num_rows },
@@ -55,8 +63,12 @@ class device_kernel_symm {
         grid_y_offset_{ grid_y_offset },
         grid_size_x_{ grid_size_x } { }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
@@ -155,8 +167,16 @@ class device_kernel_symm {
 /**
  * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars.
  * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for!
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  */
+template <typename ExecutionSpace>
 class device_kernel_symm_mirror {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
      * @brief Initialize the Kokkos kernel function object.
@@ -172,6 +192,7 @@ class device_kernel_symm_mirror {
      * @param[in,out] C the matrix @p C, also used as result matrix
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      */
     device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type<const real_type> A, device_view_type<const real_type> B, const real_type beta, device_view_type<real_type> C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
         num_rows_{ num_rows },
@@ -188,8 +209,12 @@ class device_kernel_symm_mirror {
         grid_y_offset_{ grid_y_offset },
         grid_size_x_{ grid_size_x } { }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
@@ -278,8 +303,16 @@ class device_kernel_symm_mirror {
 
 /**
  * @brief Perform a simple inplace matrix addition: lhs += rhs.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  */
+template <typename ExecutionSpace>
 class device_kernel_inplace_matrix_add {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
      * @brief Initialize the Kokkos kernel function object.
@@ -288,6 +321,7 @@ class device_kernel_inplace_matrix_add {
      * @param[in] rhs the second matrix
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      */
     device_kernel_inplace_matrix_add(const std::size_t num_cols, device_view_type<real_type> lhs, device_view_type<const real_type> rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
         num_cols_{ num_cols },
@@ -297,8 +331,12 @@ class device_kernel_inplace_matrix_add {
         grid_y_offset_{ grid_y_offset },
         grid_size_x_{ grid_size_x } { }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
@@ -337,8 +375,16 @@ class device_kernel_inplace_matrix_add {
 
 /**
  * @brief Perform a simple inplace matrix scale: lhs *= scalar.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  */
+template <typename ExecutionSpace>
 class device_kernel_inplace_matrix_scale {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
      * @brief Initialize the Kokkos kernel function object.
@@ -347,6 +393,7 @@ class device_kernel_inplace_matrix_scale {
      * @param[in] scale the value to scale
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      */
     device_kernel_inplace_matrix_scale(const std::size_t num_cols, device_view_type<real_type> lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
         num_cols_{ num_cols },
@@ -356,8 +403,12 @@ class device_kernel_inplace_matrix_scale {
         grid_y_offset_{ grid_y_offset },
         grid_size_x_{ grid_size_x } { }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
index 550dbfe0e..b3d46112d 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
@@ -14,12 +14,11 @@
 #pragma once
 
 #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp"  // plssvm::kokkos::detail::standard_layout_tuple
-#include "plssvm/backends/Kokkos/detail/typedefs.hpp"               // plssvm::kokkos::detail::device_view_type
 #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"       // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                                     // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                         // plssvm::kernel_function_type
 
-#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents
 
 #include <cstddef>  // std::size_t
 
@@ -27,15 +26,21 @@ namespace plssvm::kokkos::detail {
 
 /**
  * @brief Create the explicit kernel matrix using the @p kernel_function.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `standard_layout_tuple`
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <typename ExecutionSpace, kernel_function_type kernel_function, typename... Args>
 class device_kernel_assembly {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
-     * @brief Initialize the SYCL kernel function object.
-     * @param[in] cgh the SYCL handler used to allocate the local memory
+     * @brief Initialize the Kokkos kernel function object.
      * @param[out] kernel_matrix_d the calculated kernel matrix
      * @param[in] data_d the data points to calculate the kernel matrix from
      * @param[in] num_rows the number of data points
@@ -47,6 +52,7 @@ class device_kernel_assembly {
      * @param[in] cost the cost factor the diagonal is scaled with
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
      */
     device_kernel_assembly(device_view_type<real_type> kernel_matrix_d, device_view_type<real_type> data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, device_view_type<real_type> q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) :
@@ -65,8 +71,12 @@ class device_kernel_assembly {
         kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward<Args>(kernel_function_parameter)...) } {
     }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
@@ -104,10 +114,10 @@ class device_kernel_assembly {
                     const auto global_j = row_offset_ + j_linear + static_cast<std::size_t>(internal) * THREAD_BLOCK_SIZE_sz;
 
                     // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory
-                    data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + 1ull + PADDING_SIZE_sz) + global_i];
-                    data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + 1ull + PADDING_SIZE_sz) + global_i];
-                    data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + 1ull + PADDING_SIZE_sz) + global_j];
-                    data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + 1ull + PADDING_SIZE_sz) + global_j];
+                    data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i];
+                    data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i];
+                    data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j];
+                    data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j];
                 }
                 team.team_barrier();  // wait until all threads loaded their part of the data
 
@@ -141,7 +151,7 @@ class device_kernel_assembly {
                             temp_ij += cost_;
                         }
                         // update the kernel matrix
-                        kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij;
+                        kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 }; + device_global_i] = temp_ij;
                     }
                 }
             }
diff --git a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
index cf73cadb4..b22f69885 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
@@ -14,12 +14,11 @@
 #pragma once
 
 #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp"  // plssvm::kokkos::detail::standard_layout_tuple
-#include "plssvm/backends/Kokkos/detail/typedefs.hpp"               // plssvm::kokkos::detail::device_view_type
 #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"       // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                                     // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                         // plssvm::kernel_function_type
 
-#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add
 
 #include <cstddef>  // std::size_t
 
@@ -27,11 +26,18 @@ namespace plssvm::kokkos::detail {
 
 /**
  * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <typename ExecutionSpace, kernel_function_type kernel_function, typename... Args>
 class device_kernel_assembly_symm {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
      * @brief Initialize the Kokkos kernel function object.
@@ -49,6 +55,7 @@ class device_kernel_assembly_symm {
      * @param[in] num_classes the number of classes in the data set
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
      */
     device_kernel_assembly_symm(const real_type alpha, device_view_type<const real_type> q, device_view_type<const real_type> data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, device_view_type<const real_type> B, device_view_type<real_type> C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) :
@@ -69,8 +76,12 @@ class device_kernel_assembly_symm {
         grid_size_x_{ grid_size_x },
         kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward<Args>(kernel_function_parameter)...) } { }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
diff --git a/include/plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp
new file mode 100644
index 000000000..584b1afdd
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp
@@ -0,0 +1,56 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines a Kokkos function object for memsetting a device pointer with a specific value.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_KERNEL_DETAIL_MEMSET_KERNEL_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_KERNEL_DETAIL_MEMSET_KERNEL_HPP_
+#pragma once
+
+#include "plssvm/constants.hpp"  // plssvm::real_type
+
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION
+
+#include <cstddef>  // std::size_t
+
+namespace plssvm::kokkos::detail {
+
+/**
+ * @brief A kernel to perform a memset-like operation on a Kokkos::View
+ */
+class device_memset_kernel {
+  public:
+    /**
+     * @brief Memset all bytes in @p data to the provided @p pattern.
+     * @param[out] data the array to memset
+     * @param[in] pattern the memset pattern
+     */
+    device_memset_kernel(unsigned char* data, const unsigned char pattern) :
+        data_{ data },
+        pattern_{ pattern } { }
+
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] idx the index representing the current point in the execution space
+     */
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const std::size_t idx) const {
+        data_[idx] = pattern_;
+    }
+
+  private:
+    /// @cond Doxygen_suppress
+    unsigned char* data_;
+    const unsigned char pattern_;
+    /// @endcond
+};
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_KERNEL_DETAIL_MEMSET_KERNEL_HPP_
diff --git a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
index 952b1e99f..3c6f9c8aa 100644
--- a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
@@ -17,7 +17,7 @@
 #include "plssvm/detail/utility.hpp"                                // plssvm::detail::always_false_v
 #include "plssvm/kernel_function_types.hpp"                         // plssvm::kernel_function_type
 
-#include "Kokkos_MathematicalFunctions.hpp"  // Kokkos::pow, Kokkos::exp, Kokkos::tanh, Kokkos::abs
+#include "Kokkos_MathematicalFunctions.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::pow, Kokkos::exp, Kokkos::tanh, Kokkos::abs
 
 #include <type_traits>  // std::is_same_v
 
@@ -59,7 +59,7 @@ KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::rbf>(const
  */
 template <>
 KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::laplacian>(const real_type val1, const real_type val2) {
-    return ::Kokkos::fabs(val1 - val2);
+    return Kokkos::fabs(val1 - val2);
 }
 
 /**
@@ -73,9 +73,9 @@ template <>
 KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::chi_squared>(const real_type val1, const real_type val2) {
     const real_type d = val1 - val2;
     if constexpr (std::is_same_v<real_type, float>) {
-        return (real_type{ 1.0 } / (val1 + val2 + FLT_MIN)) * d * d;  // TODO: std::numeric_limits::min
+        return (real_type{ 1.0 } / (val1 + val2 + FLT_MIN)) * d * d;
     } else {
-        return (real_type{ 1.0 } / (val1 + val2 + DBL_MIN)) * d * d;  // TODO: std::numeric_limits::min
+        return (real_type{ 1.0 } / (val1 + val2 + DBL_MIN)) * d * d;
     }
 }
 
@@ -92,19 +92,19 @@ KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::chi_square
  * @return the result value (`[[nodiscard]]`)
  */
 template <kernel_function_type kernel_function, typename... Args>
-KOKKOS_INLINE_FUNCTION real_type apply_kernel_function(const real_type value, const detail::standard_layout_tuple<Args...> params) {
+KOKKOS_INLINE_FUNCTION real_type apply_kernel_function(const real_type value, [[maybe_unused]] const detail::standard_layout_tuple<Args...> params) {
     if constexpr (kernel_function == kernel_function_type::linear) {
         return value;
     } else if constexpr (kernel_function == kernel_function_type::polynomial) {
-        return ::Kokkos::pow(detail::get<1>(params) * value + detail::get<2>(params), detail::get<0>(params));
+        return Kokkos::pow(detail::get<1>(params) * value + detail::get<2>(params), detail::get<0>(params));
     } else if constexpr (kernel_function == kernel_function_type::rbf) {
-        return ::Kokkos::exp(-detail::get<0>(params) * value);
+        return Kokkos::exp(-detail::get<0>(params) * value);
     } else if constexpr (kernel_function == kernel_function_type::sigmoid) {
-        return ::Kokkos::tanh(detail::get<0>(params) * value + detail::get<1>(params));
+        return Kokkos::tanh(detail::get<0>(params) * value + detail::get<1>(params));
     } else if constexpr (kernel_function == kernel_function_type::laplacian) {
-        return ::Kokkos::exp(-detail::get<0>(params) * value);
+        return Kokkos::exp(-detail::get<0>(params) * value);
     } else if constexpr (kernel_function == kernel_function_type::chi_squared) {
-        return ::Kokkos::exp(-detail::get<0>(params) * value);
+        return Kokkos::exp(-detail::get<0>(params) * value);
     } else {
         static_assert(::plssvm::detail::always_false_v<Args...>, "Unsupported kernel function!");
     }
diff --git a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
index c6a302d6d..767bfc958 100644
--- a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp
@@ -13,12 +13,11 @@
 #define PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_
 #pragma once
 
-#include "plssvm/backends/Kokkos/detail/typedefs.hpp"          // plssvm::kokkos::detail::device_view_type
 #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp"  // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function}
 #include "plssvm/constants.hpp"                                // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                    // plssvm::kernel_function_type
 
-#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add
+#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add
 
 #include <cstddef>  // std::size_t
 
@@ -26,8 +25,16 @@ namespace plssvm::kokkos::detail {
 
 /**
  * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  */
+template <typename ExecutionSpace>
 class device_kernel_w_linear {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
      * @brief Initialize the Kokkos kernel function object.
@@ -40,6 +47,7 @@ class device_kernel_w_linear {
      * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      */
     device_kernel_w_linear(device_view_type<real_type> w_d, device_view_type<const real_type> alpha_d, device_view_type<const real_type> sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
         w_d_{ w_d },
@@ -53,8 +61,12 @@ class device_kernel_w_linear {
         grid_y_offset_{ grid_y_offset },
         grid_size_x_{ grid_size_x } { }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
@@ -132,8 +144,16 @@ class device_kernel_w_linear {
 
 /**
  * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  */
+template <typename ExecutionSpace>
 class device_kernel_predict_linear {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
      * @brief Initialize the Kokkos kernel function object.
@@ -146,6 +166,7 @@ class device_kernel_predict_linear {
      * @param[in] num_features the number of features per data point
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      */
     device_kernel_predict_linear(device_view_type<real_type> prediction_d, device_view_type<const real_type> w_d, device_view_type<const real_type> rho_d, device_view_type<const real_type> predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) :
         prediction_d_{ prediction_d },
@@ -159,8 +180,12 @@ class device_kernel_predict_linear {
         grid_y_offset_{ grid_y_offset },
         grid_size_x_{ grid_size_x } { }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
@@ -242,11 +267,18 @@ class device_kernel_predict_linear {
 
 /**
  * @brief Predict the @p predict_points_d using the @p kernel_function.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <typename ExecutionSpace, kernel_function_type kernel_function, typename... Args>
 class device_kernel_predict {
+    /**
+     * @brief The type of the used Kokkos::View.
+     */
+    template <typename T>
+    using device_view_type = Kokkos::View<T *, ExecutionSpace>;
+
   public:
     /**
      * @brief Initialize the SYCL kernel function object.
@@ -261,6 +293,7 @@ class device_kernel_predict {
      * @param[in] num_features the number of features per data point
      * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_size_x the size of the execution grid in x-dimension
      * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
      */
     device_kernel_predict(device_view_type<real_type> prediction_d, device_view_type<const real_type> alpha_d, device_view_type<const real_type> rho_d, device_view_type<const real_type> sv_d, device_view_type<const real_type> predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) :
@@ -278,8 +311,12 @@ class device_kernel_predict {
         grid_size_x_{ grid_size_x },
         kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward<Args>(kernel_function_parameter)...) } { }
 
+    /**
+     * @brief Function call operator overload performing the actual calculation.
+     * @param[in] team the Kokkos team representing the current point in the execution space
+     */
     KOKKOS_INLINE_FUNCTION
-    void operator()(const Kokkos::TeamPolicy<>::member_type &team) const {
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) const {
         // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
         const auto INTERNAL_BLOCK_SIZE_sz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
         const auto THREAD_BLOCK_SIZE_sz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
@@ -359,7 +396,7 @@ class device_kernel_predict {
                     alpha_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx];
 
                     // the bias (rho) must only be applied once for all support vectors
-                    if (blockIdx_y == 0ull) {
+                    if (blockIdx_y == std::size_t{ 0 }) {
                         out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y];
                         out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y + THREAD_BLOCK_SIZE_sz];
                     } else {
diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
index 89cf282ce..20ae3c0a6 100644
--- a/src/plssvm/backends/Kokkos/CMakeLists.txt
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -23,6 +23,7 @@ message(CHECK_PASS "found")
 # explicitly set sources
 set(PLSSVM_KOKKOS_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/device_wrapper.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp
     ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 1f4b0d8d5..0a1903c16 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -11,7 +11,8 @@
 #include "plssvm/backends/execution_range.hpp"                                        // plssvm::detail::{execution_range, dim_type}
 #include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"                    // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"                               // plssvm::kokkos::detail::device_ptr
-#include "plssvm/backends/Kokkos/detail/utility.hpp"                                  // plssvm::kokkos::detail::get_runtime_version
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"                           // plssvm::kokkos::detail::{device_wrapper, get_device_list}
+#include "plssvm/backends/Kokkos/detail/utility.hpp"                                  // plssvm::kokkos::detail::get_runtime_version // TODO: docu
 #include "plssvm/backends/Kokkos/exceptions.hpp"                                      // plssvm::kokkos::backend_exception
 #include "plssvm/backends/Kokkos/execution_space.hpp"                                 // plssvm::kokkos::execution_space
 #include "plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp"                         // plssvm::kokkos::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale}
@@ -24,6 +25,7 @@
 #include "plssvm/detail/logging.hpp"                                                  // plssvm::detail::log
 #include "plssvm/detail/memory_size.hpp"                                              // plssvm::detail::memory_size
 #include "plssvm/detail/tracking/performance_tracker.hpp"                             // plssvm::detail::tracking::tracking_entry
+#include "plssvm/detail/type_traits.hpp"                                              // plssvm::detail::remove_cvref_t
 #include "plssvm/detail/utility.hpp"                                                  // plssvm::detail::{get_system_memory, unreachable}
 #include "plssvm/exceptions/exceptions.hpp"                                           // plssvm::exception
 #include "plssvm/kernel_function_types.hpp"                                           // plssvm::kernel_function_type
@@ -39,7 +41,9 @@
 #include <cstddef>    // std::size_t
 #include <exception>  // std::terminate
 #include <iostream>   // std::cout, std::endl
+#include <map>        // std::map
 #include <string>     // std::string
+#include <utility>    // std::move
 #include <vector>     // std::vector
 
 namespace plssvm::kokkos {
@@ -48,8 +52,7 @@ csvm::csvm(parameter params) :
     csvm{ plssvm::target_platform::automatic, params } { }
 
 csvm::csvm(target_platform target, parameter params) :
-    base_type{ params },
-    space_{ determine_default_execution_space() } {
+    base_type{ params } {
     this->init(target);
 }
 
@@ -80,23 +83,41 @@ void csvm::init(const target_platform target) {
             break;
     }
 
+    // get all available target_platform <-> Kokkos::ExecutionSpace combinations
+    const std::map<target_platform, std::vector<execution_space>> available_combinations = detail::available_target_platform_to_execution_space_mapping();
+
+    if (target == target_platform::automatic) {
+        // go through all combinations and choose the first execution space in order: gpu_nvidia -> gpu_amd -> gpu_intel -> cpu
+        for (const target_platform target_order : { target_platform::gpu_nvidia, target_platform::gpu_amd, target_platform::gpu_intel, target_platform::cpu }) {
+            if (::plssvm::detail::contains(available_combinations, target_order)) {
+                // the target platform is supported -> choose the first execution space to use in the Kokkos backend
+                space_ = available_combinations.at(target_order).front();
+                target_ = target_order;
+                break;
+            }
+        }
+    } else {
+        // check whether the provided target platform is compatible with the currently available Kokkos::ExecutionSpaces
+        if (::plssvm::detail::contains(available_combinations, target)) {
+            // the target platform is supported -> choose the first execution space to use in the Kokkos backend
+            space_ = available_combinations.at(target).front();
+            target_ = target;
+        } else {
+            // the provided target platform is unsupported -> throw an exception
+            throw backend_exception{ fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform {}!", fmt::join(list_available_execution_spaces(), ", "), target) };
+        }
+    }
+
     plssvm::detail::log(verbosity_level::full,
-                        "\nUsing Kokkos ({}) as backend with the Kokkos::DefaultExecutionSpace \"{}\".\n",
+                        "\nUsing Kokkos ({}) as backend with the Kokkos::ExecutionSpace \"{}\".\n",
                         plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_version", detail::get_kokkos_version() },
                         plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_default_execution_space", space_ });
 
-    // check whether the provided target platform is compatible with the Kokkos execution space
+    // output automatic target platform information
     if (target == target_platform::automatic) {
-        // determine the default target based on the provided Kokkos execution space
-        target_ = detail::determine_default_target_platform_from_execution_space(space_);
         plssvm::detail::log(verbosity_level::full,
                             "Using {} as automatic target platform.\n",
                             target_);
-    } else {
-        // check whether the provided target platform is compatible with the execution space
-        // throws a backend exception if the combination is invalid
-        detail::check_execution_space_target_platform_combination(space_, target);
-        target_ = target;
     }
 
     // get all available devices wrt the requested target platform
@@ -116,7 +137,7 @@ void csvm::init(const target_platform target) {
     std::vector<std::string> device_names{};
     device_names.reserve(devices_.size());
     for (typename std::vector<queue_type>::size_type device = 0; device < devices_.size(); ++device) {
-        const std::string device_name = detail::get_device_name(space_, devices_[device]);
+        const std::string device_name = detail::get_device_name(devices_[device]);
         plssvm::detail::log(verbosity_level::full,
                             "  [{}, {}]\n",
                             device,
@@ -147,21 +168,21 @@ std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].cuda_device_prop().totalGlobalMem) };
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::cuda>().cuda_device_prop().totalGlobalMem) };
                 }
                 return res;
             });
         case execution_space::hip:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].hip_device_prop().totalGlobalMem) };
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::hip>().hip_device_prop().totalGlobalMem) };
                 }
                 return res;
             });
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].sycl_queue().get_device().get_info<::sycl::info::device::global_mem_size>()) };
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::global_mem_size>()) };
                 }
                 return res;
             });
@@ -188,7 +209,7 @@ std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].sycl_queue().get_device().get_info<::sycl::info::device::max_mem_alloc_size>()) };
+                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::max_mem_alloc_size>()) };
                 }
                 return res;
             });
@@ -213,22 +234,25 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
     switch (space_) {
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
-                return static_cast<std::size_t>(devices_[device_id].cuda_device_prop().maxThreadsPerBlock);
+                return static_cast<std::size_t>(devices_[device_id].get<execution_space::cuda>().cuda_device_prop().maxThreadsPerBlock);
             });
         case execution_space::hip:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
-                return static_cast<std::size_t>(devices_[device_id].hip_device_prop().maxThreadsPerBlock);
+                return static_cast<std::size_t>(devices_[device_id].get<execution_space::hip>().hip_device_prop().maxThreadsPerBlock);
             });
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
-                return devices_[device_id].sycl_queue().get_device().get_info<::sycl::info::device::max_work_group_size>();
+                return devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::max_work_group_size>();
             });
+        case execution_space::openmp:
+            return 16;  // TODO: most likely dependent on the number of cores in Kokkos...
+        case execution_space::serial:
+            // only one thread allowed in serial execution
+            return 1;
         case execution_space::openmp_target:
         case execution_space::openacc:
-        case execution_space::openmp:
         case execution_space::hpx:
         case execution_space::threads:
-        case execution_space::serial:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
     // all possible cases should be handled by the previous switch
@@ -243,21 +267,25 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id)
     switch (space_) {
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(([&]() -> ::plssvm::detail::dim_type {
-                const cudaDeviceProp &prop = devices_[device_id].cuda_device_prop();
+                // TODO: Kokkos only uses maxGridSize[0]
+                const cudaDeviceProp &prop = devices_[device_id].get<execution_space::cuda>().cuda_device_prop();
                 return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
             }));
         case execution_space::hip:
+            // TODO: Kokkos only uses maxGridSize[0]
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(([&]() -> ::plssvm::detail::dim_type {
-                const hipDeviceProp &prop = devices_[device_id].hip_device_prop();
+                const hipDeviceProp &prop = devices_[device_id].get<execution_space::hip>().hip_device_prop();
                 return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
             }));
+        case execution_space::openmp:
+            return { 16, 16, 16 };  // TODO: correct values
+        case execution_space::serial:
+            return { 1, 1, 1 };  // TODO: correct values
         case execution_space::sycl:
         case execution_space::openmp_target:
         case execution_space::openacc:
-        case execution_space::openmp:
         case execution_space::hpx:
         case execution_space::threads:
-        case execution_space::serial:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
     // all possible cases should be handled by the previous switch
@@ -272,7 +300,6 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id)
 auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type {
     const unsigned long long num_rows_reduced = data_d.shape().x - 1;
     const unsigned long long num_features = data_d.shape().y;
-    const queue_type &device = devices_[device_id];
 
     // calculate the number of data points this device is responsible for
     const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
@@ -284,192 +311,213 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons
     const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_);
     const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id);
 
-    device_ptr_type kernel_matrix_d{ num_entries_padded, device };  // only explicitly store the upper triangular matrix
+    device_ptr_type kernel_matrix_d{ num_entries_padded, devices_[device_id] };  // only explicitly store the upper triangular matrix
     const real_type cost_factor = real_type{ 1.0 } / params.cost;
     const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
     // save the team sizes
     const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-    for (const auto &[partial_grid, offsets] : exec.grids) {
-        // create a Kokkos TeamPolicy
-        Kokkos::TeamPolicy<> team_policy(device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO);
+    return devices_[device_id].execute_and_return([&](auto &device) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
+        constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-        switch (params.kernel_type) {
-            case kernel_function_type::linear:
-                {
-                    using functor_type = detail::device_kernel_assembly<kernel_function_type::linear>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x });
-                }
-                break;
-            case kernel_function_type::polynomial:
-                {
-                    using functor_type = detail::device_kernel_assembly<kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
-                }
-                break;
-            case kernel_function_type::rbf:
-                {
-                    using functor_type = detail::device_kernel_assembly<kernel_function_type::rbf, real_type>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
-            case kernel_function_type::sigmoid:
-                {
-                    using functor_type = detail::device_kernel_assembly<kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
-                }
-                break;
-            case kernel_function_type::laplacian:
-                {
-                    using functor_type = detail::device_kernel_assembly<kernel_function_type::laplacian, real_type>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
-            case kernel_function_type::chi_squared:
-                {
-                    using functor_type = detail::device_kernel_assembly<kernel_function_type::chi_squared, real_type>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_explicit_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
+        for (const auto &[partial_grid, offsets] : exec.grids) {
+            // create a Kokkos TeamPolicy
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()) };
+
+            switch (params.kernel_type) {
+                case kernel_function_type::linear:
+                    {
+                        using functor_type = detail::device_kernel_assembly<kokkos_execution_space_type, kernel_function_type::linear>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_explicit_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get<space>(), data_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get<space>(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x });
+                    }
+                    break;
+                case kernel_function_type::polynomial:
+                    {
+                        using functor_type = detail::device_kernel_assembly<kokkos_execution_space_type, kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_explicit_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get<space>(), data_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get<space>(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
+                    }
+                    break;
+                case kernel_function_type::rbf:
+                    {
+                        using functor_type = detail::device_kernel_assembly<kokkos_execution_space_type, kernel_function_type::rbf, real_type>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_explicit_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get<space>(), data_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get<space>(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+                case kernel_function_type::sigmoid:
+                    {
+                        using functor_type = detail::device_kernel_assembly<kokkos_execution_space_type, kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_explicit_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get<space>(), data_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get<space>(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
+                    }
+                    break;
+                case kernel_function_type::laplacian:
+                    {
+                        using functor_type = detail::device_kernel_assembly<kokkos_execution_space_type, kernel_function_type::laplacian, real_type>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_explicit_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get<space>(), data_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get<space>(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+                case kernel_function_type::chi_squared:
+                    {
+                        using functor_type = detail::device_kernel_assembly<kokkos_execution_space_type, kernel_function_type::chi_squared, real_type>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_explicit_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get<space>(), data_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get<space>(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+            }
         }
-    }
-    detail::device_synchronize(device);
+        detail::device_synchronize(device);
 
-    return kernel_matrix_d;
+        return std::move(kernel_matrix_d);
+    });
 }
 
 void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, const real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, const real_type beta, device_ptr_type &C_d) const {
     const unsigned long long num_rhs = B_d.shape().x;
     const unsigned long long num_rows = B_d.shape().y;
-    const queue_type &device = devices_[device_id];
 
-    // calculate the number of data points this device is responsible for
-    const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
-    // get the offset of the data points this device is responsible for
-    const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
-    // the necessary amount of scratch memory for the kernels
-    const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
+    devices_[device_id].execute([&](auto &device) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
+        constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-    // save the team sizes
-    const ::plssvm::detail::dim_type team_sizes = exec.block;
+        // calculate the number of data points this device is responsible for
+        const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
+        // get the offset of the data points this device is responsible for
+        const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
+        // the necessary amount of scratch memory for the kernels
+        const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
-    for (const auto &[partial_grid, offsets] : exec.grids) {
-        // create a Kokkos TeamPolicy
-        Kokkos::TeamPolicy<> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+        // save the team sizes
+        const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-        Kokkos::parallel_for("blas_level_3_kernel_explicit", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets.x, offsets.y, partial_grid.x));
-    }
+        for (const auto &[partial_grid, offsets] : exec.grids) {
+            // create a Kokkos TeamPolicy
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
 
-    // save the mirror team sizes
-    const ::plssvm::detail::dim_type mirror_team_sizes = mirror_exec.block;
+            Kokkos::parallel_for("blas_level_3_kernel_explicit", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm<kokkos_execution_space_type>{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get().get<space>(), B_d.get().get<space>(), beta, C_d.get().get<space>(), offsets.x, offsets.y, partial_grid.x });
+        }
 
-    for (const auto &[partial_grid, offsets] : mirror_exec.grids) {
-        const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows;
+        // save the mirror team sizes
+        const ::plssvm::detail::dim_type mirror_team_sizes = mirror_exec.block;
 
-        if (num_mirror_rows > 0) {
-            // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(mirror_team_sizes.total_size()), Kokkos::AUTO };
+        for (const auto &[partial_grid, offsets] : mirror_exec.grids) {
+            const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows;
 
-            Kokkos::parallel_for("blas_level_3_kernel_explicit_mirror", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets.x, offsets.y, partial_grid.x));
+            if (num_mirror_rows > 0) {
+                // create a Kokkos TeamPolicy
+                Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(mirror_team_sizes.total_size()), Kokkos::AUTO };
+
+                Kokkos::parallel_for("blas_level_3_kernel_explicit_mirror", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm_mirror<kokkos_execution_space_type>{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get().get<space>(), B_d.get().get<space>(), beta, C_d.get().get<space>(), offsets.x, offsets.y, partial_grid.x });
+            }
         }
-    }
-    detail::device_synchronize(device);
+        detail::device_synchronize(device);
+    });
 }
 
 void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const device_ptr_type &rhs_d) const {
     const unsigned long long num_rhs = lhs_d.shape().x;
-    const queue_type &device = devices_[device_id];
 
-    // save the team sizes
-    const ::plssvm::detail::dim_type team_sizes = exec.block;
+    devices_[device_id].execute([&](auto &device) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
+        constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-    for (const auto &[partial_grid, offsets] : exec.grids) {
-        // create a Kokkos TeamPolicy
-        Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+        // save the team sizes
+        const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-        Kokkos::parallel_for("inplace_matrix_addition", team_policy, detail::device_kernel_inplace_matrix_add(num_rhs, lhs_d.get(), rhs_d.get(), offsets.x, offsets.y, partial_grid.x));
-    }
-    detail::device_synchronize(device);
+        for (const auto &[partial_grid, offsets] : exec.grids) {
+            // create a Kokkos TeamPolicy
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+            Kokkos::parallel_for("inplace_matrix_addition", team_policy, detail::device_kernel_inplace_matrix_add<kokkos_execution_space_type>{ num_rhs, lhs_d.get().get<space>(), rhs_d.get().get<space>(), offsets.x, offsets.y, partial_grid.x });
+        }
+        detail::device_synchronize(device);
+    });
 }
 
 void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const real_type scale) const {
     const unsigned long long num_rhs = lhs_d.shape().x;
-    const queue_type &device = devices_[device_id];
 
-    // save the team sizes
-    const ::plssvm::detail::dim_type team_sizes = exec.block;
+    devices_[device_id].execute([&](auto &device) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
+        constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-    for (const auto &[partial_grid, offsets] : exec.grids) {
-        // create a Kokkos TeamPolicy
-        Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+        // save the team sizes
+        const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-        Kokkos::parallel_for("inplace_matrix_scale", team_policy, detail::device_kernel_inplace_matrix_scale(num_rhs, lhs_d.get(), scale, offsets.x, offsets.y, partial_grid.x));
-    }
-    detail::device_synchronize(device);
+        for (const auto &[partial_grid, offsets] : exec.grids) {
+            // create a Kokkos TeamPolicy
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+            Kokkos::parallel_for("inplace_matrix_scale", team_policy, detail::device_kernel_inplace_matrix_scale<kokkos_execution_space_type>{ num_rhs, lhs_d.get().get<space>(), scale, offsets.x, offsets.y, partial_grid.x });
+        }
+        detail::device_synchronize(device);
+    });
 }
 
 void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const real_type alpha, const device_ptr_type &A_d, const parameter &params, const device_ptr_type &q_red, const real_type QA_cost, const device_ptr_type &B_d, device_ptr_type &C_d) const {
     const unsigned long long num_rows_reduced = A_d.shape().x - 1;
     const unsigned long long num_features = A_d.shape().y;
     const unsigned long long num_classes = B_d.shape().x;
-    const queue_type &device = devices_[device_id];
 
-    // calculate the number of data points this device is responsible for
-    const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
-    // get the offset of the data points this device is responsible for
-    const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
+    devices_[device_id].execute([&](auto &device) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
+        constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-    const real_type cost_factor = real_type{ 1.0 } / params.cost;
-    const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
+        // calculate the number of data points this device is responsible for
+        const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id);
+        // get the offset of the data points this device is responsible for
+        const unsigned long long row_offset = data_distribution_->place_row_offset(device_id);
 
-    // save the team sizes
-    const ::plssvm::detail::dim_type team_sizes = exec.block;
+        const real_type cost_factor = real_type{ 1.0 } / params.cost;
+        const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
-    for (const auto &[partial_grid, offsets] : exec.grids) {
-        // create a Kokkos TeamPolicy
-        Kokkos::TeamPolicy<> team_policy(device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO);
+        // save the team sizes
+        const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-        switch (params.kernel_type) {
-            case kernel_function_type::linear:
-                {
-                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::linear>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x });
-                }
-                break;
-            case kernel_function_type::polynomial:
-                {
-                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
-                }
-                break;
-            case kernel_function_type::rbf:
-                {
-                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::rbf, real_type>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
-            case kernel_function_type::sigmoid:
-                {
-                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
-                }
-                break;
-            case kernel_function_type::laplacian:
-                {
-                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::laplacian, real_type>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
-            case kernel_function_type::chi_squared:
-                {
-                    using functor_type = detail::device_kernel_assembly_symm<kernel_function_type::chi_squared, real_type>;
-                    Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
+        for (const auto &[partial_grid, offsets] : exec.grids) {
+            // create a Kokkos TeamPolicy
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+            switch (params.kernel_type) {
+                case kernel_function_type::linear:
+                    {
+                        using functor_type = detail::device_kernel_assembly_symm<kokkos_execution_space_type, kernel_function_type::linear>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get<space>(), A_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get<space>(), C_d.get().get<space>(), num_classes, offsets.x, offsets.y, partial_grid.x });
+                    }
+                    break;
+                case kernel_function_type::polynomial:
+                    {
+                        using functor_type = detail::device_kernel_assembly_symm<kokkos_execution_space_type, kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get<space>(), A_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get<space>(), C_d.get().get<space>(), num_classes, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
+                    }
+                    break;
+                case kernel_function_type::rbf:
+                    {
+                        using functor_type = detail::device_kernel_assembly_symm<kokkos_execution_space_type, kernel_function_type::rbf, real_type>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get<space>(), A_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get<space>(), C_d.get().get<space>(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+                case kernel_function_type::sigmoid:
+                    {
+                        using functor_type = detail::device_kernel_assembly_symm<kokkos_execution_space_type, kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get<space>(), A_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get<space>(), C_d.get().get<space>(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
+                    }
+                    break;
+                case kernel_function_type::laplacian:
+                    {
+                        using functor_type = detail::device_kernel_assembly_symm<kokkos_execution_space_type, kernel_function_type::laplacian, real_type>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get<space>(), A_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get<space>(), C_d.get().get<space>(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+                case kernel_function_type::chi_squared:
+                    {
+                        using functor_type = detail::device_kernel_assembly_symm<kokkos_execution_space_type, kernel_function_type::chi_squared, real_type>;
+                        Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get<space>(), A_d.get().get<space>(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get<space>(), C_d.get().get<space>(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+            }
         }
-    }
-    detail::device_synchronize(device);
+        detail::device_synchronize(device);
+    });
 }
 
 //***************************************************//
@@ -481,27 +529,31 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe
     const unsigned long long num_sv = alpha_d.shape().y;
     const unsigned long long device_specific_num_sv = sv_d.shape().x;
     const unsigned long long num_features = sv_d.shape().y;
-    const queue_type &device = devices_[device_id];
 
     // get the offset of the data points this device is responsible for
     const unsigned long long sv_offset = data_distribution_->place_row_offset(device_id);
 
-    device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device };
+    device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, devices_[device_id] };
 
     const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
     // save the team sizes
     const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-    for (const auto &[partial_grid, offsets] : exec.grids) {
-        // create a Kokkos TeamPolicy
-        Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+    return devices_[device_id].execute_and_return([&](auto &device) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
+        constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-        Kokkos::parallel_for("w_kernel", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_w_linear(w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets.x, offsets.y, partial_grid.x));
-    }
-    detail::device_synchronize(device);
+        for (const auto &[partial_grid, offsets] : exec.grids) {
+            // create a Kokkos TeamPolicy
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+            Kokkos::parallel_for("w_kernel", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_w_linear<kokkos_execution_space_type>{ w_d.get().get<space>(), alpha_d.get().get<space>(), sv_d.get().get<space>(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets.x, offsets.y, partial_grid.x });
+        }
+        detail::device_synchronize(device);
 
-    return w_d;
+        return std::move(w_d);
+    });
 }
 
 auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter &params, const device_ptr_type &alpha_d, const device_ptr_type &rho_d, const device_ptr_type &sv_or_w_d, const device_ptr_type &predict_points_d) const -> device_ptr_type {
@@ -509,61 +561,65 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai
     const unsigned long long num_predict_points = predict_points_d.shape().x;  // = device_specific_num_rows
     const unsigned long long num_features = predict_points_d.shape().y;
     const unsigned long long num_sv = sv_or_w_d.shape().x;
-    const queue_type &device = devices_[device_id];
 
-    device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, device };
+    device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, devices_[device_id] };
 
     const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
     // save the team sizes
     const ::plssvm::detail::dim_type team_sizes = exec.block;
 
-    for (const auto &[partial_grid, offsets] : exec.grids) {
-        // create a Kokkos TeamPolicy
-        Kokkos::TeamPolicy<> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+    return devices_[device_id].execute_and_return([&](auto &device) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
+        constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-        switch (params.kernel_type) {
-            case kernel_function_type::linear:
-                {
-                    using functor_type = detail::device_kernel_predict_linear;
-                    Kokkos::parallel_for("predict_kernel_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x });
-                }
-                break;
-            case kernel_function_type::polynomial:
-                {
-                    using functor_type = detail::device_kernel_predict<kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
-                    Kokkos::parallel_for("predict_kernel_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
-                }
-                break;
-            case kernel_function_type::rbf:
-                {
-                    using functor_type = detail::device_kernel_predict<kernel_function_type::rbf, real_type>;
-                    Kokkos::parallel_for("predict_kernel_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
-            case kernel_function_type::sigmoid:
-                {
-                    using functor_type = detail::device_kernel_predict<kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
-                    Kokkos::parallel_for("predict_kernel_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
-                }
-                break;
-            case kernel_function_type::laplacian:
-                {
-                    using functor_type = detail::device_kernel_predict<kernel_function_type::laplacian, real_type>;
-                    Kokkos::parallel_for("predict_kernel_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
-            case kernel_function_type::chi_squared:
-                {
-                    using functor_type = detail::device_kernel_predict<kernel_function_type::chi_squared, real_type>;
-                    Kokkos::parallel_for("predict_kernel_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
-                }
-                break;
+        for (const auto &[partial_grid, offsets] : exec.grids) {
+            // create a Kokkos TeamPolicy
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+
+            switch (params.kernel_type) {
+                case kernel_function_type::linear:
+                    {
+                        using functor_type = detail::device_kernel_predict_linear<kokkos_execution_space_type>;
+                        Kokkos::parallel_for("predict_kernel_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get<space>(), sv_or_w_d.get().get<space>(), rho_d.get().get<space>(), predict_points_d.get().get<space>(), num_classes, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x });
+                    }
+                    break;
+                case kernel_function_type::polynomial:
+                    {
+                        using functor_type = detail::device_kernel_predict<kokkos_execution_space_type, kernel_function_type::polynomial, decltype(params.degree), real_type, decltype(params.coef0)>;
+                        Kokkos::parallel_for("predict_kernel_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get<space>(), alpha_d.get().get<space>(), rho_d.get().get<space>(), sv_or_w_d.get().get<space>(), predict_points_d.get().get<space>(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, params.degree, std::get<real_type>(params.gamma), params.coef0 });
+                    }
+                    break;
+                case kernel_function_type::rbf:
+                    {
+                        using functor_type = detail::device_kernel_predict<kokkos_execution_space_type, kernel_function_type::rbf, real_type>;
+                        Kokkos::parallel_for("predict_kernel_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get<space>(), alpha_d.get().get<space>(), rho_d.get().get<space>(), sv_or_w_d.get().get<space>(), predict_points_d.get().get<space>(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+                case kernel_function_type::sigmoid:
+                    {
+                        using functor_type = detail::device_kernel_predict<kokkos_execution_space_type, kernel_function_type::sigmoid, real_type, decltype(params.coef0)>;
+                        Kokkos::parallel_for("predict_kernel_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get<space>(), alpha_d.get().get<space>(), rho_d.get().get<space>(), sv_or_w_d.get().get<space>(), predict_points_d.get().get<space>(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma), params.coef0 });
+                    }
+                    break;
+                case kernel_function_type::laplacian:
+                    {
+                        using functor_type = detail::device_kernel_predict<kokkos_execution_space_type, kernel_function_type::laplacian, real_type>;
+                        Kokkos::parallel_for("predict_kernel_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get<space>(), alpha_d.get().get<space>(), rho_d.get().get<space>(), sv_or_w_d.get().get<space>(), predict_points_d.get().get<space>(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+                case kernel_function_type::chi_squared:
+                    {
+                        using functor_type = detail::device_kernel_predict<kokkos_execution_space_type, kernel_function_type::chi_squared, real_type>;
+                        Kokkos::parallel_for("predict_kernel_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get<space>(), alpha_d.get().get<space>(), rho_d.get().get<space>(), sv_or_w_d.get().get<space>(), predict_points_d.get().get<space>(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get<real_type>(params.gamma) });
+                    }
+                    break;
+            }
         }
-    }
-    detail::device_synchronize(device);
+        detail::device_synchronize(device);
 
-    return out_d;
+        return std::move(out_d);
+    });
 }
 
 }  // namespace plssvm::kokkos
diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
index cbf973ca4..dcd0f98d3 100644
--- a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -8,13 +8,16 @@
 
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"
 
-#include "plssvm/backends/Kokkos/detail/typedefs.hpp"  // plssvm::kokkos::detail::{device_view_type, host_view_type}
-#include "plssvm/backends/Kokkos/detail/utility.hpp"   // plssvm::detail::device_synchronize
-#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
-#include "plssvm/detail/assert.hpp"                    // PLSSVM_ASSERT
-#include "plssvm/shape.hpp"                            // plssvm::shape
+#include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp"   // plssvm::kokkos::detail::{device_view_wrapper, make_device_view_wrapper}
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"        // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/detail/utility.hpp"               // plssvm::detail::device_synchronize
+#include "plssvm/backends/Kokkos/exceptions.hpp"                   // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp"  // plssvm::kokkos::detail::device_fill_array
+#include "plssvm/detail/assert.hpp"                                // PLSSVM_ASSERT
+#include "plssvm/detail/type_traits.hpp"                           // plssvm::detail::remove_cvref_t
+#include "plssvm/shape.hpp"                                        // plssvm::shape
 
-#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace, Kokkos::subview, Kokkos::parallel_for, KOKKOS_LAMBDA, Kokkos::deep_copy
+#include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::HostSpace, Kokkos::MemoryUnmanaged, Kokkos::subview, Kokkos::parallel_for, Kokkos::deep_copy
 
 #include "fmt/core.h"  // fmt::format
 
@@ -26,76 +29,94 @@
 
 namespace plssvm::kokkos::detail {
 
+/**
+ * @brief Typedef for a simple Kokkos::View always targeting the Kokkos::HostSpace.
+ * @tparam T the type of the view's data
+ */
 template <typename T>
-device_ptr<T>::device_ptr(const size_type size, const Kokkos::DefaultExecutionSpace &exec) :
-    device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, exec } { }
+using host_view_type = Kokkos::View<T *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
 
 template <typename T>
-device_ptr<T>::device_ptr(const plssvm::shape shape, const Kokkos::DefaultExecutionSpace &exec) :
-    device_ptr{ shape, plssvm::shape{ 0, 0 }, exec } { }
+device_ptr<T>::device_ptr(const size_type size, const device_wrapper &device) :
+    device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { }
 
 template <typename T>
-device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const Kokkos::DefaultExecutionSpace &exec) :
-    base_type{ shape, padding, exec } {
-    data_ = device_view_type<T>{ "device_ptr_view", this->size_padded() };
+device_ptr<T>::device_ptr(const plssvm::shape shape, const device_wrapper &device) :
+    device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { }
+
+template <typename T>
+device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const device_wrapper &device) :
+    base_type{ shape, padding, device } {
+    data_ = make_device_view_wrapper<T *>(device.get_execution_space(), this->size_padded());
+    this->memset(0);
 }
 
 template <typename T>
 void device_ptr<T>::memset(const int pattern, const size_type pos, const size_type num_bytes) {
-    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     if (pos >= this->size_padded()) {
         throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) };
     }
     const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type));
 
-    // create subview of the device data
-    auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + (rnum_bytes / sizeof(value_type))));
-    // fill subview with constant data
-    Kokkos::parallel_for("device_ptr_memset", num_bytes, KOKKOS_LAMBDA(const std::size_t idx) {
-        // Cast the view's data pointer to unsigned char* (byte access)
-        reinterpret_cast<unsigned char*>(data_subview.data())[idx] = pattern; });
+    // TODO: use Kokkos ZeroMemset specialization?
+    data_.execute([&](const auto &data) {
+        using kokkos_execution_space_type = typename ::plssvm::detail::remove_cvref_t<decltype(data)>::execution_space;
 
-    detail::device_synchronize(queue_);
+        // create subview of the device data
+        auto *data_ptr = reinterpret_cast<unsigned char *>(data.data() + pos);
+        auto p = static_cast<unsigned char>(pattern);
+        // memset subview
+        Kokkos::parallel_for("device_ptr_memset",
+                             Kokkos::RangePolicy<kokkos_execution_space_type>(0, rnum_bytes),
+                             device_memset_kernel{ data_ptr, p });
+
+        detail::device_synchronize(queue_);
+    });
 }
 
 template <typename T>
 void device_ptr<T>::fill(const value_type value, const size_type pos, const size_type count) {
-    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
 
     if (pos >= this->size_padded()) {
         throw backend_exception{ fmt::format("Illegal access in fill!: {} >= {}", pos, this->size_padded()) };
     }
     const size_type rcount = std::min(count, this->size_padded() - pos);
 
-    // create subview of the device data
-    auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + rcount));
-    // fill subview with constant data
-    Kokkos::deep_copy(data_subview, value);
+    data_.execute([&](const auto &data) {
+        // create subview of the device data
+        auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount));
+        // fill subview with constant data
+        Kokkos::deep_copy(data_subview, value);
 
-    detail::device_synchronize(queue_);
+        detail::device_synchronize(queue_);
+    });
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_device(const_host_pointer_type data_to_copy, const size_type pos, const size_type count) {
-    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
 
     const size_type rcount = std::min(count, this->size_padded() - pos);
 
-    // create view of the host data
-    const host_view_type<const T> host_view{ data_to_copy, rcount };
-    // create subview of the device data
-    auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + rcount));
-    // copy the data to the device subview
-    Kokkos::deep_copy(data_subview, host_view);
+    data_.execute([&](const auto &data) {
+        // create view of the host data
+        const host_view_type<const T> host_view{ data_to_copy, rcount };
+        // create subview of the device data
+        auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount));
+        // copy the data to the device subview
+        Kokkos::deep_copy(data_subview, host_view);
 
-    detail::device_synchronize(queue_);
+        detail::device_synchronize(queue_);
+    });
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) {
-    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!");
 
     if (width > spitch) {
@@ -121,25 +142,27 @@ void device_ptr<T>::copy_to_device_strided(const_host_pointer_type data_to_copy,
 
 template <typename T>
 void device_ptr<T>::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const {
-    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
     PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!");
 
     const size_type rcount = std::min(count, this->size_padded() - pos);
 
-    // create view of the host data
-    const host_view_type<T> host_view{ buffer, rcount };
-    // create subview of the device data
-    auto data_subview = Kokkos::subview(data_, std::make_pair(pos, pos + rcount));
-    // copy the data to the host
-    Kokkos::deep_copy(host_view, data_subview);
+    data_.execute([&](const auto &data) {
+        // create view of the host data
+        const host_view_type<T> host_view{ buffer, rcount };
+        // create subview of the device data
+        auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount));
+        // copy the data to the host
+        Kokkos::deep_copy(host_view, data_subview);
 
-    detail::device_synchronize(queue_);
+        detail::device_synchronize(queue_);
+    });
 }
 
 template <typename T>
 void device_ptr<T>::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const {
-    PLSSVM_ASSERT(data_ != device_view_type<T>{}, "Invalid data pointer! Maybe *this has been default constructed?");
-    PLSSVM_ASSERT(target.get() != device_view_type<T>{}, "Invalid target pointer! Maybe target has been default constructed?");
+    PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?");
+    PLSSVM_ASSERT(target.get() != device_pointer_type{}, "Invalid target pointer! Maybe target has been default constructed?");
 
     const size_type rcount = std::min(count, this->size_padded() - pos);
     if (target.size_padded() < rcount) {
diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
new file mode 100644
index 000000000..add12def4
--- /dev/null
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -0,0 +1,106 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"
+
+#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
+#include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
+#include "plssvm/detail/logging_without_performance_tracking.hpp"   // plssvm::detail::log_untracked
+#include "plssvm/detail/utility.hpp"                                // plssvm::detail::unreachable
+#include "plssvm/target_platforms.hpp"                              // plssvm::target_platform
+#include "plssvm/verbosity_levels.hpp"                              // plssvm::verbosity_level
+
+#include "Kokkos_Core.hpp"  // Kokkos::num_devices, Kokkos::ExecutionSpace
+
+#include <vector>  // std::vector
+
+namespace plssvm::kokkos::detail {
+
+std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe_unused]] const target_platform target) {
+    std::vector<device_wrapper> devices{};
+    switch (space) {
+        case execution_space::cuda:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
+                for (int device = 0; device < Kokkos::num_devices(); ++device) {
+                    // create CUDA stream using the CUDA specific functions
+                    cudaSetDevice(device);
+                    cudaStream_t stream{};
+                    cudaStreamCreate(&stream);
+                    // create Kokkos execution space for the specific device
+                    // Note: it is important to pass the cudaStream_t lifetime to be managed by Kokkos
+                    devices.emplace_back(Kokkos::Cuda(stream, Kokkos::Impl::ManageStream::yes));
+                }
+                return devices;
+            });
+        case execution_space::hip:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+                for (int device = 0; device < Kokkos::num_devices(); ++device) {
+                    // HIP CUDA stream using the HIP specific functions
+                    hipSetDevice(device);
+                    hipStream_t stream{};
+                    hipStreamCreate(&stream);
+                    // create Kokkos execution space for the specific device
+                    // Note: it is important to pass the hipStream_t lifetime to be managed by Kokkos
+                    devices.emplace_back(Kokkos::Hip(stream, Kokkos::Impl::ManageStream::yes));
+                }
+                return devices;
+            });
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+                // TODO: use all available devices -> not that trivial
+                // TODO: handle target <- if provide queue -> managed?
+                devices.emplace_back(Kokkos::SYCL{});
+                return devices;
+            });
+        case execution_space::hpx:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX([&]() {
+                devices.emplace_back(Kokkos::Hpx{});
+                return devices;
+            });
+        case execution_space::openmp:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP([&]() {
+                // Note: if OpenMP should be used as device  must be set in order for it to work!
+                if (omp_get_nested() == 0) {
+                    ::plssvm::detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                                    "WARNING: In order for Kokkos::OpenMP to work properly, we have to set \"omp_set_nested(1)\"!\n");
+                    // enable OMP_NESTED support
+                    // Note: function is officially deprecated but still necessary for Kokkos::OpenMP to work properly
+                    omp_set_nested(1);
+                }
+                devices.emplace_back(Kokkos::OpenMP{});
+                return devices;
+            });
+        case execution_space::openmp_target:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET([&]() {
+                // TODO: multi-GPU?
+                devices.emplace_back(Kokkos::OpenMPTarget{});
+                return devices;
+            });
+        case execution_space::openacc:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC([&]() {
+                // TODO: multi-GPU?
+                devices.emplace_back(Kokkos::OpenACC{});
+                return devices;
+            });
+        case execution_space::threads:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS([&]() {
+                devices.emplace_back(Kokkos::Threads{});
+                return devices;
+            });
+        case execution_space::serial:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL([&]() {
+                devices.emplace_back(Kokkos::Serial{});
+                return devices;
+            });
+    }
+    // all possible cases should be handled by the previous switch
+    // -> silence missing return statement compiler warnings due to throw statement
+    ::plssvm::detail::unreachable();
+}
+
+}  // namespace plssvm::kokkos::detail
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index ac49ef532..f8521523b 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -9,160 +9,108 @@
 #include "plssvm/backends/Kokkos/detail/utility.hpp"
 
 #include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
-#include "plssvm/backends/Kokkos/exceptions.hpp"                    // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"         // plssvm::kokkos::detail::device_wrapper
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
 #include "plssvm/detail/assert.hpp"                                 // PLSSVM_ASSERT
-#include "plssvm/detail/utility.hpp"                                // plssvm::detail::unreachable
+#include "plssvm/detail/string_utility.hpp"                         // plssvm::detail::as_lower_case
+#include "plssvm/detail/utility.hpp"                                // plssvm::detail::contains
 #include "plssvm/target_platforms.hpp"                              // plssvm::target_platform
 
-#include "Kokkos_Core.hpp"    // Kokkos::DefaultExecutionSpace, Kokkos::num_devices, Kokkos::Cuda, Kokkos::Hip, Kokkos::Sycl, Kokkos::Impl::ManageStream
+#include "Kokkos_Core.hpp"    // Kokkos::ExecutionSpace, Kokkos::Impl::ManageStream
 #include "Kokkos_Macros.hpp"  // Kokkos macros
 
 #include "fmt/core.h"  // fmt::format
 
+#include <map>     // std::map
 #include <string>  // std::string
 #include <vector>  // std::vector
 
 namespace plssvm::kokkos::detail {
 
-target_platform determine_default_target_platform_from_execution_space(const execution_space space) {
-    switch (space) {
-        case execution_space::cuda:
-            return target_platform::gpu_nvidia;
-        case execution_space::hip:
-            return target_platform::gpu_amd;  // TODO: or gpu_nvidia :/
-        case execution_space::sycl:
-        case execution_space::openmp_target:
-        case execution_space::openacc:
-            return target_platform::gpu_nvidia;  // TODO: what to return here?
-        case execution_space::openmp:
-        case execution_space::hpx:
-        case execution_space::threads:
-        case execution_space::serial:
-            return target_platform::cpu;
+std::map<target_platform, std::vector<execution_space>> available_target_platform_to_execution_space_mapping() {
+    std::map<target_platform, std::vector<execution_space>> available_map{};
+
+    // TODO: only return really POSSIBLE target platforms?
+    // iterate over all available execution spaces
+    for (const execution_space space : list_available_execution_spaces()) {
+        switch (space) {
+            case execution_space::cuda:
+                // NVIDIA GPUs only
+                available_map[target_platform::gpu_nvidia].push_back(execution_space::cuda);
+                break;
+            case execution_space::hip:
+                // NVIDIA and AMD GPUs possible
+                available_map[target_platform::gpu_nvidia].push_back(execution_space::hip);
+                available_map[target_platform::gpu_amd].push_back(execution_space::hip);
+                break;
+            case execution_space::sycl:
+            case execution_space::openacc:
+                // all GPUs and CPU possible
+                available_map[target_platform::gpu_nvidia].push_back(execution_space::sycl);
+                available_map[target_platform::gpu_amd].push_back(execution_space::sycl);
+                available_map[target_platform::gpu_intel].push_back(execution_space::sycl);
+                available_map[target_platform::cpu].push_back(execution_space::sycl);
+                break;
+            case execution_space::openmp_target:
+                // all GPUs
+                available_map[target_platform::gpu_nvidia].push_back(execution_space::openmp_target);
+                available_map[target_platform::gpu_amd].push_back(execution_space::openmp_target);
+                available_map[target_platform::gpu_intel].push_back(execution_space::openmp_target);
+                break;
+            case execution_space::hpx:
+            case execution_space::openmp:
+            case execution_space::threads:
+            case execution_space::serial:
+                // all these execution spaces are CPU only
+                available_map[target_platform::cpu].push_back(space);
+                break;
+        }
     }
-    // all possible cases should be handled by the previous switch
-    // -> silence missing return statement compiler warnings due to throw statement
-    ::plssvm::detail::unreachable();
-}
 
-void check_execution_space_target_platform_combination(const execution_space space, const target_platform target) {
-    PLSSVM_ASSERT(target != target_platform::automatic, "The provided target platform may not be the automatic target platform!");
+    // the map must at least have one entry
+    PLSSVM_ASSERT(!available_map.empty(), "At least one target platform must be available!");
+    // the automatic target platform must not be present
+    PLSSVM_ASSERT(!::plssvm::detail::contains(available_map, target_platform::automatic), "The automatic target platform may not be present!");
 
-    switch (space) {
-        case execution_space::cuda:
-            if (target != target_platform::gpu_nvidia) {
-                throw backend_exception{ fmt::format("The target platform {} is not supported for Kokkos {} execution space!", target, space) };
-            }
-            break;
-        case execution_space::hip:
-            if (target != target_platform::gpu_amd && target != target_platform::gpu_nvidia) {
-                throw backend_exception{ fmt::format("The target platform {} is not supported for Kokkos {} execution space!", target, space) };
-            }
-            break;
-        case execution_space::sycl:
-            // SYCL may support all target platforms!
-            // TODO: use SYCL specific functions to check?
-        case execution_space::openmp_target:
-            // OpenMP Target Offloading may support all target platforms!
-            // TODO: use OpenMP Target Offloading specific functions to check?
-        case execution_space::openacc:
-            // OpenACC may support all target platforms!
-            // TODO: use OpenACC Target Offloading specific functions to check?
-            break;
-        case execution_space::openmp:
-        case execution_space::hpx:
-        case execution_space::threads:
-        case execution_space::serial:
-            if (target != target_platform::cpu) {
-                throw backend_exception{ fmt::format("The target platform {} is not supported for Kokkos {} execution space!", target, space) };
-            }
-            break;
-    }
+    return available_map;
 }
 
-std::vector<Kokkos::DefaultExecutionSpace> get_device_list(const execution_space space, [[maybe_unused]] const target_platform target) {
-    std::vector<Kokkos::DefaultExecutionSpace> devices{};
-    switch (space) {
+std::string get_device_name([[maybe_unused]] const device_wrapper &dev) {
+    switch (dev.get_execution_space()) {
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
-                for (int device = 0; device < Kokkos::num_devices(); ++device) {
-                    // create CUDA stream using the CUDA specific functions
-                    cudaSetDevice(device);
-                    cudaStream_t stream{};
-                    cudaStreamCreate(&stream);
-                    // create Kokkos execution space for the specific device
-                    // Note: it is important to pass the cudaStream_t lifetime to be managed by Kokkos
-                    devices.emplace_back(Kokkos::Cuda(stream, Kokkos::Impl::ManageStream::yes));
-                }
-                return devices;
+                return std::string{ dev.get<execution_space::cuda>().cuda_device_prop().name };
             });
         case execution_space::hip:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
-                for (int device = 0; device < Kokkos::num_devices(); ++device) {
-                    // HIP CUDA stream using the HIP specific functions
-                    hipSetDevice(device);
-                    hipStream_t stream{};
-                    hipStreamCreate(&stream);
-                    // create Kokkos execution space for the specific device
-                    // Note: it is important to pass the hipStream_t lifetime to be managed by Kokkos
-                    devices.emplace_back(Kokkos::Hip(stream, Kokkos::Impl::ManageStream::yes));
-                }
-                return devices;
+                return std::string{ dev.get<execution_space::hip>().hip_device_prop().name };
             });
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
-                // TODO: use all available devices -> not that trivial
-                // TODO: handle target
-                devices.emplace_back(Kokkos::SYCL{});
-                return devices;
+                return dev.get<execution_space::sycl>().sycl_queue.get_device().get_info<sycl::info::device::name>();
             });
-        case execution_space::openmp:
         case execution_space::hpx:
-        case execution_space::threads:
-        case execution_space::serial:
-            devices.emplace_back(Kokkos::DefaultExecutionSpace{});
-            return devices;
-        case execution_space::openmp_target:
-        case execution_space::openacc:
-            // TODO: implement
-            throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space) };
-    }
-    // all possible cases should be handled by the previous switch
-    // -> silence missing return statement compiler warnings due to throw statement
-    ::plssvm::detail::unreachable();
-}
-
-std::string get_device_name(const execution_space space, [[maybe_unused]] const Kokkos::DefaultExecutionSpace &exec) {
-    // TODO: implement for other backends!
-    switch (space) {
-        case execution_space::cuda:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
-                return std::string{ exec.cuda_device_prop().name };
-            });
-        case execution_space::hip:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
-                return std::string{ exec.hip_device_prop().name };
-            });
-        case execution_space::sycl:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
-                return exec.sycl_queue.get_device().get_info<sycl::info::device::name>();
-            });
+            return "HPX CPU host device";
         case execution_space::openmp:
-        case execution_space::hpx:
-        case execution_space::threads:
-        case execution_space::serial:
-            return "CPU host device";
+            return "OpenMP CPU host device";
         case execution_space::openmp_target:
+            // TODO: device name?
             return "OpenMP target device";
         case execution_space::openacc:
+            // TODO: device name?
             return "OpenACC target device";
+        case execution_space::threads:
+            return "std::threads CPU host device";
+        case execution_space::serial:
+            return "serial CPU host device";
     }
     return "unknown";
 }
 
-void device_synchronize(const Kokkos::DefaultExecutionSpace &exec) {
-    exec.fence();
+void device_synchronize(const device_wrapper &dev) {
+    dev.execute([](const auto &device) {
+        device.fence();
+    });
 }
 
 std::string get_kokkos_version() {
diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp
index 2f3472aa8..6179c496d 100644
--- a/src/plssvm/backends/Kokkos/execution_space.cpp
+++ b/src/plssvm/backends/Kokkos/execution_space.cpp
@@ -8,18 +8,14 @@
 
 #include "plssvm/backends/Kokkos/execution_space.hpp"
 
-#include "plssvm/detail/assert.hpp"          // PLSSVM_ASSERT
 #include "plssvm/detail/string_utility.hpp"  // plssvm::detail::to_lower_case
-#include "plssvm/detail/utility.hpp"         // plssvm::detail::unreachable
 
-#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace, Kokkos macros, Kokkos ExecutionSpace types
-
-#include <ios>          // std::ios::failbit
-#include <istream>      // std::istream
-#include <ostream>      // std::ostream
-#include <string>       // std::string
-#include <type_traits>  // std::is_same_v
-#include <vector>       // std::vector
+#include <array>    // std::array
+#include <ios>      // std::ios::failbit
+#include <istream>  // std::istream
+#include <ostream>  // std::ostream
+#include <string>   // std::string
+#include <vector>   // std::vector
 
 namespace plssvm::kokkos {
 
@@ -76,91 +72,9 @@ std::istream &operator>>(std::istream &in, execution_space &space) {
     return in;
 }
 
-execution_space determine_default_execution_space() noexcept {
-    // determine the execution_space enumeration value based on the provided Kokkos execution space
-#if defined(KOKKOS_ENABLE_CUDA)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Cuda>) {
-        return execution_space::cuda;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::HIP>) {
-        return execution_space::hip;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_SYCL)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::SYCL>) {
-        return execution_space::sycl;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_HPX)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Experimental::HPX>) {
-        return execution_space::hpx;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENMP)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::OpenMP>) {
-        return execution_space::openmp;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::OpenMPTarget>) {
-        return execution_space::openmp_target;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_OPENACC)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Experimental::OpenACC>) {
-        return execution_space::openacc;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_THREADS)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Threads>) {
-        return execution_space::threads;
-    }
-#endif
-#if defined(KOKKOS_ENABLE_SERIAL)
-    if constexpr (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Serial>) {
-        return execution_space::serial;
-    }
-#endif
-    // at least one execution space must always be available!
-    ::plssvm::detail::unreachable();
-}
-
-[[nodiscard]] std::vector<execution_space> available_execution_spaces() {
-    std::vector<execution_space> available_spaces{};
-#if defined(KOKKOS_ENABLE_CUDA)
-    available_spaces.push_back(execution_space::cuda);
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-    available_spaces.push_back(execution_space::hip);
-#endif
-#if defined(KOKKOS_ENABLE_SYCL)
-    available_spaces.push_back(execution_space::sycl);
-#endif
-#if defined(KOKKOS_ENABLE_HPX)
-    available_spaces.push_back(execution_space::hpx);
-#endif
-#if defined(KOKKOS_ENABLE_OPENMP)
-    available_spaces.push_back(execution_space::openmp);
-#endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    available_spaces.push_back(execution_space::openmp_target);
-#endif
-#if defined(KOKKOS_ENABLE_OPENACC)
-    available_spaces.push_back(execution_space::openacc);
-#endif
-#if defined(KOKKOS_ENABLE_THREADS)
-    available_spaces.push_back(execution_space::threads);
-#endif
-#if defined(KOKKOS_ENABLE_SERIAL)
-    available_spaces.push_back(execution_space::serial);
-#endif
-
-    // AT LEAST ONE execution space must ALWAYS be available
-    PLSSVM_ASSERT(!available_spaces.empty(), "Aat least one execution space must always be available!");
-
-    return available_spaces;
+std::vector<execution_space> list_available_execution_spaces() {
+    constexpr auto arr = detail::constexpr_available_execution_spaces();
+    return std::vector<execution_space>(arr.cbegin(), arr.cend());
 }
 
 }  // namespace plssvm::kokkos
diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index f6925207f..e0686c2e4 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -10,7 +10,8 @@ set(PLSSVM_KOKKOS_TEST_NAME Kokkos_tests)
 # list all necessary sources
 set(PLSSVM_KOKKOS_TEST_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
-    ${CMAKE_CURRENT_LIST_DIR}/detail/typedefs.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/device_view_wrapper.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/detail/device_wrapper.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/standard_layout_tuple.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp
diff --git a/tests/backends/Kokkos/detail/device_ptr.cpp b/tests/backends/Kokkos/detail/device_ptr.cpp
index 34b83eefa..c96a1ed87 100644
--- a/tests/backends/Kokkos/detail/device_ptr.cpp
+++ b/tests/backends/Kokkos/detail/device_ptr.cpp
@@ -10,6 +10,8 @@
 
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"  // plssvm::kokkos::detail::device_ptr
 
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
+
 #include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
 
 #include "tests/backends/generic_device_ptr_tests.hpp"  // generic device pointer tests to instantiate
@@ -23,10 +25,10 @@
 template <typename T>
 struct kokkos_device_ptr_test_type {
     using device_ptr_type = plssvm::kokkos::detail::device_ptr<T>;
-    using queue_type = Kokkos::DefaultExecutionSpace;
+    using queue_type = plssvm::kokkos::detail::device_wrapper;
 
     static const queue_type &default_queue() {
-        static const queue_type queue{};
+        static const queue_type queue{ Kokkos::DefaultExecutionSpace{} };
         return queue;
     }
 };
diff --git a/tests/backends/Kokkos/detail/device_view_wrapper.cpp b/tests/backends/Kokkos/detail/device_view_wrapper.cpp
new file mode 100644
index 000000000..026daaf1e
--- /dev/null
+++ b/tests/backends/Kokkos/detail/device_view_wrapper.cpp
@@ -0,0 +1,77 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the device_view_wrapper class.
+ */
+
+#include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp"
+
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v}
+
+#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace, Kokkos::View
+
+#include "gtest/gtest.h"  // TEST, EXPECT_EQ, EXPECT_TRUE, EXPECT_FALSE
+
+TEST(KokkosDeviceViewWrapper, default_construct) {
+    // default construct a device view wrapper
+    const plssvm::kokkos::detail::device_view_wrapper<double *> view{};
+
+    // per std::variant specification, the first type in the underlying variant is now the active member
+    // -> this always corresponds to the first entry in our constexpr_available_execution_spaces array
+    constexpr auto spaces = plssvm::kokkos::detail::constexpr_available_execution_spaces();
+    EXPECT_EQ(view.get_execution_space(), spaces.front());
+}
+
+TEST(KokkosDeviceViewWrapper, construct) {
+    // construct a device view wrapper using the current Kokkos::DefaultExecutionSpace
+    const plssvm::kokkos::detail::device_view_wrapper view{ Kokkos::View<double *, Kokkos::DefaultExecutionSpace>{} };
+
+    // check that the device view is associated with the correct execution space
+    EXPECT_EQ(view.get_execution_space(), plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>);
+}
+
+TEST(KokkosDeviceViewWrapper, get) {
+    // construct a device view wrapper using the current Kokkos::DefaultExecutionSpace
+    plssvm::kokkos::detail::device_view_wrapper view{ Kokkos::View<double *, Kokkos::DefaultExecutionSpace>{} };
+
+    // check that the returned Kokkos::View has the correct type
+    constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>;
+    ::testing::StaticAssertTypeEq<decltype(view.get<space>()), Kokkos::View<double *, Kokkos::DefaultExecutionSpace> &>();
+}
+
+TEST(KokkosDeviceViewWrapper, get_const) {
+    // construct a device view wrapper using the current Kokkos::DefaultExecutionSpace
+    const plssvm::kokkos::detail::device_view_wrapper view{ Kokkos::View<int **, Kokkos::DefaultExecutionSpace>{} };
+
+    // check that the returned Kokkos::View has the correct type
+    constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>;
+    ::testing::StaticAssertTypeEq<decltype(view.get<space>()), const Kokkos::View<int **, Kokkos::DefaultExecutionSpace> &>();
+}
+
+TEST(KokkosDeviceViewWrapper, get_execution_space) {
+    // construct a device wrapper using the current Kokkos::DefaultExecutionSpace
+    const plssvm::kokkos::detail::device_view_wrapper view{ Kokkos::View<double *, Kokkos::DefaultExecutionSpace>{} };
+
+    // check that the device view is associated with the correct execution space
+    EXPECT_EQ(view.get_execution_space(), plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>);
+}
+
+TEST(KokkosDeviceViewWrapper, equality) {
+    const plssvm::kokkos::detail::device_view_wrapper view1{ Kokkos::View<double *, Kokkos::DefaultExecutionSpace>{} };
+    const plssvm::kokkos::detail::device_view_wrapper view2{ Kokkos::View<double *, Kokkos::DefaultExecutionSpace>{} };
+
+    // should be equal
+    EXPECT_TRUE(view1 == view2);
+}
+
+TEST(KokkosDeviceViewWrapper, inequality) {
+    const plssvm::kokkos::detail::device_view_wrapper view1{ Kokkos::View<double *, Kokkos::DefaultExecutionSpace>{} };
+    const plssvm::kokkos::detail::device_view_wrapper view2{ Kokkos::View<double *, Kokkos::DefaultExecutionSpace>{} };
+
+    // should not be unequal
+    EXPECT_FALSE(view1 != view2);
+}
diff --git a/tests/backends/Kokkos/detail/device_wrapper.cpp b/tests/backends/Kokkos/detail/device_wrapper.cpp
new file mode 100644
index 000000000..4547281ff
--- /dev/null
+++ b/tests/backends/Kokkos/detail/device_wrapper.cpp
@@ -0,0 +1,115 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the device_wrapper class.
+ */
+
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"
+
+#include "plssvm/backends/Kokkos/detail/utility.hpp"   // plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v}
+#include "plssvm/detail/utility.hpp"                   // plssvm::detail::contains
+#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
+
+#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
+
+#include "tests/utility.hpp"  // util::for_each_variant_type
+
+#include "gtest/gtest.h"  // TEST, EXPECT_GE, EXPECT_EQ
+
+#include <vector>  // std::vector
+
+TEST(KokkosDeviceWrapper, default_construct) {
+    // default construct a device wrapper
+    const plssvm::kokkos::detail::device_wrapper device{};
+
+    // per std::variant specification, the first type in the underlying variant is now the active member
+    // -> this always corresponds to the first entry in our constexpr_available_execution_spaces array
+    constexpr auto spaces = plssvm::kokkos::detail::constexpr_available_execution_spaces();
+    EXPECT_EQ(device.get_execution_space(), spaces.front());
+}
+
+TEST(KokkosDeviceWrapper, construct) {
+    // construct a device wrapper using the current Kokkos::DefaultExecutionSpace
+    const plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} };
+
+    // check that the device is associated with the correct execution space
+    EXPECT_EQ(device.get_execution_space(), plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>);
+}
+
+TEST(KokkosDeviceWrapper, get) {
+    // construct a device wrapper using the current Kokkos::DefaultExecutionSpace
+    plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} };
+
+    // check that the returned Kokkos::ExecutionSpace has the correct type
+    constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>;
+    ::testing::StaticAssertTypeEq<decltype(device.get<space>()), Kokkos::DefaultExecutionSpace &>();
+}
+
+TEST(KokkosDeviceWrapper, get_const) {
+    // construct a device wrapper using the current Kokkos::DefaultExecutionSpace
+    const plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} };
+
+    // check that the returned Kokkos::ExecutionSpace has the correct type
+    constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>;
+    ::testing::StaticAssertTypeEq<decltype(device.get<space>()), const Kokkos::DefaultExecutionSpace &>();
+}
+
+TEST(KokkosDeviceWrapper, get_execution_space) {
+    // construct a device wrapper using the current Kokkos::DefaultExecutionSpace
+    const plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} };
+
+    // check that the device is associated with the correct execution space
+    EXPECT_EQ(device.get_execution_space(), plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>);
+}
+
+TEST(KokkosDeviceWrapper, equality) {
+    const plssvm::kokkos::detail::device_wrapper device1{ Kokkos::DefaultExecutionSpace{} };
+    const plssvm::kokkos::detail::device_wrapper device2{ Kokkos::DefaultExecutionSpace{} };
+
+    // should be equal
+    EXPECT_TRUE(device1 == device2);
+}
+
+TEST(KokkosDeviceWrapper, inequality) {
+    const plssvm::kokkos::detail::device_wrapper device1{ Kokkos::DefaultExecutionSpace{} };
+    const plssvm::kokkos::detail::device_wrapper device2{ Kokkos::DefaultExecutionSpace{} };
+
+    // should not be unequal
+    EXPECT_FALSE(device1 != device2);
+}
+
+struct device_list_test {
+    template <typename ExecutionSpace>
+    void operator()() const {
+        // get the default device list
+        const plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v<ExecutionSpace>;
+        plssvm::target_platform default_target{};
+        for (const auto &[target, spaces] : plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping()) {
+            if (::plssvm::detail::contains(spaces, space)) {
+                default_target = target;
+                break;
+            }
+        }
+        const std::vector<plssvm::kokkos::detail::device_wrapper> devices = plssvm::kokkos::detail::get_device_list(space, default_target);
+
+        // check the number of returned devices
+        if (space == plssvm::kokkos::execution_space::cuda || space == plssvm::kokkos::execution_space::hip || space == plssvm::kokkos::execution_space::sycl) {
+            // TODO: OpenMP Target Offloading / OpenACC
+            // for the device execution spaces AT LEAST ONE device must be found
+            EXPECT_GE(devices.size(), 1);
+        } else {
+            // for all other execution spaces EXACTLY ONE device must be found
+            EXPECT_EQ(devices.size(), 1);
+        }
+    }
+};
+
+TEST(KokkosDeviceWrapper, get_device_list) {
+    using variant_type = typename plssvm::kokkos::detail::impl::create_device_variant_type::type;
+    util::for_each_variant_type<variant_type>(device_list_test{});
+}
diff --git a/tests/backends/Kokkos/detail/typedefs.cpp b/tests/backends/Kokkos/detail/typedefs.cpp
deleted file mode 100644
index 4e25d4a6c..000000000
--- a/tests/backends/Kokkos/detail/typedefs.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief Tests for the Kokkos::View typedefs.
- */
-
-#include "plssvm/backends/Kokkos/detail/typedefs.hpp"  // plssvm::kokkos::detail::{device_view_type, host_view_type}
-
-#include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::DefaultExecutionSpace, Kokkos::HostSpace, Kokkos::MemoryUnmanaged
-
-#include "gtest/gtest.h"  // TEST, ::testing::StaticAssertTypeEq
-
-TEST(KokkosTypedefs, device_view_type) {
-    // test device view typedefs
-    ::testing::StaticAssertTypeEq<Kokkos::View<int *, Kokkos::DefaultExecutionSpace>, plssvm::kokkos::detail::device_view_type<int>>();
-    ::testing::StaticAssertTypeEq<Kokkos::View<const unsigned *, Kokkos::DefaultExecutionSpace>, plssvm::kokkos::detail::device_view_type<const unsigned>>();
-}
-
-TEST(KokkosTypedefs, host_view_type) {
-    // test host view typedefs
-    ::testing::StaticAssertTypeEq<Kokkos::View<double *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>, plssvm::kokkos::detail::host_view_type<double>>();
-    ::testing::StaticAssertTypeEq<Kokkos::View<const float *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>, plssvm::kokkos::detail::host_view_type<const float>>();
-}
diff --git a/tests/backends/Kokkos/detail/utility.cpp b/tests/backends/Kokkos/detail/utility.cpp
index ab49f1034..7c6d491d5 100644
--- a/tests/backends/Kokkos/detail/utility.cpp
+++ b/tests/backends/Kokkos/detail/utility.cpp
@@ -10,102 +10,69 @@
 
 #include "plssvm/backends/Kokkos/detail/utility.hpp"
 
-#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
-#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
-#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/exceptions.hpp"             // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v}
+#include "plssvm/detail/utility.hpp"                         // plssvm::detail::contains
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
 
-#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
+#include "Kokkos_Core.hpp"  // Kokkos::ExecutionSpace
 
 #include "tests/custom_test_macros.hpp"  // EXPECT_THROW_WHAT
+#include "tests/utility.hpp"             // util::for_each_variant_type
 
 #include "fmt/core.h"     // fmt::format
 #include "gmock/gmock.h"  // EXPECT_THAT; ::testing::AnyOf
-#include "gtest/gtest.h"  // TEST, EXPECT_GE, EXPECT_NE
-
-#include <regex>   // std::regex, std::regex::extended, std::regex_match
-#include <string>  // std::string
-#include <vector>  // std::vector
-
-TEST(KokkosUtility, determine_default_target_platform_from_execution_space) {
-    // determine the potential default target platform
-    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::cuda), plssvm::target_platform::gpu_nvidia);
-    EXPECT_THAT(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::hip), ::testing::AnyOf(plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd));
-    EXPECT_NE(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::sycl), plssvm::target_platform::automatic);
-    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::hpx), plssvm::target_platform::cpu);
-    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::openmp), plssvm::target_platform::cpu);
-    EXPECT_NE(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::openmp_target), plssvm::target_platform::automatic);
-    EXPECT_NE(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::openacc), plssvm::target_platform::automatic);
-    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::threads), plssvm::target_platform::cpu);
-    EXPECT_EQ(plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(plssvm::kokkos::execution_space::serial), plssvm::target_platform::cpu);
+#include "gtest/gtest.h"  // TEST, EXPECT_NE
+
+#include <map>      // std::map
+#include <regex>    // std::regex, std::regex::extended, std::regex_match
+#include <string>   // std::string
+#include <variant>  // std::variant
+#include <vector>   // std::vector
+
+TEST(KokkosUtility, is_type_in_variant) {
+    // check type trait that determines if a type is contained in a type trait
+    using variant_type = std::variant<int, double, bool, std::string>;
+
+    EXPECT_TRUE((plssvm::kokkos::detail::impl::is_type_in_variant_v<int, variant_type>) );
+    EXPECT_TRUE((plssvm::kokkos::detail::impl::is_type_in_variant_v<double, variant_type>) );
+    EXPECT_TRUE((plssvm::kokkos::detail::impl::is_type_in_variant_v<bool, variant_type>) );
+    EXPECT_TRUE((plssvm::kokkos::detail::impl::is_type_in_variant_v<std::string, variant_type>) );
+    EXPECT_FALSE((plssvm::kokkos::detail::impl::is_type_in_variant_v<short, variant_type>) );
+    EXPECT_FALSE((plssvm::kokkos::detail::impl::is_type_in_variant_v<float, variant_type>) );
 }
 
-TEST(KokkosUtility, check_execution_space_target_platform_combination) {
-    // check some execution_space <-> target_platform combinations
-    // the cuda execution space only supports the NVIDIA GPU
-    EXPECT_NO_THROW(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::cuda, plssvm::target_platform::gpu_nvidia));
-    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::cuda, plssvm::target_platform::gpu_amd),
-                      plssvm::kokkos::backend_exception,
-                      "The target platform gpu_amd is not supported for Kokkos Cuda execution space!");
-    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::cuda, plssvm::target_platform::gpu_intel),
-                      plssvm::kokkos::backend_exception,
-                      "The target platform gpu_intel is not supported for Kokkos Cuda execution space!");
-    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::cuda, plssvm::target_platform::cpu),
-                      plssvm::kokkos::backend_exception,
-                      "The target platform cpu is not supported for Kokkos Cuda execution space!");
-
-    // the hip execution space only supports the NVIDIA and AMD GPUs
-    EXPECT_NO_THROW(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::hip, plssvm::target_platform::gpu_nvidia));
-    EXPECT_NO_THROW(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::hip, plssvm::target_platform::gpu_amd));
-    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::hip, plssvm::target_platform::gpu_intel),
-                      plssvm::kokkos::backend_exception,
-                      "The target platform gpu_intel is not supported for Kokkos HIP execution space!");
-    EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(plssvm::kokkos::execution_space::hip, plssvm::target_platform::cpu),
-                      plssvm::kokkos::backend_exception,
-                      "The target platform cpu is not supported for Kokkos HIP execution space!");
+TEST(KokkosUtility, available_target_platform_to_execution_space_mapping) {
+    // get the target_platform <-> execution_space mappings
+    const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> mapping = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
 
-    // TODO: SYCL
-    // TODO: OpenMP target
-    // TODO: OpenACC
+    // the map must not be empty
+    EXPECT_FALSE(mapping.empty());
 
-    // the remaining execution spaces all only support CPUs!
-    for (const plssvm::kokkos::execution_space exec : { plssvm::kokkos::execution_space::hpx, plssvm::kokkos::execution_space::openmp, plssvm::kokkos::execution_space::threads, plssvm::kokkos::execution_space::serial }) {
-        EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(exec, plssvm::target_platform::gpu_nvidia),
-                          plssvm::kokkos::backend_exception,
-                          fmt::format("The target platform gpu_nvidia is not supported for Kokkos {} execution space!", exec));
-        EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(exec, plssvm::target_platform::gpu_amd),
-                          plssvm::kokkos::backend_exception,
-                          fmt::format("The target platform gpu_amd is not supported for Kokkos {} execution space!", exec));
-        EXPECT_THROW_WHAT(plssvm::kokkos::detail::check_execution_space_target_platform_combination(exec, plssvm::target_platform::gpu_intel),
-                          plssvm::kokkos::backend_exception,
-                          fmt::format("The target platform gpu_intel is not supported for Kokkos {} execution space!", exec));
-        EXPECT_NO_THROW(plssvm::kokkos::detail::check_execution_space_target_platform_combination(exec, plssvm::target_platform::cpu));
+    // each vector must at least have one entry + the automatic target platform must not be present
+    for (const auto &[target, spaces] : mapping) {
+        EXPECT_NE(target, plssvm::target_platform::automatic);
+        EXPECT_GE(spaces.size(), 1);
     }
 }
 
-TEST(KokkosUtility, get_device_list) {
-    // get the default device list
-    const plssvm::kokkos::execution_space space = plssvm::kokkos::determine_default_execution_space();
-    const plssvm::target_platform target = plssvm::kokkos::detail::determine_default_target_platform_from_execution_space(space);
-    const std::vector<Kokkos::DefaultExecutionSpace> devices = plssvm::kokkos::detail::get_device_list(space, target);
+struct device_name_test {
+    template <typename ExecutionSpace>
+    void operator()() const {
+        // get the device name of the default Kokkos execution space
+        const std::string name = plssvm::kokkos::detail::get_device_name(plssvm::kokkos::detail::device_wrapper{ ExecutionSpace{} });
+        SCOPED_TRACE(name);
 
-    // check the number of returned devices
-    if (space == plssvm::kokkos::execution_space::cuda || space == plssvm::kokkos::execution_space::hip || space == plssvm::kokkos::execution_space::sycl) {
-        // for the device execution spaces AT LEAST ONE device must be found
-        EXPECT_GE(devices.size(), 1);
-    } else {
-        // for all other execution spaces EXACTLY ONE device must be found
-        EXPECT_EQ(devices.size(), 1);
+        // the returned device name may not be empty or unknown
+        EXPECT_FALSE(name.empty());
+        EXPECT_NE(name, std::string{ "unknown" });
     }
-}
+};
 
 TEST(KokkosUtility, get_device_name) {
-    // get the device name of the default Kokkos execution space
-    const plssvm::kokkos::execution_space space = plssvm::kokkos::determine_default_execution_space();
-    const std::string name = plssvm::kokkos::detail::get_device_name(space, Kokkos::DefaultExecutionSpace{});
-
-    // the returned device name may not be empty or unknown
-    EXPECT_FALSE(name.empty());
-    EXPECT_NE(name, std::string{ "unknown" });
+    using variant_type = typename plssvm::kokkos::detail::impl::create_device_variant_type::type;
+    util::for_each_variant_type<variant_type>(device_name_test{});
 }
 
 TEST(KokkosUtility, get_kokkos_version) {
diff --git a/tests/backends/Kokkos/execution_space.cpp b/tests/backends/Kokkos/execution_space.cpp
index c0cec6f45..2073d1fd4 100644
--- a/tests/backends/Kokkos/execution_space.cpp
+++ b/tests/backends/Kokkos/execution_space.cpp
@@ -68,12 +68,74 @@ TEST(KokkosExecutionSpace, from_string_unknown) {
     EXPECT_TRUE(input.fail());
 }
 
-TEST(KokkosExecutionSpace, determine_execution_space) {
-    // check that "unreachable" is never reached
-    EXPECT_THAT(plssvm::kokkos::determine_default_execution_space(), ::testing::AnyOf(plssvm::kokkos::execution_space::cuda, plssvm::kokkos::execution_space::hip, plssvm::kokkos::execution_space::sycl, plssvm::kokkos::execution_space::hpx, plssvm::kokkos::execution_space::openmp, plssvm::kokkos::execution_space::openmp_target, plssvm::kokkos::execution_space::openacc, plssvm::kokkos::execution_space::threads, plssvm::kokkos::execution_space::serial));
+TEST(KokkosExecutionSpace, execution_space_to_kokkos_type) {
+    // check conversions
+#if defined(KOKKOS_ENABLE_CUDA)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::cuda>, Kokkos::Cuda>();
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::hip>, Kokkos::HIP>();
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::sycl>, Kokkos::SYCL>();
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::hpx>, Kokkos::Experimental::HPX>();
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp>, Kokkos::OpenMP>();
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp_target>, Kokkos::OpenMPTarget>();
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openacc>, Kokkos::OpenACC>();
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::threads>, Kokkos::Threads>();
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::serial>, Kokkos::Serial>();
+#endif
 }
 
-TEST(KokkosExecutionSpace, available_execution_spaces) {
+TEST(KokkosExecutionSpace, kokkos_type_to_execution_space) {
+    // check conversions
+#if defined(KOKKOS_ENABLE_CUDA)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Cuda>, plssvm::kokkos::execution_space::cuda);
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::HIP>, plssvm::kokkos::execution_space::hip);
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::SYCL>, plssvm::kokkos::execution_space::sycl);
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::HPX>, plssvm::kokkos::execution_space::hpx);
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::OpenMP>, plssvm::kokkos::execution_space::openmp);
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::OpenMPTarget>, plssvm::kokkos::execution_space::openmp_target);
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::OpenACC>, plssvm::kokkos::execution_space::openacc);
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Threads>, plssvm::kokkos::execution_space::threads);
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Serial>, plssvm::kokkos::execution_space::serial);
+#endif
+}
+
+TEST(KokkosExecutionSpace, constexpr_available_execution_spaces) {
+    // at least one execution space must always be available
+    EXPECT_FALSE(plssvm::kokkos::detail::constexpr_available_execution_spaces().empty());
+}
+
+TEST(KokkosExecutionSpace, list_available_execution_spaces) {
     // at least one execution space must always be available
-    EXPECT_FALSE(plssvm::kokkos::available_execution_spaces().empty());
+    EXPECT_FALSE(plssvm::kokkos::list_available_execution_spaces().empty());
 }
diff --git a/tests/backends/generic_csvm_tests.hpp b/tests/backends/generic_csvm_tests.hpp
index 562785728..4c5d59738 100644
--- a/tests/backends/generic_csvm_tests.hpp
+++ b/tests/backends/generic_csvm_tests.hpp
@@ -36,6 +36,7 @@
 #include "tests/utility.hpp"                // util::{redirect_output, generate_specific_matrix, construct_from_tuple, flatten, generate_random_matrix}
 
 #include "fmt/format.h"   // fmt::format
+#include "fmt/ranges.h"
 #include "gmock/gmock.h"  // ::testing::HasSubstr
 #include "gtest/gtest.h"  // TYPED_TEST_SUITE_P, TYPED_TEST_P, REGISTER_TYPED_TEST_SUITE_P, EXPECT_EQ, EXPECT_NE, EXPECT_GT, EXPECT_TRUE, EXPECT_DEATH,
                           // ASSERT_EQ, GTEST_SKIP, SUCCEED, ::testing::Test
@@ -891,7 +892,7 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) {
     const mock_csvm_type svm = util::construct_from_tuple<mock_csvm_type>(params, csvm_test_type::additional_arguments);
     const std::size_t num_devices = svm.num_available_devices();
     // be sure to use the correct data distribution
-    svm.data_distribution_ = std::make_unique<plssvm::detail::triangular_data_distribution>(data.num_rows() - 1, 1);
+    svm.data_distribution_ = std::make_unique<plssvm::detail::triangular_data_distribution>(data.num_rows() - 1, num_devices);
 
     // automatic solver type not permitted
     if constexpr (solver == plssvm::solver_type::automatic) {
@@ -1001,7 +1002,7 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) {
     const mock_csvm_type svm = util::construct_from_tuple<mock_csvm_type>(params, csvm_test_type::additional_arguments);
     const std::size_t num_devices = svm.num_available_devices();
     // be sure to use the correct data distribution
-    svm.data_distribution_ = std::make_unique<plssvm::detail::triangular_data_distribution>(data.num_rows() - 1, 1);
+    svm.data_distribution_ = std::make_unique<plssvm::detail::triangular_data_distribution>(data.num_rows() - 1, num_devices);
 
     // automatic solver type not permitted
     if constexpr (solver == plssvm::solver_type::automatic) {
diff --git a/tests/backends/generic_gpu_csvm_tests.hpp b/tests/backends/generic_gpu_csvm_tests.hpp
index dea31b85c..38264f507 100644
--- a/tests/backends/generic_gpu_csvm_tests.hpp
+++ b/tests/backends/generic_gpu_csvm_tests.hpp
@@ -156,7 +156,7 @@ TYPED_TEST_P(GenericGPUCSVM, run_blas_level_3_kernel_explicit) {
         ground_truth::device_specific_gemm(alpha, full_kernel_matrix, B, correct_C, *svm.data_distribution_, device_id);
 
         // check C for correctness
-        EXPECT_FLOATING_POINT_MATRIX_NEAR(C_res, correct_C);
+        EXPECT_FLOATING_POINT_MATRIX_NEAR_EPS(C_res, correct_C, 1e6);
     }
 }
 
diff --git a/tests/utility.hpp b/tests/utility.hpp
index 8e4f51e4f..61a20451d 100644
--- a/tests/utility.hpp
+++ b/tests/utility.hpp
@@ -46,7 +46,8 @@
 #include <string>       // std::string
 #include <tuple>        // std::tuple, std::make_tuple, std::get, std::tuple_size
 #include <type_traits>  // std::is_floating_point_v, std::is_same_v, std::is_signed_v, std::is_unsigned_v, std::decay_t
-#include <utility>      // std::pair, std::make_pair, std::move, std::make_index_sequence, std::index_sequence
+#include <utility>      // std::pair, std::make_pair, std::move, std::make_index_sequence, std::index_sequence, std::forward
+#include <variant>      // std::variant_size_v, std::variant_alternative_t
 #include <vector>       // std::vector
 
 namespace util {
@@ -694,6 +695,23 @@ template <typename T, typename Tuple>
     return count;
 }
 
+/**
+ * @brief Call the function @p func for each type in the @p Variant.
+ * @brief The function @p func must have a templated overload of the `operator()()` function.
+ * @tparam Variant the type of the std::variant
+ * @tparam Func the type of the function to apply
+ * @tparam Index the current index of the type the function should be applied to
+ * @param[in] func the function
+ */
+template <typename Variant, typename Func, std::size_t Index = 0>
+constexpr void for_each_variant_type(Func &&func) {
+    if constexpr (Index < std::variant_size_v<Variant>) {
+        using T = std::variant_alternative_t<Index, Variant>;
+        func.template operator()<T>();  // Call function with current type
+        for_each_variant_type<Variant, Func, Index + 1>(std::forward<Func>(func));
+    }
+}
+
 }  // namespace util
 
 #endif  // PLSSVM_TESTS_UTILITY_HPP_

From 2b1f0a48a6ab4a73f141683aa2ae78f8d6c97124 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 28 Oct 2024 15:13:14 +0100
Subject: [PATCH 027/123] Update documentation.

---
 docs/CMakeLists.txt     |  1 +
 docs/resources/dirs.dox | 66 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index 3bf366b62..ec8c0c40f 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -32,6 +32,7 @@ set(DOXYGEN_PROJECT_LOGO "${PROJECT_SOURCE_DIR}/docs/resources/logo_90x55.png")
 set(DOXYGEN_EXCLUDE_SYMBOLS "*_HPP_")
 
 set(DOXYGEN_DOT_IMAGE_FORMAT "svg")
+set(DOXYGEN_DOT_GRAPH_MAX_NODES "100")
 set(DOXYGEN_INTERACTIVE_SVG "YES")
 set(DOXYGEN_INCLUDE_GRAPH "NO")
 set(DOXYGEN_EXTRACT_PRIVATE "YES")
diff --git a/docs/resources/dirs.dox b/docs/resources/dirs.dox
index 8c3119aab..de0ce2d6a 100644
--- a/docs/resources/dirs.dox
+++ b/docs/resources/dirs.dox
@@ -153,6 +153,72 @@
  * @brief Directory containing kernel implementations for utility functions using the HIP backend.
  */
 
+/**
+ * @dir include/plssvm/backends/Kokkos
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Directory containing the implementation for the Kokkos backend.
+ */
+
+/**
+ * @dir include/plssvm/backends/Kokkos/detail
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Directory containing implementation details for the Kokkos backend.
+ */
+
+/**
+ * @dir include/plssvm/backends/Kokkos/kernel
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Directory containing all kernels for the Kokkos backend.
+ */
+
+/**
+ * @dir include/plssvm/backends/Kokkos/kernel/cg_explicit
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Directory containing kernel implementations for the explicit CG algorithm using the Kokkos backend.
+ */
+
+/**
+ * @dir include/plssvm/backends/Kokkos/kernel/cg_implicit
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Directory containing kernel implementations for the implicit CG algorithm using the Kokkos backend.
+ */
+
+/**
+ * @dir include/plssvm/backends/Kokkos/kernel/detail
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Directory containing kernel implementations for utility functions using the Kokkos backend.
+ */
+
 /**
  * @dir include/plssvm/backends/OpenCL
  * @author Alexander Van Craen

From c13a4200913c04627e38d6423586fcfd1efe7345 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 29 Oct 2024 09:58:15 +0100
Subject: [PATCH 028/123] Fix compilation error.

---
 .../Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
index b3d46112d..8e42e8b41 100644
--- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp
@@ -151,7 +151,7 @@ class device_kernel_assembly {
                             temp_ij += cost_;
                         }
                         // update the kernel matrix
-                        kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 }; + device_global_i] = temp_ij;
+                        kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij;
                     }
                 }
             }

From f4c744107147d85dbc2462ae30c35fcf02b954f7 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 29 Oct 2024 10:35:28 +0100
Subject: [PATCH 029/123] Fix Kokkos warning regarding implicit conversions.

---
 src/plssvm/backends/Kokkos/detail/device_ptr.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
index dcd0f98d3..167ec027f 100644
--- a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -60,7 +60,6 @@ void device_ptr<T>::memset(const int pattern, const size_type pos, const size_ty
     }
     const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type));
 
-    // TODO: use Kokkos ZeroMemset specialization?
     data_.execute([&](const auto &data) {
         using kokkos_execution_space_type = typename ::plssvm::detail::remove_cvref_t<decltype(data)>::execution_space;
 
@@ -69,7 +68,7 @@ void device_ptr<T>::memset(const int pattern, const size_type pos, const size_ty
         auto p = static_cast<unsigned char>(pattern);
         // memset subview
         Kokkos::parallel_for("device_ptr_memset",
-                             Kokkos::RangePolicy<kokkos_execution_space_type>(0, rnum_bytes),
+                             Kokkos::RangePolicy<kokkos_execution_space_type, size_type>(size_type{ 0 }, rnum_bytes),
                              device_memset_kernel{ data_ptr, p });
 
         detail::device_synchronize(queue_);

From d72f8a67fdaf0428ab17fc901e61bf78a4dd3710 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 29 Oct 2024 12:40:42 +0100
Subject: [PATCH 030/123] Add missing header.

---
 include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
index 3c6f9c8aa..0ca70c2de 100644
--- a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
@@ -19,6 +19,7 @@
 
 #include "Kokkos_MathematicalFunctions.hpp"  // KOKKOS_INLINE_FUNCTION, Kokkos::pow, Kokkos::exp, Kokkos::tanh, Kokkos::abs
 
+#include <float.h>      // LT_MIN, DBL_MIN
 #include <type_traits>  // std::is_same_v
 
 namespace plssvm::kokkos::detail {

From fba8361c58544ea388116f790dcaf5e3a8c625c9 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 31 Oct 2024 16:09:42 +0100
Subject: [PATCH 031/123] Fix problems when using multiple GPUs.

---
 .../Kokkos/detail/device_view_wrapper.hpp     | 61 +++----------------
 .../backends/Kokkos/detail/device_ptr.cpp     | 53 ++++++++++------
 .../Kokkos/detail/device_view_wrapper.cpp     |  2 +
 3 files changed, 44 insertions(+), 72 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
index 1baddcec6..ad8f0ddcf 100644
--- a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
@@ -13,7 +13,9 @@
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_
 
 #include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"         // plssvm::kokkos::detail::device_wrapper
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::{execution_space, execution_space_to_kokkos_type_t}, plssvm::kokkos::detail::constexpr_available_execution_spaces
+#include "plssvm/detail/type_traits.hpp"                            // plssvm::detail::remove_cvref_t
 #include "plssvm/detail/utility.hpp"                                // plssvm::detail::unreachable
 
 #include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::ExecutionSpace
@@ -167,62 +169,17 @@ class device_view_wrapper {
 /**
  * @brief Given a execution @p space and the number of elements @p size, creates a Kokkos::View in the respective memory space.
  * @tparam T the value type of the underlying Kokkos::View
- * @param[in] space the specific execution space
+ * @param[in] device the device for which this view should be allocated
  * @param[in] size the size of the Kokkos::View (number of elements **not** byte!)
  * @return a Kokkos::View wrapper where the active member of the internal `std::variant` corresponds to the Kokkos::View in the Kokkos::ExecutionSpace specified by @p space (`[[nodiscard]]`)
  */
 template <typename T>
-[[nodiscard]] device_view_wrapper<T> make_device_view_wrapper(const execution_space &space, const std::size_t size) {
-    switch (space) {
-        case execution_space::cuda:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(([&]() {
-                return device_view_wrapper{ Kokkos::View<T, Kokkos::Cuda>{ "cuda_device_ptr_view", size } };
-            }));
-            break;
-        case execution_space::hip:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(([&]() {
-                return device_view_wrapper{ Kokkos::View<T, Kokkos::HIP>{ "hip_device_ptr_view", size } };
-            }));
-            break;
-        case execution_space::sycl:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(([&]() {
-                return device_view_wrapper{ Kokkos::View<T, Kokkos::SYCL>{ "sycl_device_ptr_view", size } };
-            }));
-            break;
-        case execution_space::hpx:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(([&]() {
-                return device_view_wrapper{ Kokkos::View<T, Kokkos::Experimental::HPX>{ "hpx_device_ptr_view", size } };
-            }));
-            break;
-        case execution_space::openmp:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(([&]() {
-                return device_view_wrapper{ Kokkos::View<T, Kokkos::OpenMP>{ "openmp_device_ptr_view", size } };
-            }));
-            break;
-        case execution_space::openmp_target:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(([&]() {
-                return device_view_wrapper{ Kokkos::View<T Kokkos::OpenMPTarget>{ "openmptarget_device_ptr_view", size } };
-            }));
-            break;
-        case execution_space::openacc:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(([&]() {
-                return device_view_wrapper{ Kokkos::View<T, Kokkos::Experimental::OpenACC>{ "openacc_device_ptr_view", size } };
-            }));
-            break;
-        case execution_space::threads:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(([&]() {
-                return device_view_wrapper{ Kokkos::View<T, Kokkos::Threads>{ "threads_device_ptr_view", size } };
-            }));
-            break;
-        case execution_space::serial:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(([&]() {
-                return device_view_wrapper{ Kokkos::View<T, Kokkos::Serial>{ "serial_device_ptr_view", size } };
-            }));
-            break;
-    }
-    // all possible cases should be handled by the previous switch
-    // -> silence missing return statement compiler warnings due to throw statement
-    ::plssvm::detail::unreachable();
+[[nodiscard]] device_view_wrapper<T> make_device_view_wrapper(const device_wrapper &device, const std::size_t size) {
+    return device.execute_and_return([&](const auto &value) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(value)>;
+
+        return device_view_wrapper{ Kokkos::View<T, kokkos_execution_space_type>{ Kokkos::view_alloc(value, "device_ptr_view"), size } };
+    });
 }
 
 }  // namespace plssvm::kokkos::detail
diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
index 167ec027f..6c5640870 100644
--- a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -47,7 +47,7 @@ device_ptr<T>::device_ptr(const plssvm::shape shape, const device_wrapper &devic
 template <typename T>
 device_ptr<T>::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const device_wrapper &device) :
     base_type{ shape, padding, device } {
-    data_ = make_device_view_wrapper<T *>(device.get_execution_space(), this->size_padded());
+    data_ = make_device_view_wrapper<T *>(device, this->size_padded());
     this->memset(0);
 }
 
@@ -61,18 +61,25 @@ void device_ptr<T>::memset(const int pattern, const size_type pos, const size_ty
     const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type));
 
     data_.execute([&](const auto &data) {
-        using kokkos_execution_space_type = typename ::plssvm::detail::remove_cvref_t<decltype(data)>::execution_space;
-
         // create subview of the device data
         auto *data_ptr = reinterpret_cast<unsigned char *>(data.data() + pos);
         auto p = static_cast<unsigned char>(pattern);
         // memset subview
-        Kokkos::parallel_for("device_ptr_memset",
-                             Kokkos::RangePolicy<kokkos_execution_space_type, size_type>(size_type{ 0 }, rnum_bytes),
-                             device_memset_kernel{ data_ptr, p });
-
-        detail::device_synchronize(queue_);
+        // TODO: warning?
+        // TODO: if possible, use fill(0) kernel?
+        queue_.execute([&](const auto &exec) {
+            using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(exec)>;
+
+            // create the execution policy
+            const Kokkos::RangePolicy<kokkos_execution_space_type> policy{ exec, size_type{ 0 }, rnum_bytes };
+            // launch the memset kernel
+            Kokkos::parallel_for("device_ptr_memset",
+                                 policy,
+                                 device_memset_kernel{ data_ptr, p });
+        });
     });
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
@@ -87,11 +94,13 @@ void device_ptr<T>::fill(const value_type value, const size_type pos, const size
     data_.execute([&](const auto &data) {
         // create subview of the device data
         auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount));
-        // fill subview with constant data
-        Kokkos::deep_copy(data_subview, value);
-
-        detail::device_synchronize(queue_);
+        queue_.execute([&](const auto &exec) {
+            // fill subview with constant data
+            Kokkos::deep_copy(exec, data_subview, value);
+        });
     });
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
@@ -106,11 +115,13 @@ void device_ptr<T>::copy_to_device(const_host_pointer_type data_to_copy, const s
         const host_view_type<const T> host_view{ data_to_copy, rcount };
         // create subview of the device data
         auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount));
-        // copy the data to the device subview
-        Kokkos::deep_copy(data_subview, host_view);
-
-        detail::device_synchronize(queue_);
+        queue_.execute([&](const auto &exec) {
+            // fill subview with constant data
+            Kokkos::deep_copy(exec, data_subview, host_view);
+        });
     });
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
@@ -151,11 +162,13 @@ void device_ptr<T>::copy_to_host(host_pointer_type buffer, const size_type pos,
         const host_view_type<T> host_view{ buffer, rcount };
         // create subview of the device data
         auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount));
-        // copy the data to the host
-        Kokkos::deep_copy(host_view, data_subview);
-
-        detail::device_synchronize(queue_);
+        queue_.execute([&](const auto &exec) {
+            // fill subview with constant data
+            Kokkos::deep_copy(exec, host_view, data_subview);
+        });
     });
+
+    detail::device_synchronize(queue_);
 }
 
 template <typename T>
diff --git a/tests/backends/Kokkos/detail/device_view_wrapper.cpp b/tests/backends/Kokkos/detail/device_view_wrapper.cpp
index 026daaf1e..c794072b4 100644
--- a/tests/backends/Kokkos/detail/device_view_wrapper.cpp
+++ b/tests/backends/Kokkos/detail/device_view_wrapper.cpp
@@ -75,3 +75,5 @@ TEST(KokkosDeviceViewWrapper, inequality) {
     // should not be unequal
     EXPECT_FALSE(view1 != view2);
 }
+
+// TODO: make_device_view_wrapper
\ No newline at end of file

From 1c75216135b61bb6a85489fa30ddd7dc38444cdb Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 31 Oct 2024 16:10:08 +0100
Subject: [PATCH 032/123] First try implementing map for Kokkos's SYCL backend.

---
 src/plssvm/backends/Kokkos/detail/utility.cpp | 54 +++++++++++++++++--
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index f8521523b..eb02660a6 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -21,9 +21,10 @@
 
 #include "fmt/core.h"  // fmt::format
 
-#include <map>     // std::map
-#include <string>  // std::string
-#include <vector>  // std::vector
+#include <map>            // std::map
+#include <string>         // std::string
+#include <unordered_set>  // std::unordered_set
+#include <vector>         // std::vector
 
 namespace plssvm::kokkos::detail {
 
@@ -39,12 +40,54 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
                 available_map[target_platform::gpu_nvidia].push_back(execution_space::cuda);
                 break;
             case execution_space::hip:
-                // NVIDIA and AMD GPUs possible
-                available_map[target_platform::gpu_nvidia].push_back(execution_space::hip);
+                // NVIDIA or AMD GPUs possible (both simultaneously are unsupported)
+#if defined(KOKKOS_ENABLE_HIP)
+    #if defined(__HIP_PLATFORM_AMD__)
                 available_map[target_platform::gpu_amd].push_back(execution_space::hip);
+    #elif defined(__HIP_PLATFORM_NVIDIA__)
+                available_map[target_platform::gpu_nvidia].push_back(execution_space::hip);
+    #else
+        #error "Unknown HIP platform"
+    #endif
+#endif
                 break;
             case execution_space::sycl:
+                // list all potential target platforms currently available in SYCL
+#if defined(KOKKOS_ENABLE_SYCL)
+                {
+                    std::unordered_set<target_platform> targets{};
+                    for (const auto &platform : sycl::platform::get_platforms()) {
+                        for (const auto &device : platform.get_devices()) {
+                            // Note: Kokkos is Intel LLVM/DPC++/icpx only -> we can use the specific implementation defined enum values
+                            if (device.is_cpu()) {
+                                targets.insert(target_platform::cpu);
+                            } else if (device.is_gpu()) {
+                                // the current device is a GPU
+                                // get vendor string and convert it to all lower case
+                                const std::string vendor_string = ::plssvm::detail::as_lower_case(device.get_info<::sycl::info::device::vendor>());
+                                // get platform name of current GPU device and convert it to all lower case
+                                const std::string platform_string = ::plssvm::detail::as_lower_case(platform.get_info<::sycl::info::platform::name>());
+
+                                // check vendor string and insert to correct target platform
+                                if (::plssvm::detail::contains(vendor_string, "nvidia")) {
+                                    targets.insert(target_platform::gpu_nvidia);
+                                } else if (::plssvm::detail::contains(vendor_string, "amd") || ::plssvm::detail::contains(vendor_string, "advanced micro devices")) {
+                                    targets.insert(target_platform::gpu_amd);
+                                } else if (::plssvm::detail::contains(vendor_string, "intel")) {
+                                    targets.insert(target_platform::gpu_intel);
+                                }
+                            }
+                        }
+                    }
+                    // now we know which target platforms are available in SYCL -> add them to our mapping
+                    for (const target_platform target : targets) {
+                        available_map[target].push_back(execution_space::sycl);
+                    }
+                }
+#endif
+                break;
             case execution_space::openacc:
+                // TODO: restrict to available devices
                 // all GPUs and CPU possible
                 available_map[target_platform::gpu_nvidia].push_back(execution_space::sycl);
                 available_map[target_platform::gpu_amd].push_back(execution_space::sycl);
@@ -52,6 +95,7 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
                 available_map[target_platform::cpu].push_back(execution_space::sycl);
                 break;
             case execution_space::openmp_target:
+                // TODO: restrict to available devices
                 // all GPUs
                 available_map[target_platform::gpu_nvidia].push_back(execution_space::openmp_target);
                 available_map[target_platform::gpu_amd].push_back(execution_space::openmp_target);

From bb89a07a37d005cf2e02146835c005ef93671452 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 31 Oct 2024 16:50:04 +0100
Subject: [PATCH 033/123] Also set Kokkos backend to be OFF per default.

---
 cmake/presets/common.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/presets/common.json b/cmake/presets/common.json
index 3d3d9c4df..85934b051 100644
--- a/cmake/presets/common.json
+++ b/cmake/presets/common.json
@@ -11,7 +11,8 @@
         "PLSSVM_ENABLE_CUDA_BACKEND": "OFF",
         "PLSSVM_ENABLE_HIP_BACKEND": "OFF",
         "PLSSVM_ENABLE_OPENCL_BACKEND": "OFF",
-        "PLSSVM_ENABLE_SYCL_BACKEND": "OFF"
+        "PLSSVM_ENABLE_SYCL_BACKEND": "OFF",
+        "PLSSVM_ENABLE_KOKKOS_BACKEND": "OFF"
       }
     },
     {

From 61b38339366e7622c07d458d4353784c6ebb9be8 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 2 Nov 2024 22:55:18 +0100
Subject: [PATCH 034/123] Add missing device (i.e., Kokkos::ExecutionSpace
 instance) to Kokkos::TeamPolicy constructor.

---
 src/plssvm/backends/Kokkos/csvm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 0a1903c16..b097d42ef 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -546,7 +546,7 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe
 
         for (const auto &[partial_grid, offsets] : exec.grids) {
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
 
             Kokkos::parallel_for("w_kernel", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_w_linear<kokkos_execution_space_type>{ w_d.get().get<space>(), alpha_d.get().get<space>(), sv_d.get().get<space>(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets.x, offsets.y, partial_grid.x });
         }

From 8064363b1283f0813766dd962bae138b9f7e13ee Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 2 Nov 2024 23:11:58 +0100
Subject: [PATCH 035/123] Remove unused include.

---
 include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
index ad8f0ddcf..018edc48e 100644
--- a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
@@ -16,7 +16,6 @@
 #include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"         // plssvm::kokkos::detail::device_wrapper
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::{execution_space, execution_space_to_kokkos_type_t}, plssvm::kokkos::detail::constexpr_available_execution_spaces
 #include "plssvm/detail/type_traits.hpp"                            // plssvm::detail::remove_cvref_t
-#include "plssvm/detail/utility.hpp"                                // plssvm::detail::unreachable
 
 #include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::ExecutionSpace
 

From 3cb4a9940f478931e5705705133bdf60903eb607 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 2 Nov 2024 23:14:48 +0100
Subject: [PATCH 036/123] Use Kokkos::ScopeGuard to be sure that
 Kokkos::finalize is called correctly even in case of an exception.

---
 src/main_predict.cpp | 17 ++++++++---------
 src/main_train.cpp   | 18 ++++++++----------
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index 1fe40d102..bc83ffcfa 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -38,12 +38,19 @@
 #include <functional>  // std::mem_fn
 #include <iostream>    // std::cerr, std::endl
 #include <utility>     // std::pair
+#include <memory>  // std::unique_ptr, std::make_unique
 #include <variant>     // std::visit
 #include <vector>      // std::vector
 
 using namespace std::chrono_literals;
 
 int main(int argc, char *argv[]) {
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    // create std::unique_ptr containing a Kokkos::ScopeGuard
+    // -> used to automatically handle Kokkos::finalize
+    std::unique_ptr<Kokkos::ScopeGuard> kokkos_guard{};
+#endif
+
     try {
         const std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
         PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SET_REFERENCE_TIME(start_time);
@@ -82,7 +89,7 @@ int main(int argc, char *argv[]) {
 
             // initialize Kokkos if necessary
             if (use_kokkos_as_backend) {
-                Kokkos::initialize(argc, argv);  // TODO: set device?
+                kokkos_guard = std::make_unique<Kokkos::ScopeGuard>(argc, argv);
                 PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
             }
 #endif
@@ -161,14 +168,6 @@ int main(int argc, char *argv[]) {
                 PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "accuracy", "num_correct", report.accuracy().num_correct }));
                 PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "accuracy", "num_total", report.accuracy().num_total }));
             }
-
-            // finalize Kokkos if necessary
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-            if (use_kokkos_as_backend) {  // TODO: what if an exception occurred?
-                Kokkos::finalize();
-                PLSSVM_ASSERT(Kokkos::is_finalized(), "Something went wrong finalizing the Kokkos environment!");
-            }
-#endif
         };
         std::visit(data_set_visitor, plssvm::detail::cmd::data_set_factory(cmd_parser));
 
diff --git a/src/main_train.cpp b/src/main_train.cpp
index 1d18d2744..14cf8941b 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -32,7 +32,7 @@
 #include <exception>    // std::exception
 #include <functional>   // std::mem_fn
 #include <iostream>     // std::cerr, std::endl
-#include <memory>       // std::unique_ptr
+#include <memory>       // std::unique_ptr, std::make_unique
 #include <type_traits>  // std::remove_reference_t
 #include <utility>      // std::pair
 #include <variant>      // std::visit
@@ -41,6 +41,12 @@
 using namespace std::chrono_literals;
 
 int main(int argc, char *argv[]) {
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    // create std::unique_ptr containing a Kokkos::ScopeGuard
+    // -> used to automatically handle Kokkos::finalize
+    std::unique_ptr<Kokkos::ScopeGuard> kokkos_guard{};
+#endif
+
     try {
         const std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
         PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SET_REFERENCE_TIME(start_time);
@@ -79,7 +85,7 @@ int main(int argc, char *argv[]) {
 
             // initialize Kokkos if necessary
             if (use_kokkos_as_backend) {
-                Kokkos::initialize(argc, argv);  // TODO: set device?
+                kokkos_guard = std::make_unique<Kokkos::ScopeGuard>(argc, argv);
                 PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
             }
 #endif
@@ -102,14 +108,6 @@ int main(int argc, char *argv[]) {
                                plssvm::solver = cmd_parser.solver);
             // save model to file
             model.save(cmd_parser.model_filename);
-
-            // finalize Kokkos if necessary
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-            if (use_kokkos_as_backend) {  // TODO: what if an exception occurred
-                Kokkos::finalize();
-                PLSSVM_ASSERT(Kokkos::is_finalized(), "Something went wrong finalizing the Kokkos environment!");
-            }
-#endif
         };
         std::visit(data_set_visitor, plssvm::detail::cmd::data_set_factory(cmd_parser));
 

From cf52ebc2b4f57157b501507b7fa514033c58adc6 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 2 Nov 2024 23:22:35 +0100
Subject: [PATCH 037/123] Improve memset implementation.

---
 .../Kokkos/kernel/detail/memset_kernel.hpp    | 56 -------------------
 .../backends/Kokkos/detail/device_ptr.cpp     | 31 ++++------
 2 files changed, 11 insertions(+), 76 deletions(-)
 delete mode 100644 include/plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp

diff --git a/include/plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp
deleted file mode 100644
index 584b1afdd..000000000
--- a/include/plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * @file
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief Defines a Kokkos function object for memsetting a device pointer with a specific value.
- */
-
-#ifndef PLSSVM_BACKENDS_KOKKOS_KERNEL_DETAIL_MEMSET_KERNEL_HPP_
-#define PLSSVM_BACKENDS_KOKKOS_KERNEL_DETAIL_MEMSET_KERNEL_HPP_
-#pragma once
-
-#include "plssvm/constants.hpp"  // plssvm::real_type
-
-#include "Kokkos_Core.hpp"  // KOKKOS_INLINE_FUNCTION
-
-#include <cstddef>  // std::size_t
-
-namespace plssvm::kokkos::detail {
-
-/**
- * @brief A kernel to perform a memset-like operation on a Kokkos::View
- */
-class device_memset_kernel {
-  public:
-    /**
-     * @brief Memset all bytes in @p data to the provided @p pattern.
-     * @param[out] data the array to memset
-     * @param[in] pattern the memset pattern
-     */
-    device_memset_kernel(unsigned char* data, const unsigned char pattern) :
-        data_{ data },
-        pattern_{ pattern } { }
-
-    /**
-     * @brief Function call operator overload performing the actual calculation.
-     * @param[in] idx the index representing the current point in the execution space
-     */
-    KOKKOS_INLINE_FUNCTION
-    void operator()(const std::size_t idx) const {
-        data_[idx] = pattern_;
-    }
-
-  private:
-    /// @cond Doxygen_suppress
-    unsigned char* data_;
-    const unsigned char pattern_;
-    /// @endcond
-};
-
-}  // namespace plssvm::kokkos::detail
-
-#endif  // PLSSVM_BACKENDS_KOKKOS_KERNEL_DETAIL_MEMSET_KERNEL_HPP_
diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
index 6c5640870..0dfe9adc0 100644
--- a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp
@@ -8,14 +8,13 @@
 
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"
 
-#include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp"   // plssvm::kokkos::detail::{device_view_wrapper, make_device_view_wrapper}
-#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"        // plssvm::kokkos::detail::device_wrapper
-#include "plssvm/backends/Kokkos/detail/utility.hpp"               // plssvm::detail::device_synchronize
-#include "plssvm/backends/Kokkos/exceptions.hpp"                   // plssvm::kokkos::backend_exception
-#include "plssvm/backends/Kokkos/kernel/detail/memset_kernel.hpp"  // plssvm::kokkos::detail::device_fill_array
-#include "plssvm/detail/assert.hpp"                                // PLSSVM_ASSERT
-#include "plssvm/detail/type_traits.hpp"                           // plssvm::detail::remove_cvref_t
-#include "plssvm/shape.hpp"                                        // plssvm::shape
+#include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp"  // plssvm::kokkos::detail::{device_view_wrapper, make_device_view_wrapper}
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"       // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/detail/utility.hpp"              // plssvm::detail::device_synchronize
+#include "plssvm/backends/Kokkos/exceptions.hpp"                  // plssvm::kokkos::backend_exception
+#include "plssvm/detail/assert.hpp"                               // PLSSVM_ASSERT
+#include "plssvm/detail/type_traits.hpp"                          // plssvm::detail::remove_cvref_t
+#include "plssvm/shape.hpp"                                       // plssvm::shape
 
 #include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::HostSpace, Kokkos::MemoryUnmanaged, Kokkos::subview, Kokkos::parallel_for, Kokkos::deep_copy
 
@@ -61,21 +60,13 @@ void device_ptr<T>::memset(const int pattern, const size_type pos, const size_ty
     const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type));
 
     data_.execute([&](const auto &data) {
-        // create subview of the device data
-        auto *data_ptr = reinterpret_cast<unsigned char *>(data.data() + pos);
-        auto p = static_cast<unsigned char>(pattern);
-        // memset subview
-        // TODO: warning?
-        // TODO: if possible, use fill(0) kernel?
         queue_.execute([&](const auto &exec) {
             using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(exec)>;
 
-            // create the execution policy
-            const Kokkos::RangePolicy<kokkos_execution_space_type> policy{ exec, size_type{ 0 }, rnum_bytes };
-            // launch the memset kernel
-            Kokkos::parallel_for("device_ptr_memset",
-                                 policy,
-                                 device_memset_kernel{ data_ptr, p });
+            // create view of the device data cast to unsigned char
+            const Kokkos::View<unsigned char *, kokkos_execution_space_type> view{ reinterpret_cast<unsigned char *>(data.data() + pos), rnum_bytes };
+            // fill the view with the pattern -> acts like a memset
+            Kokkos::deep_copy(exec, view, static_cast<unsigned char>(pattern));
         });
     });
 

From 275da8493a5107aac896a5ed66e406ae50717ade Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 2 Nov 2024 23:33:09 +0100
Subject: [PATCH 038/123] Handle the Kokkos backend in some more tests.

---
 tests/backend_types.cpp | 8 +++++++-
 tests/csvm_factory.cpp  | 3 +++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/backend_types.cpp b/tests/backend_types.cpp
index 9975fbbfc..3cd9fa5bc 100644
--- a/tests/backend_types.cpp
+++ b/tests/backend_types.cpp
@@ -38,11 +38,12 @@ TEST(BackendType, to_string) {
     EXPECT_CONVERSION_TO_STRING(plssvm::backend_type::hip, "hip");
     EXPECT_CONVERSION_TO_STRING(plssvm::backend_type::opencl, "opencl");
     EXPECT_CONVERSION_TO_STRING(plssvm::backend_type::sycl, "sycl");
+    EXPECT_CONVERSION_TO_STRING(plssvm::backend_type::kokkos, "kokkos");
 }
 
 TEST(BackendType, to_string_unknown) {
     // check conversions to std::string from unknown backend_type
-    EXPECT_CONVERSION_TO_STRING(static_cast<plssvm::backend_type>(7), "unknown");
+    EXPECT_CONVERSION_TO_STRING(static_cast<plssvm::backend_type>(8), "unknown");
 }
 
 // check whether the std::string -> plssvm::backend_type conversions are correct
@@ -64,6 +65,8 @@ TEST(BackendType, from_string) {
     EXPECT_CONVERSION_FROM_STRING("OpenCL", plssvm::backend_type::opencl);
     EXPECT_CONVERSION_FROM_STRING("sycl", plssvm::backend_type::sycl);
     EXPECT_CONVERSION_FROM_STRING("SYCL", plssvm::backend_type::sycl);
+    EXPECT_CONVERSION_FROM_STRING("Kokkos", plssvm::backend_type::kokkos);
+    EXPECT_CONVERSION_FROM_STRING("KOKKOS", plssvm::backend_type::kokkos);
 }
 
 TEST(BackendType, from_string_unknown) {
@@ -127,6 +130,7 @@ INSTANTIATE_TEST_SUITE_P(BackendType, BackendTypeSupportedCombination, ::testing
          supported_combination_type{ { plssvm::backend_type::hip }, { plssvm::target_platform::cpu, plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd, plssvm::target_platform::gpu_intel }, plssvm::backend_type::hip },
          supported_combination_type{ { plssvm::backend_type::opencl }, { plssvm::target_platform::cpu, plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd, plssvm::target_platform::gpu_intel }, plssvm::backend_type::opencl },
          supported_combination_type{ { plssvm::backend_type::sycl }, { plssvm::target_platform::cpu, plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd, plssvm::target_platform::gpu_intel }, plssvm::backend_type::sycl },
+         supported_combination_type{ { plssvm::backend_type::kokkos }, { plssvm::target_platform::cpu, plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd, plssvm::target_platform::gpu_intel }, plssvm::backend_type::kokkos },
          supported_combination_type{ { plssvm::backend_type::openmp, plssvm::backend_type::cuda, plssvm::backend_type::hip, plssvm::backend_type::opencl, plssvm::backend_type::sycl }, { plssvm::target_platform::cpu }, plssvm::backend_type::sycl },
          supported_combination_type{ { plssvm::backend_type::openmp, plssvm::backend_type::cuda, plssvm::backend_type::hip, plssvm::backend_type::opencl, plssvm::backend_type::sycl }, { plssvm::target_platform::gpu_nvidia }, plssvm::backend_type::cuda },
          supported_combination_type{ { plssvm::backend_type::openmp, plssvm::backend_type::cuda, plssvm::backend_type::hip, plssvm::backend_type::opencl, plssvm::backend_type::sycl }, { plssvm::target_platform::gpu_amd }, plssvm::backend_type::hip },
@@ -144,6 +148,7 @@ TEST(BackendType, csvm_to_backend_type) {
     EXPECT_EQ(plssvm::csvm_to_backend_type<volatile plssvm::sycl::csvm>::value, plssvm::backend_type::sycl);
     EXPECT_EQ(plssvm::csvm_to_backend_type<const volatile plssvm::adaptivecpp::csvm>::value, plssvm::backend_type::sycl);
     EXPECT_EQ(plssvm::csvm_to_backend_type<const volatile plssvm::dpcpp::csvm &>::value, plssvm::backend_type::sycl);
+    EXPECT_EQ(plssvm::csvm_to_backend_type<plssvm::kokkos::csvm>::value, plssvm::backend_type::kokkos);
 
     EXPECT_EQ(plssvm::csvm_to_backend_type<plssvm::adaptivecpp::csvm>::impl, plssvm::sycl::implementation_type::adaptivecpp);
     EXPECT_EQ(plssvm::csvm_to_backend_type<plssvm::dpcpp::csvm>::impl, plssvm::sycl::implementation_type::dpcpp);
@@ -159,4 +164,5 @@ TEST(BackendType, csvm_to_backend_type_v) {
     EXPECT_EQ(plssvm::csvm_to_backend_type_v<volatile plssvm::sycl::csvm>, plssvm::backend_type::sycl);
     EXPECT_EQ(plssvm::csvm_to_backend_type_v<const volatile plssvm::adaptivecpp::csvm>, plssvm::backend_type::sycl);
     EXPECT_EQ(plssvm::csvm_to_backend_type_v<const volatile plssvm::dpcpp::csvm &>, plssvm::backend_type::sycl);
+    EXPECT_EQ(plssvm::csvm_to_backend_type_v<plssvm::kokkos::csvm>, plssvm::backend_type::kokkos);
 }
diff --git a/tests/csvm_factory.cpp b/tests/csvm_factory.cpp
index cb06f6b68..dc365293c 100644
--- a/tests/csvm_factory.cpp
+++ b/tests/csvm_factory.cpp
@@ -60,6 +60,9 @@ std::string GetTypeName<util::test_parameter<util::type_list<plssvm::dpcpp::csvm
 
 template <>
 std::string GetTypeName<util::test_parameter<util::type_list<plssvm::adaptivecpp::csvm>, util::value_list<>>>() { return "sycl_adaptivecpp_csvm"; }
+
+template <>
+std::string GetTypeName<util::test_parameter<util::type_list<plssvm::kokkos::csvm>, util::value_list<>>>() { return "kokkos_csvm"; }
 }  // namespace testing::internal
 
 template <typename T>

From 4d44b56c965c2ccaf313b67fa470cfd778ecf691 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 2 Nov 2024 23:33:59 +0100
Subject: [PATCH 039/123] Move the Kokkos initialization to the general test
 main.cpp file (guarded behind ifdef). Remove Kokkos specific main file.

---
 tests/backends/Kokkos/CMakeLists.txt |  2 +-
 tests/kokkos_main.cpp                | 38 ----------------------
 tests/main.cpp                       | 41 ++++++++++++++++++++++--
 tests/main.hpp                       | 47 ----------------------------
 4 files changed, 40 insertions(+), 88 deletions(-)
 delete mode 100644 tests/kokkos_main.cpp
 delete mode 100644 tests/main.hpp

diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index e0686c2e4..142f72a37 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -23,7 +23,7 @@ set(PLSSVM_KOKKOS_TEST_SOURCES
 find_package(Kokkos REQUIRED)
 
 # add test executable
-add_executable(${PLSSVM_KOKKOS_TEST_NAME} ${CMAKE_CURRENT_LIST_DIR}/../../kokkos_main.cpp ${PLSSVM_KOKKOS_TEST_SOURCES})
+add_executable(${PLSSVM_KOKKOS_TEST_NAME} ${CMAKE_CURRENT_LIST_DIR}/../../main.cpp ${PLSSVM_KOKKOS_TEST_SOURCES})
 
 # link against test library
 target_link_libraries(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE ${PLSSVM_BASE_TEST_LIBRARY_NAME})
diff --git a/tests/kokkos_main.cpp b/tests/kokkos_main.cpp
deleted file mode 100644
index 1edfbb9fe..000000000
--- a/tests/kokkos_main.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * @file
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief Contains the googletest main function. Sets the DeathTest to "threadsafe" execution instead of "fast".
- */
-
-#include "Kokkos_Core.hpp"  // Kokkos::initialize, Kokkos::finalize
-
-#include "gtest/gtest.h"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST, RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG}
-
-#include "main.hpp"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST definitions
-
-int main(int argc, char **argv) {
-    ::testing::InitGoogleTest(&argc, argv);
-
-    // initialize Kokkos
-    Kokkos::initialize(argc, argv);
-
-    // prevent problems with fork() in the presence of multiple threads
-    // https://github.com/google/googletest/blob/main/docs/advanced.md#death-tests-and-threads
-    // NOTE: may reduce performance of the (death) tests
-#if !defined(_WIN32)
-    ::testing::GTEST_FLAG(death_test_style) = "threadsafe";
-#endif
-
-    // run all tests
-    const int return_code = RUN_ALL_TESTS();
-
-    // finalize Kokkos
-    Kokkos::finalize();
-
-    return return_code;
-}
diff --git a/tests/main.cpp b/tests/main.cpp
index 944ad9318..614b38cff 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -9,12 +9,49 @@
  * @brief Contains the googletest main function. Sets the DeathTest to "threadsafe" execution instead of "fast".
  */
 
-#include "main.hpp"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST definitions
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    #include "Kokkos_Core.hpp"  // Kokkos::ScopeGuard
+#endif
+
+#include "gtest/gtest.h"  // RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG},GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST definitions
+
+// silence GTest warnings/test errors
 
-#include "gtest/gtest.h"  // RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG}
+// generic CSVM tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVM);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolver);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionClassification);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionClassification);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionDeathTest);
+// generic GPU CSVM tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVM);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMKernelFunction);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMDeathTest);
+// pinned memory tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemory);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryLayout);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryDeathTest);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryLayoutDeathTest);
+// device pointer tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtr);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrLayout);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
+// exception tests
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
 
 int main(int argc, char **argv) {
     ::testing::InitGoogleTest(&argc, argv);
+
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    // initialize Kokkos using a Kokkos::ScopeGuard
+    const Kokkos::ScopeGuard guard{ argc, argv };
+#endif
+
     // prevent problems with fork() in the presence of multiple threads
     // https://github.com/google/googletest/blob/main/docs/advanced.md#death-tests-and-threads
     // NOTE: may reduce performance of the (death) tests
diff --git a/tests/main.hpp b/tests/main.hpp
deleted file mode 100644
index ddb4ea590..000000000
--- a/tests/main.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * @file
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief Header file for the GoogleTest main files to reduce code duplication.
- */
-
-#ifndef PLSSVM_TESTS_MAIN_HPP_
-#define PLSSVM_TESTS_MAIN_HPP_
-#pragma once
-
-#include "gtest/gtest.h"  // GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST
-
-// silence GTest warnings/test errors
-
-// generic CSVM tests
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVM);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunction);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolver);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunction);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionClassification);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionClassification);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMKernelFunctionDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericCSVMSolverKernelFunctionDeathTest);
-// generic GPU CSVM tests
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVM);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMKernelFunction);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GenericGPUCSVMDeathTest);
-// pinned memory tests
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemory);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryLayout);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryDeathTest);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PinnedMemoryLayoutDeathTest);
-// device pointer tests
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtr);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrLayout);
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
-// exception tests
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
-
-#endif  // PLSSVM_TESTS_MAIN_HPP_

From 7ad8d1ccf61cd3da67fee6487076a962a4ccf842 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 4 Nov 2024 16:40:42 +0100
Subject: [PATCH 040/123] Remove now unused directory documentation.

---
 docs/resources/dirs.dox | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/docs/resources/dirs.dox b/docs/resources/dirs.dox
index de0ce2d6a..7e9a5491d 100644
--- a/docs/resources/dirs.dox
+++ b/docs/resources/dirs.dox
@@ -208,17 +208,6 @@
  * @brief Directory containing kernel implementations for the implicit CG algorithm using the Kokkos backend.
  */
 
-/**
- * @dir include/plssvm/backends/Kokkos/kernel/detail
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief Directory containing kernel implementations for utility functions using the Kokkos backend.
- */
-
 /**
  * @dir include/plssvm/backends/OpenCL
  * @author Alexander Van Craen

From f551364e544b515fab9a89af863fed90a2fdfeb2 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 10:58:45 +0100
Subject: [PATCH 041/123] Fix failing test.

---
 tests/csvm_factory.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/csvm_factory.cpp b/tests/csvm_factory.cpp
index dc365293c..a80b705c5 100644
--- a/tests/csvm_factory.cpp
+++ b/tests/csvm_factory.cpp
@@ -234,7 +234,7 @@ TEST(CSVMFactory, factory_named_parameter) {
 }
 
 TEST(CSVMFactory, invalid_backend) {
-    EXPECT_THROW_WHAT(std::ignore = plssvm::make_csvm(static_cast<plssvm::backend_type>(7)),
+    EXPECT_THROW_WHAT(std::ignore = plssvm::make_csvm(static_cast<plssvm::backend_type>(8)),
                       plssvm::unsupported_backend_exception,
                       "Unrecognized backend provided!");
 }

From b064677dfd1c191c3eb9d5f0459e76702933a740 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 10:59:12 +0100
Subject: [PATCH 042/123] Add new conditional execution macro.

---
 .../Kokkos/detail/conditional_execution.hpp   | 118 ++++++++++++++++--
 .../Kokkos/detail/device_view_wrapper.hpp     |   7 +-
 src/plssvm/backends/Kokkos/csvm.cpp           |  26 ++--
 .../backends/Kokkos/detail/device_wrapper.cpp |  21 ++--
 src/plssvm/backends/Kokkos/detail/utility.cpp |  28 ++---
 5 files changed, 143 insertions(+), 57 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
index 6ed8c3421..95d4c8300 100644
--- a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
+++ b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
@@ -24,111 +24,211 @@
 
 namespace plssvm::kokkos::detail {
 
+//***************************************************//
+//                    Kokkos::Cuda                   //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA` macro if `KOKKOS_ENABLE_CUDA` is defined, i.e., the Kokkos CUDA ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_CUDA` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA` macro if `KOKKOS_ENABLE_CUDA` is defined, i.e., the Kokkos CUDA ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_CUDA` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  */
 #if defined(KOKKOS_ENABLE_CUDA)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::cuda) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::cuda) }
 #endif
 
+//***************************************************//
+//                    Kokkos::Hip                    //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP` macro if `KOKKOS_ENABLE_HIP` is defined, i.e., the Kokkos HIP ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_HIP` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP` macro if `KOKKOS_ENABLE_HIP` is defined, i.e., the Kokkos HIP ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_HIP` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  */
 #if defined(KOKKOS_ENABLE_HIP)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hip) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hip) }
 #endif
 
+//***************************************************//
+//                    Kokkos::SYCL                   //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL` macro if `KOKKOS_ENABLE_SYCL` is defined, i.e., the Kokkos SYCL ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_SYCL` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL` macro if `KOKKOS_ENABLE_SYCL` is defined, i.e., the Kokkos SYCL ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_SYCL` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  */
 #if defined(KOKKOS_ENABLE_SYCL)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::sycl) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::sycl) }
 #endif
 
+//***************************************************//
+//                    Kokkos::HPX                    //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX` macro if `KOKKOS_ENABLE_HPX` is defined, i.e., the Kokkos HPX ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_HPX` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX` macro if `KOKKOS_ENABLE_HPX` is defined, i.e., the Kokkos HPX ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_HPX` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  */
 #if defined(KOKKOS_ENABLE_HPX)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hpx) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hpx) }
 #endif
 
+//***************************************************//
+//                  Kokkos::OpenMP                   //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP` macro if `KOKKOS_ENABLE_OPENMP` is defined, i.e., the Kokkos OpenMP ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_OPENMP` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP` macro if `KOKKOS_ENABLE_OPENMP` is defined, i.e., the Kokkos OpenMP ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_OPENMP` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  */
 #if defined(KOKKOS_ENABLE_OPENMP)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp) }
 #endif
 
+//***************************************************//
+//               Kokkos::OpenMPTarget                //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMPTARGET
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMPTARGET` macro if `KOKKOS_ENABLE_OPENMPTARGET` is defined, i.e., the Kokkos OpenMP target offloading ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_OPENMPTARGET` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET` macro if `KOKKOS_ENABLE_OPENMPTARGET` is defined, i.e., the Kokkos OpenMP target offloading ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_OPENMPTARGET` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  */
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMPTARGET(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMPTARGET(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp_target) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp_target) }
 #endif
 
+//***************************************************//
+//                  Kokkos::OpenACC                  //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENACC
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENACC` macro if `KOKKOS_ENABLE_OPENACC` is defined, i.e., the Kokkos OpenACC ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_OPENACC` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC` macro if `KOKKOS_ENABLE_OPENACC` is defined, i.e., the Kokkos OpenACC ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_OPENACC` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  */
 #if defined(KOKKOS_ENABLE_OPENACC)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENACC(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENACC(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openacc) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openacc) }
 #endif
 
+//***************************************************//
+//                  Kokkos::Threads                  //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS` macro if `KOKKOS_ENABLE_THREADS` is defined, i.e., the Kokkos std::thread ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_THREADS` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS` macro if `KOKKOS_ENABLE_THREADS` is defined, i.e., the Kokkos std::thread ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_THREADS` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  */
 #if defined(KOKKOS_ENABLE_THREADS)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::threads) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::threads) }
 #endif
 
+//***************************************************//
+//                   Kokkos::Serial                  //
+//***************************************************//
+
 /**
+ * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL
+ * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL` macro if `KOKKOS_ENABLE_SERIAL` is defined, i.e., the Kokkos serial ExecutionSpace is available.
+ * @details If `KOKKOS_ENABLE_SERIAL` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception.
+ * @note This ExecutionSpace *should* always be available!
+ *
  * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL
  * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL` macro if `KOKKOS_ENABLE_SERIAL` is defined, i.e., the Kokkos serial ExecutionSpace is available.
  * @details If `KOKKOS_ENABLE_SERIAL` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception.
  * @note This ExecutionSpace *should* always be available!
  */
 #if defined(KOKKOS_ENABLE_SERIAL)
-    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL(func) return std::invoke(func)
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(func) std::invoke(func)
 #else
+    #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL(func) \
+        throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::serial) }
     #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(func) \
         throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::serial) }
 #endif
diff --git a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
index 018edc48e..a3019829e 100644
--- a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
@@ -12,10 +12,9 @@
 #ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_
 
-#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
-#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"         // plssvm::kokkos::detail::device_wrapper
-#include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::{execution_space, execution_space_to_kokkos_type_t}, plssvm::kokkos::detail::constexpr_available_execution_spaces
-#include "plssvm/detail/type_traits.hpp"                            // plssvm::detail::remove_cvref_t
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::{execution_space, execution_space_to_kokkos_type_t}, plssvm::kokkos::detail::constexpr_available_execution_spaces
+#include "plssvm/detail/type_traits.hpp"                     // plssvm::detail::remove_cvref_t
 
 #include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::ExecutionSpace
 
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index b097d42ef..6e0335e56 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -9,7 +9,7 @@
 #include "plssvm/backends/Kokkos/csvm.hpp"
 
 #include "plssvm/backends/execution_range.hpp"                                        // plssvm::detail::{execution_range, dim_type}
-#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"                    // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
+#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"                    // PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_*, PLSSVM_KOKKOS_BACKEND_INVOKE_IF_
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"                               // plssvm::kokkos::detail::device_ptr
 #include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"                           // plssvm::kokkos::detail::{device_wrapper, get_device_list}
 #include "plssvm/backends/Kokkos/detail/utility.hpp"                                  // plssvm::kokkos::detail::get_runtime_version // TODO: docu
@@ -170,21 +170,18 @@ std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
                     res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::cuda>().cuda_device_prop().totalGlobalMem) };
                 }
-                return res;
             });
         case execution_space::hip:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
                     res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::hip>().hip_device_prop().totalGlobalMem) };
                 }
-                return res;
             });
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
                     res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::global_mem_size>()) };
                 }
-                return res;
             });
         case execution_space::openmp:
         case execution_space::hpx:
@@ -195,12 +192,11 @@ std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
         case execution_space::openacc:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
-    // all possible cases should be handled by the previous switch
-    // -> silence missing return statement compiler warnings due to throw statement
-    ::plssvm::detail::unreachable();
+    return res;
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const {
+    [[maybe_unused]] std::vector<::plssvm::detail::memory_size> res(this->num_available_devices());
     // TODO: implement for other execution spaces
     switch (space_) {
         case execution_space::cuda:
@@ -211,8 +207,8 @@ std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
                     res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::max_mem_alloc_size>()) };
                 }
-                return res;
             });
+            break;
         case execution_space::openmp:
         case execution_space::hpx:
         case execution_space::threads:
@@ -222,9 +218,7 @@ std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const
         case execution_space::openacc:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
-    // all possible cases should be handled by the previous switch
-    // -> silence missing return statement compiler warnings due to throw statement
-    ::plssvm::detail::unreachable();
+    return res;
 }
 
 std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
@@ -233,15 +227,15 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
     // TODO: implement for other execution spaces
     switch (space_) {
         case execution_space::cuda:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA([&]() {
                 return static_cast<std::size_t>(devices_[device_id].get<execution_space::cuda>().cuda_device_prop().maxThreadsPerBlock);
             });
         case execution_space::hip:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP([&]() {
                 return static_cast<std::size_t>(devices_[device_id].get<execution_space::hip>().hip_device_prop().maxThreadsPerBlock);
             });
         case execution_space::sycl:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL([&]() {
                 return devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::max_work_group_size>();
             });
         case execution_space::openmp:
@@ -266,14 +260,14 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id)
     // TODO: implement for other execution spaces
     switch (space_) {
         case execution_space::cuda:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(([&]() -> ::plssvm::detail::dim_type {
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(([&]() -> ::plssvm::detail::dim_type {
                 // TODO: Kokkos only uses maxGridSize[0]
                 const cudaDeviceProp &prop = devices_[device_id].get<execution_space::cuda>().cuda_device_prop();
                 return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
             }));
         case execution_space::hip:
             // TODO: Kokkos only uses maxGridSize[0]
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(([&]() -> ::plssvm::detail::dim_type {
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP(([&]() -> ::plssvm::detail::dim_type {
                 const hipDeviceProp &prop = devices_[device_id].get<execution_space::hip>().hip_device_prop();
                 return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
             }));
diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index add12def4..5fa580aae 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -35,8 +35,8 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
                     // Note: it is important to pass the cudaStream_t lifetime to be managed by Kokkos
                     devices.emplace_back(Kokkos::Cuda(stream, Kokkos::Impl::ManageStream::yes));
                 }
-                return devices;
             });
+            break;
         case execution_space::hip:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
                 for (int device = 0; device < Kokkos::num_devices(); ++device) {
@@ -48,20 +48,20 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
                     // Note: it is important to pass the hipStream_t lifetime to be managed by Kokkos
                     devices.emplace_back(Kokkos::Hip(stream, Kokkos::Impl::ManageStream::yes));
                 }
-                return devices;
             });
+            break;
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                 // TODO: use all available devices -> not that trivial
                 // TODO: handle target <- if provide queue -> managed?
                 devices.emplace_back(Kokkos::SYCL{});
-                return devices;
             });
+            break;
         case execution_space::hpx:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX([&]() {
                 devices.emplace_back(Kokkos::Hpx{});
-                return devices;
             });
+            break;
         case execution_space::openmp:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP([&]() {
                 // Note: if OpenMP should be used as device  must be set in order for it to work!
@@ -73,34 +73,31 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
                     omp_set_nested(1);
                 }
                 devices.emplace_back(Kokkos::OpenMP{});
-                return devices;
             });
+            break;
         case execution_space::openmp_target:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET([&]() {
                 // TODO: multi-GPU?
                 devices.emplace_back(Kokkos::OpenMPTarget{});
-                return devices;
             });
+            break;
         case execution_space::openacc:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC([&]() {
                 // TODO: multi-GPU?
                 devices.emplace_back(Kokkos::OpenACC{});
-                return devices;
             });
+            break;
         case execution_space::threads:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS([&]() {
                 devices.emplace_back(Kokkos::Threads{});
-                return devices;
             });
         case execution_space::serial:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL([&]() {
                 devices.emplace_back(Kokkos::Serial{});
-                return devices;
             });
+            break;
     }
-    // all possible cases should be handled by the previous switch
-    // -> silence missing return statement compiler warnings due to throw statement
-    ::plssvm::detail::unreachable();
+    return devices;
 }
 
 }  // namespace plssvm::kokkos::detail
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index eb02660a6..8619471a9 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -8,7 +8,7 @@
 
 #include "plssvm/backends/Kokkos/detail/utility.hpp"
 
-#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
+#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_*
 #include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"         // plssvm::kokkos::detail::device_wrapper
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
 #include "plssvm/detail/assert.hpp"                                 // PLSSVM_ASSERT
@@ -41,20 +41,17 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
                 break;
             case execution_space::hip:
                 // NVIDIA or AMD GPUs possible (both simultaneously are unsupported)
-#if defined(KOKKOS_ENABLE_HIP)
-    #if defined(__HIP_PLATFORM_AMD__)
-                available_map[target_platform::gpu_amd].push_back(execution_space::hip);
-    #elif defined(__HIP_PLATFORM_NVIDIA__)
-                available_map[target_platform::gpu_nvidia].push_back(execution_space::hip);
-    #else
-        #error "Unknown HIP platform"
-    #endif
+                PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+#if defined(__HIP_PLATFORM_AMD__)
+                    available_map[target_platform::gpu_amd].push_back(execution_space::hip);
+#elif defined(__HIP_PLATFORM_NVIDIA__)
+                    available_map[target_platform::gpu_nvidia].push_back(execution_space::hip);
 #endif
+                });
                 break;
             case execution_space::sycl:
                 // list all potential target platforms currently available in SYCL
-#if defined(KOKKOS_ENABLE_SYCL)
-                {
+                PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                     std::unordered_set<target_platform> targets{};
                     for (const auto &platform : sycl::platform::get_platforms()) {
                         for (const auto &device : platform.get_devices()) {
@@ -83,8 +80,7 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
                     for (const target_platform target : targets) {
                         available_map[target].push_back(execution_space::sycl);
                     }
-                }
-#endif
+                });
                 break;
             case execution_space::openacc:
                 // TODO: restrict to available devices
@@ -122,15 +118,15 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
 std::string get_device_name([[maybe_unused]] const device_wrapper &dev) {
     switch (dev.get_execution_space()) {
         case execution_space::cuda:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA([&]() {
                 return std::string{ dev.get<execution_space::cuda>().cuda_device_prop().name };
             });
         case execution_space::hip:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP([&]() {
                 return std::string{ dev.get<execution_space::hip>().hip_device_prop().name };
             });
         case execution_space::sycl:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL([&]() {
                 return dev.get<execution_space::sycl>().sycl_queue.get_device().get_info<sycl::info::device::name>();
             });
         case execution_space::hpx:

From b5174df96016665fdc47656c950131ee93c2fccc Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 11:04:48 +0100
Subject: [PATCH 043/123] Add missing break statements.

---
 src/plssvm/backends/Kokkos/csvm.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 6e0335e56..0fa73773e 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -171,18 +171,21 @@ std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
                     res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::cuda>().cuda_device_prop().totalGlobalMem) };
                 }
             });
+            break;
         case execution_space::hip:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
                     res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::hip>().hip_device_prop().totalGlobalMem) };
                 }
             });
+            break;
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
                     res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::global_mem_size>()) };
                 }
             });
+            break;
         case execution_space::openmp:
         case execution_space::hpx:
         case execution_space::threads:

From 25105ebc1cf84693dc10287d17ed23b195a29dc8 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 11:24:15 +0100
Subject: [PATCH 044/123] Add missing make_device_view_wrapper test.

---
 .../Kokkos/detail/device_view_wrapper.cpp     | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/tests/backends/Kokkos/detail/device_view_wrapper.cpp b/tests/backends/Kokkos/detail/device_view_wrapper.cpp
index c794072b4..28dc97cba 100644
--- a/tests/backends/Kokkos/detail/device_view_wrapper.cpp
+++ b/tests/backends/Kokkos/detail/device_view_wrapper.cpp
@@ -10,12 +10,15 @@
 
 #include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp"
 
-#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v}
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v}
 
 #include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace, Kokkos::View
 
 #include "gtest/gtest.h"  // TEST, EXPECT_EQ, EXPECT_TRUE, EXPECT_FALSE
 
+#include <cstddef>  // std::size_t
+
 TEST(KokkosDeviceViewWrapper, default_construct) {
     // default construct a device view wrapper
     const plssvm::kokkos::detail::device_view_wrapper<double *> view{};
@@ -76,4 +79,17 @@ TEST(KokkosDeviceViewWrapper, inequality) {
     EXPECT_FALSE(view1 != view2);
 }
 
-// TODO: make_device_view_wrapper
\ No newline at end of file
+TEST(KokkosDeviceViewWrapper, make_device_view_wrapper) {
+    // create a device wrapper for the Kokkos::DefaultExecutionSpace
+    const plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} };
+
+    // create device view wrapper
+    const plssvm::kokkos::detail::device_view_wrapper<double *> view = plssvm::kokkos::detail::make_device_view_wrapper<double *>(device, 42);
+
+    // check that the returned Kokkos::View has the correct type
+    constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::DefaultExecutionSpace>;
+    ::testing::StaticAssertTypeEq<decltype(view.get<space>()), const Kokkos::View<double *, Kokkos::DefaultExecutionSpace> &>();
+
+    // check the number of elements
+    EXPECT_EQ(view.get<space>().size(), std::size_t{ 42 });
+}

From a1a2fdb5904a475117bc2b7e196ceceb53d91ac8 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 15:34:11 +0100
Subject: [PATCH 045/123] Remove dim_type::total_size in favor of Kokkos
 specific dim_type_to_native function.

---
 .../plssvm/backends/Kokkos/detail/utility.hpp |  8 ++
 include/plssvm/backends/execution_range.hpp   | 12 +--
 src/plssvm/backends/Kokkos/csvm.cpp           | 73 +++++++++++++------
 src/plssvm/backends/Kokkos/detail/utility.cpp |  5 ++
 tests/backends/Kokkos/detail/utility.cpp      | 12 +++
 tests/backends/execution_range.cpp            | 14 ----
 6 files changed, 76 insertions(+), 48 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/utility.hpp b/include/plssvm/backends/Kokkos/detail/utility.hpp
index fe8b0367f..9bbc9b172 100644
--- a/include/plssvm/backends/Kokkos/detail/utility.hpp
+++ b/include/plssvm/backends/Kokkos/detail/utility.hpp
@@ -13,6 +13,7 @@
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_
 #pragma once
 
+#include "plssvm/backends/execution_range.hpp"               // plssvm::detail::dim_type
 #include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
 #include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
 #include "plssvm/detail/type_traits.hpp"                     // PLSSVM_REQUIRES
@@ -54,6 +55,13 @@ inline constexpr bool is_type_in_variant_v = is_type_in_variant<T, Variant>::val
 
 }  // namespace impl
 
+/**
+ * @brief Convert a `plssvm::detail::dim_type` to a Kokkos native one-dimensional value.
+ * @param[in] dims the dimensional value to convert
+ * @return the native one-dimensional value (`[[nodiscard]]`)
+ */
+[[nodiscard]] int dim_type_to_native(const ::plssvm::detail::dim_type &dims);
+
 /**
  * @brief Return a `std::map` containing a mapping from all available target platforms to the available Kokkos::ExecutionSpace that supports said target platform.
  * @details If a target platform is supported by multiple Kokkos::ExecutionSpace, the order is determined by the order as returned by `list_available_execution_spaces`.
diff --git a/include/plssvm/backends/execution_range.hpp b/include/plssvm/backends/execution_range.hpp
index c44aab7b6..5be842f9a 100644
--- a/include/plssvm/backends/execution_range.hpp
+++ b/include/plssvm/backends/execution_range.hpp
@@ -12,6 +12,8 @@
 #ifndef PLSSVM_BACKENDS_EXECUTION_RANGE_HPP_
 #define PLSSVM_BACKENDS_EXECUTION_RANGE_HPP_
 
+#include "plssvm/backend_types.hpp"  // plssvm::backend_type
+
 #include "fmt/base.h"     // fmt::formatter
 #include "fmt/ostream.h"  // fmt::ostream_formatter
 
@@ -77,15 +79,6 @@ struct [[nodiscard]] dim_type {
         swap_ull(z, other.z);
     }
 
-    /**
-     * @brief Return the total number of elements in the dimensional type.
-     * @details Equal to: `x * y * z`.
-     * @return the total number of elements (`[[nodiscard]]`)
-     */
-    [[nodiscard]] constexpr unsigned long long total_size() const noexcept {
-        return x * y * z;
-    }
-
     /// The dimensional size in x direction.
     unsigned long long x{ 1 };
     /// The dimensional size in y direction.
@@ -170,7 +163,6 @@ struct execution_range {
     /// The grids. Multiple grids are used, if the grid sizes would exceed the maximum allowed number. Also stores the offsets for the respective grids used in the kernels.
     /// Note: no default initialization due to a linker error occurring with NVIDIA's nvhpc!
     std::vector<grid_type> grids;
-
 };
 
 /**
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 0fa73773e..6c7807527 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -312,16 +312,20 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons
     const real_type cost_factor = real_type{ 1.0 } / params.cost;
     const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
-    // save the team sizes
-    const ::plssvm::detail::dim_type team_sizes = exec.block;
+    // save the team size
+    const int team_size = detail::dim_type_to_native(exec.block);
 
     return devices_[device_id].execute_and_return([&](auto &device) {
         using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
         constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
         for (const auto &[partial_grid, offsets] : exec.grids) {
+            // convert execution range partial_grid to Kokkos' native one-dimensional size
+            const int native_partial_grid = detail::dim_type_to_native(partial_grid);
+
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()) };
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
+            // TODO: test MDRangeTeamPolicy?!
 
             switch (params.kernel_type) {
                 case kernel_function_type::linear:
@@ -383,25 +387,31 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const :
         // the necessary amount of scratch memory for the kernels
         const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
-        // save the team sizes
-        const ::plssvm::detail::dim_type team_sizes = exec.block;
+        // save the team size
+        const int team_size = detail::dim_type_to_native(exec.block);
 
         for (const auto &[partial_grid, offsets] : exec.grids) {
+            // convert execution range partial_grid to Kokkos' native one-dimensional size
+            const int native_partial_grid = detail::dim_type_to_native(partial_grid);
+
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
 
             Kokkos::parallel_for("blas_level_3_kernel_explicit", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm<kokkos_execution_space_type>{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get().get<space>(), B_d.get().get<space>(), beta, C_d.get().get<space>(), offsets.x, offsets.y, partial_grid.x });
         }
 
-        // save the mirror team sizes
-        const ::plssvm::detail::dim_type mirror_team_sizes = mirror_exec.block;
+        // save the team size
+        const int mirror_team_size = detail::dim_type_to_native(mirror_exec.block);
 
         for (const auto &[partial_grid, offsets] : mirror_exec.grids) {
             const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows;
 
             if (num_mirror_rows > 0) {
+                // convert execution range partial_grid to Kokkos' native one-dimensional size
+                const int native_partial_grid = detail::dim_type_to_native(partial_grid);
+
                 // create a Kokkos TeamPolicy
-                Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(mirror_team_sizes.total_size()), Kokkos::AUTO };
+                Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, mirror_team_size };
 
                 Kokkos::parallel_for("blas_level_3_kernel_explicit_mirror", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm_mirror<kokkos_execution_space_type>{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get().get<space>(), B_d.get().get<space>(), beta, C_d.get().get<space>(), offsets.x, offsets.y, partial_grid.x });
             }
@@ -417,12 +427,15 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss
         using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
         constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-        // save the team sizes
-        const ::plssvm::detail::dim_type team_sizes = exec.block;
+        // save the team size
+        const int team_size = detail::dim_type_to_native(exec.block);
 
         for (const auto &[partial_grid, offsets] : exec.grids) {
+            // convert execution range partial_grid to Kokkos' native one-dimensional size
+            const int native_partial_grid = detail::dim_type_to_native(partial_grid);
+
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
 
             Kokkos::parallel_for("inplace_matrix_addition", team_policy, detail::device_kernel_inplace_matrix_add<kokkos_execution_space_type>{ num_rhs, lhs_d.get().get<space>(), rhs_d.get().get<space>(), offsets.x, offsets.y, partial_grid.x });
         }
@@ -437,12 +450,15 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm:
         using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
         constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
-        // save the team sizes
-        const ::plssvm::detail::dim_type team_sizes = exec.block;
+        // save the team size
+        const int team_size = detail::dim_type_to_native(exec.block);
 
         for (const auto &[partial_grid, offsets] : exec.grids) {
+            // convert execution range partial_grid to Kokkos' native one-dimensional size
+            const int native_partial_grid = detail::dim_type_to_native(partial_grid);
+
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
 
             Kokkos::parallel_for("inplace_matrix_scale", team_policy, detail::device_kernel_inplace_matrix_scale<kokkos_execution_space_type>{ num_rhs, lhs_d.get().get<space>(), scale, offsets.x, offsets.y, partial_grid.x });
         }
@@ -467,12 +483,15 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de
         const real_type cost_factor = real_type{ 1.0 } / params.cost;
         const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
-        // save the team sizes
-        const ::plssvm::detail::dim_type team_sizes = exec.block;
+        // save the team size
+        const int team_size = detail::dim_type_to_native(exec.block);
 
         for (const auto &[partial_grid, offsets] : exec.grids) {
+            // convert execution range partial_grid to Kokkos' native one-dimensional size
+            const int native_partial_grid = detail::dim_type_to_native(partial_grid);
+
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
 
             switch (params.kernel_type) {
                 case kernel_function_type::linear:
@@ -534,16 +553,19 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe
 
     const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
-    // save the team sizes
-    const ::plssvm::detail::dim_type team_sizes = exec.block;
+    // save the team size
+    const int team_size = detail::dim_type_to_native(exec.block);
 
     return devices_[device_id].execute_and_return([&](auto &device) {
         using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
         constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
         for (const auto &[partial_grid, offsets] : exec.grids) {
+            // convert execution range partial_grid to Kokkos' native one-dimensional size
+            const int native_partial_grid = detail::dim_type_to_native(partial_grid);
+
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
 
             Kokkos::parallel_for("w_kernel", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_w_linear<kokkos_execution_space_type>{ w_d.get().get<space>(), alpha_d.get().get<space>(), sv_d.get().get<space>(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets.x, offsets.y, partial_grid.x });
         }
@@ -563,16 +585,19 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai
 
     const std::size_t scratch_memory_size = static_cast<std::size_t>(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type);
 
-    // save the team sizes
-    const ::plssvm::detail::dim_type team_sizes = exec.block;
+    // save the team size
+    const int team_size = detail::dim_type_to_native(exec.block);
 
     return devices_[device_id].execute_and_return([&](auto &device) {
         using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
         constexpr execution_space space = kokkos_type_to_execution_space_v<kokkos_execution_space_type>;
 
         for (const auto &[partial_grid, offsets] : exec.grids) {
+            // convert execution range partial_grid to Kokkos' native one-dimensional size
+            const int native_partial_grid = detail::dim_type_to_native(partial_grid);
+
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, static_cast<int>(partial_grid.total_size()), static_cast<int>(team_sizes.total_size()), Kokkos::AUTO };
+            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
 
             switch (params.kernel_type) {
                 case kernel_function_type::linear:
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index 8619471a9..b5451bc2c 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -8,6 +8,7 @@
 
 #include "plssvm/backends/Kokkos/detail/utility.hpp"
 
+#include "plssvm/backends/execution_range.hpp"                      // plssvm::detail::dim_type
 #include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_*
 #include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"         // plssvm::kokkos::detail::device_wrapper
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
@@ -28,6 +29,10 @@
 
 namespace plssvm::kokkos::detail {
 
+int dim_type_to_native(const ::plssvm::detail::dim_type &dims) {
+    return static_cast<int>(dims.x * dims.y * dims.z);
+}
+
 std::map<target_platform, std::vector<execution_space>> available_target_platform_to_execution_space_mapping() {
     std::map<target_platform, std::vector<execution_space>> available_map{};
 
diff --git a/tests/backends/Kokkos/detail/utility.cpp b/tests/backends/Kokkos/detail/utility.cpp
index 7c6d491d5..ec18a977b 100644
--- a/tests/backends/Kokkos/detail/utility.cpp
+++ b/tests/backends/Kokkos/detail/utility.cpp
@@ -10,6 +10,7 @@
 
 #include "plssvm/backends/Kokkos/detail/utility.hpp"
 
+#include "plssvm/backends/execution_range.hpp"               // plssvm::detail::dim_type
 #include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
 #include "plssvm/backends/Kokkos/exceptions.hpp"             // plssvm::kokkos::backend_exception
 #include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v}
@@ -43,6 +44,17 @@ TEST(KokkosUtility, is_type_in_variant) {
     EXPECT_FALSE((plssvm::kokkos::detail::impl::is_type_in_variant_v<float, variant_type>) );
 }
 
+TEST(KokkosUtility, dim_type_to_native) {
+    // create a dim_type
+    constexpr plssvm::detail::dim_type dim{ 128ull, 64ull, 32ull };
+
+    // convert it to a Kokkos one-dimensional execution range
+    const int native_dim = plssvm::kokkos::detail::dim_type_to_native(dim);
+
+    // check values for correctness
+    EXPECT_EQ(native_dim, 262'144);  // = 128 * 64 * 32
+}
+
 TEST(KokkosUtility, available_target_platform_to_execution_space_mapping) {
     // get the target_platform <-> execution_space mappings
     const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> mapping = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
diff --git a/tests/backends/execution_range.cpp b/tests/backends/execution_range.cpp
index 866dae83a..75fe16ef2 100644
--- a/tests/backends/execution_range.cpp
+++ b/tests/backends/execution_range.cpp
@@ -94,20 +94,6 @@ TEST(DimType, swap_free_function) {
     EXPECT_EQ(dim2.z, 1ull);
 }
 
-TEST(DimType, total_size) {
-    // create dim types
-    constexpr plssvm::detail::dim_type dim1{};
-    constexpr plssvm::detail::dim_type dim2{ 64ull };
-    constexpr plssvm::detail::dim_type dim3{ 64ull, 32ull };
-    constexpr plssvm::detail::dim_type dim4{ 64ull, 32ull, 16ull };
-
-    // test total_size function
-    EXPECT_EQ(dim1.total_size(), 1ull);
-    EXPECT_EQ(dim2.total_size(), 64ull);
-    EXPECT_EQ(dim3.total_size(), 2048ull);
-    EXPECT_EQ(dim4.total_size(), 32768ull);
-}
-
 TEST(DimType, equality) {
     // create dim types
     constexpr plssvm::detail::dim_type dim1{};

From 208055ebabb7cf7d70d3ba15d6c93d24d076fcd9 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 15:35:08 +0100
Subject: [PATCH 046/123] Fix maximum grid size problems when using Kokkos
 (since Kokkos only supports a one-dimensional execution range).

---
 src/plssvm/backends/Kokkos/csvm.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 6c7807527..157e08685 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -38,6 +38,7 @@
 #include "fmt/core.h"    // fmt::format
 #include "fmt/format.h"  // fmt::format
 
+#include <cmath>      // std::sqrt
 #include <cstddef>    // std::size_t
 #include <exception>  // std::terminate
 #include <iostream>   // std::cout, std::endl
@@ -260,22 +261,24 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
 ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) const {
     PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
 
+    // NOTE: Kokkos only supports one-dimensional execution ranges!
+    // NOTE: we only use two-dimensional kernels!
     // TODO: implement for other execution spaces
     switch (space_) {
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(([&]() -> ::plssvm::detail::dim_type {
-                // TODO: Kokkos only uses maxGridSize[0]
                 const cudaDeviceProp &prop = devices_[device_id].get<execution_space::cuda>().cuda_device_prop();
-                return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
+                const auto max_grid_size = static_cast<std::size_t>(std::sqrt(prop.maxGridSize[0]));
+                return { max_grid_size, max_grid_size, std::size_t{ 1 } };
             }));
         case execution_space::hip:
-            // TODO: Kokkos only uses maxGridSize[0]
             PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP(([&]() -> ::plssvm::detail::dim_type {
                 const hipDeviceProp &prop = devices_[device_id].get<execution_space::hip>().hip_device_prop();
-                return { static_cast<std::size_t>(prop.maxGridSize[0]), static_cast<std::size_t>(prop.maxGridSize[1]), static_cast<std::size_t>(prop.maxGridSize[2]) };
+                const auto max_grid_size = static_cast<std::size_t>(std::sqrt(prop.maxGridSize[0]));
+                return { max_grid_size, max_grid_size, std::size_t{ 1 } };
             }));
         case execution_space::openmp:
-            return { 16, 16, 16 };  // TODO: correct values
+            return { 16, 16, 1 };  // TODO: correct values
         case execution_space::serial:
             return { 1, 1, 1 };  // TODO: correct values
         case execution_space::sycl:

From 2d82a248dcd95a45a5872f1d35801c1189f972e8 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 15:46:19 +0100
Subject: [PATCH 047/123] Fix TODO.

---
 src/plssvm/backends/Kokkos/CMakeLists.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
index 20ae3c0a6..d2bf6addf 100644
--- a/src/plssvm/backends/Kokkos/CMakeLists.txt
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -39,11 +39,6 @@ target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC Kokkos::kokko
 # link base library against Kokkos library
 target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC ${PLSSVM_BASE_LIBRARY_NAME})
 
-# set whether the kernel source should be compiled with fast math enabled or not # TODO: enable fast-math
-#if (PLSSVM_ENABLE_FAST_MATH)
-#    target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE PLSSVM_ENABLE_FAST_MATH)
-#endif ()
-
 # set compile definition that the Kokkos backend is available
 target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_HAS_KOKKOS_BACKEND)
 target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_HAS_KOKKOS_BACKEND)

From b130e15d68abfae5af2677b9c5716f7fa92b5ac9 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 20:57:29 +0100
Subject: [PATCH 048/123] Enable the Kokkos CSVM in the Python bindings.

---
 bindings/Python/CMakeLists.txt           |  3 +
 bindings/Python/README.md                | 18 +++++-
 bindings/Python/backends/kokkos_csvm.cpp | 72 ++++++++++++++++++++++++
 bindings/Python/main.cpp                 |  4 ++
 4 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 bindings/Python/backends/kokkos_csvm.cpp

diff --git a/bindings/Python/CMakeLists.txt b/bindings/Python/CMakeLists.txt
index 0ae329356..8dfba2e04 100644
--- a/bindings/Python/CMakeLists.txt
+++ b/bindings/Python/CMakeLists.txt
@@ -94,6 +94,9 @@ endif ()
 if (TARGET ${PLSSVM_SYCL_BACKEND_DPCPP_LIBRARY_NAME})
     list(APPEND PLSSVM_PYTHON_BINDINGS_SOURCES ${CMAKE_CURRENT_LIST_DIR}/backends/dpcpp_csvm.cpp)
 endif ()
+if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME})
+    list(APPEND PLSSVM_PYTHON_BINDINGS_SOURCES ${CMAKE_CURRENT_LIST_DIR}/backends/kokkos_csvm.cpp)
+endif ()
 
 # create pybind11 module
 set(PLSSVM_PYTHON_BINDINGS_LIBRARY_NAME plssvm)
diff --git a/bindings/Python/README.md b/bindings/Python/README.md
index b7825de52..7d6140bc3 100644
--- a/bindings/Python/README.md
+++ b/bindings/Python/README.md
@@ -10,7 +10,7 @@
         - [plssvm.Parameter](#plssvmparameter)
         - [plssvm.DataSet](#plssvmdataset)
         - [plssvm.CSVM](#plssvmcsvm)
-        - [plssvm.openmp.CSVM, plssvm.stdpar.CSVM, plssvm.cuda.CSVM, plssvm.hip.CSVM, plssvm.opencl.CSVM, plssvm.sycl.CSVM, plssvm.dpcpp.CSVM, plssvm.adaptivecpp.CSVM](#plssvmopenmpcsvm-plssvmcudacsvm-plssvmhipcsvm-plssvmopenclcsvm-plssvmsyclcsvm-plssvmdpcppcsvm-plssvmadaptivecppcsvm)
+        - [plssvm.openmp.CSVM, plssvm.stdpar.CSVM, plssvm.cuda.CSVM, plssvm.hip.CSVM, plssvm.opencl.CSVM, plssvm.sycl.CSVM, plssvm.dpcpp.CSVM, plssvm.adaptivecpp.CSVM, plssvm.kokkos.CSVM](#plssvmopenmpcsvm-plssvmcudacsvm-plssvmhipcsvm-plssvmopenclcsvm-plssvmsyclcsvm-plssvmdpcppcsvm-plssvmadaptivecppcsvm)
         - [plssvm.Model](#plssvmmodel)
         - [plssvm.Version](#plssvmversion)
         - [plssvm.detail.tracking.PerformanceTracker](#plssvmdetailtrackingperformancetracker)
@@ -211,6 +211,12 @@ If the stdpar backend is available, an additional enumeration is available:
 |----------------------|---------------------------------------------------------------|-------------------------------------------------|
 | `ImplementationType` | `NVHPC`, `ROC_STDPAR`, `INTEL_LLVM`, `ADAPTIVECPP`, `GNU_TBB` | The different supported stdpar implementations. |
 
+If the Kokos backend is available, an additional enumeration is available:
+
+| enumeration      | values                                                                                 | description                                      |
+|------------------|----------------------------------------------------------------------------------------|--------------------------------------------------|
+| `ExecutionSpace` | `CUDA`, `HIP`, `SYCL`, `HPX`, `OPENMP`, `OPENMPTARGET`, `OPENACC`, `THREADS`, `SERIAL` | The different supported Kokkos execution spaces. |
+
 ### Classes and submodules
 
 The following tables list all PLSSVM classes exposed on the Python side:
@@ -347,7 +353,7 @@ and `sycl_kernel_invocation_type` to choose between the two different SYCL kerne
 | `score(model)`                                                                                                                               | Score the model with respect to itself returning its accuracy.                                                                                                                                                      |
 | `score(model, data_set)`                                                                                                                     | Score the model given the provided data set returning its accuracy.                                                                                                                                                 |
 
-#### `plssvm.openmp.CSVM`, `plssvm.stdpar.CSVM`, plssvm.cuda.CSVM`, `plssvm.hip.CSVM`, `plssvm.opencl.CSVM`, `plssvm.sycl.CSVM`, `plssvm.dpcpp.CSVM`, `plssvm.adaptivecpp.CSVM`
+#### `plssvm.openmp.CSVM`, `plssvm.stdpar.CSVM`, plssvm.cuda.CSVM`, `plssvm.hip.CSVM`, `plssvm.opencl.CSVM`, `plssvm.sycl.CSVM`, `plssvm.dpcpp.CSVM`, `plssvm.adaptivecpp.CSVM`, `plssvm.kokkos.CSVM`
 
 These classes represent the backend specific CSVMs.
 **Note**: they are only available if the respective backend has been enabled during PLSSVM's build step.
@@ -385,6 +391,14 @@ CSVM.
 |-----------------------------|---------------------------------------------|
 | `get_implementation_type()` | Return the used stdpar implementation type. |
 
+In case of the Kokkos CSVM (`plssvm.kokkos.CSVM`) the following method is additional available for the backend specific 
+CSVM.
+
+
+| methods                 | description                             |
+|-------------------------|-----------------------------------------|
+| `get_execution_space()` | Return the used Kokkos execution space. |
+
 #### `plssvm.Model`
 
 A class encapsulating a model learned during a call to `plssvm.CSVM.fit()`.
diff --git a/bindings/Python/backends/kokkos_csvm.cpp b/bindings/Python/backends/kokkos_csvm.cpp
new file mode 100644
index 000000000..ea0dc17e6
--- /dev/null
+++ b/bindings/Python/backends/kokkos_csvm.cpp
@@ -0,0 +1,72 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/backends/Kokkos/csvm.hpp"             // plssvm::kokkos::csvm
+#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
+#include "plssvm/csvm.hpp"                             // plssvm::csvm
+#include "plssvm/exceptions/exceptions.hpp"            // plssvm::exception
+#include "plssvm/parameter.hpp"                        // plssvm::parameter
+#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
+
+#include "bindings/Python/utility.hpp"  // check_kwargs_for_correctness, convert_kwargs_to_parameter, register_py_exception
+
+#include "pybind11/pybind11.h"  // py::module_, py::class_, py::init
+#include "pybind11/stl.h"       // support for STL types
+
+#include <memory>  // std::make_unique
+
+namespace py = pybind11;
+
+void init_kokkos_csvm(py::module_ &m, const py::exception<plssvm::exception> &base_exception) {
+    // use its own submodule for the Kokkos CSVM bindings
+    py::module_ kokkos_module = m.def_submodule("kokkos", "a module containing all Kokkos backend specific functionality");
+
+    // bind the CSVM using the Kokkos backend
+    py::class_<plssvm::kokkos::csvm, plssvm::csvm>(kokkos_module, "CSVM")
+        .def(py::init<>(), "create an SVM with the automatic target platform and default parameter object")
+        .def(py::init<plssvm::parameter>(), "create an SVM with the automatic target platform and provided parameter object")
+        .def(py::init<plssvm::target_platform>(), "create an SVM with the provided target platform and default parameter object")
+        .def(py::init<plssvm::target_platform, plssvm::parameter>(), "create an SVM with the provided target platform and parameter object")
+        .def(py::init([](const py::kwargs &args) {
+                 // check for valid keys
+                 check_kwargs_for_correctness(args, { "kernel_type", "degree", "gamma", "coef0", "cost" });
+                 // if one of the value keyword parameter is provided, set the respective value
+                 const plssvm::parameter params = convert_kwargs_to_parameter(args);
+                 // create CSVM with the default target platform
+                 return std::make_unique<plssvm::kokkos::csvm>(params);
+             }),
+             "create an SVM with the default target platform and keyword arguments")
+        .def(py::init([](const plssvm::target_platform target, const py::kwargs &args) {
+                 // check for valid keys
+                 check_kwargs_for_correctness(args, { "kernel_type", "degree", "gamma", "coef0", "cost" });
+                 // if one of the value keyword parameter is provided, set the respective value
+                 const plssvm::parameter params = convert_kwargs_to_parameter(args);
+                 // create CSVM with the provided target platform
+                 return std::make_unique<plssvm::kokkos::csvm>(target, params);
+             }),
+             "create an SVM with the provided target platform and keyword arguments")
+        .def("get_execution_space", &plssvm::kokkos::csvm::get_execution_space, "get the Kokkos execution space used in this Kokkos SVM");
+
+    // register Kokkos backend specific exceptions
+    register_py_exception<plssvm::kokkos::backend_exception>(kokkos_module, "BackendError", base_exception);
+
+    // bind the execution space enum classes
+    py::enum_<plssvm::kokkos::execution_space>(kokkos_module, "ExecutionSpace")
+        .value("CUDA", plssvm::kokkos::execution_space::cuda, "execution space representing execution on a CUDA device")
+        .value("HIP", plssvm::kokkos::execution_space::hip, "execution space representing execution on a device supported by HIP")
+        .value("SYCL", plssvm::kokkos::execution_space::sycl, "execution space representing execution on a device supported by SYCL")
+        .value("HPX", plssvm::kokkos::execution_space::hpx, "execution space representing execution with the HPX runtime system")
+        .value("OPENMP", plssvm::kokkos::execution_space::openmp, "execution space representing execution with the OpenMP runtime system")
+        .value("OPENMPTARGET", plssvm::kokkos::execution_space::openmp_target, "execution space representing execution using the target offloading feature of the OpenMP runtime system")
+        .value("OPENACC", plssvm::kokkos::execution_space::openacc, "execution space representing execution with the OpenACC runtime system")
+        .value("THREADS", plssvm::kokkos::execution_space::threads, "execution space representing parallel execution with std::threads")
+        .value("SERIAL", plssvm::kokkos::execution_space::serial, "execution space representing serial execution on the CPU; should always be available");
+
+    kokkos_module.def("list_available_execution_spaces", &plssvm::kokkos::list_available_execution_spaces, "list all available Kokkos execution spaces");
+}
diff --git a/bindings/Python/main.cpp b/bindings/Python/main.cpp
index 10fbafbef..c49d57092 100644
--- a/bindings/Python/main.cpp
+++ b/bindings/Python/main.cpp
@@ -39,6 +39,7 @@ void init_cuda_csvm(py::module_ &, const py::exception<plssvm::exception> &);
 void init_hip_csvm(py::module_ &, const py::exception<plssvm::exception> &);
 void init_opencl_csvm(py::module_ &, const py::exception<plssvm::exception> &);
 void init_sycl(py::module_ &, const py::exception<plssvm::exception> &);
+void init_kokkos_csvm(py::module_ &, const py::exception<plssvm::exception> &);
 void init_sklearn(py::module_ &);
 
 PYBIND11_MODULE(plssvm, m) {
@@ -99,6 +100,9 @@ PYBIND11_MODULE(plssvm, m) {
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
     init_sycl(m, base_exception);
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    init_kokkos_csvm(m, base_exception);
+#endif
 
     init_sklearn(m);
 }

From f0fc6aefe38a450131dcb3c19630cecb26f33206 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 20:57:47 +0100
Subject: [PATCH 049/123] Improve documentation.

---
 include/plssvm/backends/Kokkos/execution_space.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
index bb37a39a7..07ecadf24 100644
--- a/include/plssvm/backends/Kokkos/execution_space.hpp
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -44,7 +44,7 @@ enum class execution_space {
     openacc,
     /** Execution space representing parallel execution with std::threads. */
     threads,
-    /** Execution space representing serial execution on the CPU. Always available. */
+    /** Execution space representing serial execution on the CPU. Should always be available. */
     serial
 };
 

From 2b69199d4b50f040070dff159c7fed7bb384398d Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 5 Nov 2024 21:52:34 +0100
Subject: [PATCH 050/123] Add missing const.

---
 src/plssvm/backends/Kokkos/csvm.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 157e08685..806e908c0 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -438,7 +438,7 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss
             const int native_partial_grid = detail::dim_type_to_native(partial_grid);
 
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
+            const Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
 
             Kokkos::parallel_for("inplace_matrix_addition", team_policy, detail::device_kernel_inplace_matrix_add<kokkos_execution_space_type>{ num_rhs, lhs_d.get().get<space>(), rhs_d.get().get<space>(), offsets.x, offsets.y, partial_grid.x });
         }
@@ -461,7 +461,7 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm:
             const int native_partial_grid = detail::dim_type_to_native(partial_grid);
 
             // create a Kokkos TeamPolicy
-            Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
+            const Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
 
             Kokkos::parallel_for("inplace_matrix_scale", team_policy, detail::device_kernel_inplace_matrix_scale<kokkos_execution_space_type>{ num_rhs, lhs_d.get().get<space>(), scale, offsets.x, offsets.y, partial_grid.x });
         }

From 0f5663f285d638a6427e6682e5535e5f127ae1f9 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 14:50:51 +0100
Subject: [PATCH 051/123] Change wrong Kokkos::Hip to Kokkos::HIP.

---
 include/plssvm/backends/Kokkos/detail/conditional_execution.hpp | 2 +-
 src/plssvm/backends/Kokkos/detail/device_wrapper.cpp            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
index 95d4c8300..752405f76 100644
--- a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
+++ b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
@@ -48,7 +48,7 @@ namespace plssvm::kokkos::detail {
 #endif
 
 //***************************************************//
-//                    Kokkos::Hip                    //
+//                    Kokkos::HIP                    //
 //***************************************************//
 
 /**
diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index 5fa580aae..10013d227 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -46,7 +46,7 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
                     hipStreamCreate(&stream);
                     // create Kokkos execution space for the specific device
                     // Note: it is important to pass the hipStream_t lifetime to be managed by Kokkos
-                    devices.emplace_back(Kokkos::Hip(stream, Kokkos::Impl::ManageStream::yes));
+                    devices.emplace_back(Kokkos::HIP(stream, Kokkos::Impl::ManageStream::yes));
                 }
             });
             break;

From 0bcd6d74a147c2191ea1281bab729bbecaaf2607 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 14:51:46 +0100
Subject: [PATCH 052/123] Add missing namespace qualifiers and function call
 parenthesis.

---
 src/plssvm/backends/Kokkos/detail/utility.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index b5451bc2c..bd07e4fab 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -58,7 +58,7 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
                 // list all potential target platforms currently available in SYCL
                 PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                     std::unordered_set<target_platform> targets{};
-                    for (const auto &platform : sycl::platform::get_platforms()) {
+                    for (const auto &platform : ::sycl::platform::get_platforms()) {
                         for (const auto &device : platform.get_devices()) {
                             // Note: Kokkos is Intel LLVM/DPC++/icpx only -> we can use the specific implementation defined enum values
                             if (device.is_cpu()) {
@@ -132,7 +132,7 @@ std::string get_device_name([[maybe_unused]] const device_wrapper &dev) {
             });
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL([&]() {
-                return dev.get<execution_space::sycl>().sycl_queue.get_device().get_info<sycl::info::device::name>();
+                return dev.get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::name>();
             });
         case execution_space::hpx:
             return "HPX CPU host device";

From f38fe3777f65917d55df96acce342905a0037cf4 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 14:52:02 +0100
Subject: [PATCH 053/123] Add missing switch break.

---
 src/plssvm/backends/Kokkos/detail/device_wrapper.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index 10013d227..b232a3316 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -91,6 +91,7 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS([&]() {
                 devices.emplace_back(Kokkos::Threads{});
             });
+            break;
         case execution_space::serial:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL([&]() {
                 devices.emplace_back(Kokkos::Serial{});

From c9cd1c5b7d5b9f617ed31164db64ac0c13cf4727 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 16:50:43 +0100
Subject: [PATCH 054/123] Fix usage of wrong Kokkos HPX namespace.

---
 include/plssvm/backends/Kokkos/detail/conditional_execution.hpp | 2 +-
 src/plssvm/backends/Kokkos/detail/device_wrapper.cpp            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
index 752405f76..20fc118f6 100644
--- a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
+++ b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
@@ -94,7 +94,7 @@ namespace plssvm::kokkos::detail {
 #endif
 
 //***************************************************//
-//                    Kokkos::HPX                    //
+//             Kokkos::Experimental::HPX             //
 //***************************************************//
 
 /**
diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index b232a3316..2daa03b6d 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -59,7 +59,7 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
             break;
         case execution_space::hpx:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX([&]() {
-                devices.emplace_back(Kokkos::Hpx{});
+                devices.emplace_back(Kokkos::Experimental::HPX{});
             });
             break;
         case execution_space::openmp:

From eb5cd36e5e67225993f1d662c53cb8ce729a4ecc Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 17:43:33 +0100
Subject: [PATCH 055/123] Implement SYCL specific device selection (supporting
 potentially multi-GPU).

---
 .../backends/Kokkos/detail/device_wrapper.cpp | 41 ++++++++++++++++---
 src/plssvm/backends/Kokkos/detail/utility.cpp |  2 +-
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index 2daa03b6d..e65031538 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -11,7 +11,8 @@
 #include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
 #include "plssvm/detail/logging_without_performance_tracking.hpp"   // plssvm::detail::log_untracked
-#include "plssvm/detail/utility.hpp"                                // plssvm::detail::unreachable
+#include "plssvm/detail/string_utility.hpp"                         // plssvm::detail::as_lower_case
+#include "plssvm/detail/utility.hpp"                                // plssvm::detail::contains
 #include "plssvm/target_platforms.hpp"                              // plssvm::target_platform
 #include "plssvm/verbosity_levels.hpp"                              // plssvm::verbosity_level
 
@@ -51,11 +52,39 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
             });
             break;
         case execution_space::sycl:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
-                // TODO: use all available devices -> not that trivial
-                // TODO: handle target <- if provide queue -> managed?
-                devices.emplace_back(Kokkos::SYCL{});
-            });
+            PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(([&]() {
+                // all user provided sycl::queues must be in-order queues
+                ::sycl::property_list props{ ::sycl::property::queue::in_order{} };
+                static ::sycl::queue q;
+
+                for (const auto &platform : ::sycl::platform::get_platforms()) {
+                    for (const auto &device : platform.get_devices()) {
+                        // Note: Kokkos is IntelLLVM/DPC++/icpx only
+                        if (device.is_cpu() && target == target_platform::cpu) {
+                            q = ::sycl::queue{ device, props };
+                            devices.emplace_back(Kokkos::SYCL{ q });
+                        } else if (device.is_gpu()) {
+                            // the current device is a GPU
+                            // get vendor string and convert it to all lower case
+                            const std::string vendor_string = ::plssvm::detail::as_lower_case(device.get_info<::sycl::info::device::vendor>());
+                            // get platform name of current GPU device and convert it to all lower case
+                            const std::string platform_string = ::plssvm::detail::as_lower_case(platform.get_info<::sycl::info::platform::name>());
+
+                            // check vendor string and insert to correct target platform
+                            if (::plssvm::detail::contains(vendor_string, "nvidia") && target == target_platform::gpu_nvidia) {
+                                q = ::sycl::queue{ device, props };
+                                devices.emplace_back(Kokkos::SYCL{ q });
+                            } else if ((::plssvm::detail::contains(vendor_string, "amd") || ::plssvm::detail::contains(vendor_string, "advanced micro devices")) && target == target_platform::gpu_amd) {
+                                q = ::sycl::queue{ device, props };
+                                devices.emplace_back(Kokkos::SYCL{ q });
+                            } else if (::plssvm::detail::contains(vendor_string, "intel") && target == target_platform::gpu_intel) {
+                                q = ::sycl::queue{ device, props };
+                                devices.emplace_back(Kokkos::SYCL{ q });
+                            }
+                        }
+                    }
+                }
+            }));
             break;
         case execution_space::hpx:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX([&]() {
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index bd07e4fab..b922b9a1d 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -60,7 +60,7 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
                     std::unordered_set<target_platform> targets{};
                     for (const auto &platform : ::sycl::platform::get_platforms()) {
                         for (const auto &device : platform.get_devices()) {
-                            // Note: Kokkos is Intel LLVM/DPC++/icpx only -> we can use the specific implementation defined enum values
+                            // Note: Kokkos is Intel LLVM/DPC++/icpx only
                             if (device.is_cpu()) {
                                 targets.insert(target_platform::cpu);
                             } else if (device.is_gpu()) {

From c88d156edc97a8f8780b50c935c6ba0aee4f6e81 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 17:44:10 +0100
Subject: [PATCH 056/123] Use icpx specific compilation flags.

---
 src/plssvm/backends/Kokkos/CMakeLists.txt | 79 +++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
index d2bf6addf..271c32527 100644
--- a/src/plssvm/backends/Kokkos/CMakeLists.txt
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -36,6 +36,85 @@ set_local_and_parent(PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME plssvm-Kokkos)
 add_library(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} SHARED ${PLSSVM_KOKKOS_SOURCES})
 target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC Kokkos::kokkos)
 
+if (Kokkos_ENABLE_SYCL)
+    # set SYCL (icpx) specific compilation flags
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "IntelLLVM")
+        message(FATAL_ERROR "For Kokkos::SYCL to work, the compiler must be IntelLLVM, but is ${CMAKE_CXX_COMPILER}!")
+    endif ()
+
+    # set icpx specific compiler flags based on the provided PLSSVM_TARGET_PLATFORMS
+    set(PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "")
+    # cpu targets
+    if (DEFINED PLSSVM_CPU_TARGET_ARCHS)
+        # assemble -fsycl-targets
+        list(APPEND PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "spir64_x86_64")
+    endif ()
+    # nvidia targets
+    if (DEFINED PLSSVM_NVIDIA_TARGET_ARCHS)
+        # assemble -fsycl-targets
+        list(APPEND PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "nvptx64-nvidia-cuda")
+    endif ()
+    # amd targets
+    if (DEFINED PLSSVM_AMD_TARGET_ARCHS)
+        # assemble -fsycl-targets
+        list(APPEND PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "amdgcn-amd-amdhsa")
+        # add target specific flags for AOT -> must always be specified von amd targets
+        if (NOT PLSSVM_NUM_AMD_TARGET_ARCHS EQUAL 1)
+            message(SEND_ERROR "IntelLLVM currently only supports a single AMD architecture specification but ${PLSSVM_NUM_AMD_TARGET_ARCHS} were provided!")
+        endif ()
+        target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS})
+        target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS})
+    endif ()
+    # intel targets
+    if (DEFINED PLSSVM_INTEL_TARGET_ARCHS)
+        # assemble -fsycl-targets
+        list(APPEND PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "spir64_gen")
+    endif ()
+    # set -fsycl-targets
+    list(JOIN PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "," PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING)
+    target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl -fsycl-targets=${PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING})
+    target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -fsycl -fsycl-targets=${PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING})
+endif ()
+
+# add option for IntelLLVM Ahead-of-Time (AOT) compilation
+option(PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT "Enables Ahead-of-Time compilation for the Kokkos::SYCL execution space using IntelLLVM." ON)
+if (PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
+    message(STATUS "Enabled Ahead-of-Time (AOT) compilation for the Kokkos::SYCL execution space using IntelLLVM.")
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
+    target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
+    ## set AOT compiler flags
+    # cpu targets
+    if (DEFINED PLSSVM_CPU_TARGET_ARCHS)
+        # add target specific flags for AOT
+        if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1)
+            target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}")
+            target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}")
+        endif ()
+    endif ()
+    # nvidia targets
+    if (DEFINED PLSSVM_NVIDIA_TARGET_ARCHS)
+        # add target specific flags for AOT
+        if (NOT PLSSVM_NUM_NVIDIA_TARGET_ARCHS EQUAL 1)
+            message(SEND_ERROR "IntelLLVM currently only supports a single NVIDIA architecture specification for AOT but ${PLSSVM_NUM_NVIDIA_TARGET_ARCHS} were provided!")
+        endif ()
+        target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCHS})
+        target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCHS})
+    endif ()
+    # intel targets
+    if (DEFINED PLSSVM_INTEL_TARGET_ARCHS)
+        # add target specific flags for AOT
+        list(JOIN PLSSVM_INTEL_TARGET_ARCHS "," PLSSVM_INTEL_TARGET_ARCHS_STRING)
+        target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}")
+        target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}")
+    endif ()
+endif ()
+
+if (Kokkos_ENABLE_HWLOC)
+    message(STATUS "Kokkos was built with hwloc support.")
+else()
+    message(STATUS "Kokkos was NOT built with hwloc support.")
+endif()
+
 # link base library against Kokkos library
 target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC ${PLSSVM_BASE_LIBRARY_NAME})
 

From ff1bb2442578a3d4cf1dbac2ce2656f789b71849 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 17:59:42 +0100
Subject: [PATCH 057/123] Add return code checks.

---
 .../backends/Kokkos/detail/device_wrapper.cpp | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index e65031538..f79eb396f 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -9,6 +9,7 @@
 #include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"
 
 #include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
+#include "plssvm/backends/Kokkos/exceptions.hpp"                    // plssvm::kokkos::backend_exception
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
 #include "plssvm/detail/logging_without_performance_tracking.hpp"   // plssvm::detail::log_untracked
 #include "plssvm/detail/string_utility.hpp"                         // plssvm::detail::as_lower_case
@@ -18,6 +19,20 @@
 
 #include "Kokkos_Core.hpp"  // Kokkos::num_devices, Kokkos::ExecutionSpace
 
+#if defined(KOKKOS_ENABLE_CUDA)
+    #define PLSSVM_CUDA_ERROR_CHECK(err)                                                                                                            \
+        if ((err) != cudaSuccess) {                                                                                                                 \
+            throw plssvm::kokkos::backend_exception{ fmt::format("Kokkos::Cuda assert '{}': {}", cudaGetErrorName(err), cudaGetErrorString(err)) }; \
+        }
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+    #define PLSSVM_HIP_ERROR_CHECK(err)                                                                                                  \
+        if ((err) != hipSuccess) {                                                                                                       \
+            throw plssvm::kokkos::backend_exception{ fmt::format("HIP assert '{}': {}", hipGetErrorName(err), hipGetErrorString(err)) }; \
+        }
+#endif
+
 #include <vector>  // std::vector
 
 namespace plssvm::kokkos::detail {
@@ -29,9 +44,9 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
                 for (int device = 0; device < Kokkos::num_devices(); ++device) {
                     // create CUDA stream using the CUDA specific functions
-                    cudaSetDevice(device);
+                    PLSSVM_CUDA_ERROR_CHECK(cudaSetDevice(device));
                     cudaStream_t stream{};
-                    cudaStreamCreate(&stream);
+                    PLSSVM_CUDA_ERROR_CHECK(cudaStreamCreate(&stream));
                     // create Kokkos execution space for the specific device
                     // Note: it is important to pass the cudaStream_t lifetime to be managed by Kokkos
                     devices.emplace_back(Kokkos::Cuda(stream, Kokkos::Impl::ManageStream::yes));
@@ -42,9 +57,9 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
                 for (int device = 0; device < Kokkos::num_devices(); ++device) {
                     // HIP CUDA stream using the HIP specific functions
-                    hipSetDevice(device);
+                    PLSSVM_HIP_ERROR_CHECK(hipSetDevice(device));
                     hipStream_t stream{};
-                    hipStreamCreate(&stream);
+                    PLSSVM_HIP_ERROR_CHECK(hipStreamCreate(&stream));
                     // create Kokkos execution space for the specific device
                     // Note: it is important to pass the hipStream_t lifetime to be managed by Kokkos
                     devices.emplace_back(Kokkos::HIP(stream, Kokkos::Impl::ManageStream::yes));

From 0ab2f64319d735a36c61101ccbd9957ad73574e3 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 18:00:13 +0100
Subject: [PATCH 058/123] Move CMake functionality inside correct if.

---
 src/plssvm/backends/Kokkos/CMakeLists.txt | 68 +++++++++++------------
 1 file changed, 31 insertions(+), 37 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
index 271c32527..b66927fa9 100644
--- a/src/plssvm/backends/Kokkos/CMakeLists.txt
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -41,7 +41,7 @@ if (Kokkos_ENABLE_SYCL)
     if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "IntelLLVM")
         message(FATAL_ERROR "For Kokkos::SYCL to work, the compiler must be IntelLLVM, but is ${CMAKE_CXX_COMPILER}!")
     endif ()
-
+    
     # set icpx specific compiler flags based on the provided PLSSVM_TARGET_PLATFORMS
     set(PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "")
     # cpu targets
@@ -74,47 +74,41 @@ if (Kokkos_ENABLE_SYCL)
     list(JOIN PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "," PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING)
     target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl -fsycl-targets=${PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING})
     target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -fsycl -fsycl-targets=${PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING})
-endif ()
-
-# add option for IntelLLVM Ahead-of-Time (AOT) compilation
-option(PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT "Enables Ahead-of-Time compilation for the Kokkos::SYCL execution space using IntelLLVM." ON)
-if (PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
-    message(STATUS "Enabled Ahead-of-Time (AOT) compilation for the Kokkos::SYCL execution space using IntelLLVM.")
-    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
-    target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
-    ## set AOT compiler flags
-    # cpu targets
-    if (DEFINED PLSSVM_CPU_TARGET_ARCHS)
-        # add target specific flags for AOT
-        if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1)
-            target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}")
-            target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}")
+    
+    # add option for IntelLLVM Ahead-of-Time (AOT) compilation
+    option(PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT "Enables Ahead-of-Time compilation for the Kokkos::SYCL execution space using IntelLLVM." ON)
+    if (PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
+        message(STATUS "Enabled Ahead-of-Time (AOT) compilation for the Kokkos::SYCL execution space using IntelLLVM.")
+        target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
+        target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT)
+        ## set AOT compiler flags
+        # cpu targets
+        if (DEFINED PLSSVM_CPU_TARGET_ARCHS)
+            # add target specific flags for AOT
+            if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1)
+                target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}")
+                target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}")
+            endif ()
         endif ()
-    endif ()
-    # nvidia targets
-    if (DEFINED PLSSVM_NVIDIA_TARGET_ARCHS)
-        # add target specific flags for AOT
-        if (NOT PLSSVM_NUM_NVIDIA_TARGET_ARCHS EQUAL 1)
-            message(SEND_ERROR "IntelLLVM currently only supports a single NVIDIA architecture specification for AOT but ${PLSSVM_NUM_NVIDIA_TARGET_ARCHS} were provided!")
+        # nvidia targets
+        if (DEFINED PLSSVM_NVIDIA_TARGET_ARCHS)
+            # add target specific flags for AOT
+            if (NOT PLSSVM_NUM_NVIDIA_TARGET_ARCHS EQUAL 1)
+                message(SEND_ERROR "IntelLLVM currently only supports a single NVIDIA architecture specification for AOT but ${PLSSVM_NUM_NVIDIA_TARGET_ARCHS} were provided!")
+            endif ()
+            target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCHS})
+            target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCHS})
+        endif ()
+        # intel targets
+        if (DEFINED PLSSVM_INTEL_TARGET_ARCHS)
+            # add target specific flags for AOT
+            list(JOIN PLSSVM_INTEL_TARGET_ARCHS "," PLSSVM_INTEL_TARGET_ARCHS_STRING)
+            target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}")
+            target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}")
         endif ()
-        target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCHS})
-        target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCHS})
-    endif ()
-    # intel targets
-    if (DEFINED PLSSVM_INTEL_TARGET_ARCHS)
-        # add target specific flags for AOT
-        list(JOIN PLSSVM_INTEL_TARGET_ARCHS "," PLSSVM_INTEL_TARGET_ARCHS_STRING)
-        target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}")
-        target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}")
     endif ()
 endif ()
 
-if (Kokkos_ENABLE_HWLOC)
-    message(STATUS "Kokkos was built with hwloc support.")
-else()
-    message(STATUS "Kokkos was NOT built with hwloc support.")
-endif()
-
 # link base library against Kokkos library
 target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC ${PLSSVM_BASE_LIBRARY_NAME})
 

From 9b7736e635cf4ca7c7892b22b680fc7365923d72 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 6 Nov 2024 18:50:18 +0100
Subject: [PATCH 059/123] Remove old test implementation.

---
 .../backends/Kokkos/detail/device_wrapper.cpp       | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index f79eb396f..eb82ec0d4 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -70,14 +70,12 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(([&]() {
                 // all user provided sycl::queues must be in-order queues
                 ::sycl::property_list props{ ::sycl::property::queue::in_order{} };
-                static ::sycl::queue q;
 
                 for (const auto &platform : ::sycl::platform::get_platforms()) {
                     for (const auto &device : platform.get_devices()) {
                         // Note: Kokkos is IntelLLVM/DPC++/icpx only
                         if (device.is_cpu() && target == target_platform::cpu) {
-                            q = ::sycl::queue{ device, props };
-                            devices.emplace_back(Kokkos::SYCL{ q });
+                            devices.emplace_back(Kokkos::SYCL{ ::sycl::queue{ device, props } });
                         } else if (device.is_gpu()) {
                             // the current device is a GPU
                             // get vendor string and convert it to all lower case
@@ -87,14 +85,11 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
 
                             // check vendor string and insert to correct target platform
                             if (::plssvm::detail::contains(vendor_string, "nvidia") && target == target_platform::gpu_nvidia) {
-                                q = ::sycl::queue{ device, props };
-                                devices.emplace_back(Kokkos::SYCL{ q });
+                                devices.emplace_back(Kokkos::SYCL{ ::sycl::queue{ device, props } });
                             } else if ((::plssvm::detail::contains(vendor_string, "amd") || ::plssvm::detail::contains(vendor_string, "advanced micro devices")) && target == target_platform::gpu_amd) {
-                                q = ::sycl::queue{ device, props };
-                                devices.emplace_back(Kokkos::SYCL{ q });
+                                devices.emplace_back(Kokkos::SYCL{ ::sycl::queue{ device, props } });
                             } else if (::plssvm::detail::contains(vendor_string, "intel") && target == target_platform::gpu_intel) {
-                                q = ::sycl::queue{ device, props };
-                                devices.emplace_back(Kokkos::SYCL{ q });
+                                devices.emplace_back(Kokkos::SYCL{ ::sycl::queue{ device, props } });
                             }
                         }
                     }

From a1d13d9333a997eaea955360cc31fe40fbd6b40b Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 11:05:46 +0100
Subject: [PATCH 060/123] Fix TODO in documentation.

---
 include/plssvm/backends/Kokkos/detail/pinned_memory.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp b/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp
index dffb0d1c7..cb328e6d3 100644
--- a/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp
+++ b/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp
@@ -6,7 +6,7 @@
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  *          See the LICENSE.md file in the project root for full license information.
  *
- * @brief Small wrapper around RAII enabled TODO.
+ * @brief Small wrapper around RAII for registering memory as pinned memory.
  */
 
 #ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_PINNED_MEMORY_HPP_

From a0af87c328998c9a755adf3e483c31893d378f52 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 11:46:02 +0100
Subject: [PATCH 061/123] Correctly use the Kokkos::Experimental::OpenMPTarget
 namespace.

---
 .../backends/Kokkos/detail/conditional_execution.hpp      | 2 +-
 include/plssvm/backends/Kokkos/execution_space.hpp        | 8 ++++----
 src/plssvm/backends/Kokkos/detail/device_wrapper.cpp      | 2 +-
 tests/backends/Kokkos/execution_space.cpp                 | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
index 20fc118f6..f981fb2bd 100644
--- a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
+++ b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
@@ -140,7 +140,7 @@ namespace plssvm::kokkos::detail {
 #endif
 
 //***************************************************//
-//               Kokkos::OpenMPTarget                //
+//        Kokkos::Experimental::OpenMPTarget         //
 //***************************************************//
 
 /**
diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
index 07ecadf24..abf8b227d 100644
--- a/include/plssvm/backends/Kokkos/execution_space.hpp
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -126,11 +126,11 @@ struct execution_space_to_kokkos_type<execution_space::openmp> {
 
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
 /**
- * @brief Convert an `execution_space::openmp_target` enum value to a `Kokkos::OpenMPTarget` Kokkos::ExecutionSpace type.
+ * @brief Convert an `execution_space::openmp_target` enum value to a `Kokkos::Experimental::OpenMPTarget` Kokkos::ExecutionSpace type.
  */
 template <>
 struct execution_space_to_kokkos_type<execution_space::openmp_target> {
-    using type = Kokkos::OpenMPTarget;
+    using type = Kokkos::Experimental::OpenMPTarget;
 };
 #endif
 
@@ -233,10 +233,10 @@ struct kokkos_type_to_execution_space<Kokkos::OpenMP> {
 
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
 /**
- * @brief Convert a `Kokkos::OpenMPTarget` Kokkos::ExecutionSpace type to an `execution_space::openmp_target` enum value.
+ * @brief Convert a `Kokkos::Experimental::OpenMPTarget` Kokkos::ExecutionSpace type to an `execution_space::openmp_target` enum value.
  */
 template <>
-struct kokkos_type_to_execution_space<Kokkos::OpenMPTarget> {
+struct kokkos_type_to_execution_space<Kokkos::Experimental::OpenMPTarget> {
     constexpr static execution_space value = execution_space::openmp_target;
 };
 #endif
diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index eb82ec0d4..a020454ef 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -117,7 +117,7 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
         case execution_space::openmp_target:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET([&]() {
                 // TODO: multi-GPU?
-                devices.emplace_back(Kokkos::OpenMPTarget{});
+                devices.emplace_back(Kokkos::Experimental::OpenMPTarget{});
             });
             break;
         case execution_space::openacc:
diff --git a/tests/backends/Kokkos/execution_space.cpp b/tests/backends/Kokkos/execution_space.cpp
index 2073d1fd4..24c3135de 100644
--- a/tests/backends/Kokkos/execution_space.cpp
+++ b/tests/backends/Kokkos/execution_space.cpp
@@ -86,7 +86,7 @@ TEST(KokkosExecutionSpace, execution_space_to_kokkos_type) {
     ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp>, Kokkos::OpenMP>();
 #endif
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp_target>, Kokkos::OpenMPTarget>();
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp_target>, Kokkos::Experimental::OpenMPTarget>();
 #endif
 #if defined(KOKKOS_ENABLE_OPENACC)
     ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openacc>, Kokkos::OpenACC>();
@@ -117,7 +117,7 @@ TEST(KokkosExecutionSpace, kokkos_type_to_execution_space) {
     EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::OpenMP>, plssvm::kokkos::execution_space::openmp);
 #endif
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::OpenMPTarget>, plssvm::kokkos::execution_space::openmp_target);
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::OpenMPTarget>, plssvm::kokkos::execution_space::openmp_target);
 #endif
 #if defined(KOKKOS_ENABLE_OPENACC)
     EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::OpenACC>, plssvm::kokkos::execution_space::openacc);

From 39e447cdac14361f2d4d662906692ffc268df875 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 13:44:59 +0100
Subject: [PATCH 062/123] Add support for more Kokkos execution spaces. NOTE:
 OpenMPTarget and OpenACC are currently NOT supported.

---
 src/plssvm/backends/Kokkos/csvm.cpp           | 164 +++++++++++-------
 .../backends/Kokkos/detail/device_wrapper.cpp |   4 +-
 .../backends/Kokkos/detail/pinned_memory.cpp  |   2 -
 src/plssvm/backends/Kokkos/detail/utility.cpp |   3 -
 4 files changed, 107 insertions(+), 66 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 806e908c0..28823fd83 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -12,9 +12,9 @@
 #include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"                    // PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_*, PLSSVM_KOKKOS_BACKEND_INVOKE_IF_
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"                               // plssvm::kokkos::detail::device_ptr
 #include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"                           // plssvm::kokkos::detail::{device_wrapper, get_device_list}
-#include "plssvm/backends/Kokkos/detail/utility.hpp"                                  // plssvm::kokkos::detail::get_runtime_version // TODO: docu
+#include "plssvm/backends/Kokkos/detail/utility.hpp"                                  // plssvm::kokkos::detail::{available_target_platform_to_execution_space_mapping, get_kokkos_version, dim_type_to_native, get_device_name, device_synchronize}
 #include "plssvm/backends/Kokkos/exceptions.hpp"                                      // plssvm::kokkos::backend_exception
-#include "plssvm/backends/Kokkos/execution_space.hpp"                                 // plssvm::kokkos::execution_space
+#include "plssvm/backends/Kokkos/execution_space.hpp"                                 // plssvm::kokkos::{execution_space, list_available_execution_spaces}
 #include "plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp"                         // plssvm::kokkos::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale}
 #include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp"       // plssvm::kokkos::detail::device_kernel_assembly
 #include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp"  // plssvm::kokkos::detail::device_kernel_assembly_symm
@@ -33,7 +33,8 @@
 #include "plssvm/target_platforms.hpp"                                                // plssvm::target_platform
 #include "plssvm/verbosity_levels.hpp"                                                // plssvm::verbosity_level
 
-#include "Kokkos_Core.hpp"  // TODO: docu
+#include "Kokkos_Core.hpp"  // Kokkos::TeamPolicy, Kokkos::ParallelForTag, Kokkos::parallel_for, Kokkos::PerTeam
+                            // Kokkos::Experimental::HPX::impl_max_hardware_threads, Kokkos::OpenMP::impl_max_hardware_threads, Kokkos::Threads::impl_max_hardware_threads
 
 #include "fmt/core.h"    // fmt::format
 #include "fmt/format.h"  // fmt::format
@@ -42,11 +43,19 @@
 #include <cstddef>    // std::size_t
 #include <exception>  // std::terminate
 #include <iostream>   // std::cout, std::endl
+#include <limits>     // std::numeric_limits::max
 #include <map>        // std::map
 #include <string>     // std::string
 #include <utility>    // std::move
 #include <vector>     // std::vector
 
+// a dummy class used as functor to the team_size_max function
+template <typename ExecutionSpace>
+struct dummy {
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &) const { }
+};
+
 namespace plssvm::kokkos {
 
 csvm::csvm(parameter params) :
@@ -109,6 +118,11 @@ void csvm::init(const target_platform target) {
         }
     }
 
+    // Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC currently not supported!
+    if (space_ == execution_space::openmp_target || space_ == execution_space::openacc) {
+        throw backend_exception{ fmt::format("The Kokkos execution space {} is currently not supported!", space_) };
+    }
+
     plssvm::detail::log(verbosity_level::full,
                         "\nUsing Kokkos ({}) as backend with the Kokkos::ExecutionSpace \"{}\".\n",
                         plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_version", detail::get_kokkos_version() },
@@ -163,129 +177,162 @@ csvm::~csvm() {
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
-    // TODO: implement for other execution spaces
-    std::vector<::plssvm::detail::memory_size> res(this->num_available_devices());
+    std::vector<::plssvm::detail::memory_size> device_memory(this->num_available_devices());
     switch (space_) {
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::cuda>().cuda_device_prop().totalGlobalMem) };
+                    device_memory[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::cuda>().cuda_device_prop().totalGlobalMem) };
                 }
             });
             break;
         case execution_space::hip:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::hip>().hip_device_prop().totalGlobalMem) };
+                    device_memory[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::hip>().hip_device_prop().totalGlobalMem) };
                 }
             });
             break;
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::global_mem_size>()) };
+                    device_memory[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::global_mem_size>()) };
                 }
             });
             break;
-        case execution_space::openmp:
         case execution_space::hpx:
+        case execution_space::openmp:
         case execution_space::threads:
         case execution_space::serial:
-            return std::vector<::plssvm::detail::memory_size>(this->num_available_devices(), ::plssvm::detail::get_system_memory());
+            // NOTE: for these execution spaces, this->num_available_devices will always return 1
+            PLSSVM_ASSERT(this->num_available_devices() == 1, "The host side Kokkos execution spaces should always only be represented using a single device!");
+            device_memory[0] = ::plssvm::detail::get_system_memory();
+            break;
+        // TODO: implement for Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC
         case execution_space::openmp_target:
         case execution_space::openacc:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
-    return res;
+    return device_memory;
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const {
-    [[maybe_unused]] std::vector<::plssvm::detail::memory_size> res(this->num_available_devices());
-    // TODO: implement for other execution spaces
+    std::vector<::plssvm::detail::memory_size> max_mem_alloc_size(this->num_available_devices());
     switch (space_) {
         case execution_space::cuda:
         case execution_space::hip:
-            return this->get_device_memory();
+            max_mem_alloc_size = this->get_device_memory();
+            break;
         case execution_space::sycl:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
-                    res[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::max_mem_alloc_size>()) };
+                    max_mem_alloc_size[device_id] = ::plssvm::detail::memory_size{ static_cast<unsigned long long>(devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::max_mem_alloc_size>()) };
                 }
             });
             break;
-        case execution_space::openmp:
         case execution_space::hpx:
+        case execution_space::openmp:
         case execution_space::threads:
         case execution_space::serial:
-            return this->get_device_memory();
+            max_mem_alloc_size = this->get_device_memory();
+            break;
+        // TODO: implement for Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC
         case execution_space::openmp_target:
         case execution_space::openacc:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
-    return res;
+    return max_mem_alloc_size;
 }
 
 std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
     PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
 
-    // TODO: implement for other execution spaces
-    switch (space_) {
-        case execution_space::cuda:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA([&]() {
-                return static_cast<std::size_t>(devices_[device_id].get<execution_space::cuda>().cuda_device_prop().maxThreadsPerBlock);
-            });
-        case execution_space::hip:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP([&]() {
-                return static_cast<std::size_t>(devices_[device_id].get<execution_space::hip>().hip_device_prop().maxThreadsPerBlock);
-            });
-        case execution_space::sycl:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL([&]() {
-                return devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::info::device::max_work_group_size>();
-            });
-        case execution_space::openmp:
-            return 16;  // TODO: most likely dependent on the number of cores in Kokkos...
-        case execution_space::serial:
-            // only one thread allowed in serial execution
-            return 1;
-        case execution_space::openmp_target:
-        case execution_space::openacc:
-        case execution_space::hpx:
-        case execution_space::threads:
-            throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
-    }
-    // all possible cases should be handled by the previous switch
-    // -> silence missing return statement compiler warnings due to throw statement
-    ::plssvm::detail::unreachable();
+    // NOTE: the maximum theoretical work-group size, may be additionally limited by the amount of used scratch memory
+    return devices_[device_id].execute_and_return([](const auto &device) {
+        using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t<decltype(device)>;
+        // NOTE: CUDA + HIP + SYCL: returns the maximum possible number of threads, due to no further limitations in the dummy functor (like, e.g., scratch memory)
+        // NOTE: HPX + Serial: hardcoded to 1
+        // NOTE: OpenMP: should be 1-2; most likely 1
+        // NOTE: Threads: should be equal to number of hardware threads IF hwloc is enabled; otherwise 1
+        // NOTE: OpenMPTarget: hardcoded to 256
+        // NOTE: OpenACC: hardcoded to 512
+
+        // NOTE: the functor types doesn't matter -> the dummy class
+        return Kokkos::TeamPolicy<kokkos_execution_space_type>{}.team_size_max(dummy<kokkos_execution_space_type>{}, Kokkos::ParallelForTag{});
+    });
 }
 
-::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) const {
+::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::size_t device_id) const {
     PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
 
     // NOTE: Kokkos only supports one-dimensional execution ranges!
     // NOTE: we only use two-dimensional kernels!
-    // TODO: implement for other execution spaces
     switch (space_) {
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(([&]() -> ::plssvm::detail::dim_type {
                 const cudaDeviceProp &prop = devices_[device_id].get<execution_space::cuda>().cuda_device_prop();
-                const auto max_grid_size = static_cast<std::size_t>(std::sqrt(prop.maxGridSize[0]));
-                return { max_grid_size, max_grid_size, std::size_t{ 1 } };
+                const auto max_grid_size = static_cast<unsigned long long>(std::sqrt(prop.maxGridSize[0]));
+                return { max_grid_size, max_grid_size, 1ull };
             }));
         case execution_space::hip:
             PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP(([&]() -> ::plssvm::detail::dim_type {
-                const hipDeviceProp &prop = devices_[device_id].get<execution_space::hip>().hip_device_prop();
-                const auto max_grid_size = static_cast<std::size_t>(std::sqrt(prop.maxGridSize[0]));
-                return { max_grid_size, max_grid_size, std::size_t{ 1 } };
+                const hipDeviceProp_t &prop = devices_[device_id].get<execution_space::hip>().hip_device_prop();
+                const auto max_grid_size = static_cast<unsigned long long>(std::sqrt(prop.maxGridSize[0]));
+                return { max_grid_size, max_grid_size, 1ull };
+            }));
+        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL(([&]() -> ::plssvm::detail::dim_type {
+            // TODO: replace with standardized function if there will be one in the future
+#if defined(SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY)
+                const ::sycl::id<3> native_range = devices_[device_id].get<execution_space::sycl>().sycl_queue().get_device().get_info<::sycl::ext::oneapi::experimental::info::device::max_work_groups<3>>();
+#else
+                // fallback to maximum theoretical value, may break at runtime!
+                ::sycl::id<3> native_range{};
+                const std::size_t max_int32 = std::numeric_limits<std::int32_t>::max();
+                const std::size_t max_uint16 = std::numeric_limits<std::uint16_t>::max();
+                if (target_ == target_platform::cpu) {
+                    native_range = ::sycl::id<3>{ max_int32, max_int32, max_int32 };
+                } else {
+                    native_range = ::sycl::id<3>{ max_int32, max_uint16, max_uint16 };
+                }
+#endif
+                // note: account for SYCL's different iteration range!
+                return { native_range[2], native_range[1], native_range[0] };
+            }));
+        case execution_space::hpx:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX(([&]() -> ::plssvm::detail::dim_type {
+                // get the total number of threads
+                const std::size_t num_threads = Kokkos::Experimental::HPX::impl_max_hardware_threads();
+                // set the maximum league size to twice the number of available hardware threads
+                // NOTE: this is just an estimate and can or should be changed depending on the performance
+                const auto league_size = static_cast<unsigned long long>(std::ceil(std::sqrt(num_threads * 2)));
+                return { league_size, league_size, 1ull };
             }));
         case execution_space::openmp:
-            return { 16, 16, 1 };  // TODO: correct values
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP(([&]() -> ::plssvm::detail::dim_type {
+                // get the total number of threads
+                const std::size_t num_threads = Kokkos::OpenMP::impl_max_hardware_threads();
+                // set the maximum league size to twice the number of available hardware threads
+                // NOTE: this is just an estimate and can or should be changed depending on the performance
+                const auto league_size = static_cast<unsigned long long>(std::ceil(std::sqrt(num_threads * 2)));
+                return { league_size, league_size, 1ull };
+            }));
+        case execution_space::threads:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS(([&]() -> ::plssvm::detail::dim_type {
+                // get the total number of threads
+                const std::size_t num_threads = Kokkos::Threads::impl_max_hardware_threads();
+                // set the maximum league size to twice the number of available hardware threads
+                // NOTE: this is just an estimate and can or should be changed depending on the performance
+                const auto league_size = static_cast<unsigned long long>(std::ceil(std::sqrt(num_threads * 2)));
+                return { league_size, league_size, 1ull };
+            }));
         case execution_space::serial:
-            return { 1, 1, 1 };  // TODO: correct values
-        case execution_space::sycl:
+            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL(([&]() -> ::plssvm::detail::dim_type {
+                return { std::numeric_limits<int>::max(), std::numeric_limits<int>::max(), 1ull };
+            }));
+        // TODO: implement for Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC
         case execution_space::openmp_target:
         case execution_space::openacc:
-        case execution_space::hpx:
-        case execution_space::threads:
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
     // all possible cases should be handled by the previous switch
@@ -328,7 +375,6 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons
 
             // create a Kokkos TeamPolicy
             Kokkos::TeamPolicy<kokkos_execution_space_type> team_policy{ device, native_partial_grid, team_size };
-            // TODO: test MDRangeTeamPolicy?!
 
             switch (params.kernel_type) {
                 case kernel_function_type::linear:
diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index a020454ef..6d1c950dd 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -116,13 +116,13 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
             break;
         case execution_space::openmp_target:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET([&]() {
-                // TODO: multi-GPU?
+                // TODO: implement multi-GPU support?
                 devices.emplace_back(Kokkos::Experimental::OpenMPTarget{});
             });
             break;
         case execution_space::openacc:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC([&]() {
-                // TODO: multi-GPU?
+                // TODO: implement multi-GPU support?
                 devices.emplace_back(Kokkos::OpenACC{});
             });
             break;
diff --git a/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp b/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp
index dfae19661..919cbdaa1 100644
--- a/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp
+++ b/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp
@@ -40,8 +40,6 @@ pinned_memory<T>::~pinned_memory() {
     }
 }
 
-// TODO: check if implementable via Kokkos?
-
 template class pinned_memory<float>;
 template class pinned_memory<double>;
 
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index b922b9a1d..45392e509 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -36,7 +36,6 @@ int dim_type_to_native(const ::plssvm::detail::dim_type &dims) {
 std::map<target_platform, std::vector<execution_space>> available_target_platform_to_execution_space_mapping() {
     std::map<target_platform, std::vector<execution_space>> available_map{};
 
-    // TODO: only return really POSSIBLE target platforms?
     // iterate over all available execution spaces
     for (const execution_space space : list_available_execution_spaces()) {
         switch (space) {
@@ -139,10 +138,8 @@ std::string get_device_name([[maybe_unused]] const device_wrapper &dev) {
         case execution_space::openmp:
             return "OpenMP CPU host device";
         case execution_space::openmp_target:
-            // TODO: device name?
             return "OpenMP target device";
         case execution_space::openacc:
-            // TODO: device name?
             return "OpenACC target device";
         case execution_space::threads:
             return "std::threads CPU host device";

From 63d38144d44651eaf7728ea8754d59a750d51970 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 13:54:27 +0100
Subject: [PATCH 063/123] Correctly use Kokkos::Experimental::OpenACC instead
 of Kokkos::OpenACC.

---
 include/plssvm/backends/Kokkos/detail/conditional_execution.hpp | 2 +-
 src/plssvm/backends/Kokkos/detail/device_wrapper.cpp            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
index f981fb2bd..559c9e75c 100644
--- a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
+++ b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp
@@ -163,7 +163,7 @@ namespace plssvm::kokkos::detail {
 #endif
 
 //***************************************************//
-//                  Kokkos::OpenACC                  //
+//           Kokkos::Experimental::OpenACC           //
 //***************************************************//
 
 /**
diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index 6d1c950dd..bfd79d9d2 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -123,7 +123,7 @@ std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe
         case execution_space::openacc:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC([&]() {
                 // TODO: implement multi-GPU support?
-                devices.emplace_back(Kokkos::OpenACC{});
+                devices.emplace_back(Kokkos::Experimental::OpenACC{});
             });
             break;
         case execution_space::threads:

From b673f2834acac44a6469ed742e4ebba75d4c7718 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 14:18:25 +0100
Subject: [PATCH 064/123] Split execution_space header in multiple headers such
 that the Kokkos_Core.hpp header must not be included in execution_space.hpp.

---
 .../constexpr_available_execution_spaces.hpp  |  65 +++++
 .../Kokkos/detail/device_view_wrapper.hpp     |   8 +-
 .../backends/Kokkos/detail/device_wrapper.hpp |   6 +-
 .../backends/Kokkos/execution_space.hpp       | 264 ------------------
 .../Kokkos/execution_space_type_traits.hpp    | 238 ++++++++++++++++
 .../backends/Kokkos/execution_space.cpp       |   3 +-
 tests/backends/Kokkos/CMakeLists.txt          |   2 +
 .../constexpr_available_execution_spaces.cpp  |  18 ++
 tests/backends/Kokkos/execution_space.cpp     |  70 +----
 .../Kokkos/execution_space_type_traits.cpp    |  75 +++++
 10 files changed, 410 insertions(+), 339 deletions(-)
 create mode 100644 include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
 create mode 100644 include/plssvm/backends/Kokkos/execution_space_type_traits.hpp
 create mode 100644 tests/backends/Kokkos/detail/constexpr_available_execution_spaces.cpp
 create mode 100644 tests/backends/Kokkos/execution_space_type_traits.cpp

diff --git a/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
new file mode 100644
index 000000000..5d964f66a
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
@@ -0,0 +1,65 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Function to list all available execution spaces at compile time.
+ * @note Must be a separate file such that the Kokkos header must not be included in the "execution_space.hpp" file.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_
+
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
+
+#include "Kokkos_Core.hpp"  // Kokkos macros, Kokkos ExecutionSpace types
+
+#include <array>  // std::array
+
+namespace plssvm::kokkos::detail {
+
+/**
+ * @brief List all available Kokkos::ExecutionSpaces at compile time.
+ * @details At least one execution space must **always** be available!
+ * @return a `std::array` containing all available execution spaces (`[[nodiscard]]`)
+ */
+[[nodiscard]] inline constexpr auto constexpr_available_execution_spaces() noexcept {
+    // Note: the trailing comma is explicitly allowed by the standard
+    // Note: the order is intentionally chosen this way -> the order of the entries determines the priority when using a backend to run our code
+    return std::array{
+#if defined(KOKKOS_ENABLE_CUDA)
+        execution_space::cuda,
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+        execution_space::hip,
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+        execution_space::sycl,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+        execution_space::openmp_target,
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+        execution_space::openacc,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+        execution_space::openmp,
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+        execution_space::threads,
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+        execution_space::hpx,
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+        execution_space::serial,
+#endif
+    };
+}
+
+}  // namespace plssvm::kokkos::detail
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_
diff --git a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
index a3019829e..ea60bb1fd 100644
--- a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp
@@ -12,9 +12,11 @@
 #ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_
 
-#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
-#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::{execution_space, execution_space_to_kokkos_type_t}, plssvm::kokkos::detail::constexpr_available_execution_spaces
-#include "plssvm/detail/type_traits.hpp"                     // plssvm::detail::remove_cvref_t
+#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp"  // plssvm::kokkos::detail::constexpr_available_execution_spaces
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"                        // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/execution_space.hpp"                              // plssvm::kokkos::execution_space
+#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp"                  // plssvm::kokkos::execution_space_to_kokkos_type_t
+#include "plssvm/detail/type_traits.hpp"                                           // plssvm::detail::remove_cvref_t
 
 #include "Kokkos_Core.hpp"  // Kokkos::View, Kokkos::ExecutionSpace
 
diff --git a/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp
index 30b2a91be..da0aaf755 100644
--- a/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp
+++ b/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp
@@ -12,8 +12,10 @@
 #ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_WRAPPER_HPP_
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_WRAPPER_HPP_
 
-#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::{execution_space, execution_space_to_kokkos_type_t}, plssvm::kokkos::detail::constexpr_available_execution_spaces
-#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
+#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp"  // plssvm::kokkos::detail::constexpr_available_execution_spaces
+#include "plssvm/backends/Kokkos/execution_space.hpp"                              // plssvm::kokkos::execution_space
+#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp"                  // plssvm::kokkos::execution_space_to_kokkos_type_t
+#include "plssvm/target_platforms.hpp"                                             // plssvm::target_platform
 
 #include <array>       // std::array
 #include <cstddef>     // std::size_t
diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
index abf8b227d..d77ae845b 100644
--- a/include/plssvm/backends/Kokkos/execution_space.hpp
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -13,12 +13,9 @@
 #define PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_
 #pragma once
 
-#include "Kokkos_Core.hpp"  // Kokkos macros, Kokkos ExecutionSpace types
-
 #include "fmt/base.h"     // fmt::formatter
 #include "fmt/ostream.h"  // fmt::ostream_formatter
 
-#include <array>   // std::array
 #include <iosfwd>  // std::ostream forward declaration
 #include <vector>  // std::vector
 
@@ -64,267 +61,6 @@ std::ostream &operator<<(std::ostream &out, execution_space space);
  */
 std::istream &operator>>(std::istream &in, execution_space &space);
 
-//***************************************************//
-//           execution_space_to_kokkos_type          //
-//***************************************************//
-
-/**
- * @brief Uninstantiated base type to convert an `execution_space` enum value to a Kokkos::ExecutionSpace type.
- */
-template <execution_space>
-struct execution_space_to_kokkos_type;
-
-#if defined(KOKKOS_ENABLE_CUDA)
-/**
- * @brief Convert an `execution_space::cuda` enum value to a `Kokkos::Cuda` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::cuda> {
-    using type = Kokkos::Cuda;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_HIP)
-/**
- * @brief Convert an `execution_space::hip` enum value to a `Kokkos::HIP` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::hip> {
-    using type = Kokkos::HIP;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_SYCL)
-/**
- * @brief Convert an `execution_space::sycl` enum value to a `Kokkos::SYCL` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::sycl> {
-    using type = Kokkos::SYCL;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_HPX)
-/**
- * @brief Convert an `execution_space::hpx` enum value to a `Kokkos::Experimental::HPX` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::hpx> {
-    using type = Kokkos::Experimental::HPX;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_OPENMP)
-/**
- * @brief Convert an `execution_space::openmp` enum value to a `Kokkos::OpenMP` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::openmp> {
-    using type = Kokkos::OpenMP;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-/**
- * @brief Convert an `execution_space::openmp_target` enum value to a `Kokkos::Experimental::OpenMPTarget` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::openmp_target> {
-    using type = Kokkos::Experimental::OpenMPTarget;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_OPENACC)
-/**
- * @brief Convert an `execution_space::openacc` enum value to a `Kokkos::Experimental::OpenACC` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::openacc> {
-    using type = Kokkos::Experimental::OpenACC;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_THREADS)
-/**
- * @brief Convert an `execution_space::threads` enum value to a `Kokkos::Threads` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::threads> {
-    using type = Kokkos::Threads;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_SERIAL)
-/**
- * @brief Convert an `execution_space::serial` enum value to a `Kokkos::Serial` Kokkos::ExecutionSpace type.
- */
-template <>
-struct execution_space_to_kokkos_type<execution_space::serial> {
-    using type = Kokkos::Serial;
-};
-#endif
-
-/**
- * @brief Convert the `execution_space` @p space to the corresponding Kokkos::ExecutionSpace type.
- * @tparam space the enum value to convert
- */
-template <execution_space space>
-using execution_space_to_kokkos_type_t = typename execution_space_to_kokkos_type<space>::type;
-
-//***************************************************//
-//           kokkos_type_to_execution_space          //
-//***************************************************//
-
-/**
- * @brief Uninstantiated base type to convert a Kokkos::ExecutionSpace type to a `execution_space` enum value.
- */
-template <typename>
-struct kokkos_type_to_execution_space;
-
-#if defined(KOKKOS_ENABLE_CUDA)
-/**
- * @brief Convert a `Kokkos::Cuda` Kokkos::ExecutionSpace type to an `execution_space::cuda` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::Cuda> {
-    constexpr static execution_space value = execution_space::cuda;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_HIP)
-/**
- * @brief Convert a `Kokkos::HIP` Kokkos::ExecutionSpace type to an `execution_space::hip` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::HIP> {
-    constexpr static execution_space value = execution_space::hip;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_SYCL)
-/**
- * @brief Convert a `Kokkos::SYCL` Kokkos::ExecutionSpace type to an `execution_space::sycl` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::SYCL> {
-    constexpr static execution_space value = execution_space::sycl;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_HPX)
-/**
- * @brief Convert a `Kokkos::Experimental::HPX` Kokkos::ExecutionSpace type to an `execution_space::hpx` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::Experimental::HPX> {
-    constexpr static execution_space value = execution_space::hpx;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_OPENMP)
-/**
- * @brief Convert a `Kokkos::OpenMP` Kokkos::ExecutionSpace type to an `execution_space::openmp` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::OpenMP> {
-    constexpr static execution_space value = execution_space::openmp;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-/**
- * @brief Convert a `Kokkos::Experimental::OpenMPTarget` Kokkos::ExecutionSpace type to an `execution_space::openmp_target` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::Experimental::OpenMPTarget> {
-    constexpr static execution_space value = execution_space::openmp_target;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_OPENACC)
-/**
- * @brief Convert a `Kokkos::Experimental::OpenACC` Kokkos::ExecutionSpace type to an `execution_space::openacc` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::Experimental::OpenACC> {
-    constexpr static execution_space value = execution_space::openacc;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_THREADS)
-/**
- * @brief Convert a `Kokkos::Threads` Kokkos::ExecutionSpace type to an `execution_space::threads` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::Threads> {
-    constexpr static execution_space value = execution_space::threads;
-};
-#endif
-
-#if defined(KOKKOS_ENABLE_SERIAL)
-/**
- * @brief Convert a `Kokkos::Serial` Kokkos::ExecutionSpace type to an `execution_space::serial` enum value.
- */
-template <>
-struct kokkos_type_to_execution_space<Kokkos::Serial> {
-    constexpr static execution_space value = execution_space::serial;
-};
-#endif
-
-/**
- * @brief Convert the Kokkos::ExecutionSpace type @p ExecutionSpace to the corresponding `execution_space` enum value.
- * @tparam ExecutionSpace the Kokkos::ExecutionSpace type to convert
- */
-template <typename ExecutionSpace>
-inline constexpr execution_space kokkos_type_to_execution_space_v = kokkos_type_to_execution_space<ExecutionSpace>::value;
-
-//***************************************************//
-//                  other functions                  //
-//***************************************************//
-
-namespace detail {
-
-/**
- * @brief List all available Kokkos::ExecutionSpaces at compile time.
- * @details At least one execution space must **always** be available!
- * @return a `std::array` containing all available execution spaces (`[[nodiscard]]`)
- */
-[[nodiscard]] inline constexpr auto constexpr_available_execution_spaces() noexcept {
-    // Note: the trailing comma is explicitly allowed by the standard
-    // Note: the order is intentionally chosen this way -> the order of the entries determines the priority when using a backend to run our code
-    return std::array{
-#if defined(KOKKOS_ENABLE_CUDA)
-        execution_space::cuda,
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-        execution_space::hip,
-#endif
-#if defined(KOKKOS_ENABLE_SYCL)
-        execution_space::sycl,
-#endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-        execution_space::openmp_target,
-#endif
-#if defined(KOKKOS_ENABLE_OPENACC)
-        execution_space::openacc,
-#endif
-#if defined(KOKKOS_ENABLE_OPENMP)
-        execution_space::openmp,
-#endif
-#if defined(KOKKOS_ENABLE_THREADS)
-        execution_space::threads,
-#endif
-#if defined(KOKKOS_ENABLE_HPX)
-        execution_space::hpx,
-#endif
-#if defined(KOKKOS_ENABLE_SERIAL)
-        execution_space::serial,
-#endif
-    };
-}
-
-}  // namespace detail
-
 /**
  * @brief List all available Kokkos::ExecutionSpaces.
  * @details Only Kokkos::ExecutionSpaces that where enabled during the CMake configuration are available.
diff --git a/include/plssvm/backends/Kokkos/execution_space_type_traits.hpp b/include/plssvm/backends/Kokkos/execution_space_type_traits.hpp
new file mode 100644
index 000000000..aa5e31751
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/execution_space_type_traits.hpp
@@ -0,0 +1,238 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Execution space type traits for the ExecutionSpaces in Kokkos.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_TYPE_TRAITS_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_TYPE_TRAITS_HPP_
+#pragma once
+
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
+
+#include "Kokkos_Core.hpp"  // Kokkos macros, Kokkos ExecutionSpace types
+
+namespace plssvm::kokkos {
+
+//***************************************************//
+//           execution_space_to_kokkos_type          //
+//***************************************************//
+
+/**
+ * @brief Uninstantiated base type to convert an `execution_space` enum value to a Kokkos::ExecutionSpace type.
+ */
+template <execution_space>
+struct execution_space_to_kokkos_type;
+
+#if defined(KOKKOS_ENABLE_CUDA)
+/**
+ * @brief Convert an `execution_space::cuda` enum value to a `Kokkos::Cuda` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::cuda> {
+    using type = Kokkos::Cuda;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+/**
+ * @brief Convert an `execution_space::hip` enum value to a `Kokkos::HIP` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::hip> {
+    using type = Kokkos::HIP;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_SYCL)
+/**
+ * @brief Convert an `execution_space::sycl` enum value to a `Kokkos::SYCL` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::sycl> {
+    using type = Kokkos::SYCL;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_HPX)
+/**
+ * @brief Convert an `execution_space::hpx` enum value to a `Kokkos::Experimental::HPX` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::hpx> {
+    using type = Kokkos::Experimental::HPX;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
+/**
+ * @brief Convert an `execution_space::openmp` enum value to a `Kokkos::OpenMP` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::openmp> {
+    using type = Kokkos::OpenMP;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+/**
+ * @brief Convert an `execution_space::openmp_target` enum value to a `Kokkos::Experimental::OpenMPTarget` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::openmp_target> {
+    using type = Kokkos::Experimental::OpenMPTarget;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENACC)
+/**
+ * @brief Convert an `execution_space::openacc` enum value to a `Kokkos::Experimental::OpenACC` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::openacc> {
+    using type = Kokkos::Experimental::OpenACC;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+/**
+ * @brief Convert an `execution_space::threads` enum value to a `Kokkos::Threads` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::threads> {
+    using type = Kokkos::Threads;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_SERIAL)
+/**
+ * @brief Convert an `execution_space::serial` enum value to a `Kokkos::Serial` Kokkos::ExecutionSpace type.
+ */
+template <>
+struct execution_space_to_kokkos_type<execution_space::serial> {
+    using type = Kokkos::Serial;
+};
+#endif
+
+/**
+ * @brief Convert the `execution_space` @p space to the corresponding Kokkos::ExecutionSpace type.
+ * @tparam space the enum value to convert
+ */
+template <execution_space space>
+using execution_space_to_kokkos_type_t = typename execution_space_to_kokkos_type<space>::type;
+
+//***************************************************//
+//           kokkos_type_to_execution_space          //
+//***************************************************//
+
+/**
+ * @brief Uninstantiated base type to convert a Kokkos::ExecutionSpace type to a `execution_space` enum value.
+ */
+template <typename>
+struct kokkos_type_to_execution_space;
+
+#if defined(KOKKOS_ENABLE_CUDA)
+/**
+ * @brief Convert a `Kokkos::Cuda` Kokkos::ExecutionSpace type to an `execution_space::cuda` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Cuda> {
+    constexpr static execution_space value = execution_space::cuda;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+/**
+ * @brief Convert a `Kokkos::HIP` Kokkos::ExecutionSpace type to an `execution_space::hip` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::HIP> {
+    constexpr static execution_space value = execution_space::hip;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_SYCL)
+/**
+ * @brief Convert a `Kokkos::SYCL` Kokkos::ExecutionSpace type to an `execution_space::sycl` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::SYCL> {
+    constexpr static execution_space value = execution_space::sycl;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_HPX)
+/**
+ * @brief Convert a `Kokkos::Experimental::HPX` Kokkos::ExecutionSpace type to an `execution_space::hpx` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Experimental::HPX> {
+    constexpr static execution_space value = execution_space::hpx;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
+/**
+ * @brief Convert a `Kokkos::OpenMP` Kokkos::ExecutionSpace type to an `execution_space::openmp` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::OpenMP> {
+    constexpr static execution_space value = execution_space::openmp;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+/**
+ * @brief Convert a `Kokkos::Experimental::OpenMPTarget` Kokkos::ExecutionSpace type to an `execution_space::openmp_target` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Experimental::OpenMPTarget> {
+    constexpr static execution_space value = execution_space::openmp_target;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENACC)
+/**
+ * @brief Convert a `Kokkos::Experimental::OpenACC` Kokkos::ExecutionSpace type to an `execution_space::openacc` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Experimental::OpenACC> {
+    constexpr static execution_space value = execution_space::openacc;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+/**
+ * @brief Convert a `Kokkos::Threads` Kokkos::ExecutionSpace type to an `execution_space::threads` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Threads> {
+    constexpr static execution_space value = execution_space::threads;
+};
+#endif
+
+#if defined(KOKKOS_ENABLE_SERIAL)
+/**
+ * @brief Convert a `Kokkos::Serial` Kokkos::ExecutionSpace type to an `execution_space::serial` enum value.
+ */
+template <>
+struct kokkos_type_to_execution_space<Kokkos::Serial> {
+    constexpr static execution_space value = execution_space::serial;
+};
+#endif
+
+/**
+ * @brief Convert the Kokkos::ExecutionSpace type @p ExecutionSpace to the corresponding `execution_space` enum value.
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace type to convert
+ */
+template <typename ExecutionSpace>
+inline constexpr execution_space kokkos_type_to_execution_space_v = kokkos_type_to_execution_space<ExecutionSpace>::value;
+
+}  // namespace plssvm::kokkos
+
+#endif  // PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_TYPE_TRAITS_HPP_
diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp
index 6179c496d..2e0c08a01 100644
--- a/src/plssvm/backends/Kokkos/execution_space.cpp
+++ b/src/plssvm/backends/Kokkos/execution_space.cpp
@@ -8,7 +8,8 @@
 
 #include "plssvm/backends/Kokkos/execution_space.hpp"
 
-#include "plssvm/detail/string_utility.hpp"  // plssvm::detail::to_lower_case
+#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp"  // plssvm::kokkos::detail::constexpr_available_execution_spaces
+#include "plssvm/detail/string_utility.hpp"                                        // plssvm::detail::to_lower_case
 
 #include <array>    // std::array
 #include <ios>      // std::ios::failbit
diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index 142f72a37..34ce3881f 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -9,6 +9,7 @@ set(PLSSVM_KOKKOS_TEST_NAME Kokkos_tests)
 
 # list all necessary sources
 set(PLSSVM_KOKKOS_TEST_SOURCES
+    ${CMAKE_CURRENT_LIST_DIR}/detail/constexpr_available_execution_spaces.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_view_wrapper.cpp
     ${CMAKE_CURRENT_LIST_DIR}/detail/device_wrapper.cpp
@@ -18,6 +19,7 @@ set(PLSSVM_KOKKOS_TEST_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/kokkos_csvm.cpp
     ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp
     ${CMAKE_CURRENT_LIST_DIR}/execution_space.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/execution_space_type_traits.cpp
 )
 
 find_package(Kokkos REQUIRED)
diff --git a/tests/backends/Kokkos/detail/constexpr_available_execution_spaces.cpp b/tests/backends/Kokkos/detail/constexpr_available_execution_spaces.cpp
new file mode 100644
index 000000000..2e8f064e7
--- /dev/null
+++ b/tests/backends/Kokkos/detail/constexpr_available_execution_spaces.cpp
@@ -0,0 +1,18 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for the Kokkos `constexpr_available_execution_spaces()` function.
+ */
+
+#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp"
+
+#include "gtest/gtest.h"  // TEST, EXPECT_TRUE, EXPECT_FALSE
+
+TEST(KokkosConstexprAvailableExecutionSpaces, constexpr_available_execution_spaces) {
+    // at least one execution space must always be available
+    EXPECT_FALSE(plssvm::kokkos::detail::constexpr_available_execution_spaces().empty());
+}
diff --git a/tests/backends/Kokkos/execution_space.cpp b/tests/backends/Kokkos/execution_space.cpp
index 24c3135de..679ccb240 100644
--- a/tests/backends/Kokkos/execution_space.cpp
+++ b/tests/backends/Kokkos/execution_space.cpp
@@ -12,8 +12,7 @@
 
 #include "tests/custom_test_macros.hpp"  // EXPECT_CONVERSION_TO_STRING, EXPECT_CONVERSION_FROM_STRING
 
-#include "gmock/gmock.h"  // EXPECT_THAT; ::testing::AnyOf
-#include "gtest/gtest.h"  // TEST, EXPECT_TRUE
+#include "gtest/gtest.h"  // TEST, EXPECT_TRUE, EXPECT_FALSE
 
 #include <sstream>  // std::istringstream
 
@@ -68,73 +67,6 @@ TEST(KokkosExecutionSpace, from_string_unknown) {
     EXPECT_TRUE(input.fail());
 }
 
-TEST(KokkosExecutionSpace, execution_space_to_kokkos_type) {
-    // check conversions
-#if defined(KOKKOS_ENABLE_CUDA)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::cuda>, Kokkos::Cuda>();
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::hip>, Kokkos::HIP>();
-#endif
-#if defined(KOKKOS_ENABLE_SYCL)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::sycl>, Kokkos::SYCL>();
-#endif
-#if defined(KOKKOS_ENABLE_HPX)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::hpx>, Kokkos::Experimental::HPX>();
-#endif
-#if defined(KOKKOS_ENABLE_OPENMP)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp>, Kokkos::OpenMP>();
-#endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp_target>, Kokkos::Experimental::OpenMPTarget>();
-#endif
-#if defined(KOKKOS_ENABLE_OPENACC)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openacc>, Kokkos::OpenACC>();
-#endif
-#if defined(KOKKOS_ENABLE_THREADS)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::threads>, Kokkos::Threads>();
-#endif
-#if defined(KOKKOS_ENABLE_SERIAL)
-    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::serial>, Kokkos::Serial>();
-#endif
-}
-
-TEST(KokkosExecutionSpace, kokkos_type_to_execution_space) {
-    // check conversions
-#if defined(KOKKOS_ENABLE_CUDA)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Cuda>, plssvm::kokkos::execution_space::cuda);
-#endif
-#if defined(KOKKOS_ENABLE_HIP)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::HIP>, plssvm::kokkos::execution_space::hip);
-#endif
-#if defined(KOKKOS_ENABLE_SYCL)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::SYCL>, plssvm::kokkos::execution_space::sycl);
-#endif
-#if defined(KOKKOS_ENABLE_HPX)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::HPX>, plssvm::kokkos::execution_space::hpx);
-#endif
-#if defined(KOKKOS_ENABLE_OPENMP)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::OpenMP>, plssvm::kokkos::execution_space::openmp);
-#endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::OpenMPTarget>, plssvm::kokkos::execution_space::openmp_target);
-#endif
-#if defined(KOKKOS_ENABLE_OPENACC)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::OpenACC>, plssvm::kokkos::execution_space::openacc);
-#endif
-#if defined(KOKKOS_ENABLE_THREADS)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Threads>, plssvm::kokkos::execution_space::threads);
-#endif
-#if defined(KOKKOS_ENABLE_SERIAL)
-    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Serial>, plssvm::kokkos::execution_space::serial);
-#endif
-}
-
-TEST(KokkosExecutionSpace, constexpr_available_execution_spaces) {
-    // at least one execution space must always be available
-    EXPECT_FALSE(plssvm::kokkos::detail::constexpr_available_execution_spaces().empty());
-}
-
 TEST(KokkosExecutionSpace, list_available_execution_spaces) {
     // at least one execution space must always be available
     EXPECT_FALSE(plssvm::kokkos::list_available_execution_spaces().empty());
diff --git a/tests/backends/Kokkos/execution_space_type_traits.cpp b/tests/backends/Kokkos/execution_space_type_traits.cpp
new file mode 100644
index 000000000..f813fa836
--- /dev/null
+++ b/tests/backends/Kokkos/execution_space_type_traits.cpp
@@ -0,0 +1,75 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Tests for functions related to the different Kokkos execution spaces.
+ */
+
+#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp"
+
+#include "gtest/gtest.h"  // TEST, EXPECT_EQ, ::testing::StaticAssertTypeEq
+
+TEST(KokkosExecutionSpaceTypeTraits, execution_space_to_kokkos_type) {
+    // check conversions
+#if defined(KOKKOS_ENABLE_CUDA)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::cuda>, Kokkos::Cuda>();
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::hip>, Kokkos::HIP>();
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::sycl>, Kokkos::SYCL>();
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::hpx>, Kokkos::Experimental::HPX>();
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp>, Kokkos::OpenMP>();
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openmp_target>, Kokkos::Experimental::OpenMPTarget>();
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::openacc>, Kokkos::Experimental::OpenACC>();
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::threads>, Kokkos::Threads>();
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+    ::testing::StaticAssertTypeEq<plssvm::kokkos::execution_space_to_kokkos_type_t<plssvm::kokkos::execution_space::serial>, Kokkos::Serial>();
+#endif
+}
+
+TEST(KokkosExecutionSpaceTypeTraits, kokkos_type_to_execution_space) {
+    // check conversions
+#if defined(KOKKOS_ENABLE_CUDA)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Cuda>, plssvm::kokkos::execution_space::cuda);
+#endif
+#if defined(KOKKOS_ENABLE_HIP)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::HIP>, plssvm::kokkos::execution_space::hip);
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::SYCL>, plssvm::kokkos::execution_space::sycl);
+#endif
+#if defined(KOKKOS_ENABLE_HPX)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::HPX>, plssvm::kokkos::execution_space::hpx);
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::OpenMP>, plssvm::kokkos::execution_space::openmp);
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::OpenMPTarget>, plssvm::kokkos::execution_space::openmp_target);
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Experimental::OpenACC>, plssvm::kokkos::execution_space::openacc);
+#endif
+#if defined(KOKKOS_ENABLE_THREADS)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Threads>, plssvm::kokkos::execution_space::threads);
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL)
+    EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v<Kokkos::Serial>, plssvm::kokkos::execution_space::serial);
+#endif
+}

From 29feaeadd5034249447ca0940f4dfcdd2c28394f Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 14:43:07 +0100
Subject: [PATCH 065/123] Add new automatic execution space.

---
 .../constexpr_available_execution_spaces.hpp       |  1 +
 include/plssvm/backends/Kokkos/execution_space.hpp |  2 ++
 src/plssvm/backends/Kokkos/csvm.cpp                | 12 ++++++++++++
 .../backends/Kokkos/detail/device_wrapper.cpp      |  9 +++++++--
 src/plssvm/backends/Kokkos/detail/utility.cpp      |  5 +++++
 src/plssvm/backends/Kokkos/execution_space.cpp     | 12 ++++++++++--
 tests/backends/Kokkos/execution_space.cpp          | 14 +++++++++++---
 7 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
index 5d964f66a..ea5dafb02 100644
--- a/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
+++ b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
@@ -27,6 +27,7 @@ namespace plssvm::kokkos::detail {
  * @return a `std::array` containing all available execution spaces (`[[nodiscard]]`)
  */
 [[nodiscard]] inline constexpr auto constexpr_available_execution_spaces() noexcept {
+    // Note: The execution_space::automatic value may NEVER be added here!
     // Note: the trailing comma is explicitly allowed by the standard
     // Note: the order is intentionally chosen this way -> the order of the entries determines the priority when using a backend to run our code
     return std::array{
diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
index d77ae845b..cc9114412 100644
--- a/include/plssvm/backends/Kokkos/execution_space.hpp
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -25,6 +25,8 @@ namespace plssvm::kokkos {
  * @brief Enum class for all execution spaces supported by [Kokkos](https://github.com/kokkos/kokkos).
  */
 enum class execution_space {
+    /** Automatically determine the used Kokkos execution space. Note: this does not necessarily correspond to Kokkos::DefaultExecutionSpace! */
+    automatic,
     /** Execution space representing execution on a CUDA device. */
     cuda,
     /** Execution space representing execution on a device supported by HIP. */
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 28823fd83..58e4e24fe 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -177,8 +177,12 @@ csvm::~csvm() {
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
+    PLSSVM_ASSERT(space_ != execution_space::automatic, "The automatic execution_space may not be provided to this function!");
+
     std::vector<::plssvm::detail::memory_size> device_memory(this->num_available_devices());
     switch (space_) {
+        case execution_space::automatic:
+            throw backend_exception{ "Unsupported execution_space::automatic provided!" };
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
                 for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) {
@@ -217,8 +221,12 @@ std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const {
 }
 
 std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const {
+    PLSSVM_ASSERT(space_ != execution_space::automatic, "The automatic execution_space may not be provided to this function!");
+
     std::vector<::plssvm::detail::memory_size> max_mem_alloc_size(this->num_available_devices());
     switch (space_) {
+        case execution_space::automatic:
+            throw backend_exception{ "Unsupported execution_space::automatic provided!" };
         case execution_space::cuda:
         case execution_space::hip:
             max_mem_alloc_size = this->get_device_memory();
@@ -246,6 +254,7 @@ std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const
 
 std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
     PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
+    PLSSVM_ASSERT(space_ != execution_space::automatic, "The automatic execution_space may not be provided to this function!");
 
     // NOTE: the maximum theoretical work-group size, may be additionally limited by the amount of used scratch memory
     return devices_[device_id].execute_and_return([](const auto &device) {
@@ -264,10 +273,13 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const {
 
 ::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::size_t device_id) const {
     PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id);
+    PLSSVM_ASSERT(space_ != execution_space::automatic, "The automatic execution_space may not be provided to this function!");
 
     // NOTE: Kokkos only supports one-dimensional execution ranges!
     // NOTE: we only use two-dimensional kernels!
     switch (space_) {
+        case execution_space::automatic:
+            throw backend_exception{ "Unsupported execution_space::automatic provided!" };
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(([&]() -> ::plssvm::detail::dim_type {
                 const cudaDeviceProp &prop = devices_[device_id].get<execution_space::cuda>().cuda_device_prop();
diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
index bfd79d9d2..35dd6c2e9 100644
--- a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
+++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp
@@ -11,6 +11,7 @@
 #include "plssvm/backends/Kokkos/detail/conditional_execution.hpp"  // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_*
 #include "plssvm/backends/Kokkos/exceptions.hpp"                    // plssvm::kokkos::backend_exception
 #include "plssvm/backends/Kokkos/execution_space.hpp"               // plssvm::kokkos::execution_space
+#include "plssvm/detail/assert.hpp"                                 // PLSSVM_ASSERT
 #include "plssvm/detail/logging_without_performance_tracking.hpp"   // plssvm::detail::log_untracked
 #include "plssvm/detail/string_utility.hpp"                         // plssvm::detail::as_lower_case
 #include "plssvm/detail/utility.hpp"                                // plssvm::detail::contains
@@ -19,6 +20,8 @@
 
 #include "Kokkos_Core.hpp"  // Kokkos::num_devices, Kokkos::ExecutionSpace
 
+#include <vector>  // std::vector
+
 #if defined(KOKKOS_ENABLE_CUDA)
     #define PLSSVM_CUDA_ERROR_CHECK(err)                                                                                                            \
         if ((err) != cudaSuccess) {                                                                                                                 \
@@ -33,13 +36,15 @@
         }
 #endif
 
-#include <vector>  // std::vector
-
 namespace plssvm::kokkos::detail {
 
 std::vector<device_wrapper> get_device_list(const execution_space space, [[maybe_unused]] const target_platform target) {
+    PLSSVM_ASSERT(space != execution_space::automatic, "The automatic execution_space may not be provided to this function!");
+
     std::vector<device_wrapper> devices{};
     switch (space) {
+        case execution_space::automatic:
+            throw backend_exception{ "Unsupported execution_space::automatic provided!" };
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() {
                 for (int device = 0; device < Kokkos::num_devices(); ++device) {
diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp
index 45392e509..5dc3f8cda 100644
--- a/src/plssvm/backends/Kokkos/detail/utility.cpp
+++ b/src/plssvm/backends/Kokkos/detail/utility.cpp
@@ -39,6 +39,9 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
     // iterate over all available execution spaces
     for (const execution_space space : list_available_execution_spaces()) {
         switch (space) {
+            case execution_space::automatic:
+                // nothing to do here
+                break;
             case execution_space::cuda:
                 // NVIDIA GPUs only
                 available_map[target_platform::gpu_nvidia].push_back(execution_space::cuda);
@@ -121,6 +124,8 @@ std::map<target_platform, std::vector<execution_space>> available_target_platfor
 
 std::string get_device_name([[maybe_unused]] const device_wrapper &dev) {
     switch (dev.get_execution_space()) {
+        case execution_space::automatic:
+            throw backend_exception{ "Unsupported execution_space::automatic provided!" };
         case execution_space::cuda:
             PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA([&]() {
                 return std::string{ dev.get<execution_space::cuda>().cuda_device_prop().name };
diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp
index 2e0c08a01..0caae212f 100644
--- a/src/plssvm/backends/Kokkos/execution_space.cpp
+++ b/src/plssvm/backends/Kokkos/execution_space.cpp
@@ -22,6 +22,8 @@ namespace plssvm::kokkos {
 
 std::ostream &operator<<(std::ostream &out, const execution_space space) {
     switch (space) {
+        case execution_space::automatic:
+            return out << "automatic";
         case execution_space::cuda:
             return out << "Cuda";
         case execution_space::hip:
@@ -49,7 +51,9 @@ std::istream &operator>>(std::istream &in, execution_space &space) {
     in >> str;
     ::plssvm::detail::to_lower_case(str);
 
-    if (str == "cuda") {
+    if (str == "automatic" || str == "auto") {
+        space = execution_space::automatic;
+    } else if (str == "cuda") {
         space = execution_space::cuda;
     } else if (str == "hip") {
         space = execution_space::hip;
@@ -74,8 +78,12 @@ std::istream &operator>>(std::istream &in, execution_space &space) {
 }
 
 std::vector<execution_space> list_available_execution_spaces() {
+    // always add the automatic execution space
+    std::vector<execution_space> spaces{ execution_space::automatic };
+    // add all other available execution spaces
     constexpr auto arr = detail::constexpr_available_execution_spaces();
-    return std::vector<execution_space>(arr.cbegin(), arr.cend());
+    spaces.insert(spaces.cend(), arr.begin(), arr.end());
+    return spaces;
 }
 
 }  // namespace plssvm::kokkos
diff --git a/tests/backends/Kokkos/execution_space.cpp b/tests/backends/Kokkos/execution_space.cpp
index 679ccb240..3e54f3be5 100644
--- a/tests/backends/Kokkos/execution_space.cpp
+++ b/tests/backends/Kokkos/execution_space.cpp
@@ -19,6 +19,7 @@
 // check whether the plssvm::kokkos::execution_space -> std::string conversions are correct
 TEST(KokkosExecutionSpace, to_string) {
     // check conversions to std::string
+    EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::automatic, "automatic");
     EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::cuda, "Cuda");
     EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::hip, "HIP");
     EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::sycl, "SYCL");
@@ -32,12 +33,14 @@ TEST(KokkosExecutionSpace, to_string) {
 
 TEST(KokkosExecutionSpace, to_string_unknown) {
     // check conversions to std::string from unknown execution_space
-    EXPECT_CONVERSION_TO_STRING(static_cast<plssvm::kokkos::execution_space>(9), "unknown");
+    EXPECT_CONVERSION_TO_STRING(static_cast<plssvm::kokkos::execution_space>(10), "unknown");
 }
 
 // check whether the std::string -> plssvm::kokkos::execution_space conversions are correct
 TEST(KokkosExecutionSpace, from_string) {
     // check conversion from std::string
+    EXPECT_CONVERSION_FROM_STRING("Automatic", plssvm::kokkos::execution_space::automatic);
+    EXPECT_CONVERSION_FROM_STRING("AUTO", plssvm::kokkos::execution_space::automatic);
     EXPECT_CONVERSION_FROM_STRING("Cuda", plssvm::kokkos::execution_space::cuda);
     EXPECT_CONVERSION_FROM_STRING("CUDA", plssvm::kokkos::execution_space::cuda);
     EXPECT_CONVERSION_FROM_STRING("Hip", plssvm::kokkos::execution_space::hip);
@@ -68,6 +71,11 @@ TEST(KokkosExecutionSpace, from_string_unknown) {
 }
 
 TEST(KokkosExecutionSpace, list_available_execution_spaces) {
-    // at least one execution space must always be available
-    EXPECT_FALSE(plssvm::kokkos::list_available_execution_spaces().empty());
+    const std::vector<plssvm::kokkos::execution_space> execution_spaces = plssvm::kokkos::list_available_execution_spaces();
+
+    // at least one must be available (automatic)!
+    EXPECT_GE(execution_spaces.size(), 1);
+
+    // the automatic execution space must always be present
+    EXPECT_THAT(execution_spaces, ::testing::Contains(plssvm::kokkos::execution_space::automatic));
 }

From d7f54232467708de53638b63114857fe24a4350c Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:22:15 +0100
Subject: [PATCH 066/123] Add the possibility to explicitly specify the Kokkos
 execution space on the command line and using a named parameter.

---
 include/plssvm/backends/Kokkos/csvm.hpp      | 15 ++++-
 include/plssvm/detail/cmd/parser_predict.hpp |  4 ++
 include/plssvm/detail/cmd/parser_train.hpp   |  4 ++
 include/plssvm/parameter.hpp                 | 11 +++-
 src/main_predict.cpp                         | 16 +++--
 src/main_train.cpp                           | 14 +++--
 src/plssvm/backends/Kokkos/csvm.cpp          | 64 +++++++++++++++-----
 src/plssvm/detail/cmd/parser_predict.cpp     | 52 +++++++++++-----
 src/plssvm/detail/cmd/parser_train.cpp       | 62 +++++++++++++------
 9 files changed, 182 insertions(+), 60 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/csvm.hpp b/include/plssvm/backends/Kokkos/csvm.hpp
index d8dcfaab8..2ff662933 100644
--- a/include/plssvm/backends/Kokkos/csvm.hpp
+++ b/include/plssvm/backends/Kokkos/csvm.hpp
@@ -21,11 +21,14 @@
 #include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
 #include "plssvm/constants.hpp"                              // plssvm::real_type
 #include "plssvm/csvm.hpp"                                   // plssvm::detail::csvm_backend_exists
+#include "plssvm/detail/igor_utility.hpp"                    // plssvm::detail::get_value_from_named_parameter
 #include "plssvm/detail/memory_size.hpp"                     // plssvm::detail::memory_size
 #include "plssvm/detail/type_traits.hpp"                     // PLSSVM_REQUIRES
 #include "plssvm/parameter.hpp"                              // plssvm::parameter, plssvm::detail::parameter
 #include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
 
+#include "igor/igor.hpp"  // igor::parser
+
 #include <cstddef>      // std::size_t
 #include <type_traits>  // std::true_type
 #include <utility>      // std::forward
@@ -77,7 +80,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, detail::devic
      * @throws plssvm::kokkos::backend_exception if the requested target is not available
      * @throws plssvm::kokkos::backend_exception if no device for the requested target was found
      */
-    template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_parameter_named_args_v<Args...>)>
+    template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_kokkos_parameter_named_args_v<Args...>)>
     explicit csvm(Args &&...named_args) :
         csvm{ plssvm::target_platform::automatic, std::forward<Args>(named_args)... } { }
 
@@ -89,9 +92,17 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, detail::devic
      * @throws plssvm::kokkos::backend_exception if the requested target is not available
      * @throws plssvm::kokkos::backend_exception if no device for the requested target was found
      */
-    template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_parameter_named_args_v<Args...>)>
+    template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_kokkos_parameter_named_args_v<Args...>)>
     explicit csvm(const target_platform target, Args &&...named_args) :
         base_type{ std::forward<Args>(named_args)... } {
+        // check igor parameter
+        igor::parser parser{ std::forward<Args>(named_args)... };
+
+        // check whether a specific Kokkos execution space has been requested
+        if constexpr (parser.has(kokkos_execution_space)) {
+            // compile time check: the value must have the correct type
+            space_ = ::plssvm::detail::get_value_from_named_parameter<kokkos::execution_space>(parser, kokkos_execution_space);
+        }
         this->init(target);
     }
 
diff --git a/include/plssvm/detail/cmd/parser_predict.hpp b/include/plssvm/detail/cmd/parser_predict.hpp
index 1fc364c7e..4da63c508 100644
--- a/include/plssvm/detail/cmd/parser_predict.hpp
+++ b/include/plssvm/detail/cmd/parser_predict.hpp
@@ -14,6 +14,7 @@
 #pragma once
 
 #include "plssvm/backend_types.hpp"                       // plssvm::backend_type
+#include "plssvm/backends/Kokkos/execution_space.hpp"     // plssvm::kokkos::execution_space
 #include "plssvm/backends/SYCL/implementation_types.hpp"  // plssvm::sycl::implementation_type
 #include "plssvm/target_platforms.hpp"                    // plssvm::target_platform
 
@@ -45,6 +46,9 @@ struct parser_predict {
     /// The SYCL implementation to use with `--backend sycl`.
     sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic };
 
+    /// The Kokkos execution space to use with --backend=kokkos.
+    kokkos::execution_space kokkos_execution_space{ kokkos::execution_space::automatic };
+
     /// `true` if `std::string` should be used as label type instead of the default type `ìnt`.
     bool strings_as_labels{ false };
 
diff --git a/include/plssvm/detail/cmd/parser_train.hpp b/include/plssvm/detail/cmd/parser_train.hpp
index c448a1300..1fea29e57 100644
--- a/include/plssvm/detail/cmd/parser_train.hpp
+++ b/include/plssvm/detail/cmd/parser_train.hpp
@@ -14,6 +14,7 @@
 #pragma once
 
 #include "plssvm/backend_types.hpp"                          // plssvm::backend_type
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
 #include "plssvm/backends/SYCL/implementation_types.hpp"     // plssvm::sycl::implementation_type
 #include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
 #include "plssvm/classification_types.hpp"                   // plssvm::classification_type
@@ -65,6 +66,9 @@ struct parser_train {
     /// The SYCL implementation to use with --backend=sycl.
     sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic };
 
+    /// The Kokkos execution space to use with --backend=kokkos.
+    kokkos::execution_space kokkos_execution_space{ kokkos::execution_space::automatic };
+
     /// `true` if `std::string` should be used as label type instead of the default type `ìnt`.
     bool strings_as_labels{ false };
 
diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp
index 4e51b90d7..1f229e98a 100644
--- a/include/plssvm/parameter.hpp
+++ b/include/plssvm/parameter.hpp
@@ -56,6 +56,8 @@ IGOR_MAKE_NAMED_ARGUMENT(classification);
 IGOR_MAKE_NAMED_ARGUMENT(sycl_implementation_type);
 /// Create a named argument for the SYCL backend specific kernel invocation type.
 IGOR_MAKE_NAMED_ARGUMENT(sycl_kernel_invocation_type);
+/// Create a named argument for the Kokkos backend specific execution space.
+IGOR_MAKE_NAMED_ARGUMENT(kokkos_execution_space);
 
 /// @endcond
 
@@ -73,6 +75,13 @@ constexpr bool has_only_parameter_named_args_v = !igor::has_other_than<Args...>(
 template <typename... Args>
 constexpr bool has_only_sycl_parameter_named_args_v = !igor::has_other_than<Args...>(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type);
 
+/**
+ * @brief Trait to check whether @p Args only contains named-parameter that can be used to initialize a `plssvm::parameter` struct including Kokkos specific named-parameters.
+ */
+template <typename... Args>
+constexpr bool has_only_kokkos_parameter_named_args_v = !igor::has_other_than<Args...>(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::kokkos_execution_space);
+
+
 }  // namespace detail
 
 /**
@@ -185,7 +194,7 @@ struct parameter {
         // compile time check: each named parameter must only be passed once
         static_assert(!parser.has_duplicates(), "Can only use each named parameter once!");
         // compile time check: only some named parameters are allowed
-        static_assert(!parser.has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type),
+        static_assert(!parser.has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type, plssvm::kokkos_execution_space),
                       "An illegal named parameter has been passed!");
 
         // shorthand function for emitting a warning if a provided parameter is not used by the current kernel function
diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index bc83ffcfa..58015d928 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -37,8 +37,8 @@
 #include <fstream>     // std::ofstream
 #include <functional>  // std::mem_fn
 #include <iostream>    // std::cerr, std::endl
+#include <memory>      // std::unique_ptr, std::make_unique
 #include <utility>     // std::pair
-#include <memory>  // std::unique_ptr, std::make_unique
 #include <variant>     // std::visit
 #include <vector>      // std::vector
 
@@ -82,11 +82,10 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
-
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
             const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             // initialize Kokkos if necessary
             if (use_kokkos_as_backend) {
                 kokkos_guard = std::make_unique<Kokkos::ScopeGuard>(argc, argv);
@@ -95,8 +94,15 @@ int main(int argc, char *argv[]) {
 #endif
 
             // create default csvm
-            const std::unique_ptr<plssvm::csvm> svm = use_sycl_as_backend ? plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type)
-                                                                          : plssvm::make_csvm(cmd_parser.backend, cmd_parser.target);
+            const std::unique_ptr<plssvm::csvm> svm = [&]() {
+                if (use_sycl_as_backend) {
+                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type);
+                } else if (use_kokkos_as_backend) {
+                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space);
+                } else {
+                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target);
+                }
+            }();
 
             // create model
             const plssvm::model<label_type> model{ cmd_parser.model_filename };
diff --git a/src/main_train.cpp b/src/main_train.cpp
index 14cf8941b..93cb2abe8 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -78,11 +78,10 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
-
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
             const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             // initialize Kokkos if necessary
             if (use_kokkos_as_backend) {
                 kokkos_guard = std::make_unique<Kokkos::ScopeGuard>(argc, argv);
@@ -91,8 +90,15 @@ int main(int argc, char *argv[]) {
 #endif
 
             // create SVM
-            const std::unique_ptr<plssvm::csvm> svm = use_sycl_as_backend ? plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type)
-                                                                          : plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params);
+            const std::unique_ptr<plssvm::csvm> svm = [&]() {
+                if (use_sycl_as_backend) {
+                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type);
+                } else if (use_kokkos_as_backend) {
+                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space);
+                } else {
+                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params);
+                }
+            }();
 
             // only specify plssvm::max_iter if it isn't its default value
             const plssvm::model<label_type> model =
diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 58e4e24fe..2bcefeeff 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -93,38 +93,72 @@ void csvm::init(const target_platform target) {
             break;
     }
 
+    // check whether the requested execution space is available
+    if (!::plssvm::detail::contains(list_available_execution_spaces(), space_)) {
+        throw backend_exception{ fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space_, fmt::join(list_available_execution_spaces(), ", ")) };
+    }
+
     // get all available target_platform <-> Kokkos::ExecutionSpace combinations
     const std::map<target_platform, std::vector<execution_space>> available_combinations = detail::available_target_platform_to_execution_space_mapping();
 
-    if (target == target_platform::automatic) {
-        // go through all combinations and choose the first execution space in order: gpu_nvidia -> gpu_amd -> gpu_intel -> cpu
-        for (const target_platform target_order : { target_platform::gpu_nvidia, target_platform::gpu_amd, target_platform::gpu_intel, target_platform::cpu }) {
-            if (::plssvm::detail::contains(available_combinations, target_order)) {
+    // check whether the provided execution space is the automatic one
+    if (space_ == execution_space::automatic) {
+        // automatically determine the execution space and potentially automatically determine the target platform
+        if (target == target_platform::automatic) {
+            // go through all combinations and choose the first execution space in order: gpu_nvidia -> gpu_amd -> gpu_intel -> cpu
+            for (const target_platform target_order : list_available_target_platforms()) {
+                if (::plssvm::detail::contains(available_combinations, target_order)) {
+                    // the target platform is supported -> choose the first execution space to use in the Kokkos backend
+                    space_ = available_combinations.at(target_order).front();
+                    target_ = target_order;
+                    break;
+                }
+            }
+        } else {
+            // check whether the provided target platform is compatible with the currently available Kokkos::ExecutionSpaces
+            if (::plssvm::detail::contains(available_combinations, target)) {
                 // the target platform is supported -> choose the first execution space to use in the Kokkos backend
-                space_ = available_combinations.at(target_order).front();
-                target_ = target_order;
-                break;
+                space_ = available_combinations.at(target).front();
+                target_ = target;
+            } else {
+                // the provided target platform is unsupported -> throw an exception
+                throw backend_exception{ fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform {}!", fmt::join(list_available_execution_spaces(), ", "), target) };
             }
         }
+
+        // output what we use as automatic Kokkos execution space
+        plssvm::detail::log(verbosity_level::full,
+                            "\nUsing {} as automatic Kokkos::ExecutionSpace.",
+                            space_);
     } else {
-        // check whether the provided target platform is compatible with the currently available Kokkos::ExecutionSpaces
-        if (::plssvm::detail::contains(available_combinations, target)) {
-            // the target platform is supported -> choose the first execution space to use in the Kokkos backend
-            space_ = available_combinations.at(target).front();
-            target_ = target;
+        // execution space explicitly provided and potentially automatically determine the target platform
+        if (target == target_platform::automatic) {
+            // go through all combinations (gpu_nvidia -> gpu_amd -> gpu_intel -> cpu) and check whether the requested execution space supports that target platform
+            for (const target_platform target_order : list_available_target_platforms()) {
+                if (::plssvm::detail::contains(available_combinations, target_order) && ::plssvm::detail::contains(available_combinations.at(target_order), space_)) {
+                    // the provided execution space supports the target platform
+                    target_ = target_order;
+                    break;
+                }
+            }
         } else {
-            // the provided target platform is unsupported -> throw an exception
-            throw backend_exception{ fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform {}!", fmt::join(list_available_execution_spaces(), ", "), target) };
+            if (!::plssvm::detail::contains(available_combinations, target) || !::plssvm::detail::contains(available_combinations.at(target), space_)) {
+                // the provided execution space and target platform combination is unsupported
+                throw backend_exception{ fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform {}!", space_, target) };
+            }
         }
     }
 
+    // At this point, space_ may NEVER be execution_space::automatic!
+    PLSSVM_ASSERT(space_ != execution_space::automatic, "At this point, the Kokkos execution space must be determined and must NOT be automatic!");
+
     // Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC currently not supported!
     if (space_ == execution_space::openmp_target || space_ == execution_space::openacc) {
         throw backend_exception{ fmt::format("The Kokkos execution space {} is currently not supported!", space_) };
     }
 
     plssvm::detail::log(verbosity_level::full,
-                        "\nUsing Kokkos ({}) as backend with the Kokkos::ExecutionSpace \"{}\".\n",
+                        "\nUsing Kokkos ({}) as backend with the Kokkos::ExecutionSpace {}.\n",
                         plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_version", detail::get_kokkos_version() },
                         plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_default_execution_space", space_ });
 
diff --git a/src/plssvm/detail/cmd/parser_predict.cpp b/src/plssvm/detail/cmd/parser_predict.cpp
index 88e91bb2c..298c90da0 100644
--- a/src/plssvm/detail/cmd/parser_predict.cpp
+++ b/src/plssvm/detail/cmd/parser_predict.cpp
@@ -9,6 +9,7 @@
 #include "plssvm/detail/cmd/parser_predict.hpp"
 
 #include "plssvm/backend_types.hpp"                                // plssvm::list_available_backends
+#include "plssvm/backends/Kokkos/execution_space.hpp"              // plssvm::kokkos::list_available_execution_spaces
 #include "plssvm/backends/SYCL/implementation_types.hpp"           // plssvm::sycl::list_available_sycl_implementations
 #include "plssvm/constants.hpp"                                    // plssvm::real_type
 #include "plssvm/detail/assert.hpp"                                // PLSSVM_ASSERT
@@ -17,8 +18,8 @@
 #include "plssvm/verbosity_levels.hpp"                             // plssvm::verbosity, plssvm::verbosity_level
 #include "plssvm/version/version.hpp"                              // plssvm::version::detail::get_version_info
 
-#include "cxxopts.hpp"    // cxxopts::{Options, value, ParseResult}
-#include "fmt/color.h"    // fmt::fg, fmt::color::orange
+#include "cxxopts.hpp"   // cxxopts::{Options, value, ParseResult}
+#include "fmt/color.h"   // fmt::fg, fmt::color::orange
 #include "fmt/format.h"  // fmt::format
 #include "fmt/ranges.h"  // fmt::join
 
@@ -51,6 +52,9 @@ parser_predict::parser_predict(int argc, char **argv) {
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
             ("sycl_implementation_type", fmt::format("choose the SYCL implementation to be used in the SYCL backend: {}", fmt::join(sycl::list_available_sycl_implementations(), "|")), cxxopts::value<sycl::implementation_type>()->default_value(fmt::format("{}", sycl_implementation_type)))
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+            ("kokkos_execution_space", fmt::format("choose the Kokkos execution space to be used in the Kokkos backend: {}", fmt::join(kokkos::list_available_execution_spaces(), "|")), cxxopts::value<decltype(kokkos_execution_space)>()->default_value(fmt::format("{}", kokkos_execution_space)))
+#endif
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
            ("performance_tracking", "the output YAML file where the performance tracking results are written to; if not provided, the results are dumped to stderr", cxxopts::value<decltype(performance_tracking_filename)>())
 #endif
@@ -101,18 +105,38 @@ parser_predict::parser_predict(int argc, char **argv) {
     target = result["target_platform"].as<decltype(target)>();
 
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    // parse SYCL implementation used in the SYCL backend
-    sycl_implementation_type = result["sycl_implementation_type"].as<decltype(sycl_implementation_type)>();
-
-    // assembly warning condition
-    const std::vector<plssvm::target_platform> target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target };
-    const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl);
-
-    // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend
-    if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) {
-        detail::log_untracked(verbosity_level::full | verbosity_level::warning,
-                              "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n",
-                              sycl_implementation_type);
+    {
+        // parse SYCL implementation used in the SYCL backend
+        sycl_implementation_type = result["sycl_implementation_type"].as<decltype(sycl_implementation_type)>();
+
+        // assembly warning condition
+        const std::vector<plssvm::target_platform> target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target };
+        const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl);
+
+        // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend
+        if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) {
+            detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n",
+                                  sycl_implementation_type);
+        }
+    }
+#endif
+
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    {
+        // parse execution space when using Kokkos as backend
+        kokkos_execution_space = result["kokkos_execution_space"].as<decltype(kokkos_execution_space)>();
+
+        // assemble warning condition
+        const std::vector<plssvm::target_platform> target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target };
+        const bool kokkos_backend_is_used = backend == backend_type::kokkos || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::kokkos);
+
+        // warn if the kokkos execution space is explicitly set but Kokkos isn't the current (automatic) backend
+        if (!kokkos_backend_is_used && kokkos_execution_space != kokkos::execution_space::automatic) {
+            detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  "WARNING: explicitly set a Kokkos execution space but the current backend isn't Kokkos; ignoring --kokkos_execution_space={}\n",
+                                  kokkos_execution_space);
+        }
     }
 #endif
 
diff --git a/src/plssvm/detail/cmd/parser_train.cpp b/src/plssvm/detail/cmd/parser_train.cpp
index d0cc4cb26..31964a897 100644
--- a/src/plssvm/detail/cmd/parser_train.cpp
+++ b/src/plssvm/detail/cmd/parser_train.cpp
@@ -9,6 +9,7 @@
 #include "plssvm/detail/cmd/parser_train.hpp"
 
 #include "plssvm/backend_types.hpp"                                // plssvm::list_available_backends, plssvm::determine_default_backend
+#include "plssvm/backends/Kokkos/execution_space.hpp"              // plssvm::kokkos::{list_available_execution_spaces, execution_space}
 #include "plssvm/backends/SYCL/implementation_types.hpp"           // plssvm::sycl::{list_available_sycl_implementations, implementation_type}
 #include "plssvm/backends/SYCL/kernel_invocation_types.hpp"        // plssvm::sycl::kernel_invocation_type
 #include "plssvm/classification_types.hpp"                         // plssvm::classification_type, plssvm::classification_type_to_full_string
@@ -77,6 +78,9 @@ parser_train::parser_train(int argc, char **argv) {
            ("sycl_kernel_invocation_type", "choose the kernel invocation type when using SYCL as backend: automatic|nd_range", cxxopts::value<decltype(sycl_kernel_invocation_type)>()->default_value(fmt::format("{}", sycl_kernel_invocation_type)))
            ("sycl_implementation_type", fmt::format("choose the SYCL implementation to be used in the SYCL backend: {}", fmt::join(sycl::list_available_sycl_implementations(), "|")), cxxopts::value<decltype(sycl_implementation_type)>()->default_value(fmt::format("{}", sycl_implementation_type)))
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+            ("kokkos_execution_space", fmt::format("choose the Kokkos execution space to be used in the Kokkos backend: {}", fmt::join(kokkos::list_available_execution_spaces(), "|")), cxxopts::value<decltype(kokkos_execution_space)>()->default_value(fmt::format("{}", kokkos_execution_space)))
+#endif
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
            ("performance_tracking", "the output YAML file where the performance tracking results are written to; if not provided, the results are dumped to stderr", cxxopts::value<decltype(performance_tracking_filename)>())
 #endif
@@ -185,28 +189,48 @@ parser_train::parser_train(int argc, char **argv) {
     solver = result["solver"].as<decltype(solver)>();
 
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    // parse kernel invocation type when using SYCL as backend
-    sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as<decltype(sycl_kernel_invocation_type)>();
-
-    // assembly warning condition
-    const std::vector<plssvm::target_platform> target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target };
-    const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl);
-
-    // warn if kernel invocation type is explicitly set but SYCL isn't the current (automatic) backend
-    if (!sycl_backend_is_used && sycl_kernel_invocation_type != sycl::kernel_invocation_type::automatic) {
-        detail::log_untracked(verbosity_level::full | verbosity_level::warning,
-                              "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}\n",
-                              sycl_kernel_invocation_type);
+    {
+        // parse kernel invocation type when using SYCL as backend
+        sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as<decltype(sycl_kernel_invocation_type)>();
+
+        // assemble warning condition
+        const std::vector<plssvm::target_platform> target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target };
+        const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl);
+
+        // warn if kernel invocation type is explicitly set but SYCL isn't the current (automatic) backend
+        if (!sycl_backend_is_used && sycl_kernel_invocation_type != sycl::kernel_invocation_type::automatic) {
+            detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}\n",
+                                  sycl_kernel_invocation_type);
+        }
+
+        // parse SYCL implementation used in the SYCL backend
+        sycl_implementation_type = result["sycl_implementation_type"].as<decltype(sycl_implementation_type)>();
+
+        // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend
+        if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) {
+            detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n",
+                                  sycl_implementation_type);
+        }
     }
+#endif
 
-    // parse SYCL implementation used in the SYCL backend
-    sycl_implementation_type = result["sycl_implementation_type"].as<decltype(sycl_implementation_type)>();
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    {
+        // parse execution space when using Kokkos as backend
+        kokkos_execution_space = result["kokkos_execution_space"].as<decltype(kokkos_execution_space)>();
 
-    // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend
-    if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) {
-        detail::log_untracked(verbosity_level::full | verbosity_level::warning,
-                              "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n",
-                              sycl_implementation_type);
+        // assemble warning condition
+        const std::vector<plssvm::target_platform> target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target };
+        const bool kokkos_backend_is_used = backend == backend_type::kokkos || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::kokkos);
+
+        // warn if the kokkos execution space is explicitly set but Kokkos isn't the current (automatic) backend
+        if (!kokkos_backend_is_used && kokkos_execution_space != kokkos::execution_space::automatic) {
+            detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  "WARNING: explicitly set a Kokkos execution space but the current backend isn't Kokkos; ignoring --kokkos_execution_space={}\n",
+                                  kokkos_execution_space);
+        }
     }
 #endif
 

From 46da12f1e77afc9b8132be142e695cc26a6912f5 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:22:35 +0100
Subject: [PATCH 067/123] Update manpage.

---
 docs/plssvm-train.1.in | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/plssvm-train.1.in b/docs/plssvm-train.1.in
index b52853dac..e34541848 100644
--- a/docs/plssvm-train.1.in
+++ b/docs/plssvm-train.1.in
@@ -17,7 +17,10 @@ plssvm-train is a utility to train an LS-SVM using different backends to target
 set type of kernel function.
     0 -- linear: u'*v
     1 -- polynomial: (gamma*u'*v + coef0)^degree
-    2 -- radial basis function: exp(-gamma*|u-v|^2) (default: 2)
+    2 -- radial basis function: exp(-gamma*|u-v|^2)
+    3 -- sigmoid: tanh(gamma*u'*v+coef0)
+    4 -- laplacian: exp(-gamma*|u-v|_1)
+    5 -- chi_squared: exp(-gamma*sum_i((x[i]-y[i])^2/(x[i]+y[i]))) (default: 2)
 
 .TP
 .B -d, --degree arg
@@ -25,7 +28,7 @@ set degree in kernel function (default: 3)
 
 .TP
 .B -g, --gamma arg
-set gamma in kernel function (default: 1 / num_features)
+set gamma in kernel function (default: automatic)
 
 .TP
 .B -r, --coef0 arg

From 85c581dadebcaff1d73e2bf95f8f4af944dc219e Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:23:03 +0100
Subject: [PATCH 068/123] Add available Kokkos execution spaces to the Kokkos
 CMake summary string.

---
 src/plssvm/backends/Kokkos/CMakeLists.txt | 33 ++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
index b66927fa9..90a1f4e74 100644
--- a/src/plssvm/backends/Kokkos/CMakeLists.txt
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -122,8 +122,39 @@ target_link_libraries(${PLSSVM_ALL_LIBRARY_NAME} INTERFACE ${PLSSVM_KOKKOS_BACKE
 # mark backend library as install target
 append_local_and_parent(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME})
 
+# assemble Kokkos available execution space string
+set(PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "")
+if (Kokkos_ENABLE_CUDA)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Cuda")
+endif ()
+if (Kokkos_ENABLE_HIP)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "HIP")
+endif ()
+if (Kokkos_ENABLE_SYCL)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "SYCL")
+endif ()
+if (Kokkos_ENABLE_HPX)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "HPX")
+endif ()
+if (Kokkos_ENABLE_OPENMP)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenMP")
+endif ()
+if (Kokkos_ENABLE_OPENMPTARGET)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenMPTarget")
+endif ()
+if (Kokkos_ENABLE_OPENACC)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenACC")
+endif ()
+if (Kokkos_ENABLE_THREADS)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Threads")
+endif ()
+if (Kokkos_ENABLE_SERIAL)
+    list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Serial")
+endif ()
+set(PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}" PARENT_SCOPE)
+
 # generate summary string
-set(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_COMPILER " - Kokkos:")
+set(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_COMPILER " - Kokkos (${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}):")
 include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake)
 assemble_summary_string(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS)
 # do not print any special target architecture information

From 5ed9e536b1600bf395042d9bcd0275a7d89d049f Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:36:57 +0100
Subject: [PATCH 069/123] Update README to also include the new Kokkos backend.

---
 README.md | 67 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index e7de709b3..71a96e499 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,7 @@ The main highlights of our SVM implementations are:
    - [HIP](https://github.com/ROCm-Developer-Tools/HIP)
    - [OpenCL](https://www.khronos.org/opencl/)
    - [SYCL](https://www.khronos.org/sycl/) (supported implementations are [DPC++](https://github.com/intel/llvm) and [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp) (formerly known as hipSYCL); specifically the versions [sycl-nightly/20231201](https://github.com/intel/llvm/tree/sycl-nightly/20230110) and AdaptiveCpp release [v24.06.0](https://github.com/AdaptiveCpp/AdaptiveCpp/releases/tag/v23.10.0))
+   - [Kokkos](https://github.com/kokkos/kokkos) (all execution spaces supported except `OpenMPTarget` and `OpenACC`); specifically the version [d50de97](https://github.com/kokkos/kokkos/commit/d50de979b4d095dc32dba80f72a5e009f3615db1)
 3. Six different kernel functions to be able to classify a large variety of different problems:
    - linear: $\vec{u}^T$ $\cdot$ $\vec{v}$
    - polynomial: $(\gamma$ $\cdot$ $\vec{u}^T$ $\cdot$ $\vec{v}$ $+$ $coef0)^{d}$
@@ -122,6 +123,10 @@ Additional dependencies for the SYCL backend:
 
 - the code must be compiled with a SYCL capable compiler; currently supported are [DPC++](https://github.com/intel/llvm) and [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp)
 
+Additional dependencies for the Kokkos backend:
+
+- a Kokkos installation with the respective execution spaces enabled; currently all execution spaces are supported except `OpenMPTarget` and `OpenACC`
+
 Additional dependencies for the stdpar backend:
 
 - the code must be compiled with a stdpar capable compiler; currently supported are [nvc++](https://developer.nvidia.com/hpc-sdk), [roc-stdpar](https://github.com/ROCm/roc-stdpar), [icpx](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html), [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp), and [GNU GCC](https://gcc.gnu.org/))
@@ -262,6 +267,11 @@ The `[optional_options]` can be one or multiple of:
   - `AUTO`: check for the SYCL backend but **do not** fail if not available
   - `OFF`: do not check for the SYCL backend
 
+- `PLSSVM_ENABLE_KOKKOS_BACKEND=ON|OFF|AUTO` (default: `AUTO`):
+    - `ON`: check for the Kokkos backend and fail if not available
+    - `AUTO`: check for the Kokkos backend but **do not** fail if not available
+    - `OFF`: do not check for the Kokkos backend
+
 **Attention:** at least one backend must be enabled and available!
 
 - `PLSSVM_ENABLE_FAST_MATH=ON|OFF` (default depending on `CMAKE_BUILD_TYPE`: `ON` for Release or RelWithDebInfo, `OFF` otherwise): enable `fast-math` compiler flags for all backends
@@ -337,6 +347,10 @@ If more than one SYCL implementation is available the environment variables `PLS
 
 - `PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` (`dpcpp`|`adaptivecpp`): specify the preferred SYCL implementation if the `sycl_implementation_type` option is set to `automatic`; additional the specified SYCL implementation is used in the `plssvm::sycl` namespace, the other implementations are available in the `plssvm::dpcpp` and `plssvm::adaptivecpp` namespace respectively
 
+If the Kokkos backend is available the following additional option is available (**note**: this option takes only effect if the Kokkos SYCL execution space is available):
+
+- `PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT` (default: `ON`): enable Ahead-of-Time (AOT) compilation for the specified target platforms
+
 If the stdpar backend is available, an additional options can be set.
 
 - `PLSSVM_STDPAR_BACKEND_IMPLEMENTATION` (default: `AUTO`): explicitly specify the used stdpar implementation; must be one of: `AUTO`, `NVHPC`, `roc-stdpar`, `IntelLLVM`, `ACPP`, `GNU_TBB`.
@@ -353,24 +367,6 @@ Available configure presets:
   "openmp"                  - OpenMP backend
   "openmp_python"           - OpenMP backend + Python bindings
   "openmp_test"             - OpenMP backend tests
-  "cuda"                    - CUDA backend
-  "cuda_python"             - CUDA backend + Python bindings
-  "cuda_test"               - CUDA backend tests
-  "hip"                     - HIP backend
-  "hip_python"              - HIP backend + Python bindings
-  "hip_test"                - HIP backend tests
-  "opencl"                  - OpenCL backend
-  "opencl_python"           - OpenCL backend + Python bindings
-  "opencl_test"             - OpenCL backend tests
-  "acpp"                    - AdaptiveCpp SYCL backend
-  "acpp_python"             - AdaptiveCpp SYCL backend + Python bindings
-  "acpp_test"               - AdaptiveCpp SYCL backend tests
-  "dpcpp"                   - DPC++/icpx SYCL backend
-  "dpcpp_python"            - DPC++/icpx backend + Python bindings
-  "dpcpp_test"              - DPC++/icpx backend tests
-  "all"                     - All available backends
-  "all_python"              - All available backends + Python bindings
-  "all_test"                - All available backends tests
   "stdpar"                  - stdpar backend
   "stdpar_python"           - stdpar backend + Python bindings
   "stdpar_test"             - stdpar backend tests
@@ -389,6 +385,27 @@ Available configure presets:
   "stdpar_intelllvm"        - stdpar IntelLLVM (icpx) backend
   "stdpar_intelllvm_python" - stdpar IntelLLVM (icpx) backend + Python bindings
   "stdpar_intelllvm_test"   - stdpar IntelLLVM (icpx) backend tests
+  "cuda"                    - CUDA backend
+  "cuda_python"             - CUDA backend + Python bindings
+  "cuda_test"               - CUDA backend tests
+  "hip"                     - HIP backend
+  "hip_python"              - HIP backend + Python bindings
+  "hip_test"                - HIP backend tests
+  "opencl"                  - OpenCL backend
+  "opencl_python"           - OpenCL backend + Python bindings
+  "opencl_test"             - OpenCL backend tests
+  "acpp"                    - AdaptiveCpp SYCL backend
+  "acpp_python"             - AdaptiveCpp SYCL backend + Python bindings
+  "acpp_test"               - AdaptiveCpp SYCL backend tests
+  "dpcpp"                   - DPC++/icpx SYCL backend
+  "dpcpp_python"            - DPC++/icpx backend + Python bindings
+  "dpcpp_test"              - DPC++/icpx backend tests
+  "kokkos"                  - Kokkos backend
+  "kokkos_python"           - Kokkos backend + Python bindings
+  "kokkos_test"             - Kokkos backend tests
+  "all"                     - All available backends
+  "all_python"              - All available backends + Python bindings
+  "all_test"                - All available backends tests
 ```
 
 With these presets, building and testing, e.g., our CUDA backend is as simple as typing (in the PLSSVM root directory):
@@ -532,6 +549,8 @@ Usage:
                                 choose the kernel invocation type when using SYCL as backend: automatic|nd_range (default: automatic)
       --sycl_implementation_type arg
                                 choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic)
+      --kokkos_execution_space arg
+                                choose the Kokkos execution space to be used in the Kokkos backend: automatic|Cuda|OpenMP|Serial (default: automatic)
       --performance_tracking arg
                                 the output YAML file where the performance tracking results are written to; if not provided, the results are dumped to stderr
       --use_strings_as_labels   use strings as labels instead of plane numbers
@@ -567,10 +586,10 @@ Another example targeting NVIDIA GPUs using the SYCL backend looks like:
 
 The `--backend=automatic` option works as follows:
 
-- if the `gpu_nvidia` target is available, check for existing backends in order `cuda` 🠦 `hip` 🠦 `opencl` 🠦 `sycl` 🠦 `stdpar`
-- otherwise, if the `gpu_amd` target is available, check for existing backends in order `hip` 🠦 `opencl` 🠦 `sycl` 🠦 `stdpar`
-- otherwise, if the `gpu_intel` target is available, check for existing backends in order `sycl` 🠦 `opencl` 🠦 `stdpar`
-- otherwise, if the `cpu` target is available, check for existing backends in order `sycl` 🠦 `opencl` 🠦 `openmp` 🠦 `stdpar`
+- if the `gpu_nvidia` target is available, check for existing backends in order `cuda` 🠦 `hip` 🠦 `opencl` 🠦 `sycl` 🠦 `kokkos` 🠦 `stdpar`
+- otherwise, if the `gpu_amd` target is available, check for existing backends in order `hip` 🠦 `opencl` 🠦 `sycl` 🠦 `kokkos` 🠦 `stdpar`
+- otherwise, if the `gpu_intel` target is available, check for existing backends in order `sycl` 🠦 `opencl` 🠦 `kokkos` 🠦 `stdpar`
+- otherwise, if the `cpu` target is available, check for existing backends in order `sycl` 🠦 `kokkos` 🠦 `opencl` 🠦 `openmp` 🠦 `stdpar`
 
 Note that during CMake configuration it is guaranteed that at least one of the above combinations does exist.
 
@@ -581,11 +600,13 @@ The `--target_platform=automatic` option works for the different backends as fol
 - `HIP`: always selects an AMD GPU (if no AMD GPU is available, throws an exception)
 - `OpenCL`: tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU
 - `SYCL`: tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU
+- `Kokkos`: checks which execution spaces are available and which target platforms they support and then tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU
 - `stdpar`: target device must be selected at compile time (using `PLSSVM_TARGET_PLATFORMS`) or using environment variables at runtime
 
 The `--sycl_kernel_invocation_type` and `--sycl_implementation_type` flags are only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`.
 If the `--sycl_kernel_invocation_type` is `automatic`, the `nd_range` invocation type is currently always used.
 If the `--sycl_implementation_type` is `automatic`, the used SYCL implementation is determined by the `PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` CMake flag.
+If the `--kokkos_execution_space` is `automatic`, uses the best fitting execution space based on the provided and/or available target platforms.
 
 ### Predicting using `plssvm-predict`
 
@@ -604,6 +625,8 @@ Usage:
   -p, --target_platform arg     choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic)
       --sycl_implementation_type arg
                                 choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic)
+      --kokkos_execution_space arg
+                                choose the Kokkos execution space to be used in the Kokkos backend: automatic|Cuda|OpenMP|Serial (default: automatic)
       --performance_tracking arg
                                 the output YAML file where the performance tracking results are written to; if not provided, the results are dumped to stderr
       --use_strings_as_labels   use strings as labels instead of plane numbers

From 66dfee6e5ca1ba92249d561bfabd2a9d0df6d37f Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:41:09 +0100
Subject: [PATCH 070/123] Add support for the Kokkos backend as PLSSVM install
 component.

---
 cmake/plssvm/plssvmConfig.cmake.in     |  2 +-
 cmake/plssvm/plssvmKokkosTargets.cmake | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 cmake/plssvm/plssvmKokkosTargets.cmake

diff --git a/cmake/plssvm/plssvmConfig.cmake.in b/cmake/plssvm/plssvmConfig.cmake.in
index e6be17d15..0e4f989ec 100644
--- a/cmake/plssvm/plssvmConfig.cmake.in
+++ b/cmake/plssvm/plssvmConfig.cmake.in
@@ -25,7 +25,7 @@ find_dependency(fmt REQUIRED)
 include("${CMAKE_CURRENT_LIST_DIR}/plssvmTargets.cmake")
 
 # list all available libraries
-set(PLSSVM_SUPPORTED_COMPONENTS "OpenMP;CUDA;HIP;OpenCL;DPCPP;AdaptiveCpp;stdpar")
+set(PLSSVM_SUPPORTED_COMPONENTS "OpenMP;CUDA;HIP;OpenCL;DPCPP;AdaptiveCpp;Kokkos;stdpar")
 set(PLSSVM_DISABLED_COMPONENTS "${PLSSVM_SUPPORTED_COMPONENTS}")
 
 # check which libraries are available
diff --git a/cmake/plssvm/plssvmKokkosTargets.cmake b/cmake/plssvm/plssvmKokkosTargets.cmake
new file mode 100644
index 000000000..7ec32069a
--- /dev/null
+++ b/cmake/plssvm/plssvmKokkosTargets.cmake
@@ -0,0 +1,21 @@
+## Authors: Alexander Van Craen, Marcel Breyer
+## Copyright (C): 2018-today The PLSSVM project - All Rights Reserved
+## License: This file is part of the PLSSVM project which is released under the MIT license.
+##          See the LICENSE.md file in the project root for full license information.
+########################################################################################################################
+
+include(CMakeFindDependencyMacro)
+
+# check if the Kokkos backend is available
+if (TARGET plssvm::plssvm-Kokkos)
+    # enable Kokkos
+    find_dependency(Kokkos CONFIG)
+    # set alias targets
+    add_library(plssvm::Kokkos ALIAS plssvm::plssvm-Kokkos)
+    add_library(plssvm::kokkos ALIAS plssvm::plssvm-Kokkos)
+    # set COMPONENT to be found
+    set(plssvm_Kokkos_FOUND ON)
+else ()
+    # set COMPONENT to be NOT found
+    set(plssvm_Kokkos_FOUND OFF)
+endif ()
\ No newline at end of file

From e30677fd894c527a1a30e99218641b40a77d1d7f Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:47:08 +0100
Subject: [PATCH 071/123] Update manpages to include the Kokkos backend.

---
 CMakeLists.txt           | 12 +++++++++++-
 docs/plssvm-predict.1.in |  2 ++
 docs/plssvm-train.1.in   |  2 ++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19309a9eb..d555bd8e4 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -729,8 +729,8 @@ message(STATUS "Generating manpage files.")
 string(TIMESTAMP PLSSVM_CURRENT_BUILD_TIME "%d. %B %Y")
 string(REPLACE ";" "|" PLSSVM_PLATFORM_NAME_LIST "${PLSSVM_PLATFORM_NAME_LIST}")
 string(REPLACE ";" "|" PLSSVM_BACKEND_NAME_LIST "${PLSSVM_BACKEND_NAME_LIST}")
-string(REPLACE ";" "|" PLSSVM_SYCL_BACKEND_NAME_LIST "${PLSSVM_SYCL_BACKEND_NAME_LIST}")
 if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME})
+    string(REPLACE ";" "|" PLSSVM_SYCL_BACKEND_NAME_LIST "${PLSSVM_SYCL_BACKEND_NAME_LIST}")
     set(PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY "
 .TP
 .B --sycl_implementation_type
@@ -743,6 +743,15 @@ choose the kernel invocation type when using SYCL as backend: automatic|nd_range
 ")
 endif ()
 set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_MANPAGE_ENTRY}${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}")
+## assemble the Kokkos manpage entry
+if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME})
+    string(REPLACE ";" "|" PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}")
+    set(PLSSVM_KOKKOS_MANPAGE_ENTRY "
+.TP
+.B --kokkos_execution_space
+choose the Kokkos execution space to be used in the Kokkos backend: automatic|${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES} (default: automatic)
+")
+endif ()
 ## assemble the performance tracker manpage entry
 if (PLSSVM_ENABLE_PERFORMANCE_TRACKING)
     set(PLSSVM_PERFORMANCE_TRACKER_MANPAGE_ENTRY "
@@ -758,6 +767,7 @@ configure_file(
         ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-train.1
         @ONLY
 )
+# update manpage entry since plssvm-predict can't recognize the SYCL kernel invocation type
 set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}")
 configure_file(
         ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-predict.1.in
diff --git a/docs/plssvm-predict.1.in b/docs/plssvm-predict.1.in
index bb9e29b6b..17d6081fa 100644
--- a/docs/plssvm-predict.1.in
+++ b/docs/plssvm-predict.1.in
@@ -22,6 +22,8 @@ choose the target platform: @PLSSVM_PLATFORM_NAME_LIST@ (default: automatic)
 
 @PLSSVM_SYCL_MANPAGE_ENTRY@
 
+@PLSSVM_KOKKOS_MANPAGE_ENTRY@
+
 @PLSSVM_PERFORMANCE_TRACKER_MANPAGE_ENTRY@
 
 .TP
diff --git a/docs/plssvm-train.1.in b/docs/plssvm-train.1.in
index e34541848..fad2e4fba 100644
--- a/docs/plssvm-train.1.in
+++ b/docs/plssvm-train.1.in
@@ -64,6 +64,8 @@ choose the target platform: @PLSSVM_PLATFORM_NAME_LIST@ (default: automatic)
 
 @PLSSVM_SYCL_MANPAGE_ENTRY@
 
+@PLSSVM_KOKKOS_MANPAGE_ENTRY@
+
 @PLSSVM_PERFORMANCE_TRACKER_MANPAGE_ENTRY@
 
 .TP

From 7be8170ffdf42c4ba01bffd546459089beda8665 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:48:07 +0100
Subject: [PATCH 072/123] Add kokkos to the --help examples.

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 71a96e499..047a9fd23 100644
--- a/README.md
+++ b/README.md
@@ -543,7 +543,7 @@ Usage:
   -i, --max_iter arg            set the maximum number of CG iterations (default: num_features)
   -l, --solver arg              choose the solver: automatic|cg_explicit|cg_implicit (default: automatic)
   -a, --classification arg      the classification strategy to use for multi-class classification: oaa|oao (default: oaa)
-  -b, --backend arg             choose the backend: automatic|openmp|cuda|hip|opencl|sycl|stdpar (default: automatic)
+  -b, --backend arg             choose the backend: automatic|openmp|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic)
   -p, --target_platform arg     choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic)
       --sycl_kernel_invocation_type arg
                                 choose the kernel invocation type when using SYCL as backend: automatic|nd_range (default: automatic)
@@ -621,7 +621,7 @@ LS-SVM with multiple (GPU-)backends
 Usage:
   ./plssvm-predict [OPTION...] test_file model_file [output_file]
 
-  -b, --backend arg             choose the backend: automatic|openmp|cuda|hip|opencl|sycl|stdpar (default: automatic)
+  -b, --backend arg             choose the backend: automatic|openmp|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic)
   -p, --target_platform arg     choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic)
       --sycl_implementation_type arg
                                 choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic)

From f967c47bae9a0009338c497eb67a09839fd283e8 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:57:04 +0100
Subject: [PATCH 073/123] Add missing include documentation.

---
 bindings/Python/backends/adaptivecpp_csvm.cpp | 2 +-
 bindings/Python/backends/dpcpp_csvm.cpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bindings/Python/backends/adaptivecpp_csvm.cpp b/bindings/Python/backends/adaptivecpp_csvm.cpp
index 767853757..bf81b11ae 100644
--- a/bindings/Python/backends/adaptivecpp_csvm.cpp
+++ b/bindings/Python/backends/adaptivecpp_csvm.cpp
@@ -11,7 +11,7 @@
 #include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
 #include "plssvm/csvm.hpp"                                   // plssvm::csvm
 #include "plssvm/exceptions/exceptions.hpp"                  // plssvm::exception
-#include "plssvm/parameter.hpp"                              // plssvm::parameter
+#include "plssvm/parameter.hpp"                              // plssvm::parameter, plssvm::sycl_kernel_invocation_type
 #include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
 
 #include "bindings/Python/utility.hpp"  // check_kwargs_for_correctness, convert_kwargs_to_parameter, register_py_exception
diff --git a/bindings/Python/backends/dpcpp_csvm.cpp b/bindings/Python/backends/dpcpp_csvm.cpp
index 882d6ea37..906cb5979 100644
--- a/bindings/Python/backends/dpcpp_csvm.cpp
+++ b/bindings/Python/backends/dpcpp_csvm.cpp
@@ -11,7 +11,7 @@
 #include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
 #include "plssvm/csvm.hpp"                                   // plssvm::csvm
 #include "plssvm/exceptions/exceptions.hpp"                  // plssvm::exception
-#include "plssvm/parameter.hpp"                              // plssvm::parameter
+#include "plssvm/parameter.hpp"                              // plssvm::parameter, plssvm::sycl_kernel_invocation_type
 #include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
 
 #include "bindings/Python/utility.hpp"  // check_kwargs_for_correctness, convert_kwargs_to_parameter, register_py_exception

From d56397ad570d7da0264d6d2dae937e8bfc7a554e Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 16:58:09 +0100
Subject: [PATCH 074/123] Add kokkos_execution_space named parameter to the
 Kokkos python bindings.

---
 bindings/Python/backends/kokkos_csvm.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/bindings/Python/backends/kokkos_csvm.cpp b/bindings/Python/backends/kokkos_csvm.cpp
index ea0dc17e6..ea6c4af80 100644
--- a/bindings/Python/backends/kokkos_csvm.cpp
+++ b/bindings/Python/backends/kokkos_csvm.cpp
@@ -11,7 +11,7 @@
 #include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
 #include "plssvm/csvm.hpp"                             // plssvm::csvm
 #include "plssvm/exceptions/exceptions.hpp"            // plssvm::exception
-#include "plssvm/parameter.hpp"                        // plssvm::parameter
+#include "plssvm/parameter.hpp"                        // plssvm::parameter, plssvm::kokkos_execution_space
 #include "plssvm/target_platforms.hpp"                 // plssvm::target_platform
 
 #include "bindings/Python/utility.hpp"  // check_kwargs_for_correctness, convert_kwargs_to_parameter, register_py_exception
@@ -35,20 +35,24 @@ void init_kokkos_csvm(py::module_ &m, const py::exception<plssvm::exception> &ba
         .def(py::init<plssvm::target_platform, plssvm::parameter>(), "create an SVM with the provided target platform and parameter object")
         .def(py::init([](const py::kwargs &args) {
                  // check for valid keys
-                 check_kwargs_for_correctness(args, { "kernel_type", "degree", "gamma", "coef0", "cost" });
+                 check_kwargs_for_correctness(args, { "kernel_type", "degree", "gamma", "coef0", "cost", "kokkos_execution_space" });
                  // if one of the value keyword parameter is provided, set the respective value
                  const plssvm::parameter params = convert_kwargs_to_parameter(args);
+                 // set Kokkos execution space
+                 const plssvm::kokkos::execution_space space = args.contains("kokkos_execution_space") ? args["kokkos_execution_space"].cast<plssvm::kokkos::execution_space>() : plssvm::kokkos::execution_space::automatic;
                  // create CSVM with the default target platform
-                 return std::make_unique<plssvm::kokkos::csvm>(params);
+                 return std::make_unique<plssvm::kokkos::csvm>(params, plssvm::kokkos_execution_space = space);
              }),
              "create an SVM with the default target platform and keyword arguments")
         .def(py::init([](const plssvm::target_platform target, const py::kwargs &args) {
                  // check for valid keys
-                 check_kwargs_for_correctness(args, { "kernel_type", "degree", "gamma", "coef0", "cost" });
+                 check_kwargs_for_correctness(args, { "kernel_type", "degree", "gamma", "coef0", "cost", "kokkos_execution_space" });
                  // if one of the value keyword parameter is provided, set the respective value
                  const plssvm::parameter params = convert_kwargs_to_parameter(args);
+                 // set Kokkos execution space
+                 const plssvm::kokkos::execution_space space = args.contains("kokkos_execution_space") ? args["kokkos_execution_space"].cast<plssvm::kokkos::execution_space>() : plssvm::kokkos::execution_space::automatic;
                  // create CSVM with the provided target platform
-                 return std::make_unique<plssvm::kokkos::csvm>(target, params);
+                 return std::make_unique<plssvm::kokkos::csvm>(target, params, plssvm::kokkos_execution_space = space);
              }),
              "create an SVM with the provided target platform and keyword arguments")
         .def("get_execution_space", &plssvm::kokkos::csvm::get_execution_space, "get the Kokkos execution space used in this Kokkos SVM");
@@ -58,6 +62,7 @@ void init_kokkos_csvm(py::module_ &m, const py::exception<plssvm::exception> &ba
 
     // bind the execution space enum classes
     py::enum_<plssvm::kokkos::execution_space>(kokkos_module, "ExecutionSpace")
+        .value("AUTOMATIC", plssvm::kokkos::execution_space::cuda, "automatically determine the used Kokkos execution space (note: this does not necessarily correspond to Kokkos::DefaultExecutionSpace)")
         .value("CUDA", plssvm::kokkos::execution_space::cuda, "execution space representing execution on a CUDA device")
         .value("HIP", plssvm::kokkos::execution_space::hip, "execution space representing execution on a device supported by HIP")
         .value("SYCL", plssvm::kokkos::execution_space::sycl, "execution space representing execution on a device supported by SYCL")

From 2809daacbbcc243c25a6407a8e33e558e1db62c2 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 17:06:14 +0100
Subject: [PATCH 075/123] Allow Kokkos to be built in the all preset.

---
 cmake/presets/all.json | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cmake/presets/all.json b/cmake/presets/all.json
index 76528069b..f0794f2ba 100644
--- a/cmake/presets/all.json
+++ b/cmake/presets/all.json
@@ -13,7 +13,8 @@
         "PLSSVM_ENABLE_CUDA_BACKEND": "AUTO",
         "PLSSVM_ENABLE_HIP_BACKEND": "AUTO",
         "PLSSVM_ENABLE_OPENCL_BACKEND": "AUTO",
-        "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO"
+        "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO",
+        "PLSSVM_ENABLE_KOKKOS_BACKEND": "AUTO"
       }
     },
     {
@@ -28,6 +29,7 @@
         "PLSSVM_ENABLE_HIP_BACKEND": "AUTO",
         "PLSSVM_ENABLE_OPENCL_BACKEND": "AUTO",
         "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO",
+        "PLSSVM_ENABLE_KOKKOS_BACKEND": "AUTO",
         "PLSSVM_ENABLE_LANGUAGE_BINDINGS": "ON",
         "PLSSVM_ENABLE_PYTHON_BINDINGS": "ON"
       }
@@ -43,7 +45,8 @@
         "PLSSVM_ENABLE_CUDA_BACKEND": "AUTO",
         "PLSSVM_ENABLE_HIP_BACKEND": "AUTO",
         "PLSSVM_ENABLE_OPENCL_BACKEND": "AUTO",
-        "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO"
+        "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO",
+        "PLSSVM_ENABLE_KOKKOS_BACKEND": "AUTO"
       }
     }
   ],
@@ -84,7 +87,7 @@
       "inherits": "common",
       "filter": {
         "include": {
-          "name": "OpenMP.*|CUDA.*|HIP.*|OpenCL.*|AdaptiveCpp.*|DPCPP.*"
+          "name": "OpenMP.*|CUDA.*|HIP.*|OpenCL.*|AdaptiveCpp.*|DPCPP.*|Kokkos.*"
         }
       }
     }

From dcbbe102dd6d19a56d18ce91380877ed544a39fd Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 17:12:48 +0100
Subject: [PATCH 076/123] Update TODOs.

---
 tests/backends/Kokkos/detail/device_wrapper.cpp | 2 +-
 tests/backends/Kokkos/detail/pinned_memory.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/backends/Kokkos/detail/device_wrapper.cpp b/tests/backends/Kokkos/detail/device_wrapper.cpp
index 4547281ff..ca644ece7 100644
--- a/tests/backends/Kokkos/detail/device_wrapper.cpp
+++ b/tests/backends/Kokkos/detail/device_wrapper.cpp
@@ -99,7 +99,7 @@ struct device_list_test {
 
         // check the number of returned devices
         if (space == plssvm::kokkos::execution_space::cuda || space == plssvm::kokkos::execution_space::hip || space == plssvm::kokkos::execution_space::sycl) {
-            // TODO: OpenMP Target Offloading / OpenACC
+            // TODO: Change if multi-GPU support for Kokkos::Experimental::OpenMPTarget and/or Kokkos::Experimental::OpenACC is implemented
             // for the device execution spaces AT LEAST ONE device must be found
             EXPECT_GE(devices.size(), 1);
         } else {
diff --git a/tests/backends/Kokkos/detail/pinned_memory.cpp b/tests/backends/Kokkos/detail/pinned_memory.cpp
index aa91612d7..2569e68e7 100644
--- a/tests/backends/Kokkos/detail/pinned_memory.cpp
+++ b/tests/backends/Kokkos/detail/pinned_memory.cpp
@@ -22,7 +22,7 @@ template <typename T>
 struct kokkos_pinned_memory_test_type {
     using pinned_memory_type = plssvm::kokkos::detail::pinned_memory<T>;
 
-    constexpr static bool can_pin = false;  // TODO: try implementing in Kokkos?
+    constexpr static bool can_pin = false;
 };
 
 using kokkos_pinned_memory_tuple = std::tuple<kokkos_pinned_memory_test_type<float>, kokkos_pinned_memory_test_type<double>>;

From 9a53e7fce2404519151f3aa6e134538be27c4861 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 17:32:07 +0100
Subject: [PATCH 077/123] Add missing Kokkos related performance tracking
 entries.

---
 src/plssvm/detail/tracking/performance_tracker.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/plssvm/detail/tracking/performance_tracker.cpp b/src/plssvm/detail/tracking/performance_tracker.cpp
index 26ebda7d3..6d1323e8e 100644
--- a/src/plssvm/detail/tracking/performance_tracker.cpp
+++ b/src/plssvm/detail/tracking/performance_tracker.cpp
@@ -116,6 +116,7 @@ void performance_tracker::add_tracking_entry(const tracking_entry<cmd::parser_tr
         tracking_entries_[entry.entry_category].emplace("target", std::vector<std::string>{ fmt::format("{}", entry.entry_value.target) });
         tracking_entries_[entry.entry_category].emplace("sycl_kernel_invocation_type", std::vector<std::string>{ fmt::format("{}", entry.entry_value.sycl_kernel_invocation_type) });
         tracking_entries_[entry.entry_category].emplace("sycl_implementation_type", std::vector<std::string>{ fmt::format("{}", entry.entry_value.sycl_implementation_type) });
+        tracking_entries_[entry.entry_category].emplace("kokkos_execution_space", std::vector<std::string>{ fmt::format("{}", entry.entry_value.kokkos_execution_space) });
         tracking_entries_[entry.entry_category].emplace("strings_as_labels", std::vector<std::string>{ fmt::format("{}", entry.entry_value.strings_as_labels) });
         tracking_entries_[entry.entry_category].emplace("real_type", std::vector<std::string>{ std::string{ arithmetic_type_name<real_type>() } });
         tracking_entries_[entry.entry_category].emplace("input_filename", std::vector<std::string>{ fmt::format("\"{}\"", entry.entry_value.input_filename) });
@@ -133,6 +134,7 @@ void performance_tracker::add_tracking_entry(const tracking_entry<cmd::parser_pr
         tracking_entries_[entry.entry_category].emplace("backend", std::vector<std::string>{ fmt::format("{}", entry.entry_value.backend) });
         tracking_entries_[entry.entry_category].emplace("target", std::vector<std::string>{ fmt::format("{}", entry.entry_value.target) });
         tracking_entries_[entry.entry_category].emplace("sycl_implementation_type", std::vector<std::string>{ fmt::format("{}", entry.entry_value.sycl_implementation_type) });
+        tracking_entries_[entry.entry_category].emplace("kokkos_execution_space", std::vector<std::string>{ fmt::format("{}", entry.entry_value.kokkos_execution_space) });
         tracking_entries_[entry.entry_category].emplace("strings_as_labels", std::vector<std::string>{ fmt::format("{}", entry.entry_value.strings_as_labels) });
         tracking_entries_[entry.entry_category].emplace("real_type", std::vector<std::string>{ std::string{ arithmetic_type_name<real_type>() } });
         tracking_entries_[entry.entry_category].emplace("input_filename", std::vector<std::string>{ fmt::format("\"{}\"", entry.entry_value.input_filename) });
@@ -297,6 +299,14 @@ void performance_tracker::save(std::ostream &out) {
         "  ADAPTIVECPP_with_accelerated_CPU:  {}\n",
         adaptivecpp_sscp,
         adaptivecpp_accelerated_cpu);
+#endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    // check whether Kokkos::SYCL AOT has been enabled
+    constexpr bool kokkos_sycl_aot = PLSSVM_IS_DEFINED(PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT);
+
+    out << fmt::format(
+        "  KOKKOS_sycl_intel_llvm_with_aot:   {}\n",
+        kokkos_sycl_aot);
 #endif
     out << "\n";
 

From 5636ea14b70142782948a78a4edbb74588f8461f Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 17:47:19 +0100
Subject: [PATCH 078/123] Add missing Kokkos entry to Python bindings backend
 type enumeration.

---
 bindings/Python/backend_types.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bindings/Python/backend_types.cpp b/bindings/Python/backend_types.cpp
index 8a1fa29fb..8c6b97e85 100644
--- a/bindings/Python/backend_types.cpp
+++ b/bindings/Python/backend_types.cpp
@@ -24,7 +24,8 @@ void init_backend_types(py::module_ &m) {
         .value("CUDA", plssvm::backend_type::cuda, "CUDA to target NVIDIA GPUs only")
         .value("HIP", plssvm::backend_type::hip, "HIP to target AMD and NVIDIA GPUs")
         .value("OPENCL", plssvm::backend_type::opencl, "OpenCL to target CPUs and GPUs from different vendors")
-        .value("SYCL", plssvm::backend_type::sycl, "SYCL to target CPUs and GPUs from different vendors; currently tested SYCL implementations are DPC++ and AdaptiveCpp");
+        .value("SYCL", plssvm::backend_type::sycl, "SYCL to target CPUs and GPUs from different vendors; currently tested SYCL implementations are DPC++ and AdaptiveCpp")
+        .value("KOKKOS", plssvm::backend_type::kokkos, "Kokkos to target CPUs and GPUs from different vendors; currently all Kokkos execution spaces except Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC are supported");
 
     // bind free functions
     m.def("list_available_backends", &plssvm::list_available_backends, "list the available backends (as found during CMake configuration)");

From c187ec05353e2029a216187b2e11fed179ccd6c9 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 17:54:15 +0100
Subject: [PATCH 079/123] Add missing operator<< output for the new
 kokkos_execution_space option.

---
 src/plssvm/detail/cmd/parser_predict.cpp | 4 ++++
 src/plssvm/detail/cmd/parser_train.cpp   | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/src/plssvm/detail/cmd/parser_predict.cpp b/src/plssvm/detail/cmd/parser_predict.cpp
index 298c90da0..656d9a76d 100644
--- a/src/plssvm/detail/cmd/parser_predict.cpp
+++ b/src/plssvm/detail/cmd/parser_predict.cpp
@@ -202,6 +202,10 @@ std::ostream &operator<<(std::ostream &out, const parser_predict &params) {
         out << fmt::format("SYCL implementation type: {}\n", params.sycl_implementation_type);
     }
 
+    if (params.backend == backend_type::kokkos || params.backend == backend_type::automatic) {
+        out << fmt::format("Kokkos execution space: {}\n", params.kokkos_execution_space);
+    }
+
     out << fmt::format(
         "label_type: {}\n"
         "real_type: {}\n"
diff --git a/src/plssvm/detail/cmd/parser_train.cpp b/src/plssvm/detail/cmd/parser_train.cpp
index 31964a897..31d5b8719 100644
--- a/src/plssvm/detail/cmd/parser_train.cpp
+++ b/src/plssvm/detail/cmd/parser_train.cpp
@@ -326,6 +326,10 @@ std::ostream &operator<<(std::ostream &out, const parser_train &params) {
             params.sycl_kernel_invocation_type);
     }
 
+    if (params.backend == backend_type::kokkos || params.backend == backend_type::automatic) {
+        out << fmt::format("Kokkos execution space: {}\n", params.kokkos_execution_space);
+    }
+
     out << fmt::format(
         "classification_type: {}\n"
         "label_type: {}\n"

From de69b870fa732c192055423ea5f91309a06d2b91 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 18:07:30 +0100
Subject: [PATCH 080/123] Fix using wrong executable in cmd parser test.

---
 tests/detail/cmd/parser_predict.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/detail/cmd/parser_predict.cpp b/tests/detail/cmd/parser_predict.cpp
index 8a04c3b1d..747e1d2bc 100644
--- a/tests/detail/cmd/parser_predict.cpp
+++ b/tests/detail/cmd/parser_predict.cpp
@@ -270,7 +270,7 @@ class ParserPredictVerbosity : public ParserPredict,
 TEST_P(ParserPredictVerbosity, parsing) {
     const auto &[flag, value] = GetParam();
     // create artificial command line arguments in test fixture
-    this->CreateCMDArgs({ "./plssvm-train", flag, value, "data.libsvm", "data.libsvm.model" });
+    this->CreateCMDArgs({ "./plssvm-predict", flag, value, "data.libsvm", "data.libsvm.model" });
     // create parameter object
     const plssvm::detail::cmd::parser_predict parser{ this->get_argc(), this->get_argv() };
     // test for correctness

From 6a18cc650510a016fb0e2c46cf64ee46affde9f2 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 7 Nov 2024 18:18:43 +0100
Subject: [PATCH 081/123] Update parameter and cmd parser tests to reflect new
 kokkos_execution_space option.

---
 tests/detail/cmd/parser_predict.cpp | 71 +++++++++++++++++++++++++----
 tests/detail/cmd/parser_train.cpp   | 50 +++++++++++++++++++-
 tests/parameter.cpp                 |  4 +-
 3 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/tests/detail/cmd/parser_predict.cpp b/tests/detail/cmd/parser_predict.cpp
index 747e1d2bc..fccf2f005 100644
--- a/tests/detail/cmd/parser_predict.cpp
+++ b/tests/detail/cmd/parser_predict.cpp
@@ -11,6 +11,7 @@
 #include "plssvm/detail/cmd/parser_predict.hpp"
 
 #include "plssvm/backend_types.hpp"                       // plssvm::backend_type
+#include "plssvm/backends/Kokkos/execution_space.hpp"     // plssvm::kokkos::execution_space
 #include "plssvm/backends/SYCL/implementation_types.hpp"  // plssvm::sycl::implementation_type
 #include "plssvm/constants.hpp"                           // plssvm::real_type
 #include "plssvm/target_platforms.hpp"                    // plssvm::target_platform
@@ -67,6 +68,7 @@ TEST_F(ParserPredict, minimal_output) {
         "backend: automatic\n"
         "target platform: automatic\n"
         "SYCL implementation type: automatic\n"
+        "Kokkos execution space: automatic\n"
         "label_type: int (default)\n"
         "real_type: {}\n"
         "input file (data set): 'data.libsvm'\n"
@@ -85,6 +87,10 @@ TEST_F(ParserPredict, all_arguments) {
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
     cmd_args.insert(cmd_args.end(), { "--sycl_implementation_type", "dpcpp" });
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
+    cmd_args.insert(cmd_args.end(), { "--kokkos_execution_space", fmt::format("{}", space) });
+#endif
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
     cmd_args.insert(cmd_args.end(), { "--performance_tracking", "tracking.yaml" });
 #endif
@@ -101,6 +107,11 @@ TEST_F(ParserPredict, all_arguments) {
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::dpcpp);
 #else
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::automatic);
+#endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    EXPECT_EQ(parser.kokkos_execution_space, space);
+#else
+    EXPECT_EQ(parser.kokkos_execution_space, plssvm::kokkos::execution_space::automatic);
 #endif
     EXPECT_TRUE(parser.strings_as_labels);
     EXPECT_EQ(parser.input_filename, "data.libsvm");
@@ -117,10 +128,14 @@ TEST_F(ParserPredict, all_arguments) {
 
 TEST_F(ParserPredict, all_arguments_output) {
     // create artificial command line arguments in test fixture
-    std::vector<std::string> cmd_args = { "./plssvm-predict", "--backend", "cuda", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
+    std::vector<std::string> cmd_args = { "./plssvm-predict", "--backend", "automatic", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
     cmd_args.insert(cmd_args.end(), { "--sycl_implementation_type", "dpcpp" });
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
+    cmd_args.insert(cmd_args.end(), { "--kokkos_execution_space", fmt::format("{}", space) });
+#endif
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
     cmd_args.insert(cmd_args.end(), { "--performance_tracking", "tracking.yaml" });
 #endif
@@ -131,15 +146,27 @@ TEST_F(ParserPredict, all_arguments_output) {
     const plssvm::detail::cmd::parser_predict parser{ this->get_argc(), this->get_argv() };
 
     // test output string
-    std::string correct = fmt::format(
-        "backend: cuda\n"
+    std::string correct{
+        "backend: automatic\n"
         "target platform: gpu_nvidia\n"
-        "label_type: std::string\n"
-        "real_type: {}\n"
-        "input file (data set): 'data1.libsvm'\n"
-        "input file (model): 'data2.libsvm.model'\n"
-        "output file (prediction): 'data3.libsvm.predict'\n",
-        std::is_same_v<plssvm::real_type, float> ? "float" : "double (default)");
+    };
+#if defined(PLSSVM_HAS_SYCL_BACKEND)
+    correct += "SYCL implementation type: dpcpp\n";
+#else
+    correct += "SYCL implementation type: automatic\n";
+#endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    correct += fmt::format("Kokkos execution space: {}\n", space);
+#else
+    correct += "Kokkos execution space: automatic\n";
+#endif
+    correct += fmt::format("label_type: std::string\n"
+                           "real_type: {}\n"
+                           "input file (data set): 'data1.libsvm'\n"
+                           "input file (model): 'data2.libsvm.model'\n"
+                           "output file (prediction): 'data3.libsvm.predict'\n",
+                           std::is_same_v<plssvm::real_type, float> ? "float" : "double (default)");
+
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
     correct += "performance tracking file: 'tracking.yaml'\n";
 #endif
@@ -220,6 +247,32 @@ INSTANTIATE_TEST_SUITE_P(ParserPredict, ParserPredictSYCLImplementation, ::testi
 
 #endif  // PLSSVM_HAS_SYCL_BACKEND
 
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+
+class ParserPredictKokkosExecutionSpace : public ParserPredict,
+                                        public ::testing::WithParamInterface<std::tuple<std::string, std::string>> { };
+
+TEST_P(ParserPredictKokkosExecutionSpace, parsing) {
+    const auto &[flag, value] = GetParam();
+    // convert string to kokkos::execution_space
+    const auto kokkos_execution_space = util::convert_from_string<plssvm::kokkos::execution_space>(value);
+    // create artificial command line arguments in test fixture
+    this->CreateCMDArgs({ "./plssvm-predict", flag, value, "data.libsvm", "data.libsvm.model" });
+    // create parameter object
+    const plssvm::detail::cmd::parser_predict parser{ this->get_argc(), this->get_argv() };
+    // test for correctness
+    EXPECT_EQ(parser.kokkos_execution_space, kokkos_execution_space);
+}
+
+// clang-format off
+INSTANTIATE_TEST_SUITE_P(ParserTrain, ParserPredictKokkosExecutionSpace, ::testing::Combine(
+                ::testing::Values("--kokkos_execution_space"),
+                ::testing::Values("automatic", "Cuda", "HIP", "SYCL", "HPX", "OpenMP", "OpenMPTarget", "OpenACC", "Threads", "Serial")),
+                naming::pretty_print_parameter_flag_and_value<ParserPredictKokkosExecutionSpace>);
+// clang-format on
+
+#endif  // PLSSVM_HAS_KOKKOS_BACKEND
+
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
 
 class ParserPredictPerformanceTrackingFilename : public ParserPredict,
diff --git a/tests/detail/cmd/parser_train.cpp b/tests/detail/cmd/parser_train.cpp
index ba1392d75..78d43a25f 100644
--- a/tests/detail/cmd/parser_train.cpp
+++ b/tests/detail/cmd/parser_train.cpp
@@ -11,6 +11,7 @@
 #include "plssvm/detail/cmd/parser_train.hpp"
 
 #include "plssvm/backend_types.hpp"                          // plssvm::backend_type
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
 #include "plssvm/backends/SYCL/implementation_types.hpp"     // plssvm::sycl::implementation_type
 #include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
 #include "plssvm/classification_types.hpp"                   // plssvm::classification_type
@@ -88,6 +89,7 @@ TEST_F(ParserTrain, minimal_output) {
         "solver: automatic\n"
         "SYCL implementation type: automatic\n"
         "SYCL kernel invocation type: automatic\n"
+        "Kokkos execution space: automatic\n"
         "classification_type: one vs. all\n"
         "label_type: int\n"
         "real_type: {}\n"
@@ -105,6 +107,10 @@ TEST_F(ParserTrain, all_arguments) {
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
     cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "nd_range", "--sycl_implementation_type", "dpcpp" });
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
+    cmd_args.insert(cmd_args.end(), { "--kokkos_execution_space", fmt::format("{}", space) });
+#endif
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
     cmd_args.insert(cmd_args.end(), { "--performance_tracking", "tracking.yaml" });
 #endif
@@ -134,6 +140,11 @@ TEST_F(ParserTrain, all_arguments) {
 #else
     EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::automatic);
+#endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    EXPECT_EQ(parser.kokkos_execution_space, space);
+#else
+    EXPECT_EQ(parser.kokkos_execution_space, plssvm::kokkos::execution_space::automatic);
 #endif
     EXPECT_TRUE(parser.strings_as_labels);
     EXPECT_EQ(parser.input_filename, "data.libsvm");
@@ -147,10 +158,14 @@ TEST_F(ParserTrain, all_arguments) {
 
 TEST_F(ParserTrain, all_arguments_output) {
     // create artificial command line arguments in test fixture
-    std::vector<std::string> cmd_args = { "./plssvm-train", "--kernel_type", "1", "--degree", "2", "--gamma", "1.5", "--coef0", "-1.5", "--cost", "2", "--epsilon", "1e-10", "--max_iter", "100", "--classification", "oao", "--solver", "cg_implicit", "--backend", "sycl", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
+    std::vector<std::string> cmd_args = { "./plssvm-train", "--kernel_type", "1", "--degree", "2", "--gamma", "1.5", "--coef0", "-1.5", "--cost", "2", "--epsilon", "1e-10", "--max_iter", "100", "--classification", "oao", "--solver", "cg_implicit", "--backend", "automatic", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
     cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "nd_range", "--sycl_implementation_type", "dpcpp" });
 #endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    const std::string space = fmt::format("{}", plssvm::kokkos::list_available_execution_spaces()[1]);  // [0] would be automatic
+    cmd_args.insert(cmd_args.end(), { "--kokkos_execution_space", space });
+#endif
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
     cmd_args.insert(cmd_args.end(), { "--performance_tracking", "tracking.yaml" });
 #endif
@@ -169,7 +184,7 @@ TEST_F(ParserTrain, all_arguments_output) {
         "cost: 2\n"
         "epsilon: 1e-10\n"
         "max_iter: 100\n"
-        "backend: sycl\n"
+        "backend: automatic\n"
         "target platform: gpu_nvidia\n"
         "solver: cg_implicit\n";
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
@@ -178,6 +193,11 @@ TEST_F(ParserTrain, all_arguments_output) {
 #else
     correct += "SYCL implementation type: automatic\n"
                "SYCL kernel invocation type: automatic\n";
+#endif
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    correct += fmt::format("Kokkos execution space: {}\n", space);
+#else
+    correct += "Kokkos execution space: automatic\n";
 #endif
     correct += fmt::format(
         "classification_type: one vs. one\n"
@@ -517,6 +537,32 @@ INSTANTIATE_TEST_SUITE_P(ParserTrain, ParserTrainSYCLImplementation, ::testing::
 
 #endif  // PLSSVM_HAS_SYCL_BACKEND
 
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+
+class ParserTrainKokkosExecutionSpace : public ParserTrain,
+                                        public ::testing::WithParamInterface<std::tuple<std::string, std::string>> { };
+
+TEST_P(ParserTrainKokkosExecutionSpace, parsing) {
+    const auto &[flag, value] = GetParam();
+    // convert string to kokkos::execution_space
+    const auto kokkos_execution_space = util::convert_from_string<plssvm::kokkos::execution_space>(value);
+    // create artificial command line arguments in test fixture
+    this->CreateCMDArgs({ "./plssvm-train", flag, value, "data.libsvm" });
+    // create parameter object
+    const plssvm::detail::cmd::parser_train parser{ this->get_argc(), this->get_argv() };
+    // test for correctness
+    EXPECT_EQ(parser.kokkos_execution_space, kokkos_execution_space);
+}
+
+// clang-format off
+INSTANTIATE_TEST_SUITE_P(ParserTrain, ParserTrainKokkosExecutionSpace, ::testing::Combine(
+                ::testing::Values("--kokkos_execution_space"),
+                ::testing::Values("automatic", "Cuda", "HIP", "SYCL", "HPX", "OpenMP", "OpenMPTarget", "OpenACC", "Threads", "Serial")),
+                naming::pretty_print_parameter_flag_and_value<ParserTrainKokkosExecutionSpace>);
+// clang-format on
+
+#endif  // PLSSVM_HAS_KOKKOS_BACKEND
+
 #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED)
 
 class ParserTrainPerformanceTrackingFilename : public ParserTrain,
diff --git a/tests/parameter.cpp b/tests/parameter.cpp
index 588fc703c..7db96c6bf 100644
--- a/tests/parameter.cpp
+++ b/tests/parameter.cpp
@@ -10,6 +10,7 @@
 
 #include "plssvm/parameter.hpp"
 
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
 #include "plssvm/backends/SYCL/implementation_types.hpp"     // plssvm::sycl::implementation_type
 #include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
 #include "plssvm/constants.hpp"                              // plssvm::real_type
@@ -98,7 +99,8 @@ TEST(Parameter, construct_parameter_and_named_args) {
     const plssvm::parameter param{ param_base,
                                    plssvm::kernel_type = plssvm::kernel_function_type::rbf,
                                    plssvm::sycl_implementation_type = plssvm::sycl::implementation_type::adaptivecpp,
-                                   plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range };
+                                   plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range,
+                                   plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda };
 
     // test default values
     EXPECT_EQ(param.kernel_type, plssvm::kernel_function_type::rbf);

From 0ac498b0d4a50c436e286fd4a54b5ce9c80d960b Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 8 Nov 2024 12:09:43 +0100
Subject: [PATCH 082/123] Add missing command line parameters to tests.

---
 tests/backends/Kokkos/kokkos_csvm.cpp | 775 +++++++++++++++++++++-----
 tests/detail/cmd/parser_predict.cpp   |   2 +-
 tests/detail/cmd/parser_train.cpp     |   2 +-
 3 files changed, 648 insertions(+), 131 deletions(-)

diff --git a/tests/backends/Kokkos/kokkos_csvm.cpp b/tests/backends/Kokkos/kokkos_csvm.cpp
index e7af88d5b..5fe50d46e 100644
--- a/tests/backends/Kokkos/kokkos_csvm.cpp
+++ b/tests/backends/Kokkos/kokkos_csvm.cpp
@@ -8,155 +8,672 @@
  * @brief Tests for the functionality related to the Kokkos backend.
  */
 
-#include "plssvm/backends/Kokkos/csvm.hpp"        // plssvm::kokkos::csvm
-#include "plssvm/backends/Kokkos/exceptions.hpp"  // plssvm::kokkos::backend_exception
-#include "plssvm/detail/type_list.hpp"          // plssvm::detail::label_type_list
-#include "plssvm/kernel_function_types.hpp"     // plssvm::kernel_function_type
-#include "plssvm/parameter.hpp"                 // plssvm::parameter
-#include "plssvm/target_platforms.hpp"          // plssvm::target_platform
+#include "plssvm/backends/Kokkos/csvm.hpp"             // plssvm::kokkos::csvm
+#include "plssvm/backends/Kokkos/detail/utility.hpp"   // plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping
+#include "plssvm/backends/Kokkos/exceptions.hpp"       // plssvm::kokkos::backend_exception
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
+#include "plssvm/detail/type_list.hpp"                 // plssvm::detail::label_type_list
+#include "plssvm/kernel_function_types.hpp"            // plssvm::kernel_function_type
+#include "plssvm/parameter.hpp"                        // plssvm::parameter
+#include "plssvm/target_platforms.hpp"                 // plssvm::target_platform, plssvm::list_available_target_platforms
 
-#include "tests/backends/Kokkos/mock_kokkos_csvm.hpp"
 #include "tests/backends/generic_csvm_tests.hpp"      // generic CSVM tests to instantiate
 #include "tests/backends/generic_gpu_csvm_tests.hpp"  // generic GPU CSVM tests to instantiate
-#include "tests/custom_test_macros.hpp"               // EXPECT_THROW_WHAT
-#include "tests/naming.hpp"                           // naming::test_parameter_to_name
-#include "tests/types_to_test.hpp"                    // util::{cartesian_type_product_t, combine_test_parameters_gtest_t}
-#include "tests/utility.hpp"                          // util::redirect_output
+#include "tests/backends/Kokkos/mock_kokkos_csvm.hpp"
+#include "tests/custom_test_macros.hpp"  // EXPECT_THROW_WHAT
+#include "tests/naming.hpp"              // naming::test_parameter_to_name
+#include "tests/types_to_test.hpp"       // util::{cartesian_type_product_t, combine_test_parameters_gtest_t}
+#include "tests/utility.hpp"             // util::redirect_output
 
 #include "gtest/gtest.h"  // TEST_F, EXPECT_NO_THROW, INSTANTIATE_TYPED_TEST_SUITE_P, ::testing::Test
 
-#include <tuple>  // std::make_tuple, std::tuple
+#include <map>     // std::map
+#include <tuple>   // std::make_tuple, std::tuple
+#include <vector>  // std::vector
 
 class KokkosCSVM : public ::testing::Test,
-                 private util::redirect_output<> { };
-
-//// check whether the constructor correctly fails when using an incompatible target platform
-//TEST_F(CUDACSVM, construct_parameter) {
-//#if defined(PLSSVM_HAS_NVIDIA_TARGET)
-//    // the automatic target platform must always be available
-//    EXPECT_NO_THROW(plssvm::cuda::csvm{ plssvm::parameter{} });
-//#else
-//    EXPECT_THROW_WHAT(plssvm::cuda::csvm{ plssvm::parameter{} },
-//                      plssvm::cuda::backend_exception,
-//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
-//#endif
-//}
-//
-//TEST_F(CUDACSVM, construct_target_and_parameter) {
-//    // create parameter struct
-//    const plssvm::parameter params{};
-//
-//#if defined(PLSSVM_HAS_NVIDIA_TARGET)
-//    // only automatic or gpu_nvidia are allowed as target platform for the CUDA backend
-//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::target_platform::automatic, params }));
-//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::target_platform::gpu_nvidia, params }));
-//#else
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::automatic, params }),
-//                      plssvm::cuda::backend_exception,
-//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_nvidia, params }),
-//                      plssvm::cuda::backend_exception,
-//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
-//#endif
-//
-//    // all other target platforms must throw
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::cpu, params }),
-//                      plssvm::cuda::backend_exception,
-//                      "Invalid target platform 'cpu' for the CUDA backend!");
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_amd, params }),
-//                      plssvm::cuda::backend_exception,
-//                      "Invalid target platform 'gpu_amd' for the CUDA backend!");
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_intel, params }),
-//                      plssvm::cuda::backend_exception,
-//                      "Invalid target platform 'gpu_intel' for the CUDA backend!");
-//}
-//
-//TEST_F(CUDACSVM, construct_named_args) {
-//#if defined(PLSSVM_HAS_NVIDIA_TARGET)
-//    // only automatic or gpu_nvidia are allowed as target platform for the CUDA backend
-//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
-//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::cost = 2.0 }));
-//#else
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
-//                      plssvm::cuda::backend_exception,
-//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
-//#endif
-//}
-//
-//TEST_F(CUDACSVM, construct_target_and_named_args) {
-//#if defined(PLSSVM_HAS_NVIDIA_TARGET)
-//    // only automatic or gpu_nvidia are allowed as target platform for the CUDA backend
-//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::target_platform::automatic, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
-//    EXPECT_NO_THROW((plssvm::cuda::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::cost = 2.0 }));
-//#else
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::automatic, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
-//                      plssvm::cuda::backend_exception,
-//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::cost = 2.0 }),
-//                      plssvm::cuda::backend_exception,
-//                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
-//#endif
-//
-//    // all other target platforms must throw
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::cpu, plssvm::cost = 2.0 }),
-//                      plssvm::cuda::backend_exception,
-//                      "Invalid target platform 'cpu' for the CUDA backend!");
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_amd, plssvm::cost = 2.0 }),
-//                      plssvm::cuda::backend_exception,
-//                      "Invalid target platform 'gpu_amd' for the CUDA backend!");
-//    EXPECT_THROW_WHAT((plssvm::cuda::csvm{ plssvm::target_platform::gpu_intel, plssvm::cost = 2.0 }),
-//                      plssvm::cuda::backend_exception,
-//                      "Invalid target platform 'gpu_intel' for the CUDA backend!");
-//}
-
-template <bool mock_grid_size>
-struct kokkos_csvm_test_type {
-    using mock_csvm_type = mock_kokkos_csvm<mock_grid_size>;
-    using csvm_type = plssvm::kokkos::csvm;
-    using device_ptr_type = typename csvm_type::device_ptr_type;
-    inline constexpr static auto additional_arguments = std::make_tuple();
-};
-
-using kokkos_csvm_test_tuple = std::tuple<kokkos_csvm_test_type<false>>;
-using kokkos_csvm_test_label_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple, plssvm::detail::supported_label_types>;
-using kokkos_csvm_test_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple>;
+                   private util::redirect_output<> { };
+
+TEST_F(KokkosCSVM, construct_parameter) {  // execution_space automatic, target_platform automatic
+    // check whether the execution space would be automatically determined as either OpenMPTarget or OpenACC
+    const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
+    plssvm::kokkos::execution_space space{};
+    for (const plssvm::target_platform target : plssvm::list_available_target_platforms()) {
+        if (plssvm::detail::contains(available_combinations, target)) {
+            space = available_combinations.at(target).front();
+            break;
+        }
+    }
+
+    // must throw an exception if the execution space would be OpenMPTarget or OpenACC
+    if (space == plssvm::kokkos::execution_space::openmp_target || space == plssvm::kokkos::execution_space::openacc) {
+        EXPECT_THROW_WHAT(plssvm::kokkos::csvm{ plssvm::parameter{} },
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("The Kokkos execution space {} is currently not supported !", space));
+    } else {
+        EXPECT_NO_THROW(plssvm::kokkos::csvm{ plssvm::parameter{} });
+    }
+}
+
+TEST_F(KokkosCSVM, construct_target_and_parameter) {  // execution_space automatic, target_platform explicit
+    // create parameter struct
+    const plssvm::parameter params{};
+
+    // automatic should always work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::automatic, params }));
+
+    const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
+    const auto target_supported = [&](const plssvm::target_platform target) {
+        return plssvm::detail::contains(available_combinations, target);
+    };
+
+#if defined(PLSSVM_HAS_CPU_TARGET)
+    if (target_supported(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform cpu!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params }),
+                      plssvm::kokkos::backend_exception,
+                      "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+#endif
+
+#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+    if (target_supported(plssvm::target_platform::gpu_nvidia)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_nvidia!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params }),
+                      plssvm::kokkos::backend_exception,
+                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+#endif
+
+#if defined(PLSSVM_HAS_AMD_TARGET)
+    if (target_supported(plssvm::target_platform::gpu_amd)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_amd!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params }),
+                      plssvm::kokkos::backend_exception,
+                      "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+#endif
+
+#if defined(PLSSVM_HAS_INTEL_TARGET)
+    if (target_supported(plssvm::target_platform::gpu_intel)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_intel!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params }),
+                      plssvm::kokkos::backend_exception,
+                      "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+#endif
+}
+
+TEST_F(KokkosCSVM, construct_execution_space_and_parameter) {  // execution_space explicit, target_platform automatic
+    // create parameter struct
+    const plssvm::parameter params{};
+
+    // automatic should always work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::automatic }));
+
+#if defined(KOKKOS_ENABLE_CUDA)
+    // explicitly providing the Cuda execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace Cuda is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+    // explicitly providing the HIP execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace HIP is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_SYCL)
+    // explicitly providing the SYCL execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::sycl }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::sycl }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace SYCL is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_HPX)
+    // explicitly providing the HPX execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace HPX is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
+    // explicitly providing the OpenMP execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace OpenMP is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    // explicitly providing the OpenMPTarget execution space currently unsupported
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp_target }),
+                      plssvm::kokkos::backend_exception,
+                      "The Kokkos execution space OpenMPTarget is currently not supported !");
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp_target }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace OpenMPTarget is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENACC)
+    // explicitly providing the OpenACC execution space currently unsupported
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openacc }),
+                      plssvm::kokkos::backend_exception,
+                      "The Kokkos execution space OpenACC is currently not supported !");
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openacc }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace OpenACC is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+    // explicitly providing the Threads execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace Threads is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_SERIAL)
+    // explicitly providing the Serial execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace Serial is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+}
+
+TEST_F(KokkosCSVM, construct_target_and_execution_space_and_parameter) {  // execution_space explicit, target_platform explicit
+    // create parameter struct
+    const plssvm::parameter params{};
+
+    // list all possible execution spaces
+    std::vector<plssvm::kokkos::execution_space> all_execution_spaces{
+        plssvm::kokkos::execution_space::cuda,
+        plssvm::kokkos::execution_space::hip,
+        plssvm::kokkos::execution_space::sycl,
+        plssvm::kokkos::execution_space::hpx,
+        plssvm::kokkos::execution_space::openmp,
+        plssvm::kokkos::execution_space::openmp_target,
+        plssvm::kokkos::execution_space::openacc,
+        plssvm::kokkos::execution_space::threads,
+        plssvm::kokkos::execution_space::serial
+    };
+    const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
+    const auto combination_exists = [&](const plssvm::target_platform target, const plssvm::kokkos::execution_space space) {
+        return plssvm::detail::contains(available_combinations, target) && plssvm::detail::contains(available_combinations.at(target), space);
+    };
+    const auto execution_space_available = [&](const plssvm::kokkos::execution_space space) {
+        return plssvm::detail::contains(plssvm::kokkos::list_available_execution_spaces(), space);
+    };
+
+#if defined(PLSSVM_HAS_CPU_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::cpu, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform cpu!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_nvidia, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_nvidia!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_AMD_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_amd, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_amd!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_INTEL_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_intel, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_intel!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+}
+
+TEST_F(KokkosCSVM, construct_named_args) {  // execution_space automatic, target_platform automatic
+    // check whether the execution space would be automatically determined as either OpenMPTarget or OpenACC
+    const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
+    plssvm::kokkos::execution_space space{};
+    for (const plssvm::target_platform target : plssvm::list_available_target_platforms()) {
+        if (plssvm::detail::contains(available_combinations, target)) {
+            space = available_combinations.at(target).front();
+            break;
+        }
+    }
+
+    // must throw an exception if the execution space would be OpenMPTarget or OpenACC
+    if (space == plssvm::kokkos::execution_space::openmp_target || space == plssvm::kokkos::execution_space::openacc) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("The Kokkos execution space {} is currently not supported !", space));
+    } else {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
+        EXPECT_NO_THROW(plssvm::kokkos::csvm{ plssvm::cost = 2.0 });
+    }
+}
+
+TEST_F(KokkosCSVM, construct_target_and_named_args) {  // execution_space automatic, target_platform explicit
+    // automatic should always work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::automatic, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
+
+    const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
+    const auto target_supported = [&](const plssvm::target_platform target) {
+        return plssvm::detail::contains(available_combinations, target);
+    };
+
+#if defined(PLSSVM_HAS_CPU_TARGET)
+    if (target_supported(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform cpu!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                      plssvm::kokkos::backend_exception,
+                      "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+#endif
+
+#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+    if (target_supported(plssvm::target_platform::gpu_nvidia)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_nvidia!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                      plssvm::kokkos::backend_exception,
+                      "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+#endif
+
+#if defined(PLSSVM_HAS_AMD_TARGET)
+    if (target_supported(plssvm::target_platform::gpu_amd)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_amd!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                      plssvm::kokkos::backend_exception,
+                      "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+#endif
+
+#if defined(PLSSVM_HAS_INTEL_TARGET)
+    if (target_supported(plssvm::target_platform::gpu_intel)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                          plssvm::kokkos::backend_exception,
+                          fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_intel!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }),
+                      plssvm::kokkos::backend_exception,
+                      "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+#endif
+}
+
+TEST_F(KokkosCSVM, construct_execution_space_and_named_args) {  // execution_space explicit, target_platform automatic
+    // automatic should always work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::automatic }));
+
+#if defined(KOKKOS_ENABLE_CUDA)
+    // explicitly providing the Cuda execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace Cuda is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+    // explicitly providing the HIP execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace HIP is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_SYCL)
+    // explicitly providing the SYCL execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::sycl }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::sycl }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace SYCL is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_HPX)
+    // explicitly providing the HPX execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace HPX is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMP)
+    // explicitly providing the OpenMP execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace OpenMP is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+    // explicitly providing the OpenMPTarget execution space currently unsupported
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp_target }),
+                      plssvm::kokkos::backend_exception,
+                      "The Kokkos execution space OpenMPTarget is currently not supported !");
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp_target }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace OpenMPTarget is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_OPENACC)
+    // explicitly providing the OpenACC execution space currently unsupported
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openacc }),
+                      plssvm::kokkos::backend_exception,
+                      "The Kokkos execution space OpenACC is currently not supported !");
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openacc }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace OpenACC is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_THREADS)
+    // explicitly providing the Threads execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace Threads is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_SERIAL)
+    // explicitly providing the Serial execution space should work
+    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }));
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace Serial is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+}
+
+TEST_F(KokkosCSVM, construct_target_and_execution_space_and_named_args) {  // execution_space explicit, target_platform explicit
+    // list all possible execution spaces
+    std::vector<plssvm::kokkos::execution_space> all_execution_spaces{
+        plssvm::kokkos::execution_space::cuda,
+        plssvm::kokkos::execution_space::hip,
+        plssvm::kokkos::execution_space::sycl,
+        plssvm::kokkos::execution_space::hpx,
+        plssvm::kokkos::execution_space::openmp,
+        plssvm::kokkos::execution_space::openmp_target,
+        plssvm::kokkos::execution_space::openacc,
+        plssvm::kokkos::execution_space::threads,
+        plssvm::kokkos::execution_space::serial
+    };
+    const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
+    const auto combination_exists = [&](const plssvm::target_platform target, const plssvm::kokkos::execution_space space) {
+        return plssvm::detail::contains(available_combinations, target) && plssvm::detail::contains(available_combinations.at(target), space);
+    };
+    const auto execution_space_available = [&](const plssvm::kokkos::execution_space space) {
+        return plssvm::detail::contains(plssvm::kokkos::list_available_execution_spaces(), space);
+    };
+
+#if defined(PLSSVM_HAS_CPU_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::cpu, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform cpu!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_nvidia, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_nvidia!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_AMD_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_amd, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_amd!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_INTEL_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_intel, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_intel!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+}
+
+TEST_F(KokkosCSVM, get_execution_space) {
+    // construct default CSVM
+    const plssvm::kokkos::csvm svm{ plssvm::parameter{} };
+
+    // after construction: get_execution_space must refer to a plssvm::kokkos::execution_space that is not automatic
+    EXPECT_NE(svm.get_execution_space(), plssvm::kokkos::execution_space::automatic);
+}
+
+ template <bool mock_grid_size>
+ struct kokkos_csvm_test_type {
+     using mock_csvm_type = mock_kokkos_csvm<mock_grid_size>;
+     using csvm_type = plssvm::kokkos::csvm;
+     using device_ptr_type = typename csvm_type::device_ptr_type;
+     inline constexpr static auto additional_arguments = std::make_tuple();
+ };
+
+ using kokkos_csvm_test_tuple = std::tuple<kokkos_csvm_test_type<false>>;
+ using kokkos_csvm_test_label_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple, plssvm::detail::supported_label_types>;
+ using kokkos_csvm_test_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple>;
 
 // the tests used in the instantiated GTest test suites
-using kokkos_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list>;
-using kokkos_solver_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_type_list>;
-using kokkos_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::kernel_function_type_list>;
-using kokkos_solver_and_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_and_kernel_function_type_list>;
-using kokkos_label_type_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::kernel_function_and_classification_type_list>;
-using kokkos_label_type_solver_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::solver_and_kernel_function_and_classification_type_list>;
+ using kokkos_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list>;
+ using kokkos_solver_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_type_list>;
+ using kokkos_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::kernel_function_type_list>;
+ using kokkos_solver_and_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_and_kernel_function_type_list>;
+ using kokkos_label_type_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::kernel_function_and_classification_type_list>;
+ using kokkos_label_type_solver_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::solver_and_kernel_function_and_classification_type_list>;
 
 // instantiate type-parameterized tests
 // generic CSVM tests
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolver, kokkos_solver_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunction, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunctionClassification, kokkos_label_type_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
-//INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunctionClassification, kokkos_label_type_solver_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolver, kokkos_solver_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunction, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunctionClassification, kokkos_label_type_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+// INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunctionClassification, kokkos_label_type_solver_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
 
 // generic CSVM DeathTests
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverDeathTest, kokkos_solver_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMKernelFunctionDeathTest, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverKernelFunctionDeathTest, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverDeathTest, kokkos_solver_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMKernelFunctionDeathTest, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverKernelFunctionDeathTest, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
 
 // generic GPU CSVM tests - correct grid sizes
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
 
 // generic GPU CSVM DeathTests - correct grid sizes
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericGPUCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericGPUCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
 
-using kokkos_mock_csvm_test_tuple = std::tuple<kokkos_csvm_test_type<true>>;
-using kokkos_mock_csvm_test_type_list = util::cartesian_type_product_t<kokkos_mock_csvm_test_tuple>;
+ using kokkos_mock_csvm_test_tuple = std::tuple<kokkos_csvm_test_type<true>>;
+ using kokkos_mock_csvm_test_type_list = util::cartesian_type_product_t<kokkos_mock_csvm_test_tuple>;
 
-using kokkos_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list>;
-using kokkos_mock_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list, util::kernel_function_type_list>;
+ using kokkos_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list>;
+ using kokkos_mock_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list, util::kernel_function_type_list>;
 
 // generic GPU CSVM tests - mocked grid sizes
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVM, kokkos_mock_csvm_test_type_gtest, naming::test_parameter_to_name);
-INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVMKernelFunction, kokkos_mock_kernel_function_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVM, kokkos_mock_csvm_test_type_gtest, naming::test_parameter_to_name);
+ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVMKernelFunction, kokkos_mock_kernel_function_type_gtest, naming::test_parameter_to_name);
diff --git a/tests/detail/cmd/parser_predict.cpp b/tests/detail/cmd/parser_predict.cpp
index fccf2f005..72311f89b 100644
--- a/tests/detail/cmd/parser_predict.cpp
+++ b/tests/detail/cmd/parser_predict.cpp
@@ -195,7 +195,7 @@ TEST_P(ParserPredictBackend, parsing) {
 // clang-format off
 INSTANTIATE_TEST_SUITE_P(ParserPredict, ParserPredictBackend, ::testing::Combine(
                 ::testing::Values("-b", "--backend"),
-                ::testing::Values("automatic", "OpenMP", "CUDA", "HIP", "OpenCL", "SYCL")),
+                ::testing::Values("automatic", "OpenMP", "stdpar", "CUDA", "HIP", "OpenCL", "SYCL", "Kokkos")),
                 naming::pretty_print_parameter_flag_and_value<ParserPredictBackend>);
 // clang-format on
 
diff --git a/tests/detail/cmd/parser_train.cpp b/tests/detail/cmd/parser_train.cpp
index 78d43a25f..071e867a0 100644
--- a/tests/detail/cmd/parser_train.cpp
+++ b/tests/detail/cmd/parser_train.cpp
@@ -463,7 +463,7 @@ TEST_P(ParserTrainBackend, parsing) {
 // clang-format off
 INSTANTIATE_TEST_SUITE_P(ParserTrain, ParserTrainBackend, ::testing::Combine(
                 ::testing::Values("-b", "--backend"),
-                ::testing::Values("automatic", "OpenMP", "CUDA", "HIP", "OpenCL", "SYCL")),
+                ::testing::Values("automatic", "OpenMP", "stdpar", "CUDA", "HIP", "OpenCL", "SYCL", "Kokkos")),
                 naming::pretty_print_parameter_flag_and_value<ParserTrainBackend>);
 // clang-format on
 

From c27943507be3abe65534ee13439c4b3ea7564f17 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 8 Nov 2024 12:10:06 +0100
Subject: [PATCH 083/123] Fix formatting.

---
 src/main_scale.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main_scale.cpp b/src/main_scale.cpp
index 1253df454..365f2eb2c 100644
--- a/src/main_scale.cpp
+++ b/src/main_scale.cpp
@@ -17,7 +17,7 @@
 #include "plssvm/detail/utility.hpp"                       // PLSSVM_IS_DEFINED
 
 #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED)
-    #include "hws/system_hardware_sampler.hpp" // hws::system_hardware_sampler
+    #include "hws/system_hardware_sampler.hpp"  // hws::system_hardware_sampler
 #endif
 
 #include <algorithm>   // std::for_each

From 9288a33c45615a749812f3422ab56efe2fe00f7f Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 8 Nov 2024 12:23:07 +0100
Subject: [PATCH 084/123] Fix failing tests.

---
 tests/detail/tracking/performance_tracker.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/detail/tracking/performance_tracker.cpp b/tests/detail/tracking/performance_tracker.cpp
index b81542ee7..b74016dea 100644
--- a/tests/detail/tracking/performance_tracker.cpp
+++ b/tests/detail/tracking/performance_tracker.cpp
@@ -340,7 +340,7 @@ TEST_F(PerformanceTracker, add_parser_train_tracking_entry) {
     // check entries for correctness
     EXPECT_EQ(entries.size(), 1);
 
-    ASSERT_EQ(entries.at("parameter").size(), 17);
+    ASSERT_EQ(entries.at("parameter").size(), 18);
 }
 
 TEST_F(PerformanceTracker, add_parser_predict_tracking_entry) {
@@ -363,7 +363,7 @@ TEST_F(PerformanceTracker, add_parser_predict_tracking_entry) {
     // check entries for correctness
     EXPECT_EQ(entries.size(), 1);
 
-    ASSERT_EQ(entries.at("parameter").size(), 9);
+    ASSERT_EQ(entries.at("parameter").size(), 10);
 }
 
 TEST_F(PerformanceTracker, add_parser_scale_tracking_entry) {

From 1e36ab1386d2551548b2e5ec4e4405727531797c Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 8 Nov 2024 23:55:27 +0100
Subject: [PATCH 085/123] The execution space implementation must always be
 available -> moved the file to the base sources and conditionally disabled
 Kokkos includes to be able to compile the file even if Kokkos is disabled.

---
 CMakeLists.txt                                 | 1 +
 src/plssvm/backends/Kokkos/CMakeLists.txt      | 1 -
 src/plssvm/backends/Kokkos/execution_space.cpp | 8 ++++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d555bd8e4..790ec4268 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,7 @@ endif ()
 ########################################################################################################################
 ## set base sources
 set(PLSSVM_BASE_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/Kokkos/execution_space.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/implementation_types.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/kernel_invocation_types.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/stdpar/implementation_types.cpp
diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
index 90a1f4e74..960df04b1 100644
--- a/src/plssvm/backends/Kokkos/CMakeLists.txt
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -28,7 +28,6 @@ set(PLSSVM_KOKKOS_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp
     ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp
     ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp
-    ${CMAKE_CURRENT_LIST_DIR}/execution_space.cpp
 )
 
 # set target properties
diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp
index 0caae212f..e6401367c 100644
--- a/src/plssvm/backends/Kokkos/execution_space.cpp
+++ b/src/plssvm/backends/Kokkos/execution_space.cpp
@@ -8,8 +8,10 @@
 
 #include "plssvm/backends/Kokkos/execution_space.hpp"
 
-#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp"  // plssvm::kokkos::detail::constexpr_available_execution_spaces
-#include "plssvm/detail/string_utility.hpp"                                        // plssvm::detail::to_lower_case
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    #include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp"  // plssvm::kokkos::detail::constexpr_available_execution_spaces
+#endif
+#include "plssvm/detail/string_utility.hpp"  // plssvm::detail::to_lower_case
 
 #include <array>    // std::array
 #include <ios>      // std::ios::failbit
@@ -80,9 +82,11 @@ std::istream &operator>>(std::istream &in, execution_space &space) {
 std::vector<execution_space> list_available_execution_spaces() {
     // always add the automatic execution space
     std::vector<execution_space> spaces{ execution_space::automatic };
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     // add all other available execution spaces
     constexpr auto arr = detail::constexpr_available_execution_spaces();
     spaces.insert(spaces.cend(), arr.begin(), arr.end());
+#endif
     return spaces;
 }
 

From 1a8e092c5975009576f71a2a84372dad8c0f9f55 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 9 Nov 2024 00:24:13 +0100
Subject: [PATCH 086/123] Now the execution space file can also be compiled if
 Kokkos is not available.

---
 .../constexpr_available_execution_spaces.hpp  | 28 +++++++++++--------
 src/plssvm/backends/Kokkos/CMakeLists.txt     | 16 +++++++++++
 .../backends/Kokkos/execution_space.cpp       |  8 ++----
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
index ea5dafb02..92f908fa7 100644
--- a/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
+++ b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
@@ -13,9 +13,13 @@
 #ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_
 
-#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
+// if the variable isn't set, no Kokkos execution space is available
+// -> explicitly set it to 0!
+#if !defined(PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES)
+    #define PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES 0
+#endif
 
-#include "Kokkos_Core.hpp"  // Kokkos macros, Kokkos ExecutionSpace types
+#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space
 
 #include <array>  // std::array
 
@@ -30,32 +34,32 @@ namespace plssvm::kokkos::detail {
     // Note: The execution_space::automatic value may NEVER be added here!
     // Note: the trailing comma is explicitly allowed by the standard
     // Note: the order is intentionally chosen this way -> the order of the entries determines the priority when using a backend to run our code
-    return std::array{
-#if defined(KOKKOS_ENABLE_CUDA)
+    return std::array<execution_space, PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES>{
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_CUDA)
         execution_space::cuda,
 #endif
-#if defined(KOKKOS_ENABLE_HIP)
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HIP)
         execution_space::hip,
 #endif
-#if defined(KOKKOS_ENABLE_SYCL)
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_SYCL)
         execution_space::sycl,
 #endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_OPENMPTARGET)
         execution_space::openmp_target,
 #endif
-#if defined(KOKKOS_ENABLE_OPENACC)
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_OPENACC)
         execution_space::openacc,
 #endif
-#if defined(KOKKOS_ENABLE_OPENMP)
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_OPENMP)
         execution_space::openmp,
 #endif
-#if defined(KOKKOS_ENABLE_THREADS)
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_THREADS)
         execution_space::threads,
 #endif
-#if defined(KOKKOS_ENABLE_HPX)
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HPX)
         execution_space::hpx,
 #endif
-#if defined(KOKKOS_ENABLE_SERIAL)
+#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_SERIAL)
         execution_space::serial,
 #endif
     };
diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt
index 960df04b1..bf37122f2 100644
--- a/src/plssvm/backends/Kokkos/CMakeLists.txt
+++ b/src/plssvm/backends/Kokkos/CMakeLists.txt
@@ -122,36 +122,52 @@ target_link_libraries(${PLSSVM_ALL_LIBRARY_NAME} INTERFACE ${PLSSVM_KOKKOS_BACKE
 append_local_and_parent(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME})
 
 # assemble Kokkos available execution space string
+# also set compile definitions -> can't use KOKKOS_ENABLE_* directly inside the "constexpr_available_execution_space.hpp"
+# header since we can't include "Kokkos_Core.hpp" there (transitively used in the base library that doesn't know anything about Kokkos
 set(PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "")
 if (Kokkos_ENABLE_CUDA)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_CUDA)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Cuda")
 endif ()
 if (Kokkos_ENABLE_HIP)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_HIP)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "HIP")
 endif ()
 if (Kokkos_ENABLE_SYCL)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_SYCL)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "SYCL")
 endif ()
 if (Kokkos_ENABLE_HPX)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_HPX)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "HPX")
 endif ()
 if (Kokkos_ENABLE_OPENMP)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_OPENMP)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenMP")
 endif ()
 if (Kokkos_ENABLE_OPENMPTARGET)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_OPENMPTARGET)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenMPTarget")
 endif ()
 if (Kokkos_ENABLE_OPENACC)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_OPENACC)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenACC")
 endif ()
 if (Kokkos_ENABLE_THREADS)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_THREADS)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Threads")
 endif ()
 if (Kokkos_ENABLE_SERIAL)
+    target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_SERIAL)
     list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Serial")
 endif ()
 set(PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}" PARENT_SCOPE)
 
+# also set the number of available Kokkos execution spaces to explicitly set the type of the used std::array
+# -> necessary if NO Kokkos execution space is available and, therefore, the size of the std::array would be 0 (can't automatically be deduced)
+list(LENGTH PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES)
+target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES=${PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES})
+
 # generate summary string
 set(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_COMPILER " - Kokkos (${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}):")
 include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake)
diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp
index e6401367c..0caae212f 100644
--- a/src/plssvm/backends/Kokkos/execution_space.cpp
+++ b/src/plssvm/backends/Kokkos/execution_space.cpp
@@ -8,10 +8,8 @@
 
 #include "plssvm/backends/Kokkos/execution_space.hpp"
 
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-    #include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp"  // plssvm::kokkos::detail::constexpr_available_execution_spaces
-#endif
-#include "plssvm/detail/string_utility.hpp"  // plssvm::detail::to_lower_case
+#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp"  // plssvm::kokkos::detail::constexpr_available_execution_spaces
+#include "plssvm/detail/string_utility.hpp"                                        // plssvm::detail::to_lower_case
 
 #include <array>    // std::array
 #include <ios>      // std::ios::failbit
@@ -82,11 +80,9 @@ std::istream &operator>>(std::istream &in, execution_space &space) {
 std::vector<execution_space> list_available_execution_spaces() {
     // always add the automatic execution space
     std::vector<execution_space> spaces{ execution_space::automatic };
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     // add all other available execution spaces
     constexpr auto arr = detail::constexpr_available_execution_spaces();
     spaces.insert(spaces.cend(), arr.begin(), arr.end());
-#endif
     return spaces;
 }
 

From 7f7520f862a3879541df8c471a99b527e38242a8 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 9 Nov 2024 00:29:05 +0100
Subject: [PATCH 087/123] Fix compilation error involving too many template
 instantiations.

---
 tests/backends/Kokkos/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index 34ce3881f..79cb3331e 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -27,6 +27,11 @@ find_package(Kokkos REQUIRED)
 # add test executable
 add_executable(${PLSSVM_KOKKOS_TEST_NAME} ${CMAKE_CURRENT_LIST_DIR}/../../main.cpp ${PLSSVM_KOKKOS_TEST_SOURCES})
 
+if (Kokkos_ENABLE_CUDA)
+    # fix template limit when using Kokkos::Cuda
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xcudafe --pending_instantiations=0")
+endif ()
+
 # link against test library
 target_link_libraries(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE ${PLSSVM_BASE_TEST_LIBRARY_NAME})
 

From 6904c88fee11a5d791d8c5f68ccfc051bb3f3b4c Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 9 Nov 2024 01:05:46 +0100
Subject: [PATCH 088/123] Use target_compile_options instead of directly
 changing CMAKE_CXX_FLAGS.

---
 tests/backends/Kokkos/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index 79cb3331e..c6abf4dbe 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -29,7 +29,7 @@ add_executable(${PLSSVM_KOKKOS_TEST_NAME} ${CMAKE_CURRENT_LIST_DIR}/../../main.c
 
 if (Kokkos_ENABLE_CUDA)
     # fix template limit when using Kokkos::Cuda
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xcudafe --pending_instantiations=0")
+    target_compile_options(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE -Xcudafe --pending_instantiations=0)
 endif ()
 
 # link against test library

From 0566ba0af13f33f4876ed161e32aad0400dbd78b Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sat, 9 Nov 2024 01:18:54 +0100
Subject: [PATCH 089/123] Additionally, output all additional_arguments (if
 any).

---
 tests/naming.hpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/naming.hpp b/tests/naming.hpp
index 811266fe1..dd58c097f 100644
--- a/tests/naming.hpp
+++ b/tests/naming.hpp
@@ -153,7 +153,15 @@ template <typename T>
     } else if constexpr (std::is_base_of_v<plssvm::exception, T>) {
         return std::string{ util::exception_type_name<T>() };
     } else if constexpr (has_csvm_type_member_typedef_v<T>) {
-        return fmt::format("{}", plssvm::csvm_to_backend_type_v<typename T::csvm_type>);
+        // clang-format off
+        return fmt::format("{}{}", plssvm::csvm_to_backend_type_v<typename T::csvm_type>, std::apply([](const auto &...args) {
+                               if constexpr (sizeof...(args) == 0) {
+                                   return std::string{};
+                               } else {
+                                   return (fmt::format("_{}", args.second) + ...);
+                               }
+                           }, T::additional_arguments));
+        // clang-format on
     } else if constexpr (has_device_ptr_type_member_typedef_v<T>) {
         using device_ptr_type = typename T::device_ptr_type;
         return fmt::format("{}", plssvm::detail::arithmetic_type_name<typename device_ptr_type::value_type>());

From 1d6b814fa3fa0c06e44489aacdda54351a630e12 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sun, 10 Nov 2024 20:12:09 +0100
Subject: [PATCH 090/123] Fix wrong size of test array.

---
 tests/types_to_test.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/types_to_test.hpp b/tests/types_to_test.hpp
index 44db342b3..b5eb2c1db 100644
--- a/tests/types_to_test.hpp
+++ b/tests/types_to_test.hpp
@@ -474,7 +474,7 @@ constexpr std::array<plssvm::classification_type, 2> classification_types_to_tes
     plssvm::classification_type::oaa, plssvm::classification_type::oao
 };
 /// A list of all available solver types.
-constexpr std::array<plssvm::solver_type, 4> solver_types_to_test = {
+constexpr std::array<plssvm::solver_type, 3> solver_types_to_test{
     plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_implicit
 };
 

From c4dc4ec569e0f8b3760564589426a6104e5daad1 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sun, 10 Nov 2024 20:33:35 +0100
Subject: [PATCH 091/123] Test all available Kokkos execution spaces.

---
 tests/backends/Kokkos/kokkos_csvm.cpp | 149 +++++++++++++++++++-------
 1 file changed, 112 insertions(+), 37 deletions(-)

diff --git a/tests/backends/Kokkos/kokkos_csvm.cpp b/tests/backends/Kokkos/kokkos_csvm.cpp
index 5fe50d46e..51a4fb9c6 100644
--- a/tests/backends/Kokkos/kokkos_csvm.cpp
+++ b/tests/backends/Kokkos/kokkos_csvm.cpp
@@ -27,9 +27,11 @@
 
 #include "gtest/gtest.h"  // TEST_F, EXPECT_NO_THROW, INSTANTIATE_TYPED_TEST_SUITE_P, ::testing::Test
 
-#include <map>     // std::map
-#include <tuple>   // std::make_tuple, std::tuple
-#include <vector>  // std::vector
+#include <array>    // std::array
+#include <cstddef>  // std::size_t
+#include <map>      // std::map
+#include <tuple>    // std::make_tuple, std::tuple
+#include <vector>   // std::vector
 
 class KokkosCSVM : public ::testing::Test,
                    private util::redirect_output<> { };
@@ -626,54 +628,127 @@ TEST_F(KokkosCSVM, get_execution_space) {
     EXPECT_NE(svm.get_execution_space(), plssvm::kokkos::execution_space::automatic);
 }
 
- template <bool mock_grid_size>
- struct kokkos_csvm_test_type {
-     using mock_csvm_type = mock_kokkos_csvm<mock_grid_size>;
-     using csvm_type = plssvm::kokkos::csvm;
-     using device_ptr_type = typename csvm_type::device_ptr_type;
-     inline constexpr static auto additional_arguments = std::make_tuple();
- };
+template <bool mock_grid_size, plssvm::kokkos::execution_space space>
+struct kokkos_csvm_test_type {
+    using mock_csvm_type = mock_kokkos_csvm<mock_grid_size>;
+    using csvm_type = plssvm::kokkos::csvm;
+    using device_ptr_type = typename csvm_type::device_ptr_type;
+    inline static auto additional_arguments = std::make_tuple(std::make_pair(plssvm::kokkos_execution_space, space));
+};
 
- using kokkos_csvm_test_tuple = std::tuple<kokkos_csvm_test_type<false>>;
- using kokkos_csvm_test_label_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple, plssvm::detail::supported_label_types>;
- using kokkos_csvm_test_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple>;
+namespace impl {
+
+/**
+ * @brief Determine which execution spaces can be tested based on the available Kokkos::ExecutionSpaces and PLSSVM target platforms.
+ * @return the available execution spaces for testing (`[[nodiscard]]`)
+ */
+[[nodiscard]] constexpr auto constexpr_available_execution_spaces_to_test() {
+    return std::array{
+#if defined(KOKKOS_ENABLE_CUDA) && defined(PLSSVM_HAS_NVIDIA_TARGET)  // for Kokkos::Cuda, an NVIDIA target must be available
+        plssvm::kokkos::execution_space::cuda,
+#endif
+#if defined(KOKKOS_ENABLE_HIP) && (defined(PLSSVM_HAS_NVIDIA_TARGET) || defined(PLSSVM_HAS_AMD_TARGET))  // for Kokkos::HIP, an NVIDIA or AMD target must be available
+        plssvm::kokkos::execution_space::hip,
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)  // for Kokkos::SYCL, any target is ok
+        plssvm::kokkos::execution_space::sycl,
+#endif
+#if defined(KOKKOS_ENABLE_HPX) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Experimental::HPX, a CPU target must be available
+        plssvm::kokkos::execution_space::hpx,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::OpenMP, a CPU target must be available
+        plssvm::kokkos::execution_space::openmp,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)  // for Kokkos::Experimental::OpenMPTarget,any target is ok // TODO: implement correctly based on allowed target platforms
+        plssvm::kokkos::execution_space::openmp_target,
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)  // for Kokkos::Experimental::OpenACC,any target is ok // TODO: implement correctly based on allowed target platforms
+        plssvm::kokkos::execution_space::openacc,
+#endif
+#if defined(KOKKOS_ENABLE_THREADS) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Threads, a CPU target must be available
+        plssvm::kokkos::execution_space::threads,
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Serial, a CPU target must be available
+        plssvm::kokkos::execution_space::serial,
+#endif
+    };
+}
+
+/**
+ * @brief Uninstantiated base type to create a `std::tuple` containing all available `kokkos_csvm_test_type` types.
+ */
+template <bool, typename>
+struct create_device_tuple_type_helper;
+
+/**
+ * @brief Helper struct to create a `std::tuple` containing all available `kokkos_csvm_test_type` types by iterating over the `std::array` of
+ *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
+ * @tparam mock_grid_size whether the maximum grid size should be mocked (i.e. in fact reduced) or not
+ * @tparam Is the indices to index the `std::array`
+ */
+template <bool mock_grid_size, std::size_t... Is>
+struct create_device_tuple_type_helper<mock_grid_size, std::index_sequence<Is...>> {
+    /// The array containing all available execution spaces.
+    constexpr static auto array = constexpr_available_execution_spaces_to_test();
+    /// The resulting variant type.
+    using type = std::tuple<kokkos_csvm_test_type<false, array[Is]>...>;
+};
+
+/**
+ * @brief Create a `std::tuple` containing all available `kokkos_csvm_test_type` types by iterating over the `std::array` of
+ *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
+ * @tparam mock_grid_size whether the maximum grid size should be mocked (i.e. in fact reduced) or not
+ */
+template <bool mock_grid_size>
+struct create_device_tuple_type {
+    /// The number of types in the final variant.
+    constexpr static std::size_t N = constexpr_available_execution_spaces_to_test().size();
+    /// The final variant type.
+    using type = typename create_device_tuple_type_helper<mock_grid_size, std::make_index_sequence<N>>::type;
+};
+
+}  // namespace impl
+
+using kokkos_csvm_test_tuple = typename impl::create_device_tuple_type<false>::type;
+using kokkos_csvm_test_label_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple, plssvm::detail::supported_label_types>;
+using kokkos_csvm_test_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple>;
 
 // the tests used in the instantiated GTest test suites
- using kokkos_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list>;
- using kokkos_solver_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_type_list>;
- using kokkos_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::kernel_function_type_list>;
- using kokkos_solver_and_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_and_kernel_function_type_list>;
- using kokkos_label_type_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::kernel_function_and_classification_type_list>;
- using kokkos_label_type_solver_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::solver_and_kernel_function_and_classification_type_list>;
+using kokkos_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list>;
+using kokkos_solver_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_type_list>;
+using kokkos_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::kernel_function_type_list>;
+using kokkos_solver_and_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_type_list, util::solver_and_kernel_function_type_list>;
+using kokkos_label_type_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::kernel_function_and_classification_type_list>;
+using kokkos_label_type_solver_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t<kokkos_csvm_test_label_type_list, util::solver_and_kernel_function_and_classification_type_list>;
 
 // instantiate type-parameterized tests
 // generic CSVM tests
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolver, kokkos_solver_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunction, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunctionClassification, kokkos_label_type_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolver, kokkos_solver_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunction, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunctionClassification, kokkos_label_type_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
 // INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunctionClassification, kokkos_label_type_solver_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
 
 // generic CSVM DeathTests
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverDeathTest, kokkos_solver_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMKernelFunctionDeathTest, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverKernelFunctionDeathTest, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverDeathTest, kokkos_solver_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMKernelFunctionDeathTest, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverKernelFunctionDeathTest, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
 
 // generic GPU CSVM tests - correct grid sizes
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
 
 // generic GPU CSVM DeathTests - correct grid sizes
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericGPUCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericGPUCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
 
- using kokkos_mock_csvm_test_tuple = std::tuple<kokkos_csvm_test_type<true>>;
- using kokkos_mock_csvm_test_type_list = util::cartesian_type_product_t<kokkos_mock_csvm_test_tuple>;
+using kokkos_mock_csvm_test_tuple = typename impl::create_device_tuple_type<true>::type;
+using kokkos_mock_csvm_test_type_list = util::cartesian_type_product_t<kokkos_mock_csvm_test_tuple>;
 
- using kokkos_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list>;
- using kokkos_mock_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list, util::kernel_function_type_list>;
+using kokkos_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list>;
+using kokkos_mock_kernel_function_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list, util::kernel_function_type_list>;
 
 // generic GPU CSVM tests - mocked grid sizes
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVM, kokkos_mock_csvm_test_type_gtest, naming::test_parameter_to_name);
- INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVMKernelFunction, kokkos_mock_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVM, kokkos_mock_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVMKernelFunction, kokkos_mock_kernel_function_type_gtest, naming::test_parameter_to_name);

From 70dec59104f513a90854c2519078ad7d265d71b8 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sun, 10 Nov 2024 20:45:07 +0100
Subject: [PATCH 092/123] Reformulate some template meta-programming parts from
 a recursive implementation to an iterative implementation improving
 compilation times.

---
 tests/types_to_test.hpp | 105 +++++++++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 40 deletions(-)

diff --git a/tests/types_to_test.hpp b/tests/types_to_test.hpp
index b5eb2c1db..4147d9b70 100644
--- a/tests/types_to_test.hpp
+++ b/tests/types_to_test.hpp
@@ -115,6 +115,33 @@ constexpr auto test_parameter_value_at_v = std::get<I>(ValueList::values);
 
 namespace detail {
 
+/**
+ * @brief Copy all types in the @p Tuple using the indices @p Is.
+ * @tparam Tuple the tuple types to clone
+ * @tparam Is the index sequence used to clone the tuple types
+ * @param[in] tuple the tuple to clone
+ */
+template <typename Tuple, std::size_t... Is>
+[[nodiscard]] constexpr auto expand_tuple(std::index_sequence<Is...>, const Tuple &tuple) {
+    return std::tuple_cat((static_cast<void>(Is), tuple)...);
+}
+
+/**
+ * @brief Copy all types in the @p Tuple @p N times.
+ * @tparam N the time how often the types should be cloned
+ * @tparam Tuple the tuple types to clone
+ */
+template <std::size_t N, typename Tuple>
+struct clone_tuple_types {
+    using type = decltype(expand_tuple(std::make_index_sequence<N>(), std::declval<Tuple>()));
+};
+
+/**
+ * @brief Shorthand for the `typename clone_tuple_types<N, Tuple>::type` type.
+ */
+template <std::size_t N, typename Tuple>
+using clone_tuple_types_t = typename clone_tuple_types<N, Tuple>::type;
+
 // convert the types in a tuple to GoogleTests ::testing::Type
 template <typename Ts>
 struct tuple_to_gtest_types;
@@ -196,41 +223,38 @@ struct wrap_in_value_list<Array, std::index_sequence<I...>> {
 template <const auto &Array>
 using wrap_in_value_list_t = typename wrap_in_value_list<Array>::type;
 
-template <typename T, std::size_t, std::size_t, const auto &Array, typename Tuple>
-struct combine_values;
+template <typename, const auto &, typename>
+struct combine_values_impl { };
 
 /**
- * @brief Recursion termination: add the last value in the @p Array to the `value_list`s in the std::tuple.
- * @tparam T the type in the array
- * @tparam SIZE the size of the array
- * @tparam Array the array
- * @tparam Types the already existing `value_list`s
+ * @brief Iteratively add the values in @p Array at position @p IS to the `value_list`s in the std::tuple @p Tuple.
+ * @tparam Tuple the tuple types
+ * @tparam Array the array values to add
+ * @tparam Is the indices in @p Array
  */
-template <typename T, std::size_t SIZE, const std::array<T, SIZE> &Array, typename... Types>
-struct combine_values<T, SIZE, 0, Array, std::tuple<Types...>> {
-    using type = std::tuple<add_to_value_list_t<Types, std::get<0>(Array)>...>;
+template <typename Tuple, const auto &Array, std::size_t... Is>
+struct combine_values_impl<Tuple, Array, std::index_sequence<Is...>> {
+    constexpr static std::size_t N = Array.size();
+    using type = std::tuple<add_to_value_list_t<std::tuple_element_t<Is, Tuple>, std::get<Is % N>(Array)>...>;
 };
 
 /**
- * @brief Recursively add the value @p I of the @p Array to the `value_list`s in the std::tuple.
- * @tparam T the type in the array
- * @tparam SIZE the size of the array
- * @tparam I the currently investigated array element
- * @tparam Array the array
- * @tparam Types the already existing `value_list`s
+ * @brief Add the values in @p Array to the `value_list`s in the std::tuple @p Tuple.
+ * @tparam Tuple the tuple types
+ * @tparam Array the array values to add
  */
-template <typename T, std::size_t SIZE, std::size_t I, const std::array<T, SIZE> &Array, typename... Types>
-struct combine_values<T, SIZE, I, Array, std::tuple<Types...>> {
-    using type = concat_tuple_types_t<
-        std::tuple<add_to_value_list_t<Types, std::get<I>(Array)>...>,
-        typename combine_values<T, SIZE, I - 1, Array, std::tuple<Types...>>::type>;
+template <typename Tuple, const auto &Array>
+struct combine_values {
+    // clone the types in the Tuple N-times where N is the number of values in the Array
+    using cloned_tuple = clone_tuple_types_t<Array.size(), Tuple>;
+    using type = typename combine_values_impl<cloned_tuple, Array, std::make_index_sequence<std::tuple_size_v<cloned_tuple>>>::type;
 };
 
 /**
  * @brief Shorthand for `typename combine_values<...>::type`.
  */
 template <const auto &Array, typename Tuple>
-using combine_values_t = typename combine_values<typename plssvm::detail::remove_cvref_t<decltype(Array)>::value_type, Array.size(), Array.size() - 1, Array, Tuple>::type;
+using combine_values_t = typename combine_values<Tuple, Array>::type;
 
 /**
  * @brief Calculate the cartesian product of the values in @p FirstArray and @p RemainingArrays recursively.
@@ -292,37 +316,38 @@ struct wrap_in_type_list<std::tuple<Types...>> {
 template <typename Tuple>
 using wrap_in_type_list_t = typename wrap_in_type_list<Tuple>::type;
 
-template <std::size_t, typename Tuple, typename ResultTuple>
-struct combine_types;
+template <typename Tuple, typename CurrentTuple, typename>
+struct combine_types_impl { };
 
 /**
- * @brief Recursion termination: add the last type in the @p Tuple to the `type_list`s in the std::tuple.
- * @tparam Tuple the std::tuple containing the types to add
- * @tparam ResultTupleTypes the already existing `type_list`s
+ * @brief Iteratively add the types in @p CurrentTuple at position @p IS to the `type_list`s in the std::tuple @p Tuple.
+ * @tparam Tuple the tuple types
+ * @tparam CurrentTuple the types in the current tuple
+ * @tparam Is the indices in @p Array
  */
-template <typename Tuple, typename... ResultTupleTypes>
-struct combine_types<0, Tuple, std::tuple<ResultTupleTypes...>> {
-    using type = std::tuple<add_to_type_list_t<ResultTupleTypes, std::tuple_element_t<0, Tuple>>...>;
+template <typename Tuple, typename CurrentTuple, std::size_t... Is>
+struct combine_types_impl<Tuple, CurrentTuple, std::index_sequence<Is...>> {
+    constexpr static std::size_t N = std::tuple_size_v<CurrentTuple>;
+    using type = std::tuple<add_to_type_list_t<std::tuple_element_t<Is, Tuple>, std::tuple_element_t<Is % N, CurrentTuple>>...>;
 };
 
 /**
- * @brief Recursively add the type @p I of the @p Tuple to the `type_list`s in the std::tuple.
- * @tparam I the currently investigated tuple element
- * @tparam Tuple the tuple
- * @tparam ResultTupleTypes the already existing `type_list`s
+ * @brief Add the types in @p CurrentTuple to the `type_list`s in the std::tuple @p Tuple.
+ * @tparam Tuple the tuple types
+ * @tparam CurrentTuple the types in the current tuple
  */
-template <std::size_t I, typename Tuple, typename... ResultTupleTypes>
-struct combine_types<I, Tuple, std::tuple<ResultTupleTypes...>> {
-    using type = concat_tuple_types_t<
-        std::tuple<add_to_type_list_t<ResultTupleTypes, std::tuple_element_t<I, Tuple>>...>,
-        typename combine_types<I - 1, Tuple, std::tuple<ResultTupleTypes...>>::type>;
+template <typename Tuple, typename CurrentTuple>
+struct combine_types {
+    // clone the types in the Tuple N-times where N is the number of types in the CurrentTuple
+    using cloned_tuple = clone_tuple_types_t<std::tuple_size_v<CurrentTuple>, Tuple>;
+    using type = typename combine_types_impl<cloned_tuple, CurrentTuple, std::make_index_sequence<std::tuple_size_v<cloned_tuple>>>::type;
 };
 
 /**
  * @brief Shorthand for `typename combine_types<...>::type`.
  */
 template <typename Tuple, typename ResultTuple>
-using combine_types_t = typename combine_types<std::tuple_size_v<Tuple> - 1, Tuple, ResultTuple>::type;
+using combine_types_t = typename combine_types<ResultTuple, Tuple>::type;
 
 /**
  * @brief Calculate the cartesian product of the types in @p FirstTuple and @p RemainingTuples recursively.

From d57e619771b9e30aa7eafb1942d831a3cbaaf81a Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Sun, 10 Nov 2024 23:42:27 +0100
Subject: [PATCH 093/123] Disable some tests if the Kokkos::Cuda execution
 space is enabled due to template instantiation limits with nvcc.

---
 tests/backends/Kokkos/CMakeLists.txt  | 5 +++++
 tests/backends/Kokkos/kokkos_csvm.cpp | 5 ++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index c6abf4dbe..4a7d23501 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -30,6 +30,11 @@ add_executable(${PLSSVM_KOKKOS_TEST_NAME} ${CMAKE_CURRENT_LIST_DIR}/../../main.c
 if (Kokkos_ENABLE_CUDA)
     # fix template limit when using Kokkos::Cuda
     target_compile_options(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE -Xcudafe --pending_instantiations=0)
+    
+    # tests won't compile with nvcc
+    if (NOT PLSSVM_TEST_WITH_REDUCED_LABEL_TYPES)
+        message(FATAL_ERROR "Due to template instantiation limits within nvcc, only reduced label type tests are currently supported!")
+    endif ()
 endif ()
 
 # link against test library
diff --git a/tests/backends/Kokkos/kokkos_csvm.cpp b/tests/backends/Kokkos/kokkos_csvm.cpp
index 51a4fb9c6..b6e892a6c 100644
--- a/tests/backends/Kokkos/kokkos_csvm.cpp
+++ b/tests/backends/Kokkos/kokkos_csvm.cpp
@@ -728,7 +728,10 @@ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunction, kokkos_ker
 INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolver, kokkos_solver_type_gtest, naming::test_parameter_to_name);
 INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunction, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
 INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunctionClassification, kokkos_label_type_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
-// INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunctionClassification, kokkos_label_type_solver_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+#if !defined(KOKKOS_ENABLE_CUDA)
+// testcase doesn't compile with Kokkos::Cuda's nvcc due to template instantiation limits
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunctionClassification, kokkos_label_type_solver_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+#endif
 
 // generic CSVM DeathTests
 INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);

From 6f210ff6f922f58cb219c526677a67e0972fc52d Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 10:59:54 +0100
Subject: [PATCH 094/123] Also test all available execution spaces for the
 device_ptr class in Kokkos.

---
 tests/backends/Kokkos/detail/device_ptr.cpp | 23 +++--
 tests/backends/Kokkos/kokkos_csvm.cpp       | 90 +++----------------
 tests/backends/Kokkos/utility.hpp           | 96 +++++++++++++++++++++
 tests/naming.hpp                            | 23 ++++-
 4 files changed, 145 insertions(+), 87 deletions(-)
 create mode 100644 tests/backends/Kokkos/utility.hpp

diff --git a/tests/backends/Kokkos/detail/device_ptr.cpp b/tests/backends/Kokkos/detail/device_ptr.cpp
index c96a1ed87..ec525dad5 100644
--- a/tests/backends/Kokkos/detail/device_ptr.cpp
+++ b/tests/backends/Kokkos/detail/device_ptr.cpp
@@ -10,30 +10,39 @@
 
 #include "plssvm/backends/Kokkos/detail/device_ptr.hpp"  // plssvm::kokkos::detail::device_ptr
 
-#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"  // plssvm::kokkos::detail::device_wrapper
-
-#include "Kokkos_Core.hpp"  // Kokkos::DefaultExecutionSpace
+#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp"        // plssvm::kokkos::detail::device_wrapper
+#include "plssvm/backends/Kokkos/execution_space.hpp"              // plssvm::kokkos::execution_space
+#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp"  // plssvm::kokkos::execution_space_to_kokkos_type_t
 
 #include "tests/backends/generic_device_ptr_tests.hpp"  // generic device pointer tests to instantiate
+#include "tests/backends/Kokkos/utility.hpp"            // util::create_kokkos_test_tuple_impl
 #include "tests/naming.hpp"                             // naming::test_parameter_to_name
-#include "tests/types_to_test.hpp"                      // util::{combine_test_parameters_gtest_t, cartesian_type_product_t, layout_type_list}
+#include "tests/types_to_test.hpp"                      // util::{combine_test_parameters_gtest_t, cartesian_type_product_t, layout_type_list},
+                                                        // util::detail::concat_tuple_types_t
 
 #include "gtest/gtest.h"  // INSTANTIATE_TYPED_TEST_SUITE_P
 
 #include <tuple>  // std::tuple
 
-template <typename T>
+template <typename T, plssvm::kokkos::execution_space exec_space>
 struct kokkos_device_ptr_test_type {
     using device_ptr_type = plssvm::kokkos::detail::device_ptr<T>;
     using queue_type = plssvm::kokkos::detail::device_wrapper;
+    constexpr static plssvm::kokkos::execution_space space = exec_space;
 
     static const queue_type &default_queue() {
-        static const queue_type queue{ Kokkos::DefaultExecutionSpace{} };
+        static const queue_type queue{ plssvm::kokkos::execution_space_to_kokkos_type_t<space>{} };
         return queue;
     }
 };
 
-using kokkos_device_ptr_tuple = std::tuple<kokkos_device_ptr_test_type<float>, kokkos_device_ptr_test_type<double>>;
+template <plssvm::kokkos::execution_space space>
+using kokkos_device_ptr_test_type_float = kokkos_device_ptr_test_type<float, space>;
+template <plssvm::kokkos::execution_space space>
+using kokkos_device_ptr_test_type_double = kokkos_device_ptr_test_type<double, space>;
+
+using kokkos_device_ptr_tuple = util::detail::concat_tuple_types_t<util::create_kokkos_test_tuple_t<kokkos_device_ptr_test_type_float>,
+                                                                   util::create_kokkos_test_tuple_t<kokkos_device_ptr_test_type_double>>;
 
 // the tests used in the instantiated GTest test suites
 using kokkos_device_ptr_type_gtest = util::combine_test_parameters_gtest_t<util::cartesian_type_product_t<kokkos_device_ptr_tuple>>;
diff --git a/tests/backends/Kokkos/kokkos_csvm.cpp b/tests/backends/Kokkos/kokkos_csvm.cpp
index b6e892a6c..c1ae0cdb7 100644
--- a/tests/backends/Kokkos/kokkos_csvm.cpp
+++ b/tests/backends/Kokkos/kokkos_csvm.cpp
@@ -20,10 +20,11 @@
 #include "tests/backends/generic_csvm_tests.hpp"      // generic CSVM tests to instantiate
 #include "tests/backends/generic_gpu_csvm_tests.hpp"  // generic GPU CSVM tests to instantiate
 #include "tests/backends/Kokkos/mock_kokkos_csvm.hpp"
-#include "tests/custom_test_macros.hpp"  // EXPECT_THROW_WHAT
-#include "tests/naming.hpp"              // naming::test_parameter_to_name
-#include "tests/types_to_test.hpp"       // util::{cartesian_type_product_t, combine_test_parameters_gtest_t}
-#include "tests/utility.hpp"             // util::redirect_output
+#include "tests/backends/Kokkos/utility.hpp"  // util::create_kokkos_test_tuple_impl
+#include "tests/custom_test_macros.hpp"       // EXPECT_THROW_WHAT
+#include "tests/naming.hpp"                   // naming::test_parameter_to_name
+#include "tests/types_to_test.hpp"            // util::{cartesian_type_product_t, combine_test_parameters_gtest_t}
+#include "tests/utility.hpp"                  // util::redirect_output
 
 #include "gtest/gtest.h"  // TEST_F, EXPECT_NO_THROW, INSTANTIATE_TYPED_TEST_SUITE_P, ::testing::Test
 
@@ -636,80 +637,10 @@ struct kokkos_csvm_test_type {
     inline static auto additional_arguments = std::make_tuple(std::make_pair(plssvm::kokkos_execution_space, space));
 };
 
-namespace impl {
+template <plssvm::kokkos::execution_space space>
+using kokkos_csvm_test_type_without_mock = kokkos_csvm_test_type<false, space>;
 
-/**
- * @brief Determine which execution spaces can be tested based on the available Kokkos::ExecutionSpaces and PLSSVM target platforms.
- * @return the available execution spaces for testing (`[[nodiscard]]`)
- */
-[[nodiscard]] constexpr auto constexpr_available_execution_spaces_to_test() {
-    return std::array{
-#if defined(KOKKOS_ENABLE_CUDA) && defined(PLSSVM_HAS_NVIDIA_TARGET)  // for Kokkos::Cuda, an NVIDIA target must be available
-        plssvm::kokkos::execution_space::cuda,
-#endif
-#if defined(KOKKOS_ENABLE_HIP) && (defined(PLSSVM_HAS_NVIDIA_TARGET) || defined(PLSSVM_HAS_AMD_TARGET))  // for Kokkos::HIP, an NVIDIA or AMD target must be available
-        plssvm::kokkos::execution_space::hip,
-#endif
-#if defined(KOKKOS_ENABLE_SYCL)  // for Kokkos::SYCL, any target is ok
-        plssvm::kokkos::execution_space::sycl,
-#endif
-#if defined(KOKKOS_ENABLE_HPX) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Experimental::HPX, a CPU target must be available
-        plssvm::kokkos::execution_space::hpx,
-#endif
-#if defined(KOKKOS_ENABLE_OPENMP) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::OpenMP, a CPU target must be available
-        plssvm::kokkos::execution_space::openmp,
-#endif
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)  // for Kokkos::Experimental::OpenMPTarget,any target is ok // TODO: implement correctly based on allowed target platforms
-        plssvm::kokkos::execution_space::openmp_target,
-#endif
-#if defined(KOKKOS_ENABLE_OPENACC)  // for Kokkos::Experimental::OpenACC,any target is ok // TODO: implement correctly based on allowed target platforms
-        plssvm::kokkos::execution_space::openacc,
-#endif
-#if defined(KOKKOS_ENABLE_THREADS) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Threads, a CPU target must be available
-        plssvm::kokkos::execution_space::threads,
-#endif
-#if defined(KOKKOS_ENABLE_SERIAL) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Serial, a CPU target must be available
-        plssvm::kokkos::execution_space::serial,
-#endif
-    };
-}
-
-/**
- * @brief Uninstantiated base type to create a `std::tuple` containing all available `kokkos_csvm_test_type` types.
- */
-template <bool, typename>
-struct create_device_tuple_type_helper;
-
-/**
- * @brief Helper struct to create a `std::tuple` containing all available `kokkos_csvm_test_type` types by iterating over the `std::array` of
- *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
- * @tparam mock_grid_size whether the maximum grid size should be mocked (i.e. in fact reduced) or not
- * @tparam Is the indices to index the `std::array`
- */
-template <bool mock_grid_size, std::size_t... Is>
-struct create_device_tuple_type_helper<mock_grid_size, std::index_sequence<Is...>> {
-    /// The array containing all available execution spaces.
-    constexpr static auto array = constexpr_available_execution_spaces_to_test();
-    /// The resulting variant type.
-    using type = std::tuple<kokkos_csvm_test_type<false, array[Is]>...>;
-};
-
-/**
- * @brief Create a `std::tuple` containing all available `kokkos_csvm_test_type` types by iterating over the `std::array` of
- *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
- * @tparam mock_grid_size whether the maximum grid size should be mocked (i.e. in fact reduced) or not
- */
-template <bool mock_grid_size>
-struct create_device_tuple_type {
-    /// The number of types in the final variant.
-    constexpr static std::size_t N = constexpr_available_execution_spaces_to_test().size();
-    /// The final variant type.
-    using type = typename create_device_tuple_type_helper<mock_grid_size, std::make_index_sequence<N>>::type;
-};
-
-}  // namespace impl
-
-using kokkos_csvm_test_tuple = typename impl::create_device_tuple_type<false>::type;
+using kokkos_csvm_test_tuple = util::create_kokkos_test_tuple_t<kokkos_csvm_test_type_without_mock>;
 using kokkos_csvm_test_label_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple, plssvm::detail::supported_label_types>;
 using kokkos_csvm_test_type_list = util::cartesian_type_product_t<kokkos_csvm_test_tuple>;
 
@@ -746,7 +677,10 @@ INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVMKernelFunction, kokkos_
 // generic GPU CSVM DeathTests - correct grid sizes
 INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericGPUCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
 
-using kokkos_mock_csvm_test_tuple = typename impl::create_device_tuple_type<true>::type;
+template <plssvm::kokkos::execution_space space>
+using kokkos_csvm_test_type_with_mock = kokkos_csvm_test_type<true, space>;
+
+using kokkos_mock_csvm_test_tuple = util::create_kokkos_test_tuple_t<kokkos_csvm_test_type_with_mock>;
 using kokkos_mock_csvm_test_type_list = util::cartesian_type_product_t<kokkos_mock_csvm_test_tuple>;
 
 using kokkos_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t<kokkos_mock_csvm_test_type_list>;
diff --git a/tests/backends/Kokkos/utility.hpp b/tests/backends/Kokkos/utility.hpp
new file mode 100644
index 000000000..872d4c624
--- /dev/null
+++ b/tests/backends/Kokkos/utility.hpp
@@ -0,0 +1,96 @@
+/**
+* @file
+* @author Alexander Van Craen
+* @author Marcel Breyer
+* @copyright 2018-today The PLSSVM project - All Rights Reserved
+* @license This file is part of the PLSSVM project which is released under the MIT license.
+*          See the LICENSE.md file in the project root for full license information.
+*
+* @brief Determine the execution spaces available for tests with the Kokkos backend.
+*/
+
+#ifndef PLSSVM_TESTS_BACKENDS_KOKKOS_UTILITY_HPP_
+#define PLSSVM_TESTS_BACKENDS_KOKKOS_UTILITY_HPP_
+#pragma once
+
+namespace util {
+
+/**
+ * @brief Determine which execution spaces can be tested based on the available Kokkos::ExecutionSpaces and PLSSVM target platforms.
+ * @return the available execution spaces for testing (`[[nodiscard]]`)
+ */
+[[nodiscard]] constexpr auto constexpr_available_execution_spaces_to_test() {
+    return std::array{
+#if defined(KOKKOS_ENABLE_CUDA) && defined(PLSSVM_HAS_NVIDIA_TARGET)  // for Kokkos::Cuda, an NVIDIA target must be available
+        plssvm::kokkos::execution_space::cuda,
+#endif
+#if defined(KOKKOS_ENABLE_HIP) && (defined(PLSSVM_HAS_NVIDIA_TARGET) || defined(PLSSVM_HAS_AMD_TARGET))  // for Kokkos::HIP, an NVIDIA or AMD target must be available
+        plssvm::kokkos::execution_space::hip,
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)  // for Kokkos::SYCL, any target is ok
+        plssvm::kokkos::execution_space::sycl,
+#endif
+#if defined(KOKKOS_ENABLE_HPX) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Experimental::HPX, a CPU target must be available
+        plssvm::kokkos::execution_space::hpx,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::OpenMP, a CPU target must be available
+        plssvm::kokkos::execution_space::openmp,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)  // for Kokkos::Experimental::OpenMPTarget,any target is ok // TODO: implement correctly based on allowed target platforms
+        plssvm::kokkos::execution_space::openmp_target,
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)  // for Kokkos::Experimental::OpenACC,any target is ok // TODO: implement correctly based on allowed target platforms
+        plssvm::kokkos::execution_space::openacc,
+#endif
+#if defined(KOKKOS_ENABLE_THREADS) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Threads, a CPU target must be available
+        plssvm::kokkos::execution_space::threads,
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Serial, a CPU target must be available
+        plssvm::kokkos::execution_space::serial,
+#endif
+    };
+}
+
+/**
+ * @brief Uninstantiated base type to create a `std::tuple` containing all available `kokkos_csvm_test_type` types.
+ */
+template <template <plssvm::kokkos::execution_space> typename, typename>
+struct create_kokkos_test_tuple_impl;
+
+/**
+ * @brief Helper struct to create a `std::tuple` containing all available `test_type` types by iterating over the `std::array` of
+ *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
+ * @tparam test_type the test type to instantiate
+ * @tparam Is the indices to index the `std::array`
+ */
+template <template <plssvm::kokkos::execution_space> typename test_type, std::size_t... Is>
+struct create_kokkos_test_tuple_impl<test_type, std::index_sequence<Is...>> {
+    /// The array containing all available execution spaces.
+    constexpr static auto array = constexpr_available_execution_spaces_to_test();
+    /// The resulting variant type.
+    using type = std::tuple<test_type<array[Is]>...>;
+};
+
+/**
+ * @brief Create a `std::tuple` containing all available `test_type` types by iterating over the `std::array` of
+ *        `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`.
+ * @tparam test_type the test type to instantiate
+ */
+template <template <plssvm::kokkos::execution_space> typename test_type>
+struct create_kokkos_test_tuple {
+    /// The number of types in the final variant.
+    constexpr static std::size_t N = constexpr_available_execution_spaces_to_test().size();
+    /// The final tuple type.
+    using type = typename create_kokkos_test_tuple_impl<test_type, std::make_index_sequence<N>>::type;
+};
+
+/**
+ * @brief Shorthand for the `typename create_kokkos_test_tuple<...>::type` type.
+ */
+template <template <plssvm::kokkos::execution_space> typename test_type>
+using create_kokkos_test_tuple_t = typename create_kokkos_test_tuple<test_type>::type;
+
+
+}
+
+#endif  // PLSSVM_TESTS_BACKENDS_KOKKOS_UTILITY_HPP_
diff --git a/tests/naming.hpp b/tests/naming.hpp
index dd58c097f..a7305fe4f 100644
--- a/tests/naming.hpp
+++ b/tests/naming.hpp
@@ -32,7 +32,7 @@
 #include <string>       // std::string
 #include <string_view>  // std::string_view
 #include <tuple>        // std::tuple, std::tuple_element_t, std::tuple_size_v, std::get
-#include <type_traits>  // std::true_type, std::false_type, std::is_same_v, std::is_arithmetic_v, std::is_base_of_v
+#include <type_traits>  // std::true_type, std::false_type, std::is_same_v, std::is_arithmetic_v, std::is_base_of_v, std::void_t
 
 namespace naming {
 
@@ -91,6 +91,21 @@ PLSSVM_CREATE_HAS_MEMBER_TYPEDEF_TYPE_TRAIT(pinned_memory_type)
 
 #undef PLSSVM_CREATE_HAS_MEMBER_TYPEDEF_TYPE_TRAIT
 
+/**
+ * @brief A macro to create type traits for testing whether a type has a static variable declaration called @p def.
+ */
+#define PLSSVM_CREATE_HAS_MEMBER_VARIABLE_TYPE_TRAIT(def)                                                   \
+    template <typename T, typename = void>                                                                  \
+    struct enable_if_##def##_member_variable_exists : std::false_type { };                                  \
+    template <typename T>                                                                                   \
+    struct enable_if_##def##_member_variable_exists<T, std::void_t<decltype(T::def)>> : std::true_type { }; \
+    template <typename T>                                                                                   \
+    constexpr bool has_##def##_member_variable_v = enable_if_##def##_member_variable_exists<T>::value;
+
+PLSSVM_CREATE_HAS_MEMBER_VARIABLE_TYPE_TRAIT(space)
+
+#undef PLSSVM_CREATE_HAS_MEMBER_VARIABLE_TYPE_TRAIT
+
 /**
  * @brief Escape some characters of the string such that GTest accepts it as test case name.
  * @details Replaces some special cases for better readability: "-" with "_M_" (for Minus), " " with "_W_" (for Whitespace), "." with "_D_" (for dot),
@@ -164,7 +179,11 @@ template <typename T>
         // clang-format on
     } else if constexpr (has_device_ptr_type_member_typedef_v<T>) {
         using device_ptr_type = typename T::device_ptr_type;
-        return fmt::format("{}", plssvm::detail::arithmetic_type_name<typename device_ptr_type::value_type>());
+        std::string test_name{ fmt::format("{}", plssvm::detail::arithmetic_type_name<typename device_ptr_type::value_type>()) };
+        if constexpr (has_space_member_variable_v<T>) {
+            test_name += fmt::format("_{}", T::space);
+        }
+        return test_name;
     } else if constexpr (has_pinned_memory_type_member_typedef_v<T>) {
         using pinned_memory_type = typename T::pinned_memory_type;
         return fmt::format("{}", plssvm::detail::arithmetic_type_name<typename pinned_memory_type::value_type>());

From 5bf3b009d87e239f37647ef0a2a71160a7c6a298 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 11:52:48 +0100
Subject: [PATCH 095/123] Add missing [[nodiscard]] and fix nvcc related
 implicit conversion compiler warning.

---
 .../Kokkos/kernel/kernel_functions.hpp        | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
index 0ca70c2de..35cbe8ed1 100644
--- a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
+++ b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
@@ -36,7 +36,7 @@ namespace plssvm::kokkos::detail {
  * @return the reduced value (`[[nodiscard]]`)
  */
 template <kernel_function_type kernel_function>
-KOKKOS_INLINE_FUNCTION real_type feature_reduce(const real_type val1, const real_type val2) {
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type feature_reduce(const real_type val1, const real_type val2) {
     return val1 * val2;
 }
 
@@ -47,7 +47,7 @@ KOKKOS_INLINE_FUNCTION real_type feature_reduce(const real_type val1, const real
  * @return the reduced value (`[[nodiscard]]`)
  */
 template <>
-KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::rbf>(const real_type val1, const real_type val2) {
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::rbf>(const real_type val1, const real_type val2) {
     const real_type d = val1 - val2;
     return d * d;
 }
@@ -59,10 +59,25 @@ KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::rbf>(const
  * @return the reduced value (`[[nodiscard]]`)
  */
 template <>
-KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::laplacian>(const real_type val1, const real_type val2) {
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::laplacian>(const real_type val1, const real_type val2) {
     return Kokkos::fabs(val1 - val2);
 }
 
+/**
+ * @brief Return the minimum possible floating point value for type @p T.
+ * @brief Function necessary such the the `if constexpr` depends on a template parameter and, therefore, no false-positive implicit conversion warnings are reported.
+ * @tparam T the type to retrieve the minimum value
+ * @return the minimum floating point value for type @p T (`[[nodiscard]]`)
+ */
+template <typename T>
+[[nodiscard]] constexpr KOKKOS_INLINE_FUNCTION T real_type_min() {
+    if constexpr (std::is_same_v<real_type, float>) {
+        return FLT_MIN;
+    } else {
+        return DBL_MIN;
+    }
+}
+
 /**
  * @brief Compute the feature reduction for the chi-squared kernel function.
  * @note Be sure that the denominator isn't 0.0 which may be the case for padding values.
@@ -71,13 +86,9 @@ KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::laplacian>
  * @return the reduced value (`[[nodiscard]]`)
  */
 template <>
-KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::chi_squared>(const real_type val1, const real_type val2) {
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::chi_squared>(const real_type val1, const real_type val2) {
     const real_type d = val1 - val2;
-    if constexpr (std::is_same_v<real_type, float>) {
-        return (real_type{ 1.0 } / (val1 + val2 + FLT_MIN)) * d * d;
-    } else {
-        return (real_type{ 1.0 } / (val1 + val2 + DBL_MIN)) * d * d;
-    }
+    return (real_type{ 1.0 } / (val1 + val2 + real_type_min<real_type>())) * d * d;
 }
 
 //***************************************************//
@@ -93,7 +104,7 @@ KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::chi_square
  * @return the result value (`[[nodiscard]]`)
  */
 template <kernel_function_type kernel_function, typename... Args>
-KOKKOS_INLINE_FUNCTION real_type apply_kernel_function(const real_type value, [[maybe_unused]] const detail::standard_layout_tuple<Args...> params) {
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type apply_kernel_function(const real_type value, [[maybe_unused]] const detail::standard_layout_tuple<Args...> params) {
     if constexpr (kernel_function == kernel_function_type::linear) {
         return value;
     } else if constexpr (kernel_function == kernel_function_type::polynomial) {

From e4a5cbb4d333f4cb79c662fa6b3a46d7473093b1 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 12:36:16 +0100
Subject: [PATCH 096/123] Lift league size restrictions for the CPU based
 Kokkos execution spaces.

---
 src/plssvm/backends/Kokkos/csvm.cpp | 30 ++---------------------------
 1 file changed, 2 insertions(+), 28 deletions(-)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 2bcefeeff..55e57e702 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -346,39 +346,13 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::s
                 return { native_range[2], native_range[1], native_range[0] };
             }));
         case execution_space::hpx:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX(([&]() -> ::plssvm::detail::dim_type {
-                // get the total number of threads
-                const std::size_t num_threads = Kokkos::Experimental::HPX::impl_max_hardware_threads();
-                // set the maximum league size to twice the number of available hardware threads
-                // NOTE: this is just an estimate and can or should be changed depending on the performance
-                const auto league_size = static_cast<unsigned long long>(std::ceil(std::sqrt(num_threads * 2)));
-                return { league_size, league_size, 1ull };
-            }));
         case execution_space::openmp:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP(([&]() -> ::plssvm::detail::dim_type {
-                // get the total number of threads
-                const std::size_t num_threads = Kokkos::OpenMP::impl_max_hardware_threads();
-                // set the maximum league size to twice the number of available hardware threads
-                // NOTE: this is just an estimate and can or should be changed depending on the performance
-                const auto league_size = static_cast<unsigned long long>(std::ceil(std::sqrt(num_threads * 2)));
-                return { league_size, league_size, 1ull };
-            }));
         case execution_space::threads:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS(([&]() -> ::plssvm::detail::dim_type {
-                // get the total number of threads
-                const std::size_t num_threads = Kokkos::Threads::impl_max_hardware_threads();
-                // set the maximum league size to twice the number of available hardware threads
-                // NOTE: this is just an estimate and can or should be changed depending on the performance
-                const auto league_size = static_cast<unsigned long long>(std::ceil(std::sqrt(num_threads * 2)));
-                return { league_size, league_size, 1ull };
-            }));
         case execution_space::serial:
-            PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL(([&]() -> ::plssvm::detail::dim_type {
-                return { std::numeric_limits<int>::max(), std::numeric_limits<int>::max(), 1ull };
-            }));
-        // TODO: implement for Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC
+            return { std::numeric_limits<int>::max(), std::numeric_limits<int>::max(), 1ull };
         case execution_space::openmp_target:
         case execution_space::openacc:
+            // TODO: implement for Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC
             throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) };
     }
     // all possible cases should be handled by the previous switch

From 02dcd747486b245814a500a9c121c135d5731d1e Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 13:20:36 +0100
Subject: [PATCH 097/123] Fix typo.

---
 src/plssvm/backends/Kokkos/csvm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 55e57e702..58c710985 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -174,7 +174,7 @@ void csvm::init(const target_platform target) {
 
     // throw exception if no devices in the current execution space could be found
     if (devices_.empty()) {
-        throw backend_exception{ fmt::format("Not devices found for the Kokkos execution space {} with the target platform {}!", space_, target_) };
+        throw backend_exception{ fmt::format("No devices found for the Kokkos execution space {} with the target platform {}!", space_, target_) };
     }
 
     // print found Kokkos devices

From e3bb207f3541c860207a61a7158b105f1099e314 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 13:41:53 +0100
Subject: [PATCH 098/123] Add assertion that the target_ must be set.

---
 src/plssvm/backends/Kokkos/csvm.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 58c710985..316573240 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -151,6 +151,7 @@ void csvm::init(const target_platform target) {
 
     // At this point, space_ may NEVER be execution_space::automatic!
     PLSSVM_ASSERT(space_ != execution_space::automatic, "At this point, the Kokkos execution space must be determined and must NOT be automatic!");
+    PLSSVM_ASSERT(target_ != target_platform::automatic, "At this point, the target platform must be determined and must NOT be automatic!");
 
     // Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC currently not supported!
     if (space_ == execution_space::openmp_target || space_ == execution_space::openacc) {

From e1ec7dbbe7dd4bd4634657dbcbf3ff1d4f620d9d Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 13:42:07 +0100
Subject: [PATCH 099/123] Fix error not setting target_ in one case.

---
 src/plssvm/backends/Kokkos/csvm.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 316573240..606b25108 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -142,7 +142,10 @@ void csvm::init(const target_platform target) {
                 }
             }
         } else {
-            if (!::plssvm::detail::contains(available_combinations, target) || !::plssvm::detail::contains(available_combinations.at(target), space_)) {
+            if (::plssvm::detail::contains(available_combinations, target) && ::plssvm::detail::contains(available_combinations.at(target), space_)) {
+                // update target
+                target_ = target;
+            } else {
                 // the provided execution space and target platform combination is unsupported
                 throw backend_exception{ fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform {}!", space_, target) };
             }

From d68e86d251e0103a7249db04dfa7ff4a1a8a211d Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 13:42:45 +0100
Subject: [PATCH 100/123] Fix formatting.

---
 tests/detail/cmd/parser_predict.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/detail/cmd/parser_predict.cpp b/tests/detail/cmd/parser_predict.cpp
index 72311f89b..79f99c97f 100644
--- a/tests/detail/cmd/parser_predict.cpp
+++ b/tests/detail/cmd/parser_predict.cpp
@@ -250,7 +250,7 @@ INSTANTIATE_TEST_SUITE_P(ParserPredict, ParserPredictSYCLImplementation, ::testi
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
 
 class ParserPredictKokkosExecutionSpace : public ParserPredict,
-                                        public ::testing::WithParamInterface<std::tuple<std::string, std::string>> { };
+                                          public ::testing::WithParamInterface<std::tuple<std::string, std::string>> { };
 
 TEST_P(ParserPredictKokkosExecutionSpace, parsing) {
     const auto &[flag, value] = GetParam();

From 5e563aba5a297588f0f10059d05241b9b512e43c Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 15:22:23 +0100
Subject: [PATCH 101/123] Fix a problem where the target_platform can't be
 determined, but no exception was thrown.

---
 src/plssvm/backends/Kokkos/csvm.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp
index 606b25108..603a5216c 100644
--- a/src/plssvm/backends/Kokkos/csvm.cpp
+++ b/src/plssvm/backends/Kokkos/csvm.cpp
@@ -105,15 +105,21 @@ void csvm::init(const target_platform target) {
     if (space_ == execution_space::automatic) {
         // automatically determine the execution space and potentially automatically determine the target platform
         if (target == target_platform::automatic) {
+            bool found_combination{ false };
             // go through all combinations and choose the first execution space in order: gpu_nvidia -> gpu_amd -> gpu_intel -> cpu
             for (const target_platform target_order : list_available_target_platforms()) {
                 if (::plssvm::detail::contains(available_combinations, target_order)) {
                     // the target platform is supported -> choose the first execution space to use in the Kokkos backend
                     space_ = available_combinations.at(target_order).front();
                     target_ = target_order;
+                    found_combination = true;
                     break;
                 }
             }
+            // check whether a valid combination could be found
+            if (!found_combination) {
+                throw backend_exception{ fmt::format("Couldn't find a valid Kokkos::ExecutionSpace ({}) and target_platform ({}) combination!", fmt::join(list_available_execution_spaces(), ", "), fmt::join(list_available_target_platforms(), ", ")) };
+            }
         } else {
             // check whether the provided target platform is compatible with the currently available Kokkos::ExecutionSpaces
             if (::plssvm::detail::contains(available_combinations, target)) {
@@ -133,14 +139,20 @@ void csvm::init(const target_platform target) {
     } else {
         // execution space explicitly provided and potentially automatically determine the target platform
         if (target == target_platform::automatic) {
+            bool found_combination{ false };
             // go through all combinations (gpu_nvidia -> gpu_amd -> gpu_intel -> cpu) and check whether the requested execution space supports that target platform
             for (const target_platform target_order : list_available_target_platforms()) {
                 if (::plssvm::detail::contains(available_combinations, target_order) && ::plssvm::detail::contains(available_combinations.at(target_order), space_)) {
                     // the provided execution space supports the target platform
                     target_ = target_order;
+                    found_combination = true;
                     break;
                 }
             }
+            // check whether a valid combination could be found
+            if (!found_combination) {
+                throw backend_exception{ fmt::format("Couldn't find a valid target_platform for the Kokkos::ExecutionSpace {}!", space_) };
+            }
         } else {
             if (::plssvm::detail::contains(available_combinations, target) && ::plssvm::detail::contains(available_combinations.at(target), space_)) {
                 // update target

From 3518e9e4405350e76928e0b53623893863863d64 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 16:00:46 +0100
Subject: [PATCH 102/123] Fix failing tests if a Kokkos execution space is
 available but no target platform for the specific execution space.

---
 tests/backends/Kokkos/kokkos_csvm.cpp | 104 +++++++++++++++++++++++---
 1 file changed, 92 insertions(+), 12 deletions(-)

diff --git a/tests/backends/Kokkos/kokkos_csvm.cpp b/tests/backends/Kokkos/kokkos_csvm.cpp
index c1ae0cdb7..c99f8c7d5 100644
--- a/tests/backends/Kokkos/kokkos_csvm.cpp
+++ b/tests/backends/Kokkos/kokkos_csvm.cpp
@@ -134,9 +134,19 @@ TEST_F(KokkosCSVM, construct_execution_space_and_parameter) {  // execution_spac
     // automatic should always work
     EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::automatic }));
 
+    const auto target_is_available = [](const plssvm::target_platform target) {
+        return plssvm::detail::contains(plssvm::list_available_target_platforms(), target);
+    };
+
 #if defined(KOKKOS_ENABLE_CUDA)
     // explicitly providing the Cuda execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }));
+    if (target_is_available(plssvm::target_platform::gpu_nvidia)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Cuda!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }),
                       plssvm::kokkos::backend_exception,
@@ -145,7 +155,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_parameter) {  // execution_spac
 
 #if defined(KOKKOS_ENABLE_HIP)
     // explicitly providing the HIP execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }));
+    if (target_is_available(plssvm::target_platform::gpu_nvidia) || target_is_available(plssvm::target_platform::gpu_amd)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace HIP!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }),
                       plssvm::kokkos::backend_exception,
@@ -163,7 +179,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_parameter) {  // execution_spac
 
 #if defined(KOKKOS_ENABLE_HPX)
     // explicitly providing the HPX execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }));
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace HPX!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }),
                       plssvm::kokkos::backend_exception,
@@ -172,7 +194,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_parameter) {  // execution_spac
 
 #if defined(KOKKOS_ENABLE_OPENMP)
     // explicitly providing the OpenMP execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }));
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace OpenMP!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }),
                       plssvm::kokkos::backend_exception,
@@ -203,7 +231,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_parameter) {  // execution_spac
 
 #if defined(KOKKOS_ENABLE_THREADS)
     // explicitly providing the Threads execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }));
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Threads!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }),
                       plssvm::kokkos::backend_exception,
@@ -212,7 +246,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_parameter) {  // execution_spac
 
 #if defined(KOKKOS_ENABLE_SERIAL)
     // explicitly providing the Serial execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }));
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Serial!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }),
                       plssvm::kokkos::backend_exception,
@@ -425,9 +465,19 @@ TEST_F(KokkosCSVM, construct_execution_space_and_named_args) {  // execution_spa
     // automatic should always work
     EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::automatic }));
 
+    const auto target_is_available = [](const plssvm::target_platform target) {
+        return plssvm::detail::contains(plssvm::list_available_target_platforms(), target);
+    };
+
 #if defined(KOKKOS_ENABLE_CUDA)
     // explicitly providing the Cuda execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }));
+    if (target_is_available(plssvm::target_platform::gpu_nvidia)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Cuda!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }),
                       plssvm::kokkos::backend_exception,
@@ -436,7 +486,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_named_args) {  // execution_spa
 
 #if defined(KOKKOS_ENABLE_HIP)
     // explicitly providing the HIP execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }));
+    if (target_is_available(plssvm::target_platform::gpu_nvidia) || target_is_available(plssvm::target_platform::gpu_amd)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace HIP!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }),
                       plssvm::kokkos::backend_exception,
@@ -454,7 +510,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_named_args) {  // execution_spa
 
 #if defined(KOKKOS_ENABLE_HPX)
     // explicitly providing the HPX execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }));
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace HPX!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }),
                       plssvm::kokkos::backend_exception,
@@ -463,7 +525,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_named_args) {  // execution_spa
 
 #if defined(KOKKOS_ENABLE_OPENMP)
     // explicitly providing the OpenMP execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }));
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace OpenMP!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }),
                       plssvm::kokkos::backend_exception,
@@ -494,7 +562,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_named_args) {  // execution_spa
 
 #if defined(KOKKOS_ENABLE_THREADS)
     // explicitly providing the Threads execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }));
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Threads!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }),
                       plssvm::kokkos::backend_exception,
@@ -503,7 +577,13 @@ TEST_F(KokkosCSVM, construct_execution_space_and_named_args) {  // execution_spa
 
 #if defined(KOKKOS_ENABLE_SERIAL)
     // explicitly providing the Serial execution space should work
-    EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }));
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Serial!");
+    }
 #else
     EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }),
                       plssvm::kokkos::backend_exception,

From 3c892d8f699667958ad01a6ee823d6bb0cb0b0a7 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 16:01:32 +0100
Subject: [PATCH 103/123] Increase recursive template instantiation limit
 fixing a compilation error with icpx.

---
 tests/backends/Kokkos/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt
index 4a7d23501..f29367a27 100644
--- a/tests/backends/Kokkos/CMakeLists.txt
+++ b/tests/backends/Kokkos/CMakeLists.txt
@@ -37,6 +37,9 @@ if (Kokkos_ENABLE_CUDA)
     endif ()
 endif ()
 
+# increase recursive template instantiation limit
+target_compile_options(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU,Clang,IntelLLVM>:-ftemplate-depth=2048>>)
+
 # link against test library
 target_link_libraries(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE ${PLSSVM_BASE_TEST_LIBRARY_NAME})
 

From ae4d02e9932abf9c25a79bf69e0df76fe033b8e0 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 16:01:51 +0100
Subject: [PATCH 104/123] Update some tests.

---
 tests/backends/generic_csvm_tests.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/backends/generic_csvm_tests.hpp b/tests/backends/generic_csvm_tests.hpp
index 4c5d59738..ae3fc8a0e 100644
--- a/tests/backends/generic_csvm_tests.hpp
+++ b/tests/backends/generic_csvm_tests.hpp
@@ -35,7 +35,7 @@
 #include "tests/types_to_test.hpp"          // util::{test_parameter_type_at_t, test_parameter_value_at_v}
 #include "tests/utility.hpp"                // util::{redirect_output, generate_specific_matrix, construct_from_tuple, flatten, generate_random_matrix}
 
-#include "fmt/format.h"   // fmt::format
+#include "fmt/format.h"  // fmt::format
 #include "fmt/ranges.h"
 #include "gmock/gmock.h"  // ::testing::HasSubstr
 #include "gtest/gtest.h"  // TYPED_TEST_SUITE_P, TYPED_TEST_P, REGISTER_TYPED_TEST_SUITE_P, EXPECT_EQ, EXPECT_NE, EXPECT_GT, EXPECT_TRUE, EXPECT_DEATH,
@@ -803,9 +803,11 @@ TYPED_TEST_P(GenericCSVMSolver, solve_lssvm_system_of_linear_equations_trivial)
 
     // check the calculated result for correctness
     EXPECT_FLOATING_POINT_MATRIX_NEAR_EPS(calculated_x, (plssvm::aos_matrix<plssvm::real_type>{ B, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE } }), 1e6);
-    EXPECT_TRUE(std::all_of(calculated_rho.cbegin(), calculated_rho.cend(), [front = std::abs(calculated_rho.front())](const plssvm::real_type rho) { return std::abs(rho) == front; }));
+    EXPECT_TRUE(std::all_of(calculated_rho.cbegin(), calculated_rho.cend(), [front = std::abs(calculated_rho.front())](const plssvm::real_type rho) {
+        return std::abs(std::abs(rho) - front) <= std::numeric_limits<plssvm::real_type>::epsilon();
+    }));
     for (const auto rho : calculated_rho) {
-        EXPECT_FLOATING_POINT_NEAR(std::abs(rho) - std::numeric_limits<plssvm::real_type>::epsilon(), std::numeric_limits<plssvm::real_type>::epsilon());
+        EXPECT_LE(std::abs(rho) - std::numeric_limits<plssvm::real_type>::epsilon(), std::numeric_limits<plssvm::real_type>::epsilon());
     }
     EXPECT_THAT(num_iters, ::testing::Each(::testing::Gt(0)));
 }

From 03bd55eae711d965c676281789fe0f8fe0923015 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 16:02:35 +0100
Subject: [PATCH 105/123] Do not pass argc and argv to the Kokkos
 initialization functions.

---
 src/main_predict.cpp | 2 +-
 src/main_train.cpp   | 2 +-
 tests/main.cpp       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index 58015d928..8d9f6d1a6 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -88,7 +88,7 @@ int main(int argc, char *argv[]) {
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             // initialize Kokkos if necessary
             if (use_kokkos_as_backend) {
-                kokkos_guard = std::make_unique<Kokkos::ScopeGuard>(argc, argv);
+                kokkos_guard = std::make_unique<Kokkos::ScopeGuard>();
                 PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
             }
 #endif
diff --git a/src/main_train.cpp b/src/main_train.cpp
index 93cb2abe8..bf5bc3ec5 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -84,7 +84,7 @@ int main(int argc, char *argv[]) {
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
             // initialize Kokkos if necessary
             if (use_kokkos_as_backend) {
-                kokkos_guard = std::make_unique<Kokkos::ScopeGuard>(argc, argv);
+                kokkos_guard = std::make_unique<Kokkos::ScopeGuard>();
                 PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
             }
 #endif
diff --git a/tests/main.cpp b/tests/main.cpp
index 614b38cff..1500301bf 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -49,7 +49,7 @@ int main(int argc, char **argv) {
 
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     // initialize Kokkos using a Kokkos::ScopeGuard
-    const Kokkos::ScopeGuard guard{ argc, argv };
+    const Kokkos::ScopeGuard guard{};
 #endif
 
     // prevent problems with fork() in the presence of multiple threads

From 4e4be674590ab21da5d969f4e9f1f6d53173c93c Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 11 Nov 2024 16:03:28 +0100
Subject: [PATCH 106/123] Use an atexit handler to fix segfault emerging with
 icpx in the tests testing std::exit(0).

---
 tests/main.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/main.cpp b/tests/main.cpp
index 1500301bf..427426318 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -15,6 +15,8 @@
 
 #include "gtest/gtest.h"  // RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG},GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST definitions
 
+#include <cstdlib>  // std::atexit
+
 // silence GTest warnings/test errors
 
 // generic CSVM tests
@@ -44,12 +46,21 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
 // exception tests
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
 
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+void kokkos_ensure_finalization() {
+    if (!Kokkos::is_finalized()) {
+        Kokkos::finalize();
+    }
+}
+#endif
+
 int main(int argc, char **argv) {
     ::testing::InitGoogleTest(&argc, argv);
 
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     // initialize Kokkos using a Kokkos::ScopeGuard
     const Kokkos::ScopeGuard guard{};
+    [[maybe_unused]] const int ret = std::atexit(kokkos_ensure_finalization);
 #endif
 
     // prevent problems with fork() in the presence of multiple threads

From cd36293c78c5828d10caa34b8fd9e0573218ec4f Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 13 Nov 2024 17:58:27 +0100
Subject: [PATCH 107/123] Add custom PLSSVM initialization and finalization
 functions including a scope_guard to manage environment setup and teardown.

---
 README.md                           |   8 ++
 bindings/Python/CMakeLists.txt      |   1 +
 bindings/Python/environment.cpp     |  35 +++++++
 bindings/Python/main.cpp            |   2 +
 examples/cpp/main.cpp               |   3 +
 examples/python/main.py             |   3 +
 examples/python/sklearn_like_svc.py |   3 +
 include/plssvm/core.hpp             |   1 +
 include/plssvm/environment.hpp      | 148 ++++++++++++++++++++++++++++
 src/main_predict.cpp                |  21 ++--
 src/main_train.cpp                  |  21 ++--
 tests/main.cpp                      |  20 ++--
 12 files changed, 225 insertions(+), 41 deletions(-)
 create mode 100644 bindings/Python/environment.cpp
 create mode 100644 include/plssvm/environment.hpp

diff --git a/README.md b/README.md
index 047a9fd23..280c3a9e3 100644
--- a/README.md
+++ b/README.md
@@ -704,6 +704,10 @@ A simple C++ program (`main.cpp`) using PLSSVM as library could look like:
 #include <vector>
 
 int main() {
+    // correctly initialize and finalize environments
+    // Note: currently only really necessary if Kokkos is enabled, since only Kokkos needs special environmental setup
+    plssvm::environment::scope_guard environment_guard{};
+    
     try {
         // create a new C-SVM parameter set, explicitly overriding the default kernel function
         const plssvm::parameter params{ plssvm::kernel_type = plssvm::kernel_function_type::polynomial };
@@ -769,6 +773,10 @@ Roughly the same can be achieved using our Python bindings with the following Py
 import plssvm
 from sklearn.metrics import classification_report
 
+# correctly initialize and finalize environments
+# Note: currently only really necessary if Kokkos is enabled, since only Kokkos needs special environmental setup
+environment_guard = plssvm.environment.ScopeGuard()
+
 try:
     # create a new C-SVM parameter set, explicitly overriding the default kernel function
     params = plssvm.Parameter(kernel_type=plssvm.KernelFunctionType.POLYNOMIAL)
diff --git a/bindings/Python/CMakeLists.txt b/bindings/Python/CMakeLists.txt
index 8dfba2e04..d0a34223c 100644
--- a/bindings/Python/CMakeLists.txt
+++ b/bindings/Python/CMakeLists.txt
@@ -41,6 +41,7 @@ set(PLSSVM_PYTHON_BINDINGS_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/classification_types.cpp
     ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp
     ${CMAKE_CURRENT_LIST_DIR}/data_set.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/environment.cpp
     ${CMAKE_CURRENT_LIST_DIR}/file_format_types.cpp
     ${CMAKE_CURRENT_LIST_DIR}/gamma.cpp
     ${CMAKE_CURRENT_LIST_DIR}/kernel_function_types.cpp
diff --git a/bindings/Python/environment.cpp b/bindings/Python/environment.cpp
new file mode 100644
index 000000000..5de39b9dc
--- /dev/null
+++ b/bindings/Python/environment.cpp
@@ -0,0 +1,35 @@
+/**
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "plssvm/environment.hpp"
+
+#include "plssvm/backend_types.hpp"  // plssvm::backend_type, plssvm::list_available_backends
+
+#include "pybind11/pybind11.h"  // py::module_, py::enum_
+#include "pybind11/stl.h"       // support for STL types: std::variant
+
+#include <vector>  // std::vector
+
+namespace py = pybind11;
+
+void init_environment(py::module_ &m) {
+    // use its own submodule for the environment related bindings
+    py::module_ env_module = m.def_submodule("environment", "a module containing all environment initialization and finalization functionality");
+
+    // bind free functions managing environment setup and teardown
+    env_module.def("is_initialized", &plssvm::environment::is_initialized, "check whether the environments have already been initialized");
+    env_module.def("is_finalized", &plssvm::environment::is_finalized, "check whether the environments have already been finalized");
+
+    env_module.def("initialize", py::overload_cast<const std::vector<plssvm::backend_type> &>(&plssvm::environment::initialize), "initialize all requested backends, if available", py::arg("backends_to_init") = plssvm::list_available_backends());
+    env_module.def("finalize", &plssvm::environment::finalize, "finalize all environments");
+
+    // bind plssvm::environment::scope_guard
+    py::class_<plssvm::environment::scope_guard>(env_module, "ScopeGuard")
+        .def(py::init<>())
+        .def(py::init<std::vector<plssvm::backend_type>>());
+}
diff --git a/bindings/Python/main.cpp b/bindings/Python/main.cpp
index c49d57092..bd20e8f9d 100644
--- a/bindings/Python/main.cpp
+++ b/bindings/Python/main.cpp
@@ -31,6 +31,7 @@ void init_parameter(py::module_ &);
 void init_model(py::module_ &);
 void init_data_set(py::module_ &);
 void init_version(py::module_ &);
+void init_environment(py::module_ &);
 void init_exceptions(py::module_ &, const py::exception<plssvm::exception> &);
 void init_csvm(py::module_ &);
 void init_openmp_csvm(py::module_ &, const py::exception<plssvm::exception> &);
@@ -78,6 +79,7 @@ PYBIND11_MODULE(plssvm, m) {
     init_model(m);
     init_data_set(m);
     init_version(m);
+    init_environment(m);
     init_exceptions(m, base_exception);
     init_csvm(m);
 
diff --git a/examples/cpp/main.cpp b/examples/cpp/main.cpp
index 6b6f4561e..f0e210b86 100644
--- a/examples/cpp/main.cpp
+++ b/examples/cpp/main.cpp
@@ -5,6 +5,9 @@
 #include <vector>
 
 int main() {
+    // correctly initialize and finalize environments
+    plssvm::environment::scope_guard environment_guard{};
+
     try {
         // create a new C-SVM parameter set, explicitly overriding the default kernel function
         const plssvm::parameter params{ plssvm::kernel_type = plssvm::kernel_function_type::polynomial };
diff --git a/examples/python/main.py b/examples/python/main.py
index 1eb831243..611235102 100644
--- a/examples/python/main.py
+++ b/examples/python/main.py
@@ -1,6 +1,9 @@
 import plssvm
 from sklearn.metrics import classification_report
 
+# correctly initialize and finalize environments
+environment_guard = plssvm.environment.ScopeGuard()
+
 try:
     # create a new C-SVM parameter set, explicitly overriding the default kernel function
     params = plssvm.Parameter(kernel_type=plssvm.KernelFunctionType.POLYNOMIAL)
diff --git a/examples/python/sklearn_like_svc.py b/examples/python/sklearn_like_svc.py
index 4b5e5f44f..57ab3e148 100644
--- a/examples/python/sklearn_like_svc.py
+++ b/examples/python/sklearn_like_svc.py
@@ -1,6 +1,9 @@
 from sklearn.datasets import make_classification
 import plssvm
 
+# correctly initialize and finalize environments
+environment_guard = plssvm.environment.ScopeGuard()
+
 num_samples = 2**8
 num_features = 2**6
 
diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp
index dce0c5436..79ba695e1 100644
--- a/include/plssvm/core.hpp
+++ b/include/plssvm/core.hpp
@@ -22,6 +22,7 @@
 #include "plssvm/csvm.hpp"                                   // the base C-SVM every backend is inheriting from
 #include "plssvm/csvm_factory.hpp"                           // a factory function to instantiate a C-SVM using a runtime backend; includes the available backend C-SVMs
 #include "plssvm/data_set.hpp"                               // a data set used for training a C-SVM
+#include "plssvm/environment.hpp"                            // environment management functions and classes
 #include "plssvm/exceptions/exceptions.hpp"                  // exception hierarchy
 #include "plssvm/file_format_types.hpp"                      // all supported file format types
 #include "plssvm/gamma.hpp"                                  // the types of the gamma parameter
diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp
new file mode 100644
index 000000000..0c1ddec17
--- /dev/null
+++ b/include/plssvm/environment.hpp
@@ -0,0 +1,148 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Header handling the correct runtime setup and teardown.
+ * @note Must be implemented in the header file due to linking of backend specific libraries.
+ * @attention This header should **not** be included in any base library file!
+ */
+
+#ifndef PLSSVM_ENVIRONMENT_HPP_
+#define PLSSVM_ENVIRONMENT_HPP_
+
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    #include "Kokkos_Core.hpp"  // Kokkos::is_initialized, Kokkos::is_finalized, Kokkos::initialize, Kokkos::finalize
+#endif
+
+#include "plssvm/backend_types.hpp"   // plssvm::backend_type, plssvm::list_available_backends
+#include "plssvm/detail/utility.hpp"  // plssvm::detail::contains
+
+#include <vector>  // std::vector
+
+namespace plssvm::environment {
+
+/**
+ * @brief Check, whether the environments have already been initialized correctly.
+ * @return `true` if the environments are initialized correctly, `false` otherwise
+ */
+[[nodiscard]] inline bool is_initialized() {
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    return Kokkos::is_initialized();
+#else
+    return true;
+#endif
+}
+
+/**
+ * @brief Check, whether the environments have been finalized correctly.
+ * @return `true` if the environments are finalized correctly, `false` otherwise
+ */
+[[nodiscard]] inline bool is_finalized() {
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    return Kokkos::is_finalized();
+#else
+    return true;
+#endif
+}
+
+/**
+ * @brief Initialize all necessary environments in order for all PLSSVM backends to work properly.
+ */
+inline void initialize([[maybe_unused]] const std::vector<backend_type> &backends_to_init = list_available_backends()) {
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    if (detail::contains(backends_to_init, backend_type::automatic) || detail::contains(backends_to_init, backend_type::kokkos)) {
+        Kokkos::initialize();
+    }
+#endif
+}
+
+/**
+ * @brief Initialize all necessary environments in order for all PLSSVM backends to work properly.
+ * @param[in] argc the number of provided command line arguments
+ * @param[in] argv the provided command line arguments
+ */
+inline void initialize([[maybe_unused]] int &argc, [[maybe_unused]] char **argv, [[maybe_unused]] const std::vector<backend_type> &backends_to_init = list_available_backends()) {
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    if (detail::contains(backends_to_init, backend_type::automatic) || detail::contains(backends_to_init, backend_type::kokkos)) {
+        Kokkos::initialize(argc, argv);
+    }
+#endif
+}
+
+/**
+ * @brief Finalize all necessary environments in order for all PLSSVM backends to work properly.
+ */
+inline void finalize([[maybe_unused]] const std::vector<backend_type> &backends_to_init = list_available_backends()) {
+#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
+    if (detail::contains(backends_to_init, backend_type::automatic) || detail::contains(backends_to_init, backend_type::kokkos)) {
+        Kokkos::finalize();
+    }
+#endif
+}
+
+/**
+ * @brief A scope guard to initialize and automatically finalize all necessary environments in order for all PLSSVM backends to work properly.
+ */
+class [[nodiscard]] scope_guard {
+  public:
+    /**
+     * @brief If the environments are not already initialized, initialize all necessary environments in order for all PLSSVM backends to work properly
+     */
+    explicit scope_guard(std::vector<backend_type> backends_to_init = list_available_backends()) :
+        backends_to_init_{ std::move(backends_to_init) } {
+        if (!is_initialized()) {
+            initialize(backends_to_init_);
+        }
+    }
+
+    /**
+     * @brief If the environments are not already initialized, initialize all necessary environments in order for all PLSSVM backends to work properly
+     * @param[in] argc the number of provided command line arguments
+     * @param[in] argv the provided command line arguments
+     */
+    scope_guard(int &argc, char **argv, std::vector<backend_type> backends_to_init = list_available_backends()) :
+        backends_to_init_{ std::move(backends_to_init) } {
+        if (!is_initialized()) {
+            initialize(argc, argv, backends_to_init_);
+        }
+    }
+
+    /**
+     * @brief Delete copy-constructor since a scope_guard is a move-only type.
+     */
+    scope_guard(const scope_guard &) = delete;
+    /**
+     * @brief Default move-constructor.
+     */
+    scope_guard(scope_guard &&) noexcept = default;
+    /**
+     * @brief Delete copy-assignment operator since a scope_guard is a move-only type.
+     * @return `*this`
+     */
+    scope_guard &operator=(const scope_guard &) = delete;
+    /**
+     * @brief Default move-assignment operator.
+     * @return `*this`
+     */
+    scope_guard &operator=(scope_guard &&) noexcept = default;
+
+    /**
+     * @brief If the environments are not already finalized, finalize all necessary environments in order for all PLSSVM backends to work properly.
+     */
+    ~scope_guard() {
+        if (!is_finalized()) {
+            finalize(backends_to_init_);
+        }
+    }
+
+  private:
+    std::vector<backend_type> backends_to_init_{};
+};
+
+}  // namespace plssvm::environment
+
+#endif  // PLSSVM_ENVIRONMENT_HPP_
diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index 8d9f6d1a6..762b68c86 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -18,10 +18,6 @@
 #include "plssvm/detail/assert.hpp"                        // PLSSVM_ASSERT
 #include "plssvm/detail/utility.hpp"                       // PLSSVM_IS_DEFINED
 
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-    #include "Kokkos_Core.hpp"  // Kokkos::initialize, Kokkos::is_initialized, Kokkos::finalize, Kokkos::is_finalized
-#endif
-
 #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED)
     #include "hws/system_hardware_sampler.hpp"  // hws::system_hardware_sampler
 #endif
@@ -45,11 +41,9 @@
 using namespace std::chrono_literals;
 
 int main(int argc, char *argv[]) {
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-    // create std::unique_ptr containing a Kokkos::ScopeGuard
-    // -> used to automatically handle Kokkos::finalize
-    std::unique_ptr<Kokkos::ScopeGuard> kokkos_guard{};
-#endif
+    // create std::unique_ptr containing a plssvm::scope_guard
+    // -> used to automatically handle necessary environment teardown operations
+    std::unique_ptr<plssvm::environment::scope_guard> environment_guard{};
 
     try {
         const std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
@@ -85,13 +79,12 @@ int main(int argc, char *argv[]) {
             // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
             const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-            // initialize Kokkos if necessary
+            // initialize environments if necessary
+            std::vector<plssvm::backend_type> backends_to_initialize{};
             if (use_kokkos_as_backend) {
-                kokkos_guard = std::make_unique<Kokkos::ScopeGuard>();
-                PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
+                backends_to_initialize.push_back(plssvm::backend_type::kokkos);
             }
-#endif
+            environment_guard = std::make_unique<plssvm::environment::scope_guard>(backends_to_initialize);
 
             // create default csvm
             const std::unique_ptr<plssvm::csvm> svm = [&]() {
diff --git a/src/main_train.cpp b/src/main_train.cpp
index bf5bc3ec5..9fb2192d6 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -17,10 +17,6 @@
 #include "plssvm/detail/assert.hpp"                        // PLSSVM_ASSERT
 #include "plssvm/detail/utility.hpp"                       // PLSSVM_IS_DEFINED
 
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-    #include "Kokkos_Core.hpp"  // Kokkos::initialize, Kokkos::is_initialized, Kokkos::finalize, Kokkos::is_finalized
-#endif
-
 #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED)
     #include "hws/system_hardware_sampler.hpp"  // hws::system_hardware_sampler
 #endif
@@ -41,11 +37,9 @@
 using namespace std::chrono_literals;
 
 int main(int argc, char *argv[]) {
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-    // create std::unique_ptr containing a Kokkos::ScopeGuard
-    // -> used to automatically handle Kokkos::finalize
-    std::unique_ptr<Kokkos::ScopeGuard> kokkos_guard{};
-#endif
+    // create std::unique_ptr containing a plssvm::scope_guard
+    // -> used to automatically handle necessary environment teardown operations
+    std::unique_ptr<plssvm::environment::scope_guard> environment_guard{};
 
     try {
         const std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
@@ -81,13 +75,12 @@ int main(int argc, char *argv[]) {
             // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
             const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-            // initialize Kokkos if necessary
+            // initialize environments if necessary
+            std::vector<plssvm::backend_type> backends_to_initialize{};
             if (use_kokkos_as_backend) {
-                kokkos_guard = std::make_unique<Kokkos::ScopeGuard>();
-                PLSSVM_ASSERT(Kokkos::is_initialized(), "Something went wrong initializing the Kokkos environment!");
+                backends_to_initialize.push_back(plssvm::backend_type::kokkos);
             }
-#endif
+            environment_guard = std::make_unique<plssvm::environment::scope_guard>(backends_to_initialize);
 
             // create SVM
             const std::unique_ptr<plssvm::csvm> svm = [&]() {
diff --git a/tests/main.cpp b/tests/main.cpp
index 427426318..0623c7a26 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -9,9 +9,7 @@
  * @brief Contains the googletest main function. Sets the DeathTest to "threadsafe" execution instead of "fast".
  */
 
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-    #include "Kokkos_Core.hpp"  // Kokkos::ScopeGuard
-#endif
+#include "plssvm/environment.hpp"  // plssvm::environment::scope_guard
 
 #include "gtest/gtest.h"  // RUN_ALL_TESTS, ::testing::{InitGoogleTest, GTEST_FLAG},GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST definitions
 
@@ -46,22 +44,18 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
 // exception tests
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
 
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-void kokkos_ensure_finalization() {
-    if (!Kokkos::is_finalized()) {
-        Kokkos::finalize();
+void ensure_finalization() {
+    if (!plssvm::environment::is_finalized()) {
+        plssvm::environment::finalize();
     }
 }
-#endif
 
 int main(int argc, char **argv) {
     ::testing::InitGoogleTest(&argc, argv);
 
-#if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-    // initialize Kokkos using a Kokkos::ScopeGuard
-    const Kokkos::ScopeGuard guard{};
-    [[maybe_unused]] const int ret = std::atexit(kokkos_ensure_finalization);
-#endif
+    // initialize environments
+    const plssvm::environment::scope_guard environment_guard{};
+    [[maybe_unused]] const int ret = std::atexit(ensure_finalization);
 
     // prevent problems with fork() in the presence of multiple threads
     // https://github.com/google/googletest/blob/main/docs/advanced.md#death-tests-and-threads

From e5fd3c4c9ed6cb570e2dca2e6d12f30f4adc1b27 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Wed, 13 Nov 2024 18:02:30 +0100
Subject: [PATCH 108/123] Change the order of available target platforms so
 that the correct one is used if target_platform::automatic is used.

---
 src/plssvm/target_platforms.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/plssvm/target_platforms.cpp b/src/plssvm/target_platforms.cpp
index f5569b51d..8fc47e223 100644
--- a/src/plssvm/target_platforms.cpp
+++ b/src/plssvm/target_platforms.cpp
@@ -22,9 +22,6 @@ namespace plssvm {
 
 std::vector<target_platform> list_available_target_platforms() {
     std::vector<target_platform> available_targets = { target_platform::automatic };
-#if defined(PLSSVM_HAS_CPU_TARGET)
-    available_targets.push_back(target_platform::cpu);
-#endif
 #if defined(PLSSVM_HAS_NVIDIA_TARGET)
     available_targets.push_back(target_platform::gpu_nvidia);
 #endif
@@ -34,6 +31,9 @@ std::vector<target_platform> list_available_target_platforms() {
 #if defined(PLSSVM_HAS_INTEL_TARGET)
     available_targets.push_back(target_platform::gpu_intel);
 #endif
+#if defined(PLSSVM_HAS_CPU_TARGET)
+    available_targets.push_back(target_platform::cpu);
+#endif
 
     // automatic is ALWAYS available but AT LEAST ONE other target must be available in addition
     PLSSVM_ASSERT(available_targets.size() > 1, "Besides \"automatic\" at least one other target must be available!");

From 4bbea212f5111d4a9782b2d2d10d9c84c791ba37 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 14 Nov 2024 09:41:47 +0100
Subject: [PATCH 109/123] Add missing plssvmKokkosTargets.cmake file to install
 files.

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 790ec4268..d9df65226 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -842,6 +842,7 @@ install(FILES
         "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmOpenMPTargets.cmake"
         "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmAdaptiveCppTargets.cmake"
         "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmDPCPPTargets.cmake"
+        "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmKokkosTargets.cmake"
         "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmstdparTargets.cmake"
         DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/plssvm/cmake
 )

From c81fcfa54b1989c76d287b9b9eef6f64fc13b9f4 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 15 Nov 2024 14:02:10 +0100
Subject: [PATCH 110/123] Add missing utility header entry after adding new
 exception type.

---
 tests/exceptions/utility.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/exceptions/utility.hpp b/tests/exceptions/utility.hpp
index bd3e1d5aa..de99f06d0 100644
--- a/tests/exceptions/utility.hpp
+++ b/tests/exceptions/utility.hpp
@@ -53,6 +53,7 @@ PLSSVM_CREATE_EXCEPTION_TYPE_NAME(matrix_exception)
 PLSSVM_CREATE_EXCEPTION_TYPE_NAME(kernel_launch_resources)
 PLSSVM_CREATE_EXCEPTION_TYPE_NAME(classification_report_exception)
 PLSSVM_CREATE_EXCEPTION_TYPE_NAME(platform_devices_empty)
+PLSSVM_CREATE_EXCEPTION_TYPE_NAME(environment_exception)
 
 }  // namespace util
 

From 7065929e61fad4ed1b502c3944f5bdcf9f300770 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 15 Nov 2024 14:02:35 +0100
Subject: [PATCH 111/123] The new finalize function must not be enclosed in an
 if.

---
 tests/main.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/main.cpp b/tests/main.cpp
index 0623c7a26..1fc7b9aab 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -45,9 +45,7 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DevicePtrDeathTest);
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Exception);
 
 void ensure_finalization() {
-    if (!plssvm::environment::is_finalized()) {
-        plssvm::environment::finalize();
-    }
+    plssvm::environment::finalize();
 }
 
 int main(int argc, char **argv) {
@@ -55,6 +53,7 @@ int main(int argc, char **argv) {
 
     // initialize environments
     const plssvm::environment::scope_guard environment_guard{};
+    // Note: necessary for Kokkos::SYCL
     [[maybe_unused]] const int ret = std::atexit(ensure_finalization);
 
     // prevent problems with fork() in the presence of multiple threads

From cd8837870545375e680c787a066f32fd6818cb8c Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 15 Nov 2024 14:02:47 +0100
Subject: [PATCH 112/123] Add missing detail:: namespace qualifier.

---
 include/plssvm/environment.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp
index 270c90e2d..9f2316651 100644
--- a/include/plssvm/environment.hpp
+++ b/include/plssvm/environment.hpp
@@ -153,7 +153,7 @@ template <auto is_initialized_function, auto is_finalized_function>
         case backend_type::kokkos:
             {
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
-                return determine_status_from_initialized_finalized_functions<Kokkos::is_initialized, Kokkos::is_finalized>();
+                return detail::determine_status_from_initialized_finalized_functions<Kokkos::is_initialized, Kokkos::is_finalized>();
 #else
                 return status::unnecessary;
 #endif

From b5c8df043c00f3e91c611ade62a48a30fa7cfe5c Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 15 Nov 2024 14:05:19 +0100
Subject: [PATCH 113/123] Mark part of function as unreachable.

---
 include/plssvm/environment.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp
index 9f2316651..40442995d 100644
--- a/include/plssvm/environment.hpp
+++ b/include/plssvm/environment.hpp
@@ -111,6 +111,8 @@ namespace detail {
     } else if (is_finalized) {
         return status::finalized;
     }
+    // should never be reached!
+    ::plssvm::detail::unreachable();
 }
 
 /**

From 69eff96847b17c9cf1870ebd80892e804ced330a Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Fri, 15 Nov 2024 14:07:53 +0100
Subject: [PATCH 114/123] Fix formatting errors.

---
 include/plssvm/backends/Kokkos/exceptions.hpp | 36 +++++++++----------
 include/plssvm/parameter.hpp                  |  1 -
 tests/backends/Kokkos/utility.hpp             | 21 ++++++-----
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/include/plssvm/backends/Kokkos/exceptions.hpp b/include/plssvm/backends/Kokkos/exceptions.hpp
index 047b7cad8..60a9fc8dd 100644
--- a/include/plssvm/backends/Kokkos/exceptions.hpp
+++ b/include/plssvm/backends/Kokkos/exceptions.hpp
@@ -1,13 +1,13 @@
 /**
-* @file
-* @author Alexander Van Craen
-* @author Marcel Breyer
-* @copyright 2018-today The PLSSVM project - All Rights Reserved
-* @license This file is part of the PLSSVM project which is released under the MIT license.
-*          See the LICENSE.md file in the project root for full license information.
-*
-* @brief Implements custom exception classes specific to the Kokkos backend.
-*/
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Implements custom exception classes specific to the Kokkos backend.
+ */
 
 #ifndef PLSSVM_BACKENDS_KOKKOS_EXCEPTIONS_HPP_
 #define PLSSVM_BACKENDS_KOKKOS_EXCEPTIONS_HPP_
@@ -21,16 +21,16 @@
 namespace plssvm::kokkos {
 
 /**
-* @brief Exception type thrown if a problem with the Kokkos backend occurs.
-*/
+ * @brief Exception type thrown if a problem with the Kokkos backend occurs.
+ */
 class backend_exception : public exception {
- public:
-   /**
-    * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception.
-    * @param[in] msg the exception's `what()` message
-    * @param[in] loc the exception's call side information
-    */
-   explicit backend_exception(const std::string &msg, source_location loc = source_location::current());
+  public:
+    /**
+     * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception.
+     * @param[in] msg the exception's `what()` message
+     * @param[in] loc the exception's call side information
+     */
+    explicit backend_exception(const std::string &msg, source_location loc = source_location::current());
 };
 
 }  // namespace plssvm::kokkos
diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp
index 1f229e98a..516c66386 100644
--- a/include/plssvm/parameter.hpp
+++ b/include/plssvm/parameter.hpp
@@ -81,7 +81,6 @@ constexpr bool has_only_sycl_parameter_named_args_v = !igor::has_other_than<Args
 template <typename... Args>
 constexpr bool has_only_kokkos_parameter_named_args_v = !igor::has_other_than<Args...>(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::kokkos_execution_space);
 
-
 }  // namespace detail
 
 /**
diff --git a/tests/backends/Kokkos/utility.hpp b/tests/backends/Kokkos/utility.hpp
index 872d4c624..3c3458198 100644
--- a/tests/backends/Kokkos/utility.hpp
+++ b/tests/backends/Kokkos/utility.hpp
@@ -1,13 +1,13 @@
 /**
-* @file
-* @author Alexander Van Craen
-* @author Marcel Breyer
-* @copyright 2018-today The PLSSVM project - All Rights Reserved
-* @license This file is part of the PLSSVM project which is released under the MIT license.
-*          See the LICENSE.md file in the project root for full license information.
-*
-* @brief Determine the execution spaces available for tests with the Kokkos backend.
-*/
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Determine the execution spaces available for tests with the Kokkos backend.
+ */
 
 #ifndef PLSSVM_TESTS_BACKENDS_KOKKOS_UTILITY_HPP_
 #define PLSSVM_TESTS_BACKENDS_KOKKOS_UTILITY_HPP_
@@ -90,7 +90,6 @@ struct create_kokkos_test_tuple {
 template <template <plssvm::kokkos::execution_space> typename test_type>
 using create_kokkos_test_tuple_t = typename create_kokkos_test_tuple<test_type>::type;
 
-
-}
+}  // namespace util
 
 #endif  // PLSSVM_TESTS_BACKENDS_KOKKOS_UTILITY_HPP_

From 07e85c2775abd57a078891d11da08da798233691 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Mon, 25 Nov 2024 12:26:45 +0100
Subject: [PATCH 115/123] Update format library and disable JSON support in our
 .clang-format.

---
 .clang-format  | 3 +++
 CMakeLists.txt | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.clang-format b/.clang-format
index 533b9bcab..6dc2d9fba 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,4 +1,7 @@
 ---
+Language: Json
+DisableFormat: true
+---
 Language: Cpp
 AccessModifierOffset: -2
 AlignAfterOpenBracket: Align
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e7b0ec57..42ef133d7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -643,7 +643,7 @@ if (PLSSVM_ENABLE_FORMATTING)
     list(APPEND CMAKE_MESSAGE_INDENT "Formatting:  ")
     
     ## install library to add a clang-format target
-    set(PLSSVM_format_VERSION 7021abbf066e2e577926731c3fa4141f456c5024)
+    set(PLSSVM_format_VERSION d22c36043bea6ef85f3eb68b823f50703bd1cc21)
     find_package(format QUIET)
     if (format_FOUND)
         message(STATUS "Found package format.")

From 79072917eab93d8fa9f21ec5d530d8350cb5f55d Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 28 Nov 2024 11:14:50 +0100
Subject: [PATCH 116/123] Fix merge error.

---
 src/main_predict.cpp | 15 +++++----------
 src/main_train.cpp   | 15 +++++----------
 2 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index 9db67a20c..3d47ad53f 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -76,24 +76,19 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
-            // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
-            const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
-
-            // initialize environments if necessary
-            std::vector<plssvm::backend_type> backends_to_initialize{};
-            if (use_kokkos_as_backend) {
-                backends_to_initialize.push_back(plssvm::backend_type::kokkos);
-            }
-            environment_guard = std::make_unique<plssvm::environment::scope_guard>(backends_to_initialize);
-
             // check whether HPX is used as backend (it is either requested directly or as automatic backend)
             const bool use_hpx_as_backend{ cmd_parser.backend == plssvm::backend_type::hpx || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::hpx) };
+            // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
+            const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
             // initialize environments if necessary
             std::vector<plssvm::backend_type> backends_to_initialize{};
             if (use_hpx_as_backend) {
                 backends_to_initialize.push_back(plssvm::backend_type::hpx);
             }
+            if (use_kokkos_as_backend) {
+                backends_to_initialize.push_back(plssvm::backend_type::kokkos);
+            }
             environment_guard = std::make_unique<plssvm::environment::scope_guard>(backends_to_initialize);
 
             // create default csvm
diff --git a/src/main_train.cpp b/src/main_train.cpp
index 45045c6cd..2e2a39905 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -72,24 +72,19 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
-            // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
-            const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
-
-            // initialize environments if necessary
-            std::vector<plssvm::backend_type> backends_to_initialize{};
-            if (use_kokkos_as_backend) {
-                backends_to_initialize.push_back(plssvm::backend_type::kokkos);
-            }
-            environment_guard = std::make_unique<plssvm::environment::scope_guard>(backends_to_initialize);
-
             // check whether HPX is used as backend (it is either requested directly or as automatic backend)
             const bool use_hpx_as_backend{ cmd_parser.backend == plssvm::backend_type::hpx || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::hpx) };
+            // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
+            const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
             // initialize environments if necessary
             std::vector<plssvm::backend_type> backends_to_initialize{};
             if (use_hpx_as_backend) {
                 backends_to_initialize.push_back(plssvm::backend_type::hpx);
             }
+            if (use_kokkos_as_backend) {
+                backends_to_initialize.push_back(plssvm::backend_type::kokkos);
+            }
             environment_guard = std::make_unique<plssvm::environment::scope_guard>(backends_to_initialize);
 
             // create SVM

From efd5f81001099e89fb3e30525271febc82a352aa Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 28 Nov 2024 13:54:39 +0100
Subject: [PATCH 117/123] Increase backend number to fix failing tests.

---
 tests/backend_types.cpp | 2 +-
 tests/csvm_factory.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/backend_types.cpp b/tests/backend_types.cpp
index c4d25be14..8a735a26b 100644
--- a/tests/backend_types.cpp
+++ b/tests/backend_types.cpp
@@ -45,7 +45,7 @@ TEST(BackendType, to_string) {
 
 TEST(BackendType, to_string_unknown) {
     // check conversions to std::string from unknown backend_type
-    EXPECT_CONVERSION_TO_STRING(static_cast<plssvm::backend_type>(8), "unknown");
+    EXPECT_CONVERSION_TO_STRING(static_cast<plssvm::backend_type>(9), "unknown");
 }
 
 // check whether the std::string -> plssvm::backend_type conversions are correct
diff --git a/tests/csvm_factory.cpp b/tests/csvm_factory.cpp
index 926050ad9..f9bdd783e 100644
--- a/tests/csvm_factory.cpp
+++ b/tests/csvm_factory.cpp
@@ -237,7 +237,7 @@ TEST(CSVMFactory, factory_named_parameter) {
 }
 
 TEST(CSVMFactory, invalid_backend) {
-    EXPECT_THROW_WHAT(std::ignore = plssvm::make_csvm(static_cast<plssvm::backend_type>(8)),
+    EXPECT_THROW_WHAT(std::ignore = plssvm::make_csvm(static_cast<plssvm::backend_type>(9)),
                       plssvm::unsupported_backend_exception,
                       "Unrecognized backend provided!");
 }

From a797616e007f5a16d7d3f7ae0d2fe1a2692400ce Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 28 Nov 2024 15:56:19 +0100
Subject: [PATCH 118/123] Fix failing test due to floating point inaccuracies.

---
 tests/backends/generic_gpu_csvm_tests.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/backends/generic_gpu_csvm_tests.hpp b/tests/backends/generic_gpu_csvm_tests.hpp
index 38264f507..c2d8edd36 100644
--- a/tests/backends/generic_gpu_csvm_tests.hpp
+++ b/tests/backends/generic_gpu_csvm_tests.hpp
@@ -223,7 +223,7 @@ TYPED_TEST_P(GenericGPUCSVM, run_w_kernel) {
         const plssvm::soa_matrix<plssvm::real_type> correct_w = ground_truth::calculate_device_specific_w(weights, data.data(), *svm.data_distribution_, device_id);
 
         // check for correctness
-        EXPECT_FLOATING_POINT_MATRIX_NEAR(w, correct_w);
+        EXPECT_FLOATING_POINT_MATRIX_NEAR_EPS(w, correct_w, 1e6);
     }
 }
 

From 0ae874f41763577d2d487c087092832dc2134d9b Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 28 Nov 2024 17:11:20 +0100
Subject: [PATCH 119/123] Do not expose the environment related functions in
 the Python bindings but automatically handle it during module import and
 cleanup. NO user action required anymore.

---
 README.md                           |   3 -
 bindings/Python/CMakeLists.txt      |   1 -
 bindings/Python/README.md           |  30 +-------
 bindings/Python/environment.cpp     | 105 ----------------------------
 bindings/Python/main.cpp            |  11 ++-
 examples/python/main.py             |   3 -
 examples/python/sklearn_like_svc.py |   3 -
 7 files changed, 11 insertions(+), 145 deletions(-)
 delete mode 100644 bindings/Python/environment.cpp

diff --git a/README.md b/README.md
index ba98d03f1..52d95803f 100644
--- a/README.md
+++ b/README.md
@@ -805,9 +805,6 @@ Roughly the same can be achieved using our Python bindings with the following Py
 import plssvm
 from sklearn.metrics import classification_report
 
-# correctly initialize and finalize environments
-environment_guard = plssvm.environment.ScopeGuard()
-
 try:
     # create a new C-SVM parameter set, explicitly overriding the default kernel function
     params = plssvm.Parameter(kernel_type=plssvm.KernelFunctionType.POLYNOMIAL)
diff --git a/bindings/Python/CMakeLists.txt b/bindings/Python/CMakeLists.txt
index dc9615a7d..f7d4e571d 100644
--- a/bindings/Python/CMakeLists.txt
+++ b/bindings/Python/CMakeLists.txt
@@ -41,7 +41,6 @@ set(PLSSVM_PYTHON_BINDINGS_SOURCES
     ${CMAKE_CURRENT_LIST_DIR}/classification_types.cpp
     ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp
     ${CMAKE_CURRENT_LIST_DIR}/data_set.cpp
-    ${CMAKE_CURRENT_LIST_DIR}/environment.cpp
     ${CMAKE_CURRENT_LIST_DIR}/file_format_types.cpp
     ${CMAKE_CURRENT_LIST_DIR}/gamma.cpp
     ${CMAKE_CURRENT_LIST_DIR}/kernel_function_types.cpp
diff --git a/bindings/Python/README.md b/bindings/Python/README.md
index c8508d1e6..04d0cee14 100644
--- a/bindings/Python/README.md
+++ b/bindings/Python/README.md
@@ -13,7 +13,6 @@
         - [plssvm.openmp.CSVM, plssvm.hpx.CSVM, plssvm.stdpar.CSVM, plssvm.cuda.CSVM, plssvm.hip.CSVM, plssvm.opencl.CSVM, plssvm.sycl.CSVM, plssvm.dpcpp.CSVM, plssvm.adaptivecpp.CSVM, plssvm.kokkos.CSVM](#plssvmopenmpcsvm-plssvmhpxcsvm-plssvmcudacsvm-plssvmhipcsvm-plssvmopenclcsvm-plssvmsyclcsvm-plssvmdpcppcsvm-plssvmadaptivecppcsvm-plssvmkokkoscsvm)
         - [plssvm.Model](#plssvmmodel)
         - [plssvm.Version](#plssvmversion)
-        - [plssvm.environment.ScopeGuard](#plssvmenvironmentscopeguard)
         - [plssvm.detail.tracking.PerformanceTracker](#plssvmdetailtrackingperformancetracker)
         - [plssvm.detail.tracking.Events](#plssvmdetailtrackingevent-plssvmdetailtrackingevents)
     - [Free functions](#free-functions)
@@ -198,7 +197,6 @@ The following table lists all PLSSVM enumerations exposed on the Python side:
 | `ClassificationType`   | `OAA`, `OAO`                                                            | The different supported multi-class classification strategies (default: `LIBSVM`).                                                                                                                                                                          |
 | `BackendType`          | `AUTOMATIC`, `OPENMP`, `HPX`, `CUDA`, `HIP`, `OPENCL`, `SYCL`, `KOKKOS` | The different supported backends (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the selected backend depends on the used target platform.                                                                                                              |
 | `VerbosityLevel`       | `QUIET`, `LIBSVM`, `TIMING`, `FULL`                                     | The different supported log levels (default: `FULL`). `QUIET` means no output, `LIBSVM` output that is as conformant as possible with LIBSVM's output, `TIMING` all timing related outputs, and `FULL` everything. Can be combined via bit-wise operations. |
-| `Status`               | `UNINITIALIZED`, `INITIALIZED`, `FINALIZED`, `UNNECESSARY`              | The different environment status values. **Note**: located in the `plssvm.environment` module.                                                                                                                                                              |                                                                                                                                                                                                                   |
 
 If a SYCL implementation is available, additional enumerations are available:
 
@@ -343,9 +341,8 @@ If the most performant backend should be used, it is sufficient to use `plssvm.C
 `sycl_implementation_type` to choose between DPC++ and AdaptiveCpp as SYCL implementations
 and `sycl_kernel_invocation_type` to choose between the two different SYCL kernel invocation types.
 
-**Note**: if the backend type is `plssvm.BackendType.HPX` it is necessary to initialize and finalize the HPX runtime.
-The runtime can be manually managed using `plssvm.environment.initialize()` and `plssvm.environment.finalize()`.
-We recommend utilizing `plssvm.environment.ScopeGuard()` to manage the lifetime of the HPX runtime automatically.
+**Note**: if the backend type is `plssvm.BackendType.HPX` or `plssvm.BackendType.Kokkos` special initialization and finalization functions must be called. 
+However, this is **automatically** handled by our Python bindings on the module import and cleanup.
 
 | methods                                                                                                                                      | description                                                                                                                                                                                                         |
 |----------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
@@ -443,19 +440,6 @@ A class encapsulating the version information of the used PLSSVM installation.
 | `minor : int`      | The minor PLSSVM version.                 |
 | `patch : int`      | The patch PLSSVM version.                 |
 
-#### `plssvm.environment.ScopeGuard`
-
-The environmental scope guard can be used to automatically finalize all necessary backend environments when it goes out of scope.
-
-| constructors                            | description                                                                                                                                                                             |
-|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `ScopeGuard([backends={}])`             | Create a new scope guard initializing all available backend environments. If a list of backends is provided, only initializes these backends.                                           |
-| `ScopeGuard(argc, argv, [backends={}])` | Create a new scope guard initializing all available backend environments using the provided command line arguments. If a list of backends is provided, only initializes these backends. |
-
-| methods      | description                                                                                                                       |
-|--------------|-----------------------------------------------------------------------------------------------------------------------------------|
-| `backends()` | Return all initialized backends. All backends returned by this function will be finalized when the scope guard goes out of scope. |
-
 #### `plssvm.detail.tracking.PerformanceTracker`
 
 A submodule used to track various performance statistics like runtimes, but also the used setup and hyperparameters.
@@ -549,15 +533,6 @@ If a stdpar implementation is available, additional free functions are available
 |-------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|
 | `list_available_stdpar_implementations()` | List all available stdpar implementations (determined during PLSSVM's build step; currently always guaranteed to be only one implementation). |
 
-Additional free functions are available under `plssvm.environment.`.
-
-| function                                | description                                                                                                                                                  |
-|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `get_backend_status(backend)`           | Return the current environment status of the provided backend.                                                                                               |
-| `initialize([backends={}])`             | Initialize all available backend environments. If a list of backends is provided, only initializes these backends.                                           |
-| `initialize(argc, argv, [backends={}])` | Initialize all available backend environments using the provided command line arguments. If a list of backends is provided, only initializes these backends. |
-| `finalize([backends={}])`               | Finalize all available backend environments. If a list of backends is provided, only finalizes these backends.                                               |
-
 ### Exceptions
 
 The PLSSVM Python3 bindings define a few new exception types:
@@ -576,6 +551,5 @@ The PLSSVM Python3 bindings define a few new exception types:
 | `MatrixError`                | If something went wrong in the internal matrix class. **Note**: shouldn't occur in user code.                          |
 | `KernelLaunchResourcesError` | If something went wrong during a kernel launch due to insufficient ressources.                                         |
 | `ClassificationReportError`  | If something in the classification report went wrong. **Note**: shouldn't occur in user code.                          |
-| `EnvironmentError`           | If something during environment initialization or finalization went wrong.                                             |
 
 Depending on the available backends, additional `BackendError`s are also available (e.g., `plssvm.cuda.BackendError`).
diff --git a/bindings/Python/environment.cpp b/bindings/Python/environment.cpp
deleted file mode 100644
index c9a467187..000000000
--- a/bindings/Python/environment.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/**
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- */
-
-#include "plssvm/environment.hpp"
-
-#include "plssvm/backend_types.hpp"  // plssvm::backend_type, plssvm::list_available_backends
-
-#include "bindings/Python/utility.hpp"  // check_kwargs_for_correctness
-
-#include "pybind11/pybind11.h"  // py::module_, py::enum_
-#include "pybind11/pytypes.h"   // py::kwargs
-#include "pybind11/stl.h"       // support for STL types: std::variant
-
-#include <cstddef>  // std::size_t
-#include <memory>   // std::make_unique
-#include <vector>   // std::vector
-
-namespace py = pybind11;
-
-void init_environment(py::module_ &m) {
-    // use its own submodule for the environment related bindings
-    py::module_ env_module = m.def_submodule("environment", "a module containing all environment initialization and finalization functionality");
-
-    // bind enum class
-    py::enum_<plssvm::environment::status>(m, "Status")
-        .value("UNINITIALIZED", plssvm::environment::status::uninitialized, "the backend environment hasn't been initialized or finalized yet")
-        .value("INITIALIZED", plssvm::environment::status::initialized, "the backend environment has been initialized but not finalized yet")
-        .value("FINALIZED", plssvm::environment::status::finalized, "the backend environment has already been initialized and finalized")
-        .value("UNNECESSARY", plssvm::environment::status::unnecessary, "no backend environment initialization or finalization necessary");
-
-    // bind free functions
-    env_module.def("get_backend_status", &plssvm::environment::get_backend_status, "get the environment status for the provided backend");
-    env_module.def("is_initialization_necessary", &plssvm::environment::is_initialization_necessary, "check if the provided backend needs a special environment initialization");
-
-    env_module.def("initialize", [](const py::kwargs &args) {
-        // check for valid keys
-        check_kwargs_for_correctness(args, { "backends" });
-        if (args.contains("backends")) {
-            plssvm::environment::initialize(args["backends"].cast<std::vector<plssvm::backend_type>>());
-        } else {
-            plssvm::environment::initialize();
-        } }, "initialize all available backends or only the optionally provided once");
-    env_module.def("initialize", [](std::vector<std::string> cmd_args, const py::kwargs &args) {
-        std::vector<char *> cmd_args_ptr(cmd_args.size());
-        for (std::size_t i = 0; i < cmd_args.size(); ++i) {
-            cmd_args_ptr[i] = cmd_args[i].data();
-        }
-        // assemble command line arguments
-        int argc = static_cast<int>(cmd_args_ptr.size());
-        char **argv = cmd_args_ptr.data();
-
-        // check for valid keys
-        check_kwargs_for_correctness(args, { "backends" });
-        if (args.contains("backends")) {
-            plssvm::environment::initialize(argc, argv, args["backends"].cast<std::vector<plssvm::backend_type>>());
-        } else {
-            plssvm::environment::initialize(argc, argv);
-        } }, "initialize all available backends or only the optionally provided once using the provided command line parameters");
-
-    env_module.def("finalize", [](const py::kwargs &args) {
-        // check for valid keys
-        check_kwargs_for_correctness(args, { "backends" });
-        if (args.contains("backends")) {
-            plssvm::environment::finalize(args["backends"].cast<std::vector<plssvm::backend_type>>());
-        } else {
-            plssvm::environment::finalize();
-        } }, "finalize all available backends or only the optionally provided once");
-
-    // bind plssvm::environment::scope_guard
-    py::class_<plssvm::environment::scope_guard>(env_module, "ScopeGuard")
-        .def(py::init([](const py::kwargs &args) {
-                 // check for valid keys
-                 check_kwargs_for_correctness(args, { "backends" });
-                 if (args.contains("backends")) {
-                     return std::make_unique<plssvm::environment::scope_guard>(args["backends"].cast<std::vector<plssvm::backend_type>>());
-                 } else {
-                     return std::make_unique<plssvm::environment::scope_guard>();
-                 }
-             }),
-             "create a new scope_guard and initialize all available backends or only the optionally provided once")
-        .def(py::init([](std::vector<std::string> cmd_args, const py::kwargs &args) {
-                 std::vector<char *> cmd_args_ptr(cmd_args.size());
-                 for (std::size_t i = 0; i < cmd_args.size(); ++i) {
-                     cmd_args_ptr[i] = cmd_args[i].data();
-                 }
-                 // assemble command line arguments
-                 int argc = static_cast<int>(cmd_args_ptr.size());
-                 char **argv = cmd_args_ptr.data();
-
-                 // check for valid keys
-                 check_kwargs_for_correctness(args, { "backends" });
-                 if (args.contains("backends")) {
-                     return std::make_unique<plssvm::environment::scope_guard>(argc, argv, args["backends"].cast<std::vector<plssvm::backend_type>>());
-                 } else {
-                     return std::make_unique<plssvm::environment::scope_guard>(argc, argv);
-                 }
-             }),
-             "create a new scope_guard and initialize all available backends or only the optionally provided once using the provided command line parameters")
-        .def("backends", &plssvm::environment::scope_guard::backends, "return all initialized backends");
-}
diff --git a/bindings/Python/main.cpp b/bindings/Python/main.cpp
index c3e8b3d14..1c1248fb2 100644
--- a/bindings/Python/main.cpp
+++ b/bindings/Python/main.cpp
@@ -7,6 +7,7 @@
  *          See the LICENSE.md file in the project root for full license information.
  */
 
+#include "plssvm/environment.hpp"            // plssvm::environment::{initialize, finalize}
 #include "plssvm/exceptions/exceptions.hpp"  // plssvm::exception
 
 #include "pybind11/pybind11.h"  // PYBIND11_MODULE, py::module_, py::exception, py::register_exception_translator
@@ -32,7 +33,6 @@ void init_parameter(py::module_ &);
 void init_model(py::module_ &);
 void init_data_set(py::module_ &);
 void init_version(py::module_ &);
-void init_environment(py::module_ &);
 void init_exceptions(py::module_ &, const py::exception<plssvm::exception> &);
 void init_csvm(py::module_ &);
 void init_openmp_csvm(py::module_ &, const py::exception<plssvm::exception> &);
@@ -48,6 +48,14 @@ void init_sklearn(py::module_ &);
 PYBIND11_MODULE(plssvm, m) {
     m.doc() = "Parallel Least Squares Support Vector Machine";
 
+    // automatically initialize the environments
+    plssvm::environment::initialize();
+
+    // automatically finalize the environments
+    m.add_object("_cleanup", py::capsule([]() {
+                     plssvm::environment::finalize();
+                 }));
+
     // register PLSSVM base exception
     static py::exception<plssvm::exception> base_exception(m, "PLSSVMError");
     py::register_exception_translator([](std::exception_ptr p) {
@@ -81,7 +89,6 @@ PYBIND11_MODULE(plssvm, m) {
     init_model(m);
     init_data_set(m);
     init_version(m);
-    init_environment(m);
     init_exceptions(m, base_exception);
     init_csvm(m);
 
diff --git a/examples/python/main.py b/examples/python/main.py
index a200524ff..5efd985ec 100644
--- a/examples/python/main.py
+++ b/examples/python/main.py
@@ -1,9 +1,6 @@
 import plssvm
 from sklearn.metrics import classification_report
 
-# correctly initialize and finalize environments
-environment_guard = plssvm.environment.ScopeGuard()
-
 try:
     # create a new C-SVM parameter set, explicitly overriding the default kernel function
     params = plssvm.Parameter(kernel_type=plssvm.KernelFunctionType.POLYNOMIAL)
diff --git a/examples/python/sklearn_like_svc.py b/examples/python/sklearn_like_svc.py
index 57ab3e148..4b5e5f44f 100644
--- a/examples/python/sklearn_like_svc.py
+++ b/examples/python/sklearn_like_svc.py
@@ -1,9 +1,6 @@
 from sklearn.datasets import make_classification
 import plssvm
 
-# correctly initialize and finalize environments
-environment_guard = plssvm.environment.ScopeGuard()
-
 num_samples = 2**8
 num_features = 2**6
 

From eb45c5d9a223ac0e3463a6e2861b01a115b0fb9a Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Thu, 28 Nov 2024 17:19:18 +0100
Subject: [PATCH 120/123] Fix Doxygen documentation warnings.

---
 docs/resources/dirs.dox                                   | 4 ++--
 .../detail/constexpr_available_execution_spaces.hpp       | 7 +++++--
 include/plssvm/environment.hpp                            | 8 ++++++--
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/docs/resources/dirs.dox b/docs/resources/dirs.dox
index 4c59aa37f..fd23efcbc 100644
--- a/docs/resources/dirs.dox
+++ b/docs/resources/dirs.dox
@@ -400,7 +400,7 @@
  * @dir include/plssvm/backends/HPX/detail
  * @author Alexander Van Craen
  * @author Marcel Breyer
- * @authir Alexander Strack
+ * @author Alexander Strack
  * @copyright 2018-today The PLSSVM project - All Rights Reserved
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  *          See the LICENSE.md file in the project root for full license information.
@@ -436,7 +436,7 @@
  * @dir include/plssvm/backends/HPX/kernel/cg_implicit
  * @author Alexander Van Craen
  * @author Marcel Breyer
- * @author Alexander Strack 
+ * @author Alexander Strack
  * @copyright 2018-today The PLSSVM project - All Rights Reserved
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  *          See the LICENSE.md file in the project root for full license information.
diff --git a/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
index 92f908fa7..80d3f8cd9 100644
--- a/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
+++ b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp
@@ -13,8 +13,11 @@
 #ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_
 #define PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_
 
-// if the variable isn't set, no Kokkos execution space is available
-// -> explicitly set it to 0!
+/**
+ * @def PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES
+ * @brief Set the macro `PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES` to 0 if it isn't already defined, i.e., no Kokkos execution space is available.
+ *        Will normally be propagated by CMake with the number of available Kokkos execution spaces.
+ */
 #if !defined(PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES)
     #define PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES 0
 #endif
diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp
index 60573738d..68b36053b 100644
--- a/include/plssvm/environment.hpp
+++ b/include/plssvm/environment.hpp
@@ -458,7 +458,8 @@ inline std::vector<backend_type> finalize() {
 class [[nodiscard]] scope_guard {
   public:
     /**
-     * @copydoc initialize()
+     * @brief Initialize all **available** backends.
+     * @details Only initializes backends that are currently uninitialized.
      */
     scope_guard() {
         backends_ = initialize();
@@ -473,7 +474,10 @@ class [[nodiscard]] scope_guard {
     }
 
     /**
-     * @copydoc initialize(int &, char **)
+     * @brief Initialize all **available** backends.
+     * @details Only initializes backends that are currently uninitialized.
+     * @param[in,out] argc the number of provided command line arguments
+     * @param[in,out] argv the provided command line arguments
      */
     scope_guard(int &argc, char **argv) {
         backends_ = initialize(argc, argv);

From 5bd2828a03b5b2cb4dafe76ca8d85a284697a723 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 3 Dec 2024 12:37:34 +0100
Subject: [PATCH 121/123] Correctly implement is_initialization_necessary for
 HPX and Kokkos.

---
 include/plssvm/environment.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp
index 68b36053b..d04e1a277 100644
--- a/include/plssvm/environment.hpp
+++ b/include/plssvm/environment.hpp
@@ -189,7 +189,7 @@ template <auto is_initialized_function, auto is_finalized_function>
 constexpr bool is_initialization_necessary([[maybe_unused]] const backend_type backend) {
     // Note: must be implemented for the backends that need environmental setup
     // currently false for all available backends
-    return false;
+    return backend == backend_type::hpx || backend == backend_type::kokkos;
 }
 
 //****************************************************************************//

From 6bb72615a9568df2eb0a29e7386e9136f813c6d2 Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 3 Dec 2024 14:04:57 +0100
Subject: [PATCH 122/123] Add missing includes and update formatting.

---
 include/plssvm/environment.hpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp
index d04e1a277..cddb3f31c 100644
--- a/include/plssvm/environment.hpp
+++ b/include/plssvm/environment.hpp
@@ -30,16 +30,17 @@
 #endif
 
 #include "fmt/base.h"     // fmt::formatter
+#include "fmt/format.h"   // fmt::format
 #include "fmt/ostream.h"  // fmt::ostream_formatter
 #include "fmt/ranges.h"   // fmt::join
 
-#include <ios>      // std::ios::failbit
-#include <istream>  // std::istream
-#include <ostream>  // std::ostream
-#include <string>   // std::string
-#include <vector>   // std::vector
-
-
+#include <algorithm>  // std::remove_if
+#include <ios>        // std::ios::failbit
+#include <istream>    // std::istream
+#include <ostream>    // std::ostream
+#include <string>     // std::string
+#include <utility>    // std::move
+#include <vector>     // std::vector
 
 namespace plssvm::environment {
 

From cd6d31ce935ff9fc8ed3a5b15e29178125093c3e Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 3 Dec 2024 17:27:27 +0100
Subject: [PATCH 123/123] Specify latest supported Kokkos release.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 52d95803f..0091ad85c 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ The main highlights of our SVM implementations are:
    - [HIP](https://github.com/ROCm-Developer-Tools/HIP)
    - [OpenCL](https://www.khronos.org/opencl/)
    - [SYCL](https://www.khronos.org/sycl/) (supported implementations are [DPC++](https://github.com/intel/llvm) and [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp) (formerly known as hipSYCL); specifically the versions [sycl-nightly/20231201](https://github.com/intel/llvm/tree/sycl-nightly/20230110) and AdaptiveCpp release [v24.06.0](https://github.com/AdaptiveCpp/AdaptiveCpp/releases/tag/v23.10.0))
-   - [Kokkos](https://github.com/kokkos/kokkos) (all execution spaces supported except `OpenMPTarget` and `OpenACC`); specifically the version [d50de97](https://github.com/kokkos/kokkos/commit/d50de979b4d095dc32dba80f72a5e009f3615db1)
+   - [Kokkos](https://github.com/kokkos/kokkos) (all execution spaces supported except `OpenMPTarget` and `OpenACC`); specifically the version [4.5.00](https://github.com/kokkos/kokkos/releases/tag/4.5.00)
 3. Six different kernel functions to be able to classify a large variety of different problems:
    - linear: $\vec{u}^T$ $\cdot$ $\vec{v}$
    - polynomial: $(\gamma$ $\cdot$ $\vec{u}^T$ $\cdot$ $\vec{v}$ $+$ $coef0)^{d}$