diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index df8d453a41d57..c3a263d2d072a 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -1725,7 +1725,51 @@ second operand, respectively. These three APIs return a single 32-bit value with
 the accumulated result, which is unsigned if both operands are `uint32_t` and
 signed otherwise.
 
+Various maths functions are defined to operate on any floating point type.
+`syclcompat::is_floating_point_v` extends the standard library's
+`std::is_floating_point_v` to include `sycl::half` and, where available,
+`sycl::ext::oneapi::bfloat16`. The current version of SYCLcompat also provides
+specializations of `std::common_type` for `sycl::ext::oneapi::bfloat16`,
+though these will be moved to the `sycl_ext_oneapi_bfloat16` extension in
+the future.
+
+```cpp
+namespace std {
+template <> struct common_type<sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <>
+struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+} // namespace std
+```
+
 ```cpp
+namespace syclcompat{
+
+// Trait for extended floating point definition
+template <typename T>
+struct is_floating_point : std::is_floating_point<T>{};
+
+template <> struct is_floating_point<sycl::half> : std::true_type {};
+
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+template <> struct is_floating_point<sycl::ext::oneapi::bfloat16> : std::true_type {};
+#endif
+
+template <typename T>
+inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
+
 inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
                                   unsigned int shift); 
 
@@ -1752,11 +1796,9 @@ inline std::enable_if_t<ValueT::size() == 2, ValueT> isnan(const ValueT a);
 // cbrt function wrapper.
 template <typename ValueT>
 inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
+                            std::is_same_v<ValueT, sycl::half>,
                         ValueT>
-cbrt(ValueT val) {
-  return sycl::cbrt(static_cast<ValueT>(val));
-}
+cbrt(ValueT val);
 
 // For floating-point types, `float` or `double` arguments are acceptable.
 // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
@@ -1794,6 +1836,10 @@ template <typename ValueT, typename ValueU>
 inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
 fmax_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b);
 
+template <typename ValueT, typename ValueU>
+inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
+fmax_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b);
+
 // Performs 2 elements comparison and returns the smaller one. If either of
 // inputs is NaN, then return NaN.
 template <typename ValueT, typename ValueU>
@@ -1803,6 +1849,10 @@ template <typename ValueT, typename ValueU>
 inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
 fmin_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b);
 
+template <typename ValueT, typename ValueU>
+inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
+fmin_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b);
+
 inline float pow(const float a, const int b) { return sycl::pown(a, b); }
 inline double pow(const double a, const int b) { return sycl::pown(a, b); }
 
@@ -1863,14 +1913,13 @@ unordered_compare_both(const ValueT a, const ValueT b,
                        const BinaryOperation binary_op);
 
 template <typename ValueT, class BinaryOperation>
-inline unsigned compare_mask(const sycl::vec<ValueT, 2> a,
-                             const sycl::vec<ValueT, 2> b,
-                             const BinaryOperation binary_op);
+inline std::enable_if_t<ValueT::size() == 2, unsigned>
+compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op);
 
 template <typename ValueT, class BinaryOperation>
-inline unsigned unordered_compare_mask(const sycl::vec<ValueT, 2> a,
-                                       const sycl::vec<ValueT, 2> b,
-                                       const BinaryOperation binary_op);
+inline std::enable_if_t<ValueT::size() == 2, unsigned>
+unordered_compare_mask(const ValueT a, const ValueT b,
+                       const BinaryOperation binary_op);
 
 template <typename S, typename T> inline T vectorized_max(T a, T b);
 
@@ -1924,6 +1973,7 @@ inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
 template <typename T1, typename T2>
 inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b,
                                       dot_product_acc_t<T1, T2> c);
+} // namespace syclcompat
 ```
 
 `vectorized_binary` computes the `BinaryOperation` for two operands,
diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 785d95f6f2404..a3ee2b2085788 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -31,12 +31,19 @@
 
 #pragma once
 
+#include <sycl/feature_test.hpp>
+#include <type_traits>
+
+// TODO(syclcompat-lib-reviewers): this should not be required
 #ifndef SYCL_EXT_ONEAPI_COMPLEX
 #define SYCL_EXT_ONEAPI_COMPLEX
 #endif
 
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
 #include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
+#endif
 #include <sycl/ext/oneapi/experimental/complex/complex.hpp>
+#include <syclcompat/traits.hpp>
 
 namespace syclcompat {
 namespace detail {
@@ -46,18 +53,25 @@ namespace complex_namespace = sycl::ext::oneapi::experimental;
 template <typename ValueT>
 using complex_type = detail::complex_namespace::complex<ValueT>;
 
+template <typename T>
+constexpr bool is_int32_type = std::is_same_v<std::decay_t<T>, int32_t> ||
+  std::is_same_v<std::decay_t<T>, uint32_t>;
+
+// Helper constexpr bool to avoid ugly macros where possible
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+constexpr bool support_bfloat16_math = true;
+#else
+constexpr bool support_bfloat16_math = false;
+#endif
+
 template <typename ValueT>
 inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) {
   return sycl::clamp(val, min_val, max_val);
 }
-
-template <typename T>
-constexpr bool is_int32_type = std::is_same_v<std::decay_t<T>, int32_t> ||
-                               std::is_same_v<std::decay_t<T>, uint32_t>;
-
 #ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-// TODO: Follow the process to add this to the extension. If added,
-// remove this functionality from the header.
+// TODO(syclcompat-lib-reviewers): Follow the process to add this (& other math
+// fns) to the bfloat16 math function extension. If added, remove this
+// functionality from the header.
 template <>
 inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val,
                                          sycl::ext::oneapi::bfloat16 min_val,
@@ -68,6 +82,28 @@ inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val,
     return max_val;
   return val;
 }
+
+template <typename T, int Size>
+inline std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
+                        sycl::vec<T, Size>>
+clamp(sycl::vec<T, Size> val, sycl::vec<T, Size> min_val,
+      sycl::vec<T, Size> max_val) {
+  return [&val, &min_val, &max_val]<int... I>(std::integer_sequence<int, I...>) {
+    return sycl::vec<T, Size>{
+        clamp<sycl::ext::oneapi::bfloat16>(val[I], min_val[I], max_val[I])...};
+  }(std::make_integer_sequence<int, Size>{});
+}
+
+template <typename T, std::size_t Size>
+inline std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
+                        sycl::marray<T, Size>>
+clamp(sycl::marray<T, Size> val, sycl::marray<T, Size> min_val,
+      sycl::marray<T, Size> max_val) {
+  return [&val, &min_val, &max_val]<std::size_t... I>(std::index_sequence<I...>) {
+    return sycl::marray<T, Size>{
+        clamp<sycl::ext::oneapi::bfloat16>(val[I], min_val[I], max_val[I])...};
+  }(std::make_index_sequence<Size>{});
+}
 #endif
 
 template <typename VecT, class BinaryOperation, class = void>
@@ -218,13 +254,13 @@ inline constexpr RetT extend_vbinary4(AT a, BT b, RetT c,
 }
 
 template <typename ValueT> inline bool isnan(const ValueT a) {
-  return sycl::isnan(a);
-}
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-inline bool isnan(const sycl::ext::oneapi::bfloat16 a) {
-  return sycl::ext::oneapi::experimental::isnan(a);
+  if constexpr (std::is_same_v<ValueT, sycl::ext::oneapi::bfloat16>) {
+    static_assert(detail::support_bfloat16_math);
+    return sycl::ext::oneapi::experimental::isnan(a);
+  } else {
+    return sycl::isnan(a);
+  }
 }
-#endif
 
 // FIXME(syclcompat-lib-reviewers): move bfe outside detail once perf is
 // improved & semantics understood
@@ -543,9 +579,8 @@ unordered_compare_both(const ValueT a, const ValueT b,
 /// \param [in] binary_op functor that implements the binary operation
 /// \returns the comparison result
 template <typename ValueT, class BinaryOperation>
-inline unsigned compare_mask(const sycl::vec<ValueT, 2> a,
-                             const sycl::vec<ValueT, 2> b,
-                             const BinaryOperation binary_op) {
+inline std::enable_if_t<ValueT::size() == 2, unsigned>
+compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op) {
   // Since compare returns 0 or 1, -compare will be 0x00000000 or 0xFFFFFFFF
   return ((-compare(a[0], b[0], binary_op)) << 16) |
          ((-compare(a[1], b[1], binary_op)) & 0xFFFF);
@@ -559,9 +594,9 @@ inline unsigned compare_mask(const sycl::vec<ValueT, 2> a,
 /// \param [in] binary_op functor that implements the binary operation
 /// \returns the comparison result
 template <typename ValueT, class BinaryOperation>
-inline unsigned unordered_compare_mask(const sycl::vec<ValueT, 2> a,
-                                       const sycl::vec<ValueT, 2> b,
-                                       const BinaryOperation binary_op) {
+inline std::enable_if_t<ValueT::size() == 2, unsigned>
+unordered_compare_mask(const ValueT a, const ValueT b,
+                       const BinaryOperation binary_op) {
   return ((-unordered_compare(a[0], b[0], binary_op)) << 16) |
          ((-unordered_compare(a[1], b[1], binary_op)) & 0xFFFF);
 }
@@ -687,7 +722,7 @@ inline std::enable_if_t<ValueT::size() == 2, ValueT> isnan(const ValueT a) {
 /// cbrt function wrapper.
 template <typename ValueT>
 inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
+                            std::is_same_v<ValueT, sycl::half>,
                         ValueT>
 cbrt(ValueT val) {
   return sycl::cbrt(static_cast<ValueT>(val));
@@ -697,7 +732,7 @@ cbrt(ValueT val) {
 // For floating-point types, `float` or `double` arguments are acceptable.
 // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
 // `std::int64_t` type arguments are acceptable.
-// sycl::half supported as well.
+// sycl::half supported as well, and sycl::ext::oneapi::bfloat16 if available.
 template <typename ValueT, typename ValueU>
 inline std::enable_if_t<std::is_integral_v<ValueT> &&
                             std::is_integral_v<ValueU>,
@@ -706,15 +741,23 @@ min(ValueT a, ValueU b) {
   return sycl::min(static_cast<std::common_type_t<ValueT, ValueU>>(a),
                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
 }
+
 template <typename ValueT, typename ValueU>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> &&
-                            std::is_floating_point_v<ValueU>,
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT> &&
+                            syclcompat::is_floating_point_v<ValueU>,
                         std::common_type_t<ValueT, ValueU>>
 min(ValueT a, ValueU b) {
-  return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
+                               sycl::ext::oneapi::bfloat16>) {
+    static_assert(detail::support_bfloat16_math);
+    return sycl::ext::oneapi::experimental::fmin(
+        static_cast<std::common_type_t<ValueT, ValueU>>(a),
+        static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  } else {
+    return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
+                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  }
 }
-inline sycl::half min(sycl::half a, sycl::half b) { return sycl::fmin(a, b); }
 
 template <typename ValueT, typename ValueU>
 inline std::enable_if_t<std::is_integral_v<ValueT> &&
@@ -725,14 +768,21 @@ max(ValueT a, ValueU b) {
                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
 }
 template <typename ValueT, typename ValueU>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> &&
-                            std::is_floating_point_v<ValueU>,
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT> &&
+                            syclcompat::is_floating_point_v<ValueU>,
                         std::common_type_t<ValueT, ValueU>>
 max(ValueT a, ValueU b) {
-  return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
+                               sycl::ext::oneapi::bfloat16>) {
+    static_assert(detail::support_bfloat16_math);
+    return sycl::ext::oneapi::experimental::fmax(
+        static_cast<std::common_type_t<ValueT, ValueU>>(a),
+        static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  } else {
+    return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
+                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  }
 }
-inline sycl::half max(sycl::half a, sycl::half b) { return sycl::fmax(a, b); }
 
 /// Performs 2 elements comparison and returns the bigger one. If either of
 /// inputs is NaN, then return NaN.
@@ -744,15 +794,21 @@ inline std::common_type_t<ValueT, ValueU> fmax_nan(const ValueT a,
                                                    const ValueU b) {
   if (detail::isnan(a) || detail::isnan(b))
     return NAN;
-  return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  return syclcompat::max(a, b);
 }
+
 template <typename ValueT, typename ValueU>
 inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
 fmax_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b) {
   return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
 }
 
+template <typename ValueT, typename ValueU>
+inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
+fmax_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b) {
+  return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
+}
+
 /// Performs 2 elements comparison and returns the smaller one. If either of
 /// inputs is NaN, then return NaN.
 /// \param [in] a The first value
@@ -763,15 +819,21 @@ inline std::common_type_t<ValueT, ValueU> fmin_nan(const ValueT a,
                                                    const ValueU b) {
   if (detail::isnan(a) || detail::isnan(b))
     return NAN;
-  return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  return syclcompat::min(a,b);
 }
+
 template <typename ValueT, typename ValueU>
 inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
 fmin_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b) {
   return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
 }
 
+template <typename ValueT, typename ValueU>
+inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
+fmin_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b) {
+  return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
+}
+
 // pow functions overload.
 inline float pow(const float a, const int b) { return sycl::pown(a, b); }
 inline double pow(const double a, const int b) { return sycl::pown(a, b); }
@@ -781,10 +843,10 @@ inline typename std::enable_if_t<std::is_floating_point_v<ValueT>, ValueT>
 pow(const ValueT a, const ValueU b) {
   return sycl::pow(a, static_cast<ValueT>(b));
 }
-
-// TODO: calling pow with non-floating point values is currently defaulting to
-// double, which fails on devices without aspect::fp64. This has to be properly
-// documented, and maybe changed to support all devices.
+// TODO(syclcompat-lib-reviewers): calling pow with non-floating point values
+// is currently defaulting to double, which fails on devices without
+// aspect::fp64. This has to be properly documented, and maybe changed to
+// support all devices.
 template <typename ValueT, typename ValueU>
 inline typename std::enable_if_t<!std::is_floating_point_v<ValueT>, double>
 pow(const ValueT a, const ValueU b) {
@@ -795,24 +857,20 @@ pow(const ValueT a, const ValueU b) {
 /// \param [in] a The input value
 /// \returns the relu saturation result
 template <typename ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
-                        ValueT>
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>, ValueT>
 relu(const ValueT a) {
   if (!detail::isnan(a) && a < ValueT(0))
     return ValueT(0);
   return a;
 }
 template <class ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>,
                         sycl::vec<ValueT, 2>>
 relu(const sycl::vec<ValueT, 2> a) {
   return {relu(a[0]), relu(a[1])};
 }
 template <class ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>,
                         sycl::marray<ValueT, 2>>
 relu(const sycl::marray<ValueT, 2> a) {
   return {relu(a[0]), relu(a[1])};
diff --git a/sycl/include/syclcompat/traits.hpp b/sycl/include/syclcompat/traits.hpp
index 2f389ccf79484..7ed4d765251bc 100644
--- a/sycl/include/syclcompat/traits.hpp
+++ b/sycl/include/syclcompat/traits.hpp
@@ -22,6 +22,10 @@
 
 #pragma once
 
+#include <sycl/feature_test.hpp>
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+#include <sycl/ext/oneapi/bfloat16.hpp>
+#endif
 #include <cstddef>
 #include <sycl/ext/oneapi/properties/properties.hpp>
 #include <sycl/ext/oneapi/properties/property_value.hpp>
@@ -250,4 +254,46 @@ using are_all_props = std::conjunction<
 
 } // namespace experimental::detail
 
+// Trait for extended floating point definition: true for the standard
+// floating point types, for sycl::half and, when the bfloat16 math functions
+// extension is available, for sycl::ext::oneapi::bfloat16.
+template <typename T>
+struct is_floating_point : std::is_floating_point<T>{};
+
+template <> struct is_floating_point<sycl::half> : std::true_type {};
+
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+template <> struct is_floating_point<sycl::ext::oneapi::bfloat16> : std::true_type {};
+#endif
+
+template <typename T>
+inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
+
 } // namespace syclcompat
+
+// Specialize std::common_type for bfloat16
+// Semantics here match bfloat16.hpp operator overloads (all mixed type math
+// ops return bfloat16)
+// Guarded by the same feature macro as the bfloat16.hpp include above, so the
+// type is only named when its header has been included.
+// TODO(syclcompat-lib-reviewers): Move this to bfloat extension
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+namespace std {
+template <> struct common_type<sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <>
+struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+} // namespace std
+#endif // SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
diff --git a/sycl/test-e2e/syclcompat/common.hpp b/sycl/test-e2e/syclcompat/common.hpp
index 368089e89e85a..ff840c98209bd 100644
--- a/sycl/test-e2e/syclcompat/common.hpp
+++ b/sycl/test-e2e/syclcompat/common.hpp
@@ -22,6 +22,10 @@
 
 #pragma once
 
+#include <sycl/feature_test.hpp>
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+#include <sycl/ext/oneapi/bfloat16.hpp>
+#endif
 #include <sycl/half_type.hpp>
 #include <tuple>
 
@@ -44,8 +48,42 @@ template <typename Tuple, typename Func> void instantiate_all_types(Func &&f) {
     f<T>();                                                                    \
   });
 
+#define INSTANTIATE_ALL_CONTAINER_TYPES(tuple, container, f)                   \
+  instantiate_all_types<tuple>([](auto index) {                                \
+    using T = std::tuple_element_t<decltype(index)::value, tuple>;             \
+    f<container, T>();                                                         \
+  });
+
 using value_type_list =
-    std::tuple<int, unsigned int, short, unsigned short, long, unsigned long,
-               long long, unsigned long long, float, double, sycl::half>;
+    std::tuple<char, signed char, unsigned char, int, unsigned int, short,
+               unsigned short, long, unsigned long, long long,
+               unsigned long long, float, double, sycl::half
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+               ,sycl::ext::oneapi::bfloat16
+#endif
+>;
+
+using fp_type_list_no_bfloat16 = std::tuple<float, double, sycl::half>;
+
+using fp_type_list = std::tuple<float, double, sycl::half
+
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+                ,sycl::ext::oneapi::bfloat16
+#endif
+>;
 
-using fp_type_list = std::tuple<float, double, sycl::half>;
+using marray_type_list =
+    std::tuple<char, signed char, short, int, long, long long, unsigned char,
+               unsigned short, unsigned int, unsigned long, unsigned long long,
+               float, double, sycl::half
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+              , sycl::ext::oneapi::bfloat16
+#endif
+>;
+using vec_type_list = std::tuple<int8_t, int16_t, int32_t, int64_t, uint8_t,
+                                 uint16_t, uint32_t, uint64_t, float, double,
+                                 sycl::half
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+              , sycl::ext::oneapi::bfloat16
+#endif
+>;
diff --git a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
index a22d54474d9ed..41f9a8cbee747 100644
--- a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
+++ b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
@@ -58,14 +58,19 @@ void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
   constexpr size_t num_elements = memsize / sizeof(T);
   T *typed_local_mem = reinterpret_cast<T *>(local_mem);
 
-  const int id =
-      sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id();
-  if (id < num_elements) {
-    typed_local_mem[id] = static_cast<T>(id);
-  }
-  syclcompat::wg_barrier();
-  if (id < num_elements) {
-    data[id] = typed_local_mem[num_elements - id - 1];
+  const int local_id =
+      sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_local_linear_id();
+  const int group_id =
+      sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_group_linear_id();
+  // Only operate in first work-group
+  if (group_id == 0) {
+    if (local_id < num_elements) {
+      typed_local_mem[local_id] = static_cast<T>(local_id);
+    }
+    syclcompat::wg_barrier();
+    if (local_id < num_elements) {
+      data[local_id] = typed_local_mem[num_elements - local_id - 1];
+    }
   }
 };
 
diff --git a/sycl/test-e2e/syclcompat/math/math_compare.cpp b/sycl/test-e2e/syclcompat/math/math_compare.cpp
index c42b22b199888..0f77160a564e7 100644
--- a/sycl/test-e2e/syclcompat/math/math_compare.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_compare.cpp
@@ -56,7 +56,7 @@ template <typename ValueT> void test_compare() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(1.0);
+  const ValueT op1 = static_cast<ValueT>(1.0);
   ValueT op2 = sycl::nan(static_cast<unsigned int>(0));
 
   //  1.0 == 1.0 -> True
@@ -96,13 +96,14 @@ void compare_not_equal_vec_kernel(Container *a, Container *b, Container *r) {
   *r = syclcompat::compare(*a, *b, std::not_equal_to<>());
 }
 
-template <typename ValueT> void test_compare_vec() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_compare_vec() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
   Container op2 = {static_cast<ValueT>(1.0),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -110,12 +111,12 @@ template <typename ValueT> void test_compare_vec() {
   // bool2 does not exist, 1.0 and 0.0 floats are used for true
   // and false instead.
   //  1.0 == 1.0, 2.0 == NaN -> {true, false}
-  constexpr Container res1 = {1.0, 0.0};
+  const Container res1 = {1.0, 0.0};
   BinaryOpTestLauncher<Container, Container>(grid, threads)
       .template launch_test<compare_equal_vec_kernel<Container>>(op1, op2,
                                                                  res1);
   //  1.0 != 1.0, 2.0 != NaN -> {false, false}
-  constexpr Container res2 = {0.0, 0.0};
+  const Container res2 = {0.0, 0.0};
   BinaryOpTestLauncher<Container, Container>(grid, threads)
       .template launch_test<compare_not_equal_vec_kernel<Container>>(op1, op2,
                                                                      res2);
@@ -137,7 +138,7 @@ void test_unordered_compare() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(1.0);
+  const ValueT op1 = static_cast<ValueT>(1.0);
   ValueT op2 = sycl::nan(static_cast<unsigned int>(0));
 
   // Unordered comparison checks if either operand is NaN, or the binaryop holds
@@ -177,13 +178,14 @@ void unordered_compare_not_equal_vec_kernel(Container *a, Container *b,
   *r = syclcompat::unordered_compare(*a, *b, std::not_equal_to<>());
 }
 
-template <typename ValueT> void test_unordered_compare_vec() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_unordered_compare_vec() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
   Container op2 = {static_cast<ValueT>(1.0),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -191,12 +193,12 @@ template <typename ValueT> void test_unordered_compare_vec() {
   // bool2 does not exist, 1.0 and 0.0 floats are used for true
   // and false instead.
   //  1.0 == 1.0, 2.0 == NaN -> {true, true}
-  constexpr Container res1 = {1.0, 1.0};
+  const Container res1 = {1.0, 1.0};
   BinaryOpTestLauncher<Container, Container>(grid, threads)
       .template launch_test<unordered_compare_equal_vec_kernel<Container>>(
           op1, op2, res1);
   //  1.0 != 1.0, 2.0 != NaN -> {false, true}
-  constexpr Container res2 = {0.0, 1.0};
+  const Container res2 = {0.0, 1.0};
   BinaryOpTestLauncher<Container, Container>(grid, threads)
       .template launch_test<unordered_compare_not_equal_vec_kernel<Container>>(
           op1, op2, res2);
@@ -207,13 +209,14 @@ void compare_both_kernel(Container *a, Container *b, bool *r) {
   *r = syclcompat::compare_both(*a, *b, std::equal_to<>());
 }
 
-template <typename ValueT> void test_compare_both() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_compare_both() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
   Container op2 = {static_cast<ValueT>(1.0),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -236,13 +239,14 @@ void unordered_compare_both_kernel(Container *a, Container *b, bool *r) {
   *r = syclcompat::unordered_compare_both(*a, *b, std::equal_to<>());
 }
 
-template <typename ValueT> void test_unordered_compare_both() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_unordered_compare_both() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
   Container op2 = {static_cast<ValueT>(1.0),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -266,19 +270,20 @@ void compare_mask_kernel(Container *a, Container *b, unsigned *r) {
   *r = syclcompat::compare_mask(*a, *b, std::equal_to<>());
 }
 
-template <typename ValueT> void test_compare_mask() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_compare_mask() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
-  constexpr Container op2 = {static_cast<ValueT>(2.0),
+  const Container op2 = {static_cast<ValueT>(2.0),
                              static_cast<ValueT>(1.0)};
-  constexpr Container op3 = {static_cast<ValueT>(1.0),
+  const Container op3 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(3.0)};
-  constexpr Container op4 = {static_cast<ValueT>(3.0),
+  const Container op4 = {static_cast<ValueT>(3.0),
                              static_cast<ValueT>(2.0)};
   Container op5 = {sycl::nan(static_cast<unsigned int>(0)),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -314,19 +319,20 @@ void unordered_compare_mask_kernel(Container *a, Container *b, unsigned *r) {
   *r = syclcompat::unordered_compare_mask(*a, *b, std::equal_to<>());
 }
 
-template <typename ValueT> void test_unordered_compare_mask() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_unordered_compare_mask() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
-  constexpr Container op2 = {static_cast<ValueT>(2.0),
+  const Container op2 = {static_cast<ValueT>(2.0),
                              static_cast<ValueT>(1.0)};
-  constexpr Container op3 = {static_cast<ValueT>(1.0),
+  const Container op3 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(3.0)};
-  constexpr Container op4 = {static_cast<ValueT>(3.0),
+  const Container op4 = {static_cast<ValueT>(3.0),
                              static_cast<ValueT>(2.0)};
   Container op5 = {sycl::nan(static_cast<unsigned int>(0)),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -360,12 +366,18 @@ template <typename ValueT> void test_unordered_compare_mask() {
 int main() {
   INSTANTIATE_ALL_TYPES(fp_type_list, test_compare);
   INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_compare_vec);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare_vec);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_compare_both);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare_both);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_compare_mask);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare_mask);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_vec);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_vec);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_vec);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_vec);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_both);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_both);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_both);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_both);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_mask);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_mask);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_mask);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_mask);
 
   return 0;
 }
diff --git a/sycl/test-e2e/syclcompat/math/math_complex.cpp b/sycl/test-e2e/syclcompat/math/math_complex.cpp
index d8bedb7082f97..27e2bf8af8f71 100644
--- a/sycl/test-e2e/syclcompat/math/math_complex.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_complex.cpp
@@ -50,6 +50,15 @@ template <typename T> bool check(T x, float *e) {
   return false;
 }
 
+template <typename T> bool check(sycl::marray<T, 2> x, float *e) {
+  // Passes only when both components of x lie strictly within
+  // ERROR_TOLERANCE of the expected pair stored at e[0] and e[1].
+  const float tol = ERROR_TOLERANCE;
+  const auto diff0 = x[0] - e[0];
+  const auto diff1 = x[1] - e[1];
+  return (diff0 < tol) && (diff0 > -tol) && (diff1 < tol) && (diff1 > -tol);
+}
+
 template <> bool check<float>(float x, float *e) {
   float precision = ERROR_TOLERANCE;
   if ((x - e[0] < precision) && (x - e[0] > -precision)) {
@@ -206,10 +215,10 @@ void kernel_mul_add(int *result) {
   m_f2 = sycl::marray<float, 2>(-3.6, 4.5);
   m_f3 = sycl::marray<float, 2>(1.0, -1.0);
 
-  auto a3 = syclcompat::cmul_add(d1, d2, d3);
+  auto a3 = syclcompat::cmul_add(m_d1, m_d2, m_d3);
   r = r && check(a3, expect);
 
-  auto a4 = syclcompat::cmul_add(f1, f2, f3);
+  auto a4 = syclcompat::cmul_add(m_f1, m_f2, m_f3);
   r = r && check(a4, expect + 2);
 
   *result = r;
diff --git a/sycl/test-e2e/syclcompat/math/math_fixt.hpp b/sycl/test-e2e/syclcompat/math/math_fixt.hpp
index 8b395f94faca1..cacd6ea1fb32c 100644
--- a/sycl/test-e2e/syclcompat/math/math_fixt.hpp
+++ b/sycl/test-e2e/syclcompat/math/math_fixt.hpp
@@ -51,21 +51,35 @@ static constexpr bool contained_is_floating_point_v = false;
 template <typename Container>
 static constexpr bool contained_is_floating_point_v<
     Container, std::void_t<typename Container::value_type>> =
-    std::is_floating_point_v<typename Container::value_type> ||
-    std::is_same_v<typename Container::value_type, sycl::half>;
+    syclcompat::is_floating_point_v<typename Container::value_type>;
 
-template <typename ValueT> struct should_skip {
+template <typename... Ts> struct container_common_type;
+// Same container template and size on both sides: combine the element types.
+template <template <typename, int> typename Container, typename T, typename U,
+          int Size>
+struct container_common_type<Container<T, Size>, Container<U, Size>> {
+  using type = Container<std::common_type_t<T, U>, Size>;
+};
+// Any other pair of types: defer directly to std::common_type_t.
+template <typename T, typename U> struct container_common_type<T, U> {
+  using type = std::common_type_t<T, U>;
+};
+// Convenience alias for container_common_type<T, U>::type.
+template <typename T, typename U>
+using container_common_type_t = typename container_common_type<T, U>::type;
+
+template <typename ...ValueT> struct should_skip {
   bool operator()(const sycl::device &dev) const {
-    if constexpr (std::is_same_v<ValueT, double> ||
-                  contained_is_same_v<ValueT, double>) {
+    if constexpr ((std::is_same_v<ValueT, double> || ...) ||
+                  (contained_is_same_v<ValueT, double> || ...)) {
       if (!dev.has(sycl::aspect::fp64)) {
         std::cout << "  sycl::aspect::fp64 not supported by the SYCL device."
                   << std::endl;
         return true;
       }
     }
-    if constexpr (std::is_same_v<ValueT, sycl::half> ||
-                  contained_is_same_v<ValueT, sycl::half>) {
+    if constexpr ((std::is_same_v<ValueT, sycl::half> || ...) ||
+                  (contained_is_same_v<ValueT, sycl::half> || ...)) {
       if (!dev.has(sycl::aspect::fp16)) {
         std::cout << "  sycl::aspect::fp16 not supported by the SYCL device."
                   << std::endl;
@@ -79,17 +93,24 @@ template <typename ValueT> struct should_skip {
 #define CHECK(ResultT, RESULT, EXPECTED)                                       \
   if constexpr (std::is_integral_v<ResultT>) {                                 \
     assert(RESULT == EXPECTED);                                                \
-  } else if constexpr (std::is_floating_point_v<ResultT> ||                    \
-                       std::is_same_v<ResultT, sycl::half>) {                  \
-    if (sycl::isnan(RESULT))                                                   \
-      assert(sycl::isnan(EXPECTED));                                           \
+  } else if constexpr (contained_is_integral_v<ResultT>) {                     \
+    for (size_t i = 0; i < RESULT.size(); i++)                                 \
+      assert(RESULT[i] == EXPECTED[i]);                                        \
+  } else if constexpr (syclcompat::is_floating_point_v<ResultT>) {             \
+    if (syclcompat::detail::isnan(RESULT))                                     \
+      assert(syclcompat::detail::isnan(EXPECTED));                             \
     else                                                                       \
       assert(fabs(RESULT - EXPECTED) < ERROR_TOLERANCE);                       \
   } else if constexpr (contained_is_floating_point_v<ResultT>) {               \
-    for (size_t i = 0; i < RESULT.size(); i++)                                 \
-      assert(fabs(RESULT[i] - EXPECTED[i]) < ERROR_TOLERANCE);                 \
+    for (size_t i = 0; i < RESULT.size(); i++) {                               \
+      if (syclcompat::detail::isnan(RESULT[i])) {                              \
+        assert(syclcompat::detail::isnan(EXPECTED[i]));                        \
+      } else {                                                                 \
+        assert(fabs(RESULT[i] - EXPECTED[i]) < ERROR_TOLERANCE);               \
+      }                                                                        \
+    }                                                                          \
   } else {                                                                     \
-    static_assert(0, "Math_fixt.hpp should not have arrived here.");           \
+    static_assert(0, "math_fixt.hpp should not have arrived here.");           \
   }
 
 class OpTestLauncher {
@@ -107,7 +128,7 @@ class OpTestLauncher {
 
 // Templated ResultT to support both arithmetic and boolean operators
 template <typename ValueT, typename ValueU,
-          typename ResultT = std::common_type_t<ValueT, ValueU>>
+          typename ResultT = container_common_type_t<ValueT, ValueU>>
 class BinaryOpTestLauncher : OpTestLauncher {
 protected:
   ValueT *op1_;
@@ -118,9 +139,9 @@ class BinaryOpTestLauncher : OpTestLauncher {
   BinaryOpTestLauncher(const syclcompat::dim3 &grid,
                        const syclcompat::dim3 &threads,
                        const size_t data_size = 1)
-      : OpTestLauncher{
-            grid, threads, data_size,
-            should_skip<ValueT>()(syclcompat::get_current_device())} {
+      : OpTestLauncher{grid, threads, data_size,
+                       should_skip<ValueT, ValueU, ResultT>()(
+                           syclcompat::get_current_device())} {
     if (skip_)
       return;
     op1_ = syclcompat::malloc<ValueT>(data_size);
@@ -162,7 +183,7 @@ class UnaryOpTestLauncher : OpTestLauncher {
                       const size_t data_size = 1)
       : OpTestLauncher{
             grid, threads, data_size,
-            should_skip<ValueT>()(syclcompat::get_current_device())} {
+            should_skip<ValueT, ResultT>()(syclcompat::get_current_device())} {
     if (skip_)
       return;
     op_ = syclcompat::malloc<ValueT>(data_size);
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index 258c2a12ba1e5..d52d9c60d8ded 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -34,7 +34,7 @@
 template <typename ValueT, typename ValueU>
 inline void max_kernel(ValueT *a, ValueU *b,
                        std::common_type_t<ValueT, ValueU> *r) {
-  *r = syclcompat::max(*a, *b);
+  *r = syclcompat::max<ValueT, ValueU>(*a, *b);
 }
 
 template <typename ValueT, typename ValueU = ValueT>
@@ -43,9 +43,9 @@ void test_syclcompat_max() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(5);
-  constexpr ValueU op2 = static_cast<ValueU>(10);
-  constexpr std::common_type_t<ValueT, ValueU> res = static_cast<ValueU>(10);
+  const ValueT op1 = static_cast<ValueT>(5);
+  const ValueU op2 = static_cast<ValueU>(10);
+  const std::common_type_t<ValueT, ValueU> res = static_cast<ValueU>(10);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
       .template launch_test<max_kernel<ValueT, ValueU>>(op1, op2, res);
@@ -54,7 +54,7 @@ void test_syclcompat_max() {
 template <typename ValueT, typename ValueU>
 inline void min_kernel(ValueT *a, ValueU *b,
                        std::common_type_t<ValueT, ValueU> *r) {
-  *r = syclcompat::min(*a, *b);
+  *r = syclcompat::min<ValueT, ValueU>(*a, *b); // stored as the common type
 }
 
 template <typename ValueT, typename ValueU = ValueT>
@@ -63,9 +63,9 @@ void test_syclcompat_min() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(5);
-  constexpr ValueU op2 = static_cast<ValueU>(10);
-  constexpr std::common_type_t<ValueT, ValueU> res =
+  const ValueT op1 = static_cast<ValueT>(5);
+  const ValueU op2 = static_cast<ValueU>(10);
+  const std::common_type_t<ValueT, ValueU> res =
       static_cast<std::common_type_t<ValueT, ValueU>>(5);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
@@ -74,7 +74,7 @@ void test_syclcompat_min() {
 
 template <typename ValueT, typename ValueU>
 inline void fmin_nan_kernel(ValueT *a, ValueU *b,
-                            std::common_type_t<ValueT, ValueU> *r) {
+                            container_common_type_t<ValueT, ValueU> *r) {
   *r = syclcompat::fmin_nan(*a, *b);
 }
 
@@ -82,14 +82,16 @@ template <typename ValueT, typename ValueU = ValueT>
 void test_syclcompat_fmin_nan() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
+  using ValueTU = std::common_type_t<ValueT, ValueU>;
+
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(5);
-  constexpr ValueU op2 = static_cast<ValueU>(10);
+  const ValueT op1 = static_cast<ValueT>(5);
+  const ValueU op2 = static_cast<ValueU>(10);
   ValueU op3 = sycl::nan(static_cast<unsigned int>(0));
 
-  constexpr std::common_type_t<ValueT, ValueU> res =
-      static_cast<std::common_type_t<ValueT, ValueU>>(5);
+  const ValueTU res =
+      static_cast<ValueTU>(5);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
       .template launch_test<fmin_nan_kernel<ValueT, ValueU>>(op1, op2, res);
@@ -98,9 +100,35 @@ void test_syclcompat_fmin_nan() {
       .template launch_test<fmin_nan_kernel<ValueT, ValueU>>(op1, op3, op3);
 }
 
+template <template <typename T, int Dim> typename ContainerT, typename ValueT,
+          typename ValueU = ValueT>
+void test_container_syclcompat_fmin_nan() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  using ValueTU = std::common_type_t<ValueT, ValueU>;
+  using ContT = ContainerT<ValueT, 2>;
+  using ContU = ContainerT<ValueU, 2>;
+  using ContTU = ContainerT<ValueTU, 2>;
+  constexpr syclcompat::dim3 grid{1};
+  constexpr syclcompat::dim3 threads{1};
+  // Ordinary operands: {5, 10} against {10, 5} is expected to give {5, 5}.
+  const ContT lhs = {static_cast<ValueT>(5), static_cast<ValueT>(10)};
+  const ContU rhs = {static_cast<ValueU>(10), static_cast<ValueU>(5)};
+  const ContTU expected{static_cast<ValueTU>(5), static_cast<ValueTU>(5)};
+  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
+      .template launch_test<fmin_nan_kernel<ContT, ContU>>(lhs, rhs, expected);
+
+  // All-NaN second operand: every element of the result is expected to be NaN.
+  const auto quiet_nan = sycl::nan(static_cast<unsigned int>(0));
+  const ContU nans = {quiet_nan, quiet_nan};
+  const ContTU nan_res = {quiet_nan, quiet_nan};
+  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
+      .template launch_test<fmin_nan_kernel<ContT, ContU>>(lhs, nans, nan_res);
+}
+
 template <typename ValueT, typename ValueU>
 inline void fmax_nan_kernel(ValueT *a, ValueU *b,
-                            std::common_type_t<ValueT, ValueU> *r) {
+                            container_common_type_t<ValueT, ValueU> *r) {
   *r = syclcompat::fmax_nan(*a, *b);
 }
 
@@ -108,14 +136,16 @@ template <typename ValueT, typename ValueU = ValueT>
 void test_syclcompat_fmax_nan() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
+  using ValueTU = std::common_type_t<ValueT, ValueU>;
+
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(5);
-  constexpr ValueU op2 = static_cast<ValueU>(10);
+  const ValueT op1 = static_cast<ValueT>(5);
+  const ValueU op2 = static_cast<ValueU>(10);
   ValueU op3 = sycl::nan(static_cast<unsigned int>(0));
 
-  constexpr std::common_type_t<ValueT, ValueU> res =
-      static_cast<std::common_type_t<ValueT, ValueU>>(10);
+  const ValueTU res =
+      static_cast<ValueTU>(10);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
       .template launch_test<fmax_nan_kernel<ValueT, ValueU>>(op1, op2, res);
@@ -124,6 +154,32 @@ void test_syclcompat_fmax_nan() {
       .template launch_test<fmax_nan_kernel<ValueT, ValueU>>(op1, op3, op3);
 }
 
+template <template <typename T, int Dim> typename ContainerT, typename ValueT,
+          typename ValueU = ValueT>
+void test_container_syclcompat_fmax_nan() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  using ValueTU = std::common_type_t<ValueT, ValueU>;
+  using ContT = ContainerT<ValueT, 2>;
+  using ContU = ContainerT<ValueU, 2>;
+  using ContTU = ContainerT<ValueTU, 2>;
+  constexpr syclcompat::dim3 grid{1};
+  constexpr syclcompat::dim3 threads{1};
+  // Ordinary operands: {5, 10} against {10, 5} is expected to give {10, 10}.
+  const ContT lhs = {static_cast<ValueT>(5), static_cast<ValueT>(10)};
+  const ContU rhs = {static_cast<ValueU>(10), static_cast<ValueU>(5)};
+  const ContTU expected{static_cast<ValueTU>(10), static_cast<ValueTU>(10)};
+  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
+      .template launch_test<fmax_nan_kernel<ContT, ContU>>(lhs, rhs, expected);
+
+  // All-NaN second operand: every element of the result is expected to be NaN.
+  const auto quiet_nan = sycl::nan(static_cast<unsigned int>(0));
+  const ContU nans = {quiet_nan, quiet_nan};
+  const ContTU nan_res = {quiet_nan, quiet_nan};
+  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
+      .template launch_test<fmax_nan_kernel<ContT, ContU>>(lhs, nans, nan_res);
+}
+
 template <typename ValueT, typename ValueU>
 inline void pow_kernel(ValueT *a, ValueU *b, ValueT *r) {
   *r = syclcompat::pow(*a, *b);
@@ -146,9 +202,9 @@ void test_syclcompat_pow() {
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
   // 3 ** 3 = 27
-  constexpr ValueT op1 = static_cast<ValueT>(3);
-  constexpr ValueU op2 = static_cast<ValueU>(3);
-  constexpr ValueT res = static_cast<ValueT>(27);
+  const ValueT op1 = static_cast<ValueT>(3);
+  const ValueU op2 = static_cast<ValueU>(3);
+  const ValueT res = static_cast<ValueT>(27);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
       .template launch_test<pow_kernel<ValueT, ValueU>>(op1, op2, res);
@@ -165,25 +221,25 @@ template <typename ValueT> void test_syclcompat_relu() {
   constexpr syclcompat::dim3 threads{1};
 
   // relu(3) = 3, relu(-value) = 0
-  constexpr ValueT op1 = static_cast<ValueT>(3);
-  constexpr ValueT res1 = static_cast<ValueT>(3);
+  const ValueT op1 = static_cast<ValueT>(3);
+  const ValueT res1 = static_cast<ValueT>(3);
   UnaryOpTestLauncher<ValueT>(grid, threads)
       .template launch_test<relu_kernel<ValueT>>(op1, res1);
 
-  constexpr ValueT op2 = static_cast<ValueT>(-3);
-  constexpr ValueT res2 = static_cast<ValueT>(0);
+  const ValueT op2 = static_cast<ValueT>(-3);
+  const ValueT res2 = static_cast<ValueT>(0);
   UnaryOpTestLauncher<ValueT>(grid, threads)
       .template launch_test<relu_kernel<ValueT>>(op2, res2);
 
   using ValueU = sycl::vec<ValueT, 2>;
-  constexpr ValueU op3{op1, op2};
-  constexpr ValueU res3{res1, res2};
+  const ValueU op3{op1, op2};
+  const ValueU res3{res1, res2};
   UnaryOpTestLauncher<ValueU>(grid, threads)
       .template launch_test<relu_kernel<ValueU>>(op3, res3);
 
   using ValueV = sycl::marray<ValueT, 2>;
-  constexpr ValueV op4{op1, op2};
-  constexpr ValueV res4{res1, res2};
+  const ValueV op4{op1, op2};
+  const ValueV res4{res1, res2};
   UnaryOpTestLauncher<ValueV>(grid, threads)
       .template launch_test<relu_kernel<ValueV>>(op4, res4);
 }
@@ -198,33 +254,36 @@ template <typename ValueT> void test_syclcompat_cbrt() {
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
 
-  constexpr ValueT op1 = static_cast<ValueT>(1);
-  constexpr ValueT res1 = static_cast<ValueT>(1);
+  const ValueT op1 = static_cast<ValueT>(1);
+  const ValueT res1 = static_cast<ValueT>(1);
   UnaryOpTestLauncher<ValueT>(grid, threads)
       .template launch_test<cbrt_kernel<ValueT>>(op1, res1);
 
-  constexpr ValueT op2 = static_cast<ValueT>(64);
-  constexpr ValueT res2 = static_cast<ValueT>(4);
+  const ValueT op2 = static_cast<ValueT>(64);
+  const ValueT res2 = static_cast<ValueT>(4);
   UnaryOpTestLauncher<ValueT>(grid, threads)
       .template launch_test<cbrt_kernel<ValueT>>(op2, res2);
 }
 
-void isnan_kernel(sycl::float2 *a, sycl::float2 *r) {
+template <typename T>
+void isnan_kernel(T *a, T *r) {
   *r = syclcompat::isnan(*a);
 }
 
+template <template <typename, int> typename ContainerT, typename ValueT>
 void test_isnan() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
+  using ContT = ContainerT<ValueT, 2>;
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  sycl::float2 op1 = {sycl::nan(static_cast<unsigned int>(0)), 1.0f};
+  ContT op1 = {sycl::nan(static_cast<unsigned int>(0)), 1.0f};
   // bool2 does not exist,1.0 and 0.0 floats are used for true
   // and false instead.
-  sycl::float2 expect = {1.0, 0.0};
+  ContT expect = {1.0, 0.0};
 
-  UnaryOpTestLauncher<sycl::float2>(grid, threads)
-      .template launch_test<isnan_kernel>(op1, expect);
+  UnaryOpTestLauncher<ContT>(grid, threads)
+      .template launch_test<isnan_kernel<ContT>>(op1, expect);
 }
 
 // Hardcoded limits to avoid a "TernaryOpTestLauncher"
@@ -258,6 +317,24 @@ template <typename ValueT> void test_clamp() {
       .template launch_test<clamp_kernel<ValueT>>(op3, expect3);
 }
 
+template <template <typename T, int Dim> typename ContainerT, typename ValueT>
+void test_container_clamp() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  using ContT = ContainerT<ValueT, 2>;
+  constexpr syclcompat::dim3 grid{1};
+  constexpr syclcompat::dim3 threads{1};
+
+  // Element 0 (value 7) is expected back unchanged; element 1 starts one
+  // above MAX_CLAMP and is expected to come back as MAX_CLAMP.
+  const ValueT pass_through = static_cast<ValueT>(7);
+  const ValueT above_max = static_cast<ValueT>(MAX_CLAMP + 1);
+  const ContT input{pass_through, above_max};
+  const ContT expected{pass_through, static_cast<ValueT>(MAX_CLAMP)};
+  UnaryOpTestLauncher<ContT>(grid, threads)
+      .template launch_test<clamp_kernel<ContT>>(input, expected);
+}
+
 int main() {
   INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_max);
   INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_min);
@@ -265,23 +342,47 @@ int main() {
   // Basic testing of deduction to avoid combinatorial explosion
   test_syclcompat_max<double, float>();
   test_syclcompat_max<long, int>();
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  test_syclcompat_max<sycl::ext::oneapi::bfloat16, float>();
+#endif
+
   test_syclcompat_min<double, float>();
   test_syclcompat_min<long, int>();
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  test_syclcompat_min<sycl::ext::oneapi::bfloat16, float>();
+#endif
 
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_fmin_nan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_container_syclcompat_fmin_nan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_container_syclcompat_fmin_nan);
   test_syclcompat_fmin_nan<double, float>();
+  test_container_syclcompat_fmin_nan<sycl::vec, float, double>();
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  test_container_syclcompat_fmin_nan<sycl::vec, sycl::ext::oneapi::bfloat16, double>();
+#endif
+
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_fmax_nan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_container_syclcompat_fmax_nan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_container_syclcompat_fmax_nan);
   test_syclcompat_fmax_nan<double, float>();
+  test_container_syclcompat_fmax_nan<sycl::vec, float, double>();
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  test_container_syclcompat_fmax_nan<sycl::vec, sycl::ext::oneapi::bfloat16, double>();
+#endif
 
   INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_pow);
   test_syclcompat_pow<float, int>();
   test_syclcompat_pow<double, int>();
 
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_relu);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_cbrt);
+  INSTANTIATE_ALL_TYPES(fp_type_list_no_bfloat16, test_syclcompat_cbrt);
+
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_isnan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_isnan);
 
-  test_isnan();
   INSTANTIATE_ALL_TYPES(value_type_list, test_clamp);
+  INSTANTIATE_ALL_CONTAINER_TYPES(vec_type_list, sycl::vec, test_container_clamp);
+  INSTANTIATE_ALL_CONTAINER_TYPES(marray_type_list, sycl::marray, test_container_clamp);
 
   return 0;
 }