From 4be75e8f836e53200bcf4a0ecf86286a7b9b5862 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SYCLo=C3=AFd?= <213797226+sycloid@users.noreply.github.com>
Date: Fri, 26 Sep 2025 15:50:07 -0500
Subject: [PATCH 1/4] Use is_complex_v instead of is_complex<...>::value

Also make Caster object static constexpr in generic copy kernel
as it is done in contig kernel.
---
 dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
index f19dcb7c8c..023c3d8717 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -101,7 +101,7 @@ class GenericCopyFunctor
         const ssize_t &src_offset = offsets.get_first_offset();
         const ssize_t &dst_offset = offsets.get_second_offset();
 
-        CastFnT fn{};
+        static constexpr CastFnT fn{};
         dst_[dst_offset] = fn(src_[src_offset]);
     }
 };
@@ -237,9 +237,9 @@ class ContigCopyFunctor
 
         static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
 
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!enable_sg_loadstore || is_complex<srcT>::value ||
-                      is_complex<dstT>::value)
+        using dpctl::tensor::type_utils::is_complex_v;
+        if constexpr (!enable_sg_loadstore || is_complex_v<srcT> ||
+                      is_complex_v<dstT>)
         {
             std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0];
             const std::size_t gid = ndit.get_global_linear_id();

From a6dd8698484a203b1b0918cc5b4266b8e4086f55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SYCLo=C3=AFd?= <213797226+sycloid@users.noreply.github.com>
Date: Fri, 26 Sep 2025 15:56:52 -0500
Subject: [PATCH 2/4] Fix gh-2121 in convert_impl function

Since convert_impl input argument is a reference, implement NumPy's
interpretation of bool (underlying byte has any bits set in it) to
override C++'s interpretation (underlying byte has the first bit set).

To allow such an interpretation to work correctly, bool arguments
must be passed by reference, rather than by value. Passing by value
creates a copy where C++ masks higher bits out.
---
 .../libtensor/include/utils/type_utils.hpp    | 38 +++++++++++++------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
index 41c42476b6..d6240519e8 100644
--- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
@@ -25,6 +25,7 @@
 #pragma once
 #include <complex>
 #include <cstddef>
+#include <cstdint>
 #include <stdexcept>
 #include <sycl/sycl.hpp>
 #include <type_traits>
@@ -55,26 +56,39 @@ template <typename T> inline constexpr bool is_complex_v = is_complex<T>::value;
 
 template <typename dstTy, typename srcTy> dstTy convert_impl(const srcTy &v)
 {
-    if constexpr (std::is_same<dstTy, srcTy>::value) {
+    if constexpr (std::is_same_v<dstTy, srcTy>) {
         return v;
     }
-    else if constexpr (std::is_same_v<dstTy, bool> && is_complex<srcTy>::value)
-    {
-        // bool(complex_v) == (complex_v.real() != 0) && (complex_v.imag() !=0)
-        return (convert_impl<bool, typename srcTy::value_type>(v.real()) ||
-                convert_impl<bool, typename srcTy::value_type>(v.imag()));
+    else if constexpr (std::is_same_v<dstTy, bool>) {
+        if constexpr (is_complex_v<srcTy>) {
+            // bool(complex_v) ==
+            //     (complex_v.real() != 0) || (complex_v.imag() != 0)
+            return (convert_impl<bool, typename srcTy::value_type>(v.real()) ||
+                    convert_impl<bool, typename srcTy::value_type>(v.imag()));
+        }
+        else {
+            return static_cast<dstTy>(v != srcTy{0});
+        }
+    }
+    else if constexpr (std::is_same_v<srcTy, bool>) {
+        const std::uint8_t &u = sycl::bit_cast<std::uint8_t>(v);
+        if constexpr (is_complex_v<dstTy>) {
+            return (u == 0) ? dstTy{} : dstTy{1, 0};
+        }
+        else {
+            return (u == 0) ? dstTy{} : dstTy{1};
+        }
     }
-    else if constexpr (is_complex<srcTy>::value && !is_complex<dstTy>::value) {
+    else if constexpr (is_complex_v<srcTy> && !is_complex_v<dstTy>) {
         // real_t(complex_v) == real_t(complex_v.real())
         return convert_impl<dstTy, typename srcTy::value_type>(v.real());
     }
-    else if constexpr (!std::is_integral<srcTy>::value &&
-                       !std::is_same<dstTy, bool>::value &&
-                       std::is_integral<dstTy>::value &&
-                       std::is_unsigned<dstTy>::value)
+    else if constexpr (!std::is_integral_v<srcTy> &&
+                       !std::is_same_v<dstTy, bool> &&
+                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>)
     {
         // first cast to signed variant, the cast to unsigned one
-        using signedT = typename std::make_signed<dstTy>::type;
+        using signedT = typename std::make_signed_t<dstTy>;
         return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
     }
     else {

From cc0756ff0db26724d4ff6780475000061fe7daab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SYCLo=C3=AFd?= <213797226+sycloid@users.noreply.github.com>
Date: Fri, 26 Sep 2025 16:02:34 -0500
Subject: [PATCH 3/4] Revert gh-2122 since gh-2121 was fixed in convert_impl

---
 dpctl/tensor/_copy_utils.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index 3af5ebbe19..88fd1acdb7 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -742,9 +742,6 @@ def astype(
             order=copy_order,
             buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
         )
-    # see #2121
-    if ary_dtype == dpt.bool:
-        usm_ary = dpt.not_equal(usm_ary, 0, order=copy_order)
     _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary)
     return R
 

From 6ab3c717310189fc9635a987737bb8422814e612 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SYCLo=C3=AFd?= <213797226+sycloid@users.noreply.github.com>
Date: Wed, 1 Oct 2025 15:03:41 -0500
Subject: [PATCH 4/4] Add comment in convert_impl to the branch handling
 conversion away from bool

---
 dpctl/tensor/libtensor/include/utils/type_utils.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
index d6240519e8..4921659166 100644
--- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
@@ -71,6 +71,12 @@ template <typename dstTy, typename srcTy> dstTy convert_impl(const srcTy &v)
         }
     }
     else if constexpr (std::is_same_v<srcTy, bool>) {
+        // C++ interprets a byte of storage behind bool by only
+        // testing its least significant bit, so both 0x00 and
+        // 0x02 are interpreted as False, while 0x01 and 0xFF are
+        // interpreted as True. NumPy's interpretation of underlying
+        // storage is different: any bit set is interpreted as True,
+        // no bits set as False, see gh-2121
         const std::uint8_t &u = sycl::bit_cast<std::uint8_t>(v);
         if constexpr (is_complex_v<dstTy>) {
             return (u == 0) ? dstTy{} : dstTy{1, 0};