NVIDIA
diff --git a/‎include/matx/core/capabilities.h‎
Lines changed: 18 additions & 2 deletions b/‎include/matx/core/capabilities.h‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎include/matx/core/half.h‎
Lines changed: 153 additions & 2 deletions b/‎include/matx/core/half.h‎
Lines changed: 153 additions & 2 deletions
diff --git a/‎include/matx/core/half_complex.h‎
Lines changed: 59 additions & 0 deletions b/‎include/matx/core/half_complex.h‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎include/matx/core/jit_includes.h‎
Lines changed: 0 additions & 1 deletion b/‎include/matx/core/jit_includes.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎include/matx/core/log.h‎
Lines changed: 5 additions & 31 deletions b/‎include/matx/core/log.h‎
Lines changed: 5 additions & 31 deletions
@@ -70,6 +70,7 @@ namespace detail {
     MAX_EPT_VEC_LOAD, // The maximum EPT for a vector load.
     ELEMENT_WISE, // Whether the operator is element-wise (safe with aliasing)
     ALIASED_MEMORY, // Whether the operator's input and output pointers alias
+    GLOBAL_KERNEL, // Kernel operates entirely on a global level per chunk of data. False when at least one operator works on a block level
     // Add more capabilities as needed
   };
 
@@ -123,7 +124,7 @@ namespace detail {
   struct capability_attributes<OperatorCapability::SUPPORTS_JIT> {
     using type = bool;
     using input_type = VoidCapabilityType;
-    static constexpr bool default_value = true;
+    static constexpr bool default_value = false;
     static constexpr bool or_identity = false;
     static constexpr bool and_identity = true;
   };
@@ -144,7 +145,16 @@ namespace detail {
     static constexpr bool default_value = false;
     static constexpr bool or_identity = false;
     static constexpr bool and_identity = true;
-  };    
+  }; 
+  
+  // Attributes for the GLOBAL_KERNEL capability: a boolean capability whose
+  // query input type is VoidCapabilityType.
+  template <>
+  struct capability_attributes<OperatorCapability::GLOBAL_KERNEL> {
+    using type = bool;
+    using input_type = VoidCapabilityType;
+    // Operators that do not override this capability report true.
+    static constexpr bool default_value = true;
+    // Neutral starting values when combining child results with OR / AND.
+    static constexpr bool or_identity = false;
+    static constexpr bool and_identity = true;
+  };   
 
   template <>
   struct capability_attributes<OperatorCapability::ALIASED_MEMORY> {
@@ -250,6 +260,10 @@ namespace detail {
       if constexpr (Cap == OperatorCapability::JIT_TYPE_QUERY) {
         return detail::type_to_string<OperatorType>();
       }
+      else if constexpr (Cap == OperatorCapability::SUPPORTS_JIT) {
+        // If this is not a matx operator (like a constant or a lambda), we assume it supports JIT.
+        return true;
+      }
       else {
         return capability_attributes<Cap>::default_value;
       }
@@ -274,6 +288,8 @@ namespace detail {
         return CapabilityQueryType::AND_QUERY; // If any sub-operator supports JIT, the expression might be JIT-able.
       case OperatorCapability::ASYNC_LOADS_REQUESTED:
         return CapabilityQueryType::OR_QUERY; // If any sub-operator requires asynchronous loads, the expression might require asynchronous loads.
+      case OperatorCapability::GLOBAL_KERNEL:
+        return CapabilityQueryType::AND_QUERY; // The expression is a global kernel only if every sub-operator operates on a global level.
       case OperatorCapability::ELEMENTS_PER_THREAD:
         return CapabilityQueryType::RANGE_QUERY; // The expression should use the range of elements per thread of its children.
       case OperatorCapability::SET_ELEMENTS_PER_THREAD:
 
@@ -32,6 +32,8 @@
 
 #pragma once
 
+#include <cuda/std/cstdint>
+#include <cuda/std/bit>
 #include <cuda/std/cmath>
 #include <cuda/std/type_traits>
 
@@ -41,6 +43,83 @@
 
 namespace matx {
 
+// Constexpr helper functions for float to half conversion
+namespace detail {
+
+/**
+ * @brief Constexpr conversion from float to FP16 bits
+ *
+ * Converts an IEEE-754 binary32 value to the binary16 bit pattern using
+ * round-to-nearest-even:
+ *  - NaN inputs (any non-zero 23-bit mantissa) map to a quiet NaN (0x7e00)
+ *    with the input sign bit preserved.
+ *  - Infinities and finite values too large for FP16 map to signed infinity.
+ *  - Values in the FP16 subnormal range are rounded to the nearest subnormal.
+ *  - Values that round below the smallest subnormal map to signed zero.
+ *
+ * @param f Input float value
+ * @return uint16_t FP16 bit representation
+ */
+constexpr __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ uint16_t float_to_fp16_bits(float f) {
+  // Use bit_cast for constexpr context
+  const uint32_t bits = cuda::std::bit_cast<uint32_t>(f);
+
+  const uint32_t sign = (bits >> 16) & 0x8000u;
+  const uint32_t magnitude = bits & 0x7fffffffu;
+
+  // NaN / infinity: classify on the full 23-bit mantissa so that a NaN whose
+  // payload lives only in the low bits is not mistaken for infinity.
+  if (magnitude >= 0x7f800000u) {
+    if (magnitude > 0x7f800000u) {
+      return static_cast<uint16_t>(sign | 0x7e00u);
+    }
+    return static_cast<uint16_t>(sign | 0x7c00u);
+  }
+
+  // Exponent re-biased from float (127) to half (15)
+  const int32_t exponent = static_cast<int32_t>(magnitude >> 23) - 127 + 15;
+  const uint32_t mantissa = magnitude & 0x007fffffu;
+
+  if (exponent >= 0x1f) {
+    // Finite but too large for FP16: overflow to infinity
+    return static_cast<uint16_t>(sign | 0x7c00u);
+  }
+
+  if (exponent <= 0) {
+    if (exponent < -10) {
+      // Below half of the smallest subnormal (also covers float zero and
+      // float subnormals): rounds to signed zero
+      return static_cast<uint16_t>(sign);
+    }
+    // Subnormal result: restore the implicit leading one and shift the 24-bit
+    // significand into place, rounding the discarded bits to nearest even.
+    const uint32_t significand = mantissa | 0x00800000u;
+    const uint32_t shift = static_cast<uint32_t>(14 - exponent); // in [14, 24]
+    uint32_t half_mantissa = significand >> shift;
+    const uint32_t remainder = significand & ((1u << shift) - 1u);
+    const uint32_t halfway = 1u << (shift - 1u);
+    if (remainder > halfway || (remainder == halfway && (half_mantissa & 1u) != 0u)) {
+      // A carry out of the mantissa yields the smallest normal number, which
+      // is exactly the next bit pattern.
+      ++half_mantissa;
+    }
+    return static_cast<uint16_t>(sign | half_mantissa);
+  }
+
+  // Normal result: keep the top 10 mantissa bits and round on the 13 dropped.
+  uint32_t result = (static_cast<uint32_t>(exponent) << 10) | (mantissa >> 13);
+  const uint32_t remainder = mantissa & 0x1fffu;
+  if (remainder > 0x1000u || (remainder == 0x1000u && (result & 1u) != 0u)) {
+    // A mantissa carry increments the exponent; carrying out of the largest
+    // finite value produces the infinity encoding, as required.
+    ++result;
+  }
+  return static_cast<uint16_t>(sign | result);
+}
+
+/**
+ * @brief Constexpr conversion from float to BF16 bits
+ *
+ * BF16 is the top 16 bits of a float32. Finite values and infinities are
+ * rounded to nearest even on the discarded low 16 bits. NaN inputs bypass
+ * rounding (the rounding add could otherwise carry into the exponent/sign and
+ * turn a NaN into infinity or zero) and are returned as a quiet NaN that keeps
+ * the sign and upper payload bits.
+ *
+ * @param f Input float value
+ * @return uint16_t BF16 bit representation (top 16 bits of float)
+ */
+constexpr __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ uint16_t float_to_bf16_bits(float f) {
+  const uint32_t bits = cuda::std::bit_cast<uint32_t>(f);
+
+  // NaN: truncate and force the quiet bit so the result is still a NaN even
+  // when the payload lived entirely in the low 16 bits.
+  if ((bits & 0x7fffffffu) > 0x7f800000u) {
+    return static_cast<uint16_t>((bits >> 16) | 0x0040u);
+  }
+
+  // Round to nearest even: bias is 0x7FFF, plus one when the kept LSB is odd
+  const uint32_t rounding_bias = 0x00007FFFu + ((bits >> 16) & 1u);
+  return static_cast<uint16_t>((bits + rounding_bias) >> 16);
+}
+
+/**
+ * @brief Build a half-precision value from a float in a constexpr context
+ *
+ * Produces the 16-bit encoding with float_to_fp16_bits when T is __half and
+ * with float_to_bf16_bits otherwise, then reinterprets those bits as the
+ * half-precision type via bit_cast.
+ *
+ * @tparam T The target half type (__half or __nv_bfloat16)
+ * @param f Input float value
+ * @return T Half-precision value
+ */
+template <typename T>
+constexpr __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T float_to_half_constexpr(float f) {
+  constexpr bool target_is_fp16 = cuda::std::is_same_v<T, __half>;
+  if constexpr (!target_is_fp16) {
+    const uint16_t raw = float_to_bf16_bits(f);
+    return cuda::std::bit_cast<__nv_bfloat16>(raw);
+  } else {
+    const uint16_t raw = float_to_fp16_bits(f);
+    return cuda::std::bit_cast<__half>(raw);
+  }
+}
+
+} // namespace detail
+
 /**
  * Template class for half precison numbers (__half and __nv_bfloat16). CUDA
  * does not have standardized classes/operators available on both host and
@@ -64,12 +143,49 @@ template <typename T> struct alignas(sizeof(T)) matxHalf {
   __MATX_INLINE__ matxHalf(const matxHalf<T> &x_) noexcept = default;
 
   /**
-   * @brief Copy constructor from arbitrary type
+   * @brief Constexpr constructor from float
+   *
+   * @param f Float value to convert
+   */
+  // Routes through detail::float_to_half_constexpr<T> so that the stored value
+  // can be initialized inside a constant expression.
+  constexpr __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ matxHalf(float f) noexcept
+      : x(detail::float_to_half_constexpr<T>(f))
+  {
+  }
+
+  /**
+   * @brief Constexpr constructor from double
+   *
+   * The value is first narrowed to float with static_cast and then converted
+   * with detail::float_to_half_constexpr<T>.
+   *
+   * @param d Double value to convert
+   */
+  constexpr __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ matxHalf(double d) noexcept
+      : x(detail::float_to_half_constexpr<T>(static_cast<float>(d)))
+  {
+  }
+
+  /**
+   * @brief Constructor from integral types (constexpr)
+   *
+   * Participates in overload resolution only when T2 satisfies
+   * cuda::std::is_integral_v. The value is cast to float and then converted
+   * with detail::float_to_half_constexpr<T>.
+   *
+   * @tparam T2 Integral type to copy from
+   * @param x_ Value to copy
+   */
+  template <typename T2, 
+            cuda::std::enable_if_t<cuda::std::is_integral_v<T2>, int> = 0>
+  constexpr __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ matxHalf(T2 x_) noexcept
+      : x(detail::float_to_half_constexpr<T>(static_cast<float>(x_)))
+  {
+  }
+
+  /**
+   * @brief Copy constructor from arbitrary type (non-constexpr for non-arithmetic types)
    *
    * @tparam T2 Type to copy from
    * @param x_ Value to copy
    */
-  template <typename T2>
+  template <typename T2, 
+            cuda::std::enable_if_t<
+                !cuda::std::is_same_v<cuda::std::decay_t<T2>, float> &&
+                !cuda::std::is_same_v<cuda::std::decay_t<T2>, double> &&
+                !cuda::std::is_integral_v<T2>, int> = 0>
   __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ matxHalf(const T2 &x_) noexcept
       : x(static_cast<float>(x_))
   {
@@ -1316,3 +1432,38 @@ using matxFp16 = matxHalf<__half>; ///< Alias for fp16
 using matxBf16 = matxHalf<__nv_bfloat16>; ///< Alias for bf16
 
 }; // namespace matx
+
+#ifndef __CUDACC_RTC__
+// Add std::formatter specializations for matxFp16 and matxBf16
+#include <format>
+
+namespace std {
+
+/**
+ * @brief std::formatter specialization for matxFp16
+ *
+ * Derives from std::formatter<float>, so the format-spec parsing is inherited;
+ * formatting widens the value to float and delegates to the base formatter.
+ */
+template <>
+struct formatter<matx::matxFp16> : formatter<float> {
+  template <typename OutputContext>
+  auto format(const matx::matxFp16& val, OutputContext& ctx) const {
+    const float widened = static_cast<float>(val);
+    return formatter<float>::format(widened, ctx);
+  }
+};
+
+/**
+ * @brief std::formatter specialization for matxBf16
+ *
+ * Derives from std::formatter<float>, so the format-spec parsing is inherited;
+ * formatting widens the value to float and delegates to the base formatter.
+ */
+template <>
+struct formatter<matx::matxBf16> : formatter<float> {
+  template <typename OutputContext>
+  auto format(const matx::matxBf16& val, OutputContext& ctx) const {
+    const float widened = static_cast<float>(val);
+    return formatter<float>::format(widened, ctx);
+  }
+};
+
+} // namespace std
+#endif
@@ -1056,3 +1056,62 @@ using matxFp16Complex = matxHalfComplex<matxFp16>; ///< Alias for a MatX fp16 co
 using matxBf16Complex = matxHalfComplex<matxBf16>; ///< Alias for a MatXbf16 complex wrapper
 
 }; // namespace matx
+
+#ifndef __CUDACC_RTC__
+// Add std::formatter specializations for matxFp16Complex and matxBf16Complex
+#include <format>
+
+namespace std {
+
+/**
+ * @brief std::formatter specialization for matxFp16Complex
+ *
+ * Enables matxFp16Complex to work with std::format. The real and imaginary
+ * parts are each converted to float and written as "(<real><sign><imag>i)",
+ * e.g. "(1+2i)" or "(1-2i)". No format specification is interpreted: parse()
+ * returns the start of the spec unchanged.
+ */
+template <>
+struct formatter<matx::matxFp16Complex> {
+  template <typename ParseContext>
+  constexpr auto parse(ParseContext& ctx) {
+    return ctx.begin();
+  }
+
+  template <typename FormatContext>
+  auto format(const matx::matxFp16Complex& val, FormatContext& ctx) const {
+    const float real_val = static_cast<float>(val.real());
+    const float imag_val = static_cast<float>(val.imag());
+
+    // The '+' sign option always emits exactly one sign for the imaginary
+    // part. Unlike a manual `imag_val >= 0` test this also handles -0.0
+    // (previously printed as "+-0") and NaN (previously printed with no
+    // separator between the real and imaginary parts).
+    return std::format_to(ctx.out(), "({}{:+}i)", real_val, imag_val);
+  }
+};
+
+/**
+ * @brief std::formatter specialization for matxBf16Complex
+ *
+ * Enables matxBf16Complex to work with std::format. The real and imaginary
+ * parts are each converted to float and written as "(<real><sign><imag>i)",
+ * e.g. "(1+2i)" or "(1-2i)". No format specification is interpreted: parse()
+ * returns the start of the spec unchanged.
+ */
+template <>
+struct formatter<matx::matxBf16Complex> {
+  template <typename ParseContext>
+  constexpr auto parse(ParseContext& ctx) {
+    return ctx.begin();
+  }
+
+  template <typename FormatContext>
+  auto format(const matx::matxBf16Complex& val, FormatContext& ctx) const {
+    const float real_val = static_cast<float>(val.real());
+    const float imag_val = static_cast<float>(val.imag());
+
+    // The '+' sign option always emits exactly one sign for the imaginary
+    // part. Unlike a manual `imag_val >= 0` test this also handles -0.0
+    // (previously printed as "+-0") and NaN (previously printed with no
+    // separator between the real and imaginary parts).
+    return std::format_to(ctx.out(), "({}{:+}i)", real_val, imag_val);
+  }
+};
+
+} // namespace std
+#endif // __CUDACC_RTC__
@@ -34,7 +34,6 @@
 
 // This file is used for jitify/NVRTC preprocessing. Do NOT include any files in here that can't be
 // parsed on the device, and try to keep this minimal to avoid unnecessary dependencies.
-#include <cuda/barrier>
 #include <cuda/std/__algorithm/min.h>
 #include <cuda/std/__algorithm/max.h>
 #include "matx/core/defines.h"
 
@@ -89,38 +89,12 @@ namespace std {
     }
   };
 
-  // Formatter for matxHalfComplex (fp16/bf16 complex)
-  template<typename T>
-  struct formatter<matx::matxHalfComplex<T>> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    
-    template<typename FormatContext>
-    auto format(const matx::matxHalfComplex<T>& c, FormatContext& ctx) const {
-      return format_to(ctx.out(), "{}", matx::detail::format_complex(c));
-    }
-  };
-  
-  // Formatter for matxFp16 (half-precision float)
-  template<>
-  struct formatter<matx::matxFp16> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    
-    template<typename FormatContext>
-    auto format(const matx::matxFp16& val, FormatContext& ctx) const {
-      return format_to(ctx.out(), "{:g}", static_cast<float>(val));
-    }
-  };
+  // Formatter for matxHalfComplex (fp16/bf16 complex) - moved to half_complex.h
+  // Formatter for matxFp16 (half-precision float) - moved to half.h
+  // Formatter for matxBf16 (bfloat16) - moved to half.h
 
-  // Formatter for matxBf16 (bfloat16)
-  template<>
-  struct formatter<matx::matxBf16> {
-    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
-    
-    template<typename FormatContext>
-    auto format(const matx::matxBf16& val, FormatContext& ctx) const {
-      return format_to(ctx.out(), "{:g}", static_cast<float>(val));
-    }
-  };
+  // Note: The formatters for matxHalfComplex, matxFp16, and matxBf16 are now defined
+  // in their respective header files (half_complex.h and half.h) with proper guards.
 }
 
 namespace matx {