
Commit e78cd18

Added JIT capabilities to all operators except transform operators. (#1085)
Tested with standalone unit tests; real tests will be enabled in a subsequent commit.
1 parent 7051e00 · commit e78cd18


66 files changed: +4813, -466 lines

CMakeLists.txt

Lines changed: 15 additions & 6 deletions
@@ -79,6 +79,7 @@ option(MATX_EN_COVERAGE OFF "Enable code coverage reporting")
 option(MATX_EN_COMPLEX_OP_NAN_CHECKS "Enable full NaN/Inf handling for complex multiplication and division" OFF)
 option(MATX_EN_CUDA_LINEINFO "Enable line information for CUDA kernels via -lineinfo nvcc flag" OFF)
 option(MATX_EN_EXTENDED_LAMBDA "Enable extended lambda support for device/host lambdas" ON)
+option(MATX_EN_JIT "Enable CUDA JIT compilation support via NVRTC" OFF)
 option(MATX_EN_MATHDX "Enable MathDx support for kernel fusion" OFF)
 option(MATX_EN_UNSAFE_ALIAS_DETECTION "Enable aliased memory detection" OFF)
 option(MATX_DISABLE_EXCEPTIONS "Disable C++ exceptions and log errors instead" OFF)
@@ -316,11 +317,9 @@ if (MATX_EN_CUTENSOR)
   target_link_libraries(matx INTERFACE "-Wl,--disable-new-dtags")
 endif()

-if (MATX_EN_MATHDX)
-  set(MathDx_VERSION 25.06)
-  set(MathDx_NANO 0)
-  include(cmake/FindMathDx.cmake)
-  target_compile_definitions(matx INTERFACE MATX_EN_MATHDX)
+# Enable JIT compilation support
+if (MATX_EN_JIT OR MATX_EN_MATHDX)
+  message(STATUS "Enabling JIT compilation support via NVRTC")
   target_compile_definitions(matx INTERFACE MATX_EN_JIT)

   # Add NVRTC configuration as compiler definitions
@@ -331,14 +330,24 @@ if (MATX_EN_MATHDX)
   target_compile_definitions(matx INTERFACE NVRTC_CUDA_ARCH="${NVRTC_CUDA_ARCH}")
   target_compile_definitions(matx INTERFACE NVRTC_CXX_STANDARD="${CMAKE_CXX_STANDARD}")

+  # Link NVRTC library
+  target_link_libraries(matx INTERFACE CUDA::nvrtc)
+endif()
+
+if (MATX_EN_MATHDX)
+  set(MathDx_VERSION 25.06)
+  set(MathDx_NANO 0)
+  include(cmake/FindMathDx.cmake)
+  target_compile_definitions(matx INTERFACE MATX_EN_MATHDX)
+
   # Link libmathdx if available
   if(TARGET libmathdx::libmathdx)
     target_link_libraries(matx INTERFACE libmathdx::libmathdx)
     message(STATUS "Linked libmathdx to matx target")
   endif()

   # Link mathdx components
-  target_link_libraries(matx INTERFACE mathdx::cufftdx CUDA::nvrtc)
+  target_link_libraries(matx INTERFACE mathdx::cufftdx)
 endif()

 if (MATX_EN_CUDSS)
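
A usage note (not part of the diff): the new option is enabled at configure time, e.g. `cmake -DMATX_EN_JIT=ON ..`, and is surfaced to consuming code as the MATX_EN_JIT interface compile definition added above. A minimal, hypothetical sketch of detecting it downstream — the macro name comes from the diff, the program itself is illustrative:

    // Illustrative only: reports whether the MATX_EN_JIT interface
    // definition from the CMake change above was set for this build.
    #include <cstdio>

    int main() {
    #ifdef MATX_EN_JIT
      std::puts("Built with MatX JIT (NVRTC) support");
    #else
      std::puts("Built without MatX JIT support");
    #endif
      return 0;
    }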

include/matx/core/capabilities.h

Lines changed: 1 addition & 1 deletion
@@ -243,7 +243,7 @@ namespace detail {
 __MATX_INLINE__ __MATX_HOST__ typename capability_attributes<Cap>::type
 get_operator_capability(const OperatorType& op, InType& in) {
   static_assert(std::is_same_v<remove_cvref_t<InType>, typename capability_attributes<Cap>::input_type>, "Input type mismatch");
-  if constexpr (is_matx_op<OperatorType>()) {
+  if constexpr (is_matx_jit_class<OperatorType>) {
     return op.template get_capability<Cap, InType>(in);
   } else {
     // Default capabilities for non-MatX ops
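
Aside: the switch from is_matx_op<OperatorType>() to is_matx_jit_class<OperatorType> narrows capability queries to types that opt into JIT. A simplified sketch of the trait-dispatch pattern, using stand-in names rather than the real MatX definitions:

    // Stand-in for is_matx_jit_class: JIT-capable ops specialize the
    // variable template; everything else falls through to a default.
    template <typename T> inline constexpr bool is_jit_class_v = false;

    struct JitOp {
      template <int Cap> int get_capability(int in) const { return in + Cap; }
    };
    template <> inline constexpr bool is_jit_class_v<JitOp> = true;

    template <int Cap, typename Op>
    int get_capability(const Op &op, int in) {
      if constexpr (is_jit_class_v<Op>) {
        return op.template get_capability<Cap>(in);  // op reports its own value
      } else {
        return 0;  // default for non-JIT types
      }
    }

    static_assert(is_jit_class_v<JitOp> && !is_jit_class_v<int>, "");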

include/matx/core/get_grid_dims.h

Lines changed: 67 additions & 53 deletions
@@ -185,6 +185,10 @@ inline bool get_grid_dims_jit(dim3 &blocks, dim3 &threads, const cuda::std::arra
   blocks.y = 1;
   blocks.z = 1;

+  if (RANK > 1) {
+    MATX_ASSERT_STR_EXP(sizes[sizes.size() - 2] % groups_per_block, 0, matxInvalidParameter, "Second to last dimension must be divisible by groups_per_block");
+  }
+
   // Dynamic logic to pick thread block size.
   // Fill in order x, y, z up to 1024 threads
   if constexpr (RANK == 0) {
@@ -216,70 +220,80 @@ inline bool get_grid_dims_jit(dim3 &blocks, dim3 &threads, const cuda::std::arra

     // If we have multiple groups per block, we need to adjust the block size
     if (threads.y > 1) {
-      blocks.x = static_cast<int>((static_cast<int64_t>(sizes[0]) + static_cast<int64_t>(threads.y) - 1) / static_cast<int64_t>(threads.y));
+      blocks.x = static_cast<int>(static_cast<int64_t>(sizes[0]) / static_cast<int64_t>(threads.y));
     }
     else {
       blocks.x = static_cast<int>(sizes[0]);
     }
-  }
-  // We don't support JIT with rank 3 or higher yet
-  // else if constexpr (RANK == 3) {
-  //   if (!force_size) {
-  //     while (nt < max_cta_size) {
-  //       if (static_cast<index_t>(threads.x) * ept < sizes[2]) {
-  //         threads.x *= 2;
-  //       }
+  }
+  else if constexpr (RANK == 3) {
+    if (!force_size) {
+      while (nt < max_cta_size) {
+        if (static_cast<index_t>(threads.x) * ept < sizes[2]) {
+          threads.x *= 2;
+        }
+
+        nt *= 2;
+      }
+    }

-  //       nt *= 2;
-  //     }
-  //   }
+    // If we have multiple groups per block, we need to adjust the block size
+    if (threads.y > 1) {
+      blocks.x = static_cast<int>(static_cast<int64_t>(sizes[1]) / static_cast<int64_t>(threads.y));
+    }
+    else {
+      blocks.x = static_cast<int>(sizes[1]);
+    }

-  //   // launch as many blocks as necessary
-  //   blocks.x = static_cast<int>(sizes[1]);
-  //   blocks.y = static_cast<int>(sizes[0]);
+    // launch as many blocks as necessary
+    blocks.y = static_cast<int>(sizes[0]);

-  //   if(blocks.x > 65535) {
-  //     blocks.x = 65535;
-  //     stride = true;
-  //   }
-  //   if(blocks.y > 65535) {
-  //     blocks.y = 65535;
-  //     stride = true;
-  //   }
+    if(blocks.x > 65535) {
+      blocks.x = 65535;
+      stride = true;
+    }
+    if(blocks.y > 65535) {
+      blocks.y = 65535;
+      stride = true;
+    }

-  // }
-  // else if constexpr (RANK == 4) {
-  //   if (!force_size) {
-  //     while (nt < max_cta_size) {
-  //       if (static_cast<index_t>(threads.x) * ept < sizes[3]) {
-  //         threads.x *= 2;
-  //       }
+  }
+  else if constexpr (RANK == 4) {
+    if (!force_size) {
+      while (nt < max_cta_size) {
+        if (static_cast<index_t>(threads.x) * ept < sizes[3]) {
+          threads.x *= 2;
+        }

-  //       nt *= 2;
-  //     }
-  //   }
+        nt *= 2;
+      }
+    }

-  //   // launch as many blocks as necessary
-  //   blocks.x = static_cast<int>(sizes[2]);
-  //   blocks.y = static_cast<int>(sizes[1]);
-  //   blocks.z = static_cast<int>(sizes[0]);
+    // If we have multiple groups per block, we need to adjust the block size
+    if (threads.y > 1) {
+      blocks.x = static_cast<int>(static_cast<int64_t>(sizes[2]) / static_cast<int64_t>(threads.y));
+    }
+    else {
+      blocks.x = static_cast<int>(sizes[2]);
+    }
+
+    // launch as many blocks as necessary
+    blocks.y = static_cast<int>(sizes[1]);
+    blocks.z = static_cast<int>(sizes[0]);

-  //   if(blocks.x > 65535) {
-  //     blocks.x = 65535;
-  //     stride = true;
-  //   }
-  //   if(blocks.y > 65535) {
-  //     blocks.y = 65535;
-  //     stride = true;
-  //   }
-  //   if(blocks.z > 65535) {
-  //     blocks.z = 65535;
-  //     stride = true;
-  //   }
-  // }
-  else {
-    MATX_THROW(matxInvalidParameter, "Rank not supported");
-  }
+    if(blocks.x > 65535) {
+      blocks.x = 65535;
+      stride = true;
+    }
+    if(blocks.y > 65535) {
+      blocks.y = 65535;
+      stride = true;
+    }
+    if(blocks.z > 65535) {
+      blocks.z = 65535;
+      stride = true;
+    }
+  }

   MATX_LOG_DEBUG("Blocks {}x{}x{} Threads {}x{}x{} groups_per_block={}", blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, groups_per_block);
   return stride;
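
Two details of the new logic are worth calling out. First, the added assert makes exact division safe: the old ceiling division (sizes[0] + threads.y - 1) / threads.y becomes sizes[0] / threads.y because the second-to-last dimension must now be a multiple of groups_per_block. Second, any grid dimension above 65535 is clamped and the kernel falls back to grid-stride looping. A small standalone sketch of the rank-2 arithmetic (assumptions: threads.y carries groups_per_block, as in the diff, and the clamp mirrors the rank-3/4 branches):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Example rank-2 shape; sizes[0] must be divisible by groups_per_block,
      // which is what the new MATX_ASSERT_STR_EXP enforces.
      const int64_t sizes[2] = {1 << 20, 256};
      const int64_t groups_per_block = 4;  // maps to threads.y
      bool stride = false;

      int64_t blocks_x = (groups_per_block > 1) ? sizes[0] / groups_per_block
                                                : sizes[0];
      // Clamp oversized grids and switch to grid-stride looping.
      if (blocks_x > 65535) { blocks_x = 65535; stride = true; }

      std::printf("blocks.x=%lld stride=%d\n",
                  static_cast<long long>(blocks_x), static_cast<int>(stride));
      return 0;  // prints: blocks.x=65535 stride=1
    }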

include/matx/core/iterator.h

Lines changed: 13 additions & 13 deletions
@@ -48,8 +48,8 @@ namespace matx {
 template <typename OperatorType, bool ConvertType = true>
 struct RandomOperatorIterator {
   using self_type = RandomOperatorIterator<OperatorType, ConvertType>;
-  using value_type = typename std::conditional_t<ConvertType, detail::convert_matx_type_t<typename OperatorType::value_type>, typename OperatorType::value_type>;
-  // using stride_type = std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
+  using value_type = typename cuda::std::conditional_t<ConvertType, detail::convert_matx_type_t<typename OperatorType::value_type>, typename OperatorType::value_type>;
+  // using stride_type = cuda::std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
   //                     index_t>;
   using stride_type = index_t;
   using pointer = value_type*;
@@ -66,7 +66,7 @@ struct RandomOperatorIterator {
   __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorIterator(OperatorType &&t, stride_type offset) : t_(t), offset_(offset) {}

   template<typename T = OperatorType>
-  requires (!std::is_same_v<T, OperatorBaseType>)
+  requires (!cuda::std::is_same_v<T, OperatorBaseType>)
   __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorIterator(const OperatorBaseType &t, stride_type offset) : t_(t), offset_(offset) {}

   template<typename T = OperatorType>
@@ -193,8 +193,8 @@ __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t operator-(const RandomOper
 template <typename OperatorType, bool ConvertType = true>
 struct RandomOperatorOutputIterator {
   using self_type = RandomOperatorOutputIterator<OperatorType, ConvertType>;
-  using value_type = typename std::conditional_t<ConvertType, detail::convert_matx_type_t<typename OperatorType::value_type>, typename OperatorType::value_type>;
-  // using stride_type = std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
+  using value_type = typename cuda::std::conditional_t<ConvertType, detail::convert_matx_type_t<typename OperatorType::value_type>, typename OperatorType::value_type>;
+  // using stride_type = cuda::std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
   //                     index_t>;
   using stride_type = index_t;
   using pointer = value_type*;
@@ -211,11 +211,11 @@ struct RandomOperatorOutputIterator {
   __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorOutputIterator(OperatorType &&t, stride_type offset) : t_(t), offset_(offset) {}

   template<typename T = OperatorType>
-  requires (!std::is_same_v<T, OperatorBaseType>)
+  requires (!cuda::std::is_same_v<T, OperatorBaseType>)
   __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorOutputIterator(const OperatorBaseType &t, stride_type offset) : t_(t), offset_(offset) {}

   template<typename T = OperatorType>
-  requires (!std::is_same_v<T, OperatorBaseType>)
+  requires (!cuda::std::is_same_v<T, OperatorBaseType>)
   __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorOutputIterator(OperatorBaseType &&t, stride_type offset) : t_(t), offset_(offset) {}

   [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ reference operator*()
@@ -338,10 +338,10 @@ template <typename OperatorType, bool ConvertType = true>
 struct RandomOperatorThrustIterator {
   using self_type = RandomOperatorThrustIterator<OperatorType, ConvertType>;
   using const_strip_type = remove_cvref_t<typename OperatorType::value_type>;
-  using value_type = typename std::conditional_t<ConvertType,
+  using value_type = typename cuda::std::conditional_t<ConvertType,
                        detail::convert_matx_type_t<const_strip_type>,
                        const_strip_type>;
-  // using stride_type = std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
+  // using stride_type = cuda::std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
   //                     index_t>;
   using stride_type = index_t;
   using pointer = cuda::std::remove_const_t<value_type>*;
@@ -359,11 +359,11 @@ struct RandomOperatorThrustIterator {
   __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorThrustIterator(OperatorType &&t, stride_type offset) : t_(t), offset_(offset) {}

   template<typename T = OperatorType>
-  requires (!std::is_same_v<T, OperatorBaseType>)
+  requires (!cuda::std::is_same_v<T, OperatorBaseType>)
   __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorThrustIterator(const OperatorBaseType &t, stride_type offset) : t_(t), offset_(offset) {}

   template<typename T = OperatorType>
-  requires (!std::is_same_v<T, OperatorBaseType>)
+  requires (!cuda::std::is_same_v<T, OperatorBaseType>)
   __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ RandomOperatorThrustIterator(OperatorBaseType &&t, stride_type offset) : t_(t), offset_(offset) {}

   [[nodiscard]] __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ reference operator*() const
@@ -463,7 +463,7 @@ template <typename OperatorType>
 struct BeginOffset {
   using self_type = BeginOffset<OperatorType>;
   using value_type = index_t;
-  // using stride_type = std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
+  // using stride_type = cuda::std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
   //                     index_t>;
   using stride_type = index_t;
   using pointer = value_type*;
@@ -522,7 +522,7 @@ template <typename OperatorType>
 struct EndOffset {
   using self_type = EndOffset<OperatorType>;
   using value_type = index_t;
-  // using stride_type = std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
+  // using stride_type = cuda::std::conditional_t<is_tensor_view_v<OperatorType>, typename OperatorType::desc_type::stride_type,
   //                     index_t>;
   using stride_type = index_t;
   using pointer = value_type*;
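
The mechanical std:: → cuda::std:: swap matters because these iterators must now compile under NVRTC, where the host standard library headers are generally unavailable; libcu++ provides heterogeneous equivalents that work in host, device, and JIT-compiled code alike. A minimal sketch with illustrative types (not the MatX iterators themselves):

    #include <cuda/std/type_traits>

    // cuda::std traits behave like their std counterparts but are usable
    // in device code and under NVRTC.
    template <bool Convert, typename T>
    using converted_t = cuda::std::conditional_t<Convert, float, T>;

    static_assert(cuda::std::is_same_v<converted_t<true, double>, float>, "");
    static_assert(cuda::std::is_same_v<converted_t<false, double>, double>, "");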

include/matx/core/jit_includes.h

Lines changed: 5 additions & 2 deletions
@@ -34,10 +34,13 @@

 // This file is used for jitify/NVRTC preprocessing. Do NOT include any files in here that can't be
 // parsed on the device, and try to keep this minimal to avoid unnecessary dependencies.
+#include <cuda/barrier>
+#include <cuda/std/__algorithm/min.h>
+#include <cuda/std/__algorithm/max.h>
 #include "matx/core/defines.h"
 #include "matx/core/type_utils_both.h"
 #include "matx/core/vector.h"
-#include "matx/operators/scalar_internal.h"
+//#include "matx/operators/scalar_internal.h"
+#include "matx/operators/scalar_ops.h"
 #include "matx/core/operator_utils.h"
-#include <cuda/barrier>
 #include <cub/block/block_load_to_shared.cuh>

include/matx/core/operator_options.h

Lines changed: 10 additions & 0 deletions
@@ -127,6 +127,16 @@ enum class SVDHostAlgo {
   DC /**< Divide and Conquer method (corresponds to `gesdd`) */
 };

+/**
+ * @brief Padding mode
+ *
+ * Specifies the padding mode to use for the pad operator.
+ */
+enum PadMode {
+  MATX_PAD_MODE_CONSTANT, ///< Constant padding mode. All padding elements will be set to the user-provided pad_value.
+  MATX_PAD_MODE_EDGE      ///< Edge padding mode. All padding elements will be set to the edge values of the original operator.
+};
+

 namespace detail {
 static constexpr int MAX_FFT_RANK = 2;
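
To make the two modes concrete, here is a self-contained 1-D toy of the semantics the enum comments describe (illustrative only; it is not the MatX pad operator, whose signature does not appear in this diff):

    #include <cstdio>

    // Stand-in enum mirroring the one added above.
    enum PadMode { MATX_PAD_MODE_CONSTANT, MATX_PAD_MODE_EDGE };

    // Value of a length-(n + 2*pad) padded view at index i, for 1-D input d.
    float padded_at(const float *d, int n, int i, int pad, PadMode mode,
                    float pad_value) {
      const int src = i - pad;                 // index into the original data
      if (src >= 0 && src < n) return d[src];  // interior: pass through
      if (mode == MATX_PAD_MODE_CONSTANT) return pad_value;
      return src < 0 ? d[0] : d[n - 1];        // edge mode: replicate boundary
    }

    int main() {
      const float d[] = {1.f, 2.f, 3.f};
      for (int i = 0; i < 7; i++)
        std::printf("%g ", padded_at(d, 3, i, 2, MATX_PAD_MODE_EDGE, 0.f));
      std::printf("\n");  // prints: 1 1 1 2 3 3 3
      return 0;
    }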
