
Commit 758d3e8

Add unsafe aliased memory checking system (#1079)
Implement detection of unsafe memory aliasing between input and output tensors in transform operations. Enabled via the MATX_EN_UNSAFE_ALIAS_DETECTION flag. Includes a can_alias() trait function and a matmul aliasing unit test.
1 parent 2be1dcf commit 758d3e8

35 files changed: +745 −126 lines

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -80,6 +80,7 @@ option(MATX_EN_COMPLEX_OP_NAN_CHECKS "Enable full NaN/Inf handling for complex m
option(MATX_EN_CUDA_LINEINFO "Enable line information for CUDA kernels via -lineinfo nvcc flag" OFF)
option(MATX_EN_EXTENDED_LAMBDA "Enable extended lambda support for device/host lambdas" ON)
option(MATX_EN_MATHDX "Enable MathDx support for kernel fusion" OFF)
+option(MATX_EN_UNSAFE_ALIAS_DETECTION "Enable aliased memory detection" OFF)

set(MATX_EN_PYBIND11 OFF CACHE BOOL "Enable pybind11 support")

@@ -212,6 +213,10 @@ else()
  set(MATX_NVPL_INT_TYPE "ilp64")
endif()

+if (MATX_EN_UNSAFE_ALIAS_DETECTION)
+  target_compile_definitions(matx INTERFACE MATX_EN_UNSAFE_ALIAS_DETECTION)
+endif()
+
# Host support
if (MATX_EN_NVPL OR MATX_EN_X86_FFTW OR MATX_EN_BLIS OR MATX_EN_OPENBLAS)
  message(STATUS "Enabling OpenMP support")
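
The docs added below note that the same flag can also be supplied as a compiler define rather than through this CMake option. A minimal sketch of that route, assuming MatX's header-only layout makes a pre-include define equivalent to passing -DMATX_EN_UNSAFE_ALIAS_DETECTION on the compile line:

    // Hedged sketch: enable alias detection without the CMake option.
    // Assumes this definition is visible before any MatX header is included.
    #define MATX_EN_UNSAFE_ALIAS_DETECTION
    #include "matx.h"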

docs_input/basics/debug.rst

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
.. _debugging:

Debugging
#########

MatX employs several tools for debugging and improving the correctness of the code.

Logging
--------

MatX provides a logging system that can be used to log messages to the console. This is useful for debugging your code and for tracing its execution.

See :ref:`logging_basics` for more information on the logging system.

Compile Time
------------

At compile time MatX uses `static_assert` calls where possible to provide helpful error messages. Static assertions have the limitation that
they cannot display a formatted string, so the values of the invalid parameters are not shown. Common compile-time errors include:

- Invalid rank
- Invalid type
- Invalid tensor shapes (for static tensor sizes)

Runtime
-------

At runtime MatX uses C++ exceptions to report errors. These errors are typically based on expected vs. actual outcomes. Several macros are used
to raise these errors:

- MATX_ASSERT (boolean assertion)
- MATX_ASSERT_STR (boolean assertion with a formatted string)
- MATX_ASSERT_STR_EXP (boolean assertion with a formatted string and an expected value)

These macros are listed in order of usefulness, with `MATX_ASSERT_STR_EXP` providing the most information to the user. Common
runtime errors include:

- Invalid sizes
- Invalid indexing
- Errors returned from CUDA APIs


Null Pointer Checking
---------------------

Tensors in MatX may be left uninitialized on declaration. This is common when a tensor is used as a class member and is not initialized in the constructor. For example:

.. code-block:: cpp

   class MyClass {
     public:
       MyClass() {
       }
     private:
       tensor_t<float> t; // Uninitialized
   };

Typically `make_tensor` is called at a later time to set the shape and allocate the memory backing the tensor. Detecting an uninitialized tensor on the device
has a non-zero performance penalty and is disabled by default. To detect an uninitialized tensor on the device, build your application in debug mode with the
`NDEBUG` flag undefined. When `NDEBUG` is undefined, MatX checks for uninitialized tensors on the device and asserts if one is found.

Unsafe Aliased Memory Checking
------------------------------

MatX provides an imperfect unsafe aliased memory checking system that can detect when an input tensor may overlap with output tensor memory,
causing a data race. The word *unsafe* is used here because there are cases where aliasing is safe, such as a direct element-wise operation.
Achieving a false-positive rate of zero would require checking every possible input and output location for overlap, which would be impractical
for most applications. Instead, several checks are used that catch the most common cases of memory aliasing. Since alias checking can be
expensive and is not perfect, it must be explicitly enabled with the CMake option `MATX_EN_UNSAFE_ALIAS_DETECTION` or the compiler define
of the same name.

The types of aliasing that can be detected are:

- Safe element-wise aliasing: (a = a + a) // No hazard since it's a direct element-wise operation
- Safe element-wise aliasing: (slice(a, {0}, {5}) = slice(a, {0}, {5}) - slice(a, {0}, {5})) // No hazard since it's a direct element-wise operation
- Unsafe element-wise aliasing: (slice(a, {0}, {5}) = slice(a, {3}, {8}) - slice(a, {0}, {5})) // Unsafe since the input and output slices overlap at different offsets
- Unsafe matrix multiplication: (c = matmul(c, d)) // Unsafe since matmul doesn't allow aliasing between input and output memory
- Safe FFT: (c = fft(c)) // Safe since FFT supports in-place operation
- False positive: (slice(a, {0}, {6}, {2}) = slice(a, {0}, {6}, {2}) + slice(a, {0}, {6}, {2})) // Non-unity strides currently trigger a false positive
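
A minimal usage sketch of the safe and unsafe cases listed above, assuming detection was enabled at build time and using the usual make_tensor/run(executor) workflow; the tensor sizes are illustrative, and what MatX reports when an unsafe alias is found (log, assertion, etc.) is not shown here:

    #include "matx.h"

    int main() {
      matx::cudaExecutor exec{};
      auto a = matx::make_tensor<float>({10});
      auto c = matx::make_tensor<cuda::std::complex<float>>({8, 8});
      auto d = matx::make_tensor<cuda::std::complex<float>>({8, 8});

      (a = a + a).run(exec);               // Safe: direct element-wise aliasing
      (c = matx::fft(c)).run(exec);        // Safe: FFT supports in-place operation
      (c = matx::matmul(c, d)).run(exec);  // Unsafe: matmul input aliases its output
      exec.sync();
      return 0;
    }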

include/matx/core/capabilities.h

Lines changed: 13 additions & 0 deletions
@@ -68,6 +68,8 @@ namespace detail {
    SET_GROUPS_PER_BLOCK, // Set the number of groups per block for the operator.
    ASYNC_LOADS_REQUESTED, // Whether the operator requires asynchronous loads.
    MAX_EPT_VEC_LOAD, // The maximum EPT for a vector load.
+    ELEMENT_WISE, // Whether the operator is element-wise (safe with aliasing)
+    ALIASED_MEMORY, // Whether the operator's input and output pointers alias
    // Add more capabilities as needed
  };

@@ -139,6 +141,15 @@
    static constexpr bool and_identity = true;
  };

+  template <>
+  struct capability_attributes<OperatorCapability::ALIASED_MEMORY> {
+    using type = bool;
+    using input_type = AliasedMemoryQueryInput;
+    static constexpr bool default_value = false;
+    static constexpr bool or_identity = false;
+    static constexpr bool and_identity = true;
+  };
+
  template <>
  struct capability_attributes<OperatorCapability::GROUPS_PER_BLOCK> {
    using type = cuda::std::array<int, 2>; // min/max elements per thread

@@ -266,6 +277,8 @@
      return CapabilityQueryType::AND_QUERY; // The expression should use the range of groups per block of its children.
    case OperatorCapability::GROUPS_PER_BLOCK:
      return CapabilityQueryType::RANGE_QUERY; // The expression should use the range of groups per block of its children.
+    case OperatorCapability::ALIASED_MEMORY:
+      return CapabilityQueryType::OR_QUERY; // The expression aliases if any of its children alias.
    case OperatorCapability::MAX_EPT_VEC_LOAD:
      return CapabilityQueryType::MIN_QUERY; // The expression should use the minimum EPT for a vector load of its children.
    case OperatorCapability::JIT_CLASS_QUERY:
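
The OR_QUERY chosen for ALIASED_MEMORY can be illustrated with a standalone sketch (plain std:: types, not MatX's internal query machinery): an expression reports aliasing if any of its children do, starting from the or_identity value declared above.

    #include <array>
    #include <cstddef>

    // Illustration only: fold child ALIASED_MEMORY results with logical OR.
    constexpr bool or_identity = false;  // mirrors capability_attributes<ALIASED_MEMORY>::or_identity

    template <std::size_t N>
    constexpr bool aliased_memory_or_query(std::array<bool, N> child_results) {
      bool result = or_identity;
      for (std::size_t i = 0; i < N; ++i) {
        result = result || child_results[i];  // one aliasing child flags the whole expression
      }
      return result;
    }

    static_assert(aliased_memory_or_query(std::array<bool, 3>{false, true, false}));
    static_assert(!aliased_memory_or_query(std::array<bool, 2>{false, false}));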

include/matx/core/operator_options.h

Lines changed: 8 additions & 0 deletions
@@ -168,6 +168,7 @@ namespace detail {

  // Input structure for types that require it

+  // Capabilities structures

  struct EPTQueryInput {
    bool jit;

@@ -187,6 +188,13 @@
    int groups_per_block;
  };

+  struct AliasedMemoryQueryInput {
+    bool permutes_input_output;
+    bool is_prerun;
+    void *start_ptr;
+    void *end_ptr;
+  };
+
}

};
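
For illustration, a hedged sketch of how a caller might describe an output buffer with this structure as the half-open byte range [data, data + bytes); the field semantics are inferred from the names and from the tensor_impl.h hunk below, and the struct is mirrored locally so the sketch stands alone:

    #include <cstddef>

    // Local mirror of the struct above so this sketch compiles on its own.
    struct AliasedMemoryQueryInput {
      bool permutes_input_output;
      bool is_prerun;
      void *start_ptr;
      void *end_ptr;
    };

    // Hypothetical helper, not part of MatX.
    AliasedMemoryQueryInput make_alias_query(void *data, std::size_t bytes,
                                             bool permutes, bool prerun) {
      return AliasedMemoryQueryInput{permutes, prerun, data,
                                     static_cast<char *>(data) + bytes};
    }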

include/matx/core/tensor_impl.h

Lines changed: 43 additions & 0 deletions
@@ -1462,6 +1462,49 @@ MATX_IGNORE_WARNING_POP_GCC
      return false;
#endif
    }
+    else if constexpr (Cap == OperatorCapability::ALIASED_MEMORY) {
+      // Check if this tensor's memory overlaps with the query input range
+      static_assert(std::is_same_v<remove_cvref_t<InType>, detail::AliasedMemoryQueryInput>,
+                    "ALIASED_MEMORY capability requires AliasedMemoryQueryInput");
+
+      // Rank-0 (scalar) tensors don't need aliasing checks
+      if constexpr (Rank() == 0) {
+        return false;
+      }
+      else if constexpr (is_sparse_data_v<TensorData>) {
+        return false;
+      }
+      else {
+        // The logic to detect overlaps is as follows: if the tensor's first and last pointers match the
+        // query range exactly (a complete overlap, e.g. (a = a)), the access only aliases when the tensor
+        // is non-contiguous or the operation permutes the input and output. Otherwise we have a partial
+        // overlap, and a partial overlap is always reported as a potential alias.

+        // Get address of first element using operator()(0, 0, ...)
+        auto get_first = [this]<size_t... Is>(cuda::std::index_sequence<Is...>) {
+          return &(const_cast<tensor_impl_t*>(this)->operator()(static_cast<index_t>(Is*0)...));
+        };
+        void* tensor_start = static_cast<void*>(const_cast<T*>(get_first(cuda::std::make_index_sequence<Rank()>{})));

+        // Get address of last element using operator()(Size(0)-1, Size(1)-1, ...)
+        auto get_last = [this]<size_t... Is>(cuda::std::index_sequence<Is...>) {
+          return &(const_cast<tensor_impl_t*>(this)->operator()(static_cast<index_t>(Size(Is)-1)...));
+        };
+        void* tensor_end = static_cast<void*>(static_cast<char*>(static_cast<void*>(const_cast<T*>(get_last(cuda::std::make_index_sequence<Rank()>{})))) + sizeof(T));

+        bool complete_overlap = tensor_start == in.start_ptr && tensor_end == in.end_ptr;
+        if (complete_overlap) {
+          MATX_LOG_TRACE("Complete overlap of tensors. Contiguous: {}", IsContiguous());
+          return !IsContiguous() || in.permutes_input_output;
+        }

+        // Check for overlap: two ranges [a1, a2) and [b1, b2) overlap if a1 < b2 && b1 < a2
+        bool overlaps = (tensor_start < in.end_ptr) && (in.start_ptr < tensor_end);

+        MATX_LOG_TRACE("Overlap of tensors: {}", overlaps);
+        return overlaps;
+      }
+    }
    else {
      return detail::capability_attributes<Cap>::default_value;
    }
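
The partial-overlap test in the hunk above is the standard half-open interval check; restated as a standalone helper (hypothetical name; pointers compared as integers, since relational comparison of unrelated pointers is unspecified in standard C++):

    #include <cstdint>

    // [a_start, a_end) and [b_start, b_end) overlap iff a_start < b_end && b_start < a_end.
    inline bool ranges_overlap(const void *a_start, const void *a_end,
                               const void *b_start, const void *b_end) {
      const auto a1 = reinterpret_cast<std::uintptr_t>(a_start);
      const auto a2 = reinterpret_cast<std::uintptr_t>(a_end);
      const auto b1 = reinterpret_cast<std::uintptr_t>(b_start);
      const auto b2 = reinterpret_cast<std::uintptr_t>(b_end);
      return a1 < b2 && b1 < a2;
    }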

include/matx/core/type_utils_both.h

Lines changed: 22 additions & 0 deletions
@@ -172,6 +172,28 @@ template <typename T> constexpr __MATX_HOST__ __MATX_DEVICE__ bool is_matx_trans
  return detail::is_matx_transform_op_impl<typename remove_cvref<T>::type>::value;
}

+namespace detail {
+template <typename T, typename = void>
+struct has_can_alias_impl : cuda::std::false_type {
+};
+
+template <typename T>
+struct has_can_alias_impl<T, cuda::std::void_t<typename remove_cvref_t<T>::can_alias>> : cuda::std::true_type {
+};
+}
+
+/**
+ * @brief Determine if operator can alias
+ *
+ * Returns true if the type is a transform operator and has the can_alias trait set
+ *
+ * @tparam T Type to test
+ */
+template <typename T> constexpr __MATX_HOST__ __MATX_DEVICE__ bool can_alias()
+{
+  return is_matx_transform_op<T>() && detail::has_can_alias_impl<typename remove_cvref<T>::type>::value;
+}
+
namespace detail {
template <typename T, typename = void>
struct has_matx_op_type : cuda::std::false_type {
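
The detection idiom added here can be restated with standard library types (std:: instead of cuda::std::, toy operator types instead of MatX transforms) to show what the trait keys on: only the presence of a nested can_alias alias matters, not its value.

    #include <type_traits>

    // Standalone restatement of the idiom above using std:: traits.
    template <typename T, typename = void>
    struct has_can_alias : std::false_type {};

    template <typename T>
    struct has_can_alias<T, std::void_t<typename std::decay_t<T>::can_alias>>
        : std::true_type {};

    // Hypothetical toy operators, not MatX types.
    struct FftLikeOp  { using can_alias = void; };  // opts in to input/output aliasing
    struct GemmLikeOp {};                           // does not

    static_assert(has_can_alias<FftLikeOp>::value);
    static_assert(!has_can_alias<GemmLikeOp>::value);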

include/matx/generators/alternate.h

Lines changed: 0 additions & 11 deletions
@@ -60,17 +60,6 @@ namespace matx
      }
    }

-    template <OperatorCapability Cap>
-    __MATX_INLINE__ __MATX_HOST__ auto get_capability_proc() const {
-      if constexpr (Cap == OperatorCapability::ELEMENTS_PER_THREAD) {
-        const auto my_cap = cuda::std::array<ElementsPerThread, 2>{ElementsPerThread::ONE, ElementsPerThread::ONE};
-        return my_cap;
-      } else {
-        auto self_has_cap = detail::capability_attributes<Cap>::default_value;
-        return self_has_cap;
-      }
-    }
-
    template <typename CapType>
    __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto operator()(index_t i) const
    {

include/matx/generators/diag.h

Lines changed: 0 additions & 10 deletions
@@ -68,16 +68,6 @@ namespace matx
      }
    }

-    template <OperatorCapability Cap>
-    __MATX_INLINE__ __MATX_HOST__ auto get_capability_proc() const {
-      if constexpr (Cap == OperatorCapability::ELEMENTS_PER_THREAD) {
-        const auto my_cap = cuda::std::array<ElementsPerThread, 2>{ElementsPerThread::ONE, ElementsPerThread::ONE};
-        return my_cap;
-      } else {
-        return detail::capability_attributes<Cap>::default_value;
-      }
-    }
-
    // Does not support vectorization yet
    template <typename CapType, typename... Is>
    __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const {

include/matx/generators/linspace.h

Lines changed: 0 additions & 11 deletions
@@ -79,17 +79,6 @@ namespace matx
      }
    }

-    template <OperatorCapability Cap>
-    __MATX_INLINE__ __MATX_HOST__ auto get_capability_proc() const {
-      if constexpr (Cap == OperatorCapability::ELEMENTS_PER_THREAD) {
-        const auto my_cap = cuda::std::array<ElementsPerThread, 2>{ElementsPerThread::ONE, ElementsPerThread::ONE};
-        return my_cap;
-      } else {
-        auto self_has_cap = detail::capability_attributes<Cap>::default_value;
-        return self_has_cap;
-      }
-    }
-
    template <typename CapType, typename... Is>
    __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ auto operator()(Is... indices) const {
      static_assert(sizeof...(indices) == NUM_RC, "Number of indices incorrect in linspace");

include/matx/generators/logspace.h

Lines changed: 0 additions & 10 deletions
@@ -98,16 +98,6 @@ namespace matx
      }
    }

-    template <OperatorCapability Cap>
-    __MATX_INLINE__ __MATX_HOST__ auto get_capability_proc() const {
-      if constexpr (Cap == OperatorCapability::ELEMENTS_PER_THREAD) {
-        const auto my_cap = cuda::std::array<ElementsPerThread, 2>{ElementsPerThread::ONE, ElementsPerThread::ONE};
-        return my_cap;
-      } else {
-        auto self_has_cap = detail::capability_attributes<Cap>::default_value;
-        return self_has_cap;
-      }
-    }

    __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ auto operator()(index_t idx) const
    {
