Commit 2be1dcf

CUDA JIT Support (#1071)
Implement runtime compilation (RTC/JIT) with C++20 logging and operator improvements. Note that with this commit C++20 is now *required* to use MatX. This commit series introduces major infrastructure changes to support Just-In-Time (JIT) compilation via NVRTC, along with a modern logging system and extensive operator refactoring.

### Major Changes:

**Runtime Compilation (RTC/JIT):**
- Add comprehensive NVRTC support with new jit_cuda.h executor (573 lines)
- Create jit_kernel infrastructure for runtime kernel compilation
- Implement cuda_executor_common.h with shared executor functionality
- Add operator_options.h for JIT compilation control
- Introduce cuFFTDx support with new FFT implementation (389 lines)
- Add FindMathDx.cmake module for cuFFTDx library detection
- Create jit_includes.h to manage headers for JIT compilation
- Enhance cache.h with JIT compilation caching (471 lines, +408 additions)

**C++20 Logging System:**
- Implement zero-overhead logging system in log.h (334 lines)
- Add comprehensive logging documentation (544 lines)
- Create test_logging.cu and test_logging_comprehensive.cu test suites
- Refactor logging calls throughout operators and transforms
- Support compile-time log level filtering for zero runtime overhead

**Type System Refactoring:**
- Split type_utils.h into type_utils.h and type_utils_both.h
- Move 1000+ lines to type_utils_both.h for host/device compatibility
- Add reduce_utils.h (104 lines) for reduction operations
- Fix missing <cuda/std/utility> includes

**Operator Enhancements:**
- Add new apply_idx operator with comprehensive test suite (706 test lines)
- Create scalar_internal.h (283 lines) for internal scalar operations
- Refactor 100+ operators to support JIT compilation
- Update FFT operators with cuFFTDx integration
- Enhance cumsum, sort, sum, and other reduction operators
- Improve operator_utils.h with better JIT support (368 lines)

**Infrastructure Improvements:**
- Refactor cuda.h executor (426 lines restructured)
- Update tensor_impl.h with better JIT support (+302 lines)
- Enhance capabilities.h for better compile-time feature detection
- Improve get_grid_dims.h for kernel launch configuration
- Add cub_device.h (152 lines) for CUB device operations

**Documentation & Examples:**
- Update fusion.rst with JIT compilation examples
- Revise build.rst with new build options
- Update developer_guide for operator development
- Enhance black_scholes.cu example with JIT features

This represents a significant architectural enhancement enabling runtime code generation and compilation, improving performance through JIT optimization, and providing a modern, efficient logging framework.
1 parent ad55c6b commit 2be1dcf
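
As a quick orientation for the new executor path, here is a minimal usage sketch assembled from the fusion.rst and sync.rst changes in this commit. The tensor types and sizes are illustrative, and `CUDAJITExecutor`, `cudaExecutor`, and `jit_supported()` follow the names used in the documentation added here rather than a verified public header:

```cpp
// Illustrative usage only, based on the fusion.rst and sync.rst changes in this
// commit; MATX_EN_MATHDX / MATX_EN_JIT must be enabled at build time for the
// JIT path to be available.
#include "matx.h"

int main() {
  using namespace matx;

  auto A = make_tensor<cuda::std::complex<float>>({1024});
  auto B = make_tensor<cuda::std::complex<float>>({1024});
  auto C = make_tensor<cuda::std::complex<float>>({1024});

  // Non-JIT path: the FFT runs through cuFFT into a temporary, then the
  // multiply runs as a separate element-wise kernel (at least two kernels).
  (A = B * fft(C)).run(cudaExecutor{});

  // JIT path: when supported, the FFT and multiply are fused into a single
  // NVRTC/cuFFTDx-compiled kernel; otherwise the JIT executor falls back to
  // the non-JIT path.
  auto expr = B * fft(C);
  if (jit_supported(expr)) {
    (A = expr).run(CUDAJITExecutor{});
  } else {
    (A = expr).run(cudaExecutor{});
  }

  return 0;
}
```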

196 files changed: +10300 additions, -3906 deletions

CMakeLists.txt

Lines changed: 32 additions & 4 deletions

@@ -79,6 +79,7 @@ option(MATX_EN_COVERAGE OFF "Enable code coverage reporting")
 option(MATX_EN_COMPLEX_OP_NAN_CHECKS "Enable full NaN/Inf handling for complex multiplication and division" OFF)
 option(MATX_EN_CUDA_LINEINFO "Enable line information for CUDA kernels via -lineinfo nvcc flag" OFF)
 option(MATX_EN_EXTENDED_LAMBDA "Enable extended lambda support for device/host lambdas" ON)
+option(MATX_EN_MATHDX "Enable MathDx support for kernel fusion" OFF)
 
 set(MATX_EN_PYBIND11 OFF CACHE BOOL "Enable pybind11 support")
 
@@ -96,9 +97,9 @@ if (MATX_BUILD_DOCS)
   add_subdirectory(docs_input)
 endif()
 
-# MatX requires C++17 to build. Enforce on all libraries pulled in as well
-set(CMAKE_CXX_STANDARD 17)
-set(CUDA_CXX_STANDARD 17)
+# MatX requires C++20 to build. Enforce on all libraries pulled in as well
+set(CMAKE_CXX_STANDARD 20)
+set(CUDA_CXX_STANDARD 20)
 
 if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
   execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
@@ -124,6 +125,8 @@ target_include_directories(matx INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOU
                                           "$<INSTALL_INTERFACE:include>")
 target_include_directories(matx INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/matx/kernels>"
                                           "$<INSTALL_INTERFACE:include/matx/kernels>")
+
+
 target_compile_features(matx INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
 
 # 11.2 and above required for async allocation
@@ -141,7 +144,7 @@ target_link_libraries(matx INTERFACE CCCL::CCCL)
 
 # Set flags for compiling tests faster (only for nvcc)
 if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
-    set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
+    set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0 --extended-lambda)
 endif()
 
 # Hack because CMake doesn't have short circult evaluation
@@ -304,6 +307,31 @@ if (MATX_EN_CUTENSOR)
   target_link_libraries(matx INTERFACE "-Wl,--disable-new-dtags")
 endif()
 
+if (MATX_EN_MATHDX)
+  set(MathDx_VERSION 25.06)
+  set(MathDx_NANO 0)
+  include(cmake/FindMathDx.cmake)
+  target_compile_definitions(matx INTERFACE MATX_EN_MATHDX)
+  target_compile_definitions(matx INTERFACE MATX_EN_JIT)
+
+  # Add NVRTC configuration as compiler definitions
+  list(GET CMAKE_CUDA_ARCHITECTURES 0 NVRTC_CUDA_ARCH)
+  # Strip -real or -virt postfix if present
+  string(REGEX REPLACE "-real$" "" NVRTC_CUDA_ARCH "${NVRTC_CUDA_ARCH}")
+  string(REGEX REPLACE "-virtual$" "" NVRTC_CUDA_ARCH "${NVRTC_CUDA_ARCH}")
+  target_compile_definitions(matx INTERFACE NVRTC_CUDA_ARCH="${NVRTC_CUDA_ARCH}")
+  target_compile_definitions(matx INTERFACE NVRTC_CXX_STANDARD="${CMAKE_CXX_STANDARD}")
+
+  # Link libmathdx if available
+  if(TARGET libmathdx::libmathdx)
+    target_link_libraries(matx INTERFACE libmathdx::libmathdx)
+    message(STATUS "Linked libmathdx to matx target")
+  endif()
+
+  # Link mathdx components
+  target_link_libraries(matx INTERFACE mathdx::cufftdx CUDA::nvrtc)
+endif()
+
 if (MATX_EN_CUDSS)
   set(cuDSS_VERSION 0.7.0.20)
   include(cmake/FindcuDSS.cmake)
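
The `NVRTC_CUDA_ARCH` and `NVRTC_CXX_STANDARD` definitions exported above are plain strings handed to the consuming code. Below is a hedged, hypothetical sketch of how an NVRTC-based JIT path could consume them; it is not the jit_cuda.h code from this commit, the `scale` kernel source is a placeholder, and the fallback defaults are assumptions:

```cpp
// Standalone NVRTC sketch: build compile options from the CMake-exported
// NVRTC_CUDA_ARCH / NVRTC_CXX_STANDARD strings and compile a toy kernel.
// Requires linking against CUDA::nvrtc as done in this commit.
#include <nvrtc.h>
#include <cstdio>
#include <string>
#include <vector>

#ifndef NVRTC_CUDA_ARCH
#define NVRTC_CUDA_ARCH "90"   // assumption: first entry of CMAKE_CUDA_ARCHITECTURES
#endif
#ifndef NVRTC_CXX_STANDARD
#define NVRTC_CXX_STANDARD "20"
#endif

int main() {
  const char *src =
      "extern \"C\" __global__ void scale(float *x, float s, int n) {\n"
      "  int i = blockIdx.x * blockDim.x + threadIdx.x;\n"
      "  if (i < n) x[i] *= s;\n"
      "}\n";

  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, src, "scale.cu", 0, nullptr, nullptr);

  // Turn the exported strings into NVRTC command-line options.
  const std::string arch    = std::string("--gpu-architecture=compute_") + NVRTC_CUDA_ARCH;
  const std::string std_opt = std::string("--std=c++") + NVRTC_CXX_STANDARD;
  const char *opts[] = {arch.c_str(), std_opt.c_str()};

  if (nvrtcCompileProgram(prog, 2, opts) != NVRTC_SUCCESS) {
    size_t log_size = 0;
    nvrtcGetProgramLogSize(prog, &log_size);
    std::vector<char> log(log_size);
    nvrtcGetProgramLog(prog, log.data());
    std::printf("NVRTC log:\n%s\n", log.data());
  } else {
    size_t ptx_size = 0;
    nvrtcGetPTXSize(prog, &ptx_size);
    std::printf("Compiled PTX size: %zu bytes\n", ptx_size);
  }

  nvrtcDestroyProgram(&prog);
  return 0;
}
```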

README.md

Lines changed: 2 additions & 2 deletions

@@ -50,9 +50,9 @@ are necessary
 ## Requirements
 MatX support is currently limited to **Linux only** due to the time to test Windows. If you'd like to voice your support for native Windows support using Visual Studio, please comment on the issue here: https://github.com/NVIDIA/MatX/issues/153.
 
-**Note**: CUDA 12.0.0 through 12.2.0 have an issue that causes building MatX unit tests to show a compiler error or cause a segfault in the compiler. Please use CUDA 11.8 or CUDA 12.2.1+ with MatX.
+**Note**: CUDA 12.0.0 through 12.2.0 have an issue that causes building MatX unit tests to show a compiler error or cause a segfault in the compiler. Please use CUDA 12.2.1+ with MatX.
 
-MatX is using features in C++17 and the latest CUDA compilers and libraries. For this reason, when running with GPU support, CUDA 11.8 and g++9, nvc++ 24.5, or clang 17 or newer is required. You can download the CUDA Toolkit [here](https://developer.nvidia.com/cuda-downloads).
+MatX is using features in C++20 and the latest CUDA compilers and libraries. For this reason, when running with GPU support, CUDA 12.2.1 and g++9, nvc++ 24.5, or clang 17 or newer is required. You can download the CUDA Toolkit [here](https://developer.nvidia.com/cuda-downloads).
 
 MatX has been tested on and supports Volta, Ampere, Ada, Hopper, and Blackwell GPU architectures. Jetson products are supported with Jetpack 5.0 or above.

cmake/FindMathDx.cmake

Lines changed: 165 additions & 0 deletions

@@ -0,0 +1,165 @@
+#=============================================================================
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+#[=======================================================================[.rst:
+FindMathDx
+--------
+
+Find MathDx
+
+Imported targets
+^^^^^^^^^^^^^^^^
+
+This module defines the following :prop_tgt:`IMPORTED` target(s):
+
+``MathDx::MathDx``
+  The MathDx library, if found.
+
+Result variables
+^^^^^^^^^^^^^^^^
+
+This module will set the following variables in your project:
+
+``MathDx_FOUND``
+  True if MathDx is found.
+``MathDx_INCLUDE_DIRS``
+  The include directories needed to use MathDx.
+``MathDx_VERSION_STRING``
+  The version of the MathDx library found. [OPTIONAL]
+
+#]=======================================================================]
+set(MathDx_VERSION_FULL ${MathDx_VERSION}.${MathDx_NANO})
+
+# Prefer using a Config module if it exists for this project
+set(MathDx_NO_CONFIG FALSE)
+if(NOT MathDx_NO_CONFIG)
+  find_package(MathDx CONFIG QUIET HINTS ${MathDx_DIR})
+  if(MathDx_FOUND)
+    find_package_handle_standard_args(MathDx DEFAULT_MSG MathDx_CONFIG)
+    return()
+  endif()
+endif()
+
+find_path(MathDx_INCLUDE_DIR NAMES MathDx.h)
+
+# Search for the MathDx library
+find_library(MathDx_LIBRARY
+  NAMES MathDx mathdx
+  HINTS ${MathDx_DIR}
+  PATH_SUFFIXES lib lib64
+)
+
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
+
+find_package_handle_standard_args(MathDx
+  REQUIRED_VARS MathDx_LIBRARY MathDx_INCLUDE_DIR
+  VERSION_VAR )
+
+if(NOT MathDx_FOUND)
+  set(MathDx_FILENAME libMathDx-linux-x86_64-${MathDx_VERSION}-archive)
+
+  message(STATUS "MathDx not found. Downloading library. By continuing this download you accept to the license terms of MathDx")
+
+  CPMAddPackage(
+    NAME MathDx
+    VERSION ${MathDx_VERSION}
+    URL https://developer.download.nvidia.com/compute/cuFFTDx/redist/cuFFTDx/nvidia-mathdx-${MathDx_VERSION_FULL}.tar.gz
+    DOWNLOAD_ONLY YES
+  )
+endif()
+
+# Download libmathdx based on CUDA version and platform
+# Detect CUDA version (12 or 13)
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  set(LIBMATHDX_CUDA_VERSION "cuda13")
+  set(LIBMATHDX_CUDA_SUFFIX "cuda13.0")
+else()
+  set(LIBMATHDX_CUDA_VERSION "cuda12")
+  set(LIBMATHDX_CUDA_SUFFIX "cuda12.0")
+endif()
+
+# Detect platform
+if(WIN32)
+  set(LIBMATHDX_PLATFORM "win32-x86_64")
+  set(LIBMATHDX_EXT "zip")
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+    set(LIBMATHDX_PLATFORM "Linux-aarch64")
+  else()
+    set(LIBMATHDX_PLATFORM "Linux-x86_64")
+  endif()
+  set(LIBMATHDX_EXT "tar.gz")
+else()
+  message(WARNING "Unsupported platform for libmathdx download")
+endif()
+
+# Set libmathdx version
+set(LIBMATHDX_VERSION "0.2.3")
+
+# Download libmathdx if platform is supported
+if(DEFINED LIBMATHDX_PLATFORM)
+  set(LIBMATHDX_URL "https://developer.nvidia.com/downloads/compute/cublasdx/redist/cublasdx/${LIBMATHDX_CUDA_VERSION}/libmathdx-${LIBMATHDX_PLATFORM}-${LIBMATHDX_VERSION}-${LIBMATHDX_CUDA_SUFFIX}.${LIBMATHDX_EXT}")
+
+  message(STATUS "Downloading libmathdx for ${LIBMATHDX_PLATFORM} with ${LIBMATHDX_CUDA_VERSION}")
+  message(STATUS "libmathdx URL: ${LIBMATHDX_URL}")
+
+  CPMAddPackage(
+    NAME libmathdx
+    VERSION ${LIBMATHDX_VERSION}
+    URL ${LIBMATHDX_URL}
+    DOWNLOAD_ONLY YES
+  )
+
+  # Add libmathdx to the search paths
+  set(LIBMATHDX_ROOT "${PROJECT_BINARY_DIR}/_deps/libmathdx-src")
+  list(APPEND CMAKE_PREFIX_PATH "${LIBMATHDX_ROOT}")
+
+  # Find libmathdx library file
+  find_library(LIBMATHDX_LIBRARY
+    NAMES mathdx libmathdx
+    PATHS "${LIBMATHDX_ROOT}/lib"
+    NO_DEFAULT_PATH
+  )
+
+  # Set include directories (in both local and parent scope)
+  set(LIBMATHDX_INCLUDE_DIR "${LIBMATHDX_ROOT}/include")
+  set(LIBMATHDX_INCLUDE_DIR "${LIBMATHDX_INCLUDE_DIR}" PARENT_SCOPE)
+
+  if(LIBMATHDX_LIBRARY AND EXISTS ${LIBMATHDX_INCLUDE_DIR})
+    message(STATUS "Found libmathdx library: ${LIBMATHDX_LIBRARY}")
+    message(STATUS "Found libmathdx include dir: ${LIBMATHDX_INCLUDE_DIR}")
+
+    # Create libmathdx target
+    if(NOT TARGET libmathdx::libmathdx)
+      add_library(libmathdx::libmathdx INTERFACE IMPORTED)
+      set_target_properties(libmathdx::libmathdx PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${LIBMATHDX_INCLUDE_DIR}"
+        INTERFACE_LINK_LIBRARIES "${LIBMATHDX_LIBRARY}"
+      )
+    endif()
+  else()
+    message(WARNING "Could not find libmathdx library or include directory after download")
+  endif()
+endif()
+
+find_package(mathdx REQUIRED COMPONENTS cufftdx CONFIG
+  PATHS
+    "${PROJECT_BINARY_DIR}/_deps/mathdx-src/nvidia/mathdx/${MathDx_VERSION}/lib/cmake/mathdx/"
+    "${PROJECT_BINARY_DIR}/_deps/libmathdx-src/lib/cmake/libmathdx/"
+    "${PROJECT_BINARY_DIR}/_deps/libmathdx-src"
+    "/opt/nvidia/mathdx/${MathDx_VERSION_FULL}"
+)
+

cmake/versions.json

Lines changed: 2 additions & 2 deletions

@@ -1,10 +1,10 @@
 {
   "packages": {
     "CCCL": {
-      "version": "3.0.0",
+      "version": "3.2.0",
       "git_shallow": false,
       "git_url": "https://github.com/NVIDIA/cccl.git",
-      "git_tag": "e944297"
+      "git_tag": "0320434"
     },
     "nvbench" : {
       "version" : "0.0",

docs_input/api/synchronization/sync.rst

Lines changed: 2 additions & 4 deletions

@@ -3,10 +3,8 @@
 sync
 ====
 
-Wait for any code running on an executor to complete.
-
-.. doxygenfunction:: matx::cudaExecutor::sync()
-.. doxygenfunction:: matx::HostExecutor::sync()
+Wait for any code running on an executor to complete. For CUDA executors this typically synchronizes
+the stream backing the executor, while host executors wait until the calling thread completes.
 
 Examples
 ~~~~~~~~
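
A minimal sketch of the sync semantics described in the updated text above; `cudaExecutor` and `sync()` follow the doxygen references removed in this diff, while the tensor names, sizes, and expression are illustrative:

```cpp
// Queue asynchronous work on a CUDA executor, then block until it completes.
#include "matx.h"

int main() {
  using namespace matx;

  auto A = make_tensor<float>({1024});
  auto B = make_tensor<float>({1024});

  cudaExecutor exec{};        // backed by a CUDA stream (the default stream here)

  (A = B * 2.0f).run(exec);   // launches asynchronously with respect to the host
  exec.sync();                // blocks until the work queued on that stream finishes

  return 0;
}
```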

docs_input/basics/fusion.rst

Lines changed: 61 additions & 1 deletion

@@ -3,6 +3,13 @@
 Operator Fusion
 ###############
 
+MatX supports operator fusion for all element-wise operators, and CUDA JIT kernel fusion for math functions with a
+supporting MathDx function. JIT kernel fusion is considered *experimental* currently and may contain bugs that don't
+occur with JIT disabled.
+
+Element-wise Operator Fusion
+============================
+
 When writing a simple arithmetic expression like the following:
 
 .. code-block:: cpp
@@ -43,4 +50,57 @@ expressions, this opens the possibility to selectively fuse more complex express
 
 The type system can see that we have a multiply where the right-hand side is an FFT transform and the left side is another
 operator. This allows MatX to potentially fuse the output of the FFT with a multiply of B at compile-time. In general, the
-more information it can deduce during compilation and runtime, the better the performance will be.
+more information it can deduce during compilation and runtime, the better the performance will be.
+
+CUDA JIT Kernel Fusion
+======================
+
+.. note::
+
+   CUDA JIT kernel fusion is considered an experimental feature. There may be bugs that don't occur with JIT disabled, and new features are being added over time.
+
+MatX supports CUDA JIT kernel fusion that compiles the entire expression into a single kernel. Currently this is enabled
+for all standard MatX element-wise operators and FFT operations via MathDx. To enable fusion with MathDx,
+the following option must be enabled: ``-DMATX_EN_MATHDX=ON``. Once enabled, the ``CUDAJITExecutor`` can be used to perform JIT compilation
+in supported situations. If the expression cannot be JIT compiled, the JIT executor will fall back to the normal non-JIT path.
+
+While JIT compilation can provide a large performance boost, there are two overheads that occur when using JIT compilation:
+- The first pass to JIT the code takes time. The first time a ``run()`` statement is executed on a new operator, MatX identifies this and performs JIT compilation. Depending on the complexity of the operator, this could be anywhere from milliseconds to seconds to complete. Once finished, MatX will cache the compiled kernel so that subsequent runs of the same operator will not require JIT compilation.
+- A lookup is done to find kernels that have already been compiled. This is a small overhead and may not be noticeable.
+
+As mentioned above, there is no difference in syntax between MatX statements that perform JIT compilation and those that do not. The executor
+is the only change, just as it would be with a host executor. For example, in the following code:
+
+.. code-block:: cpp
+
+   (A = B * fft(C)).run(CUDAExecutor{});
+   (A = B * fft(C)).run(CUDAJITExecutor{});
+
+The first statement will execute the FFT into a temporary buffer, then the multiply will be executed. This results
+in a minimum of 2 kernels (one for MatX and at least one for cuFFT). The second statement will execute the FFT and multiply in a single kernel if
+possible.
+
+Some operators cannot be JIT compiled. For example, if the FFT above is a size not compatible with the cuFFTDx library, or if MathDx is disabled,
+the expression will not be JIT compiled. To determine if an operator can be JIT compiled, use the ``matx::jit_supported(op)`` function:
+
+.. code-block:: cpp
+
+   auto my_op = (fft(b) + c);
+   if (matx::jit_supported(my_op)) {
+     printf("FFT is supported by JIT\n");
+   } else {
+     printf("FFT is not supported by JIT\n");
+   }
+
+Even if the MathDx library supports a particular operation, other operators in the expression may prevent JIT compilation. For
+example:
+
+.. code-block:: cpp
+
+   auto my_op = (fftshift1D(fft(b)));
+
+In this case the MathDx library requires at least 2 elements per thread for the FFT, but the ``fftshift1D`` operator requires
+only 1 element per thread. Therefore, the entire expression cannot be JIT-compiled and will fall back to the non-JIT path. Some of
+these restrictions may be relaxed in newer versions of MatX or the MathDx library.
+
+
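
The compile-then-cache behavior described in the new fusion.rst text can be observed with a simple warm-up pattern. The sketch below is only illustrative: `CUDAJITExecutor` follows the name used in the documentation, the expression and sizes are placeholders, and synchronization is done through the CUDA runtime rather than any MatX-specific call:

```cpp
// Time the first (JIT-compiling) run against a second (cache-hit) run.
#include "matx.h"
#include <cuda_runtime.h>
#include <chrono>
#include <cstdio>

int main() {
  using namespace matx;

  auto A = make_tensor<float>({1 << 20});
  auto B = make_tensor<float>({1 << 20});
  auto C = make_tensor<float>({1 << 20});

  auto op = (A = B * C + 1.0f);

  // First run: MatX detects a new operator, compiles a fused kernel with
  // NVRTC, and caches the result.
  auto t0 = std::chrono::steady_clock::now();
  op.run(CUDAJITExecutor{});
  cudaDeviceSynchronize();
  auto t1 = std::chrono::steady_clock::now();

  // Second run: a cache lookup finds the already-compiled kernel, so only the
  // small lookup overhead remains on top of the kernel launch.
  op.run(CUDAJITExecutor{});
  cudaDeviceSynchronize();
  auto t2 = std::chrono::steady_clock::now();

  std::printf("first run: %.1f ms, cached run: %.1f ms\n",
              std::chrono::duration<double, std::milli>(t1 - t0).count(),
              std::chrono::duration<double, std::milli>(t2 - t1).count());
  return 0;
}
```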
