diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh
index 60d0faa8..7894fea8 100755
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@@ -4,210 +4,126 @@
 
 set -ex
 
-NCCL_VERSION=v2.25.1-1
-CUDNN_VERSION=9.5.1.17
-
-# Include Hopper support if using CUDA11.8 or above
-NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90"
-
-function install_cusparselt_040 {
-    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
-    mkdir tmp_cusparselt && pushd tmp_cusparselt
-    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
-    tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
-    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/
-    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/
-    popd
-    rm -rf tmp_cusparselt
-}
-
-function install_cusparselt_062 {
-    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
-    mkdir tmp_cusparselt && pushd tmp_cusparselt
-    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
-    tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
-    cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/
-    cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
-    popd
-    rm -rf tmp_cusparselt
-}
-
-function install_cusparselt_063 {
-    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
-    mkdir tmp_cusparselt && pushd tmp_cusparselt
-    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
-    tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
-    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/
-    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
-    popd
-    rm -rf tmp_cusparselt
-}
-
-function install_118 {
-    CUDNN_VERSION=9.1.0.70
-    NCCL_VERSION=v2.21.5-1
-    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
-    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
-    # install CUDA 11.8.0 in the same container
-    wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
-    chmod +x cuda_11.8.0_520.61.05_linux.run
-    ./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
-    rm -f cuda_11.8.0_520.61.05_linux.run
-    rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda
-
-    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-    mkdir tmp_cudnn && cd tmp_cudnn
-    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
-    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
-    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
-    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
-    cd ..
-    rm -rf tmp_cudnn
-
-    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
-    cd nccl && make -j src.build NVCC_GENCODE="${NVCC_GENCODE}"
-    cp -a build/include/* /usr/local/cuda/include/
-    cp -a build/lib/* /usr/local/cuda/lib64/
-    cd ..
-    rm -rf nccl
-
-    install_cusparselt_040
-
-    ldconfig
+arch_path=''  # architecture component of the NVIDIA download paths: 'x86_64' or 'sbsa'
+targetarch=${TARGETARCH:-$(uname -m)}  # TARGETARCH when set and non-empty, otherwise `uname -m`
+if [ "${targetarch}" = 'amd64' ] || [ "${targetarch}" = 'x86_64' ]; then
+  arch_path='x86_64'
+else
+  arch_path='sbsa'  # every other value is treated as sbsa
+fi
+
+function install_cuda {  # $1: full CUDA version (e.g. 12.6.3); $2: runfile name without arch suffix or ".run"
+  local version=$1
+  local runfile=$2
+  local major_minor=${version%.*}  # strip the last dotted component: 12.6.3 -> 12.6
+  rm -rf "/usr/local/cuda-${major_minor}" /usr/local/cuda
+  if [[ ${arch_path} == 'sbsa' ]]; then
+      runfile="${runfile}_sbsa"
+  fi
+  runfile="${runfile}.run"
+  wget -q "https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile}" -O "${runfile}"
+  chmod +x "${runfile}"
+  "./${runfile}" --toolkit --silent
+  rm -f "${runfile}"
+  rm -f /usr/local/cuda && ln -s "/usr/local/cuda-${major_minor}" /usr/local/cuda
 }
 
-function install_124 {
-  CUDNN_VERSION=9.1.0.70
-  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
-  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
-  # install CUDA 12.4.1 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
-  chmod +x cuda_12.4.1_550.54.15_linux.run
-  ./cuda_12.4.1_550.54.15_linux.run --toolkit --silent
-  rm -f cuda_12.4.1_550.54.15_linux.run
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
-
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+function install_cudnn {  # $1: CUDA major version (e.g. 12); $2: cuDNN version (e.g. 9.10.2.21)
+  local cuda_major_version=$1
+  local cudnn_version=$2
   mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  local filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive"  # archive base name, also the extracted directory
+  wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz"
+  tar xf "${filepath}.tar.xz"
+  cp -a "${filepath}"/include/* /usr/local/cuda/include/
+  cp -a "${filepath}"/lib/* /usr/local/cuda/lib64/
   cd ..
   rm -rf tmp_cudnn
+}
 
+function install_nccl {  # $1: NCCL git tag (e.g. v2.27.3-1); $2..: NVCC -gencode flags
+  local nccl_version=$1
+  local nvcc_gencode="${*:2}"  # rejoin the flags whether passed as one quoted string or split into several words
   # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
   # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
-  cd nccl && make -j src.build NVCC_GENCODE="${NVCC_GENCODE}"
+  git clone -b "${nccl_version}" --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl
+  make -j src.build NVCC_GENCODE="${nvcc_gencode}"
   cp -a build/include/* /usr/local/cuda/include/
   cp -a build/lib/* /usr/local/cuda/lib64/
   cd ..
   rm -rf nccl
+}
 
-  install_cusparselt_062
+function install_cusparselt {  # $1: cuSPARSELt version (e.g. 0.7.1.0)
+  local cusparselt_version=$1
+  # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+  mkdir tmp_cusparselt && cd tmp_cusparselt
+  local cusparselt_name="libcusparse_lt-linux-${arch_path}-${cusparselt_version}-archive"
+  curl --retry 3 -fsSLO "https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${cusparselt_name}.tar.xz"  # -f: fail on HTTP errors instead of saving the error page; -S: still print the error
 
-  ldconfig
+  tar xf "${cusparselt_name}.tar.xz"
+  cp -a "${cusparselt_name}"/include/* /usr/local/cuda/include/
+  cp -a "${cusparselt_name}"/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cusparselt
 }
 
 function install_126 {
-  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
-  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
-  # install CUDA 12.6.3 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
-  chmod +x cuda_12.6.3_560.35.05_linux.run
-  ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent
-  rm -f cuda_12.6.3_560.35.05_linux.run
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
+  CUDNN_VERSION=9.10.2.21  # component versions installed alongside CUDA 12.6.3
+  NCCL_VERSION=v2.27.3-1
+  NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90"
+  CUSPARSELT_VERSION=0.7.1.0
 
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf tmp_cudnn
+  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt ${CUSPARSELT_VERSION}"
+  install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux
 
-  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
-  cd nccl && make -j src.build NVCC_GENCODE="${NVCC_GENCODE}"
-  cp -a build/include/* /usr/local/cuda/include/
-  cp -a build/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf nccl
+  install_cudnn 12 "$CUDNN_VERSION"
+
+  install_nccl "$NCCL_VERSION" "$NVCC_GENCODE"  # quoted: the whole -gencode list must arrive as one argument
 
-  install_cusparselt_063
+  install_cusparselt "$CUSPARSELT_VERSION"
 
   ldconfig
 }
 
-function prune_118 {
-    echo "Pruning CUDA 11.8 and cuDNN"
-    #####################################################################################
-    # CUDA 11.8 prune static libs
-    #####################################################################################
-    export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune"
-    export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64"
+function install_128 {  # installs CUDA 12.8.1 plus cuDNN, NCCL and cuSPARSELt, then runs ldconfig
+  CUDNN_VERSION=9.8.0.87
+  NCCL_VERSION=v2.27.3-1
+  NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_120,code=sm_120 -gencode=arch=compute_120,code=compute_120"
+  CUSPARSELT_VERSION=0.7.1.0
 
-    export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-    export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt ${CUSPARSELT_VERSION}"
+  # install CUDA 12.8.1 in the same container
+  install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
 
-    if [[ -n "$OVERRIDE_GENCODE" ]]; then
-        export GENCODE=$OVERRIDE_GENCODE
-    fi
+  install_cudnn 12 "$CUDNN_VERSION"
 
-    # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included)
-    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
-      | xargs -I {} bash -c \
-                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+  install_nccl "$NCCL_VERSION" "$NVCC_GENCODE"  # quoted: the whole -gencode list must arrive as one argument
 
-    # prune CuDNN and CuBLAS
-    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
-    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+  install_cusparselt "$CUSPARSELT_VERSION"
 
-    #####################################################################################
-    # CUDA 11.8 prune visual tools
-    #####################################################################################
-    export CUDA_BASE="/usr/local/cuda-11.8/"
-    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/
+  ldconfig
 }
 
-function prune_124 {
-  echo "Pruning CUDA 12.4"
-  #####################################################################################
-  # CUDA 12.4 prune static libs
-  #####################################################################################
-  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
-  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+function install_129 {  # installs CUDA 12.9.1 plus cuDNN, NCCL and cuSPARSELt, then runs ldconfig
+  CUDNN_VERSION=9.10.2.21
+  NCCL_VERSION=v2.27.3-1
+  NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_120,code=sm_120 -gencode=arch=compute_120,code=compute_120"
+  CUSPARSELT_VERSION=0.7.1.0
 
-  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt ${CUSPARSELT_VERSION}"
+  # install CUDA 12.9.1 in the same container
+  install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux
 
-  if [[ -n "$OVERRIDE_GENCODE" ]]; then
-      export GENCODE=$OVERRIDE_GENCODE
-  fi
-  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
-      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
-  fi
+  install_cudnn 12 "$CUDNN_VERSION"
 
-  # all CUDA libs except CuDNN and CuBLAS
-  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
-      | xargs -I {} bash -c \
-                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+  install_nccl "$NCCL_VERSION" "$NVCC_GENCODE"  # quoted: the whole -gencode list must arrive as one argument
 
-  # prune CuDNN and CuBLAS
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+  install_cusparselt "$CUSPARSELT_VERSION"
 
-  #####################################################################################
-  # CUDA 12.4 prune visual tools
-  #####################################################################################
-  export CUDA_BASE="/usr/local/cuda-12.4/"
-  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+  ldconfig
 }
 
 function prune_126 {
@@ -244,56 +160,18 @@ function prune_126 {
   rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
 }
 
-function install_128 {
-  CUDNN_VERSION=9.7.1.26
-  # Include Blackwell support if using CUDA12.8 or above
-  NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100 -gencode=arch=compute_120,code=sm_120 -gencode=arch=compute_120,code=compute_120"
-  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
-  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
-  # install CUDA 12.8.0 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
-  chmod +x cuda_12.8.0_570.86.10_linux.run
-  ./cuda_12.8.0_570.86.10_linux.run --toolkit --silent
-  rm -f cuda_12.8.0_570.86.10_linux.run
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda
-
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf tmp_cudnn
-
-  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
-  cd nccl && make -j src.build NVCC_GENCODE="${NVCC_GENCODE}"
-  cp -a build/include/* /usr/local/cuda/include/
-  cp -a build/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf nccl
-
-  install_cusparselt_063
-
-  ldconfig
-}
-
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
     case "$1" in
-    11.8) install_118; prune_118
-        ;;
-    12.4) install_124; prune_124
+    12.6|12.6.*) install_126; prune_126  # accepts "12.6" or any "12.6.x"
         ;;
-    12.6) install_126; prune_126
+    12.8|12.8.*) install_128;  # install only; no prune step is run for 12.8
         ;;
-    12.8) install_128;
+    12.9|12.9.*) install_129;  # install only; no prune step is run for 12.9
         ;;
     *) echo "bad argument $1"; exit 1
         ;;
     esac
     shift
 done
diff --git a/.github/workflows/publish_devel_image.yml b/.github/workflows/publish_devel_image.yml
index c7429fb8..df16c858 100644
--- a/.github/workflows/publish_devel_image.yml
+++ b/.github/workflows/publish_devel_image.yml
@@ -21,7 +21,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        cuda: ["12.6", "12.8"]
+        cuda: ["12.6", "12.8", "12.9"]
         gcc: ["12"]
     runs-on: [self-hosted, linux]
     steps:
diff --git a/nccl b/nccl
new file mode 160000
index 00000000..72d24320
--- /dev/null
+++ b/nccl
@@ -0,0 +1 @@
+Subproject commit 72d2432094d6ae36abd6e511c3a16a2d052dbf94