Enable CUDA 13 for GenAI builds (#4837)

q10 · facebook-github-bot · commit 2033a0a08fbc · 2025-09-09T11:28:21.000-07:00
Summary: X-link: facebookresearch/FBGEMM#1872 Pull Request resolved: #4837 Reviewed By: cthi Differential Revision: D81996487 Pulled By: q10 fbshipit-source-id: 24d23dc50d07601f6154aa8c1bdce2cd52a63f4e
diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
@@ -123,6 +123,8 @@ __configure_fbgemm_gpu_build_nvcc () {
   if ! [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] && [[ "$BUILD_CUDA_VERSION" != "cu126" ]]; then
     # NOTE: This flag is only supported in NVCC 12.8+
     nvcc_prepend_flags+=(
+      # when "-static-global-template-stub=true" in whole program compilation mode ("-rdc=false"), a __global__ function template instantiation or specialization ("split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_warp_per_row_1< ::c10::Float8_e4m3fn,  ::c10::BFloat16,  ::c10::Half, long,  ::c10::BFloat16,  ::c10::BFloat16, (int)2, (int)32, (bool)1> ") must have a definition in the current translation unit. To resolve this issue, either use separate compilation mode ("-rdc=true"), or explicitly set "-static-global-template-stub=false" (but see nvcc documentation about downsides of turning it off)
+      -diag-suppress 20280
       # warn: in whole program compilation mode ("-rdc=false"), a __global__ function template instantiation or specialization will be required to have a definition in the current translation unit, when "-static-global-template-stub" will be set to "true" by default in the future. To resolve this issue, either use "-rdc=true", or explicitly set "-static-global-template-stub=false" (but see nvcc documentation about downsides of turning it off)
       -diag-suppress 20281
     )
@@ -159,9 +161,12 @@ __configure_fbgemm_gpu_build_nvcc () {
 __configure_fbgemm_gpu_cuda_home () {
   # NOTE: This only matches for non-Nova builds, as CUDA versions in Nova builds
   # are formatted as `cu12x`
-  if  [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
-      [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]] ||
-      [[ "$BUILD_CUDA_VERSION" =~ ^12.9.*$ ]]; then
+  if  [[ "$BUILD_CUDA_VERSION" =~ ^11.*$ ]] ||
+      [[ "$BUILD_CUDA_VERSION" =~ ^12.1.*$ ]] ||
+      [[ "$BUILD_CUDA_VERSION" =~ ^12.4.*$ ]]; then
+    echo "[BUILD] No need to set CUDA_TOOLKIT_ROOT_DIR and CUDAToolkit_ROOT on older CUDA installations ..."
+
+  else
     # shellcheck disable=SC2155,SC2086
     local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
     local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"
@@ -173,6 +178,7 @@ __configure_fbgemm_gpu_cuda_home () {
       # uses CUDAToolkit_ROOT.
       #
       # https://github.com/conda-forge/cuda-feedstock/issues/59
+      # https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake#L40
       -DCUDA_TOOLKIT_ROOT_DIR="${new_cuda_home}"
       -DCUDAToolkit_ROOT="${new_cuda_home}"
     )
@@ -286,7 +292,8 @@ __configure_fbgemm_gpu_build_cuda () {
     # appending 7.0/7.5 to the back of the list mysteriously results in
     # undefined symbol errors on .SO loads
     if [[ $fbgemm_build_target == "hstu" ]]; then
-      if  [[ $cuda_version_nvcc == *"V12"* ]]; then
+      if  [[ $cuda_version_nvcc == *"V13"* ]] ||
+          [[ $cuda_version_nvcc == *"V12"* ]]; then
         # NOTE: Compiling 9.0a code will fail if sm_80 output is also
         # enabled, bc the code relies on the following function that is not
         # supported in sm_80:
@@ -297,7 +304,8 @@ __configure_fbgemm_gpu_build_cuda () {
         local arch_list="7.5;8.0"
       fi
 
-    elif  [[ $cuda_version_nvcc == *"V12.9"* ]] ||
+    elif  [[ $cuda_version_nvcc == *"V13.0"* ]] ||
+          [[ $cuda_version_nvcc == *"V12.9"* ]] ||
           [[ $cuda_version_nvcc == *"V12.8"* ]]; then
       local arch_list="7.5;8.0;9.0a;10.0a;12.0a"
 
diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
@@ -303,6 +303,8 @@ def cuda_versions(self) -> List[str]:
         if self.target == TARGET_HSTU:
             # FBGEMM HSTU is expensive, so conserve CI resources
             return ["12.9.1"]
+        elif self.target == TARGET_GENAI:
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.0"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
             return ["12.6.3", "12.8.1", "12.9.1"]
diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash
@@ -22,8 +22,12 @@ fi
 ## Overwrite existing ENV VAR in Nova
 if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi
 
-if  [[ "$CU_VERSION" == "cu129" ]] ||
-    [[ "$CU_VERSION" == "cu128" ]]; then
+if  [[ "$CU_VERSION" == "cu130" ]]; then
+    export TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0a;10.0a;12.0a"
+    echo "[NOVA] Set TORCH_CUDA_ARCH_LIST to: ${TORCH_CUDA_ARCH_LIST}"
+
+elif [[ "$CU_VERSION" == "cu129" ]] ||
+     [[ "$CU_VERSION" == "cu128" ]]; then
     export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0a;10.0a;12.0a"
     echo "[NOVA] Set TORCH_CUDA_ARCH_LIST to: ${TORCH_CUDA_ARCH_LIST}"
 
diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
@@ -18,9 +18,12 @@ __set_cuda_symlinks_envvars () {
   local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
   local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"
 
-  if  [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
-      [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]] ||
-      [[ "$BUILD_CUDA_VERSION" =~ ^12.9.*$ ]]; then
+  if  [[ "$BUILD_CUDA_VERSION" =~ ^11.*$ ]] ||
+      [[ "$BUILD_CUDA_VERSION" =~ ^12.1.*$ ]] ||
+      [[ "$BUILD_CUDA_VERSION" =~ ^12.4.*$ ]]; then
+    echo "[INSTALL] Target CUDA version is ${BUILD_CUDA_VERSION}, no need to add extra symlinks and env vars ..."
+
+  else
     # CUDA 12.6 installation has a very different package layout than previous
     # CUDA versions - notably, NVTX has been moved elsewhere, which causes
     # PyTorch CMake scripts to complain.
@@ -91,9 +94,11 @@ __set_nvcc_prepend_flags () {
   # which overrides whatever `-ccbin` flag we set manually, so remove this
   # unwanted hook
   print_exec ls -la "${conda_prefix}/etc/conda/activate.d"
-  if  [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
-      [[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]] ||
-      [[ "$BUILD_CUDA_VERSION" =~ ^12.9.*$ ]]; then
+  if  [[ "$BUILD_CUDA_VERSION" =~ ^11.*$ ]] ||
+      [[ "$BUILD_CUDA_VERSION" =~ ^12.1.*$ ]] ||
+      [[ "$BUILD_CUDA_VERSION" =~ ^12.4.*$ ]]; then
+    echo "[INSTALL] No need to update NVCC activation scripts on older CUDA installations ..."
+  else
     echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..."
     print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh"
   fi
@@ -186,26 +191,25 @@ install_cuda () {
   local env_prefix=$(env_name_or_prefix "${env_name}")
   echo "[INSTALL] Installing CUDA ${cuda_version} ..."
 
-  # NOTE: Currently, CUDA 12.6 cannot be installed using the nvidia/label/cuda-*
-  # conda channels, because we run into the following error:
+  # NOTE: Currently, CUDA 12.6 and later cannot be installed using the
+  # nvidia/label/cuda-* conda channels, because we run into the following error:
   #
   #   LibMambaUnsatisfiableError: Encountered problems while solving:
   #     - nothing provides __win needed by cuda-12.6.3-0
   #
-  # For now, we only use conda-forge for installing 12.6, but it is likely that
-  # in the future, we will be using conda-forge for installing all CUDA versions
-  # (except for versions 11.8 and below, which are only available through
+  # As such, we use conda-forge for installing all CUDA versions, except for
+  # versions 12.4 and below, which are only available through
   # nvidia/label/cuda-*)
-  if  [[ "$cuda_version" =~ ^12.6.*$ ]] ||
-      [[ "$cuda_version" =~ ^12.8.*$ ]] ||
-      [[ "$cuda_version" =~ ^12.9.*$ ]]; then
-    # shellcheck disable=SC2086
-    (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
-      cuda=${cuda_version}) || return 1
-  else
+  if  [[ "$BUILD_CUDA_VERSION" =~ ^11.*$ ]] ||
+      [[ "$BUILD_CUDA_VERSION" =~ ^12.1.*$ ]] ||
+      [[ "$BUILD_CUDA_VERSION" =~ ^12.4.*$ ]]; then
     # shellcheck disable=SC2086
     (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c "nvidia/label/cuda-${cuda_version}" -y \
       cuda) || return 1
+  else
+    # shellcheck disable=SC2086
+    (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
+      cuda=${cuda_version}) || return 1
   fi
 
   # Set the symlinks and environment variables not covered by conda install
diff --git a/.github/workflows/build_wheels_genai_linux_x86.yml b/.github/workflows/build_wheels_genai_linux_x86.yml
@@ -51,13 +51,12 @@ jobs:
         MAT: ${{ needs.generate-matrix.outputs.matrix }}
       # Nova Coordinate Filters:
       # cuda/11.8: No longer supported in FBGEMM
-      # cuda/13.0: Not supported in FBGEMM CI yet
       # rocm/3.13t: causes segfaults at runtime
       run: |
         set -ex
         pwd
         ls
-        MATRIX_BLOB="$( python .github/scripts/filter_nova_matrix.py --filter gpu_arch_version:11.8 --filter gpu_arch_version:13.0 --filter 'gpu_arch_type:rocm;python_version:3.13t' )"
+        MATRIX_BLOB="$( python .github/scripts/filter_nova_matrix.py --filter gpu_arch_version:11.8 --filter 'gpu_arch_type:rocm;python_version:3.13t' )"
         echo "${MATRIX_BLOB}"
         echo "matrix=${MATRIX_BLOB}" >> "${GITHUB_OUTPUT}"