Skip to content

Commit 2033a0a

Browse files
q10 and facebook-github-bot
authored and committed
Enable CUDA 13 for GenAI builds (#4837)
Summary: X-link: facebookresearch/FBGEMM#1872 Pull Request resolved: #4837 Reviewed By: cthi Differential Revision: D81996487 Pulled By: q10 fbshipit-source-id: 24d23dc50d07601f6154aa8c1bdce2cd52a63f4e
1 parent 0fc23ca commit 2033a0a

File tree

5 files changed

+44
-27
lines changed

5 files changed

+44
-27
lines changed

.github/scripts/fbgemm_gpu_build.bash

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ __configure_fbgemm_gpu_build_nvcc () {
123123
if ! [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] && [[ "$BUILD_CUDA_VERSION" != "cu126" ]]; then
124124
# NOTE: This flag is only supported in NVCC 12.8+
125125
nvcc_prepend_flags+=(
126+
# when "-static-global-template-stub=true" in whole program compilation mode ("-rdc=false"), a __global__ function template instantiation or specialization ("split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_warp_per_row_1< ::c10::Float8_e4m3fn, ::c10::BFloat16, ::c10::Half, long, ::c10::BFloat16, ::c10::BFloat16, (int)2, (int)32, (bool)1> ") must have a definition in the current translation unit. To resolve this issue, either use separate compilation mode ("-rdc=true"), or explicitly set "-static-global-template-stub=false" (but see nvcc documentation about downsides of turning it off)
127+
-diag-suppress 20280
126128
# warn: in whole program compilation mode ("-rdc=false"), a __global__ function template instantiation or specialization will be required to have a definition in the current translation unit, when "-static-global-template-stub" will be set to "true" by default in the future. To resolve this issue, either use "-rdc=true", or explicitly set "-static-global-template-stub=false" (but see nvcc documentation about downsides of turning it off)
127129
-diag-suppress 20281
128130
)
@@ -159,9 +161,12 @@ __configure_fbgemm_gpu_build_nvcc () {
159161
__configure_fbgemm_gpu_cuda_home () {
160162
# NOTE: This only matches for non-Nova builds, as CUDA versions in Nova builds
161163
# are formatted as `cu12x`
162-
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
163-
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]] ||
164-
[[ "$BUILD_CUDA_VERSION" =~ ^12.9.*$ ]]; then
164+
if [[ "$BUILD_CUDA_VERSION" =~ ^11.*$ ]] ||
165+
[[ "$BUILD_CUDA_VERSION" =~ ^12.1.*$ ]] ||
166+
[[ "$BUILD_CUDA_VERSION" =~ ^12.4.*$ ]]; then
167+
echo "[BUILD] No need to set CUDA_TOOLKIT_ROOT_DIR and CUDAToolkit_ROOT on older CUDA installations ..."
168+
169+
else
165170
# shellcheck disable=SC2155,SC2086
166171
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
167172
local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"
@@ -173,6 +178,7 @@ __configure_fbgemm_gpu_cuda_home () {
173178
# uses CUDAToolkit_ROOT.
174179
#
175180
# https://github.com/conda-forge/cuda-feedstock/issues/59
181+
# https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake#L40
176182
-DCUDA_TOOLKIT_ROOT_DIR="${new_cuda_home}"
177183
-DCUDAToolkit_ROOT="${new_cuda_home}"
178184
)
@@ -286,7 +292,8 @@ __configure_fbgemm_gpu_build_cuda () {
286292
# appending 7.0/7.5 to the back of the list mysteriously results in
287293
# undefined symbol errors on .SO loads
288294
if [[ $fbgemm_build_target == "hstu" ]]; then
289-
if [[ $cuda_version_nvcc == *"V12"* ]]; then
295+
if [[ $cuda_version_nvcc == *"V13"* ]] ||
296+
[[ $cuda_version_nvcc == *"V12"* ]]; then
290297
# NOTE: Compiling 9.0a code will fail if sm_80 output is also
291298
# enabled, bc the code relies on the following function that is not
292299
# supported in sm_80:
@@ -297,7 +304,8 @@ __configure_fbgemm_gpu_build_cuda () {
297304
local arch_list="7.5;8.0"
298305
fi
299306

300-
elif [[ $cuda_version_nvcc == *"V12.9"* ]] ||
307+
elif [[ $cuda_version_nvcc == *"V13.0"* ]] ||
308+
[[ $cuda_version_nvcc == *"V12.9"* ]] ||
301309
[[ $cuda_version_nvcc == *"V12.8"* ]]; then
302310
local arch_list="7.5;8.0;9.0a;10.0a;12.0a"
303311

.github/scripts/generate_ci_matrix.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,8 @@ def cuda_versions(self) -> List[str]:
303303
if self.target == TARGET_HSTU:
304304
# FBGEMM HSTU is expensive, so conserve CI resources
305305
return ["12.9.1"]
306+
elif self.target == TARGET_GENAI:
307+
return ["12.6.3", "12.8.1", "12.9.1", "13.0.0"]
306308
else:
307309
# GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
308310
return ["12.6.3", "12.8.1", "12.9.1"]

.github/scripts/nova_dir.bash

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,12 @@ fi
2222
## Overwrite existing ENV VAR in Nova
2323
if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi
2424

25-
if [[ "$CU_VERSION" == "cu129" ]] ||
26-
[[ "$CU_VERSION" == "cu128" ]]; then
25+
if [[ "$CU_VERSION" == "cu130" ]]; then
26+
export TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0a;10.0a;12.0a"
27+
echo "[NOVA] Set TORCH_CUDA_ARCH_LIST to: ${TORCH_CUDA_ARCH_LIST}"
28+
29+
elif [[ "$CU_VERSION" == "cu129" ]] ||
30+
[[ "$CU_VERSION" == "cu128" ]]; then
2731
export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0a;10.0a;12.0a"
2832
echo "[NOVA] Set TORCH_CUDA_ARCH_LIST to: ${TORCH_CUDA_ARCH_LIST}"
2933

.github/scripts/utils_cuda.bash

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@ __set_cuda_symlinks_envvars () {
1818
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
1919
local new_cuda_home="${conda_prefix}/targets/${MACHINE_NAME_LC}-linux"
2020

21-
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
22-
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]] ||
23-
[[ "$BUILD_CUDA_VERSION" =~ ^12.9.*$ ]]; then
21+
if [[ "$BUILD_CUDA_VERSION" =~ ^11.*$ ]] ||
22+
[[ "$BUILD_CUDA_VERSION" =~ ^12.1.*$ ]] ||
23+
[[ "$BUILD_CUDA_VERSION" =~ ^12.4.*$ ]]; then
24+
echo "[INSTALL] Target CUDA version is ${BUILD_CUDA_VERSION}, no need to add extra symlinks and env vars ..."
25+
26+
else
2427
# CUDA 12.6 installation has a very different package layout than previous
2528
# CUDA versions - notably, NVTX has been moved elsewhere, which causes
2629
# PyTorch CMake scripts to complain.
@@ -91,9 +94,11 @@ __set_nvcc_prepend_flags () {
9194
# which overrides whatever `-ccbin` flag we set manually, so remove this
9295
# unwanted hook
9396
print_exec ls -la "${conda_prefix}/etc/conda/activate.d"
94-
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]] ||
95-
[[ "$BUILD_CUDA_VERSION" =~ ^12.8.*$ ]] ||
96-
[[ "$BUILD_CUDA_VERSION" =~ ^12.9.*$ ]]; then
97+
if [[ "$BUILD_CUDA_VERSION" =~ ^11.*$ ]] ||
98+
[[ "$BUILD_CUDA_VERSION" =~ ^12.1.*$ ]] ||
99+
[[ "$BUILD_CUDA_VERSION" =~ ^12.4.*$ ]]; then
100+
echo "[INSTALL] No need to update NVCC activation scripts on older CUDA installations ..."
101+
else
97102
echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..."
98103
print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh"
99104
fi
@@ -186,26 +191,25 @@ install_cuda () {
186191
local env_prefix=$(env_name_or_prefix "${env_name}")
187192
echo "[INSTALL] Installing CUDA ${cuda_version} ..."
188193

189-
# NOTE: Currently, CUDA 12.6 cannot be installed using the nvidia/label/cuda-*
190-
# conda channels, because we run into the following error:
194+
# NOTE: Currently, CUDA 12.6 and later cannot be installed using the
195+
# nvidia/label/cuda-* conda channels, because we run into the following error:
191196
#
192197
# LibMambaUnsatisfiableError: Encountered problems while solving:
193198
# - nothing provides __win needed by cuda-12.6.3-0
194199
#
195-
# For now, we only use conda-forge for installing 12.6, but it is likely that
196-
# in the future, we will be using conda-forge for installing all CUDA versions
197-
# (except for versions 11.8 and below, which are only available through
200+
# As such, we use conda-forge for installing all CUDA versions, except for
201+
# versions 12.4 and below, which are only available through
198202
# nvidia/label/cuda-*)
199-
if [[ "$cuda_version" =~ ^12.6.*$ ]] ||
200-
[[ "$cuda_version" =~ ^12.8.*$ ]] ||
201-
[[ "$cuda_version" =~ ^12.9.*$ ]]; then
202-
# shellcheck disable=SC2086
203-
(exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
204-
cuda=${cuda_version}) || return 1
205-
else
203+
if [[ "$BUILD_CUDA_VERSION" =~ ^11.*$ ]] ||
204+
[[ "$BUILD_CUDA_VERSION" =~ ^12.1.*$ ]] ||
205+
[[ "$BUILD_CUDA_VERSION" =~ ^12.4.*$ ]]; then
206206
# shellcheck disable=SC2086
207207
(exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c "nvidia/label/cuda-${cuda_version}" -y \
208208
cuda) || return 1
209+
else
210+
# shellcheck disable=SC2086
211+
(exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
212+
cuda=${cuda_version}) || return 1
209213
fi
210214

211215
# Set the symlinks and environment variables not covered by conda install

.github/workflows/build_wheels_genai_linux_x86.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,12 @@ jobs:
5151
MAT: ${{ needs.generate-matrix.outputs.matrix }}
5252
# Nova Coordinate Filters:
5353
# cuda/11.8: No longer supported in FBGEMM
54-
# cuda/13.0: Not supported in FBGEMM CI yet
5554
# rocm/3.13t: causes segfaults at runtime
5655
run: |
5756
set -ex
5857
pwd
5958
ls
60-
MATRIX_BLOB="$( python .github/scripts/filter_nova_matrix.py --filter gpu_arch_version:11.8 --filter gpu_arch_version:13.0 --filter 'gpu_arch_type:rocm;python_version:3.13t' )"
59+
MATRIX_BLOB="$( python .github/scripts/filter_nova_matrix.py --filter gpu_arch_version:11.8 --filter 'gpu_arch_type:rocm;python_version:3.13t' )"
6160
echo "${MATRIX_BLOB}"
6261
echo "matrix=${MATRIX_BLOB}" >> "${GITHUB_OUTPUT}"
6362

0 commit comments

Comments
 (0)