Skip to content

Commit 781c5f5

Browse files
jerrymannil, jithunnair-amd, jeffdaily
committed
[ROCm] cpp_extension: allow users to override default flags (pytorch#152432) (#2374)
Cherry-pick of pytorch@e4adf5d.

We need -fgpu-rdc for projects such as DeepEP + rocSHMEM. The default of -fno-gpu-rdc doesn't work for such cases.

As per pytorch#152432 (comment):

"rocSHMEM shares the same global variable in different files. As DeepEP uses CUDAExtension to build the project (https://github.com/deepseek-ai/DeepEP/blob/65e2a700f0330f3fb1c26f49a0250d1f9d0ac1e3/setup.py#L51) and depends on rocSHMEM, -fgpu-rdc is needed. The current logic in PyTorch prevents users from overriding this flag."

Pull Request resolved: pytorch#152432
Approved by: https://github.com/jeffdaily
Co-authored-by: Jithun Nair <[email protected]>
Co-authored-by: Jeff Daily <[email protected]>
1 parent 76481f7 commit 781c5f5

File tree

1 file changed

+11
-4
lines changed

1 file changed

+11
-4
lines changed

torch/utils/cpp_extension.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2110,11 +2110,18 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
21102110

21112111
def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
21122112
# If cflags is given, there may already be user-provided arch flags in it
2113-
# (from `extra_compile_args`)
2113+
# (from `extra_compile_args`). If user also specified -fgpu-rdc or -fno-gpu-rdc, we
2114+
# assume they know what they're doing. Otherwise, we force -fno-gpu-rdc default.
2115+
has_gpu_rdc_flag = False
21142116
if cflags is not None:
2117+
has_custom_flags = False
21152118
for flag in cflags:
21162119
if 'amdgpu-target' in flag or 'offload-arch' in flag:
2117-
return ['-fno-gpu-rdc']
2120+
has_custom_flags = True
2121+
elif 'gpu-rdc' in flag:
2122+
has_gpu_rdc_flag = True
2123+
if has_custom_flags:
2124+
return [] if has_gpu_rdc_flag else ['-fno-gpu-rdc']
21182125
# Use same defaults as used for building PyTorch
21192126
# Allow env var to override, just like during initial cmake build.
21202127
_archs = os.environ.get('PYTORCH_ROCM_ARCH', None)
@@ -2127,7 +2134,7 @@ def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
21272134
else:
21282135
archs = _archs.replace(' ', ';').split(';')
21292136
flags = [f'--offload-arch={arch}' for arch in archs]
2130-
flags += ['-fno-gpu-rdc']
2137+
flags += [] if has_gpu_rdc_flag else ['-fno-gpu-rdc']
21312138
return flags
21322139

21332140
def _get_build_directory(name: str, verbose: bool) -> str:
@@ -2312,8 +2319,8 @@ def _write_ninja_file_to_build_library(path,
23122319

23132320
if with_cuda and IS_HIP_EXTENSION:
23142321
cuda_flags = ['-DWITH_HIP'] + cflags + COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS
2315-
cuda_flags += extra_cuda_cflags
23162322
cuda_flags += _get_rocm_arch_flags(cuda_flags)
2323+
cuda_flags += extra_cuda_cflags
23172324
elif with_cuda:
23182325
cuda_flags = common_cflags + COMMON_NVCC_FLAGS + _get_cuda_arch_flags()
23192326
if IS_WINDOWS:

0 commit comments

Comments
 (0)