From fdc2ad605fa03c760cc92fa9407d6b3e8402be50 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 9 Oct 2024 15:26:43 +0100 Subject: [PATCH 1/5] [SYCL][NVPTX] Enable -fcuda-short-ptr by default This makes pointers to CUDA shared, const, and local address spaces as being 32-bit pointers. This should bring decent performance improvements in certain programs. --- clang/lib/Driver/ToolChains/Clang.cpp | 5 +++-- clang/lib/Driver/ToolChains/Cuda.cpp | 9 +++++---- libclc/CMakeLists.txt | 7 ++++++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a7f8a61402533..9ce96cfb53223 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8537,9 +8537,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } - if (IsCuda) { + if (IsCuda || (IsSYCLDevice && Triple.isNVPTX())) { + bool UseShortPtr = IsSYCLDevice && Triple.isNVPTX(); if (Args.hasFlag(options::OPT_fcuda_short_ptr, - options::OPT_fno_cuda_short_ptr, false)) + options::OPT_fno_cuda_short_ptr, UseShortPtr)) CmdArgs.push_back("-fcuda-short-ptr"); } diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index a2b6db69b9284..11faf26261f6a 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -1010,6 +1010,11 @@ void CudaToolChain::addClangTargetOptions( CC1Args.push_back(DriverArgs.MakeArgString(LibSpirvFile)); } + bool UseShortPtr = DeviceOffloadingKind == Action::OFK_SYCL; + if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, + options::OPT_fno_cuda_short_ptr, UseShortPtr)) + CC1Args.append({"-mllvm", "--nvptx-short-ptr"}); + if (DriverArgs.hasArg(options::OPT_nogpulib)) return; @@ -1035,10 +1040,6 @@ void CudaToolChain::addClangTargetOptions( clang::CudaVersion CudaInstallationVersion = CudaInstallation.version(); - if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, - options::OPT_fno_cuda_short_ptr, false)) - CC1Args.append({"-mllvm", "--nvptx-short-ptr"}); - if (CudaInstallationVersion >= CudaVersion::UNKNOWN) CC1Args.push_back( DriverArgs.MakeArgString(Twine("-target-sdk-version=") + diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 366fffd454d58..92072c226f6b6 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -450,10 +450,15 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) list(APPEND flags -D__unix__) endif() + set(spirv_flags ${flags}) + if( ARCH STREQUAL nvptx OR ARCH STREQUAL nvptx64 ) + list(APPEND spirv_flags -Xclang -fcuda-short-ptr -mllvm -nvptx-short-ptr) + endif() + add_libclc_builtin_set(libspirv-${arch_suffix} TRIPLE ${clang_triple} TARGET_ENV libspirv - COMPILE_OPT ${flags} + COMPILE_OPT ${spirv_flags} OPT_FLAGS ${opt_flags} FILES ${libspirv_files} GEN_FILES ${libspirv_gen_files} From 9ceb7d3e7d60b034d85ffb09e2b364b9983d2461 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 3 Dec 2024 12:26:05 +0000 Subject: [PATCH 2/5] reduce scope of PR --- clang/lib/Driver/ToolChains/Clang.cpp | 3 +-- libclc/CMakeLists.txt | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 9501955d31879..c1da2d73a1cab 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8544,9 +8544,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } if (IsCuda || (IsSYCLDevice && Triple.isNVPTX())) { - bool UseShortPtr = IsSYCLDevice && Triple.isNVPTX(); if (Args.hasFlag(options::OPT_fcuda_short_ptr, - options::OPT_fno_cuda_short_ptr, UseShortPtr)) + options::OPT_fno_cuda_short_ptr, false)) CmdArgs.push_back("-fcuda-short-ptr"); } diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index d63f40548d40c..bd9e14dfc0b8a 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -506,10 +506,27 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) list(APPEND build_flags -D__unix__) endif() - add_libclc_builtin_set(libspirv-${arch_suffix} + string( TOUPPER "CLC_${MACRO_ARCH}" CLC_TARGET_DEFINE ) + + list( APPEND build_flags + -D__CLC_INTERNAL + -D${CLC_TARGET_DEFINE} + # All libclc builtin libraries see CLC headers + -I${CMAKE_CURRENT_SOURCE_DIR}/clc/include + # FIXME: Fix libclc to not require disabling this noisy warning + -Wno-bitwise-conditional-parentheses + ) + + if( NOT "${cpu}" STREQUAL "" ) + list( APPEND build_flags -mcpu=${cpu} ) + endif() + + add_libclc_builtin_set( + CLC_INTERNAL + ARCH ${ARCH} + ARCH_SUFFIX clc-${arch_suffix} TRIPLE ${clang_triple} - TARGET_ENV libspirv - COMPILE_OPT ${build_flags} + COMPILE_FLAGS ${build_flags} OPT_FLAGS ${opt_flags} LIB_FILES ${clc_lib_files} ) From 4b1ce3bf2406d22ff12b25b436df315b3994ff19 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 3 Dec 2024 12:51:13 +0000 Subject: [PATCH 3/5] add tests --- clang/lib/Driver/ToolChains/Clang.cpp | 1 + clang/test/CodeGenSYCL/nvptx-short-ptr.cpp | 10 ++++++++++ clang/test/Driver/sycl-nvptx-short-ptr.cpp | 16 ++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 clang/test/CodeGenSYCL/nvptx-short-ptr.cpp create mode 100644 clang/test/Driver/sycl-nvptx-short-ptr.cpp diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index c1da2d73a1cab..b6b93cb60f4a8 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8543,6 +8543,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } + // Propagate -fcuda-short-ptr if compiling CUDA or SYCL for NVPTX if (IsCuda || (IsSYCLDevice && Triple.isNVPTX())) { if (Args.hasFlag(options::OPT_fcuda_short_ptr, options::OPT_fno_cuda_short_ptr, false)) diff --git a/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp b/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp new file mode 100644 index 0000000000000..8d65baae78b1c --- /dev/null +++ b/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -fsycl-is-device -disable-llvm-passes \ +// RUN: -triple nvptx-nvidia-cuda -emit-llvm -fcuda-short-ptr -mllvm -nvptx-short-ptr %s -o - \ +// RUN: | FileCheck %s --check-prefix CHECK32 + +// RUN: %clang_cc1 -fsycl-is-device -disable-llvm-passes \ +// RUN: -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-short-ptr -mllvm -nvptx-short-ptr %s -o - \ +// RUN: | FileCheck %s --check-prefix CHECK64 + +// CHECK32: target datalayout = "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" +// CHECK64: target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" diff --git a/clang/test/Driver/sycl-nvptx-short-ptr.cpp b/clang/test/Driver/sycl-nvptx-short-ptr.cpp new file mode 100644 index 0000000000000..d00057a4709e5 --- /dev/null +++ b/clang/test/Driver/sycl-nvptx-short-ptr.cpp @@ -0,0 +1,16 @@ +// REQUIRES: nvptx-registered-target + +// RUN: %clang -### -nocudalib \ +// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s + +// RUN: %clang -### -nocudalib \ +// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fcuda-short-ptr %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-SHORT %s + + +// CHECK-SHORT: "-mllvm" "--nvptx-short-ptr" +// CHECK-SHORT: "-fcuda-short-ptr" + +// CHECK-DEFAULT-NOT: "--nvptx-short-ptr" +// CHECK-DEFAULT-NOT: "-fcuda-short-ptr" From 4243e07ce4fd27d6d79cc95055bb55fb3a23b2e4 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 3 Dec 2024 17:45:47 +0000 Subject: [PATCH 4/5] remove requires from test --- clang/test/Driver/sycl-nvptx-short-ptr.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/clang/test/Driver/sycl-nvptx-short-ptr.cpp b/clang/test/Driver/sycl-nvptx-short-ptr.cpp index d00057a4709e5..e83ba5954d5cf 100644 --- a/clang/test/Driver/sycl-nvptx-short-ptr.cpp +++ b/clang/test/Driver/sycl-nvptx-short-ptr.cpp @@ -1,5 +1,3 @@ -// REQUIRES: nvptx-registered-target - // RUN: %clang -### -nocudalib \ // RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s From 516261c3cbccb9665c37fc12ff450857f3d9285d Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 9 Dec 2024 11:21:26 +0000 Subject: [PATCH 5/5] update test with comments and more checks --- clang/test/CodeGenSYCL/nvptx-short-ptr.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp b/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp index 8d65baae78b1c..90db7373eb967 100644 --- a/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp +++ b/clang/test/CodeGenSYCL/nvptx-short-ptr.cpp @@ -1,10 +1,27 @@ +// Check that when we see the expected data layouts for NVPTX when we pass the +// -nvptx-short-ptr option. + +// RUN: %clang_cc1 -fsycl-is-device -disable-llvm-passes \ +// RUN: -triple nvptx-nvidia-cuda -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefix CHECK32 + // RUN: %clang_cc1 -fsycl-is-device -disable-llvm-passes \ // RUN: -triple nvptx-nvidia-cuda -emit-llvm -fcuda-short-ptr -mllvm -nvptx-short-ptr %s -o - \ // RUN: | FileCheck %s --check-prefix CHECK32 +// RUN: %clang_cc1 -fsycl-is-device -disable-llvm-passes \ +// RUN: -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefix CHECK64-DEFAULT + // RUN: %clang_cc1 -fsycl-is-device -disable-llvm-passes \ // RUN: -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-short-ptr -mllvm -nvptx-short-ptr %s -o - \ -// RUN: | FileCheck %s --check-prefix CHECK64 +// RUN: | FileCheck %s --check-prefix CHECK64-SHORT +// Targeting a 32-bit NVPTX, check that we see universal 32-bit pointers (the +// option changes nothing) // CHECK32: target datalayout = "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" -// CHECK64: target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +// Targeting a 64-bit NVPTX target, check that we see 32-bit pointers for +// shared (3), const (4), and local (5) address spaces only. +// CHECK64-DEFAULT: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +// CHECK64-SHORT: target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"