diff --git a/clang/test/CodeGenCUDA/memcpy-libcall.cu b/clang/test/CodeGenCUDA/memcpy-libcall.cu index c20fa2faceb01..5a15201121dd6 100644 --- a/clang/test/CodeGenCUDA/memcpy-libcall.cu +++ b/clang/test/CodeGenCUDA/memcpy-libcall.cu @@ -32,18 +32,20 @@ void __global__ copy_param_to_global(S *global, S param) { // PTX: st.global.b32 } -// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_( -void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local, - S param) { - __builtin_memcpy(local, ¶m, sizeof(S)); +// PTX-LABEL: .func (.param .b32 func_retval0) _Z19copy_param_to_local1Si( +int __device__ copy_param_to_local(S param, int i) { + S local; + __builtin_memcpy(&local, ¶m, sizeof(S)); // PTX: ld.param.b32 // PTX: st.local.b32 + return local.data[i]; } -// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_( -void __device__ copy_local_to_generic(S *generic, - __attribute__((address_space(5))) S *src) { - __builtin_memcpy(generic, src, sizeof(S)); +// PTX-LABEL: .func _Z21copy_local_to_genericP1Sii( +void __device__ copy_local_to_generic(S *generic, int i, int j) { + S src = {{0, i, 2*i, 3*i, 4*i, 5*i, 6*i, 7*i}}; + src.data[j] = src.data[j+1]; + __builtin_memcpy(generic, &src, sizeof(S)); // PTX: ld.local.b32 // PTX: st.b32 } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 83c1264aef12b..3955fd10dad0d 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -119,6 +119,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ModRef.h" +#include "llvm/Support/NVPTXAddrSpace.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -2927,6 +2928,20 @@ void Verifier::visitFunction(const Function &F) { "Calling convention does not support varargs or " "perfect forwarding!", &F); + if (F.getCallingConv() == CallingConv::PTX_Kernel && + TT.getOS() == Triple::CUDA) { + for (const Argument &Arg : F.args()) { + if (Arg.getType()->isPointerTy()) { + auto AS = Arg.getType()->getPointerAddressSpace(); + Check(AS != NVPTXAS::AddressSpace::ADDRESS_SPACE_SHARED, + ".shared ptr kernel args unsupported in CUDA.", &Arg, &F); + Check(AS != NVPTXAS::AddressSpace::ADDRESS_SPACE_CONST, + ".const ptr kernel args unsupported in CUDA.", &Arg, &F); + Check(AS != NVPTXAS::AddressSpace::ADDRESS_SPACE_LOCAL, + ".local ptr kernel args unsupported in CUDA.", &Arg, &F); + } + } + } break; } diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll index a56b85de80143..e85ccf34bb6ac 100644 --- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %} +; RUN: llc < %s -mcpu=sm_60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_60 | %ptxas -arch=sm_60 - %} +target triple = "nvptx64-nvidia-nvcl" %struct.Large = type { [16 x double] } diff --git a/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll new file mode 100644 index 0000000000000..5909eff59728f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll @@ -0,0 +1,13 @@ +; RUN: not llc < %s -mcpu=sm_75 -o /dev/null 2>&1 | FileCheck %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; Make sure we exit with an error message for this input, as pointers to the +; shared address-space are only supported as kernel args in NVCL, not CUDA. +; CHECK: .shared ptr kernel args unsupported in CUDA. +define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { + %v = load i32, ptr addrspace(3) %in, align 4 + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll new file mode 100644 index 0000000000000..104e7d50457d1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefixes COMMON,IR +; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefixes COMMON,PTX +; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %} + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-nvcl" + +; COMMON-LABEL: ptr_nongeneric +define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { +; IR-NOT: addrspacecast +; PTX-NOT: cvta.to.global +; PTX: ld.shared.b32 +; PTX st.global.b32 + %v = load i32, ptr addrspace(3) %in, align 4 + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 246408ecf6a3a..269e36d6f3728 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -140,29 +140,6 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) { ret void } -define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { -; IR-LABEL: define ptx_kernel void @ptr_nongeneric( -; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) { -; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4 -; IR-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 -; IR-NEXT: ret void -; -; PTX-LABEL: ptr_nongeneric( -; PTX: { -; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<3>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b64 %rd1, [ptr_nongeneric_param_0]; -; PTX-NEXT: ld.param.b64 %rd2, [ptr_nongeneric_param_1]; -; PTX-NEXT: ld.shared.b32 %r1, [%rd2]; -; PTX-NEXT: st.global.b32 [%rd1], %r1; -; PTX-NEXT: ret; - %v = load i32, ptr addrspace(3) %in, align 4 - store i32 %v, ptr addrspace(1) %out, align 4 - ret void -} - define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) { ; IRC-LABEL: define ptx_kernel void @ptr_as_int( ; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {