Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions clang/test/CodeGenCUDA/memcpy-libcall.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,20 @@ void __global__ copy_param_to_global(S *global, S param) {
// PTX: st.global.b32
}

// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_(
void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local,
S param) {
__builtin_memcpy(local, &param, sizeof(S));
// PTX-LABEL: .func (.param .b32 func_retval0) _Z19copy_param_to_local1Si(
int __device__ copy_param_to_local(S param, int i) {
S local;
__builtin_memcpy(&local, &param, sizeof(S));
// PTX: ld.param.b32
// PTX: st.local.b32
return local.data[i];
}

// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_(
void __device__ copy_local_to_generic(S *generic,
__attribute__((address_space(5))) S *src) {
__builtin_memcpy(generic, src, sizeof(S));
// PTX-LABEL: .func _Z21copy_local_to_genericP1Sii(
void __device__ copy_local_to_generic(S *generic, int i, int j) {
S src = {{0, i, 2*i, 3*i, 4*i, 5*i, 6*i, 7*i}};
src.data[j] = src.data[j+1];
__builtin_memcpy(generic, &src, sizeof(S));
// PTX: ld.local.b32
// PTX: st.b32
}
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/IR/Verifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -2927,6 +2928,20 @@ void Verifier::visitFunction(const Function &F) {
"Calling convention does not support varargs or "
"perfect forwarding!",
&F);
if (F.getCallingConv() == CallingConv::PTX_Kernel &&
TT.getOS() == Triple::CUDA) {
for (const Argument &Arg : F.args()) {
if (Arg.getType()->isPointerTy()) {
auto AS = Arg.getType()->getPointerAddressSpace();
Check(AS != NVPTXAS::AddressSpace::ADDRESS_SPACE_SHARED,
".shared ptr kernel args unsupported in CUDA.", &Arg, &F);
Comment on lines +2936 to +2937
Copy link
Member

@Artem-B Artem-B May 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the check should be rephrased to only allow generic and global AS, and error out on anything else, so we don't have to update it when a new AS is added, or if/when someone uses a nonsensical AS.

Check(AS != NVPTXAS::AddressSpace::ADDRESS_SPACE_CONST,
".const ptr kernel args unsupported in CUDA.", &Arg, &F);
Check(AS != NVPTXAS::AddressSpace::ADDRESS_SPACE_LOCAL,
".local ptr kernel args unsupported in CUDA.", &Arg, &F);
}
}
}
break;
}

Expand Down
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/NVPTX/kernel-param-align.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %}
; RUN: llc < %s -mcpu=sm_60 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mcpu=sm_60 | %ptxas -arch=sm_60 - %}
target triple = "nvptx64-nvidia-nvcl"

%struct.Large = type { [16 x double] }

Expand Down
13 changes: 13 additions & 0 deletions llvm/test/CodeGen/NVPTX/lower-args-cuda.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
; RUN: not llc < %s -mcpu=sm_75 -o /dev/null 2>&1 | FileCheck %s

target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; Make sure we exit with an error message for this input, as pointers to the
; shared address-space are only supported as kernel args in NVCL, not CUDA.
; CHECK: .shared ptr kernel args unsupported in CUDA.
define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
%v = load i32, ptr addrspace(3) %in, align 4
store i32 %v, ptr addrspace(1) %out, align 4
ret void
}
17 changes: 17 additions & 0 deletions llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefixes COMMON,IR
; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefixes COMMON,PTX
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}

target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-nvcl"

; COMMON-LABEL: ptr_nongeneric
define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; IR-NOT: addrspacecast
; PTX-NOT: cvta.to.global
; PTX: ld.shared.b32
; PTX st.global.b32
%v = load i32, ptr addrspace(3) %in, align 4
store i32 %v, ptr addrspace(1) %out, align 4
ret void
}
23 changes: 0 additions & 23 deletions llvm/test/CodeGen/NVPTX/lower-args.ll
Original file line number Diff line number Diff line change
Expand Up @@ -140,29 +140,6 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
ret void
}

define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; IR-LABEL: define ptx_kernel void @ptr_nongeneric(
; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4
; IR-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; IR-NEXT: ret void
;
; PTX-LABEL: ptr_nongeneric(
; PTX: {
; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ptr_nongeneric_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ptr_nongeneric_param_1];
; PTX-NEXT: ld.shared.b32 %r1, [%rd2];
; PTX-NEXT: st.global.b32 [%rd1], %r1;
; PTX-NEXT: ret;
%v = load i32, ptr addrspace(3) %in, align 4
store i32 %v, ptr addrspace(1) %out, align 4
ret void
}

define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int(
; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
Expand Down