-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[clang][CodeGen] Preserve addrspace of enqueue_kernel builtin. #148062
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: None (jofrn) Changes__enqueue_kernel_varargs last parameter is in addrspace(5), but CodeGen currently misses this qualifier. This commit fixes the code to preserve the qualifier by referencing Alloca, which has its casts removed, rather than TmpPtr. Full diff: https://github.com/llvm/llvm-project/pull/148062.diff 2 Files Affected:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 48c91eb4a5b4f..760842d5c0180 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5987,7 +5987,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
auto *Zero = llvm::ConstantInt::get(IntTy, 0);
for (unsigned I = First; I < NumArgs; ++I) {
auto *Index = llvm::ConstantInt::get(IntTy, I - First);
- auto *GEP = Builder.CreateGEP(Tmp.getElementType(), TmpPtr,
+ auto *GEP = Builder.CreateGEP(Tmp.getElementType(), Alloca,
{Zero, Index});
if (I == First)
ElemPtr = GEP;
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index bbb55b7e14941..bfbed79dc7f16 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -209,9 +209,9 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5
// NOCPU-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
// NOCPU-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8
-// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
-// NOCPU-NEXT: store i64 100, ptr [[TMP18]], align 8
-// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
+// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// NOCPU-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8
+// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]])
// NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
// NOCPU-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8
// NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1
@@ -587,9 +587,9 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]]
-// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
-// GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8
-// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
+// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8
+// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]])
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]]
// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
|
|
@llvm/pr-subscribers-clang Author: None (jofrn) Changes__enqueue_kernel_varargs last parameter is in addrspace(5), but CodeGen currently misses this qualifier. This commit fixes the code to preserve the qualifier by referencing Alloca, which has its casts removed, rather than TmpPtr. Full diff: https://github.com/llvm/llvm-project/pull/148062.diff 2 Files Affected:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 48c91eb4a5b4f..760842d5c0180 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5987,7 +5987,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
auto *Zero = llvm::ConstantInt::get(IntTy, 0);
for (unsigned I = First; I < NumArgs; ++I) {
auto *Index = llvm::ConstantInt::get(IntTy, I - First);
- auto *GEP = Builder.CreateGEP(Tmp.getElementType(), TmpPtr,
+ auto *GEP = Builder.CreateGEP(Tmp.getElementType(), Alloca,
{Zero, Index});
if (I == First)
ElemPtr = GEP;
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index bbb55b7e14941..bfbed79dc7f16 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -209,9 +209,9 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5
// NOCPU-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
// NOCPU-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8
-// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
-// NOCPU-NEXT: store i64 100, ptr [[TMP18]], align 8
-// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
+// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// NOCPU-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8
+// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]])
// NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
// NOCPU-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8
// NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1
@@ -587,9 +587,9 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]]
-// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
-// GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8
-// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
+// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8
+// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]])
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]]
// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
|
|
@llvm/pr-subscribers-clang-codegen Author: None (jofrn) Changes__enqueue_kernel_varargs last parameter is in addrspace(5), but CodeGen currently misses this qualifier. This commit fixes the code to preserve the qualifier by referencing Alloca, which has its casts removed, rather than TmpPtr. Full diff: https://github.com/llvm/llvm-project/pull/148062.diff 2 Files Affected:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 48c91eb4a5b4f..760842d5c0180 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5987,7 +5987,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
auto *Zero = llvm::ConstantInt::get(IntTy, 0);
for (unsigned I = First; I < NumArgs; ++I) {
auto *Index = llvm::ConstantInt::get(IntTy, I - First);
- auto *GEP = Builder.CreateGEP(Tmp.getElementType(), TmpPtr,
+ auto *GEP = Builder.CreateGEP(Tmp.getElementType(), Alloca,
{Zero, Index});
if (I == First)
ElemPtr = GEP;
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index bbb55b7e14941..bfbed79dc7f16 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -209,9 +209,9 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5
// NOCPU-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
// NOCPU-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8
-// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
-// NOCPU-NEXT: store i64 100, ptr [[TMP18]], align 8
-// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
+// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// NOCPU-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8
+// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]])
// NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
// NOCPU-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8
// NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1
@@ -587,9 +587,9 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]]
-// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
-// GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8
-// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
+// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
+// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8
+// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr addrspace(5) [[TMP18]])
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]]
// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
__enqueue_kernel_varargs last parameter is in addrspace(5), but CodeGen currently misses this qualifier. This commit fixes the code to preserve the qualifier by referencing Alloca, which has its casts removed, rather than TmpPtr.
bd5126d to
b8c027c
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#130216 also relevant
__enqueue_kernel_varargs last parameter is in addrspace(5), but CodeGen currently misses this qualifier. This commit fixes the code to preserve the qualifier by referencing Alloca, which has its casts removed, rather than TmpPtr.