Skip to content

Commit cd667f5

Browse files
committed
clang: Hack around opencl enqueue_block using wrong ABI for aggregrate
EmitAggExprToLValue started wrapping the temporary alloca in an addrspacecast at some point. We take the direct type from this as the pointer argument for the runtime function type, but this isn't correct. Technically, we should be querying the target's ABI for what IR to produce for this sequence. The assumption seems to always have been that this will be indirectly passed with byval (or byref). I started working on a patch to go through the ABI handling, but it seems to require more time and/or clang expertise than I have at the moment.
1 parent 6311e3f commit cd667f5

File tree

2 files changed

+26
-14
lines changed

2 files changed

+26
-14
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6205,13 +6205,25 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
62056205
llvm::Value *Flags = EmitScalarExpr(E->getArg(1));
62066206
LValue NDRangeL = EmitAggExprToLValue(E->getArg(2));
62076207
llvm::Value *Range = NDRangeL.getAddress().emitRawPointer(*this);
6208-
llvm::Type *RangeTy = NDRangeL.getAddress().getType();
6208+
6209+
// FIXME: Look through the addrspacecast which may exist to the stack
6210+
// temporary as a hack.
6211+
//
6212+
// This is hardcoding the assumed ABI of the target function. This assumes
6213+
// direct passing for every argument except NDRange, which is assumed to be
6214+
// byval or byref indirect passed.
6215+
//
6216+
// This should be fixed to query a signature from CGOpenCLRuntime, and go
6217+
// through EmitCallArgs to get the correct target ABI.
6218+
Range = Range->stripPointerCasts();
6219+
6220+
llvm::Type *RangePtrTy = Range->getType();
62096221

62106222
if (NumArgs == 4) {
62116223
// The most basic form of the call with parameters:
62126224
// queue_t, kernel_enqueue_flags_t, ndrange_t, block(void)
62136225
Name = "__enqueue_kernel_basic";
6214-
llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy,
6226+
llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangePtrTy, GenericVoidPtrTy,
62156227
GenericVoidPtrTy};
62166228
llvm::FunctionType *FTy = llvm::FunctionType::get(
62176229
Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
@@ -6286,7 +6298,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
62866298
Block, ConstantInt::get(IntTy, NumArgs - 4),
62876299
ElemPtr};
62886300
llvm::Type *const ArgTys[] = {
6289-
QueueTy, IntTy, RangeTy, GenericVoidPtrTy,
6301+
QueueTy, IntTy, RangePtrTy, GenericVoidPtrTy,
62906302
GenericVoidPtrTy, IntTy, ElemPtr->getType()};
62916303

62926304
llvm::FunctionType *FTy = llvm::FunctionType::get(Int32Ty, ArgTys, false);
@@ -6337,7 +6349,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
63376349
Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
63386350

63396351
std::vector<llvm::Type *> ArgTys = {
6340-
QueueTy, Int32Ty, RangeTy, Int32Ty,
6352+
QueueTy, Int32Ty, RangePtrTy, Int32Ty,
63416353
PtrTy, PtrTy, GenericVoidPtrTy, GenericVoidPtrTy};
63426354

63436355
std::vector<llvm::Value *> Args = {Queue, Flags, Range,

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ kernel void test_target_features_kernel(global int *i) {
140140
// NOCPU-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
141141
// NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1
142142
// NOCPU-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8
143-
// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
143+
// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
144144
// NOCPU-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
145145
// NOCPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
146146
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
@@ -162,7 +162,7 @@ kernel void test_target_features_kernel(global int *i) {
162162
// NOCPU-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
163163
// NOCPU-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8
164164
// NOCPU-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8
165-
// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
165+
// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
166166
// NOCPU-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
167167
// NOCPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
168168
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
@@ -186,7 +186,7 @@ kernel void test_target_features_kernel(global int *i) {
186186
// NOCPU-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8
187187
// NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
188188
// NOCPU-NEXT: store i64 100, ptr [[TMP18]], align 8
189-
// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr [[TMP11_ASCAST]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
189+
// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
190190
// NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
191191
// NOCPU-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8
192192
// NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1
@@ -204,7 +204,7 @@ kernel void test_target_features_kernel(global int *i) {
204204
// NOCPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
205205
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
206206
// NOCPU-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8
207-
// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
207+
// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
208208
// NOCPU-NEXT: ret void
209209
//
210210
//
@@ -365,7 +365,7 @@ kernel void test_target_features_kernel(global int *i) {
365365
// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8
366366
// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4
367367
// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false)
368-
// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
368+
// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
369369
// NOCPU-NEXT: ret void
370370
//
371371
//
@@ -486,7 +486,7 @@ kernel void test_target_features_kernel(global int *i) {
486486
// GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4
487487
// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]]
488488
// GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]]
489-
// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr [[TMP_ASCAST]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
489+
// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]])
490490
// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]]
491491
// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
492492
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
@@ -508,7 +508,7 @@ kernel void test_target_features_kernel(global int *i) {
508508
// GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5
509509
// GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
510510
// GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
511-
// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr [[TMP2_ASCAST]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
511+
// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]])
512512
// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]]
513513
// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
514514
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
@@ -533,7 +533,7 @@ kernel void test_target_features_kernel(global int *i) {
533533
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
534534
// GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0
535535
// GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8
536-
// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr [[TMP11_ASCAST]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
536+
// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]])
537537
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
538538
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
539539
// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0
@@ -553,7 +553,7 @@ kernel void test_target_features_kernel(global int *i) {
553553
// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
554554
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
555555
// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA16]]
556-
// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr [[TMP27_ASCAST]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
556+
// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]])
557557
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
558558
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
559559
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
@@ -709,7 +709,7 @@ kernel void test_target_features_kernel(global int *i) {
709709
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]]
710710
// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]]
711711
// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
712-
// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr [[TMP_ASCAST]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
712+
// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
713713
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
714714
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
715715
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]

0 commit comments

Comments
 (0)