diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 9d29f31c77881..7d84310ba0bf5 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -614,6 +614,20 @@ void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
       FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
 }
 
+/// Return the IR struct type for the rtinfo struct defined by rocm-device-libs
+/// and used for device enqueue:
+///
+/// ptr addrspace(1) kernel_object, i32 private_segment_size,
+/// i32 group_segment_size
+
+static llvm::StructType *
+getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
+                           llvm::Type *KernelDescriptorPtrTy) {
+  llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
+  return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
+                                  "block.runtime.handle.t");
+}
+
 /// Create an OpenCL kernel for an enqueued block.
 ///
 /// The type of the first argument (the block literal) is the struct type
@@ -653,23 +667,29 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
     ArgNames.push_back(
         llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
   }
-  std::string Name = Invoke->getName().str() + "_kernel";
+
+  llvm::Module &Mod = CGF.CGM.getModule();
+  const llvm::DataLayout &DL = Mod.getDataLayout();
+
+  llvm::Twine Name = Invoke->getName() + "_kernel";
   auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
+
+  // The kernel itself can be internal; the runtime does not directly access
+  // the kernel address (only the kernel descriptor).
   auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
-                                   &CGF.CGM.getModule());
+                                   &Mod);
   F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
 
   llvm::AttrBuilder KernelAttrs(C);
   // FIXME: The invoke isn't applying the right attributes either
   // FIXME: This is missing setTargetAttributes
   CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
-  KernelAttrs.addAttribute("enqueued-block");
   F->addFnAttrs(KernelAttrs);
 
   auto IP = CGF.Builder.saveIP();
   auto *BB = llvm::BasicBlock::Create(C, "entry", F);
   Builder.SetInsertPoint(BB);
-  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
+  const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
   auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
   BlockPtr->setAlignment(BlockAlign);
   Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
@@ -692,7 +712,39 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
   if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
     F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
 
-  return F;
+  llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
+      C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
+  llvm::Constant *RuntimeHandleInitializer =
+      llvm::ConstantAggregateZero::get(HandleTy);
+
+  llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";
+
+  // The runtime needs access to the runtime handle as an external symbol. The
+  // runtime handle will need to be made external later, in
+  // AMDGPUExportKernelRuntimeHandles. The kernel itself has a hidden reference
+  // inside the runtime handle, and is not directly referenced.
+
+  // TODO: We would initialize the first field by declaring F->getName() + ".kd"
+  // to reference the kernel descriptor. The runtime wouldn't need to bother
+  // setting it. We would need to have a final symbol name though.
+  // TODO: Can we directly use an external symbol with getGlobalIdentifier?
+ auto *RuntimeHandle = new llvm::GlobalVariable( + Mod, HandleTy, + /*isConstant=*/true, llvm::GlobalValue::InternalLinkage, + /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName, + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + DL.getDefaultGlobalsAddressSpace(), + /*isExternallyInitialized=*/true); + + llvm::MDNode *HandleAsMD = + llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle)); + F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD); + + RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle"); + + CGF.CGM.addUsedGlobal(F); + CGF.CGM.addUsedGlobal(RuntimeHandle); + return RuntimeHandle; } void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr( diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index 1c93e2b0d9844..51fd4820d7edb 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -136,6 +136,7 @@ if( NOT CLANG_BUILT_STANDALONE ) llvm-dis llvm-dwarfdump llvm-ifs + llvm-link llvm-lto2 llvm-mc llvm-modextract diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl new file mode 100644 index 0000000000000..a61c5530c8a73 --- /dev/null +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl @@ -0,0 +1,80 @@ +// Make sure that invoking blocks in static functions with the same name in +// different modules are linked together. + +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -fno-ident -DKERNEL_NAME=test_kernel_first -DTYPE=float -DCONST=256.0f -emit-llvm-bc -o %t.0.bc %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -fno-ident -DKERNEL_NAME=test_kernel_second -DTYPE=int -DCONST=128.0f -emit-llvm-bc -o %t.1.bc %s + +// Make sure nothing strange happens with the linkage choices. 
+// RUN: opt -passes=globalopt -o %t.opt.0.bc %t.0.bc +// RUN: opt -passes=globalopt -o %t.opt.1.bc %t.1.bc + +// Check the result of linking +// RUN: llvm-link -S %t.opt.0.bc %t.opt.1.bc -o - | FileCheck %s + +// Make sure that a block invoke used with the same name works in multiple +// translation units + +// CHECK: @llvm.used = appending addrspace(1) global [4 x ptr] [ptr @__static_invoker_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle to ptr), ptr @__static_invoker_block_invoke_kernel.2, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3 to ptr)], section "llvm.metadata" + + +// CHECK: @__static_invoker_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" +// CHECK: @__static_invoker_block_invoke_kernel.runtime.handle.3 = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" + +// CHECK: define internal amdgpu_kernel void @__static_invoker_block_invoke_kernel(<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1) }> %0) #{{[0-9]+}} !associated ![[ASSOC_FIRST_MD:[0-9]+]] + + +// CHECK-LABEL: define internal void @__static_invoker_block_invoke(ptr noundef %.block_descriptor) +// CHECK: call float @llvm.fmuladd.f32 + + +// CHECK-LABEL: define dso_local amdgpu_kernel void @test_kernel_first( + + +// CHECK-LABEL: define internal fastcc void @static_invoker(ptr addrspace(1) noundef %outptr, ptr addrspace(1) noundef %argptr) +// CHECK: call i32 @__enqueue_kernel_basic(ptr addrspace(1) %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr addrspace(5) %tmp, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle to ptr), ptr %{{.+}}) + +// CHECK: declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr + + +// CHECK: define internal amdgpu_kernel void @__static_invoker_block_invoke_kernel.2(<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1) }> %0) #{{[0-9]+}} !associated ![[ASSOC_SECOND_MD:[0-9]+]] +// CHECK: call void @__static_invoker_block_invoke.4(ptr % + + +// CHECK-LABEL: define internal void @__static_invoker_block_invoke.4(ptr noundef %.block_descriptor) +// CHECK: mul nsw i32 +// CHECK: sitofp +// CHECK: fadd +// CHECK: fptosi + +// CHECK-LABEL: define dso_local amdgpu_kernel void @test_kernel_second(ptr addrspace(1) noundef align 4 %outptr, ptr addrspace(1) noundef align 4 %argptr, ptr addrspace(1) noundef align 4 %difference) + +// CHECK-LABEL: define internal fastcc void @static_invoker.5(ptr addrspace(1) noundef %outptr, ptr addrspace(1) noundef %argptr) unnamed_addr #{{[0-9]+}} { +// CHECK: call i32 @__enqueue_kernel_basic(ptr addrspace(1) %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr addrspace(5) %tmp, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3 to ptr), ptr %{{.+}}) + + +typedef struct {int a;} ndrange_t; + +static void static_invoker(global TYPE* outptr, global TYPE* argptr) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + global TYPE* f = argptr; + outptr[0] = f[1] * f[2] + CONST; + }); +} + +kernel void KERNEL_NAME(global TYPE *outptr, global TYPE *argptr, global TYPE *difference) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + static_invoker(outptr, argptr); + + 
*difference = CONST; +} + +// CHECK: ![[ASSOC_FIRST_MD]] = !{ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle} +// CHECK: ![[ASSOC_SECOND_MD]] = !{ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3} diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index 9ba36643c4b9e..367217579e765 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -61,7 +61,13 @@ kernel void test_target_features_kernel(global int *i) { } //. +// CHECK: @__test_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" +// CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.0 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +// CHECK: @__test_block_invoke_3_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.1 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +// CHECK: @__test_block_invoke_4_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.2 zeroinitializer, section ".amdgpu.kernel.runtime.handle" // CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0 +// CHECK: @__test_target_features_kernel_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +// CHECK: @llvm.used = appending addrspace(1) global [10 x ptr] [ptr @__test_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr @__test_block_invoke_2_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr @__test_block_invoke_3_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr @__test_block_invoke_4_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata" // CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 //. 
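Taken together, the CHECK lines above pin down the shape of the new lowering. As a rough, hand-written sketch (symbol names taken from the test, bodies and argument lists elided), the IR clang now emits for one enqueued block is:

    %block.runtime.handle.t = type { ptr addrspace(1), i32, i32 }

    ; Zero-initialized and externally_initialized: the runtime is expected to
    ; fill in the kernel descriptor address and segment sizes at load time.
    @__test_block_invoke_kernel.runtime.handle = internal addrspace(1)
        externally_initialized constant %block.runtime.handle.t zeroinitializer,
        section ".amdgpu.kernel.runtime.handle"

    ; The kernel stays internal; it is tied to its handle only via !associated.
    define internal amdgpu_kernel void @__test_block_invoke_kernel(...) !associated !0 {
      ...
    }

    !0 = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle}

Enqueue call sites pass the handle rather than the kernel, e.g. ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), and both symbols are pinned in @llvm.used so optimizations cannot drop them before the backend externalizes the handle.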
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone @@ -140,7 +146,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4 // NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1 // NOCPU-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8 -// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]]) +// NOCPU-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) // NOCPU-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 // NOCPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) @@ -162,7 +168,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5 // NOCPU-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8 // NOCPU-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8 -// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]]) +// NOCPU-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) // NOCPU-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 // NOCPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) @@ -186,7 +192,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8 // NOCPU-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 // NOCPU-NEXT: store i64 100, ptr [[TMP18]], align 8 -// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) +// NOCPU-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) // NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 // NOCPU-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8 // NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1 @@ -204,7 +210,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[TMP23:%.*]] = load i32, ptr 
[[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) // NOCPU-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8 -// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]]) +// NOCPU-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) // NOCPU-NEXT: ret void // // @@ -229,7 +235,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !associated [[META7:![0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -265,7 +271,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META12:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -307,7 +313,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !kernel_arg_addr_space [[META11:![0-9]+]] !kernel_arg_access_qual [[META12:![0-9]+]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META14:![0-9]+]] { +// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META13:![0-9]+]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META15:![0-9]+]] !kernel_arg_type 
[[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META17:![0-9]+]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 @@ -336,7 +342,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] { +// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META18:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -347,7 +353,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone // NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel -// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space [[META15:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META10]] { +// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space [[META19:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META11]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -365,7 +371,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 // NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) -// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) +// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) // NOCPU-NEXT: ret void // // @@ -385,7 +391,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent nounwind // NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type 
[[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] { +// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { // NOCPU-NEXT: entry: // NOCPU-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // NOCPU-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -467,10 +473,10 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16:![0-9]+]] // GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8:[0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7:[0-9]+]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] // GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17:![0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] // GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19:![0-9]+]] // GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]] @@ -486,7 +492,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4 // GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] // GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[BLOCK_ASCAST]]) +// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) // GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] // GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP2_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] @@ -508,7 +514,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5 // GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]] -// 
GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[BLOCK3_ASCAST]]) +// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) // GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] // GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP11_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] @@ -530,12 +536,12 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5 // GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]] // GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8 -// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]] +// GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]] // GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8 // GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1 @@ -553,11 +559,11 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[BLOCK21_ASCAST]]) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) 
#[[ATTR8]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]] +// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] // GFX900-NEXT: ret void // // @@ -579,7 +585,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META22:![0-9]+]] !kernel_arg_addr_space [[META23:![0-9]+]] !kernel_arg_access_qual [[META24:![0-9]+]] !kernel_arg_type [[META25:![0-9]+]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26:![0-9]+]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -612,7 +618,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META27:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -630,7 +636,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 -// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA26:![0-9]+]] 
+// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA28:![0-9]+]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 // GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 @@ -643,7 +649,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0 // GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA26]] +// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA28]] // GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0 // GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: ret void @@ -651,7 +657,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !kernel_arg_addr_space [[META27:![0-9]+]] !kernel_arg_access_qual [[META28:![0-9]+]] !kernel_arg_type [[META29:![0-9]+]] !kernel_arg_base_type [[META29]] !kernel_arg_type_qual [[META30:![0-9]+]] { +// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META29:![0-9]+]] !kernel_arg_addr_space [[META30:![0-9]+]] !kernel_arg_access_qual [[META31:![0-9]+]] !kernel_arg_type [[META32:![0-9]+]] !kernel_arg_base_type [[META32]] !kernel_arg_type_qual [[META33:![0-9]+]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 @@ -671,13 +677,13 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR9:[0-9]+]] +// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] 
!kernel_arg_type_qual [[META25]] { +// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META34:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -688,7 +694,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent norecurse nounwind // GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel -// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META31:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META32:![0-9]+]] !kernel_arg_base_type [[META32]] !kernel_arg_type_qual [[META25]] { +// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META35:![0-9]+]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META36:![0-9]+]] !kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META26]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -700,19 +706,19 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr // GFX900-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr // GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA33:![0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]] +// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA37:![0-9]+]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] // GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] // GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() // GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] // GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] -// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]] -// 
GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]] +// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] // GFX900-NEXT: ret void // // @@ -729,7 +735,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent nounwind // GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META39:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { // GFX900-NEXT: entry: // GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -743,7 +749,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } // NOCPU: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // NOCPU: attributes #[[ATTR4]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// NOCPU: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOCPU: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // NOCPU: attributes #[[ATTR6]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } // NOCPU: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } // NOCPU: attributes #[[ATTR8]] = { convergent nounwind } @@ -754,10 +760,9 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } // GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // GFX900: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } -// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } -// GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } -// GFX900: attributes #[[ATTR8]] = { nounwind } -// GFX900: attributes #[[ATTR9]] = { convergent nounwind } +// GFX900: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } +// GFX900: attributes #[[ATTR7]] = { nounwind } +// GFX900: attributes #[[ATTR8]] = { convergent nounwind } //. // NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} // NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} @@ -766,16 +771,21 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: [[META4]] = !{!"none", !"none", !"none", !"none"} // NOCPU: [[META5]] = !{!"char*", !"char", !"long*", !"long"} // NOCPU: [[META6]] = !{!"", !"", !"", !""} -// NOCPU: [[META7]] = !{i32 0} -// NOCPU: [[META8]] = !{!"none"} -// NOCPU: [[META9]] = !{!"__block_literal"} -// NOCPU: [[META10]] = !{!""} -// NOCPU: [[META11]] = !{i32 0, i32 3} -// NOCPU: [[META12]] = !{!"none", !"none"} -// NOCPU: [[META13]] = !{!"__block_literal", !"void*"} -// NOCPU: [[META14]] = !{!"", !""} -// NOCPU: [[META15]] = !{i32 1} -// NOCPU: [[META16]] = !{!"int*"} +// NOCPU: [[META7]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} +// NOCPU: [[META8]] = !{i32 0} +// NOCPU: [[META9]] = !{!"none"} +// NOCPU: [[META10]] = !{!"__block_literal"} +// NOCPU: [[META11]] = !{!""} +// NOCPU: [[META12]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} +// NOCPU: [[META13]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} +// NOCPU: [[META14]] = !{i32 0, i32 3} +// NOCPU: [[META15]] = !{!"none", !"none"} +// NOCPU: [[META16]] = !{!"__block_literal", !"void*"} +// NOCPU: [[META17]] = !{!"", !""} +// NOCPU: [[META18]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} +// NOCPU: [[META19]] = !{i32 1} +// NOCPU: [[META20]] = !{!"int*"} +// NOCPU: [[META21]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle} //. 
// GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} // GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} @@ -799,19 +809,24 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} // GFX900: [[META20]] = !{!"queue_t", [[META5]], i64 0} // GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[TBAA17]]} -// GFX900: [[META22]] = !{i32 0} -// GFX900: [[META23]] = !{!"none"} -// GFX900: [[META24]] = !{!"__block_literal"} -// GFX900: [[META25]] = !{!""} -// GFX900: [[TBAA26]] = !{[[META9]], [[META9]], i64 0} -// GFX900: [[META27]] = !{i32 0, i32 3} -// GFX900: [[META28]] = !{!"none", !"none"} -// GFX900: [[META29]] = !{!"__block_literal", !"void*"} -// GFX900: [[META30]] = !{!"", !""} -// GFX900: [[META31]] = !{i32 1} -// GFX900: [[META32]] = !{!"int*"} -// GFX900: [[TBAA33]] = !{[[META34:![0-9]+]], [[META34]], i64 0} -// GFX900: [[META34]] = !{!"p1 int", [[META9]], i64 0} +// GFX900: [[META22]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} +// GFX900: [[META23]] = !{i32 0} +// GFX900: [[META24]] = !{!"none"} +// GFX900: [[META25]] = !{!"__block_literal"} +// GFX900: [[META26]] = !{!""} +// GFX900: [[META27]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} +// GFX900: [[TBAA28]] = !{[[META9]], [[META9]], i64 0} +// GFX900: [[META29]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} +// GFX900: [[META30]] = !{i32 0, i32 3} +// GFX900: [[META31]] = !{!"none", !"none"} +// GFX900: [[META32]] = !{!"__block_literal", !"void*"} +// GFX900: [[META33]] = !{!"", !""} +// GFX900: [[META34]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} +// GFX900: [[META35]] = !{i32 1} +// GFX900: [[META36]] = !{!"int*"} +// GFX900: [[TBAA37]] = !{[[META38:![0-9]+]], [[META38]], i64 0} +// GFX900: [[META38]] = !{!"p1 int", [[META9]], i64 0} +// GFX900: [[META39]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle} //. //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: // CHECK: {{.*}} diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index def6addd595e8..162267bd00554 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -2320,6 +2320,9 @@ if needed. as position independent code. See :ref:`amdgpu-code-conventions` for information on conventions used in the isa generation. +``.amdgpu.kernel.runtime.handle`` + Symbols used for device enqueue. + .. _amdgpu-note-records: Note Records diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index cb4ecc60aa473..0e4e135e90972 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include #include #include @@ -5518,6 +5519,51 @@ struct AMDGPUUnsafeFPAtomicsUpgradeVisitor }; } // namespace +static StructType *getAMDGPURuntimeHandleType(LLVMContext &C, + Type *KernelDescriptorPtrTy) { + Type *Int32 = Type::getInt32Ty(C); + return StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32}, + "block.runtime.handle.t"); +} + +/// Rewrite to new scheme for enqueued block lowering +static void upgradeAMDGPUKernelEnqueuedBlock(Function &F) { + if (F.isMaterializable()) { + // A verifier error is produced if we add metadata to the function during + // linking. 
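+    // (A materializable function has not had its body read from bitcode yet;
+    // the upgrade is rerun once the function is actually materialized.)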
+    return;
+  }
+
+  const StringLiteral EnqueuedBlockName("enqueued-block");
+  if (!F.hasFnAttribute(EnqueuedBlockName))
+    return;
+
+  F.removeFnAttr(EnqueuedBlockName);
+
+  Module *M = F.getParent();
+  LLVMContext &Ctx = M->getContext();
+  const DataLayout &DL = M->getDataLayout();
+
+  StructType *HandleTy = getAMDGPURuntimeHandleType(
+      Ctx, PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace()));
+
+  Twine RuntimeHandleName = F.getName() + ".runtime.handle";
+
+  auto *RuntimeHandle = new GlobalVariable(
+      *M, HandleTy,
+      /*isConstant=*/true, F.getLinkage(),
+      /*Initializer=*/ConstantAggregateZero::get(HandleTy), RuntimeHandleName,
+      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
+      DL.getDefaultGlobalsAddressSpace(),
+      /*isExternallyInitialized=*/true);
+  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");
+
+  MDNode *HandleAsMD = MDNode::get(Ctx, ValueAsMetadata::get(RuntimeHandle));
+  F.setMetadata(LLVMContext::MD_associated, HandleAsMD);
+
+  appendToUsed(*M, {&F, RuntimeHandle});
+}
+
 void llvm::UpgradeFunctionAttributes(Function &F) {
   // If a function definition doesn't have the strictfp attribute,
   // convert any callsite strictfp attributes to nobuiltin.
@@ -5558,6 +5604,9 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
       F.removeFnAttr("amdgpu-unsafe-fp-atomics");
     }
   }
+
+  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
+    upgradeAMDGPUKernelEnqueuedBlock(F);
 }
 
 static bool isOldLoopArgument(Metadata *MD) {
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index eb00829fd8c70..a78c58c807f6a 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -92,6 +92,7 @@ add_llvm_component_library(LLVMCore
   LINK_COMPONENTS
   BinaryFormat
   Demangle
+  TransformUtils
   Remarks
   Support
   TargetParser
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 57297288eecb4..8d693fbdb3ba3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -456,9 +456,9 @@ void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
 
 void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
 
-ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass();
-void initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(PassRegistry &);
-extern char &AMDGPUOpenCLEnqueuedBlockLoweringLegacyID;
+ModulePass *createAMDGPUExportKernelRuntimeHandlesLegacyPass();
+void initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(PassRegistry &);
+extern char &AMDGPUExportKernelRuntimeHandlesLegacyID;
 
 void initializeGCNNSAReassignLegacyPass(PassRegistry &);
 extern char &GCNNSAReassignID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp
new file mode 100644
index 0000000000000..a42d94d56b802
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp
@@ -0,0 +1,110 @@
+//===- AMDGPUExportKernelRuntimeHandles.cpp - Export runtime handles ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+//
+// Give any globals used for OpenCL block enqueue runtime handles external
+// linkage so the runtime may access them. These should behave like internal
+// functions for purposes of linking, but need to have an external symbol in the
+// final object for the runtime to access them.
+//
+// TODO: This could be replaced with a new linkage type or global object
+// metadata that produces an external symbol in the final object, but allows
+// rename on IR linking. Alternatively if we can rely on
+// GlobalValue::getGlobalIdentifier we can just make these external symbols to
+// begin with.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUExportKernelRuntimeHandles.h"
+#include "AMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-export-kernel-runtime-handles"
+
+using namespace llvm;
+
+namespace {
+
+/// Externalize enqueued-block runtime handles and their associated kernels.
+class AMDGPUExportKernelRuntimeHandlesLegacy : public ModulePass {
+public:
+  static char ID;
+
+  explicit AMDGPUExportKernelRuntimeHandlesLegacy() : ModulePass(ID) {}
+
+private:
+  bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace
+
+char AMDGPUExportKernelRuntimeHandlesLegacy::ID = 0;
+
+char &llvm::AMDGPUExportKernelRuntimeHandlesLegacyID =
+    AMDGPUExportKernelRuntimeHandlesLegacy::ID;
+
+INITIALIZE_PASS(AMDGPUExportKernelRuntimeHandlesLegacy, DEBUG_TYPE,
+                "Externalize enqueued block runtime handles", false, false)
+
+ModulePass *llvm::createAMDGPUExportKernelRuntimeHandlesLegacyPass() {
+  return new AMDGPUExportKernelRuntimeHandlesLegacy();
+}
+
+static bool exportKernelRuntimeHandles(Module &M) {
+  bool Changed = false;
+
+  const StringLiteral HandleSectionName(".amdgpu.kernel.runtime.handle");
+
+  for (GlobalVariable &GV : M.globals()) {
+    if (GV.getSection() == HandleSectionName) {
+      GV.setLinkage(GlobalValue::ExternalLinkage);
+      GV.setDSOLocal(false);
+      Changed = true;
+    }
+  }
+
+  if (!Changed)
+    return false;
+
+  // FIXME: We shouldn't really need to export the kernel address. We can
+  // initialize the runtime handle with the kernel descriptor.
+ for (Function &F : M) { + if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) + continue; + + const MDNode *Associated = F.getMetadata(LLVMContext::MD_associated); + if (!Associated) + continue; + + auto *VM = cast(Associated->getOperand(0)); + auto *Handle = dyn_cast(VM->getValue()); + if (Handle && Handle->getSection() == HandleSectionName) { + F.setLinkage(GlobalValue::ExternalLinkage); + F.setVisibility(GlobalValue::ProtectedVisibility); + } + } + + return Changed; +} + +bool AMDGPUExportKernelRuntimeHandlesLegacy::runOnModule(Module &M) { + return exportKernelRuntimeHandles(M); +} + +PreservedAnalyses +AMDGPUExportKernelRuntimeHandlesPass::run(Module &M, + ModuleAnalysisManager &MAM) { + if (!exportKernelRuntimeHandles(M)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet>(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.h similarity index 50% rename from llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h rename to llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.h index 16ed7c18d8523..6e7ebc977668b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.h @@ -1,4 +1,4 @@ -//===- AMDGPUOpenCLEnqueuedBlockLowering.h -----------------------*- C++-*-===// +//===- AMDGPUExportKernelRuntimeHandles.h -----------------------*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,18 +6,18 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H -#define LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_EXPORTKERNELRUNTIMEHANDLES_H +#define LLVM_LIB_TARGET_AMDGPU_EXPORTKERNELRUNTIMEHANDLES_H #include "llvm/IR/PassManager.h" namespace llvm { -class AMDGPUOpenCLEnqueuedBlockLoweringPass - : public PassInfoMixin { +class AMDGPUExportKernelRuntimeHandlesPass + : public PassInfoMixin { public: - AMDGPUOpenCLEnqueuedBlockLoweringPass() = default; + AMDGPUExportKernelRuntimeHandlesPass() = default; PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); }; } // namespace llvm -#endif // LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H +#endif // LLVM_LIB_TARGET_AMDGPU_EXPORTKERNELRUNTIMEHANDLES_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index ee8a700f988dc..2991778a1bbc7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -21,6 +21,8 @@ #include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + using namespace llvm; static std::pair getArgumentTypeAlign(const Argument &Arg, @@ -38,6 +40,27 @@ static std::pair getArgumentTypeAlign(const Argument &Arg, return std::pair(Ty, *ArgAlign); } +/// Find the mangled symbol name for the runtime handle for \p EnqueuedBlock +static std::string getEnqueuedBlockSymbolName(const AMDGPUTargetMachine &TM, + const Function &EnqueuedBlock) { + const MDNode *Associated = + EnqueuedBlock.getMetadata(LLVMContext::MD_associated); + if (!Associated) + return ""; + + auto *VM = cast(Associated->getOperand(0)); + auto *RuntimeHandle = + 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.h
similarity index 50%
rename from llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h
rename to llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.h
index 16ed7c18d8523..6e7ebc977668b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.h
@@ -1,4 +1,4 @@
-//===- AMDGPUOpenCLEnqueuedBlockLowering.h -----------------------*- C++-*-===//
+//===- AMDGPUExportKernelRuntimeHandles.h -----------------------*- C++-*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,18 +6,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H
-#define LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_EXPORTKERNELRUNTIMEHANDLES_H
+#define LLVM_LIB_TARGET_AMDGPU_EXPORTKERNELRUNTIMEHANDLES_H
 
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
 
-class AMDGPUOpenCLEnqueuedBlockLoweringPass
-    : public PassInfoMixin<AMDGPUOpenCLEnqueuedBlockLoweringPass> {
+class AMDGPUExportKernelRuntimeHandlesPass
+    : public PassInfoMixin<AMDGPUExportKernelRuntimeHandlesPass> {
 public:
-  AMDGPUOpenCLEnqueuedBlockLoweringPass() = default;
+  AMDGPUExportKernelRuntimeHandlesPass() = default;
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
 };
 
 } // namespace llvm
 
-#endif // LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H
+#endif // LLVM_LIB_TARGET_AMDGPU_EXPORTKERNELRUNTIMEHANDLES_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index ee8a700f988dc..2991778a1bbc7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -21,6 +21,8 @@
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
 using namespace llvm;
 
 static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
@@ -38,6 +40,27 @@ static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
   return std::pair(Ty, *ArgAlign);
 }
 
+/// Find the mangled symbol name for the runtime handle for \p EnqueuedBlock
+static std::string getEnqueuedBlockSymbolName(const AMDGPUTargetMachine &TM,
+                                              const Function &EnqueuedBlock) {
+  const MDNode *Associated =
+      EnqueuedBlock.getMetadata(LLVMContext::MD_associated);
+  if (!Associated)
+    return "";
+
+  auto *VM = cast<ValueAsMetadata>(Associated->getOperand(0));
+  auto *RuntimeHandle =
+      dyn_cast<GlobalVariable>(VM->getValue()->stripPointerCasts());
+  if (!RuntimeHandle ||
+      RuntimeHandle->getSection() != ".amdgpu.kernel.runtime.handle")
+    return "";
+
+  SmallString<128> Name;
+  TM.getNameWithPrefix(Name, RuntimeHandle,
+                       TM.getObjFileLowering()->getMangler());
+  return Name.str().str();
+}
+
 namespace llvm {
 
 static cl::opt<bool> DumpHSAMetadata(
@@ -230,7 +253,8 @@ void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func,
   Kern[".language_version"] = LanguageVersion;
 }
 
-void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
+void MetadataStreamerMsgPackV4::emitKernelAttrs(const AMDGPUTargetMachine &TM,
+                                                const Function &Func,
                                                 msgpack::MapDocNode Kern) {
 
   if (auto *Node = Func.getMetadata("reqd_work_group_size"))
@@ -244,11 +268,13 @@ void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
             mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()),
         /*Copy=*/true);
   }
-  if (Func.hasFnAttribute("runtime-handle")) {
-    Kern[".device_enqueue_symbol"] = Kern.getDocument()->getNode(
-        Func.getFnAttribute("runtime-handle").getValueAsString().str(),
-        /*Copy=*/true);
+
+  std::string HandleName = getEnqueuedBlockSymbolName(TM, Func);
+  if (!HandleName.empty()) {
+    Kern[".device_enqueue_symbol"] =
+        Kern.getDocument()->getNode(std::move(HandleName), /*Copy=*/true);
   }
+
   if (Func.hasFnAttribute("device-init"))
     Kern[".kind"] = Kern.getDocument()->getNode("init");
   else if (Func.hasFnAttribute("device-fini"))
@@ -567,12 +593,13 @@ void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
   auto Kernels =
       getRootMetadata("amdhsa.kernels").getArray(/*Convert=*/true);
 
+  auto &TM = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
   {
     Kern[".name"] = Kern.getDocument()->getNode(Func.getName());
     Kern[".symbol"] = Kern.getDocument()->getNode(
         (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
     emitKernelLanguage(Func, Kern);
-    emitKernelAttrs(Func, Kern);
+    emitKernelAttrs(TM, Func, Kern);
     emitKernelArgs(MF, Kern);
   }
 
@@ -698,9 +725,10 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
   emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
 }
 
-void MetadataStreamerMsgPackV5::emitKernelAttrs(const Function &Func,
+void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM,
+                                                const Function &Func,
                                                 msgpack::MapDocNode Kern) {
-  MetadataStreamerMsgPackV4::emitKernelAttrs(Func, Kern);
+  MetadataStreamerMsgPackV4::emitKernelAttrs(TM, Func, Kern);
 
   if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool())
     Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index fd76666dc360b..22dfcb4a4ec1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -23,6 +23,7 @@
 
 namespace llvm {
 
+class AMDGPUTargetMachine;
 class AMDGPUTargetStreamer;
 class Argument;
 class DataLayout;
@@ -59,7 +60,8 @@ class MetadataStreamer {
   virtual void emitVersion() = 0;
   virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
                                     msgpack::ArrayDocNode Args) = 0;
-  virtual void emitKernelAttrs(const Function &Func,
+  virtual void emitKernelAttrs(const AMDGPUTargetMachine &TM,
+                               const Function &Func,
                                msgpack::MapDocNode Kern) = 0;
 };
 
@@ -100,7 +102,8 @@ class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4
 
   void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);
 
-  void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern)
override; + void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func, + msgpack::MapDocNode Kern) override; void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern); @@ -146,7 +149,8 @@ class MetadataStreamerMsgPackV5 : public MetadataStreamerMsgPackV4 { void emitVersion() override; void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) override; - void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) override; + void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func, + msgpack::MapDocNode Kern) override; public: MetadataStreamerMsgPackV5() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp deleted file mode 100644 index fbd15ad176e3b..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ /dev/null @@ -1,136 +0,0 @@ -//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// This post-linking pass replaces the function pointer of enqueued -// block kernel with a global variable (runtime handle) and adds -// "runtime-handle" attribute to the enqueued block kernel. -// -// In LLVM CodeGen the runtime-handle metadata will be translated to -// RuntimeHandle metadata in code object. Runtime allocates a global buffer -// for each kernel with RuntimeHandle metadata and saves the kernel address -// required for the AQL packet into the buffer. __enqueue_kernel function -// in device library knows that the invoke function pointer in the block -// literal is actually runtime handle and loads the kernel address from it -// and put it into AQL packet for dispatching. -// -// This cannot be done in FE since FE cannot create a unique global variable -// with external linkage across LLVM modules. The global variable with internal -// linkage does not work since optimization passes will try to replace loads -// of the global variable with its initialization value. -// -// It also identifies the kernels directly or indirectly enqueues kernels -// and adds "calls-enqueue-kernel" function attribute to them, which will -// be used to determine whether to emit runtime metadata for the kernel -// enqueue related hidden kernel arguments. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUOpenCLEnqueuedBlockLowering.h" -#include "AMDGPU.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Mangler.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "amdgpu-lower-enqueued-block" - -using namespace llvm; - -namespace { - -/// Lower enqueued blocks. 
-class AMDGPUOpenCLEnqueuedBlockLowering {
-public:
-  bool run(Module &M);
-};
-
-class AMDGPUOpenCLEnqueuedBlockLoweringLegacy : public ModulePass {
-public:
-  static char ID;
-
-  explicit AMDGPUOpenCLEnqueuedBlockLoweringLegacy() : ModulePass(ID) {}
-
-private:
-  bool runOnModule(Module &M) override;
-};
-
-} // end anonymous namespace
-
-char AMDGPUOpenCLEnqueuedBlockLoweringLegacy::ID = 0;
-
-char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringLegacyID =
-    AMDGPUOpenCLEnqueuedBlockLoweringLegacy::ID;
-
-INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLoweringLegacy, DEBUG_TYPE,
-                "Lower OpenCL enqueued blocks", false, false)
-
-ModulePass *llvm::createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass() {
-  return new AMDGPUOpenCLEnqueuedBlockLoweringLegacy();
-}
-
-bool AMDGPUOpenCLEnqueuedBlockLoweringLegacy::runOnModule(Module &M) {
-  AMDGPUOpenCLEnqueuedBlockLowering Impl;
-  return Impl.run(M);
-}
-
-PreservedAnalyses
-AMDGPUOpenCLEnqueuedBlockLoweringPass::run(Module &M, ModuleAnalysisManager &) {
-  AMDGPUOpenCLEnqueuedBlockLowering Impl;
-  if (Impl.run(M))
-    return PreservedAnalyses::none();
-  return PreservedAnalyses::all();
-}
-
-bool AMDGPUOpenCLEnqueuedBlockLowering::run(Module &M) {
-  DenseSet<Function *> Callers;
-  auto &C = M.getContext();
-  bool Changed = false;
-
-  // ptr kernel_object, i32 private_segment_size, i32 group_segment_size
-  StructType *HandleTy = nullptr;
-
-  for (auto &F : M.functions()) {
-    if (F.hasFnAttribute("enqueued-block")) {
-      if (!F.hasName()) {
-        SmallString<64> Name;
-        Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
-                                   M.getDataLayout());
-        F.setName(Name);
-      }
-      LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
-      auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
-      if (!HandleTy) {
-        Type *Int32 = Type::getInt32Ty(C);
-        HandleTy =
-            StructType::create(C, {PointerType::getUnqual(C), Int32, Int32},
-                               "block.runtime.handle.t");
-      }
-
-      auto *GV = new GlobalVariable(
-          M, HandleTy,
-          /*isConstant=*/true, GlobalValue::ExternalLinkage,
-          /*Initializer=*/Constant::getNullValue(HandleTy), RuntimeHandle,
-          /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
-          AMDGPUAS::GLOBAL_ADDRESS,
-          /*isExternallyInitialized=*/true);
-      LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
-
-      F.replaceAllUsesWith(ConstantExpr::getAddrSpaceCast(GV, F.getType()));
-      F.addFnAttr("runtime-handle", RuntimeHandle);
-      F.setLinkage(GlobalValue::ExternalLinkage);
-      Changed = true;
-    }
-  }
-
-  return Changed;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 1050855176c04..d072c23b6aa85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -17,10 +17,10 @@
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
 MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
+MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass())
 MODULE_PASS("amdgpu-lower-buffer-fat-pointers", AMDGPULowerBufferFatPointersPass(*this))
 MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
-MODULE_PASS("amdgpu-lower-enqueued-block", AMDGPUOpenCLEnqueuedBlockLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
 MODULE_PASS("amdgpu-perf-hint",
             AMDGPUPerfHintAnalysisPass(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce3dcd920bce3..41658a60fce4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -19,10 +19,10 @@ #include "AMDGPUAliasAnalysis.h" #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" +#include "AMDGPUExportKernelRuntimeHandles.h" #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" #include "AMDGPUMacroFusion.h" -#include "AMDGPUOpenCLEnqueuedBlockLowering.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUSplitModule.h" @@ -517,7 +517,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); - initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(*PR); + initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPURegBankCombinerPass(*PR); @@ -1225,8 +1225,8 @@ void AMDGPUPassConfig::addIRPasses() { if (Arch == Triple::r600) addPass(createR600OpenCLImageTypeLoweringPass()); - // Replace OpenCL enqueued block function pointers with global variables. - addPass(createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass()); + // Make enqueued block runtime handles externally visible. + addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) @@ -1972,7 +1972,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUAlwaysInlinePass()); addPass(AlwaysInlinerPass()); - addPass(AMDGPUOpenCLEnqueuedBlockLoweringPass()); + addPass(AMDGPUExportKernelRuntimeHandlesPass()); if (EnableSwLowerLDS) addPass(AMDGPUSwLowerLDSPass(TM)); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 408da0536237e..09a3096602fc3 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -55,6 +55,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp AMDGPUExportClustering.cpp + AMDGPUExportKernelRuntimeHandles.cpp AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp AMDGPUGlobalISelUtils.cpp @@ -84,7 +85,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp - AMDGPUOpenCLEnqueuedBlockLowering.cpp AMDGPUPerfHintAnalysis.cpp AMDGPUPostLegalizerCombiner.cpp AMDGPUPreLegalizerCombiner.cpp diff --git a/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll b/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll new file mode 100644 index 0000000000000..41521c1f2025d --- /dev/null +++ b/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll @@ -0,0 +1,138 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +%struct.ndrange_t = type { i32 } +%opencl.queue_t = type opaque + +; CHECK: %block.runtime.handle.t = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.0 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.1 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.2 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.3 = type { ptr, i32, i32 } +; CHECK: %block.runtime.handle.t.4 = type { ptr, i32, i32 } + + +; CHECK: @kernel_address_user = global [1 x ptr] [ptr @block_has_used_kernel_address] +; CHECK: @__test_block_invoke_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: 
@__test_block_invoke_2_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.0 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @block_has_used_kernel_address.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.1 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.2 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @.runtime.handle.1 = internal externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @kernel_linkonce_odr_block.runtime.handle = linkonce_odr externally_initialized constant %block.runtime.handle.t.4 zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @llvm.used = appending global [12 x ptr] [ptr @__test_block_invoke_kernel, ptr @__test_block_invoke_kernel.runtime.handle, ptr @__test_block_invoke_2_kernel, ptr @__test_block_invoke_2_kernel.runtime.handle, ptr @block_has_used_kernel_address, ptr @block_has_used_kernel_address.runtime.handle, ptr @0, ptr @.runtime.handle, ptr @1, ptr @.runtime.handle.1, ptr @kernel_linkonce_odr_block, ptr @kernel_linkonce_odr_block.runtime.handle], section "llvm.metadata" + + +define amdgpu_kernel void @non_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) { + ret void +} + +define amdgpu_kernel void @caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) { +entry: + %block = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5) + %inst = alloca %struct.ndrange_t, align 4, addrspace(5) + %block2 = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) + %inst3 = alloca %struct.ndrange_t, align 4, addrspace(5) + %block.size = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 0 + store i32 25, ptr addrspace(5) %block.size, align 8 + %block.align = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 1 + store i32 8, ptr addrspace(5) %block.align, align 4 + %block.captured = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 2 + store ptr addrspace(1) %a, ptr addrspace(5) %block.captured, align 8 + %block.captured1 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 3 + store i8 %b, ptr addrspace(5) %block.captured1, align 8 + %inst4 = addrspacecast ptr addrspace(5) %block to ptr + %inst5 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst, + ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2 + %inst10 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst, + ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2 + %inst11 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst, + ptr @0, ptr nonnull %inst4) #2 + %inst12 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst, + ptr @1, ptr nonnull %inst4) #2 + %block.size4 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 0 + store i32 41, ptr addrspace(5) %block.size4, align 8 + %block.align5 = getelementptr inbounds <{ i32, 
i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 1 + store i32 8, ptr addrspace(5) %block.align5, align 4 + %block.captured7 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 2 + store ptr addrspace(1) %a, ptr addrspace(5) %block.captured7, align 8 + %block.captured8 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 5 + store i8 %b, ptr addrspace(5) %block.captured8, align 8 + %block.captured9 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 3 + store ptr addrspace(1) %c, ptr addrspace(5) %block.captured9, align 8 + %block.captured10 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 4 + store i64 %d, ptr addrspace(5) %block.captured10, align 8 + %inst8 = addrspacecast ptr addrspace(5) %block2 to ptr + %inst9 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst3, + ptr @__test_block_invoke_2_kernel, ptr nonnull %inst8) #2 + ret void +} + +; __enqueue_kernel* functions may get inlined +define amdgpu_kernel void @inlined_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) { +entry: + %inst = load i64, ptr addrspace(1) addrspacecast (ptr @__test_block_invoke_kernel to ptr addrspace(1)) + store i64 %inst, ptr addrspace(1) %c + ret void +} + +; CHECK: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !0 { +define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2 + %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3 + store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1 + ret void +} + +declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr + +; CHECK: define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) !associated !1 { +define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) #0 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 2 + %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 3 + %.fca.5.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 4 + %.fca.6.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 5 + store i8 %.fca.6.extract, ptr addrspace(1) %.fca.3.extract, align 1 + store i64 %.fca.5.extract, ptr addrspace(1) %.fca.4.extract, align 8 + ret void +} + +@kernel_address_user = global [1 x ptr] [ ptr @block_has_used_kernel_address ] + +; CHECK: define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !2 { +define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2 + %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3 + store i8 
%.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1 + ret void +} + +define amdgpu_kernel void @user_of_kernel_address(ptr addrspace(1) %arg) { + store ptr @block_has_used_kernel_address, ptr addrspace(1) %arg + ret void +} + +; CHECK: define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !3 { +define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 { + ret void +} + +; CHECK: define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !4 { +define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 { + ret void +} + +; CHECK: define linkonce_odr amdgpu_kernel void @kernel_linkonce_odr_block() !associated !5 { +define linkonce_odr amdgpu_kernel void @kernel_linkonce_odr_block() #0 { + ret void +} + +attributes #0 = { "enqueued-block" } + +; CHECK: !0 = !{ptr @__test_block_invoke_kernel.runtime.handle} +; CHECK: !1 = !{ptr @__test_block_invoke_2_kernel.runtime.handle} +; CHECK: !2 = !{ptr @block_has_used_kernel_address.runtime.handle} +; CHECK: !3 = !{ptr @.runtime.handle} +; CHECK: !4 = !{ptr @.runtime.handle.1} +; CHECK: !5 = !{ptr @kernel_linkonce_odr_block.runtime.handle} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll new file mode 100644 index 0000000000000..58b7441d85955 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-export-kernel-runtime-handles < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-export-kernel-runtime-handles < %s | FileCheck %s + +%block.runtime.handle.t = type { ptr addrspace(1), i32, i32 } + +; associated globals without the correct section should be ignored. +@block.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" +@not.a.block.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer + +;. +; CHECK: @block.handle = addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle" +; CHECK: @not.a.block.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer +;. 
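+
+; Only kernels whose !associated operand resolves to a handle in the runtime
+; handle section may be exported and given protected visibility; the kernels
+; below exercise that positive case plus the ones that must stay internal.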
+define internal amdgpu_kernel void @block_kernel() !associated !0 { +; CHECK-LABEL: define protected amdgpu_kernel void @block_kernel( +; CHECK-SAME: ) !associated [[META0:![0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define internal dso_local amdgpu_kernel void @dso_local_block_kernel() !associated !0 { +; CHECK-LABEL: define protected amdgpu_kernel void @dso_local_block_kernel( +; CHECK-SAME: ) !associated [[META0]] { +; CHECK-NEXT: ret void +; + ret void +} + +define internal amdgpu_kernel void @not_block_kernel() !associated !1 { +; CHECK-LABEL: define internal amdgpu_kernel void @not_block_kernel( +; CHECK-SAME: ) !associated [[META1:![0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define internal amdgpu_kernel void @associated_null() !associated !2 { +; CHECK-LABEL: define internal amdgpu_kernel void @associated_null( +; CHECK-SAME: ) !associated [[META2:![0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define internal amdgpu_kernel void @no_metadata() { +; CHECK-LABEL: define internal amdgpu_kernel void @no_metadata() { +; CHECK-NEXT: ret void +; + ret void +} + +!0 = !{ptr addrspace(1) @block.handle } +!1 = !{ptr addrspace(1) @not.a.block.handle } +!2 = !{ptr addrspace(1) null } + +;. +; CHECK: [[META0]] = !{ptr addrspace(1) @block.handle} +; CHECK: [[META1]] = !{ptr addrspace(1) @not.a.block.handle} +; CHECK: [[META2]] = !{ptr addrspace(1) null} +;. diff --git a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll deleted file mode 100644 index d7c8e47f98883..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ /dev/null @@ -1,215 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs -; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s -; RUN: opt -data-layout=A5 -mtriple=amdgcn -passes=amdgpu-lower-enqueued-block -S < %s | FileCheck %s - -%struct.ndrange_t = type { i32 } -%opencl.queue_t = type opaque - -define amdgpu_kernel void @non_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) { - ret void -} - -define amdgpu_kernel void @caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) { -entry: - %block = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5) - %inst = alloca %struct.ndrange_t, align 4, addrspace(5) - %block2 = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) - %inst3 = alloca %struct.ndrange_t, align 4, addrspace(5) - %block.size = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 0 - store i32 25, ptr addrspace(5) %block.size, align 8 - %block.align = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 1 - store i32 8, ptr addrspace(5) %block.align, align 4 - %block.captured = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 2 - store ptr addrspace(1) %a, ptr addrspace(5) %block.captured, align 8 - %block.captured1 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 3 - store i8 %b, ptr addrspace(5) %block.captured1, align 8 - %inst4 = addrspacecast ptr addrspace(5) %block to ptr - %inst5 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst, - ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2 - %inst10 = call i32 
@__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst, - ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2 - %inst11 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst, - ptr @0, ptr nonnull %inst4) #2 - %inst12 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst, - ptr @1, ptr nonnull %inst4) #2 - %block.size4 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 0 - store i32 41, ptr addrspace(5) %block.size4, align 8 - %block.align5 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 1 - store i32 8, ptr addrspace(5) %block.align5, align 4 - %block.captured7 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 2 - store ptr addrspace(1) %a, ptr addrspace(5) %block.captured7, align 8 - %block.captured8 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 5 - store i8 %b, ptr addrspace(5) %block.captured8, align 8 - %block.captured9 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 3 - store ptr addrspace(1) %c, ptr addrspace(5) %block.captured9, align 8 - %block.captured10 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 4 - store i64 %d, ptr addrspace(5) %block.captured10, align 8 - %inst8 = addrspacecast ptr addrspace(5) %block2 to ptr - %inst9 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst3, - ptr @__test_block_invoke_2_kernel, ptr nonnull %inst8) #2 - ret void -} - -; __enqueue_kernel* functions may get inlined -define amdgpu_kernel void @inlined_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) { -entry: - %inst = load i64, ptr addrspace(1) addrspacecast (ptr @__test_block_invoke_kernel to ptr addrspace(1)) - store i64 %inst, ptr addrspace(1) %c - ret void -} - -define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 { -entry: - %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2 - %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3 - store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1 - ret void -} - -declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr - -define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) #0 { -entry: - %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 2 - %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 3 - %.fca.5.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 4 - %.fca.6.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 5 - store i8 %.fca.6.extract, ptr addrspace(1) %.fca.3.extract, align 1 - store i64 %.fca.5.extract, ptr addrspace(1) %.fca.4.extract, align 8 - ret void -} - -@kernel_address_user = global [1 x ptr] [ ptr 
@block_has_used_kernel_address ] - -define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 { -entry: - %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2 - %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3 - store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1 - ret void -} - -define amdgpu_kernel void @user_of_kernel_address(ptr addrspace(1) %arg) { - store ptr @block_has_used_kernel_address, ptr addrspace(1) %arg - ret void -} - -define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 { - ret void -} - -define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 { - ret void -} - -attributes #0 = { "enqueued-block" } -;. -; CHECK: @[[KERNEL_ADDRESS_USER:[a-zA-Z0-9_$"\\.-]+]] = global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @block_has_used_kernel_address.runtime_handle to ptr)] -; CHECK: @[[__TEST_BLOCK_INVOKE_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer -; CHECK: @[[__TEST_BLOCK_INVOKE_2_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer -; CHECK: @[[BLOCK_HAS_USED_KERNEL_ADDRESS_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer -; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer -; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_1_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer -;. 
-; CHECK-LABEL: define {{[^@]+}}@non_caller -; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@caller -; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BLOCK:%.*]] = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5) -; CHECK-NEXT: [[INST:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) -; CHECK-NEXT: [[BLOCK2:%.*]] = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) -; CHECK-NEXT: [[INST3:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) -; CHECK-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0 -; CHECK-NEXT: store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8 -; CHECK-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1 -; CHECK-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4 -; CHECK-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2 -; CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8 -; CHECK-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3 -; CHECK-NEXT: store i8 [[B]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8 -; CHECK-NEXT: [[INST4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr -; CHECK-NEXT: [[INST5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle to ptr), ptr nonnull [[INST4]]) -; CHECK-NEXT: [[INST10:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle to ptr), ptr nonnull [[INST4]]) -; CHECK-NEXT: [[INST11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__amdgpu_enqueued_kernel.runtime_handle to ptr), ptr nonnull [[INST4]]) -; CHECK-NEXT: [[INST12:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__amdgpu_enqueued_kernel.1.runtime_handle to ptr), ptr nonnull [[INST4]]) -; CHECK-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 0 -; CHECK-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8 -; CHECK-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 1 -; CHECK-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4 -; CHECK-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 2 -; CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8 -; CHECK-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, 
i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 5 -; CHECK-NEXT: store i8 [[B]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8 -; CHECK-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 3 -; CHECK-NEXT: store ptr addrspace(1) [[C]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8 -; CHECK-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 4 -; CHECK-NEXT: store i64 [[D]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8 -; CHECK-NEXT: [[INST8:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK2]] to ptr -; CHECK-NEXT: [[INST9:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST3]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime_handle to ptr), ptr nonnull [[INST8]]) -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@inlined_caller -; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INST:%.*]] = load i64, ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle, align 4 -; CHECK-NEXT: store i64 [[INST]], ptr addrspace(1) [[C]], align 4 -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 2 -; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 3 -; CHECK-NEXT: store i8 [[DOTFCA_4_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1 -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 2 -; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 3 -; CHECK-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 4 -; CHECK-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 5 -; CHECK-NEXT: store i8 [[DOTFCA_6_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1 -; CHECK-NEXT: store i64 [[DOTFCA_5_EXTRACT]], ptr addrspace(1) [[DOTFCA_4_EXTRACT]], align 8 -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@block_has_used_kernel_address -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 2 -; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 3 -; CHECK-NEXT: store i8 [[DOTFCA_4_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1 -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@user_of_kernel_address -; CHECK-SAME: (ptr addrspace(1) [[ARG:%.*]]) { -; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(1) 
@block_has_used_kernel_address.runtime_handle to ptr), ptr addrspace(1) [[ARG]], align 8 -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@__amdgpu_enqueued_kernel -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR3:[0-9]+]] { -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@__amdgpu_enqueued_kernel.1 -; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { -; CHECK-NEXT: ret void -; -;. -; CHECK: attributes #[[ATTR0]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_kernel.runtime_handle" } -; CHECK: attributes #[[ATTR1]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_2_kernel.runtime_handle" } -; CHECK: attributes #[[ATTR2]] = { "enqueued-block" "runtime-handle"="block_has_used_kernel_address.runtime_handle" } -; CHECK: attributes #[[ATTR3]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.runtime_handle" } -; CHECK: attributes #[[ATTR4]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.1.runtime_handle" } -;. diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll index 28246d7f9e6fb..f0c3a493d05f1 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -14,7 +14,8 @@ %struct.B = type { ptr addrspace(1) } %opencl.clk_event_t = type opaque -@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant ptr addrspace(1) +@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant ptr addrspace(1), section ".amdgpu.kernel.runtime.handle" +@not.a.handle = external addrspace(1) externally_initialized constant ptr addrspace(1) ; CHECK: --- ; CHECK-NEXT: amdhsa.kernels: @@ -1678,7 +1679,7 @@ define amdgpu_kernel void @test_pointee_align_attribute(ptr addrspace(1) align 1 ; CHECK: .name: __test_block_invoke_kernel ; CHECK: .symbol: __test_block_invoke_kernel.kd define amdgpu_kernel void @__test_block_invoke_kernel( - <{ i32, i32, ptr, ptr addrspace(1), i8 }> %arg) #1 + <{ i32, i32, ptr, ptr addrspace(1), i8 }> %arg) #1 !associated !112 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110 !kernel_arg_base_type !110 !kernel_arg_type_qual !4 { ret void @@ -1734,6 +1735,29 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr) ret void } +; Make sure the device_enqueue_symbol is not reported +; CHECK: - .args: [] +; CHECK-NEXT: .group_segment_fixed_size: 0 +; CHECK-NEXT: .kernarg_segment_align: 4 +; CHECK-NEXT: .kernarg_segment_size: 0 +; CHECK-NEXT: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK-NEXT: .max_flat_workgroup_size: 1024 +; CHECK-NEXT: .name: associated_global_not_handle +; CHECK-NEXT: .private_segment_fixed_size: 0 +; CHECK-NEXT: .sgpr_count: +; CHECK-NEXT: .sgpr_spill_count: 0 +; CHECK-NEXT: .symbol: associated_global_not_handle.kd +; CHECK-NEXT: .vgpr_count: +; CHECK-NEXT: .vgpr_spill_count: 0 +; CHECK-NEXT: .wavefront_size: 64 +; CHECK-NOT: device_enqueue_symbol +define amdgpu_kernel void @associated_global_not_handle() #3 !associated !113 { + ret void +} + ; CHECK: amdhsa.printf: ; CHECK-NEXT: - '1:1:4:%d\n' ; CHECK-NEXT: - '2:1:8:%g\n' @@ -1744,6 +1768,7 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr) attributes #0 = { optnone noinline "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" } attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } +attributes #3 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} @@ -1803,5 +1828,7 @@ attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } !101 = !{!"2:1:8:%g\5Cn"} !110 = !{!"__block_literal"} !111 = !{!"char", !"char"} +!112 = !{ptr addrspace(1) @__test_block_invoke_kernel_runtime_handle } +!113 = !{ptr addrspace(1) @not.a.handle } ; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index d7f54f3b8e9e2..9aca7a5fc741f 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -38,7 +38,7 @@ ; GCN-O0-NEXT: Dominator Tree Construction ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results -; GCN-O0-NEXT: Lower OpenCL enqueued blocks +; GCN-O0-NEXT: Externalize enqueued block runtime handles ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -188,7 +188,7 @@ ; GCN-O1-NEXT: Dominator Tree Construction ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results -; GCN-O1-NEXT: Lower OpenCL enqueued blocks +; GCN-O1-NEXT: Externalize enqueued block runtime handles ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -473,7 +473,7 @@ ; GCN-O1-OPTS-NEXT: Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results -; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks +; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -788,7 +788,7 @@ ; GCN-O2-NEXT: Dominator Tree Construction ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Lower OpenCL enqueued blocks +; GCN-O2-NEXT: Externalize enqueued block runtime handles ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1107,7 +1107,7 @@ ; GCN-O3-NEXT: Dominator Tree Construction ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Lower OpenCL enqueued blocks +; GCN-O3-NEXT: Externalize enqueued block runtime handles ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; 
GCN-O3-NEXT: FunctionPass Manager
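For completeness, the object the runtime initializes behind each exported handle corresponds to the %block.runtime.handle.t type used throughout this patch. A host-side C++ mirror of that layout, with field meanings taken from the rtinfo struct description; this is illustrative only, not an actual ROCm header:

#include <cstdint>

// Host-side view of %block.runtime.handle.t = { ptr addrspace(1), i32, i32 }.
// Illustrative only; field names follow the rtinfo description.
struct BlockRuntimeHandle {
  uint64_t kernel_object;        // device address of the kernel descriptor
  uint32_t private_segment_size; // scratch size the dispatch packet needs
  uint32_t group_segment_size;   // LDS size the dispatch packet needs
};
static_assert(sizeof(BlockRuntimeHandle) == 16,
              "must match the IR struct layout on amdgcn (64-bit globals)");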