@@ -614,6 +614,20 @@ void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
614614 FT, FT->getExtInfo ().withCallingConv (CC_OpenCLKernel));
615615}
616616
617+ /// Return the IR struct type for the rtinfo struct in rocm-device-libs used for
618+ /// device enqueue. Field order: ptr addrspace(1) kernel_object (its type is
619+ /// supplied by the caller as \p KernelDescriptorPtrTy), i32
620+ /// private_segment_size, i32 group_segment_size. The type is created in \p C.
621+ /// NOTE(review): StructType::create builds a named (identified) struct, so each
622+ /// call presumably yields a distinct type rather than a shared one -- confirm.
623+ static llvm::StructType *
624+ getAMDGPURuntimeHandleType (llvm::LLVMContext &C,
625+ llvm::Type *KernelDescriptorPtrTy) {
626+ llvm::Type *Int32 = llvm::Type::getInt32Ty (C); // i32, used for both segment-size fields
627+ return llvm::StructType::create (C, {KernelDescriptorPtrTy, Int32, Int32},
628+ " block.runtime.handle.t" );
629+ }
630+
617631/// Create an OpenCL kernel for an enqueued block.
618632///
619633/// The type of the first argument (the block literal) is the struct type
@@ -653,23 +667,29 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
653667 ArgNames.push_back (
654668 llvm::MDString::get (C, (Twine (" local_arg" ) + Twine (I)).str ()));
655669 }
656- std::string Name = Invoke->getName ().str () + " _kernel" ;
670+
671+ llvm::Module &Mod = CGF.CGM .getModule ();
672+ const llvm::DataLayout &DL = Mod.getDataLayout ();
673+
674+ llvm::Twine Name = Invoke->getName () + " _kernel" ;
657675 auto *FT = llvm::FunctionType::get (llvm::Type::getVoidTy (C), ArgTys, false );
676+
677+ // The kernel itself can be internal, the runtime does not directly access the
678+ // kernel address (only the kernel descriptor).
658679 auto *F = llvm::Function::Create (FT, llvm::GlobalValue::InternalLinkage, Name,
659- &CGF. CGM . getModule () );
680+ &Mod );
660681 F->setCallingConv (llvm::CallingConv::AMDGPU_KERNEL);
661682
662683 llvm::AttrBuilder KernelAttrs (C);
663684 // FIXME: The invoke isn't applying the right attributes either
664685 // FIXME: This is missing setTargetAttributes
665686 CGF.CGM .addDefaultFunctionDefinitionAttributes (KernelAttrs);
666- KernelAttrs.addAttribute (" enqueued-block" );
667687 F->addFnAttrs (KernelAttrs);
668688
669689 auto IP = CGF.Builder .saveIP ();
670690 auto *BB = llvm::BasicBlock::Create (C, " entry" , F);
671691 Builder.SetInsertPoint (BB);
672- const auto BlockAlign = CGF. CGM . getDataLayout () .getPrefTypeAlign (BlockTy);
692+ const auto BlockAlign = DL .getPrefTypeAlign (BlockTy);
673693 auto *BlockPtr = Builder.CreateAlloca (BlockTy, nullptr );
674694 BlockPtr->setAlignment (BlockAlign);
675695 Builder.CreateAlignedStore (F->arg_begin (), BlockPtr, BlockAlign);
@@ -692,7 +712,39 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
692712 if (CGF.CGM .getCodeGenOpts ().EmitOpenCLArgMetadata )
693713 F->setMetadata (" kernel_arg_name" , llvm::MDNode::get (C, ArgNames));
694714
695- return F;
715+ llvm::StructType *HandleTy = getAMDGPURuntimeHandleType (
716+ C, llvm::PointerType::get (C, DL.getDefaultGlobalsAddressSpace ()));
717+ llvm::Constant *RuntimeHandleInitializer =
718+ llvm::ConstantAggregateZero::get (HandleTy);
719+
720+ llvm::Twine RuntimeHandleName = F->getName () + " .runtime.handle" ;
721+
722+ // The runtime needs access to the runtime handle as an external symbol. The
723+ // runtime handle will need to be made external later, in
724+ // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
725+ // inside the runtime handle, and is not directly referenced.
726+
727+ // TODO: We would initialize the first field by declaring F->getName() + ".kd"
728+ // to reference the kernel descriptor. The runtime wouldn't need to bother
729+ // setting it. We would need to have a final symbol name though.
730+ // TODO: Can we directly use an external symbol with getGlobalIdentifier?
731+ auto *RuntimeHandle = new llvm::GlobalVariable (
732+ Mod, HandleTy,
733+ /* isConstant=*/ true , llvm::GlobalValue::InternalLinkage,
734+ /* Initializer=*/ RuntimeHandleInitializer, RuntimeHandleName,
735+ /* InsertBefore=*/ nullptr , llvm::GlobalValue::NotThreadLocal,
736+ DL.getDefaultGlobalsAddressSpace (),
737+ /* isExternallyInitialized=*/ true );
738+
739+ llvm::MDNode *HandleAsMD =
740+ llvm::MDNode::get (C, llvm::ValueAsMetadata::get (RuntimeHandle));
741+ F->setMetadata (llvm::LLVMContext::MD_associated, HandleAsMD);
742+
743+ RuntimeHandle->setSection (" .amdgpu.kernel.runtime.handle" );
744+
745+ CGF.CGM .addUsedGlobal (F);
746+ CGF.CGM .addUsedGlobal (RuntimeHandle);
747+ return RuntimeHandle;
696748}
697749
698750void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr (
0 commit comments