@@ -2621,25 +2621,17 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
                                       const amd::Kernel& kernel, const_address parameters,
                                       bool nativeMem, uint32_t sharedMemBytes,
                                       bool anyOrder) {
-  size_t newOffset[3] = {0, 0, 0};
-  size_t newGlobalSize[3] = {0, 0, 0};
   state_.anyOrder_ = anyOrder;
 
-  int dim = -1;
-  int iteration = 1;
-  size_t globalStep = 0;
-  for (uint i = 0; i < sizes.dimensions(); i++) {
-    newGlobalSize[i] = sizes.global()[i];
-    newOffset[i] = sizes.offset()[i];
-  }
-
   // Get the HSA kernel object
   const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
 
   // If RGP capturing is enabled, then start SQTT trace
   if (rgpCaptureEna()) {
     size_t newLocalSize[3] = {1, 1, 1};
+    size_t newGlobalSize[3] = {0, 0, 0};
     for (uint i = 0; i < sizes.dimensions(); i++) {
+      newGlobalSize[i] = sizes.global()[i];
       if (sizes.local()[i] != 0) {
         newLocalSize[i] = sizes.local()[i];
       }
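In the hunk above, initialization of newGlobalSize moves inside the RGP capture branch, and any dimension whose work-group size the application left unspecified (zero) is recorded as 1 for the SQTT trace. A minimal standalone sketch of that normalization, using a hypothetical helper name (normalizeNDRange) and up to three dimensions:

    #include <cstddef>

    // Copy the global sizes and default zero (unspecified) local sizes to 1 so
    // the recorded dispatch dimensions stay well-formed; unused trailing
    // dimensions keep their defaults.
    void normalizeNDRange(size_t dims, const size_t* global, const size_t* local,
                          size_t newGlobal[3], size_t newLocal[3]) {
      for (size_t i = 0; i < 3; ++i) {
        newGlobal[i] = 0;
        newLocal[i] = 1;
      }
      for (size_t i = 0; i < dims; ++i) {
        newGlobal[i] = global[i];
        if (local[i] != 0) {
          newLocal[i] = local[i];
        }
      }
    }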
@@ -2671,13 +2663,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
 
   if (PAL_EMBED_KERNEL_MD) {
     char buf[256];
-    sprintf(buf,
-            "kernel: %s\n"
-            "private mem size: %x\n"
-            "group mem size: %x\n",
-            hsaKernel.name().c_str(),
-            hsaKernel.spillSegSize(),
-            hsaKernel.ldsSize());
+    sprintf(buf, "kernel: %s\nprivate mem size: %x\ngroup mem size: %x\n",
+            hsaKernel.name().c_str(), hsaKernel.spillSegSize(), hsaKernel.ldsSize());
     iCmd()->CmdCommentString(buf);
   }
 
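A side note on the hunk above: the reflowed sprintf still formats into a fixed 256-byte stack buffer with no bounds check, so an unusually long kernel name could overrun it. A minimal sketch of the same comment string built with snprintf, which truncates instead (identifiers taken from the diff; this is a hedged suggestion, not what the commit changes):

    char buf[256];
    // snprintf bounds the write to sizeof(buf); overly long kernel names are
    // truncated rather than overflowing the stack buffer.
    snprintf(buf, sizeof(buf), "kernel: %s\nprivate mem size: %x\ngroup mem size: %x\n",
             hsaKernel.name().c_str(), hsaKernel.spillSegSize(), hsaKernel.ldsSize());
    iCmd()->CmdCommentString(buf);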
@@ -2694,128 +2681,77 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
   // Add ISA memory object to the resource tracking list
   AddKernel(kernel);
 
-  // Check if it is blit kernel. If it is, then check if split is needed.
-  if (hsaKernel.isInternalKernel()) {
-    // Calculate new group size for each submission
-    for (uint i = 0; i < sizes.dimensions(); i++) {
-      if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
-        dim = i;
-        iteration = sizes.global()[i] / 0xC0000000 + ((sizes.global()[i] % 0xC0000000) ? 1 : 0);
-        globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration * sizes.local()[dim];
-        break;
-      }
-    }
-  }
-
-  for (int iter = 0; iter < iteration; ++iter) {
-    GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
-    uint32_t id = gpuEvent.id_;
-    // Reset global size for dimension dim if split is needed
-    if (dim != -1) {
-      newOffset[dim] = sizes.offset()[dim] + globalStep * iter;
-      if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && (iter != (iteration - 1))) {
-        newGlobalSize[dim] = globalStep;
-      } else {
-        newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
-      }
-    }
-
-    amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0],
-                                   &(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));
-
-    if (iter > 0) {
-      // Updates the timestamp values, since a CB flush could occur.
-      // Resource processing was moved from loadArguments() and
-      // an extra loop is required.
-      const amd::KernelParameters& kernelParams = kernel.parameters();
-      amd::Memory* const* memories =
-          reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-      for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
-        if (nativeMem) {
-          Memory* gpuMem = reinterpret_cast<Memory* const*>(memories)[i];
-          if (gpuMem != nullptr) {
-            gpuMem->setBusy(*this, gpuEvent);
-          }
-        } else {
-          amd::Memory* mem = memories[i];
-          if (mem != nullptr) {
-            dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
-          }
-        }
-      }
-    }
-
-    uint64_t vmParentWrap = 0;
-    uint32_t aql_index = 0;
-    // Program the kernel arguments for the GPU execution
-    hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
-        *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index);
-    if (nullptr == aqlPkt) {
-      LogError("Couldn't load kernel arguments");
+  GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
+  uint32_t id = gpuEvent.id_;
+  uint64_t vmParentWrap = 0;
+  uint32_t aql_index = 0;
+  // Program the kernel arguments for the GPU execution
+  hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
+      *this, kernel, sizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index);
+  assert((nullptr != aqlPkt) && "Couldn't load kernel arguments");
+
+  // The dynamic call stack size is factored into the private segment size and scratch regs
+  // in LightningKernel::postLoad(). Since postLoad() is not called during hipModuleLaunchKernel,
+  // unlike hipLaunchKernel/hipLaunchKernelGGL, the updated value is passed to the dispatch packet.
+  size_t privateMemSize = hsaKernel.spillSegSize();
+  if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
+    privateMemSize = std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
+                                        hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t));
+    // Validate that privateMemSize does not exceed the max allowed.
+    size_t maxStackSize = device().MaxStackSize();
+    if (privateMemSize > maxStackSize) {
+      ClPrint(amd::LOG_INFO, amd::LOG_KERN,
+              "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s",
+              privateMemSize, maxStackSize, hsaKernel.name().c_str());
+      LogError("Scratch size exceeds max allowed.");
       return false;
     }
-    // Dynamic call stack size is considered to calculate private segment size and scratch regs
-    // in LightningKernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike
-    // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet.
-    size_t privateMemSize = hsaKernel.spillSegSize();
-    if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
-      privateMemSize = std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
-                                          hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t));
-      // Validate privateMemSize is more than max allowed.
-      size_t maxStackSize = device().MaxStackSize();
-      if (privateMemSize > maxStackSize) {
-        ClPrint(amd::LOG_INFO, amd::LOG_KERN,
-                "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s",
-                privateMemSize, maxStackSize, hsaKernel.name().c_str());
-        LogError("Scratch size exceeds max allowed.");
-        return false;
-      }
-    }
+  }
 
-    // Set up the dispatch information
-    Pal::DispatchAqlParams dispatchParam = {};
-    dispatchParam.pAqlPacket = aqlPkt;
-    if (privateMemSize > 0) {
-      const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
-      dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
-      dispatchParam.scratchSize = scratch->size_;
-      dispatchParam.scratchOffset = scratch->offset_;
-      dispatchParam.workitemPrivateSegmentSize = privateMemSize;
-    }
-    dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
-    dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
-    if (!hsaKernel.prog().isLC() && hsaKernel.workGroupInfo()->wavesPerSimdHint_ != 0) {
-      constexpr uint32_t kWavesPerSimdLimit = 4;
-      dispatchParam.wavesPerSh = kWavesPerSimdLimit *
-          dev().info().cuPerShaderArray_ * dev().info().simdPerCU_;
-    } else {
-      dispatchParam.wavesPerSh = 0;
-    }
-    dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
-    dispatchParam.aqlPacketIndex = aql_index;
-    // Run AQL dispatch in HW
-    eventBegin(MainEngine);
-    iCmd()->CmdDispatchAql(dispatchParam);
+  // Set up the dispatch information
+  Pal::DispatchAqlParams dispatchParam = {};
+  dispatchParam.pAqlPacket = aqlPkt;
+  if (privateMemSize > 0) {
+    const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
+    dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
+    dispatchParam.scratchSize = scratch->size_;
+    dispatchParam.scratchOffset = scratch->offset_;
+    dispatchParam.workitemPrivateSegmentSize = privateMemSize;
+  }
+  dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
+  dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
+  if (!hsaKernel.prog().isLC() && hsaKernel.workGroupInfo()->wavesPerSimdHint_ != 0) {
+    constexpr uint32_t kWavesPerSimdLimit = 4;
+    dispatchParam.wavesPerSh = kWavesPerSimdLimit *
+        dev().info().cuPerShaderArray_ * dev().info().simdPerCU_;
+  } else {
+    dispatchParam.wavesPerSh = 0;
+  }
+  dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
+  dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
+  dispatchParam.aqlPacketIndex = aql_index;
+  // Run AQL dispatch in HW
+  eventBegin(MainEngine);
+  iCmd()->CmdDispatchAql(dispatchParam);
 
-    if (id != gpuEvent.id_) {
-      LogError("Something is wrong. ID mismatch!\n");
-    }
-    eventEnd(MainEngine, gpuEvent);
-    AqlPacketUpdateTs(aql_index, gpuEvent);
+  if (id != gpuEvent.id_) {
+    LogError("Something is wrong. ID mismatch!\n");
+  }
+  eventEnd(MainEngine, gpuEvent);
+  AqlPacketUpdateTs(aql_index, gpuEvent);
 
-    // Execute scheduler for device enqueue
-    if (hsaKernel.dynamicParallelism()) {
-      PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
-    }
+  // Execute scheduler for device enqueue
+  if (hsaKernel.dynamicParallelism()) {
+    PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
+  }
 
-    // Update the global GPU event
-    constexpr bool kNeedFLush = false;
-    setGpuEvent(gpuEvent, kNeedFLush);
+  // Update the global GPU event
+  constexpr bool kNeedFLush = false;
+  setGpuEvent(gpuEvent, kNeedFLush);
 
-    if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
-      LogError("Couldn't read printf data from the buffer!\n");
-      return false;
-    }
+  if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
+    LogError("Couldn't read printf data from the buffer!\n");
+    return false;
   }
 
   // Check if image buffer write back is required
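For reference, the split path deleted in the last hunk chunked any grid dimension larger than 0xffffffff work-items into submissions of at most 0xC0000000 items, aligned to the work-group size, with the final chunk absorbing the remainder. A self-contained sketch of that arithmetic (variable names mirror the removed code; the launch values are illustrative and assume a 64-bit size_t):

    #include <cstddef>
    #include <cstdio>

    int main() {
      size_t global = 0x180000000;  // 6G work-items in one dimension (> 0xffffffff)
      size_t local = 256;           // work-group size in that dimension
      // Number of submissions: ceil(global / 0xC0000000)
      size_t iteration = global / 0xC0000000 + ((global % 0xC0000000) ? 1 : 0);
      // Per-submission step, rounded down to a multiple of the work-group size
      size_t globalStep = (global / local) / iteration * local;
      for (size_t iter = 0; iter < iteration; ++iter) {
        size_t offset = globalStep * iter;
        size_t size = ((offset + globalStep) < global && iter != (iteration - 1))
            ? globalStep
            : global - offset;  // last chunk takes the remainder
        printf("iter %zu: offset 0x%zx size 0x%zx\n", iter, offset, size);
      }
      return 0;
    }

Separately, in the non-LC branch of the new dispatch setup, the wavesPerSh cap works out to kWavesPerSimdLimit * cuPerShaderArray_ * simdPerCU_; with, say, 10 CUs per shader array and 4 SIMDs per CU (illustrative device values), that is 4 * 10 * 4 = 160 waves per shader array.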