Commit da198ac

SWDEV-531678 - Remove split path from the dispatch (#283)
The split path for blit kernels is no longer necessary, since the new blit kernels don't use the copy size as the global workload.
1 parent acb1f7e commit da198ac
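For reference, the split path being removed here broke an internal (blit) dispatch into several sub-dispatches whenever one global dimension exceeded 32 bits. Below is a minimal, standalone C++ sketch of that arithmetic; kChunk, globalSize, and localSize are made-up example values, not part of the runtime, and the loop body is a simplified stand-in for the removed offset/size bookkeeping.

#include <cstdint>
#include <iostream>

int main() {
  // Illustration only: the arithmetic of the removed split path, with made-up
  // sizes. When one dimension of an internal (blit) dispatch exceeded 32 bits,
  // the old code broke it into ~3 GiB (0xC0000000) chunks and launched each
  // chunk as a separate sub-dispatch.
  constexpr uint64_t kChunk = 0xC0000000ULL;  // max global size per sub-dispatch
  uint64_t globalSize = 0x180000000ULL;       // example: 6 Gi work-items in one dimension
  uint64_t localSize = 256;                   // example work-group size

  // Same expressions the removed code used for the iteration count and the
  // work-group-aligned step per iteration.
  uint64_t iterations = globalSize / kChunk + ((globalSize % kChunk) ? 1 : 0);
  uint64_t globalStep = (globalSize / localSize) / iterations * localSize;

  // Simplified per-iteration offset/size; the removed code also folded in the
  // original dispatch offset and handled the last chunk explicitly.
  for (uint64_t iter = 0; iter < iterations; ++iter) {
    uint64_t offset = globalStep * iter;
    uint64_t size = (iter + 1 < iterations) ? globalStep : globalSize - offset;
    std::cout << "sub-dispatch " << iter << ": offset=" << offset
              << " size=" << size << "\n";
  }
  return 0;
}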

File tree

2 files changed: +332 −452 lines changed

rocclr/device/pal/palvirtual.cpp

Lines changed: 68 additions & 132 deletions
@@ -2621,25 +2621,17 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
                                        const amd::Kernel& kernel, const_address parameters,
                                        bool nativeMem, uint32_t sharedMemBytes,
                                        bool anyOrder) {
-  size_t newOffset[3] = {0, 0, 0};
-  size_t newGlobalSize[3] = {0, 0, 0};
   state_.anyOrder_ = anyOrder;
 
-  int dim = -1;
-  int iteration = 1;
-  size_t globalStep = 0;
-  for (uint i = 0; i < sizes.dimensions(); i++) {
-    newGlobalSize[i] = sizes.global()[i];
-    newOffset[i] = sizes.offset()[i];
-  }
-
   // Get the HSA kernel object
   const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
 
   // If RGP capturing is enabled, then start SQTT trace
   if (rgpCaptureEna()) {
     size_t newLocalSize[3] = {1, 1, 1};
+    size_t newGlobalSize[3] = {0, 0, 0};
     for (uint i = 0; i < sizes.dimensions(); i++) {
+      newGlobalSize[i] = sizes.global()[i];
       if (sizes.local()[i] != 0) {
         newLocalSize[i] = sizes.local()[i];
       }
@@ -2671,13 +2663,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
 
   if (PAL_EMBED_KERNEL_MD) {
     char buf[256];
-    sprintf(buf,
-            "kernel: %s\n"
-            "private mem size: %x\n"
-            "group mem size: %x\n",
-            hsaKernel.name().c_str(),
-            hsaKernel.spillSegSize(),
-            hsaKernel.ldsSize());
+    sprintf(buf, "kernel: %s\n private mem size: %x\n group mem size: %x\n",
+            hsaKernel.name().c_str(), hsaKernel.spillSegSize(), hsaKernel.ldsSize());
     iCmd()->CmdCommentString(buf);
   }
 
@@ -2694,128 +2681,77 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
   // Add ISA memory object to the resource tracking list
   AddKernel(kernel);
 
-  // Check if it is blit kernel. If it is, then check if split is needed.
-  if (hsaKernel.isInternalKernel()) {
-    // Calculate new group size for each submission
-    for (uint i = 0; i < sizes.dimensions(); i++) {
-      if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
-        dim = i;
-        iteration = sizes.global()[i] / 0xC0000000 + ((sizes.global()[i] % 0xC0000000) ? 1 : 0);
-        globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration * sizes.local()[dim];
-        break;
-      }
-    }
-  }
-
-  for (int iter = 0; iter < iteration; ++iter) {
-    GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
-    uint32_t id = gpuEvent.id_;
-    // Reset global size for dimension dim if split is needed
-    if (dim != -1) {
-      newOffset[dim] = sizes.offset()[dim] + globalStep * iter;
-      if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && (iter != (iteration - 1))) {
-        newGlobalSize[dim] = globalStep;
-      } else {
-        newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
-      }
-    }
-
-    amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0],
-                                   &(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));
-
-    if (iter > 0) {
-      // Updates the timestamp values, since a CB flush could occur.
-      // Resource processing was moved from loadArguments() and
-      // an extra loop is required.
-      const amd::KernelParameters& kernelParams = kernel.parameters();
-      amd::Memory* const* memories =
-          reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
-      for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
-        if (nativeMem) {
-          Memory* gpuMem = reinterpret_cast<Memory* const*>(memories)[i];
-          if (gpuMem != nullptr) {
-            gpuMem->setBusy(*this, gpuEvent);
-          }
-        } else {
-          amd::Memory* mem = memories[i];
-          if (mem != nullptr) {
-            dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
-          }
-        }
-      }
-    }
-
-    uint64_t vmParentWrap = 0;
-    uint32_t aql_index = 0;
-    // Program the kernel arguments for the GPU execution
-    hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
-        *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index);
-    if (nullptr == aqlPkt) {
-      LogError("Couldn't load kernel arguments");
+  GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
+  uint32_t id = gpuEvent.id_;
+  uint64_t vmParentWrap = 0;
+  uint32_t aql_index = 0;
+  // Program the kernel arguments for the GPU execution
+  hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
+      *this, kernel, sizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index);
+  assert((nullptr != aqlPkt) && "Couldn't load kernel arguments");
+
+  // Dynamic call stack size is considered to calculate private segment size and scratch regs
+  // in LightningKernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike
+  // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet.
+  size_t privateMemSize = hsaKernel.spillSegSize();
+  if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
+    privateMemSize = std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
+                                        hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t)) ;
+    // Validate privateMemSize is more than max allowed.
+    size_t maxStackSize = device().MaxStackSize();
+    if (privateMemSize > maxStackSize) {
+      ClPrint(amd::LOG_INFO, amd::LOG_KERN,
+              "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s",
+              privateMemSize, maxStackSize, hsaKernel.name().c_str());
+      LogError("Scratch size exceeds max allowed.");
       return false;
     }
-    // Dynamic call stack size is considered to calculate private segment size and scratch regs
-    // in LightningKernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike
-    // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet.
-    size_t privateMemSize = hsaKernel.spillSegSize();
-    if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
-      privateMemSize = std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
-                                          hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t)) ;
-      // Validate privateMemSize is more than max allowed.
-      size_t maxStackSize = device().MaxStackSize();
-      if (privateMemSize > maxStackSize) {
-        ClPrint(amd::LOG_INFO, amd::LOG_KERN,
-                "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s",
-                privateMemSize, maxStackSize, hsaKernel.name().c_str());
-        LogError("Scratch size exceeds max allowed.");
-        return false;
-      }
-    }
+  }
 
-    // Set up the dispatch information
-    Pal::DispatchAqlParams dispatchParam = {};
-    dispatchParam.pAqlPacket = aqlPkt;
-    if (privateMemSize > 0) {
-      const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
-      dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
-      dispatchParam.scratchSize = scratch->size_;
-      dispatchParam.scratchOffset = scratch->offset_;
-      dispatchParam.workitemPrivateSegmentSize = privateMemSize;
-    }
-    dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
-    dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
-    if (!hsaKernel.prog().isLC() && hsaKernel.workGroupInfo()->wavesPerSimdHint_ != 0) {
-      constexpr uint32_t kWavesPerSimdLimit = 4;
-      dispatchParam.wavesPerSh = kWavesPerSimdLimit *
-          dev().info().cuPerShaderArray_ * dev().info().simdPerCU_;
-    } else {
-      dispatchParam.wavesPerSh = 0;
-    }
-    dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
-    dispatchParam.aqlPacketIndex = aql_index;
-    // Run AQL dispatch in HW
-    eventBegin(MainEngine);
-    iCmd()->CmdDispatchAql(dispatchParam);
+  // Set up the dispatch information
+  Pal::DispatchAqlParams dispatchParam = {};
+  dispatchParam.pAqlPacket = aqlPkt;
+  if (privateMemSize > 0) {
+    const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
+    dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
+    dispatchParam.scratchSize = scratch->size_;
+    dispatchParam.scratchOffset = scratch->offset_;
+    dispatchParam.workitemPrivateSegmentSize = privateMemSize;
+  }
+  dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
+  dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
+  if (!hsaKernel.prog().isLC() && hsaKernel.workGroupInfo()->wavesPerSimdHint_ != 0) {
+    constexpr uint32_t kWavesPerSimdLimit = 4;
+    dispatchParam.wavesPerSh = kWavesPerSimdLimit *
+        dev().info().cuPerShaderArray_ * dev().info().simdPerCU_;
+  } else {
+    dispatchParam.wavesPerSh = 0;
+  }
+  dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
+  dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
+  dispatchParam.aqlPacketIndex = aql_index;
+  // Run AQL dispatch in HW
+  eventBegin(MainEngine);
+  iCmd()->CmdDispatchAql(dispatchParam);
 
-    if (id != gpuEvent.id_) {
-      LogError("Something is wrong. ID mismatch!\n");
-    }
-    eventEnd(MainEngine, gpuEvent);
-    AqlPacketUpdateTs(aql_index, gpuEvent);
+  if (id != gpuEvent.id_) {
+    LogError("Something is wrong. ID mismatch!\n");
+  }
+  eventEnd(MainEngine, gpuEvent);
+  AqlPacketUpdateTs(aql_index, gpuEvent);
 
-    // Execute scheduler for device enqueue
-    if (hsaKernel.dynamicParallelism()) {
-      PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
-    }
+  // Execute scheduler for device enqueue
+  if (hsaKernel.dynamicParallelism()) {
+    PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
+  }
 
-    // Update the global GPU event
-    constexpr bool kNeedFLush = false;
-    setGpuEvent(gpuEvent, kNeedFLush);
+  // Update the global GPU event
+  constexpr bool kNeedFLush = false;
+  setGpuEvent(gpuEvent, kNeedFLush);
 
-    if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
-      LogError("Couldn't read printf data from the buffer!\n");
-      return false;
-    }
+  if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
+    LogError("Couldn't read printf data from the buffer!\n");
+    return false;
   }
 
   // Check if image buffer write back is required
