1010#include " shared/source/built_ins/built_ins.h"
1111#include " shared/source/built_ins/sip.h"
1212#include " shared/source/command_container/command_encoder.h"
13+ #include " shared/source/command_container/implicit_scaling.h"
1314#include " shared/source/command_stream/command_stream_receiver_hw.h"
1415#include " shared/source/command_stream/linear_stream.h"
1516#include " shared/source/command_stream/preemption.h"
@@ -72,6 +73,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
7273 using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
7374 using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
7475
76+ using MI_LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
77+ using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
78+
7579 auto lockCSR = csr->obtainUniqueOwnership ();
7680
7781 auto anyCommandListWithCooperativeKernels = false ;
@@ -177,6 +181,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
177181 heapContainer.push_back (element);
178182 }
179183 }
184+
185+ partitionCount = std::max (partitionCount, commandList->partitionCount );
180186 }
181187
182188 size_t linearStreamSizeEstimate = totalCmdBuffers * sizeof (MI_BATCH_BUFFER_START);
@@ -240,6 +246,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
240246 }
241247
242248 linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite () : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation (hwInfo);
249+ if (partitionCount > 1 ) {
250+ linearStreamSizeEstimate += sizeof (MI_LOAD_REGISTER_MEM) + sizeof (MI_LOAD_REGISTER_IMM);
251+ }
252+
243253 size_t alignedSize = alignUp<size_t >(linearStreamSizeEstimate, minCmdBufferPtrAlign);
244254 size_t padding = alignedSize - linearStreamSizeEstimate;
245255 reserveLinearStreamSize (alignedSize);
@@ -399,6 +409,17 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
399409
400410 commandQueuePreemptionMode = statePreemption;
401411
412+ if (partitionCount > 1 ) {
413+ uint64_t workPartitionAddress = csr->getWorkPartitionAllocationGpuAddress ();
414+ NEO::EncodeSetMMIO<GfxFamily>::encodeMEM (child,
415+ NEO::PartitionRegisters<GfxFamily>::wparidCCSOffset,
416+ workPartitionAddress);
417+ NEO::EncodeSetMMIO<GfxFamily>::encodeIMM (child,
418+ NEO::PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
419+ addressOffset,
420+ true );
421+ }
422+
402423 if (hFence) {
403424 csr->makeResident (fence->getAllocation ());
404425 if (isCopyOnlyCommandQueue) {
@@ -407,6 +428,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
407428 NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw (child, fence->getGpuAddress (), Fence::STATE_SIGNALED, args);
408429 } else {
409430 NEO::PipeControlArgs args (true );
431+ if (partitionCount > 1 ) {
432+ args.workloadPartitionOffset = true ;
433+ fence->setPartitionCount (partitionCount);
434+ }
410435 NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation (
411436 child, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
412437 fence->getGpuAddress (),
@@ -539,6 +564,9 @@ void CommandQueueHw<gfxCoreFamily>::dispatchTaskCountWrite(NEO::LinearStream &co
539564 NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw (commandStream, gpuAddress, taskCountToWrite, args);
540565 } else {
541566 NEO::PipeControlArgs args (true );
567+ if (partitionCount > 1 ) {
568+ args.workloadPartitionOffset = true ;
569+ }
542570 args.notifyEnable = csr->isUsedNotifyEnableForPostSync ();
543571 NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation (
544572 commandStream,
0 commit comments