Skip to content

Commit 6b299a3

Browse files
Make partitioned post sync operations for partitioned workloads
Signed-off-by: Zbigniew Zdanowicz <[email protected]>
1 parent 86f8150 commit 6b299a3

File tree

22 files changed

+358
-55
lines changed

22 files changed

+358
-55
lines changed

level_zero/core/source/cmdlist/cmdlist.h

Lines changed: 23 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -216,12 +216,6 @@ struct CommandList : _ze_command_list_handle_t {
216216
TYPE_IMMEDIATE = 1u
217217
};
218218

219-
CommandQueue *cmdQImmediate = nullptr;
220-
NEO::CommandStreamReceiver *csr = nullptr;
221-
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
222-
Device *device = nullptr;
223-
std::vector<Kernel *> printfFunctionContainer;
224-
225219
virtual ze_result_t executeCommandListImmediate(bool performMigration) = 0;
226220
virtual ze_result_t initialize(Device *device, NEO::EngineGroupType engineGroupType, ze_command_list_flags_t flags) = 0;
227221
virtual ~CommandList();
@@ -241,33 +235,41 @@ struct CommandList : _ze_command_list_handle_t {
241235
return commandsToPatch;
242236
}
243237

244-
bool isSyncModeQueue = false;
245-
bool commandListSLMEnabled = false;
246-
uint32_t commandListPerThreadScratchSize = 0u;
238+
void makeResidentAndMigrate(bool);
239+
void migrateSharedAllocations();
240+
241+
std::vector<Kernel *> printfFunctionContainer;
242+
CommandQueue *cmdQImmediate = nullptr;
243+
NEO::CommandStreamReceiver *csr = nullptr;
244+
Device *device = nullptr;
247245
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
246+
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
247+
uint32_t commandListPerThreadScratchSize = 0u;
248248
uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin;
249+
uint32_t partitionCount = 1;
249250
bool isFlushTaskSubmissionEnabled = false;
250-
251-
void makeResidentAndMigrate(bool);
252-
void migrateSharedAllocations();
251+
bool isSyncModeQueue = false;
252+
bool commandListSLMEnabled = false;
253253

254254
protected:
255-
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
256-
NEO::EngineGroupType engineGroupType;
257-
ze_command_list_flags_t flags = 0u;
258-
UnifiedMemoryControls unifiedMemoryControls;
259-
bool indirectAllocationsAllowed = false;
260-
bool internalUsage = false;
261-
bool containsCooperativeKernelsFlag = false;
262255
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
263256
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize);
264-
bool containsStatelessUncachedResource = false;
257+
258+
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
259+
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
265260

266261
NEO::StreamProperties requiredStreamState{};
267262
NEO::StreamProperties finalStreamState{};
268263
CommandsToPatch commandsToPatch{};
269264

270-
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
265+
ze_command_list_flags_t flags = 0u;
266+
UnifiedMemoryControls unifiedMemoryControls;
267+
268+
NEO::EngineGroupType engineGroupType;
269+
bool indirectAllocationsAllowed = false;
270+
bool internalUsage = false;
271+
bool containsCooperativeKernelsFlag = false;
272+
bool containsStatelessUncachedResource = false;
271273
};
272274

273275
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
108108
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
109109
}
110110
this->ownedPrivateAllocations.clear();
111-
111+
partitionCount = 1;
112112
return ZE_RESULT_SUCCESS;
113113
}
114114

level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -228,6 +228,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
228228
partitionCount,
229229
internalUsage,
230230
isCooperative);
231+
this->partitionCount = std::max(partitionCount, this->partitionCount);
231232
if (hEvent) {
232233
auto event = Event::fromHandle(hEvent);
233234
if (partitionCount > 1) {

level_zero/core/source/cmdqueue/cmdqueue.cpp

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -106,10 +106,17 @@ ze_result_t CommandQueueImp::synchronizeByPollingForTaskCount(uint64_t timeout)
106106
timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
107107
}
108108

109-
csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, this->taskCount);
110-
111-
if (*csr->getTagAddress() < taskCountToWait) {
112-
return ZE_RESULT_NOT_READY;
109+
if (partitionCount > 1) {
110+
volatile uint32_t *pollAddress = csr->getTagAddress();
111+
for (uint32_t i = 0; i < partitionCount; i++) {
112+
csr->waitForCompletionWithTimeout(pollAddress, enableTimeout, timeoutMicroseconds, this->taskCount);
113+
pollAddress += addressOffsetDwords;
114+
}
115+
} else {
116+
csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, this->taskCount);
117+
if (*csr->getTagAddress() < taskCountToWait) {
118+
return ZE_RESULT_NOT_READY;
119+
}
113120
}
114121

115122
postSyncOperations();

level_zero/core/source/cmdqueue/cmdqueue.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
5757

5858
protected:
5959
NEO::PreemptionMode commandQueuePreemptionMode = NEO::PreemptionMode::Initial;
60+
uint32_t partitionCount = 1;
6061
bool preemptionCmdSyncProgramming = true;
6162
bool commandQueueDebugCmdsProgrammed = false;
6263
bool isCopyOnlyCommandQueue = false;

level_zero/core/source/cmdqueue/cmdqueue_hw.inl

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
#include "shared/source/built_ins/built_ins.h"
1111
#include "shared/source/built_ins/sip.h"
1212
#include "shared/source/command_container/command_encoder.h"
13+
#include "shared/source/command_container/implicit_scaling.h"
1314
#include "shared/source/command_stream/command_stream_receiver_hw.h"
1415
#include "shared/source/command_stream/linear_stream.h"
1516
#include "shared/source/command_stream/preemption.h"
@@ -72,6 +73,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
7273
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
7374
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
7475

76+
using MI_LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
77+
using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
78+
7579
auto lockCSR = csr->obtainUniqueOwnership();
7680

7781
auto anyCommandListWithCooperativeKernels = false;
@@ -177,6 +181,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
177181
heapContainer.push_back(element);
178182
}
179183
}
184+
185+
partitionCount = std::max(partitionCount, commandList->partitionCount);
180186
}
181187

182188
size_t linearStreamSizeEstimate = totalCmdBuffers * sizeof(MI_BATCH_BUFFER_START);
@@ -240,6 +246,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
240246
}
241247

242248
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
249+
if (partitionCount > 1) {
250+
linearStreamSizeEstimate += sizeof(MI_LOAD_REGISTER_MEM) + sizeof(MI_LOAD_REGISTER_IMM);
251+
}
252+
243253
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
244254
size_t padding = alignedSize - linearStreamSizeEstimate;
245255
reserveLinearStreamSize(alignedSize);
@@ -399,6 +409,17 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
399409

400410
commandQueuePreemptionMode = statePreemption;
401411

412+
if (partitionCount > 1) {
413+
uint64_t workPartitionAddress = csr->getWorkPartitionAllocationGpuAddress();
414+
NEO::EncodeSetMMIO<GfxFamily>::encodeMEM(child,
415+
NEO::PartitionRegisters<GfxFamily>::wparidCCSOffset,
416+
workPartitionAddress);
417+
NEO::EncodeSetMMIO<GfxFamily>::encodeIMM(child,
418+
NEO::PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
419+
addressOffset,
420+
true);
421+
}
422+
402423
if (hFence) {
403424
csr->makeResident(fence->getAllocation());
404425
if (isCopyOnlyCommandQueue) {
@@ -407,6 +428,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
407428
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(child, fence->getGpuAddress(), Fence::STATE_SIGNALED, args);
408429
} else {
409430
NEO::PipeControlArgs args(true);
431+
if (partitionCount > 1) {
432+
args.workloadPartitionOffset = true;
433+
fence->setPartitionCount(partitionCount);
434+
}
410435
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
411436
child, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
412437
fence->getGpuAddress(),
@@ -539,6 +564,9 @@ void CommandQueueHw<gfxCoreFamily>::dispatchTaskCountWrite(NEO::LinearStream &co
539564
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStream, gpuAddress, taskCountToWrite, args);
540565
} else {
541566
NEO::PipeControlArgs args(true);
567+
if (partitionCount > 1) {
568+
args.workloadPartitionOffset = true;
569+
}
542570
args.notifyEnable = csr->isUsedNotifyEnableForPostSync();
543571
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
544572
commandStream,

level_zero/core/source/cmdqueue/cmdqueue_imp.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,9 @@ struct CommandQueueImp : public CommandQueue {
6262
MemoryConstants::cacheLineSize +
6363
NEO::CSRequirements::csOverfetchSize;
6464

65+
static constexpr uint32_t addressOffsetDwords = 2u;
66+
static constexpr uint32_t addressOffset = sizeof(uint32_t) * addressOffsetDwords;
67+
6568
CommandQueueImp() = delete;
6669
CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc);
6770

level_zero/core/source/fence/fence.cpp

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,15 @@ ze_result_t FenceImp::queryStatus() {
3737
csr->downloadAllocations();
3838
}
3939

40-
uint64_t *hostAddr = static_cast<uint64_t *>(allocation->getUnderlyingBuffer());
40+
void *hostAddr = static_cast<uint64_t *>(allocation->getUnderlyingBuffer());
4141
uint32_t queryVal = Fence::STATE_CLEARED;
42-
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), static_cast<void *>(hostAddr), sizeof(uint32_t));
42+
for (uint32_t i = 0; i < partitionCount; i++) {
43+
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), hostAddr, sizeof(uint32_t));
44+
if (queryVal == Fence::STATE_CLEARED) {
45+
break;
46+
}
47+
hostAddr = ptrOffset(hostAddr, CommandQueueImp::addressOffset);
48+
}
4349
return queryVal == Fence::STATE_CLEARED ? ZE_RESULT_NOT_READY : ZE_RESULT_SUCCESS;
4450
}
4551

level_zero/core/source/fence/fence.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,8 +47,13 @@ struct Fence : _ze_fence_handle_t {
4747
return allocation->getGpuAddress();
4848
}
4949

50+
void setPartitionCount(uint32_t newPartitionCount) {
51+
partitionCount = newPartitionCount;
52+
}
53+
5054
protected:
5155
NEO::GraphicsAllocation *allocation = nullptr;
56+
uint32_t partitionCount = 1;
5257
};
5358

5459
struct FenceImp : public Fence {

level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,8 +27,10 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
2727
using BaseClass::printfFunctionContainer;
2828
using BaseClass::submitBatchBuffer;
2929
using BaseClass::synchronizeByPollingForTaskCount;
30+
using BaseClass::taskCount;
3031
using CommandQueue::commandQueuePreemptionMode;
3132
using CommandQueue::internalUsage;
33+
using CommandQueue::partitionCount;
3234

3335
WhiteBox(Device *device, NEO::CommandStreamReceiver *csr,
3436
const ze_command_queue_desc_t *desc);
@@ -85,6 +87,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
8587
using BaseClass::commandStream;
8688
using BaseClass::printfFunctionContainer;
8789
using L0::CommandQueue::internalUsage;
90+
using L0::CommandQueue::partitionCount;
8891
using L0::CommandQueue::preemptionCmdSyncProgramming;
8992
using L0::CommandQueueImp::csr;
9093

0 commit comments

Comments
 (0)