Skip to content

Commit 0134845

Browse files
Add multi tile barrier to marker command on multi tile device
Related-To: NEO-6262
Signed-off-by: Zbigniew Zdanowicz <[email protected]>
1 parent d8ea439 commit 0134845

12 files changed

414 additions and 313 deletions

opencl/source/command_queue/enqueue_common.h

Lines changed: 16 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -168,8 +168,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
168168
DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
169169

170170
TagNodeBase *hwTimeStamps = nullptr;
171-
172-
auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
171+
CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();
172+
auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();
173173

174174
EventBuilder eventBuilder;
175175
setupEvent(eventBuilder, event, commandType);
@@ -206,17 +206,17 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
206206
BlitPropertiesContainer blitPropertiesContainer;
207207

208208
if (this->context->getRootDeviceIndices().size() > 1) {
209-
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, getGpgpuCommandStreamReceiver());
209+
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver);
210210
}
211211

212212
bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);
213213

214-
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
214+
if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
215215
if (!clearDependenciesForSubCapture) {
216-
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
216+
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, computeCommandStreamReceiver, CsrDependencies::DependenciesType::OnCsr);
217217
}
218218

219-
auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
219+
auto allocator = computeCommandStreamReceiver.getTimestampPacketAllocator();
220220

221221
size_t nodesCount = 0u;
222222
if (isCacheFlushCommand(commandType) || isMarkerWithProfiling) {
@@ -231,7 +231,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
231231
}
232232

233233
if (nodesCount > 0) {
234-
obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, getGpgpuCommandStreamReceiver());
234+
obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, computeCommandStreamReceiver);
235235
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
236236
}
237237
}
@@ -250,10 +250,10 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
250250
}
251251

252252
if (!blockQueue && isOOQEnabled()) {
253-
setupBarrierTimestampForBcsEngines(getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), timestampPacketDependencies);
253+
setupBarrierTimestampForBcsEngines(computeCommandStreamReceiver.getOsContext().getEngineType(), timestampPacketDependencies);
254254
}
255255

256-
if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
256+
if (eventBuilder.getEvent() && computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
257257
eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
258258
eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
259259
eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.auxToNonAuxNodes);
@@ -267,9 +267,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
267267
timestampPacketDependencies);
268268
} else if (isCacheFlushCommand(commandType)) {
269269
processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
270-
} else if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
270+
} else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
271271
if (CL_COMMAND_BARRIER == commandType) {
272-
getGpgpuCommandStreamReceiver().requestStallingCommandsOnNextFlush();
272+
computeCommandStreamReceiver.requestStallingCommandsOnNextFlush();
273273
}
274274

275275
for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
@@ -288,8 +288,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
288288

289289
if (isMarkerWithProfiling) {
290290
if (numEventsInWaitList == 0) {
291-
PipeControlArgs args(false);
292-
MemorySynchronizationCommands<GfxFamily>::addPipeControl(commandStream, args);
291+
computeCommandStreamReceiver.programComputeBarrierCommand(commandStream);
293292
}
294293
processDispatchForMarkerWithTimestampPacket(*this, &commandStream, eventsRequest, csrDeps);
295294
}
@@ -305,7 +304,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
305304

306305
if (!blockQueue && multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->requiresMemoryMigration()) {
307306
for (auto &arg : multiDispatchInfo.peekMainKernel()->getMemObjectsToMigrate()) {
308-
MigrationController::handleMigration(*this->context, getGpgpuCommandStreamReceiver(), arg.second);
307+
MigrationController::handleMigration(*this->context, computeCommandStreamReceiver, arg.second);
309308
migratedMemory = true;
310309
}
311310
}
@@ -315,7 +314,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
315314
}
316315

317316
if (enqueueProperties.operation == EnqueueProperties::Operation::GpuKernel) {
318-
csrDeps.makeResident(getGpgpuCommandStreamReceiver());
317+
csrDeps.makeResident(computeCommandStreamReceiver);
319318

320319
completionStamp = enqueueNonBlocked<commandType>(
321320
surfacesForResidency,
@@ -334,7 +333,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
334333
getBcsForAuxTranslation());
335334

336335
if (parentKernel) {
337-
getGpgpuCommandStreamReceiver().setMediaVFEStateDirty(true);
336+
computeCommandStreamReceiver.setMediaVFEStateDirty(true);
338337

339338
if (devQueueHw->getSchedulerReturnInstance() > 0) {
340339
waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
@@ -427,7 +426,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
427426
}
428427
}
429428
if (migratedMemory) {
430-
getGpgpuCommandStreamReceiver().flushBatchedSubmissions();
429+
computeCommandStreamReceiver.flushBatchedSubmissions();
431430
}
432431
}
433432

opencl/source/command_queue/gpgpu_walker_base.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, c
208208
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();
209209
if (isMarkerWithProfiling) {
210210
if (!eventsInWaitlist) {
211-
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
211+
expectedSizeCS += commandQueue.getGpgpuCommandStreamReceiver().getCmdsSizeForComputeBarrierCommand();
212212
}
213213
expectedSizeCS += 4 * EncodeStoreMMIO<GfxFamily>::size;
214214
}

opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*
66
*/
77

8+
#include "shared/source/command_container/implicit_scaling.h"
89
#include "shared/source/command_stream/scratch_space_controller.h"
910
#include "shared/source/helpers/hw_helper.h"
1011
#include "shared/source/memory_manager/allocations_list.h"
@@ -1044,6 +1045,26 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithoutW
10441045

10451046
EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO<FamilyType>::size + MemorySynchronizationCommands<FamilyType>::getSizeForSinglePipeControl(), extendedCommandStreamSize);
10461047
}
1048+
1049+
HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueKernelTest, givenTimestampWriteEnableOnMultiTileQueueWhenMarkerProfilingWithoutWaitListThenSizeHasFourMMIOStoresAndCrossTileBarrier) {
1050+
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
1051+
csr.timestampPacketWriteEnabled = true;
1052+
csr.activePartitions = 2;
1053+
csr.activePartitionsConfig = 2;
1054+
csr.staticWorkPartitioningEnabled = true;
1055+
1056+
MockKernelWithInternals mockKernel(*pClDevice);
1057+
DispatchInfo dispatchInfo;
1058+
MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
1059+
dispatchInfo.setKernel(mockKernel.mockKernel);
1060+
multiDispatchInfo.push(dispatchInfo);
1061+
1062+
auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false);
1063+
auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false);
1064+
1065+
EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO<FamilyType>::size + ImplicitScalingDispatch<FamilyType>::getBarrierSize(csr.peekHwInfo(), false, false), extendedCommandStreamSize);
1066+
}
1067+
10471068
HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithWaitListThenSizeHasFourMMIOStores) {
10481069
pDevice->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
10491070
MockKernelWithInternals mockKernel(*pClDevice);

0 commit comments

Comments
 (0)