Skip to content

Commit 61510e9

Browse files
Revert the optimization of the gpgpu csr's mutex lock in the enqueue blit
The optimization remains available under the flag ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission. Related-To: NEO-7011. Signed-off-by: Cencelewska, Katarzyna <[email protected]>
1 parent 19cac22 commit 61510e9

File tree

6 files changed

+108
-6
lines changed

6 files changed

+108
-6
lines changed

opencl/source/command_queue/command_queue_hw.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ class CommandQueueHw : public CommandQueue {
497497
KernelOperation *blockedCommandsData,
498498
TimestampPacketDependencies &timestampPacketDependencies);
499499

500-
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const;
500+
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const;
501501
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
502502

503503
bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo);

opencl/source/command_queue/enqueue_common.h

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,6 +1117,9 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
11171117

11181118
std::unique_ptr<KernelOperation> blockedCommandsData;
11191119
TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
1120+
if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) {
1121+
commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
1122+
}
11201123

11211124
auto blockQueue = false;
11221125
auto taskLevel = 0u;
@@ -1165,7 +1168,9 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
11651168
LinearStream *gpgpuCommandStream = {};
11661169
size_t gpgpuCommandStreamStart = {};
11671170
if (gpgpuSubmission) {
1168-
commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
1171+
if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
1172+
commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
1173+
}
11691174
gpgpuCommandStream = obtainCommandStream<cmdType>(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false);
11701175
gpgpuCommandStreamStart = gpgpuCommandStream->getUsed();
11711176
}
@@ -1182,7 +1187,9 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
11821187
}
11831188

11841189
if (gpgpuSubmission) {
1185-
commandStreamReceiverOwnership.unlock();
1190+
if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
1191+
commandStreamReceiverOwnership.unlock();
1192+
}
11861193
}
11871194

11881195
if (eventBuilder.getEvent()) {
@@ -1199,13 +1206,17 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
11991206
enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr);
12001207

12011208
if (gpgpuSubmission) {
1202-
commandStreamReceiverOwnership.unlock();
1209+
if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
1210+
commandStreamReceiverOwnership.unlock();
1211+
}
12031212
}
12041213
}
12051214

12061215
timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
12071216
csrDeps.copyNodesToNewContainer(*deferredTimestampPackets);
1208-
1217+
if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) {
1218+
commandStreamReceiverOwnership.unlock();
1219+
}
12091220
queueOwnership.unlock();
12101221
bcsCommandStreamReceiverOwnership.unlock();
12111222

opencl/test/unit_test/command_queue/blit_enqueue_2_tests.cpp

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,81 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenSubmissionT
390390
}
391391
}
392392

393+
using BlitEnqueueForceFlagsTests = BlitEnqueueTests<1>;
394+
HWTEST_TEMPLATED_F(BlitEnqueueForceFlagsTests, givenFlagsToForceCsrLockAndNonBlockedQueueWhenEnqueueBlitThenLockAreSetCorrectly) {
395+
using CsrType = UltCommandStreamReceiver<FamilyType>;
396+
auto mockCommandQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
397+
auto mockCsr = static_cast<CsrType *>(&mockCommandQueue->getGpgpuCommandStreamReceiver());
398+
399+
auto buffer = createBuffer(1, false);
400+
buffer->forceDisallowCPUCopy = true;
401+
mockCommandQueue->setQueueBlocked = false;
402+
int hostPtr = 0;
403+
{
404+
DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(-1);
405+
DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1);
406+
mockCsr->recursiveLockCounter = 0u;
407+
mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
408+
EXPECT_EQ(1u, mockCsr->recursiveLockCounter);
409+
}
410+
{
411+
DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(-1);
412+
DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(1);
413+
mockCsr->recursiveLockCounter = 0u;
414+
mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
415+
EXPECT_EQ(1u, mockCsr->recursiveLockCounter);
416+
}
417+
{
418+
DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(1);
419+
DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1);
420+
mockCsr->recursiveLockCounter = 0u;
421+
mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
422+
EXPECT_EQ(0u, mockCsr->recursiveLockCounter);
423+
}
424+
{
425+
DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(1);
426+
DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(1);
427+
mockCsr->recursiveLockCounter = 0u;
428+
mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
429+
EXPECT_EQ(1u, mockCsr->recursiveLockCounter);
430+
}
431+
}
432+
433+
HWTEST_TEMPLATED_F(BlitEnqueueForceFlagsTests, givenFlagToForceCsrLockAndBlockedQueueWhenGpgpuSubmissionForBcsNotRequiredAndCallEnqueueBlitThenLockAreSetCorrectly) {
434+
using CsrType = UltCommandStreamReceiver<FamilyType>;
435+
auto mockCommandQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
436+
auto mockCsr = static_cast<CsrType *>(&mockCommandQueue->getGpgpuCommandStreamReceiver());
437+
438+
auto buffer = createBuffer(1, false);
439+
buffer->forceDisallowCPUCopy = true;
440+
int hostPtr = 0;
441+
442+
DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(1);
443+
444+
mockCsr->recursiveLockCounter = 0u;
445+
mockCommandQueue->setQueueBlocked = true;
446+
mockCommandQueue->forceGpgpuSubmissionForBcsRequired = 0;
447+
mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
448+
EXPECT_EQ(0u, mockCsr->recursiveLockCounter);
449+
}
450+
HWTEST_TEMPLATED_F(BlitEnqueueForceFlagsTests, givenFlagToForceCsrLockAndBlockedQueueWhenGpgpuSubmissionForBcsRequiredAndCallEnqueueBlitThenLockAreSetCorrectly) {
451+
using CsrType = UltCommandStreamReceiver<FamilyType>;
452+
auto mockCommandQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
453+
auto mockCsr = static_cast<CsrType *>(&mockCommandQueue->getGpgpuCommandStreamReceiver());
454+
455+
auto buffer = createBuffer(1, false);
456+
buffer->forceDisallowCPUCopy = true;
457+
int hostPtr = 0;
458+
459+
DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(1);
460+
461+
mockCsr->recursiveLockCounter = 0u;
462+
mockCommandQueue->setQueueBlocked = true;
463+
mockCommandQueue->forceGpgpuSubmissionForBcsRequired = 1;
464+
mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
465+
EXPECT_EQ(1u, mockCsr->recursiveLockCounter);
466+
}
467+
393468
using BlitCopyTests = BlitEnqueueTests<1>;
394469

395470
HWTEST_TEMPLATED_F(BlitCopyTests, givenKernelAllocationInLocalMemoryWhenCreatingWithoutAllowedCpuAccessThenUseBcsForTransfer) {

opencl/test/unit_test/mocks/mock_command_queue.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,18 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
382382
isBlitEnqueueImageAllowed = BaseClass::blitEnqueueImageAllowed(origin, region, image);
383383
return isBlitEnqueueImageAllowed;
384384
}
385+
bool isQueueBlocked() override {
386+
if (setQueueBlocked != -1) {
387+
return setQueueBlocked;
388+
}
389+
return BaseClass::isQueueBlocked();
390+
}
391+
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies) const override {
392+
if (forceGpgpuSubmissionForBcsRequired != -1) {
393+
return forceGpgpuSubmissionForBcsRequired;
394+
}
395+
return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies);
396+
}
385397

386398
unsigned int lastCommandType;
387399
std::vector<Kernel *> lastEnqueuedKernels;
@@ -396,6 +408,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
396408
bool notifyEnqueueSVMMemcpyCalled = false;
397409
bool cpuDataTransferHandlerCalled = false;
398410
bool useBcsCsrOnNotifyEnabled = false;
411+
int setQueueBlocked = -1;
412+
int forceGpgpuSubmissionForBcsRequired = -1;
399413
mutable bool isBlitEnqueueImageAllowed = false;
400414
struct OverrideReturnValue {
401415
bool enabled = false;

shared/source/debug_settings/debug_variables_base.inl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideLeastOccupiedBank, -1, "-1: default, >=
112112
DECLARE_DEBUG_VARIABLE(int32_t, OverrideRevision, -1, "-1: default, >=0: Revision id")
113113
DECLARE_DEBUG_VARIABLE(int32_t, ForceCacheFlushForBcs, -1, "Force cache flush from gpgpu engine before dispatching BCS copy. -1: default, 1: enabled, 0: disabled")
114114
DECLARE_DEBUG_VARIABLE(int32_t, ForceGpgpuSubmissionForBcsEnqueue, -1, "-1: Default, 1: Submit gpgpu command buffer with cache flushing and completion synchronization, 0: Do nothing, if possible")
115+
DECLARE_DEBUG_VARIABLE(int32_t, ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission, -1, "-1: Default, 1: Force gpgpu command stream receiver lock for bcs enqueue only when gpgpu submission, 0: Do nothing, if possible")
115116
DECLARE_DEBUG_VARIABLE(int32_t, EnableUsmCompression, -1, "enable compression support for L0 USM Device and Shared Device side: -1 default, 0: disable, 1: enable")
116117
DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmSupport, -1, "-1: default, 0: disable, 1: enable, Enables USM host memory")
117118
DECLARE_DEBUG_VARIABLE(int32_t, MediaVfeStateMaxSubSlices, -1, ">=0: Programs Media Vfe State Maximum Number of Dual-Subslices to given value ")

shared/test/common/test_files/igdrcl.config

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,4 +440,5 @@ OverrideL1CachePolicyInSurfaceStateAndStateless = -1
440440
EnableBcsSwControlWa = -1
441441
ExperimentalEnableL0DebuggerForOpenCL = 0
442442
DebuggerDisableSingleAddressSbaTracking = 0
443-
ForceImagesSupport = -1
443+
ForceImagesSupport = -1
444+
ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission = -1

0 commit comments

Comments
 (0)