Skip to content

Commit 60805cd

Browse files
Fix direct submission wait on multi tile device using single tile context
Related-To: NEO-6244
Signed-off-by: Zbigniew Zdanowicz <[email protected]>
1 parent fe432ab commit 60805cd

File tree

6 files changed

+134
-7
lines changed

6 files changed

+134
-7
lines changed

shared/source/direct_submission/direct_submission_hw.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,5 +154,6 @@ class DirectSubmissionHw {
154154
bool disableCacheFlush = false;
155155
bool disableMonitorFence = false;
156156
bool partitionedMode = false;
157+
bool partitionConfigSet = true;
157158
};
158159
} // namespace NEO

shared/source/direct_submission/direct_submission_hw.inl

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,19 +144,20 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit) {
144144
if (ret && submitOnInit) {
145145
size_t startBufferSize = Dispatcher::getSizePreemption() +
146146
getSizeSemaphoreSection();
147-
if (this->partitionedMode) {
148-
startBufferSize += EncodeSetMMIO<GfxFamily>::sizeMEM;
149-
startBufferSize += EncodeSetMMIO<GfxFamily>::sizeIMM;
150-
}
147+
151148
Dispatcher::dispatchPreemption(ringCommandStream);
152149
if (this->partitionedMode) {
150+
startBufferSize += (EncodeSetMMIO<GfxFamily>::sizeMEM +
151+
EncodeSetMMIO<GfxFamily>::sizeIMM);
152+
153153
EncodeSetMMIO<GfxFamily>::encodeMEM(ringCommandStream,
154154
PartitionRegisters<GfxFamily>::wparidCCSOffset,
155155
this->workPartitionAllocation->getGpuAddress());
156156
EncodeSetMMIO<GfxFamily>::encodeIMM(ringCommandStream,
157157
PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
158158
CommonConstants::partitionAddressOffset,
159159
true);
160+
this->partitionConfigSet = true;
160161
}
161162
if (workloadMode == 1) {
162163
dispatchDiagnosticModeSection();
@@ -178,12 +179,27 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
178179
}
179180

180181
size_t startSize = getSizeSemaphoreSection();
182+
if (!this->partitionConfigSet) {
183+
startSize += (EncodeSetMMIO<GfxFamily>::sizeMEM +
184+
EncodeSetMMIO<GfxFamily>::sizeIMM);
185+
}
181186
size_t requiredSize = startSize + getSizeDispatch() + getSizeEnd();
182187
if (ringCommandStream.getAvailableSpace() < requiredSize) {
183188
switchRingBuffers();
184189
}
185190
uint64_t gpuStartVa = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
186191

192+
if (!this->partitionConfigSet) {
193+
EncodeSetMMIO<GfxFamily>::encodeMEM(ringCommandStream,
194+
PartitionRegisters<GfxFamily>::wparidCCSOffset,
195+
this->workPartitionAllocation->getGpuAddress());
196+
EncodeSetMMIO<GfxFamily>::encodeIMM(ringCommandStream,
197+
PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
198+
CommonConstants::partitionAddressOffset,
199+
true);
200+
this->partitionConfigSet = true;
201+
}
202+
187203
currentQueueWorkCount++;
188204
dispatchSemaphoreSection(currentQueueWorkCount);
189205

shared/source/direct_submission/linux/drm_direct_submission.inl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,17 @@ DrmDirectSubmission<GfxFamily, Dispatcher>::DrmDirectSubmission(Device &device,
2929
if (DebugManager.flags.DirectSubmissionDisableMonitorFence.get() != -1) {
3030
this->disableMonitorFence = DebugManager.flags.DirectSubmissionDisableMonitorFence.get();
3131
}
32-
auto subDevices = device.getDeviceBitfield();
32+
33+
auto osContextLinux = static_cast<OsContextLinux *>(&this->osContext);
34+
35+
auto subDevices = osContextLinux->getDeviceBitfield();
3336
bool dispatcherSupport = Dispatcher::isMultiTileSynchronizationSupported();
3437
if (ImplicitScalingHelper::isImplicitScalingEnabled(subDevices, true) && dispatcherSupport) {
3538
this->activeTiles = static_cast<uint32_t>(subDevices.count());
3639
}
3740
this->partitionedMode = this->activeTiles > 1u;
38-
auto osContextLinux = static_cast<OsContextLinux *>(&this->osContext);
41+
this->partitionConfigSet = !this->partitionedMode;
42+
3943
osContextLinux->getDrm().setDirectSubmissionActive(true);
4044

4145
if (this->partitionedMode) {

shared/test/common/mocks/mock_direct_submission_hw.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
4545
using BaseClass::getSizeSwitchRingBufferSection;
4646
using BaseClass::hwInfo;
4747
using BaseClass::osContext;
48+
using BaseClass::partitionConfigSet;
4849
using BaseClass::partitionedMode;
4950
using BaseClass::performDiagnosticMode;
5051
using BaseClass::ringBuffer;
@@ -56,6 +57,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
5657
using BaseClass::semaphorePtr;
5758
using BaseClass::semaphores;
5859
using BaseClass::setReturnAddress;
60+
using BaseClass::startRingBuffer;
5961
using BaseClass::stopRingBuffer;
6062
using BaseClass::switchRingBuffersAllocations;
6163
using BaseClass::workloadMode;

shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,15 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest,
4444

4545
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice,
4646
*osContext.get());
47+
EXPECT_TRUE(directSubmission.partitionConfigSet);
48+
directSubmission.partitionConfigSet = false;
4749
directSubmission.disableMonitorFence = false;
4850
directSubmission.partitionedMode = true;
4951
directSubmission.workPartitionAllocation = ultCsr->getWorkPartitionAllocation();
5052

5153
bool ret = directSubmission.initialize(true);
5254
EXPECT_TRUE(ret);
55+
EXPECT_TRUE(directSubmission.partitionConfigSet);
5356
EXPECT_NE(0x0u, directSubmission.ringCommandStream.getUsed());
5457
GraphicsAllocation *oldRingAllocation = directSubmission.ringCommandStream.getGraphicsAllocation();
5558

@@ -119,12 +122,15 @@ HWTEST_F(DirectSubmissionDispatchBufferTest,
119122

120123
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice,
121124
*osContext.get());
125+
EXPECT_TRUE(directSubmission.partitionConfigSet);
122126
directSubmission.activeTiles = 2;
123127
directSubmission.partitionedMode = true;
128+
directSubmission.partitionConfigSet = false;
124129
directSubmission.workPartitionAllocation = ultCsr->getWorkPartitionAllocation();
125130

126131
bool ret = directSubmission.initialize(true);
127132
EXPECT_TRUE(ret);
133+
EXPECT_TRUE(directSubmission.partitionConfigSet);
128134
EXPECT_NE(0x0u, directSubmission.ringCommandStream.getUsed());
129135

130136
size_t submitSize = RenderDispatcher<FamilyType>::getSizePreemption() +
@@ -158,3 +164,60 @@ HWTEST_F(DirectSubmissionDispatchBufferTest,
158164
uint64_t gpuAddress = ultCsr->getWorkPartitionAllocation()->getGpuAddress();
159165
EXPECT_EQ(gpuAddress, loadRegisterMem->getMemoryAddress());
160166
}
167+
168+
// Covers the deferred multi-tile setup path: initialize(false) does not start
// the ring, so the partition configuration must still be pending afterwards
// and has to be emitted by the explicit startRingBuffer() call instead.
HWTEST_F(DirectSubmissionDispatchBufferTest,
169+
givenDirectSubmissionRingNotStartOnInitWhenMultiTileSupportedThenExpectMultiTileConfigSetDuringExplicitRingStart) {
170+
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
171+
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
172+
173+
pDevice->rootCsrCreated = true;
174+
pDevice->numSubDevices = 2;
175+
176+
auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(pDevice->getDefaultEngine().commandStreamReceiver);
177+
ultCsr->staticWorkPartitioningEnabled = true;
178+
ultCsr->createWorkPartitionAllocation(*pDevice);
179+
180+
FlushStampTracker flushStamp(true);
181+
182+
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice,
183+
*osContext.get());
184+
EXPECT_TRUE(directSubmission.partitionConfigSet); // state right after construction of the mock
185+
directSubmission.activeTiles = 2;
186+
directSubmission.partitionedMode = true;
187+
directSubmission.partitionConfigSet = false; // force the "partitioned, but registers not programmed yet" state
188+
directSubmission.workPartitionAllocation = ultCsr->getWorkPartitionAllocation();
189+
190+
bool ret = directSubmission.initialize(false); // submitOnInit == false: the ring must not be started here
191+
EXPECT_TRUE(ret);
192+
EXPECT_FALSE(directSubmission.partitionConfigSet);
193+
EXPECT_FALSE(directSubmission.ringStart);
194+
EXPECT_EQ(0x0u, directSubmission.ringCommandStream.getUsed()); // nothing has been written to the ring stream yet
195+
196+
ret = directSubmission.startRingBuffer(); // explicit start is expected to emit the pending partition configuration
197+
EXPECT_TRUE(ret);
198+
EXPECT_TRUE(directSubmission.partitionConfigSet);
199+
EXPECT_TRUE(directSubmission.ringStart);
200+
201+
HardwareParse hwParse;
202+
hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, 0);
203+
hwParse.findHardwareCommands<FamilyType>();
204+
205+
ASSERT_NE(hwParse.lriList.end(), hwParse.lriList.begin()); // at least one MI_LOAD_REGISTER_IMM must be present
206+
bool partitionRegisterFound = false;
207+
for (auto &it : hwParse.lriList) {
208+
auto loadRegisterImm = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(it);
209+
if (loadRegisterImm->getRegisterOffset() == 0x23B4u) { // presumably PartitionRegisters::addressOffsetCCSOffset - TODO confirm
210+
211+
EXPECT_EQ(8u, loadRegisterImm->getDataDword()); // presumably CommonConstants::partitionAddressOffset - TODO confirm
212+
partitionRegisterFound = true;
213+
}
214+
}
215+
EXPECT_TRUE(partitionRegisterFound);
216+
217+
auto loadRegisterMemItor = find<MI_LOAD_REGISTER_MEM *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
218+
ASSERT_NE(hwParse.cmdList.end(), loadRegisterMemItor);
219+
auto loadRegisterMem = reinterpret_cast<MI_LOAD_REGISTER_MEM *>(*loadRegisterMemItor);
220+
EXPECT_EQ(0x221Cu, loadRegisterMem->getRegisterAddress()); // presumably PartitionRegisters::wparidCCSOffset - TODO confirm
221+
uint64_t gpuAddress = ultCsr->getWorkPartitionAllocation()->getGpuAddress();
222+
EXPECT_EQ(gpuAddress, loadRegisterMem->getMemoryAddress()); // the register is loaded from the work partition allocation
223+
}

shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ struct MockDrmDirectSubmission : public DrmDirectSubmission<GfxFamily, Dispatche
6767
using BaseClass::handleNewResourcesSubmission;
6868
using BaseClass::handleResidency;
6969
using BaseClass::isNewResourceHandleNeeded;
70+
using BaseClass::partitionConfigSet;
7071
using BaseClass::partitionedMode;
7172
using BaseClass::ringStart;
7273
using BaseClass::submit;
@@ -314,14 +315,21 @@ HWTEST_F(DrmDirectSubmissionTest, givenMultipleActiveTilesWhenWaitingForTagUpdat
314315
EXPECT_EQ(2u, CpuIntrinsicsTests::pauseCounter);
315316
}
316317

317-
HWTEST_F(DrmDirectSubmissionTest, givenRenderDispatcherAndMultiTileDeviceWhenCreatingDirectSubmissionThenExpectActiveTilesMatchSubDeviceCount) {
318+
HWTEST_F(DrmDirectSubmissionTest,
319+
givenRenderDispatcherAndMultiTileDeviceWhenCreatingDirectSubmissionUsingMultiTileContextThenExpectActiveTilesMatchSubDeviceCount) {
318320
using Dispatcher = RenderDispatcher<FamilyType>;
319321

320322
VariableBackup<bool> backup(&ImplicitScaling::apiSupport, true);
321323
device->deviceBitfield.set(0b11);
322324
device->rootCsrCreated = true;
323325
device->numSubDevices = 2;
324326

327+
osContext = std::make_unique<OsContextLinux>(*executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as<Drm>(), 0u,
328+
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular},
329+
PreemptionMode::ThreadGroup, device->getDeviceBitfield()));
330+
osContext->ensureContextInitialized();
331+
EXPECT_EQ(2u, osContext->getDeviceBitfield().count());
332+
325333
auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(device->getDefaultEngine().commandStreamReceiver);
326334
ultCsr->staticWorkPartitioningEnabled = true;
327335
ultCsr->createWorkPartitionAllocation(*device);
@@ -331,6 +339,32 @@ HWTEST_F(DrmDirectSubmissionTest, givenRenderDispatcherAndMultiTileDeviceWhenCre
331339

332340
EXPECT_EQ(2u, directSubmission.activeTiles);
333341
EXPECT_TRUE(directSubmission.partitionedMode);
342+
EXPECT_FALSE(directSubmission.partitionConfigSet);
343+
344+
bool ret = directSubmission.allocateResources();
345+
EXPECT_TRUE(ret);
346+
}
347+
348+
HWTEST_F(DrmDirectSubmissionTest, givenRenderDispatcherAndMultiTileDeviceWhenCreatingDirectSubmissionSingleTileContextThenExpectActiveTilesEqualsSingleTile) {
349+
using Dispatcher = RenderDispatcher<FamilyType>;
350+
351+
VariableBackup<bool> backup(&ImplicitScaling::apiSupport, true);
352+
device->deviceBitfield.set(0b11);
353+
device->rootCsrCreated = true;
354+
device->numSubDevices = 2;
355+
356+
EXPECT_EQ(1u, osContext->getDeviceBitfield().count());
357+
358+
auto ultCsr = reinterpret_cast<UltCommandStreamReceiver<FamilyType> *>(device->getDefaultEngine().commandStreamReceiver);
359+
ultCsr->staticWorkPartitioningEnabled = true;
360+
ultCsr->createWorkPartitionAllocation(*device);
361+
362+
MockDrmDirectSubmission<FamilyType, Dispatcher> directSubmission(*device.get(),
363+
*osContext.get());
364+
365+
EXPECT_EQ(1u, directSubmission.activeTiles);
366+
EXPECT_FALSE(directSubmission.partitionedMode);
367+
EXPECT_TRUE(directSubmission.partitionConfigSet);
334368

335369
bool ret = directSubmission.allocateResources();
336370
EXPECT_TRUE(ret);
@@ -342,11 +376,18 @@ HWTEST_F(DrmDirectSubmissionTest, givenBlitterDispatcherAndMultiTileDeviceWhenCr
342376
VariableBackup<bool> backup(&ImplicitScaling::apiSupport, true);
343377
device->deviceBitfield.set(0b11);
344378

379+
osContext = std::make_unique<OsContextLinux>(*executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as<Drm>(), 0u,
380+
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular},
381+
PreemptionMode::ThreadGroup, device->getDeviceBitfield()));
382+
osContext->ensureContextInitialized();
383+
EXPECT_EQ(2u, osContext->getDeviceBitfield().count());
384+
345385
MockDrmDirectSubmission<FamilyType, Dispatcher> directSubmission(*device.get(),
346386
*osContext.get());
347387

348388
EXPECT_EQ(1u, directSubmission.activeTiles);
349389
EXPECT_FALSE(directSubmission.partitionedMode);
390+
EXPECT_TRUE(directSubmission.partitionConfigSet);
350391

351392
bool ret = directSubmission.allocateResources();
352393
EXPECT_TRUE(ret);

0 commit comments

Comments
 (0)