Skip to content

Commit 7a5c9d3

Browse files
Encode dispatch kernel with global bindless heaps
Signed-off-by: Maciej Plewka <[email protected]>
1 parent be90b9f commit 7a5c9d3

File tree

14 files changed

+478
-149
lines changed

14 files changed

+478
-149
lines changed

level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -178,7 +178,7 @@ HWTEST_F(CommandListAppendLaunchKernel, WhenAppendingMultipleTimesThenSshIsNotDe
178178
auto sshHeapSize = ssh->getMaxAvailableSpace();
179179
auto initialAllocation = ssh->getGraphicsAllocation();
180180
EXPECT_NE(nullptr, initialAllocation);
181-
181+
const_cast<KernelDescriptor::AddressingMode &>(kernel->getKernelDescriptor().kernelAttributes.bufferAddressingMode) = KernelDescriptor::BindfulAndStateless;
182182
for (size_t i = 0; i < sshHeapSize / kernelSshSize + 1; i++) {
183183
auto result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr);
184184
ASSERT_EQ(ZE_RESULT_SUCCESS, result);

opencl/source/helpers/hardware_commands_helper_base.inl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
241241
uint32_t samplerCount = 0;
242242
if (patchInfo.samplerStateArray) {
243243
samplerCount = patchInfo.samplerStateArray->Count;
244-
samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, patchInfo.samplerStateArray->Offset, samplerCount, patchInfo.samplerStateArray->BorderColorOffset, kernel.getDynamicStateHeap());
244+
samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, patchInfo.samplerStateArray->Offset, samplerCount, patchInfo.samplerStateArray->BorderColorOffset, kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper());
245245
}
246246

247247
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;

shared/source/command_container/command_encoder.h

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ namespace NEO {
2121
class GmmHelper;
2222
struct HardwareInfo;
2323
class IndirectHeap;
24+
class BindlessHeapsHelper;
2425

2526
template <typename GfxFamily>
2627
struct EncodeDispatchKernel {
@@ -46,8 +47,6 @@ struct EncodeDispatchKernel {
4647

4748
static size_t estimateEncodeDispatchKernelCmdsSize(Device *device);
4849

49-
static void patchBindlessSurfaceStateOffsets(const size_t sshOffset, const KernelDescriptor &kernelDesc, uint8_t *crossThread);
50-
5150
static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
5251
size_t *lws,
5352
std::array<uint8_t, 3> walkOrder,
@@ -84,6 +83,7 @@ struct EncodeStates {
8483
using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
8584
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
8685
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
86+
using SAMPLER_BORDER_COLOR_STATE = typename GfxFamily::SAMPLER_BORDER_COLOR_STATE;
8787

8888
static const uint32_t alignIndirectStatePointer = MemoryConstants::cacheLineSize;
8989
static const size_t alignInterfaceDescriptorData = MemoryConstants::cacheLineSize;
@@ -92,7 +92,8 @@ struct EncodeStates {
9292
uint32_t samplerStateOffset,
9393
uint32_t samplerCount,
9494
uint32_t borderColorOffset,
95-
const void *fnDynamicStateHeap);
95+
const void *fnDynamicStateHeap,
96+
BindlessHeapsHelper *bindlessHeapHelper);
9697

9798
static void adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency);
9899

shared/source/command_container/command_encoder.inl

Lines changed: 34 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@
1212
#include "shared/source/execution_environment/execution_environment.h"
1313
#include "shared/source/gmm_helper/gmm.h"
1414
#include "shared/source/gmm_helper/gmm_helper.h"
15+
#include "shared/source/helpers/api_specific_config.h"
16+
#include "shared/source/helpers/bindless_heaps_helper.h"
1517
#include "shared/source/helpers/hw_helper.h"
1618
#include "shared/source/helpers/local_id_gen.h"
1719
#include "shared/source/helpers/preamble.h"
@@ -30,22 +32,44 @@ uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
3032
uint32_t samplerStateOffset,
3133
uint32_t samplerCount,
3234
uint32_t borderColorOffset,
33-
const void *fnDynamicStateHeap) {
35+
const void *fnDynamicStateHeap,
36+
BindlessHeapsHelper *bindlessHeapHelper) {
3437
auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount;
3538
auto borderColorSize = samplerStateOffset - borderColorOffset;
3639

37-
dsh->align(EncodeStates<Family>::alignIndirectStatePointer);
38-
auto borderColorOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
40+
SAMPLER_STATE *dstSamplerState = nullptr;
41+
uint32_t samplerStateOffsetInDsh = 0;
3942

40-
auto borderColor = dsh->getSpace(borderColorSize);
43+
dsh->align(EncodeStates<Family>::alignIndirectStatePointer);
44+
uint32_t borderColorOffsetInDsh = 0;
45+
if (!ApiSpecificConfig::getBindlessConfiguration()) {
46+
borderColorOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
47+
auto borderColor = dsh->getSpace(borderColorSize);
4148

42-
memcpy_s(borderColor, borderColorSize, ptrOffset(fnDynamicStateHeap, borderColorOffset),
43-
borderColorSize);
49+
memcpy_s(borderColor, borderColorSize, ptrOffset(fnDynamicStateHeap, borderColorOffset),
50+
borderColorSize);
4451

45-
dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
46-
auto samplerStateOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
52+
dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
53+
samplerStateOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
4754

48-
auto dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(dsh->getSpace(sizeSamplerState));
55+
dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(dsh->getSpace(sizeSamplerState));
56+
} else {
57+
auto borderColor = reinterpret_cast<const SAMPLER_BORDER_COLOR_STATE *>(ptrOffset(fnDynamicStateHeap, borderColorOffset));
58+
if (borderColor->getBorderColorRed() != 0.0f ||
59+
borderColor->getBorderColorGreen() != 0.0f ||
60+
borderColor->getBorderColorBlue() != 0.0f ||
61+
(borderColor->getBorderColorAlpha() != 0.0f && borderColor->getBorderColorAlpha() != 1.0f)) {
62+
UNRECOVERABLE_IF(true);
63+
} else if (borderColor->getBorderColorAlpha() == 0.0f) {
64+
borderColorOffsetInDsh = bindlessHeapHelper->getDefaultBorderColorOffset();
65+
} else {
66+
borderColorOffsetInDsh = bindlessHeapHelper->getAlphaBorderColorOffset();
67+
}
68+
dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
69+
auto samplerStateInDsh = bindlessHeapHelper->allocateSSInHeap(sizeSamplerState, nullptr, BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH);
70+
dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(samplerStateInDsh.ssPtr);
71+
samplerStateOffsetInDsh = static_cast<uint32_t>(samplerStateInDsh.surfaceStateOffset);
72+
}
4973

5074
auto srcSamplerState = reinterpret_cast<const SAMPLER_STATE *>(ptrOffset(fnDynamicStateHeap, samplerStateOffset));
5175
SAMPLER_STATE state = {};
@@ -56,7 +80,7 @@ uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
5680
}
5781

5882
return samplerStateOffsetInDsh;
59-
}
83+
} // namespace NEO
6084

6185
template <typename Family>
6286
size_t EncodeStates<Family>::getAdjustStateComputeModeSize() {
@@ -382,40 +406,6 @@ void *EncodeDispatchKernel<Family>::getInterfaceDescriptor(CommandContainer &con
382406
return &interfaceDescriptorData[container.nextIddInBlock++];
383407
}
384408

385-
template <typename Family>
386-
void EncodeDispatchKernel<Family>::patchBindlessSurfaceStateOffsets(const size_t sshOffset, const KernelDescriptor &kernelDesc, uint8_t *crossThread) {
387-
auto &hwHelper = HwHelperHw<Family>::get();
388-
389-
for (const auto &argT : kernelDesc.payloadMappings.explicitArgs) {
390-
CrossThreadDataOffset bindless = undefined<CrossThreadDataOffset>;
391-
SurfaceStateHeapOffset bindful = undefined<SurfaceStateHeapOffset>;
392-
393-
switch (argT.type) {
394-
case ArgDescriptor::ArgTPointer: {
395-
auto &arg = argT.as<NEO::ArgDescPointer>();
396-
bindless = arg.bindless;
397-
bindful = arg.bindful;
398-
} break;
399-
400-
case ArgDescriptor::ArgTImage: {
401-
auto &arg = argT.as<NEO::ArgDescImage>();
402-
bindless = arg.bindless;
403-
bindful = arg.bindful;
404-
} break;
405-
406-
default:
407-
break;
408-
}
409-
410-
if (NEO::isValidOffset(bindless)) {
411-
auto patchLocation = ptrOffset(crossThread, bindless);
412-
auto bindlessOffset = static_cast<uint32_t>(sshOffset) + bindful;
413-
auto patchValue = hwHelper.getBindlessSurfaceExtendedMessageDescriptorValue(bindlessOffset);
414-
patchWithRequiredSize(patchLocation, sizeof(patchValue), patchValue);
415-
}
416-
}
417-
}
418-
419409
template <typename Family>
420410
bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDescriptor &kernelDesc) {
421411
auto checkKernelForInlineData = true;

shared/source/command_container/command_encoder_bdw_plus.inl

Lines changed: 20 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
#include "shared/source/command_stream/preemption.h"
1212
#include "shared/source/execution_environment/execution_environment.h"
1313
#include "shared/source/gmm_helper/gmm_helper.h"
14+
#include "shared/source/helpers/api_specific_config.h"
1415
#include "shared/source/helpers/hw_helper.h"
1516
#include "shared/source/helpers/simd_helper.h"
1617
#include "shared/source/helpers/state_base_address.h"
@@ -79,22 +80,25 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
7980

8081
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
8182
uint32_t bindingTablePointer = 0u;
83+
bool isBindlessKernel = kernelDescriptor.kernelAttributes.bufferAddressingMode == KernelDescriptor::BindlessAndStateless;
84+
if (!isBindlessKernel) {
85+
86+
if (bindingTableStateCount > 0u) {
87+
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
88+
sshOffset = ssh->getUsed();
89+
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
90+
*ssh, bindingTableStateCount,
91+
dispatchInterface->getSurfaceStateHeapData(),
92+
dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
93+
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
94+
}
8295

83-
if (bindingTableStateCount > 0u) {
84-
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
85-
sshOffset = ssh->getUsed();
86-
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
87-
*ssh, bindingTableStateCount,
88-
dispatchInterface->getSurfaceStateHeapData(),
89-
dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
90-
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
96+
idd.setBindingTablePointer(bindingTablePointer);
9197
}
9298

93-
idd.setBindingTablePointer(bindingTablePointer);
94-
9599
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, preemptionMode);
96100

97-
auto heap = container.getIndirectHeap(HeapType::DYNAMIC_STATE);
101+
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
98102
UNRECOVERABLE_IF(!heap);
99103

100104
uint32_t samplerStateOffset = 0;
@@ -105,7 +109,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
105109
samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
106110
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
107111
kernelDescriptor.payloadMappings.samplerTable.borderColor,
108-
dispatchInterface->getDynamicStateHeapData());
112+
dispatchInterface->getDynamicStateHeapData(),
113+
device->getBindlessHeapsHelper());
114+
if (ApiSpecificConfig::getBindlessConfiguration()) {
115+
container.getResidencyContainer().push_back(device->getBindlessHeapsHelper()->getHeap(NEO::BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)->getGraphicsAllocation());
116+
}
109117
}
110118

111119
idd.setSamplerStatePointer(samplerStateOffset);
@@ -139,10 +147,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
139147
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
140148
}
141149

142-
if (kernelDescriptor.payloadMappings.bindingTable.numEntries > 0) {
143-
patchBindlessSurfaceStateOffsets(sshOffset, dispatchInterface->getKernelDescriptor(), reinterpret_cast<uint8_t *>(ptr));
144-
}
145-
146150
ptr = ptrOffset(ptr, sizeCrossThreadData);
147151
memcpy_s(ptr, sizePerThreadDataForWholeGroup,
148152
dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);

shared/source/generated/gen11/hw_cmds_generated_gen11.inl

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5344,4 +5344,61 @@ typedef struct tagASYNC_SLICE_COUNT_SELECT_REGISTER {
53445344
} ASYNC_SLICE_COUNT_SELECT_REGISTER;
53455345
STATIC_ASSERT(4 == sizeof(ASYNC_SLICE_COUNT_SELECT_REGISTER));
53465346

5347+
typedef struct tagSAMPLER_BORDER_COLOR_STATE {
5348+
union tagTheStructure {
5349+
struct tagCommon {
5350+
// DWORD 0
5351+
float BorderColorRed;
5352+
// DWORD 1
5353+
float BorderColorGreen;
5354+
// DWORD 2
5355+
float BorderColorBlue;
5356+
// DWORD 3
5357+
float BorderColorAlpha;
5358+
} Common;
5359+
uint32_t RawData[4];
5360+
} TheStructure;
5361+
inline void init(void) {
5362+
memset(&TheStructure, 0, sizeof(TheStructure));
5363+
TheStructure.Common.BorderColorRed = 0.0;
5364+
TheStructure.Common.BorderColorGreen = 0.0;
5365+
TheStructure.Common.BorderColorBlue = 0.0;
5366+
TheStructure.Common.BorderColorAlpha = 0.0;
5367+
}
5368+
static tagSAMPLER_BORDER_COLOR_STATE sInit(void) {
5369+
SAMPLER_BORDER_COLOR_STATE state;
5370+
state.init();
5371+
return state;
5372+
}
5373+
inline uint32_t &getRawData(const uint32_t index) {
5374+
UNRECOVERABLE_IF(index >= 4);
5375+
return TheStructure.RawData[index];
5376+
}
5377+
inline void setBorderColorRed(const float value) {
5378+
TheStructure.Common.BorderColorRed = value;
5379+
}
5380+
inline float getBorderColorRed(void) const {
5381+
return TheStructure.Common.BorderColorRed;
5382+
}
5383+
inline void setBorderColorGreen(const float value) {
5384+
TheStructure.Common.BorderColorGreen = value;
5385+
}
5386+
inline float getBorderColorGreen(void) const {
5387+
return TheStructure.Common.BorderColorGreen;
5388+
}
5389+
inline void setBorderColorBlue(const float value) {
5390+
TheStructure.Common.BorderColorBlue = value;
5391+
}
5392+
inline float getBorderColorBlue(void) const {
5393+
return TheStructure.Common.BorderColorBlue;
5394+
}
5395+
inline void setBorderColorAlpha(const float value) {
5396+
TheStructure.Common.BorderColorAlpha = value;
5397+
}
5398+
inline float getBorderColorAlpha(void) const {
5399+
return TheStructure.Common.BorderColorAlpha;
5400+
}
5401+
} SAMPLER_BORDER_COLOR_STATE;
5402+
STATIC_ASSERT(16 == sizeof(SAMPLER_BORDER_COLOR_STATE));
5403+
53475404
#pragma pack()

shared/source/generated/gen12lp/hw_cmds_generated_gen12lp.inl

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6003,4 +6003,61 @@ typedef struct tagXY_FAST_COLOR_BLT {
60036003
} XY_FAST_COLOR_BLT;
60046004
STATIC_ASSERT(48 == sizeof(XY_FAST_COLOR_BLT));
60056005

6006+
typedef struct tagSAMPLER_BORDER_COLOR_STATE {
6007+
union tagTheStructure {
6008+
struct tagCommon {
6009+
// DWORD 0
6010+
float BorderColorRed;
6011+
// DWORD 1
6012+
float BorderColorGreen;
6013+
// DWORD 2
6014+
float BorderColorBlue;
6015+
// DWORD 3
6016+
float BorderColorAlpha;
6017+
} Common;
6018+
uint32_t RawData[4];
6019+
} TheStructure;
6020+
inline void init(void) {
6021+
memset(&TheStructure, 0, sizeof(TheStructure));
6022+
TheStructure.Common.BorderColorRed = 0.0;
6023+
TheStructure.Common.BorderColorGreen = 0.0;
6024+
TheStructure.Common.BorderColorBlue = 0.0;
6025+
TheStructure.Common.BorderColorAlpha = 0.0;
6026+
}
6027+
static tagSAMPLER_BORDER_COLOR_STATE sInit(void) {
6028+
SAMPLER_BORDER_COLOR_STATE state;
6029+
state.init();
6030+
return state;
6031+
}
6032+
inline uint32_t &getRawData(const uint32_t index) {
6033+
UNRECOVERABLE_IF(index >= 4);
6034+
return TheStructure.RawData[index];
6035+
}
6036+
inline void setBorderColorRed(const float value) {
6037+
TheStructure.Common.BorderColorRed = value;
6038+
}
6039+
inline float getBorderColorRed(void) const {
6040+
return TheStructure.Common.BorderColorRed;
6041+
}
6042+
inline void setBorderColorGreen(const float value) {
6043+
TheStructure.Common.BorderColorGreen = value;
6044+
}
6045+
inline float getBorderColorGreen(void) const {
6046+
return TheStructure.Common.BorderColorGreen;
6047+
}
6048+
inline void setBorderColorBlue(const float value) {
6049+
TheStructure.Common.BorderColorBlue = value;
6050+
}
6051+
inline float getBorderColorBlue(void) const {
6052+
return TheStructure.Common.BorderColorBlue;
6053+
}
6054+
inline void setBorderColorAlpha(const float value) {
6055+
TheStructure.Common.BorderColorAlpha = value;
6056+
}
6057+
inline float getBorderColorAlpha(void) const {
6058+
return TheStructure.Common.BorderColorAlpha;
6059+
}
6060+
} SAMPLER_BORDER_COLOR_STATE;
6061+
STATIC_ASSERT(16 == sizeof(SAMPLER_BORDER_COLOR_STATE));
6062+
60066063
#pragma pack()

0 commit comments

Comments
 (0)