Skip to content

Commit 7c6c45f

Browse files
jchodorCompute-Runtime-Automation
authored and committed
Add option to allocate private mem per dispatch
Signed-off-by: Jaroslaw Chodor <[email protected]> Signed-off-by: Krystian Chmielewski <[email protected]>
1 parent cf4972d commit 7c6c45f

File tree

12 files changed

+137
-20
lines changed

12 files changed

+137
-20
lines changed

level_zero/core/source/cmdlist/cmdlist.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ struct CommandList : _ze_command_list_handle_t {
259259
NEO::StreamProperties requiredStreamState{};
260260
NEO::StreamProperties finalStreamState{};
261261
CommandsToPatch commandsToPatch{};
262+
263+
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
262264
};
263265

264266
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ inline ze_result_t parseErrorCode(NEO::ErrorCode returnValue) {
5858
template <GFXCORE_FAMILY gfxCoreFamily>
5959
CommandListCoreFamily<gfxCoreFamily>::~CommandListCoreFamily() {
6060
clearCommandsToPatch();
61+
for (auto alloc : this->ownedPrivateAllocations) {
62+
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
63+
}
64+
this->ownedPrivateAllocations.clear();
6165
}
6266

6367
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -98,6 +102,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
98102
programThreadArbitrationPolicy(device);
99103
}
100104

105+
for (auto alloc : this->ownedPrivateAllocations) {
106+
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
107+
}
108+
this->ownedPrivateAllocations.clear();
109+
101110
return ZE_RESULT_SUCCESS;
102111
}
103112

level_zero/core/source/cmdlist/cmdlist_hw_base.inl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
4141
bool isPredicate,
4242
bool isCooperative) {
4343
const auto kernel = Kernel::fromHandle(hKernel);
44+
const auto &kernelDescriptor = kernel->getKernelDescriptor();
4445
UNRECOVERABLE_IF(kernel == nullptr);
4546
appendEventForProfiling(hEvent, true);
4647
const auto functionImmutableData = kernel->getImmutableData();
@@ -56,6 +57,14 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
5657

5758
kernel->patchGlobalOffset();
5859

60+
if (kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize != 0U &&
61+
nullptr == kernel->getPrivateMemoryGraphicsAllocation()) {
62+
auto privateMemoryGraphicsAllocation = kernel->allocatePrivateMemoryGraphicsAllocation();
63+
kernel->patchCrossthreadDataWithPrivateAllocation(privateMemoryGraphicsAllocation);
64+
this->commandContainer.addToResidencyContainer(privateMemoryGraphicsAllocation);
65+
this->ownedPrivateAllocations.push_back(privateMemoryGraphicsAllocation);
66+
}
67+
5968
if (!isIndirect) {
6069
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
6170
pThreadGroupDimensions->groupCountY,

level_zero/core/source/kernel/kernel.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,11 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
137137
virtual bool usesSyncBuffer() = 0;
138138
virtual void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
139139

140+
virtual NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() = 0;
141+
virtual void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) = 0;
142+
143+
virtual NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() = 0;
144+
140145
Kernel() = default;
141146
Kernel(const Kernel &) = delete;
142147
Kernel(Kernel &&) = delete;

level_zero/core/source/kernel/kernel_imp.cpp

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,36 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
686686
return ZE_RESULT_SUCCESS;
687687
}
688688

689+
// Allocates a PRIVATE_SURFACE graphics allocation sized for this kernel.
//
// The size comes from NEO::KernelHelper::getPrivateSurfaceSize(), fed with the
// kernel's perHwThreadPrivateMemorySize and the device's
// computeUnitsUsedForScratch. A zero size or a failed allocation is treated as
// unrecoverable. The allocation is returned to the caller and is not stored on
// the kernel by this function.
NEO::GraphicsAllocation *KernelImp::allocatePrivateMemoryGraphicsAllocation() {
    const auto perHwThreadSize = kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize;
    auto neoDevice = module->getDevice()->getNEODevice();
    const auto scratchComputeUnits = neoDevice->getDeviceInfo().computeUnitsUsedForScratch;

    auto surfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(perHwThreadSize, scratchComputeUnits);
    UNRECOVERABLE_IF(surfaceSize == 0);

    auto memoryManager = neoDevice->getMemoryManager();
    auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(
        {neoDevice->getRootDeviceIndex(),
         surfaceSize,
         NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
         neoDevice->getDeviceBitfield()});
    UNRECOVERABLE_IF(allocation == nullptr);

    return allocation;
}
703+
704+
// Patches this kernel's cross-thread data and surface state heap so that the
// implicit private-memory argument points at `privateAllocation`.
//
// privateAllocation: the allocation to patch in; must not be null. It may be
//     the kernel-owned allocation assigned in initialize(), or an allocation
//     created for a single dispatch by a command list.
//
// Fix: the previous body ignored the parameter and dereferenced the member
// `privateMemoryGraphicsAllocation`, which is null whenever private memory is
// allocated per dispatch. That was a null dereference, and it also pushed a
// null entry into residencyContainer.
void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) {
    UNRECOVERABLE_IF(privateAllocation == nullptr);

    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
    auto neoDevice = module->getDevice()->getNEODevice();

    ArrayRef<uint8_t> crossThredDataArrayRef = ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize);
    ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);

    patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                             static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                             *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
                             *neoDevice, kernelAttributes.flags.useGlobalAtomics);

    // Only the kernel-owned allocation belongs in the kernel's residency
    // container. A per-dispatch allocation is made resident and later freed by
    // the command list that created it, so storing it here would leave a
    // dangling pointer behind.
    if (privateAllocation == this->privateMemoryGraphicsAllocation) {
        this->residencyContainer.push_back(privateAllocation);
    }
}
718+
689719
ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
690720
this->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
691721
if (this->kernelImmData == nullptr) {
@@ -776,25 +806,9 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
776806

777807
auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
778808
auto neoDevice = module->getDevice()->getNEODevice();
779-
if (kernelAttributes.perHwThreadPrivateMemorySize != 0) {
780-
auto privateSurfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(kernelAttributes.perHwThreadPrivateMemorySize,
781-
neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
782-
783-
UNRECOVERABLE_IF(privateSurfaceSize == 0);
784-
this->privateMemoryGraphicsAllocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
785-
{neoDevice->getRootDeviceIndex(), privateSurfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, neoDevice->getDeviceBitfield()});
786-
787-
UNRECOVERABLE_IF(this->privateMemoryGraphicsAllocation == nullptr);
788-
789-
ArrayRef<uint8_t> crossThredDataArrayRef = ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize);
790-
ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);
791-
792-
patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
793-
static_cast<uintptr_t>(privateMemoryGraphicsAllocation->getGpuAddressToPatch()),
794-
*privateMemoryGraphicsAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
795-
*neoDevice, kernelAttributes.flags.useGlobalAtomics);
796-
797-
this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
809+
if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
810+
this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
811+
this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
798812
}
799813

800814
this->createPrintfBuffer();

level_zero/core/source/kernel/kernel_imp.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,13 @@ struct KernelImp : Kernel {
143143
return kernelHasIndirectAccess;
144144
}
145145

146+
NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() override;
147+
void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) override;
148+
149+
// Returns the privateMemoryGraphicsAllocation member as-is (may be nullptr).
// initialize() assigns it only when the kernel uses private memory and the
// module does not request per-dispatch private allocations.
NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() override {
150+
return privateMemoryGraphicsAllocation;
151+
}
152+
146153
protected:
147154
KernelImp() = default;
148155

level_zero/core/source/module/module.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ struct Module : _ze_module_handle_t {
5050
virtual const std::vector<std::unique_ptr<KernelImmutableData>> &getKernelImmutableDataVector() const = 0;
5151
virtual uint32_t getMaxGroupSize() const = 0;
5252
virtual bool isDebugEnabled() const = 0;
53+
virtual bool shouldAllocatePrivateMemoryPerDispatch() const = 0;
54+
virtual void checkIfPrivateMemoryPerDispatchIsNeeded() = 0;
5355

5456
Module() = default;
5557
Module(const Module &) = delete;

level_zero/core/source/module/module_imp.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "shared/source/device_binary_format/device_binary_formats.h"
1414
#include "shared/source/helpers/api_specific_config.h"
1515
#include "shared/source/helpers/constants.h"
16+
#include "shared/source/helpers/kernel_helpers.h"
1617
#include "shared/source/helpers/string.h"
1718
#include "shared/source/memory_manager/memory_manager.h"
1819
#include "shared/source/memory_manager/unified_memory_manager.h"
@@ -383,6 +384,8 @@ bool ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neoDevice)
383384
}
384385
this->maxGroupSize = static_cast<uint32_t>(this->translationUnit->device->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
385386

387+
checkIfPrivateMemoryPerDispatchIsNeeded();
388+
386389
if (debugEnabled) {
387390
if (device->getSourceLevelDebugger()) {
388391
for (auto kernelInfo : this->translationUnit->programInfo.kernelInfos) {
@@ -642,6 +645,24 @@ void ModuleImp::verifyDebugCapabilities() {
642645
debugEnabled = debugCapabilities;
643646
}
644647

648+
void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
649+
size_t modulePrivateMemorySize = 0;
650+
for (auto &kernelImmData : this->kernelImmDatas) {
651+
if (0 == kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) {
652+
continue;
653+
}
654+
auto kernelPrivateMemorySize = NEO::KernelHelper::getPrivateSurfaceSize(kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize,
655+
this->device->getNEODevice()->getDeviceInfo().computeUnitsUsedForScratch);
656+
modulePrivateMemorySize += kernelPrivateMemorySize;
657+
}
658+
659+
this->allocatePrivateMemoryPerDispatch = false;
660+
if (modulePrivateMemorySize > 0U) {
661+
auto globalMemorySize = device->getNEODevice()->getRootDevice()->getGlobalMemorySize(static_cast<uint32_t>(device->getNEODevice()->getDeviceBitfield().to_ulong()));
662+
this->allocatePrivateMemoryPerDispatch = modulePrivateMemorySize > globalMemorySize;
663+
}
664+
}
665+
645666
ze_result_t ModuleImp::getProperties(ze_module_properties_t *pModuleProperties) {
646667

647668
pModuleProperties->flags = 0;

level_zero/core/source/module/module_imp.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,13 +114,18 @@ struct ModuleImp : public Module {
114114

115115
bool isDebugEnabled() const override;
116116

117+
// Returns the allocatePrivateMemoryPerDispatch flag, which is computed by
// checkIfPrivateMemoryPerDispatchIsNeeded().
bool shouldAllocatePrivateMemoryPerDispatch() const override {
118+
return allocatePrivateMemoryPerDispatch;
119+
}
120+
117121
// Returns the raw pointer obtained from the translationUnit member via get().
ModuleTranslationUnit *getTranslationUnit() {
118122
return this->translationUnit.get();
119123
}
120124

121125
protected:
122126
void copyPatchedSegments(const NEO::Linker::PatchableSegments &isaSegmentsForPatching);
123127
void verifyDebugCapabilities();
128+
void checkIfPrivateMemoryPerDispatchIsNeeded() override;
124129

125130
Device *device = nullptr;
126131
PRODUCT_FAMILY productFamily{};
@@ -132,6 +137,7 @@ struct ModuleImp : public Module {
132137
NEO::Linker::RelocatedSymbolsMap symbols;
133138
bool debugEnabled = false;
134139
bool isFullyLinked = false;
140+
bool allocatePrivateMemoryPerDispatch = true;
135141
ModuleType type;
136142
NEO::Linker::UnresolvedExternals unresolvedExternalsInfo{};
137143
std::set<NEO::GraphicsAllocation *> importedSymbolAllocations{};

level_zero/core/test/unit_tests/fixtures/module_fixture.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
102102
// Test override: ignores functionName and always returns mockKernelImmData.
const KernelImmutableData *getKernelImmutableData(const char *functionName) const override {
103103
return mockKernelImmData;
104104
}
105+
106+
// Test override: copies the mock kernel's perHwThreadPrivateMemorySize into the
// first entry of kernelImmDatas, then runs the real ModuleImp check so it sees
// the mocked size.
void checkIfPrivateMemoryPerDispatchIsNeeded() override {
    const auto mockedPrivateSize = mockKernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize;
    // The descriptor is only exposed as const, so constness is cast away here.
    auto &firstDescriptor = const_cast<KernelDescriptor &>(kernelImmDatas[0]->getDescriptor());
    firstDescriptor.kernelAttributes.perHwThreadPrivateMemorySize = mockedPrivateSize;
    ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded();
}
110+
105111
MockImmutableData *mockKernelImmData = nullptr;
106112
};
107113

0 commit comments

Comments
 (0)