Skip to content

Commit 1b9b78a

Browse files
feature: implement ISA allocation pooling in OpenCL
Related-To: NEO-12287
Signed-off-by: Fabian Zwoliński <[email protected]>
1 parent a627594 commit 1b9b78a

25 files changed

+824
-87
lines changed

opencl/source/command_queue/hardware_interface_xehp_and_later.inl

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -124,8 +124,8 @@ inline void HardwareInterface<GfxFamily>::programWalker(
124124
auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
125125

126126
if constexpr (heaplessModeEnabled == false) {
127-
if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
128-
EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, 0, rootDeviceEnvironment);
127+
if (auto kernelAllocation = kernelInfo.getIsaGraphicsAllocation()) {
128+
EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, kernelInfo.getIsaOffsetInParentAllocation(), rootDeviceEnvironment);
129129
}
130130
}
131131

opencl/source/kernel/kernel.cpp

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -515,7 +515,7 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
515515
srcSize = getKernelHeapSize();
516516
break;
517517
case CL_KERNEL_BINARY_GPU_ADDRESS_INTEL:
518-
nonCannonizedGpuAddress = gmmHelper->decanonize(kernelInfo.kernelAllocation->getGpuAddress());
518+
nonCannonizedGpuAddress = gmmHelper->decanonize(kernelInfo.getIsaGraphicsAllocation()->getGpuAddress() + kernelInfo.getIsaOffsetInParentAllocation());
519519
pSrc = &nonCannonizedGpuAddress;
520520
srcSize = sizeof(nonCannonizedGpuAddress);
521521
break;
@@ -788,21 +788,23 @@ void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize)
788788
pKernelInfo->isKernelHeapSubstituted = true;
789789
auto memoryManager = executionEnvironment.memoryManager.get();
790790

791-
auto currentAllocationSize = pKernelInfo->kernelAllocation->getUnderlyingBufferSize();
791+
auto currentAllocationSize = pKernelInfo->getIsaSize();
792792
bool status = false;
793793
auto &rootDeviceEnvironment = clDevice.getRootDeviceEnvironment();
794794
auto &helper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
795795
size_t isaPadding = helper.getPaddingForISAAllocation();
796796

797+
DEBUG_BREAK_IF(nullptr != pKernelInfo->getIsaParentAllocation());
798+
797799
if (currentAllocationSize >= newKernelHeapSize + isaPadding) {
798800
auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
799-
auto useBlitter = productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *pKernelInfo->getGraphicsAllocation());
801+
auto useBlitter = productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *pKernelInfo->getIsaGraphicsAllocation());
800802
status = MemoryTransferHelper::transferMemoryToAllocation(useBlitter,
801-
clDevice.getDevice(), pKernelInfo->getGraphicsAllocation(), 0, newKernelHeap,
803+
clDevice.getDevice(), pKernelInfo->getIsaGraphicsAllocation(), 0, newKernelHeap,
802804
static_cast<size_t>(newKernelHeapSize));
803805
} else {
804-
memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation);
805-
pKernelInfo->kernelAllocation = nullptr;
806+
memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->getIsaGraphicsAllocation());
807+
pKernelInfo->setIsaPerKernelAllocation(nullptr);
806808
status = pKernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn);
807809
}
808810
UNRECOVERABLE_IF(!status);
@@ -1315,7 +1317,7 @@ void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
13151317
}
13161318
makeArgsResident(commandStreamReceiver);
13171319

1318-
auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
1320+
auto kernelIsaAllocation = this->kernelInfo.getIsaGraphicsAllocation();
13191321
if (kernelIsaAllocation) {
13201322
commandStreamReceiver.makeResident(*kernelIsaAllocation);
13211323
}
@@ -1381,8 +1383,8 @@ void Kernel::getResidency(std::vector<Surface *> &dst) {
13811383
}
13821384
}
13831385

1384-
auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
1385-
if (kernelIsaAllocation) {
1386+
if (auto kernelIsaAllocation = this->kernelInfo.getIsaGraphicsAllocation();
1387+
kernelIsaAllocation != nullptr) {
13861388
GeneralSurface *surface = new GeneralSurface(kernelIsaAllocation);
13871389
dst.push_back(surface);
13881390
}
@@ -1922,12 +1924,13 @@ bool Kernel::hasIndirectStatelessAccessToHostMemory() const {
19221924
}
19231925

19241926
uint64_t Kernel::getKernelStartAddress(const bool localIdsGenerationByRuntime, const bool kernelUsesLocalIds, const bool isCssUsed, const bool returnFullAddress) const {
1925-
19261927
uint64_t kernelStartOffset = 0;
19271928

1928-
if (kernelInfo.getGraphicsAllocation()) {
1929-
kernelStartOffset = returnFullAddress ? kernelInfo.getGraphicsAllocation()->getGpuAddress()
1930-
: kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
1929+
if (kernelInfo.getIsaGraphicsAllocation()) {
1930+
auto offsetInParentAllocation = kernelInfo.getIsaOffsetInParentAllocation();
1931+
kernelStartOffset = returnFullAddress ? kernelInfo.getIsaGraphicsAllocation()->getGpuAddress() + offsetInParentAllocation
1932+
: kernelInfo.getIsaGraphicsAllocation()->getGpuAddressToPatch() + offsetInParentAllocation;
1933+
19311934
if (localIdsGenerationByRuntime == false && kernelUsesLocalIds == true) {
19321935
kernelStartOffset += kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad;
19331936
}

opencl/source/program/process_device_binary.cpp

Lines changed: 23 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
#include "shared/source/helpers/compiler_product_helper.h"
1717
#include "shared/source/helpers/debug_helpers.h"
1818
#include "shared/source/helpers/file_io.h"
19+
#include "shared/source/helpers/gfx_core_helper.h"
1920
#include "shared/source/helpers/hw_info.h"
2021
#include "shared/source/helpers/ptr_math.h"
2122
#include "shared/source/helpers/string.h"
@@ -109,14 +110,16 @@ cl_int Program::linkBinary(Device *pDevice, const void *constantsInitData, size_
109110
exportedFunctionsKernelId = static_cast<size_t>(linkerInput->getExportedFunctionsSegmentId());
110111
// Exported functions reside in the instruction heap of one of the kernels
111112
auto exportedFunctionHeapId = linkerInput->getExportedFunctionsSegmentId();
112-
buildInfos[rootDeviceIndex].exportedFunctionsSurface = kernelInfoArray[exportedFunctionHeapId]->getGraphicsAllocation();
113+
buildInfos[rootDeviceIndex].exportedFunctionsSurface = kernelInfoArray[exportedFunctionHeapId]->getIsaGraphicsAllocation();
114+
auto offsetInParentAllocation = kernelInfoArray[exportedFunctionHeapId]->getIsaOffsetInParentAllocation();
115+
113116
auto &compilerProductHelper = pDevice->getCompilerProductHelper();
114117
if (compilerProductHelper.isHeaplessModeEnabled(pDevice->getHardwareInfo())) {
115-
exportedFunctions.gpuAddress = static_cast<uintptr_t>(buildInfos[rootDeviceIndex].exportedFunctionsSurface->getGpuAddress());
118+
exportedFunctions.gpuAddress = static_cast<uintptr_t>(buildInfos[rootDeviceIndex].exportedFunctionsSurface->getGpuAddress() + offsetInParentAllocation);
116119
} else {
117-
exportedFunctions.gpuAddress = static_cast<uintptr_t>(buildInfos[rootDeviceIndex].exportedFunctionsSurface->getGpuAddressToPatch());
120+
exportedFunctions.gpuAddress = static_cast<uintptr_t>(buildInfos[rootDeviceIndex].exportedFunctionsSurface->getGpuAddressToPatch() + offsetInParentAllocation);
118121
}
119-
exportedFunctions.segmentSize = buildInfos[rootDeviceIndex].exportedFunctionsSurface->getUnderlyingBufferSize();
122+
exportedFunctions.segmentSize = kernelInfoArray[exportedFunctionHeapId]->getIsaSize();
120123
}
121124
Linker::PatchableSegments isaSegmentsForPatching;
122125
std::vector<std::vector<char>> patchedIsaTempStorage;
@@ -128,8 +131,8 @@ cl_int Program::linkBinary(Device *pDevice, const void *constantsInitData, size_
128131
auto &kernHeapInfo = kernelInfo->heapInfo;
129132
const char *originalIsa = reinterpret_cast<const char *>(kernHeapInfo.pKernelHeap);
130133
patchedIsaTempStorage.push_back(std::vector<char>(originalIsa, originalIsa + kernHeapInfo.kernelHeapSize));
131-
DEBUG_BREAK_IF(nullptr == kernelInfo->getGraphicsAllocation());
132-
isaSegmentsForPatching.push_back(Linker::PatchableSegment{patchedIsaTempStorage.rbegin()->data(), static_cast<uintptr_t>(kernelInfo->getGraphicsAllocation()->getGpuAddressToPatch()), kernHeapInfo.kernelHeapSize});
134+
DEBUG_BREAK_IF(nullptr == kernelInfo->getIsaGraphicsAllocation());
135+
isaSegmentsForPatching.push_back(Linker::PatchableSegment{patchedIsaTempStorage.rbegin()->data(), static_cast<uintptr_t>(kernelInfo->getIsaGraphicsAllocation()->getGpuAddressToPatch() + kernelInfo->getIsaOffsetInParentAllocation()), kernHeapInfo.kernelHeapSize});
133136
kernelDescriptors.push_back(&kernelInfo->kernelDescriptor);
134137
}
135138
}
@@ -151,16 +154,8 @@ cl_int Program::linkBinary(Device *pDevice, const void *constantsInitData, size_
151154
updateBuildLog(pDevice->getRootDeviceIndex(), error.c_str(), error.size());
152155
return CL_INVALID_BINARY;
153156
} else if (linkerInput->getTraits().requiresPatchingOfInstructionSegments) {
154-
for (auto kernelId = 0u; kernelId < kernelInfoArray.size(); kernelId++) {
155-
const auto &kernelInfo = kernelInfoArray[kernelId];
156-
auto &kernHeapInfo = kernelInfo->heapInfo;
157-
auto segmentId = &kernelInfo - &kernelInfoArray[0];
158-
auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
159-
const auto &productHelper = pDevice->getProductHelper();
160-
MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *kernelInfo->getGraphicsAllocation()),
161-
*pDevice, kernelInfo->getGraphicsAllocation(), 0, isaSegmentsForPatching[segmentId].hostPointer,
162-
static_cast<size_t>(kernHeapInfo.kernelHeapSize));
163-
}
157+
[[maybe_unused]] auto success = transferIsaSegmentsToAllocation(pDevice, kernelInfoArray, &isaSegmentsForPatching, rootDeviceIndex);
158+
DEBUG_BREAK_IF(!success);
164159
}
165160
DBG_LOG(PrintRelocations, NEO::constructRelocationsDebugMessage(this->getSymbols(pDevice->getRootDeviceIndex())));
166161
return CL_SUCCESS;
@@ -328,17 +323,9 @@ cl_int Program::processProgramInfo(ProgramInfo &src, const ClDevice &clDevice) {
328323
}
329324
buildInfos[rootDeviceIndex].kernelMiscInfoPos = src.kernelMiscInfoPos;
330325

331-
for (auto &kernelInfo : kernelInfoArray) {
332-
cl_int retVal = CL_SUCCESS;
333-
if (kernelInfo->heapInfo.kernelHeapSize) {
334-
retVal = kernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn) ? CL_SUCCESS : CL_OUT_OF_HOST_MEMORY;
335-
}
336-
337-
if (retVal != CL_SUCCESS) {
338-
return retVal;
339-
}
340-
341-
kernelInfo->apply(deviceInfoConstants);
326+
if (auto retVal = setIsaGraphicsAllocations(clDevice.getDevice(), kernelInfoArray, deviceInfoConstants, rootDeviceIndex);
327+
retVal != CL_SUCCESS) {
328+
return retVal;
342329
}
343330

344331
indirectDetectionVersion = src.indirectDetectionVersion;
@@ -383,8 +370,16 @@ Zebin::Debug::Segments Program::getZebinSegments(uint32_t rootDeviceIndex) {
383370
buildInfos[rootDeviceIndex].constStringSectionData.size};
384371
std::vector<NEO::Zebin::Debug::Segments::KernelNameIsaTupleT> kernels;
385372
for (const auto &kernelInfo : buildInfos[rootDeviceIndex].kernelInfoArray) {
373+
NEO::Zebin::Debug::Segments::Segment segment;
374+
375+
if (kernelInfo->getIsaParentAllocation()) {
376+
segment.address = static_cast<uintptr_t>(kernelInfo->getIsaGraphicsAllocation()->getGpuAddress() + kernelInfo->getIsaOffsetInParentAllocation());
377+
segment.size = kernelInfo->getIsaSubAllocationSize();
378+
} else {
379+
segment.address = static_cast<uintptr_t>(kernelInfo->getIsaGraphicsAllocation()->getGpuAddress());
380+
segment.size = kernelInfo->getIsaGraphicsAllocation()->getUnderlyingBufferSize();
381+
}
386382

387-
NEO::Zebin::Debug::Segments::Segment segment = {static_cast<uintptr_t>(kernelInfo->getGraphicsAllocation()->getGpuAddress()), kernelInfo->getGraphicsAllocation()->getUnderlyingBufferSize()};
388383
kernels.push_back({kernelInfo->kernelDescriptor.kernelMetadata.kernelName, segment});
389384
}
390385
return Zebin::Debug::Segments(getGlobalSurface(rootDeviceIndex), getConstantSurface(rootDeviceIndex), strings, kernels);

0 commit comments

Comments
 (0)