Skip to content

Commit 34ad95a

Browse files
Handle printf output when implicit args are required
Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski <[email protected]>
1 parent 8b36473 commit 34ad95a

File tree

7 files changed, with 141 additions and 58 deletions.

level_zero/core/source/printf_handler/printf_handler.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,11 +29,13 @@ NEO::GraphicsAllocation *PrintfHandler::createPrintfBuffer(Device *device) {
2929
void PrintfHandler::printOutput(const KernelImmutableData *kernelData,
3030
NEO::GraphicsAllocation *printfBuffer, Device *device) {
3131
bool using32BitGpuPointers = kernelData->getDescriptor().kernelAttributes.gpuPointerSize == 4u;
32+
33+
auto usesStringMap = kernelData->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf || kernelData->getDescriptor().kernelAttributes.flags.requiresImplicitArgs;
3234
NEO::PrintFormatter printfFormatter{
3335
static_cast<uint8_t *>(printfBuffer->getUnderlyingBuffer()),
3436
static_cast<uint32_t>(printfBuffer->getUnderlyingBufferSize()),
3537
using32BitGpuPointers,
36-
kernelData->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf ? &kernelData->getDescriptor().kernelMetadata.printfStringsMap : nullptr};
38+
usesStringMap ? &kernelData->getDescriptor().kernelMetadata.printfStringsMap : nullptr};
3739
printfFormatter.printKernelOutput();
3840

3941
*reinterpret_cast<uint32_t *>(printfBuffer->getUnderlyingBuffer()) =

level_zero/core/test/unit_tests/fixtures/module_fixture.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,8 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
4242
};
4343

4444
struct MockImmutableData : KernelImmutableData {
45+
using KernelImmutableData::crossThreadDataSize;
46+
using KernelImmutableData::crossThreadDataTemplate;
4547
using KernelImmutableData::kernelDescriptor;
4648
using KernelImmutableData::kernelInfo;
4749
MockImmutableData(uint32_t perHwThreadPrivateMemorySize) {
@@ -113,6 +115,7 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
113115

114116
class MockKernel : public WhiteBox<L0::KernelImp> {
115117
public:
118+
using KernelImp::crossThreadData;
116119
using KernelImp::kernelArgHandlers;
117120
using KernelImp::kernelHasIndirectAccess;
118121
using KernelImp::privateMemoryGraphicsAllocation;

level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,11 @@
3232
#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
3333
#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
3434

35-
void NEO::populateKernelDescriptor(KernelDescriptor &dst, const PatchTokenBinary::KernelFromPatchtokens &src, uint32_t gpuPointerSizeInBytes);
35+
namespace NEO {
36+
void populatePointerKernelArg(ArgDescPointer &dst,
37+
CrossThreadDataOffset stateless, uint8_t pointerSize, SurfaceStateHeapOffset bindful, CrossThreadDataOffset bindless,
38+
KernelDescriptor::AddressingMode addressingMode);
39+
}
3640

3741
namespace L0 {
3842
namespace ult {
@@ -2123,6 +2127,36 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParam
21232127
EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, sizeof(ImplicitArgs)));
21242128
}
21252129

2130+
TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsAndPrintfStringsMapWhenPrintOutputThenProperStringIsPrinted) {
2131+
std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u);
2132+
2133+
auto kernelDescriptor = mockKernelImmData->kernelDescriptor;
2134+
kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true;
2135+
kernelDescriptor->kernelAttributes.flags.usesPrintf = false;
2136+
kernelDescriptor->kernelAttributes.flags.usesStringMapForPrintf = false;
2137+
std::string expectedString("test123");
2138+
kernelDescriptor->kernelMetadata.printfStringsMap.insert(std::make_pair(0u, expectedString));
2139+
2140+
createModuleFromBinary(0u, false, mockKernelImmData.get());
2141+
2142+
auto kernel = std::make_unique<MockKernel>(module.get());
2143+
2144+
ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC};
2145+
kernel->initialize(&kernelDesc);
2146+
2147+
auto printfAllocation = reinterpret_cast<uint32_t *>(kernel->getPrintfBufferAllocation()->getUnderlyingBuffer());
2148+
printfAllocation[0] = 8;
2149+
printfAllocation[1] = 0;
2150+
2151+
EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs);
2152+
ASSERT_NE(nullptr, kernel->getImplicitArgs());
2153+
2154+
testing::internal::CaptureStdout();
2155+
kernel->printPrintfOutput();
2156+
std::string output = testing::internal::GetCapturedStdout();
2157+
EXPECT_STREQ(expectedString.c_str(), output.c_str());
2158+
}
2159+
21262160
TEST_F(KernelImplicitArgTests, givenKernelWithoutImplicitArgsWhenPatchingImplicitArgsThenNothingHappens) {
21272161
std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u);
21282162
mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = false;

opencl/source/helpers/task_information.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -139,6 +139,7 @@ class CommandComputeKernel : public Command {
139139

140140
LinearStream *getCommandStream() override { return kernelOperation->commandStream.get(); }
141141
Kernel *peekKernel() const { return kernel; }
142+
PrintfHandler *peekPrintfHandler() const { return printfHandler.get(); }
142143

143144
protected:
144145
std::vector<Surface *> surfaces;

opencl/source/program/printf_handler.cpp

Lines changed: 23 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -61,16 +61,19 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo)
6161
device.getDevice(), printfSurface, 0, printfSurfaceInitialDataSizePtr.get(),
6262
sizeof(*printfSurfaceInitialDataSizePtr.get()));
6363

64-
const auto &printfSurfaceArg = kernel->getKernelInfo().kernelDescriptor.payloadMappings.implicitArgs.printfSurfaceAddress;
65-
auto printfPatchAddress = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getCrossThreadData()), printfSurfaceArg.stateless);
66-
patchWithRequiredSize(printfPatchAddress, printfSurfaceArg.pointerSize, (uintptr_t)printfSurface->getGpuAddressToPatch());
67-
if (isValidOffset(printfSurfaceArg.bindful)) {
68-
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()), printfSurfaceArg.bindful);
69-
void *addressToPatch = printfSurface->getUnderlyingBuffer();
70-
size_t sizeToPatch = printfSurface->getUnderlyingBufferSize();
71-
Buffer::setSurfaceState(&device.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, printfSurface, 0, 0,
72-
kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
73-
kernel->areMultipleSubDevicesInContext());
64+
if (kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesPrintf) {
65+
66+
const auto &printfSurfaceArg = kernel->getKernelInfo().kernelDescriptor.payloadMappings.implicitArgs.printfSurfaceAddress;
67+
auto printfPatchAddress = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getCrossThreadData()), printfSurfaceArg.stateless);
68+
patchWithRequiredSize(printfPatchAddress, printfSurfaceArg.pointerSize, (uintptr_t)printfSurface->getGpuAddressToPatch());
69+
if (isValidOffset(printfSurfaceArg.bindful)) {
70+
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()), printfSurfaceArg.bindful);
71+
void *addressToPatch = printfSurface->getUnderlyingBuffer();
72+
size_t sizeToPatch = printfSurface->getUnderlyingBufferSize();
73+
Buffer::setSurfaceState(&device.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, printfSurface, 0, 0,
74+
kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
75+
kernel->areMultipleSubDevicesInContext());
76+
}
7477
}
7578
auto pImplicitArgs = kernel->getImplicitArgs();
7679
if (pImplicitArgs) {
@@ -83,31 +86,28 @@ void PrintfHandler::makeResident(CommandStreamReceiver &commandStreamReceiver) {
8386
}
8487

8588
void PrintfHandler::printEnqueueOutput() {
89+
auto usesStringMap = kernel->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf || nullptr != kernel->getImplicitArgs();
8690
const auto &hwInfoConfig = *HwInfoConfig::get(device.getHardwareInfo().platform.eProductFamily);
91+
auto printfOutputBuffer = reinterpret_cast<const uint8_t *>(printfSurface->getUnderlyingBuffer());
92+
auto printfOutputSize = static_cast<uint32_t>(printfSurface->getUnderlyingBufferSize());
93+
std::unique_ptr<uint8_t[]> printfOutputDecompressed;
8794
if (hwInfoConfig.allowStatelessCompression(device.getHardwareInfo())) {
88-
auto printOutputSize = static_cast<uint32_t>(printfSurface->getUnderlyingBufferSize());
89-
auto printOutputDecompressed = std::make_unique<uint8_t[]>(printOutputSize);
95+
printfOutputDecompressed = std::make_unique<uint8_t[]>(printfOutputSize);
96+
printfOutputBuffer = printfOutputDecompressed.get();
9097
auto &bcsEngine = device.getEngine(EngineHelpers::getBcsEngineType(device.getHardwareInfo(), device.getDeviceBitfield(), device.getSelectorCopyEngine(), true), EngineUsage::Regular);
9198

9299
BlitPropertiesContainer blitPropertiesContainer;
93100
blitPropertiesContainer.push_back(
94101
BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr,
95102
*bcsEngine.commandStreamReceiver, printfSurface, nullptr,
96-
printOutputDecompressed.get(),
103+
printfOutputDecompressed.get(),
97104
printfSurface->getGpuAddress(),
98-
0, 0, 0, Vec3<size_t>(printOutputSize, 0, 0), 0, 0, 0, 0));
105+
0, 0, 0, Vec3<size_t>(printfOutputSize, 0, 0), 0, 0, 0, 0));
99106
bcsEngine.commandStreamReceiver->blitBuffer(blitPropertiesContainer, true, false, device.getDevice());
100-
101-
PrintFormatter printFormatter(printOutputDecompressed.get(), printOutputSize,
102-
kernel->is32Bit(),
103-
kernel->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf ? &kernel->getDescriptor().kernelMetadata.printfStringsMap : nullptr);
104-
printFormatter.printKernelOutput();
105-
return;
106107
}
107108

108-
PrintFormatter printFormatter(reinterpret_cast<const uint8_t *>(printfSurface->getUnderlyingBuffer()), static_cast<uint32_t>(printfSurface->getUnderlyingBufferSize()),
109-
kernel->is32Bit(),
110-
kernel->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf ? &kernel->getDescriptor().kernelMetadata.printfStringsMap : nullptr);
109+
PrintFormatter printFormatter(printfOutputBuffer, printfOutputSize, kernel->is32Bit(),
110+
usesStringMap ? &kernel->getDescriptor().kernelMetadata.printfStringsMap : nullptr);
111111
printFormatter.printKernelOutput();
112112
}
113113
} // namespace NEO

opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp

Lines changed: 75 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -623,49 +623,91 @@ HWCMDTEST_P(IGFX_GEN8_CORE, EnqueueKernelPrintfTest, GivenKernelWithPrintfBlocke
623623
}
624624

625625
HWTEST_P(EnqueueKernelPrintfTest, GivenKernelWithPrintfBlockedByEventWhenEventUnblockedThenOutputPrinted) {
626-
typedef typename FamilyType::PARSE PARSE;
626+
testing::internal::CaptureStdout();
627627

628-
// In scenarios with 32bit allocator and 64 bit tests this code won't work
629-
// due to inability to retrieve original buffer pointer as it is done in this test.
630-
auto memoryManager = pDevice->getMemoryManager();
631-
if (!memoryManager->peekForce32BitAllocations() && !memoryManager->isLimitedRange(0)) {
632-
testing::internal::CaptureStdout();
628+
auto userEvent = make_releaseable<UserEvent>(context);
633629

634-
auto userEvent = make_releaseable<UserEvent>(context);
630+
MockKernelWithInternals mockKernel(*pClDevice);
631+
mockKernel.kernelInfo.setPrintfSurface(sizeof(uintptr_t), 0);
632+
std::string testString = "test";
633+
mockKernel.kernelInfo.addToPrintfStringsMap(0, testString);
635634

636-
MockKernelWithInternals mockKernel(*pClDevice);
637-
mockKernel.kernelInfo.setPrintfSurface(sizeof(uintptr_t), 0);
638-
std::string testString = "test";
639-
mockKernel.kernelInfo.addToPrintfStringsMap(0, testString);
635+
cl_uint workDim = 1;
636+
size_t globalWorkOffset[3] = {0, 0, 0};
640637

641-
cl_uint workDim = 1;
642-
size_t globalWorkOffset[3] = {0, 0, 0};
638+
FillValues();
643639

644-
FillValues();
640+
cl_event blockedEvent = userEvent.get();
641+
cl_event outEvent{};
642+
auto retVal = pCmdQ->enqueueKernel(
643+
mockKernel,
644+
workDim,
645+
globalWorkOffset,
646+
globalWorkSize,
647+
localWorkSize,
648+
1,
649+
&blockedEvent,
650+
&outEvent);
645651

646-
cl_event blockedEvent = userEvent.get();
647-
auto retVal = pCmdQ->enqueueKernel(
648-
mockKernel,
649-
workDim,
650-
globalWorkOffset,
651-
globalWorkSize,
652-
localWorkSize,
653-
1,
654-
&blockedEvent,
655-
nullptr);
652+
ASSERT_EQ(CL_SUCCESS, retVal);
656653

657-
ASSERT_EQ(CL_SUCCESS, retVal);
654+
auto pOutEvent = castToObject<Event>(outEvent);
658655

659-
auto crossThreadData = reinterpret_cast<uint64_t *>(mockKernel.mockKernel->getCrossThreadData());
660-
auto printfAllocation = reinterpret_cast<uint32_t *>(*crossThreadData);
661-
printfAllocation[0] = 8;
662-
printfAllocation[1] = 0;
656+
auto printfAllocation = reinterpret_cast<uint32_t *>(static_cast<CommandComputeKernel *>(pOutEvent->peekCommand())->peekPrintfHandler()->getSurface()->getUnderlyingBuffer());
657+
printfAllocation[0] = 8;
658+
printfAllocation[1] = 0;
663659

664-
userEvent->setStatus(CL_COMPLETE);
660+
pOutEvent->release();
665661

666-
std::string output = testing::internal::GetCapturedStdout();
667-
EXPECT_STREQ("test", output.c_str());
668-
}
662+
userEvent->setStatus(CL_COMPLETE);
663+
664+
std::string output = testing::internal::GetCapturedStdout();
665+
EXPECT_STREQ("test", output.c_str());
666+
}
667+
668+
HWTEST_P(EnqueueKernelPrintfTest, GivenKernelWithImplicitArgsWithoutPrintfInParentKernelBlockedByEventWhenEventUnblockedThenOutputPrinted) {
669+
auto userEvent = make_releaseable<UserEvent>(context);
670+
671+
MockKernelWithInternals mockKernel(*pClDevice);
672+
std::string testString = "test";
673+
mockKernel.kernelInfo.addToPrintfStringsMap(0, testString);
674+
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.flags.usesPrintf = false;
675+
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.flags.usesStringMapForPrintf = false;
676+
mockKernel.mockKernel->pImplicitArgs = std::make_unique<ImplicitArgs>();
677+
*mockKernel.mockKernel->pImplicitArgs = {};
678+
679+
cl_uint workDim = 1;
680+
size_t globalWorkOffset[3] = {0, 0, 0};
681+
682+
FillValues();
683+
684+
cl_event blockedEvent = userEvent.get();
685+
cl_event outEvent{};
686+
auto retVal = pCmdQ->enqueueKernel(
687+
mockKernel,
688+
workDim,
689+
globalWorkOffset,
690+
globalWorkSize,
691+
localWorkSize,
692+
1,
693+
&blockedEvent,
694+
&outEvent);
695+
696+
ASSERT_EQ(CL_SUCCESS, retVal);
697+
698+
auto pOutEvent = castToObject<Event>(outEvent);
699+
700+
auto printfAllocation = reinterpret_cast<uint32_t *>(static_cast<CommandComputeKernel *>(pOutEvent->peekCommand())->peekPrintfHandler()->getSurface()->getUnderlyingBuffer());
701+
printfAllocation[0] = 8;
702+
printfAllocation[1] = 0;
703+
704+
pOutEvent->release();
705+
706+
testing::internal::CaptureStdout();
707+
userEvent->setStatus(CL_COMPLETE);
708+
std::string output = testing::internal::GetCapturedStdout();
709+
710+
EXPECT_STREQ("test", output.c_str());
669711
}
670712

671713
INSTANTIATE_TEST_CASE_P(EnqueueKernel,

opencl/test/unit_test/mocks/mock_kernel.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,7 @@ class MockKernel : public Kernel {
124124
using Kernel::parentEventOffset;
125125
using Kernel::patchBufferOffset;
126126
using Kernel::patchWithImplicitSurface;
127+
using Kernel::pImplicitArgs;
127128
using Kernel::preferredWkgMultipleOffset;
128129
using Kernel::privateSurface;
129130
using Kernel::singleSubdevicePreferredInCurrentEnqueue;

0 commit comments

Comments
 (0)