Skip to content

Commit 26b036a

Browse files
Jaime Arteaga and Compute-Runtime-Automation
authored and committed
Improve zeCommandListAppendMemoryFill Performance (2)
Add missing kernel for remainder kernel when pattern size is 1. Signed-off-by: Jaime Arteaga <[email protected]>
1 parent a0db607 commit 26b036a

File tree

2 files changed

+37
-5
lines changed

2 files changed

+37
-5
lines changed

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1182,12 +1182,25 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
11821182

11831183
uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
11841184
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
1185-
res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(),
1186-
&dispatchFuncArgs, nullptr,
1187-
0, nullptr);
1185+
res = appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
11881186
if (res) {
11891187
return res;
11901188
}
1189+
1190+
uint32_t groupRemainderSizeX = static_cast<uint32_t>(size) % groupSizeX;
1191+
if (groupRemainderSizeX) {
1192+
builtinFunction->setGroupSize(groupRemainderSizeX, 1u, 1u);
1193+
ze_group_count_t dispatchFuncRemainderArgs{1u, 1u, 1u};
1194+
1195+
size_t dstOffset = dstAllocation.offset + (size - groupRemainderSizeX);
1196+
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
1197+
builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset);
1198+
1199+
res = appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncRemainderArgs, hSignalEvent);
1200+
if (res) {
1201+
return res;
1202+
}
1203+
}
11911204
} else {
11921205
auto builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);
11931206

level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp

Lines changed: 21 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ class AppendFillFixture : public DeviceFixture, public ::testing::Test {
6868

6969
void SetUp() override {
7070
dstPtr = new uint8_t[allocSize];
71+
immediateDstPtr = new uint8_t[allocSize];
7172

7273
neoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(NEO::defaultHwInfo.get());
7374
auto mockBuiltIns = new MockBuiltins();
@@ -80,20 +81,38 @@ class AppendFillFixture : public DeviceFixture, public ::testing::Test {
8081
}
8182

8283
// Test fixture teardown: frees the two byte buffers that SetUp allocates with new[].
void TearDown() override {
84+
// Buffer used by the single-byte ("immediate") pattern fill test.
delete[] immediateDstPtr;
8385
// Buffer used by the multi-byte pattern fill tests.
delete[] dstPtr;
8486
}
8587

8688
std::unique_ptr<Mock<MockDriverFillHandle>> driverHandle;
8789
NEO::MockDevice *neoDevice = nullptr;
8890
L0::Device *device = nullptr;
89-
static constexpr size_t allocSize = 512;
90-
static constexpr size_t patternSize = 4;
91+
static constexpr size_t allocSize = 70;
92+
static constexpr size_t patternSize = 8;
9193
uint8_t *dstPtr = nullptr;
9294
uint8_t pattern[patternSize] = {1, 2, 3, 4};
95+
96+
static constexpr size_t immediateAllocSize = 106;
97+
uint8_t immediatePattern = 4;
98+
uint8_t *immediateDstPtr = nullptr;
9399
};
94100

95101
using Platforms = IsAtLeastProduct<IGFX_SKYLAKE>;
96102

103+
// Calls appendMemoryFill with a one-byte pattern (sizeof(immediatePattern)) and a fill
// size of immediateAllocSize, and expects ZE_RESULT_SUCCESS.
// immediateAllocSize is 106; presumably chosen so the size is not a multiple of the
// kernel group size and the remainder-kernel path is taken -- TODO confirm, the group
// size is not visible in this diff.
// NOTE(review): SetUp allocates immediateDstPtr with allocSize (70) elements, but this
// test requests a fill of immediateAllocSize (106) bytes. Whether the fill actually
// writes to the buffer in this mocked setup is not visible here; the allocation should
// probably use immediateAllocSize -- confirm.
HWTEST2_F(AppendFillFixture,
104+
givenCallToAppendMemoryFillWithImmediateValueThenSuccessIsReturned, Platforms) {
105+
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
106+
107+
// Create and initialize a mock command list on the fixture's device.
auto commandList = std::make_unique<WhiteBox<MockCommandList<gfxCoreFamily>>>();
108+
commandList->initialize(device, NEO::EngineGroupType::RenderCompute);
109+
110+
// No signal event and no wait events are passed (nullptr, 0, nullptr).
auto result = commandList->appendMemoryFill(immediateDstPtr, &immediatePattern,
111+
sizeof(immediatePattern),
112+
immediateAllocSize, nullptr, 0, nullptr);
113+
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
114+
}
115+
97116
HWTEST2_F(AppendFillFixture,
98117
givenCallToAppendMemoryFillThenSuccessIsReturned, Platforms) {
99118
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;

0 commit comments

Comments
 (0)