Skip to content

Commit 479d01c

Browse files
Jaime Arteaga and Compute-Runtime-Automation
authored and committed
Improve zeCommandListAppendMemoryFill Performance
Improve L0 fill operations by copying the pattern with two kernels: one that copies four bytes at a time, and one that takes care of the remainder. Additionally, the pattern is replicated into a new allocation that spans at least one cache line. Signed-off-by: Jaime Arteaga <[email protected]>
1 parent 444b959 commit 479d01c

File tree

8 files changed

+300
-254
lines changed

8 files changed

+300
-254
lines changed

level_zero/core/source/builtin/builtin_functions_lib.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2019-2020 Intel Corporation
2+
* Copyright (C) 2019-2021 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -26,6 +26,8 @@ enum class Builtin : uint32_t {
2626
CopyBufferToBufferSide,
2727
FillBufferImmediate,
2828
FillBufferSSHOffset,
29+
FillBufferMiddle,
30+
FillBufferRightLeftover,
2931
QueryKernelTimestamps,
3032
QueryKernelTimestampsWithOffsets,
3133
COUNT

level_zero/core/source/builtin/builtin_functions_lib_impl.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2019-2020 Intel Corporation
2+
* Copyright (C) 2019-2021 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -49,6 +49,14 @@ void BuiltinFunctionsLibImpl::initFunctions() {
4949
builtinName = "FillBufferSSHOffset";
5050
builtin = NEO::EBuiltInOps::FillBuffer;
5151
break;
52+
case Builtin::FillBufferMiddle:
53+
builtinName = "FillBufferMiddle";
54+
builtin = NEO::EBuiltInOps::FillBuffer;
55+
break;
56+
case Builtin::FillBufferRightLeftover:
57+
builtinName = "FillBufferRightLeftover";
58+
builtin = NEO::EBuiltInOps::FillBuffer;
59+
break;
5260
case Builtin::QueryKernelTimestamps:
5361
builtinName = "QueryKernelTimestamps";
5462
builtin = NEO::EBuiltInOps::QueryKernelTimestamps;

level_zero/core/source/cmdlist/cmdlist.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2019-2020 Intel Corporation
2+
* Copyright (C) 2019-2021 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -190,6 +190,9 @@ struct CommandList : _ze_command_list_handle_t {
190190
virtual ~CommandList();
191191
NEO::CommandContainer commandContainer;
192192
bool getContainsStatelessUncachedResource() { return containsStatelessUncachedResource; }
193+
std::map<const void *, NEO::GraphicsAllocation *> &getHostPtrMap() {
194+
return hostPtrMap;
195+
};
193196

194197
protected:
195198
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;

level_zero/core/source/cmdlist/cmdlist_hw.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,11 +199,11 @@ struct CommandListCoreFamily : CommandListImp {
199199
size_t bytesPerPixel, Vec3<size_t> copySize,
200200
Vec3<uint32_t> srcSize, Vec3<uint32_t> dstSize, ze_event_handle_t hSignalEvent);
201201

202-
ze_result_t appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
203-
const ze_group_count_t *pThreadGroupDimensions,
204-
ze_event_handle_t hEvent,
205-
bool isIndirect,
206-
bool isPredicate);
202+
MOCKABLE_VIRTUAL ze_result_t appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
203+
const ze_group_count_t *pThreadGroupDimensions,
204+
ze_event_handle_t hEvent,
205+
bool isIndirect,
206+
bool isPredicate);
207207
ze_result_t appendLaunchKernelSplit(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent);
208208
ze_result_t prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions);
209209

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 82 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,9 +1135,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
11351135
return appendBlitFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents);
11361136
}
11371137

1138-
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
1139-
if (ret) {
1140-
return ret;
1138+
ze_result_t res = addEventsToCmdList(numWaitEvents, phWaitEvents);
1139+
if (res) {
1140+
return res;
11411141
}
11421142

11431143
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -1159,20 +1159,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
11591159
}
11601160

11611161
auto dstAllocation = this->getAlignedAllocation(this->device, ptr, size);
1162-
1163-
uintptr_t srcPtr = reinterpret_cast<uintptr_t>(const_cast<void *>(pattern));
1164-
size_t srcOffset = 0;
1165-
NEO::EncodeSurfaceState<GfxFamily>::getSshAlignedPointer(srcPtr, srcOffset);
1166-
11671162
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
11681163

1169-
Kernel *builtinFunction = nullptr;
1170-
uint32_t groupSizeX = 1u;
1171-
11721164
if (patternSize == 1) {
1173-
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate);
1165+
auto builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate);
11741166

1175-
groupSizeX = builtinFunction->getImmutableData()->getDescriptor().kernelAttributes.simdSize;
1167+
uint32_t groupSizeX = builtinFunction->getImmutableData()->getDescriptor().kernelAttributes.simdSize;
11761168
if (groupSizeX > static_cast<uint32_t>(size)) {
11771169
groupSizeX = static_cast<uint32_t>(size);
11781170
}
@@ -1186,50 +1178,92 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
11861178
builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
11871179
builtinFunction->setArgumentValue(2, sizeof(value), &value);
11881180

1189-
} else {
1190-
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferSSHOffset);
1181+
appendEventForProfilingAllWalkers(hSignalEvent, true);
11911182

1192-
auto patternAlloc = this->getAlignedAllocation(this->device, reinterpret_cast<void *>(srcPtr), srcOffset + patternSize);
1193-
if (patternAlloc.alloc == nullptr) {
1194-
DEBUG_BREAK_IF(true);
1195-
return ZE_RESULT_ERROR_UNKNOWN;
1183+
uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
1184+
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
1185+
res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(),
1186+
&dispatchFuncArgs, nullptr,
1187+
0, nullptr);
1188+
if (res) {
1189+
return res;
11961190
}
1197-
srcOffset += patternAlloc.offset;
1198-
1199-
groupSizeX = static_cast<uint32_t>(std::min(patternSize, size));
1200-
if (builtinFunction->setGroupSize(groupSizeX, 1u, 1u)) {
1201-
DEBUG_BREAK_IF(true);
1202-
return ZE_RESULT_ERROR_UNKNOWN;
1191+
} else {
1192+
auto builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);
1193+
1194+
size_t middleElSize = sizeof(uint32_t);
1195+
size_t adjustedSize = size / middleElSize;
1196+
uint32_t groupSizeX = static_cast<uint32_t>(adjustedSize);
1197+
uint32_t groupSizeY = 1, groupSizeZ = 1;
1198+
builtinFunction->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
1199+
builtinFunction->setGroupSize(groupSizeX, groupSizeY, groupSizeZ);
1200+
1201+
uint32_t groups = static_cast<uint32_t>(adjustedSize) / groupSizeX;
1202+
uint32_t groupRemainderSizeX = static_cast<uint32_t>(size) % groupSizeX;
1203+
1204+
size_t patternAllocationSize = alignUp(patternSize, MemoryConstants::cacheLineSize);
1205+
uint32_t patternSizeInEls = static_cast<uint32_t>(patternAllocationSize / middleElSize);
1206+
1207+
auto patternGfxAlloc = getAllocationFromHostPtrMap(pattern, patternAllocationSize);
1208+
if (patternGfxAlloc == nullptr) {
1209+
patternGfxAlloc = device->getDriverHandle()->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getNEODevice()->getRootDeviceIndex(),
1210+
patternAllocationSize,
1211+
NEO::GraphicsAllocation::AllocationType::FILL_PATTERN,
1212+
device->getNEODevice()->getDeviceBitfield()});
1213+
hostPtrMap.insert(std::make_pair(pattern, patternGfxAlloc));
12031214
}
1215+
void *patternGfxAllocPtr = patternGfxAlloc->getUnderlyingBuffer();
12041216

1205-
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
1206-
builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
1207-
builtinFunction->setArgBufferWithAlloc(2, patternAlloc.alignedAllocationPtr,
1208-
patternAlloc.alloc);
1209-
builtinFunction->setArgumentValue(3, sizeof(srcOffset), &srcOffset);
1210-
}
1217+
uint64_t patternAllocPtr = reinterpret_cast<uintptr_t>(patternGfxAllocPtr);
1218+
uint64_t patternAllocOffset = 0;
1219+
uint64_t patternSizeToCopy = patternSize;
1220+
do {
1221+
memcpy_s(reinterpret_cast<void *>(patternAllocPtr + patternAllocOffset),
1222+
patternSizeToCopy, pattern, patternSizeToCopy);
12111223

1212-
appendEventForProfilingAllWalkers(hSignalEvent, true);
1213-
1214-
uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
1215-
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
1224+
if ((patternAllocOffset + patternSizeToCopy) > patternAllocationSize) {
1225+
patternSizeToCopy = patternAllocationSize - patternAllocOffset;
1226+
}
12161227

1217-
ze_result_t res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
1228+
patternAllocOffset += patternSizeToCopy;
1229+
} while (patternAllocOffset < patternAllocationSize);
12181230

1219-
if (res) {
1220-
return res;
1221-
}
1231+
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
1232+
builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
1233+
builtinFunction->setArgBufferWithAlloc(2, reinterpret_cast<uintptr_t>(patternGfxAllocPtr), patternGfxAlloc);
1234+
builtinFunction->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls);
12221235

1223-
uint32_t groupRemainderSizeX = static_cast<uint32_t>(size) % groupSizeX;
1224-
if (groupRemainderSizeX) {
1225-
builtinFunction->setGroupSize(groupRemainderSizeX, 1u, 1u);
1226-
ze_group_count_t dispatchFuncArgs{1u, 1u, 1u};
1236+
appendEventForProfilingAllWalkers(hSignalEvent, true);
12271237

1228-
size_t dstOffset = dstAllocation.offset + (size - groupRemainderSizeX);
1229-
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
1230-
builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset);
1238+
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
1239+
res = appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
1240+
if (res) {
1241+
return res;
1242+
}
12311243

1232-
res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
1244+
if (groupRemainderSizeX) {
1245+
uint32_t dstOffsetRemainder = groups * groupSizeX * static_cast<uint32_t>(middleElSize);
1246+
uint64_t patternOffsetRemainder = (groupSizeX * groups & (patternSizeInEls - 1)) * middleElSize;
1247+
1248+
auto builtinFunctionRemainder = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferRightLeftover);
1249+
builtinFunctionRemainder->setGroupSize(groupRemainderSizeX, 1u, 1u);
1250+
ze_group_count_t dispatchFuncArgs{1u, 1u, 1u};
1251+
1252+
builtinFunctionRemainder->setArgBufferWithAlloc(0,
1253+
dstAllocation.alignedAllocationPtr,
1254+
dstAllocation.alloc);
1255+
builtinFunctionRemainder->setArgumentValue(1,
1256+
sizeof(dstOffsetRemainder),
1257+
&dstOffsetRemainder);
1258+
builtinFunctionRemainder->setArgBufferWithAlloc(2,
1259+
reinterpret_cast<uintptr_t>(patternGfxAllocPtr) + patternOffsetRemainder,
1260+
patternGfxAlloc);
1261+
builtinFunctionRemainder->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls);
1262+
res = appendLaunchKernelSplit(builtinFunctionRemainder->toHandle(), &dispatchFuncArgs, hSignalEvent);
1263+
if (res) {
1264+
return res;
1265+
}
1266+
}
12331267
}
12341268

12351269
appendEventForProfilingAllWalkers(hSignalEvent, false);
@@ -1488,7 +1522,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(ze_event_hand
14881522
appendWriteKernelTimestamp(hEvent, beforeWalker, true);
14891523
} else {
14901524

1491-
NEO::PipeControlArgs args;
1525+
NEO::PipeControlArgs args = {};
14921526
args.dcFlushEnable = true;
14931527

14941528
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);

level_zero/core/test/unit_tests/sources/cmdlist/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (C) 2020 Intel Corporation
2+
# Copyright (C) 2020-2021 Intel Corporation
33
#
44
# SPDX-License-Identifier: MIT
55
#
@@ -16,5 +16,6 @@ target_sources(${TARGET_NAME} PRIVATE
1616
${CMAKE_CURRENT_SOURCE_DIR}/test_cmdlist_append_signal_event.cpp
1717
${CMAKE_CURRENT_SOURCE_DIR}/test_cmdlist_append_wait_on_events.cpp
1818
${CMAKE_CURRENT_SOURCE_DIR}/test_cmdlist_blit.cpp
19+
${CMAKE_CURRENT_SOURCE_DIR}/test_cmdlist_fill.cpp
1920
)
2021
add_subdirectories()

0 commit comments

Comments
 (0)