@@ -1135,9 +1135,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
11351135 return appendBlitFill (ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents);
11361136 }
11371137
1138- ze_result_t ret = addEventsToCmdList (numWaitEvents, phWaitEvents);
1139- if (ret ) {
1140- return ret ;
1138+ ze_result_t res = addEventsToCmdList (numWaitEvents, phWaitEvents);
1139+ if (res ) {
1140+ return res ;
11411141 }
11421142
11431143 using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -1159,20 +1159,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
11591159 }
11601160
11611161 auto dstAllocation = this ->getAlignedAllocation (this ->device , ptr, size);
1162-
1163- uintptr_t srcPtr = reinterpret_cast <uintptr_t >(const_cast <void *>(pattern));
1164- size_t srcOffset = 0 ;
1165- NEO::EncodeSurfaceState<GfxFamily>::getSshAlignedPointer (srcPtr, srcOffset);
1166-
11671162 auto lock = device->getBuiltinFunctionsLib ()->obtainUniqueOwnership ();
11681163
1169- Kernel *builtinFunction = nullptr ;
1170- uint32_t groupSizeX = 1u ;
1171-
11721164 if (patternSize == 1 ) {
1173- builtinFunction = device->getBuiltinFunctionsLib ()->getFunction (Builtin::FillBufferImmediate);
1165+ auto builtinFunction = device->getBuiltinFunctionsLib ()->getFunction (Builtin::FillBufferImmediate);
11741166
1175- groupSizeX = builtinFunction->getImmutableData ()->getDescriptor ().kernelAttributes .simdSize ;
1167+ uint32_t groupSizeX = builtinFunction->getImmutableData ()->getDescriptor ().kernelAttributes .simdSize ;
11761168 if (groupSizeX > static_cast <uint32_t >(size)) {
11771169 groupSizeX = static_cast <uint32_t >(size);
11781170 }
@@ -1186,50 +1178,92 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
11861178 builtinFunction->setArgumentValue (1 , sizeof (dstAllocation.offset ), &dstAllocation.offset );
11871179 builtinFunction->setArgumentValue (2 , sizeof (value), &value);
11881180
1189- } else {
1190- builtinFunction = device->getBuiltinFunctionsLib ()->getFunction (Builtin::FillBufferSSHOffset);
1181+ appendEventForProfilingAllWalkers (hSignalEvent, true );
11911182
1192- auto patternAlloc = this ->getAlignedAllocation (this ->device , reinterpret_cast <void *>(srcPtr), srcOffset + patternSize);
1193- if (patternAlloc.alloc == nullptr ) {
1194- DEBUG_BREAK_IF (true );
1195- return ZE_RESULT_ERROR_UNKNOWN;
1183+ uint32_t groups = static_cast <uint32_t >(size) / groupSizeX;
1184+ ze_group_count_t dispatchFuncArgs{groups, 1u , 1u };
1185+ res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel (builtinFunction->toHandle (),
1186+ &dispatchFuncArgs, nullptr ,
1187+ 0 , nullptr );
1188+ if (res) {
1189+ return res;
11961190 }
1197- srcOffset += patternAlloc.offset ;
1198-
1199- groupSizeX = static_cast <uint32_t >(std::min (patternSize, size));
1200- if (builtinFunction->setGroupSize (groupSizeX, 1u , 1u )) {
1201- DEBUG_BREAK_IF (true );
1202- return ZE_RESULT_ERROR_UNKNOWN;
1191+ } else {
1192+ auto builtinFunction = device->getBuiltinFunctionsLib ()->getFunction (Builtin::FillBufferMiddle);
1193+
1194+ size_t middleElSize = sizeof (uint32_t );
1195+ size_t adjustedSize = size / middleElSize;
1196+ uint32_t groupSizeX = static_cast <uint32_t >(adjustedSize);
1197+ uint32_t groupSizeY = 1 , groupSizeZ = 1 ;
1198+ builtinFunction->suggestGroupSize (groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
1199+ builtinFunction->setGroupSize (groupSizeX, groupSizeY, groupSizeZ);
1200+
1201+ uint32_t groups = static_cast <uint32_t >(adjustedSize) / groupSizeX;
1202+ uint32_t groupRemainderSizeX = static_cast <uint32_t >(size) % groupSizeX;
1203+
1204+ size_t patternAllocationSize = alignUp (patternSize, MemoryConstants::cacheLineSize);
1205+ uint32_t patternSizeInEls = static_cast <uint32_t >(patternAllocationSize / middleElSize);
1206+
1207+ auto patternGfxAlloc = getAllocationFromHostPtrMap (pattern, patternAllocationSize);
1208+ if (patternGfxAlloc == nullptr ) {
1209+ patternGfxAlloc = device->getDriverHandle ()->getMemoryManager ()->allocateGraphicsMemoryWithProperties ({device->getNEODevice ()->getRootDeviceIndex (),
1210+ patternAllocationSize,
1211+ NEO::GraphicsAllocation::AllocationType::FILL_PATTERN,
1212+ device->getNEODevice ()->getDeviceBitfield ()});
1213+ hostPtrMap.insert (std::make_pair (pattern, patternGfxAlloc));
12031214 }
1215+ void *patternGfxAllocPtr = patternGfxAlloc->getUnderlyingBuffer ();
12041216
1205- builtinFunction-> setArgBufferWithAlloc ( 0 , dstAllocation. alignedAllocationPtr , dstAllocation. alloc );
1206- builtinFunction-> setArgumentValue ( 1 , sizeof (dstAllocation. offset ), &dstAllocation. offset ) ;
1207- builtinFunction-> setArgBufferWithAlloc ( 2 , patternAlloc. alignedAllocationPtr ,
1208- patternAlloc. alloc );
1209- builtinFunction-> setArgumentValue ( 3 , sizeof (srcOffset), &srcOffset);
1210- }
1217+ uint64_t patternAllocPtr = reinterpret_cast < uintptr_t >(patternGfxAllocPtr );
1218+ uint64_t patternAllocOffset = 0 ;
1219+ uint64_t patternSizeToCopy = patternSize;
1220+ do {
1221+ memcpy_s ( reinterpret_cast < void *>(patternAllocPtr + patternAllocOffset),
1222+ patternSizeToCopy, pattern, patternSizeToCopy);
12111223
1212- appendEventForProfilingAllWalkers (hSignalEvent, true );
1213-
1214- uint32_t groups = static_cast <uint32_t >(size) / groupSizeX;
1215- ze_group_count_t dispatchFuncArgs{groups, 1u , 1u };
1224+ if ((patternAllocOffset + patternSizeToCopy) > patternAllocationSize) {
1225+ patternSizeToCopy = patternAllocationSize - patternAllocOffset;
1226+ }
12161227
1217- ze_result_t res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit (builtinFunction->toHandle (), &dispatchFuncArgs, hSignalEvent);
1228+ patternAllocOffset += patternSizeToCopy;
1229+ } while (patternAllocOffset < patternAllocationSize);
12181230
1219- if (res) {
1220- return res;
1221- }
1231+ builtinFunction->setArgBufferWithAlloc (0 , dstAllocation.alignedAllocationPtr , dstAllocation.alloc );
1232+ builtinFunction->setArgumentValue (1 , sizeof (dstAllocation.offset ), &dstAllocation.offset );
1233+ builtinFunction->setArgBufferWithAlloc (2 , reinterpret_cast <uintptr_t >(patternGfxAllocPtr), patternGfxAlloc);
1234+ builtinFunction->setArgumentValue (3 , sizeof (patternSizeInEls), &patternSizeInEls);
12221235
1223- uint32_t groupRemainderSizeX = static_cast <uint32_t >(size) % groupSizeX;
1224- if (groupRemainderSizeX) {
1225- builtinFunction->setGroupSize (groupRemainderSizeX, 1u , 1u );
1226- ze_group_count_t dispatchFuncArgs{1u , 1u , 1u };
1236+ appendEventForProfilingAllWalkers (hSignalEvent, true );
12271237
1228- size_t dstOffset = dstAllocation.offset + (size - groupRemainderSizeX);
1229- builtinFunction->setArgBufferWithAlloc (0 , dstAllocation.alignedAllocationPtr , dstAllocation.alloc );
1230- builtinFunction->setArgumentValue (1 , sizeof (dstOffset), &dstOffset);
1238+ ze_group_count_t dispatchFuncArgs{groups, 1u , 1u };
1239+ res = appendLaunchKernelSplit (builtinFunction->toHandle (), &dispatchFuncArgs, hSignalEvent);
1240+ if (res) {
1241+ return res;
1242+ }
12311243
1232- res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit (builtinFunction->toHandle (), &dispatchFuncArgs, hSignalEvent);
1244+ if (groupRemainderSizeX) {
1245+ uint32_t dstOffsetRemainder = groups * groupSizeX * static_cast <uint32_t >(middleElSize);
1246+ uint64_t patternOffsetRemainder = (groupSizeX * groups & (patternSizeInEls - 1 )) * middleElSize;
1247+
1248+ auto builtinFunctionRemainder = device->getBuiltinFunctionsLib ()->getFunction (Builtin::FillBufferRightLeftover);
1249+ builtinFunctionRemainder->setGroupSize (groupRemainderSizeX, 1u , 1u );
1250+ ze_group_count_t dispatchFuncArgs{1u , 1u , 1u };
1251+
1252+ builtinFunctionRemainder->setArgBufferWithAlloc (0 ,
1253+ dstAllocation.alignedAllocationPtr ,
1254+ dstAllocation.alloc );
1255+ builtinFunctionRemainder->setArgumentValue (1 ,
1256+ sizeof (dstOffsetRemainder),
1257+ &dstOffsetRemainder);
1258+ builtinFunctionRemainder->setArgBufferWithAlloc (2 ,
1259+ reinterpret_cast <uintptr_t >(patternGfxAllocPtr) + patternOffsetRemainder,
1260+ patternGfxAlloc);
1261+ builtinFunctionRemainder->setArgumentValue (3 , sizeof (patternSizeInEls), &patternSizeInEls);
1262+ res = appendLaunchKernelSplit (builtinFunctionRemainder->toHandle (), &dispatchFuncArgs, hSignalEvent);
1263+ if (res) {
1264+ return res;
1265+ }
1266+ }
12331267 }
12341268
12351269 appendEventForProfilingAllWalkers (hSignalEvent, false );
@@ -1488,7 +1522,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(ze_event_hand
14881522 appendWriteKernelTimestamp (hEvent, beforeWalker, true );
14891523 } else {
14901524
1491- NEO::PipeControlArgs args;
1525+ NEO::PipeControlArgs args = {} ;
14921526 args.dcFlushEnable = true ;
14931527
14941528 NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl (*commandContainer.getCommandStream (), args);
0 commit comments