@@ -224,8 +224,17 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchCooperativeKernel(
224224 return ret;
225225 }
226226
227- return appendLaunchKernelWithParams (hKernel, pLaunchFuncArgs,
228- hSignalEvent, false , false , true );
227+ ret = appendLaunchKernelWithParams (hKernel, pLaunchFuncArgs,
228+ hSignalEvent, false , false , true );
229+ if (ret) {
230+ return ret;
231+ }
232+
233+ if (hSignalEvent) {
234+ programEventL3Flush (hSignalEvent, this ->device , this ->partitionCount , commandContainer);
235+ }
236+
237+ return ret;
229238}
230239
231240template <GFXCORE_FAMILY gfxCoreFamily>
@@ -242,6 +251,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(ze_
242251 appendEventForProfiling (hEvent, true , false );
243252 ret = appendLaunchKernelWithParams (hKernel, pDispatchArgumentsBuffer,
244253 nullptr , true , false , false );
254+ if (ret) {
255+ return ret;
256+ }
257+ if (hEvent) {
258+ programEventL3Flush (hEvent, this ->device , this ->partitionCount , commandContainer);
259+ }
245260 appendSignalEventPostWalker (hEvent, false );
246261
247262 return ret;
@@ -276,7 +291,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchMultipleKernelsInd
276291 return ret;
277292 }
278293 }
279-
294+ if (hEvent) {
295+ programEventL3Flush (hEvent, this ->device , this ->partitionCount , commandContainer);
296+ }
280297 appendSignalEventPostWalker (hEvent, false );
281298
282299 return ret;
@@ -800,22 +817,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemAdvise(ze_device_hand
800817 return ZE_RESULT_ERROR_INVALID_ARGUMENT;
801818}
802819
803- template <GFXCORE_FAMILY gfxCoreFamily>
804- ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
805- const ze_group_count_t *pThreadGroupDimensions,
806- ze_event_handle_t hEvent) {
807- return appendLaunchKernelWithParams (hKernel, pThreadGroupDimensions, nullptr , false , false , false );
808- }
809-
810- template <GFXCORE_FAMILY gfxCoreFamily>
811- void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) {
812- if (beforeWalker) {
813- appendEventForProfiling (hEvent, true , false );
814- } else {
815- appendSignalEventPostWalker (hEvent, false );
816- }
817- }
818-
819820template <GFXCORE_FAMILY gfxCoreFamily>
820821ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(void *dstPtr,
821822 NEO::GraphicsAllocation *dstPtrAlloc,
@@ -1069,6 +1070,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
10691070 }
10701071
10711072 appendEventForProfilingAllWalkers (hSignalEvent, true );
1073+ adjustEventKernelCount (hSignalEvent);
10721074
10731075 if (ret == ZE_RESULT_SUCCESS && leftSize) {
10741076 Builtin func = Builtin::CopyBufferToBufferSide;
@@ -1128,16 +1130,22 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
11281130 isStateless);
11291131 }
11301132
1133+ if (hSignalEvent) {
1134+ programEventL3Flush (hSignalEvent, this ->device , this ->partitionCount , commandContainer);
1135+ }
11311136 appendEventForProfilingAllWalkers (hSignalEvent, false );
11321137
11331138 const auto &hwInfo = this ->device ->getHwInfo ();
11341139 if (NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable (true , hwInfo)) {
11351140 auto event = Event::fromHandle (hSignalEvent);
11361141 if (event) {
11371142 dstAllocationStruct.needsFlush &= !event->signalScope ;
1143+ dstAllocationStruct.needsFlush &= !event->l3FlushWaApplied ;
11381144 }
11391145
1140- if (dstAllocationStruct.needsFlush && !isCopyOnly ()) {
1146+ dstAllocationStruct.needsFlush &= !isCopyOnly ();
1147+
1148+ if (dstAllocationStruct.needsFlush ) {
11411149 NEO::PipeControlArgs args;
11421150 args.dcFlushEnable = true ;
11431151 NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl (*commandContainer.getCommandStream (), args);
@@ -1452,6 +1460,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
14521460 builtinFunction->setArgumentValue (2 , sizeof (value), &value);
14531461
14541462 appendEventForProfilingAllWalkers (hSignalEvent, true );
1463+ adjustEventKernelCount (hSignalEvent);
14551464
14561465 uint32_t groups = static_cast <uint32_t >(size) / groupSizeX;
14571466 ze_group_count_t dispatchFuncArgs{groups, 1u , 1u };
@@ -1526,6 +1535,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
15261535 builtinFunction->setArgumentValue (3 , sizeof (patternSizeInEls), &patternSizeInEls);
15271536
15281537 appendEventForProfilingAllWalkers (hSignalEvent, true );
1538+ adjustEventKernelCount (hSignalEvent);
15291539
15301540 ze_group_count_t dispatchFuncArgs{groups, 1u , 1u };
15311541 res = appendLaunchKernelSplit (builtinFunction->toHandle (), &dispatchFuncArgs, hSignalEvent);
@@ -1564,15 +1574,21 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
15641574 }
15651575 }
15661576
1577+ if (hSignalEvent) {
1578+ programEventL3Flush (hSignalEvent, this ->device , this ->partitionCount , commandContainer);
1579+ }
15671580 appendEventForProfilingAllWalkers (hSignalEvent, false );
15681581
15691582 const auto &hwInfo = this ->device ->getHwInfo ();
15701583 if (NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable (true , hwInfo)) {
15711584 auto event = Event::fromHandle (hSignalEvent);
15721585 if (event) {
15731586 hostPointerNeedsFlush &= !event->signalScope ;
1587+ hostPointerNeedsFlush &= !event->l3FlushWaApplied ;
15741588 }
15751589
1590+ hostPointerNeedsFlush &= !isCopyOnly ();
1591+
15761592 if (hostPointerNeedsFlush) {
15771593 NEO::PipeControlArgs args;
15781594 args.dcFlushEnable = true ;
0 commit comments