@@ -57,27 +57,6 @@ bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr) {
5757 return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_SHARED);
5858}
5959
60- // Helper function that decides whether the Copy Engine should be preferred
61- // for a copy, based on the device and on the memory types of Src and Dst.
62- bool PreferCopyEngineUsage (ur_device_handle_t Device,
63- ur_context_handle_t Context, const void *Src,
64- void *Dst) {
65- bool PreferCopyEngine = false ;
66- // On integrated devices the checks below never prefer Copy Engines; only
67- // the override at the end of this function can still enable them.
68- if (!Device->isIntegrated ()) {
69- // For non-D2D copies (Src or Dst is not a device pointer), Copy Engines are
70- // preferred only if the device has both the Main and Link Copy Engines.
71- if (Device->hasLinkCopyEngine () && Device->hasMainCopyEngine () &&
72- (!IsDevicePointer (Context, Src) || !IsDevicePointer (Context, Dst))) {
73- PreferCopyEngine = true ;
74- }
75- }
76- // Temporary option to force the copy engine (meant for D2D copies); being OR-ed in, it overrides every check above.
77- PreferCopyEngine |= UseCopyEngineForD2DCopy;
78- return PreferCopyEngine;
79- }
80-
8160// Shared by all memory read/write/copy PI interfaces.
8261// PI interfaces must have queue's and destination buffer's mutexes locked for
8362// exclusive use and source buffer's mutex locked for shared use on entry.
@@ -1259,10 +1238,23 @@ ur_result_t urEnqueueUSMMemcpy(
12591238 ur_event_handle_t *OutEvent) {
12601239 std::scoped_lock<ur_shared_mutex> lock (Queue->Mutex );
12611240
1241+ // Device to Device copies are found to execute slower on copy engine
1242+ // (versus compute engine).
1243+ bool PreferCopyEngine = !IsDevicePointer (Queue->Context , Src) ||
1244+ !IsDevicePointer (Queue->Context , Dst);
1245+ // For better performance, Copy Engines are not preferred given Shared
1246+ // pointers on DG2.
1247+ if (Queue->Device ->isDG2 () && (IsSharedPointer (Queue->Context , Src) ||
1248+ IsSharedPointer (Queue->Context , Dst))) {
1249+ PreferCopyEngine = false ;
1250+ }
1251+
1252+ // Temporary option added to use copy engine for D2D copy
1253+ PreferCopyEngine |= UseCopyEngineForD2DCopy;
1254+
12621255 return enqueueMemCopyHelper ( // TODO: do we need a new command type for this?
12631256 UR_COMMAND_MEM_BUFFER_COPY, Queue, Dst, Blocking, Size, Src,
1264- NumEventsInWaitList, EventWaitList, OutEvent,
1265- PreferCopyEngineUsage (Queue->Device , Queue->Context , Src, Dst));
1257+ NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine);
12661258}
12671259
12681260ur_result_t urEnqueueUSMPrefetch (
@@ -1462,13 +1454,26 @@ ur_result_t urEnqueueUSMMemcpy2D(
14621454
14631455 std::scoped_lock<ur_shared_mutex> lock (Queue->Mutex );
14641456
1457+ // Device to Device copies are found to execute slower on copy engine
1458+ // (versus compute engine).
1459+ bool PreferCopyEngine = !IsDevicePointer (Queue->Context , Src) ||
1460+ !IsDevicePointer (Queue->Context , Dst);
1461+ // For better performance, Copy Engines are not preferred given Shared
1462+ // pointers on DG2.
1463+ if (Queue->Device ->isDG2 () && (IsSharedPointer (Queue->Context , Src) ||
1464+ IsSharedPointer (Queue->Context , Dst))) {
1465+ PreferCopyEngine = false ;
1466+ }
1467+
1468+ // Temporary option added to use copy engine for D2D copy
1469+ PreferCopyEngine |= UseCopyEngineForD2DCopy;
1470+
14651471 return enqueueMemCopyRectHelper ( // TODO: do we need a new command type for
14661472 // this?
14671473 UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, Src, Dst, ZeroOffset, ZeroOffset,
14681474 Region, SrcPitch, DstPitch, 0 , /* SrcSlicePitch=*/
14691475 0 , /* DstSlicePitch=*/
1470- Blocking, NumEventsInWaitList, EventWaitList, Event,
1471- PreferCopyEngineUsage (Queue->Device , Queue->Context , Src, Dst));
1476+ Blocking, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine);
14721477}
14731478
14741479ur_result_t urMemImageCreate (
0 commit comments