Commit ecbff5d
Author: Hugh Delaney
1 parent 78d0203 commit ecbff5d

Several changes to kernel launch

- Don't automatically initialize the event vector unless extra events are needed for mem args.
- Record an event when it is needed for mem args, even if phEvent is nullptr.
- Turn migrateMemoryToDeviceIfNeeded into an async enqueue. This is needed because all the previous work that the memory migration depends on is synchronized with on a stream, so the memory transfers must be dispatched on that same stream.
- Add a helper function to check whether a list contains an element.
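
The helper mentioned in the last bullet appears at the call sites below as listContainsElem; its definition lives in one of the seven changed files but is not part of this excerpt. A minimal sketch of the shape the call sites imply (the template form is an assumption, not the committed code):

// Sketch only: shape inferred from the call sites in enqueue.cpp, where it
// is passed (numEventsInWaitList, phEventWaitList, MemDepEvent).
template <typename T>
bool listContainsElem(uint32_t NumElems, const T *List, const T &Elem) {
  for (uint32_t i = 0; i < NumElems; ++i) {
    if (List[i] == Elem)
      return true;
  }
  return false;
}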

File tree: 7 files changed, +196 -103 lines changed


source/adapters/cuda/enqueue.cpp

Lines changed: 50 additions & 23 deletions
@@ -414,8 +414,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
   UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 
-  std::vector<ur_event_handle_t> DepEvents(
-      phEventWaitList, phEventWaitList + numEventsInWaitList);
+  std::vector<ur_event_handle_t> MemMigrationEvents;
   std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
 
   // phEventWaitList only contains events that are handed to UR by the SYCL
@@ -427,9 +426,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   for (auto &MemArg : hKernel->Args.MemObjArgs) {
     bool PushBack = false;
     if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
-        MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
-                                 MemDepEvent) == DepEvents.end()) {
-      DepEvents.push_back(MemDepEvent);
+        MemDepEvent && !listContainsElem(numEventsInWaitList, phEventWaitList,
+                                         MemDepEvent)) {
+      MemMigrationEvents.push_back(MemDepEvent);
       PushBack = true;
     }
     if ((MemArg.AccessFlags &
@@ -477,19 +476,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
-    if (DepEvents.size()) {
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
-                                       DepEvents.data()));
-    }
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     // For memory migration across devices in the same context
     if (hQueue->getContext()->Devices.size() > 1) {
+      if (MemMigrationEvents.size()) {
+        UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream,
+                                         MemMigrationEvents.size(),
+                                         MemMigrationEvents.data()));
+      }
       for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+        enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(),
+                                             CuStream);
       }
     }
 
-    if (phEvent) {
+    if (phEvent || MemMigrationEvents.size()) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
               UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken));
@@ -522,8 +525,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
+    } else if (MemMigrationEvents.size()) {
+      UR_CHECK_ERROR(RetImplEvent->record());
+      for (auto &MemArg : hKernel->Args.MemObjArgs) {
+        // If no event is passed to entry point, we still need to have an event
+        // if ur_mem_handle_t s are used. Here we give ownership of the event
+        // to the ur_mem_handle_t
+        if (MemArg.AccessFlags &
+            (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) {
+          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.release());
+        }
+      }
     }
-
   } catch (ur_result_t Err) {
     return Err;
   }
@@ -603,8 +616,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
   }
 
-  std::vector<ur_event_handle_t> DepEvents(
-      phEventWaitList, phEventWaitList + numEventsInWaitList);
+  std::vector<ur_event_handle_t> MemMigrationEvents;
   std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
 
   // phEventWaitList only contains events that are handed to UR by the SYCL
@@ -616,9 +628,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   for (auto &MemArg : hKernel->Args.MemObjArgs) {
     bool PushBack = false;
     if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
-        MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
-                                 MemDepEvent) == DepEvents.end()) {
-      DepEvents.push_back(MemDepEvent);
+        MemDepEvent && !listContainsElem(numEventsInWaitList, phEventWaitList,
+                                         MemDepEvent)) {
+      MemMigrationEvents.push_back(MemDepEvent);
       PushBack = true;
     }
     if ((MemArg.AccessFlags &
@@ -666,19 +678,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
-    if (DepEvents.size()) {
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
-                                       DepEvents.data()));
-    }
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     // For memory migration across devices in the same context
     if (hQueue->getContext()->Devices.size() > 1) {
+      if (MemMigrationEvents.size()) {
+        UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream,
+                                         MemMigrationEvents.size(),
+                                         MemMigrationEvents.data()));
+      }
       for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+        enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(),
+                                             CuStream);
       }
     }
 
-    if (phEvent) {
+    if (phEvent || MemMigrationEvents.size()) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
               UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken));
@@ -724,6 +740,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
+    } else if (MemMigrationEvents.size()) {
+      UR_CHECK_ERROR(RetImplEvent->record());
+      for (auto &MemArg : hKernel->Args.MemObjArgs) {
+        // If no event is passed to entry point, we still need to have an event
+        // if ur_mem_handle_t s are used. Here we give ownership of the event
+        // to the ur_mem_handle_t
+        if (MemArg.AccessFlags &
+            (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) {
+          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.release());
+        }
+      }
     }
 
   } catch (ur_result_t Err) {
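
Why the migration must be enqueued on CuStream rather than performed eagerly: the wait list and the mem-arg dependency events are ordered onto that stream via enqueueEventsWait, and CUDA's stream-ordered semantics then guarantee the copies run after them. A sketch of the underlying driver-API pattern (not adapter code; all identifiers are placeholders assumed to be created by the caller):

#include <cuda.h>

void orderCopyAfterWrite(CUevent WriteDone, CUstream ProducerStream,
                         CUstream ConsumerStream, CUdeviceptr Dst,
                         CUdeviceptr Src, size_t Size) {
  // Producer marks the end of its write with an event.
  cuEventRecord(WriteDone, ProducerStream);
  // Consumer stream waits on that event; the async copy enqueued next on the
  // same stream cannot begin until the wait is satisfied, and a kernel
  // launched afterwards on ConsumerStream is ordered behind the copy.
  cuStreamWaitEvent(ConsumerStream, WriteDone, 0);
  cuMemcpyDtoDAsync(Dst, Src, Size, ConsumerStream);
}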

source/adapters/cuda/memory.cpp

Lines changed: 32 additions & 19 deletions
@@ -12,6 +12,7 @@
 
 #include "common.hpp"
 #include "context.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
 
 /// Creates a UR Memory object using a CUDA memory allocation.
@@ -238,7 +239,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
   try {
     if (PerformInitialCopy) {
       for (const auto &Device : hContext->getDevices()) {
-        UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device));
+        // Synchronous behaviour is best in this case
+        ScopedContext Active(Device);
+        CUstream Stream{0}; // Use default stream
+        UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(),
+                                                            Device, Stream));
+        UR_CHECK_ERROR(cuStreamSynchronize(Stream));
       }
     }
 
@@ -496,27 +502,29 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
 }
 
 namespace {
-ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
-                                  ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem,
+                                         ur_device_handle_t hDevice,
+                                         CUstream Stream) {
   auto &Buffer = std::get<BufferMem>(Mem->Mem);
   if (Mem->LastEventWritingToMemObj == nullptr) {
     // Device allocation being initialized from host for the first time
     if (Buffer.HostPtr) {
-      UR_CHECK_ERROR(
-          cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+      UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr,
+                                       Buffer.Size, Stream));
     }
   } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
              hDevice) {
-    UR_CHECK_ERROR(cuMemcpyDtoD(
+    UR_CHECK_ERROR(cuMemcpyDtoDAsync(
         Buffer.getPtr(hDevice),
        Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
-        Buffer.Size));
+        Buffer.Size, Stream));
   }
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
-                                 ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem,
+                                        ur_device_handle_t hDevice,
+                                        CUstream Stream) {
   auto &Image = std::get<SurfaceMem>(Mem->Mem);
   // When a dimension isn't used image_desc has the size set to 1
   size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
@@ -550,21 +558,24 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
   if (Mem->LastEventWritingToMemObj == nullptr) {
     if (Image.HostPtr) {
       if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
-        UR_CHECK_ERROR(
-            cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+        UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr,
+                                         ImageSizeBytes, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
         CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc2D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+        UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
         CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc3D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+        UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
       }
     }
   } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
             hDevice) {
     if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      // Blocking wait needed since we need to sync LastEventWritingToMemObj's
+      // queue, as well as the current queue with LastEventWritingToMemObj
+      UR_CHECK_ERROR(urEventWait(1, &Mem->LastEventWritingToMemObj));
       // FIXME: 1D memcpy from DtoD going through the host.
       UR_CHECK_ERROR(cuMemcpyAtoH(
           Image.HostPtr,
@@ -574,13 +585,15 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
       UR_CHECK_ERROR(
           cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+      CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
       CpyDesc2D.srcArray = Image.getArray(
           Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+      UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+      CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
       CpyDesc3D.srcArray = Image.getArray(
           Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+      UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
     }
   }
   return UR_RESULT_SUCCESS;
@@ -589,8 +602,8 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
 
 // If calling this entry point it is necessary to lock the memoryMigrationMutex
 // beforehand
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
-                                          const ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(
+    ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   // Device allocation has already been initialized with most up to date
   // data in buffer
@@ -601,9 +614,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
 
   ScopedContext Active(hDevice);
   if (Mem->isBuffer()) {
-    UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream));
   } else {
-    UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream));
   }
 
   Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
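
Callers that still want the old blocking semantics compose the new async entry point with a stream synchronize, exactly as urMemImageCreate now does. The same pattern extracted into a hypothetical wrapper (migrateBlocking is not part of this commit; Mem and Device stand in for real handles):

// Hypothetical convenience wrapper; mirrors the urMemImageCreate pattern.
ur_result_t migrateBlocking(ur_mem_handle_t Mem, ur_device_handle_t Device) {
  ScopedContext Active(Device); // make Device's CUDA context current
  CUstream Stream{0};           // legacy default stream, as in the diff
  UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(Mem, Device, Stream));
  UR_CHECK_ERROR(cuStreamSynchronize(Stream)); // block until the copies land
  return UR_RESULT_SUCCESS;
}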

source/adapters/cuda/memory.hpp

Lines changed: 6 additions & 0 deletions
@@ -20,6 +20,12 @@
 #include "device.hpp"
 #include "event.hpp"
 
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+                                           const ur_device_handle_t);
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                                 const ur_device_handle_t,
+                                                 CUstream);
+
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
