66 * All rights reserved.
77 * Copyright (c) Amazon.com, Inc. or its affiliates.
88 * All Rights reserved.
9+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
910 * $COPYRIGHT$
1011 *
1112 * Additional copyrights may follow
@@ -106,6 +107,14 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
106107 accelerator_cuda_get_buffer_id
107108};
108109
110+ static inline opal_accelerator_cuda_delayed_init_check (void )
111+ {
112+ if (OPAL_UNLIKELY (true != mca_accelerator_cuda_init_complete )) {
113+ return opal_accelerator_cuda_delayed_init ();
114+ }
115+ return OPAL_SUCCESS ;
116+ }
117+
109118static int accelerator_cuda_check_addr (const void * addr , int * dev_id , uint64_t * flags )
110119{
111120 CUresult result ;
@@ -236,15 +245,15 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
236245 }
237246 }
238247 /* First access on a device pointer finalizes CUDA support initialization. */
239- opal_accelerator_cuda_delayed_init ();
248+ ( void ) opal_accelerator_cuda_delayed_init_check ();
240249 return 1 ;
241250}
242251
243252static int accelerator_cuda_create_stream (int dev_id , opal_accelerator_stream_t * * stream )
244253{
245254 CUresult result ;
246- int delayed_init = opal_accelerator_cuda_delayed_init ();
247- if (OPAL_UNLIKELY (0 != delayed_init )) {
255+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
256+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
248257 return delayed_init ;
249258 }
250259 * stream = (opal_accelerator_stream_t * )OBJ_NEW (opal_accelerator_cuda_stream_t );
@@ -293,8 +302,8 @@ OBJ_CLASS_INSTANCE(
293302static int accelerator_cuda_create_event (int dev_id , opal_accelerator_event_t * * event , bool enable_ipc )
294303{
295304 CUresult result ;
296- int delayed_init = opal_accelerator_cuda_delayed_init ();
297- if (OPAL_UNLIKELY (0 != delayed_init )) {
305+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
306+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
298307 return delayed_init ;
299308 }
300309
@@ -396,8 +405,8 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
396405{
397406 CUresult result ;
398407
399- int delayed_init = opal_accelerator_cuda_delayed_init ();
400- if (OPAL_UNLIKELY (0 != delayed_init )) {
408+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
409+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
401410 return delayed_init ;
402411 }
403412
@@ -423,8 +432,8 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
423432{
424433 CUresult result ;
425434
426- int delayed_init = opal_accelerator_cuda_delayed_init ();
427- if (OPAL_UNLIKELY (0 != delayed_init )) {
435+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
436+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
428437 return delayed_init ;
429438 }
430439
@@ -464,8 +473,8 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
464473 CUdeviceptr tmp ;
465474 CUresult result ;
466475
467- int delayed_init = opal_accelerator_cuda_delayed_init ();
468- if (OPAL_UNLIKELY (0 != delayed_init )) {
476+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
477+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
469478 return delayed_init ;
470479 }
471480
@@ -503,8 +512,8 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
503512{
504513 CUresult result ;
505514
506- int delayed_init = opal_accelerator_cuda_delayed_init ();
507- if (OPAL_UNLIKELY (0 != delayed_init )) {
515+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
516+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
508517 return delayed_init ;
509518 }
510519
@@ -542,8 +551,8 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
542551{
543552 CUresult result ;
544553
545- int delayed_init = opal_accelerator_cuda_delayed_init ();
546- if (OPAL_UNLIKELY (0 != delayed_init )) {
554+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
555+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
547556 return delayed_init ;
548557 }
549558
@@ -566,25 +575,80 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
566575
/*
 * Report whether this component supports IPC handles.
 *
 * Always returns true.
 */
static bool accelerator_cuda_is_ipc_enabled(void)
{
    const bool ipc_supported = true;

    return ipc_supported;
}
580+
581+ static void mca_accelerator_cuda_ipc_handle_destruct (opal_accelerator_cuda_ipc_handle_t * handle )
582+ {
583+ if (NULL != handle && NULL != handle -> base .dev_ptr ) {
584+ cuIpcCloseMemHandle ((CUdeviceptr ) handle -> base .dev_ptr );
585+ handle -> base .dev_ptr = NULL ;
586+ }
570587}
571588
589+ OBJ_CLASS_INSTANCE (
590+ opal_accelerator_cuda_ipc_handle_t ,
591+ opal_accelerator_ipc_handle_t ,
592+ NULL ,
593+ mca_accelerator_cuda_ipc_handle_destruct );
594+
572595static int accelerator_cuda_get_ipc_handle (int dev_id , void * dev_ptr ,
573596 opal_accelerator_ipc_handle_t * handle )
574597{
575- return OPAL_ERR_NOT_IMPLEMENTED ;
598+ if (NULL == dev_ptr || NULL == handle ) {
599+ return OPAL_ERR_BAD_PARAM ;
600+ }
601+
602+ CUipcMemHandle cuda_ipc_handle ;
603+ opal_accelerator_cuda_ipc_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_handle_t * ) handle ;
604+
605+ OBJ_CONSTRUCT (cuda_handle , opal_accelerator_cuda_ipc_handle_t );
606+ cuda_handle -> base .dev_ptr = NULL ;
607+
608+ CUresult err = cuIpcGetMemHandle (& cuda_ipc_handle ,
609+ (CUdeviceptr )dev_ptr );
610+ if (OPAL_UNLIKELY (CUDA_SUCCESS != err )) {
611+ opal_output_verbose (10 , opal_accelerator_base_framework .framework_output ,
612+ "Error in cuIpcGetMemHandle dev_ptr %p" , dev_ptr );
613+ OBJ_DESTRUCT (cuda_handle );
614+ return OPAL_ERROR ;
615+ }
616+ memcpy (cuda_handle -> base .handle , & cuda_ipc_handle , IPC_MAX_HANDLE_SIZE );
617+
618+ return OPAL_SUCCESS ;
576619}
577620
578621static int accelerator_cuda_import_ipc_handle (int dev_id , uint8_t ipc_handle [IPC_MAX_HANDLE_SIZE ],
579622 opal_accelerator_ipc_handle_t * handle )
580623{
581- return OPAL_ERR_NOT_IMPLEMENTED ;
624+ opal_accelerator_cuda_ipc_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_handle_t * ) handle ;
625+ OBJ_CONSTRUCT (cuda_handle , opal_accelerator_cuda_ipc_handle_t );
626+ memcpy (cuda_handle -> base .handle , ipc_handle , IPC_MAX_HANDLE_SIZE );
627+
628+ return OPAL_SUCCESS ;
582629}
583630
584631static int accelerator_cuda_open_ipc_handle (int dev_id , opal_accelerator_ipc_handle_t * handle ,
585632 void * * dev_ptr )
586633{
587- return OPAL_ERR_NOT_IMPLEMENTED ;
634+ if (NULL == dev_ptr || NULL == handle ) {
635+ return OPAL_ERR_BAD_PARAM ;
636+ }
637+
638+ CUresult err = cuIpcOpenMemHandle ((CUdeviceptr * ) & handle -> dev_ptr ,
639+ * (CUipcMemHandle * )handle -> handle ,
640+ CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS );
641+ if (CUDA_ERROR_ALREADY_MAPPED == err ) {
642+ return OPAL_ERR_WOULD_BLOCK ;
643+ }
644+ else if (CUDA_SUCCESS != err ) {
645+ opal_output_verbose (10 , opal_accelerator_base_framework .framework_output ,
646+ "error in cuIpcOpenMemHandle" );
647+ return OPAL_ERROR ;
648+ }
649+ * dev_ptr = handle -> dev_ptr ;
650+
651+ return OPAL_SUCCESS ;
588652}
589653
590654static int accelerator_cuda_compare_ipc_handles (uint8_t handle_1 [IPC_MAX_HANDLE_SIZE ],
@@ -593,29 +657,84 @@ static int accelerator_cuda_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HANDLE_
593657 return memcmp (handle_1 , handle_2 , IPC_MAX_HANDLE_SIZE );
594658}
595659
660+ static void mca_accelerator_cuda_ipc_event_handle_destruct (opal_accelerator_cuda_ipc_handle_t * handle )
661+ {
662+ // Just a place holder, there is no cuIpcCloseEventHandle.
663+ }
664+
665+ OBJ_CLASS_INSTANCE (
666+ opal_accelerator_cuda_ipc_event_handle_t ,
667+ opal_accelerator_ipc_event_handle_t ,
668+ NULL ,
669+ mca_accelerator_cuda_ipc_event_handle_destruct );
670+
596671static int accelerator_cuda_get_ipc_event_handle (opal_accelerator_event_t * event ,
597672 opal_accelerator_ipc_event_handle_t * handle )
598673{
599- return OPAL_ERR_NOT_IMPLEMENTED ;
674+ if (NULL == event || NULL == handle ) {
675+ return OPAL_ERR_BAD_PARAM ;
676+ }
677+
678+ CUipcEventHandle cuda_ipc_handle ;
679+ opal_accelerator_cuda_ipc_event_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t * ) handle ;
680+ OBJ_CONSTRUCT (cuda_handle , opal_accelerator_cuda_ipc_event_handle_t );
681+
682+ memset (cuda_ipc_handle .reserved , 0 , CU_IPC_HANDLE_SIZE );
683+ CUresult err = cuIpcGetEventHandle (& cuda_ipc_handle ,
684+ * ((CUevent * )event -> event ));
685+ if (OPAL_UNLIKELY (CUDA_SUCCESS != err )) {
686+ opal_output_verbose (10 , opal_accelerator_base_framework .framework_output ,
687+ "error in cuIpcGetEventHandle" );
688+ OBJ_DESTRUCT (cuda_handle );
689+ return OPAL_ERROR ;
690+ }
691+ memcpy (cuda_handle -> base .handle , & cuda_ipc_handle , IPC_MAX_HANDLE_SIZE );
692+
693+ return OPAL_SUCCESS ;
600694}
601695
602696static int accelerator_cuda_import_ipc_event_handle (uint8_t ipc_handle [IPC_MAX_HANDLE_SIZE ],
603697 opal_accelerator_ipc_event_handle_t * handle )
604698{
605- return OPAL_ERR_NOT_IMPLEMENTED ;
699+ opal_accelerator_cuda_ipc_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_handle_t * ) handle ;
700+
701+ OBJ_CONSTRUCT (cuda_handle , opal_accelerator_cuda_ipc_handle_t );
702+ memcpy (cuda_handle -> base .handle , ipc_handle , IPC_MAX_HANDLE_SIZE );
703+
704+ return OPAL_SUCCESS ;
606705}
607706
608707static int accelerator_cuda_open_ipc_event_handle (opal_accelerator_ipc_event_handle_t * handle ,
609708 opal_accelerator_event_t * event )
610709{
611- return OPAL_ERR_NOT_IMPLEMENTED ;
710+ if (NULL == event || NULL == handle ) {
711+ return OPAL_ERR_BAD_PARAM ;
712+ }
713+
714+ opal_accelerator_cuda_ipc_event_handle_t * cuda_handle = (opal_accelerator_cuda_ipc_event_handle_t * ) handle ;
715+ opal_accelerator_cuda_event_t * cuda_event = (opal_accelerator_cuda_event_t * ) event ;
716+ OBJ_CONSTRUCT (cuda_event , opal_accelerator_cuda_event_t );
717+ cuda_event -> base .event = malloc (sizeof (CUevent ));
718+ if (NULL == cuda_event -> base .event ) {
719+ return OPAL_ERR_OUT_OF_RESOURCE ;
720+ }
721+
722+ CUresult err = cuIpcOpenEventHandle ( (CUevent * )cuda_event -> base .event ,
723+ * ((CUipcEventHandle * )cuda_handle -> base .handle ));
724+ if (OPAL_UNLIKELY (CUDA_SUCCESS != err )) {
725+ opal_output_verbose (10 , opal_accelerator_base_framework .framework_output ,
726+ "error in cuIpcOpenEventHandle" );
727+ return OPAL_ERROR ;
728+ }
729+
730+ return OPAL_SUCCESS ;
612731}
613732
614733static int accelerator_cuda_host_register (int dev_id , void * ptr , size_t size )
615734{
616735 CUresult result ;
617- int delayed_init = opal_accelerator_cuda_delayed_init ();
618- if (OPAL_UNLIKELY (0 != delayed_init )) {
736+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
737+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
619738 return delayed_init ;
620739 }
621740
@@ -652,8 +771,8 @@ static int accelerator_cuda_get_device(int *dev_id)
652771 CUdevice cuDev ;
653772 CUresult result ;
654773
655- int delayed_init = opal_accelerator_cuda_delayed_init ();
656- if (OPAL_UNLIKELY (0 != delayed_init )) {
774+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
775+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
657776 return delayed_init ;
658777 }
659778
@@ -714,8 +833,8 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
714833{
715834 CUresult result ;
716835
717- int delayed_init = opal_accelerator_cuda_delayed_init ();
718- if (OPAL_UNLIKELY (0 != delayed_init )) {
836+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
837+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
719838 return delayed_init ;
720839 }
721840
@@ -744,8 +863,8 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
744863 CUresult result ;
745864 int enable = 1 ;
746865
747- int delayed_init = opal_accelerator_cuda_delayed_init ();
748- if (OPAL_UNLIKELY (0 != delayed_init )) {
866+ int delayed_init = opal_accelerator_cuda_delayed_init_check ();
867+ if (OPAL_UNLIKELY (OPAL_SUCCESS != delayed_init )) {
749868 return delayed_init ;
750869 }
751870
0 commit comments