@@ -305,7 +305,7 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
305305 int memflags = CL_MEM_ALLOC_HOST_PTR ;
306306 nbytes += alignment + size_meminfo - 1 ;
307307# if defined(ACC_OPENCL_XHINTS )
308- if (0 != (4 & c_dbcsr_acc_opencl_config .xhints ) && (0 != devinfo -> nv || NULL != (ACC_OPENCL_XHINTS ))) {
308+ if (0 != (8 & c_dbcsr_acc_opencl_config .xhints ) && (0 != devinfo -> nv || NULL != (ACC_OPENCL_XHINTS ))) {
309309 host_ptr = ACC_OPENCL_MEM_ALLOC (nbytes , alignment );
310310 if (NULL != host_ptr ) memflags = CL_MEM_USE_HOST_PTR ;
311311 }
@@ -316,7 +316,7 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
316316 if (NULL == host_ptr ) {
317317 mapped = clEnqueueMapBuffer (str -> queue , memory , CL_TRUE /*always block*/ ,
318318# if defined(ACC_OPENCL_XHINTS ) && (defined (CL_VERSION_1_2 ) || defined (CL_MAP_WRITE_INVALIDATE_REGION ))
319- (16 & c_dbcsr_acc_opencl_config .xhints ) ? CL_MAP_WRITE_INVALIDATE_REGION :
319+ (32 & c_dbcsr_acc_opencl_config .xhints ) ? CL_MAP_WRITE_INVALIDATE_REGION :
320320# endif
321321 (CL_MAP_READ | CL_MAP_WRITE ),
322322 0 /*offset*/ , nbytes , 0 , NULL , NULL , & result );
@@ -396,33 +396,35 @@ void CL_CALLBACK c_dbcsr_acc_memcpy_notify(cl_event /*event*/, cl_int /*event_st
396396void CL_CALLBACK c_dbcsr_acc_memcpy_notify (cl_event event , cl_int event_status , void * data ) {
397397 int result = EXIT_SUCCESS ;
398398 const double durdev = c_dbcsr_acc_opencl_duration (event , & result );
399- c_dbcsr_acc_opencl_info_memptr_t info ;
400- cl_command_type type ;
401- size_t size = 0 , offset = 0 ;
399+ cl_command_type type = CL_COMMAND_SVM_MEMCPY ;
402400 LIBXSMM_UNUSED (event_status );
403- assert (CL_COMPLETE == event_status && NULL != data );
404- if (EXIT_SUCCESS == result && EXIT_SUCCESS == clGetEventInfo (event , CL_EVENT_COMMAND_TYPE , sizeof (type ), & type , NULL ) &&
405- EXIT_SUCCESS == c_dbcsr_acc_opencl_info_devptr_lock (& info , NULL , data , 1 /*elsize*/ , NULL /*amount*/ , & offset ) &&
406- EXIT_SUCCESS == clGetMemObjectInfo (info .memory , CL_MEM_SIZE , sizeof (size_t ), & size , NULL ) && offset <= size )
407- {
408- /*const double durhst = libxsmm_timer_duration((libxsmm_timer_tickint)info.data, libxsmm_timer_tick());
409- const double durtot = durdev - LIBXSMM_MIN(durdev, durhst);*/
410- const size_t amount = size - offset ;
411- const double vals [] = {(double )amount , durdev };
412- const int mb = (int )((amount + (1 << 19 )) >> 20 );
401+ assert (CL_COMPLETE == event_status && NULL != data && 8 == sizeof (data ));
402+ if (EXIT_SUCCESS == result && EXIT_SUCCESS == clGetEventInfo (event , CL_EVENT_COMMAND_TYPE , sizeof (type ), & type , NULL )) {
403+ const size_t size = 0x3FFFFFFFFFFFFFFF & (size_t )data ;
404+ const int kind = (int )(((size_t )data ) >> 62 );
405+ const double vals [] = {(double )size , durdev };
406+ const int mb = (int )((size + (1 << 19 )) >> 20 );
407+ if (CL_COMMAND_WRITE_BUFFER != type && CL_COMMAND_READ_BUFFER != type && CL_COMMAND_COPY_BUFFER != type ) {
408+ switch (kind ) {
409+ case c_dbcsr_acc_event_kind_h2d : type = CL_COMMAND_WRITE_BUFFER ; break ;
410+ case c_dbcsr_acc_event_kind_d2h : type = CL_COMMAND_READ_BUFFER ; break ;
411+ case c_dbcsr_acc_event_kind_d2d : type = CL_COMMAND_COPY_BUFFER ; break ;
412+ default : assert (c_dbcsr_acc_event_kind_none == kind ); /* should not happen */
413+ }
414+ }
413415 switch (type ) {
414416 case CL_COMMAND_WRITE_BUFFER : {
415- assert (NULL != c_dbcsr_acc_opencl_config .hist_h2d );
417+ assert (NULL != c_dbcsr_acc_opencl_config .hist_h2d && c_dbcsr_acc_event_kind_h2d == kind );
416418 c_dbcsr_acc_opencl_hist_set (c_dbcsr_acc_opencl_config .lock_memory , c_dbcsr_acc_opencl_config .hist_h2d , vals );
417419 if (0 > c_dbcsr_acc_opencl_config .profile ) fprintf (stderr , "PROF ACC/OpenCL: H2D mb=%i us=%.0f\n" , mb , durdev * 1E6 );
418420 } break ;
419421 case CL_COMMAND_READ_BUFFER : {
420- assert (NULL != c_dbcsr_acc_opencl_config .hist_d2h );
422+ assert (NULL != c_dbcsr_acc_opencl_config .hist_d2h && c_dbcsr_acc_event_kind_d2h == kind );
421423 c_dbcsr_acc_opencl_hist_set (c_dbcsr_acc_opencl_config .lock_memory , c_dbcsr_acc_opencl_config .hist_d2h , vals );
422424 if (0 > c_dbcsr_acc_opencl_config .profile ) fprintf (stderr , "PROF ACC/OpenCL: D2H mb=%i us=%.0f\n" , mb , durdev * 1E6 );
423425 } break ;
424426 case CL_COMMAND_COPY_BUFFER : {
425- assert (NULL != c_dbcsr_acc_opencl_config .hist_d2d );
427+ assert (NULL != c_dbcsr_acc_opencl_config .hist_d2d && c_dbcsr_acc_event_kind_d2d == kind );
426428 c_dbcsr_acc_opencl_hist_set (c_dbcsr_acc_opencl_config .lock_memory , c_dbcsr_acc_opencl_config .hist_d2d , vals );
427429 if (0 > c_dbcsr_acc_opencl_config .profile ) fprintf (stderr , "PROF ACC/OpenCL: D2D mb=%i us=%.0f\n" , mb , durdev * 1E6 );
428430 } break ;
@@ -489,7 +491,7 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) {
489491 {
490492# if defined(ACC_OPENCL_XHINTS )
491493 const int devuid = devinfo -> uid , devuids = (0x4905 == devuid || 0x020a == devuid || (0x0bd0 <= devuid && 0x0bdb >= devuid ));
492- const int try_flag = ((0 != (8 & c_dbcsr_acc_opencl_config .xhints ) && 0 != devinfo -> intel && 0 == devinfo -> unified &&
494+ const int try_flag = ((0 != (16 & c_dbcsr_acc_opencl_config .xhints ) && 0 != devinfo -> intel && 0 == devinfo -> unified &&
493495 (devuids || NULL != (ACC_OPENCL_XHINTS )))
494496 ? (1u << 22 )
495497 : 0 );
@@ -667,7 +669,8 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
667669 assert (NULL != str );
668670# if (1 >= ACC_OPENCL_USM )
669671 if (NULL != devinfo -> clEnqueueMemcpyINTEL ) {
670- result = devinfo -> clEnqueueMemcpyINTEL (str -> queue , finish , dev_mem , host_mem , nbytes , 0 , NULL , NULL );
672+ result = devinfo -> clEnqueueMemcpyINTEL (
673+ str -> queue , finish , dev_mem , host_mem , nbytes , 0 , NULL , NULL == c_dbcsr_acc_opencl_config .hist_h2d ? NULL : & event );
671674 }
672675 else
673676# endif
@@ -690,18 +693,18 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
690693 if (NULL != info ) {
691694 result = clEnqueueWriteBuffer (str -> queue , info -> memory , finish , offset , nbytes , host_mem , 0 , NULL ,
692695 NULL == c_dbcsr_acc_opencl_config .hist_h2d ? NULL : & event );
693- /*if (NULL != event && EXIT_SUCCESS == result) info->data = (void*)libxsmm_timer_tick();*/
694696 }
695697 else result = EXIT_FAILURE ;
696698 }
697699 ACC_OPENCL_RELEASE (c_dbcsr_acc_opencl_config .lock_memory );
698700 if (NULL != event ) { /* c_dbcsr_acc_memcpy_notify must be outside of locked region */
699701 if (EXIT_SUCCESS == result ) {
702+ void * const data = (void * )(nbytes | ((size_t )c_dbcsr_acc_event_kind_h2d ) << 62 );
700703 assert (NULL != c_dbcsr_acc_opencl_config .hist_h2d );
701704 if (!finish ) { /* asynchronous */
702- result = clSetEventCallback (event , CL_COMPLETE , c_dbcsr_acc_memcpy_notify , dev_mem );
705+ result = clSetEventCallback (event , CL_COMPLETE , c_dbcsr_acc_memcpy_notify , data );
703706 }
704- else c_dbcsr_acc_memcpy_notify (event , CL_COMPLETE , dev_mem ); /* synchronous */
707+ else c_dbcsr_acc_memcpy_notify (event , CL_COMPLETE , data ); /* synchronous */
705708 }
706709 else ACC_OPENCL_EXPECT (EXIT_SUCCESS == clReleaseEvent (event ));
707710 }
@@ -818,16 +821,16 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v
818821 else {
819822 result = c_dbcsr_acc_opencl_memcpy_d2h (
820823 info -> memory , host_mem , offset , nbytes , str -> queue , finish , NULL == c_dbcsr_acc_opencl_config .hist_d2h ? NULL : & event );
821- /*if (NULL != event && EXIT_SUCCESS == result) info->data = (void*)libxsmm_timer_tick();*/
822824 }
823825 ACC_OPENCL_RELEASE (c_dbcsr_acc_opencl_config .lock_memory );
824826 if (NULL != event ) { /* c_dbcsr_acc_memcpy_notify must be outside of locked region */
825827 if (EXIT_SUCCESS == result ) {
826- assert (NULL != c_dbcsr_acc_opencl_config .hist_d2h /*&& NULL == c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL*/ );
828+ void * const data = (void * )(nbytes | ((size_t )c_dbcsr_acc_event_kind_d2h ) << 62 );
829+ assert (NULL != c_dbcsr_acc_opencl_config .hist_d2h );
827830 if (!finish ) { /* asynchronous */
828- result = clSetEventCallback (event , CL_COMPLETE , c_dbcsr_acc_memcpy_notify , nconst . ptr );
831+ result = clSetEventCallback (event , CL_COMPLETE , c_dbcsr_acc_memcpy_notify , data );
829832 }
830- else c_dbcsr_acc_memcpy_notify (event , CL_COMPLETE , nconst . ptr ); /* synchronous */
833+ else c_dbcsr_acc_memcpy_notify (event , CL_COMPLETE , data ); /* synchronous */
831834 }
832835 else ACC_OPENCL_EXPECT (EXIT_SUCCESS == clReleaseEvent (event ));
833836 }
@@ -867,7 +870,8 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
867870 assert (NULL != str && NULL != devinfo -> context );
868871# if (1 >= ACC_OPENCL_USM )
869872 if (NULL != devinfo -> clEnqueueMemcpyINTEL ) {
870- result = devinfo -> clEnqueueMemcpyINTEL (str -> queue , CL_FALSE /*blocking*/ , devmem_dst , devmem_src , nbytes , 0 , NULL , pevent );
873+ result = devinfo -> clEnqueueMemcpyINTEL (str -> queue , CL_FALSE /*blocking*/ , devmem_dst , devmem_src , nbytes , 0 , NULL ,
874+ NULL == c_dbcsr_acc_opencl_config .hist_d2d ? pevent : & event );
871875 }
872876 else
873877# endif
@@ -892,26 +896,22 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
892896 if (NULL != info_src && NULL != info_dst ) {
893897 result = clEnqueueCopyBuffer (str -> queue , info_src -> memory , info_dst -> memory , offset_src , offset_dst , nbytes , 0 , NULL ,
894898 NULL == c_dbcsr_acc_opencl_config .hist_d2d ? pevent : & event );
895- /*if (NULL != event && EXIT_SUCCESS == result && NULL != c_dbcsr_acc_opencl_config.hist_d2d) {
896- info_src->data = (void*)libxsmm_timer_tick();
897- }*/
898899 }
899900 else result = EXIT_FAILURE ;
900901 }
901902 ACC_OPENCL_RELEASE (c_dbcsr_acc_opencl_config .lock_memory );
902903 if (NULL != event ) { /* c_dbcsr_acc_memcpy_notify must be outside of locked region */
903904 if (EXIT_SUCCESS == result ) {
905+ void * const data = (void * )(nbytes | ((size_t )c_dbcsr_acc_event_kind_d2d ) << 62 );
904906 if (NULL == pevent ) { /* asynchronous */
905- assert (NULL == devinfo -> clEnqueueMemcpyINTEL );
906907 assert (NULL != c_dbcsr_acc_opencl_config .hist_d2d );
907- result = clSetEventCallback (event , CL_COMPLETE , c_dbcsr_acc_memcpy_notify , nconst . ptr );
908+ result = clSetEventCallback (event , CL_COMPLETE , c_dbcsr_acc_memcpy_notify , data );
908909 }
909910 else { /* synchronous */
910911 result = clWaitForEvents (1 , & event );
911912 if (EXIT_SUCCESS == result ) {
912913 if (NULL != c_dbcsr_acc_opencl_config .hist_d2d ) {
913- assert (NULL == devinfo -> clEnqueueMemcpyINTEL );
914- c_dbcsr_acc_memcpy_notify (event , CL_COMPLETE , nconst .ptr );
914+ c_dbcsr_acc_memcpy_notify (event , CL_COMPLETE , data );
915915 }
916916 else result = clReleaseEvent (event );
917917 }
0 commit comments