Merge branch 'oneapi-src:main' into victor/fix_ur_return

Victor Lomuller · web-flow · commit b79d1157563c · 2024-02-21T11:53:37.000Z
diff --git a/include/ur_api.h b/include/ur_api.h
@@ -8692,8 +8692,12 @@ urEnqueueCooperativeKernelLaunchExp(
 ///     - ::UR_RESULT_ERROR_INVALID_KERNEL
 UR_APIEXPORT ur_result_t UR_APICALL
 urKernelSuggestMaxCooperativeGroupCountExp(
-    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
-    uint32_t *pGroupCountRet    ///< [out] pointer to maximum number of groups
+    ur_kernel_handle_t hKernel,     ///< [in] handle of the kernel object
+    size_t localWorkSize,           ///< [in] number of local work-items that will form a work-group when the
+                                    ///< kernel is launched
+    size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes,
+                                    ///< that will be used when the kernel is launched
+    uint32_t *pGroupCountRet        ///< [out] pointer to maximum number of groups
 );
 
 #if !defined(__GNUC__)
@@ -9641,6 +9645,8 @@ typedef struct ur_kernel_set_specialization_constants_params_t {
 ///     allowing the callback the ability to modify the parameter's value
 typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t {
     ur_kernel_handle_t *phKernel;
+    size_t *plocalWorkSize;
+    size_t *pdynamicSharedMemorySize;
     uint32_t **ppGroupCountRet;
 } ur_kernel_suggest_max_cooperative_group_count_exp_params_t;
 
diff --git a/include/ur_ddi.h b/include/ur_ddi.h
@@ -627,6 +627,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)(
 /// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp
 typedef ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)(
     ur_kernel_handle_t,
+    size_t,
+    size_t,
     uint32_t *);
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
@@ -11399,6 +11399,16 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
     ur::details::printPtr(os,
                           *(params->phKernel));
 
+    os << ", ";
+    os << ".localWorkSize = ";
+
+    os << *(params->plocalWorkSize);
+
+    os << ", ";
+    os << ".dynamicSharedMemorySize = ";
+
+    os << *(params->pdynamicSharedMemorySize);
+
     os << ", ";
     os << ".pGroupCountRet = ";
 
diff --git a/scripts/core/exp-cooperative-kernels.yml b/scripts/core/exp-cooperative-kernels.yml
@@ -78,6 +78,12 @@ params:
     - type: $x_kernel_handle_t
       name: hKernel
       desc: "[in] handle of the kernel object"
+    - type: size_t
+      name: localWorkSize
+      desc: "[in] number of local work-items that will form a work-group when the kernel is launched"
+    - type: size_t
+      name: dynamicSharedMemorySize
+      desc: "[in] size of dynamic shared memory, for each work-group, in bytes, that will be used when the kernel is launched"
     - type: "uint32_t*"
       name: "pGroupCountRet"
       desc: "[out] pointer to maximum number of groups"
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
@@ -245,13 +245,14 @@ setKernelParams(const ur_context_handle_t Context,
           return UR_RESULT_SUCCESS;
         };
 
-        size_t KernelLocalWorkGroupSize = 0;
+        size_t KernelLocalWorkGroupSize = 1;
         for (size_t Dim = 0; Dim < WorkDim; Dim++) {
           auto Err = IsValid(Dim);
           if (Err != UR_RESULT_SUCCESS)
             return Err;
-          // If no error then sum the total local work size per dim.
-          KernelLocalWorkGroupSize += LocalWorkSize[Dim];
+          // If no error then compute the total local work size as a product of
+          // all dims.
+          KernelLocalWorkGroupSize *= LocalWorkSize[Dim];
         }
 
         if (hasExceededMaxRegistersPerBlock(Device, Kernel,
@@ -493,6 +494,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   return Result;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numEventsInWaitList, phEventWaitList, phEvent);
+} // thin pass-through: all arguments forwarded unchanged, result returned as-is
+
 /// Set parameters for general 3D memory copy.
 /// If the source and/or destination is on the device, SrcPtr and/or DstPtr
 /// must be a pointer to a CUdeviceptr
diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp
@@ -169,6 +169,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
+    ur_kernel_handle_t hKernel, size_t localWorkSize,
+    size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
+  (void)hKernel;                 // not consulted by this implementation
+  (void)localWorkSize;           // not consulted by this implementation
+  (void)dynamicSharedMemorySize; // not consulted by this implementation
+  *pGroupCountRet = 1; // constant answer; pointer is dereferenced unchecked here
+  return UR_RESULT_SUCCESS; // NOTE(review): looks like a stub - confirm intent
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
     ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize,
     const ur_kernel_arg_value_properties_t *pProperties,
diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp
@@ -404,7 +404,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
     return result;
   }
 
-  pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;
+  pDdiTable->pfnCooperativeKernelLaunchExp =
+      urEnqueueCooperativeKernelLaunchExp;
 
   return UR_RESULT_SUCCESS;
 }
@@ -416,7 +417,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable(
     return result;
   }
 
-  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr;
+  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp =
+      urKernelSuggestMaxCooperativeGroupCountExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
@@ -465,6 +465,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   return Result;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numEventsInWaitList, phEventWaitList, phEvent);
+} // thin pass-through: all arguments forwarded unchanged, result returned as-is
+
 /// Enqueues a wait on the given queue for all events.
 /// See \ref enqueueEventWait
 ///
diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
@@ -158,6 +158,16 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
+    ur_kernel_handle_t hKernel, size_t localWorkSize,
+    size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
+  (void)hKernel;                 // not consulted by this implementation
+  (void)localWorkSize;           // not consulted by this implementation
+  (void)dynamicSharedMemorySize; // not consulted by this implementation
+  *pGroupCountRet = 1; // constant answer; pointer is dereferenced unchecked here
+  return UR_RESULT_SUCCESS; // NOTE(review): looks like a stub - confirm intent
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
     ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize,
     const ur_kernel_arg_value_properties_t *, const void *pArgValue) {
diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp
@@ -374,7 +374,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
     return result;
   }
 
-  pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;
+  pDdiTable->pfnCooperativeKernelLaunchExp =
+      urEnqueueCooperativeKernelLaunchExp;
 
   return UR_RESULT_SUCCESS;
 }
@@ -386,7 +387,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable(
     return result;
   }
 
-  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr;
+  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp =
+      urKernelSuggestMaxCooperativeGroupCountExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
@@ -264,6 +264,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numEventsInWaitList, phEventWaitList, phEvent);
+} // thin pass-through: all arguments forwarded unchanged, result returned as-is
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
     ur_queue_handle_t Queue,     ///< [in] handle of the queue to submit to.
     ur_program_handle_t Program, ///< [in] handle of the program containing the
@@ -787,6 +797,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
+    ur_kernel_handle_t hKernel, size_t localWorkSize,
+    size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
+  (void)hKernel;                 // not consulted by this implementation
+  (void)localWorkSize;           // not consulted by this implementation
+  (void)dynamicSharedMemorySize; // not consulted by this implementation
+  *pGroupCountRet = 1; // constant answer; pointer is dereferenced unchecked here
+  return UR_RESULT_SUCCESS; // NOTE(review): looks like a stub - confirm intent
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel.
     ur_context_handle_t Context,     ///< [in] handle of the context object
diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp
@@ -451,7 +451,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
     return result;
   }
 
-  pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;
+  pDdiTable->pfnCooperativeKernelLaunchExp =
+      urEnqueueCooperativeKernelLaunchExp;
 
   return UR_RESULT_SUCCESS;
 }
@@ -463,7 +464,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable(
     return result;
   }
 
-  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr;
+  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp =
+      urKernelSuggestMaxCooperativeGroupCountExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp
@@ -5443,15 +5443,22 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
 /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp
 __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
     ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
-    uint32_t *pGroupCountRet    ///< [out] pointer to maximum number of groups
+    size_t
+        localWorkSize, ///< [in] number of local work-items that will form a work-group when the
+                       ///< kernel is launched
+    size_t
+        dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes,
+    ///< that will be used when the kernel is launched
+    uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups
     ) try {
     ur_result_t result = UR_RESULT_SUCCESS;
 
     // if the driver has created a custom function, then call it instead of using the generic path
     auto pfnSuggestMaxCooperativeGroupCountExp =
         d_context.urDdiTable.KernelExp.pfnSuggestMaxCooperativeGroupCountExp;
     if (nullptr != pfnSuggestMaxCooperativeGroupCountExp) {
-        result = pfnSuggestMaxCooperativeGroupCountExp(hKernel, pGroupCountRet);
+        result = pfnSuggestMaxCooperativeGroupCountExp(
+            hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet);
     } else {
         // generic implementation
     }
diff --git a/source/adapters/opencl/enqueue.cpp b/source/adapters/opencl/enqueue.cpp
@@ -41,6 +41,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numEventsInWaitList, phEventWaitList, phEvent);
+} // thin pass-through: all arguments forwarded unchanged, result returned as-is
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
     ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp
@@ -10,6 +10,7 @@
 #include "common.hpp"
 
 #include <algorithm>
+#include <cstddef>
 #include <memory>
 
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -376,6 +377,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
+    ur_kernel_handle_t hKernel, size_t localWorkSize,
+    size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
+  (void)hKernel;                 // not consulted by this implementation
+  (void)localWorkSize;           // not consulted by this implementation
+  (void)dynamicSharedMemorySize; // not consulted by this implementation
+  *pGroupCountRet = 1; // constant answer; pointer is dereferenced unchecked here
+  return UR_RESULT_SUCCESS; // NOTE(review): looks like a stub - confirm intent
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     ur_native_handle_t hNativeKernel, ur_context_handle_t, ur_program_handle_t,
     const ur_kernel_native_properties_t *pProperties,
diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp
@@ -395,7 +395,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
     return result;
   }
 
-  pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;
+  pDdiTable->pfnCooperativeKernelLaunchExp =
+      urEnqueueCooperativeKernelLaunchExp;
 
   return UR_RESULT_SUCCESS;
 }
@@ -407,7 +408,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable(
     return result;
   }
 
-  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr;
+  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp =
+      urKernelSuggestMaxCooperativeGroupCountExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp
@@ -281,6 +281,9 @@ void SanitizerInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel,
         const char *Func = AH->Func[0] ? AH->Func : "<unknown func>";
         auto KernelName = getKernelName(Kernel);
 
+        // Try to demangle the kernel name
+        KernelName = DemangleName(KernelName);
+
         context.logger.always("\n====ERROR: DeviceSanitizer: {} on {}",
                               DeviceSanitizerFormat(AH->ErrorType),
                               DeviceSanitizerFormat(AH->MemoryType));
diff --git a/source/loader/layers/sanitizer/common.hpp b/source/loader/layers/sanitizer/common.hpp
@@ -17,6 +17,7 @@
 
 #include <cassert>
 #include <cstdint>
+#include <string>
 
 namespace ur_sanitizer_layer {
 
@@ -107,4 +108,6 @@ bool DestroyShadowMem();
 
 void *GetMemFunctionPointer(const char *);
 
+std::string DemangleName(const std::string &name);
+
 } // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/linux/san_utils.cpp b/source/loader/layers/sanitizer/linux/san_utils.cpp
@@ -15,8 +15,10 @@
 #include "ur_sanitizer_layer.hpp"
 
 #include <asm/param.h>
+#include <cxxabi.h>
 #include <dlfcn.h>
 #include <gnu/lib-names.h>
+#include <string>
 #include <sys/mman.h>
 
 extern "C" __attribute__((weak)) void __asan_init(void);
@@ -84,4 +86,15 @@ void *GetMemFunctionPointer(const char *FuncName) {
     return ptr;
 }
 
+std::string DemangleName(const std::string &name) {
+    std::string result = name; // returned unchanged when demangling yields null
+    char *demangled =
+        abi::__cxa_demangle(name.c_str(), nullptr, nullptr, nullptr);
+    if (demangled) { // non-null only when a demangled string was produced
+        result = demangled; // copy out before the C buffer is released
+        free(demangled); // per the C++ ABI docs the buffer is malloc'd; we own it
+    }
+    return result;
+}
+
 } // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
diff --git a/source/ur_api.cpp b/source/ur_api.cpp

Original file line number	Diff line number	Diff line change
`@@ -404,7 +404,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(`
`404`	`404`	`return result;`
`405`	`405`	`}`
`406`	`406`
`407`		`- pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;`
	`407`	`+ pDdiTable->pfnCooperativeKernelLaunchExp =`
	`408`	`+ urEnqueueCooperativeKernelLaunchExp;`
`408`	`409`
`409`	`410`	`return UR_RESULT_SUCCESS;`
`410`	`411`	`}`
`@@ -416,7 +417,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable(`
`416`	`417`	`return result;`
`417`	`418`	`}`
`418`	`419`
`419`		`- pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr;`
	`420`	`+ pDdiTable->pfnSuggestMaxCooperativeGroupCountExp =`
	`421`	`+ urKernelSuggestMaxCooperativeGroupCountExp;`
`420`	`422`
`421`	`423`	`return UR_RESULT_SUCCESS;`
`422`	`424`	`}`
Original file line number	Diff line number	Diff line change
`@@ -374,7 +374,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(`
`374`	`374`	`return result;`
`375`	`375`	`}`
`376`	`376`
`377`		`- pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;`
	`377`	`+ pDdiTable->pfnCooperativeKernelLaunchExp =`
	`378`	`+ urEnqueueCooperativeKernelLaunchExp;`
`378`	`379`
`379`	`380`	`return UR_RESULT_SUCCESS;`
`380`	`381`	`}`
`@@ -386,7 +387,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable(`
`386`	`387`	`return result;`
`387`	`388`	`}`
`388`	`389`
`389`		`- pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr;`
	`390`	`+ pDdiTable->pfnSuggestMaxCooperativeGroupCountExp =`
	`391`	`+ urKernelSuggestMaxCooperativeGroupCountExp;`
`390`	`392`
`391`	`393`	`return UR_RESULT_SUCCESS;`
`392`	`394`	`}`