 //
 // ===----------------------------------------------------------------------===//
 #include <array>
+#include <cstddef>
 #include <cstdint>
+#include <vector>
 
 #include "ur_api.h"
 
 #include "common.hpp"
 #include "kernel.hpp"
 #include "memory.hpp"
-#include "threadpool.hpp"
 #include "queue.hpp"
+#include "threadpool.hpp"
 
 namespace native_cpu {
 struct NDRDescT {
@@ -37,9 +39,29 @@ struct NDRDescT {
       GlobalOffset[I] = 0;
     }
   }
+
+  void dump(std::ostream &os) const {
+    os << "GlobalSize: " << GlobalSize[0] << " " << GlobalSize[1] << " "
+       << GlobalSize[2] << "\n";
+    os << "LocalSize: " << LocalSize[0] << " " << LocalSize[1] << " "
+       << LocalSize[2] << "\n";
+    os << "GlobalOffset: " << GlobalOffset[0] << " " << GlobalOffset[1] << " "
+       << GlobalOffset[2] << "\n";
+  }
 };
 } // namespace native_cpu
 
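+// Helper for the NATIVECPU_USE_OCK path below: builds a native_cpu::state whose
+// dimension-0 local size is itemsPerThread, so one scheduled task can cover a
+// chunk of the dimension-0 range instead of a single work-item.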
+#ifdef NATIVECPU_USE_OCK
+static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
+                                         size_t itemsPerThread) {
+  native_cpu::state resized_state(
+      ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
+      ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
+      ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
+  return resized_state;
+}
+#endif
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -61,38 +83,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
 
   // TODO: add proper error checking
   // TODO: add proper event dep management
-  native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize);
-  auto &tp = hQueue->device->tp;
+  native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
+                           pLocalWorkSize);
+  auto &tp = hQueue->device->tp;
   const size_t numParallelThreads = tp.num_threads();
   hKernel->updateMemPool(numParallelThreads);
   std::vector<std::future<void>> futures;
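+  // Per-work-group closures; only used on the NATIVECPU_USE_OCK nd_range path
+  // below, where each closure is invoked as (threadId, kernel).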
+  std::vector<std::function<void(size_t, ur_kernel_handle_t_)>> groups;
   auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
   auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
   auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
-  bool isLocalSizeOne =
-      ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
-
-
   native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
                           ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
                           ndr.LocalSize[2], ndr.GlobalOffset[0],
                           ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
-  if (isLocalSizeOne) {
-    // If the local size is one, we make the assumption that we are running a
-    // parallel_for over a sycl::range Todo: we could add compiler checks and
-    // kernel properties for this (e.g. check that no barriers are called, no
-    // local memory args).
-
-    auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
-    auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
-    auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
+#ifndef NATIVECPU_USE_OCK
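+  // Without the OCK the kernel runs inline on the calling thread: local
+  // arguments are laid out for a single "thread" (index 0) and every
+  // work-group and work-item is executed in order by the loops below.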
+  hKernel->handleLocalArgs(1, 0);
   for (unsigned g2 = 0; g2 < numWG2; g2++) {
     for (unsigned g1 = 0; g1 < numWG1; g1++) {
       for (unsigned g0 = 0; g0 < numWG0; g0++) {
-#ifdef NATIVECPU_USE_OCK
-        state.update(g0, g1, g2);
-        hKernel->_subhandler(hKernel->_args.data(), &state);
-#else
         for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
           for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
             for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
@@ -101,13 +110,118 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
             }
           }
         }
-#endif
+      }
+    }
+  }
+#else
+  bool isLocalSizeOne =
+      ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
+  if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads) {
+    // If the local size is one, we make the assumption that we are running a
+    // parallel_for over a sycl::range.
+    // Todo: we could add compiler checks and
+    // kernel properties for this (e.g. check that no barriers are called, no
+    // local memory args).
+
+    // Todo: this assumes that dim 0 is the best dimension over which we want to
+    // parallelize
+
+    // Since we also vectorize the kernel, and vectorization happens within the
+    // work group loop, it's better to have a large-ish local size. We can
+    // divide the global range by the number of threads, set that as the local
+    // size and peel everything else.
+
+    size_t new_num_work_groups_0 = numParallelThreads;
+    size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;
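+    // Hypothetical example: GlobalSize[0] == 1002 with 8 threads gives
+    // itemsPerThread == 125; the first loop below schedules 8 resized
+    // work-groups of 125 items each, and the peeling loop covers the
+    // remaining 2 work-groups (g0 == 1000 and 1001).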
+
+    for (unsigned g2 = 0; g2 < numWG2; g2++) {
+      for (unsigned g1 = 0; g1 < numWG1; g1++) {
+        for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
+          futures.emplace_back(
+              tp.schedule_task([&ndr = std::as_const(ndr), itemsPerThread,
+                                hKernel, g0, g1, g2](size_t) {
+                native_cpu::state resized_state =
+                    getResizedState(ndr, itemsPerThread);
+                resized_state.update(g0, g1, g2);
+                hKernel->_subhandler(hKernel->_args.data(), &resized_state);
+              }));
+        }
+        // Peel the remaining work items. Since the local size is 1, we iterate
+        // over the work groups.
+        for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
+             g0++) {
+          state.update(g0, g1, g2);
+          hKernel->_subhandler(hKernel->_args.data(), &state);
+        }
+      }
+    }
+
+  } else {
+    // We are running a parallel_for over an nd_range
+
+    if (numWG1 * numWG2 >= numParallelThreads) {
+      // Dimensions 1 and 2 have enough work, split them across the threadpool
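+      // Note: each task captures `state` and a copy of the kernel by value
+      // (the lambda is mutable), so per-thread local-argument handling and
+      // state updates act on task-local copies.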
+      for (unsigned g2 = 0; g2 < numWG2; g2++) {
+        for (unsigned g1 = 0; g1 < numWG1; g1++) {
+          futures.emplace_back(
+              tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2,
+                                numParallelThreads](size_t threadId) mutable {
+                for (unsigned g0 = 0; g0 < numWG0; g0++) {
+                  kernel.handleLocalArgs(numParallelThreads, threadId);
+                  state.update(g0, g1, g2);
+                  kernel._subhandler(kernel._args.data(), &state);
+                }
+              }));
+        }
+      }
+    } else {
+      // Split dimension 0 across the threadpool
+      // Here we try to create groups of workgroups in order to reduce
+      // synchronization overhead
+      for (unsigned g2 = 0; g2 < numWG2; g2++) {
+        for (unsigned g1 = 0; g1 < numWG1; g1++) {
+          for (unsigned g0 = 0; g0 < numWG0; g0++) {
+            groups.push_back(
+                [state, g0, g1, g2, numParallelThreads](
+                    size_t threadId, ur_kernel_handle_t_ kernel) mutable {
+                  kernel.handleLocalArgs(numParallelThreads, threadId);
+                  state.update(g0, g1, g2);
+                  kernel._subhandler(kernel._args.data(), &state);
+                });
+          }
+        }
+      }
+      auto numGroups = groups.size();
+      auto groupsPerThread = numGroups / numParallelThreads;
+      auto remainder = numGroups % numParallelThreads;
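+      // Hypothetical example: 10 work-groups on 4 threads gives
+      // groupsPerThread == 2 and remainder == 2; the loop below schedules
+      // 4 tasks of 2 groups each, and the extra task afterwards runs the
+      // remaining 2 groups.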
+      for (unsigned thread = 0; thread < numParallelThreads; thread++) {
+        futures.emplace_back(tp.schedule_task(
+            [&groups, thread, groupsPerThread, hKernel](size_t threadId) {
+              for (unsigned i = 0; i < groupsPerThread; i++) {
+                auto index = thread * groupsPerThread + i;
+                groups[index](threadId, *hKernel);
+              }
+            }));
+      }
+
+      // schedule the remaining tasks
+      if (remainder) {
+        futures.emplace_back(
+            tp.schedule_task([&groups, remainder,
+                              scheduled = numParallelThreads * groupsPerThread,
+                              hKernel](size_t threadId) {
+              for (unsigned i = 0; i < remainder; i++) {
+                auto index = scheduled + i;
+                groups[index](threadId, *hKernel);
+              }
+            }));
+      }
       }
     }
   }
 
   for (auto &f : futures)
     f.get();
+#endif // NATIVECPU_USE_OCK
   // TODO: we should avoid calling clear here by avoiding using push_back
   // in setKernelArgs.
   hKernel->_args.clear();
@@ -553,4 +667,3 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
 
   DIE_NO_IMPLEMENTATION;
 }
-