
Commit 63c70a1

[NativeCPU] Simplify enqueue. (#19550)
We were creating excessive numbers of threads. Since we know how many threads we want, we can instead divide the number of workgroups by the number of threads and have each thread process that many workgroups. This implementation also means we no longer need to resize workgroups, which was not generally safe.
1 parent a70552b commit 63c70a1
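
To make the new scheduling concrete, here is a minimal standalone sketch of the partitioning scheme the diff below introduces: the linearized workgroup count is split into contiguous per-thread ranges, the remainder is spread one extra workgroup over the first few threads, and each range end is decomposed back into (g0, g1, g2). The workgroup shape, thread count and printf reporting are illustrative assumptions, not adapter code.

// Sketch only: mirrors the range-splitting logic of the new enqueue loop.
#include <array>
#include <cstddef>
#include <cstdio>

int main() {
  // Hypothetical example values; the real code gets these from the ND-range
  // and the thread pool.
  const std::size_t numWG0 = 5, numWG1 = 3, numWG2 = 2;
  const std::size_t numParallelThreads = 4;

  const std::size_t numWG = numWG0 * numWG1 * numWG2;
  const std::size_t numWGPerThread = numWG / numParallelThreads;
  const std::size_t remainderWG = numWG - numWGPerThread * numParallelThreads;

  // Fourth element is the linearized workgroup index.
  std::array<std::size_t, 4> rangeStart = {0, 0, 0, 0};
  for (std::size_t t = 0; t < numParallelThreads; ++t) {
    auto rangeEnd = rangeStart;
    // The first remainderWG threads take one extra workgroup each.
    rangeEnd[3] += numWGPerThread + (t < remainderWG ? 1 : 0);
    if (rangeEnd[3] == rangeStart[3])
      break; // more threads than workgroups: nothing left to hand out
    // Decompose the linearized end index back into (g0, g1, g2).
    rangeEnd[0] = rangeEnd[3] % numWG0;
    rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
    rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
    std::printf("thread %zu: linearized workgroups [%zu, %zu)\n", t,
                rangeStart[3], rangeEnd[3]);
    rangeStart = rangeEnd;
  }
  return 0;
}

With the example values (30 workgroups, 4 threads) this prints ranges of 8, 8, 7 and 7 workgroups, so every thread is scheduled exactly once instead of one task per workgroup.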

unified-runtime/source/adapters/native_cpu/enqueue.cpp

Lines changed: 57 additions & 128 deletions
@@ -52,17 +52,6 @@ struct NDRDescT {
 };
 } // namespace native_cpu
 
-#ifdef NATIVECPU_USE_OCK
-static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
-                                         size_t itemsPerThread) {
-  native_cpu::state resized_state(
-      ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
-      ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
-      ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
-  return resized_state;
-}
-#endif
-
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -112,6 +101,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // TODO: add proper error checking
   native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
                            pLocalWorkSize);
+  unsigned long long numWI;
+  auto umulll_overflow = [](unsigned long long a, unsigned long long b,
+                            unsigned long long *c) -> bool {
+#ifdef __GNUC__
+    return __builtin_umulll_overflow(a, b, c);
+#else
+    *c = a * b;
+    return a != 0 && b != *c / a;
+#endif
+  };
+  if (umulll_overflow(ndr.GlobalSize[0], ndr.GlobalSize[1], &numWI) ||
+      umulll_overflow(numWI, ndr.GlobalSize[2], &numWI) || numWI > SIZE_MAX) {
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+
   auto &tp = hQueue->getDevice()->tp;
   const size_t numParallelThreads = tp.num_threads();
   std::vector<std::future<void>> futures;
@@ -130,131 +134,56 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
   kernel->updateMemPool(numParallelThreads);
 
+  const size_t numWG = numWG0 * numWG1 * numWG2;
+  const size_t numWGPerThread = numWG / numParallelThreads;
+  const size_t remainderWG = numWG - numWGPerThread * numParallelThreads;
+  // The fourth value is the linearized value.
+  std::array<size_t, 4> rangeStart = {0, 0, 0, 0};
+  for (unsigned t = 0; t < numParallelThreads; ++t) {
+    auto rangeEnd = rangeStart;
+    rangeEnd[3] += numWGPerThread + (t < remainderWG);
+    if (rangeEnd[3] == rangeStart[3])
+      break;
+    rangeEnd[0] = rangeEnd[3] % numWG0;
+    rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
+    rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
+    futures.emplace_back(
+        tp.schedule_task([state, &kernel = *kernel, rangeStart,
+                          rangeEnd = rangeEnd[3], numWG0, numWG1,
 #ifndef NATIVECPU_USE_OCK
-  for (unsigned g2 = 0; g2 < numWG2; g2++) {
-    for (unsigned g1 = 0; g1 < numWG1; g1++) {
-      for (unsigned g0 = 0; g0 < numWG0; g0++) {
-        for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
-          for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
-            for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
-              state.update(g0, g1, g2, local0, local1, local2);
-              kernel->_subhandler(kernel->getArgs(1, 0).data(), &state);
-            }
-          }
-        }
-      }
-    }
-  }
+                          localSize = ndr.LocalSize,
+#endif
+                          numParallelThreads](size_t threadId) mutable {
+          for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
+                      g2 = rangeStart[2], g3 = rangeStart[3];
+               g3 < rangeEnd; ++g3) {
+#ifdef NATIVECPU_USE_OCK
+            state.update(g0, g1, g2);
+            kernel._subhandler(
+                kernel.getArgs(numParallelThreads, threadId).data(), &state);
 #else
-  bool isLocalSizeOne =
-      ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
-  if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads &&
-      !kernel->hasLocalArgs()) {
-    // If the local size is one, we make the assumption that we are running a
-    // parallel_for over a sycl::range.
-    // Todo: we could add more compiler checks and
-    // kernel properties for this (e.g. check that no barriers are called).
-
-    // Todo: this assumes that dim 0 is the best dimension over which we want to
-    // parallelize
-
-    // Since we also vectorize the kernel, and vectorization happens within the
-    // work group loop, it's better to have a large-ish local size. We can
-    // divide the global range by the number of threads, set that as the local
-    // size and peel everything else.
-
-    size_t new_num_work_groups_0 = numParallelThreads;
-    size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;
-
-    for (unsigned g2 = 0; g2 < numWG2; g2++) {
-      for (unsigned g1 = 0; g1 < numWG1; g1++) {
-        for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
-          futures.emplace_back(tp.schedule_task(
-              [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) {
-                native_cpu::state resized_state =
-                    getResizedState(ndr, itemsPerThread);
-                resized_state.update(g0, g1, g2);
-                kernel._subhandler(kernel.getArgs().data(), &resized_state);
-              }));
-        }
-        // Peel the remaining work items. Since the local size is 1, we iterate
-        // over the work groups.
-        for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
-             g0++) {
-          state.update(g0, g1, g2);
-          kernel->_subhandler(kernel->getArgs().data(), &state);
-        }
-      }
-    }
-
-  } else {
-    // We are running a parallel_for over an nd_range
-
-    if (numWG1 * numWG2 >= numParallelThreads) {
-      // Dimensions 1 and 2 have enough work, split them across the threadpool
-      for (unsigned g2 = 0; g2 < numWG2; g2++) {
-        for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          futures.emplace_back(
-              tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2,
-                                numParallelThreads](size_t threadId) mutable {
-                for (unsigned g0 = 0; g0 < numWG0; g0++) {
-                  state.update(g0, g1, g2);
+            for (size_t local2 = 0; local2 < localSize[2]; ++local2) {
+              for (size_t local1 = 0; local1 < localSize[1]; ++local1) {
+                for (size_t local0 = 0; local0 < localSize[0]; ++local0) {
+                  state.update(g0, g1, g2, local0, local1, local2);
                   kernel._subhandler(
                       kernel.getArgs(numParallelThreads, threadId).data(),
                       &state);
                 }
-              }));
-        }
-      }
-    } else {
-      // Split dimension 0 across the threadpool
-      // Here we try to create groups of workgroups in order to reduce
-      // synchronization overhead
-      for (unsigned g2 = 0; g2 < numWG2; g2++) {
-        for (unsigned g1 = 0; g1 < numWG1; g1++) {
-          for (unsigned g0 = 0; g0 < numWG0; g0++) {
-            groups.push_back([state, g0, g1, g2, numParallelThreads](
-                                 size_t threadId,
-                                 ur_kernel_handle_t_ &kernel) mutable {
-              state.update(g0, g1, g2);
-              kernel._subhandler(
-                  kernel.getArgs(numParallelThreads, threadId).data(), &state);
-            });
-          }
-        }
-      }
-      auto numGroups = groups.size();
-      auto groupsPerThread = numGroups / numParallelThreads;
-      if (groupsPerThread) {
-        for (unsigned thread = 0; thread < numParallelThreads; thread++) {
-          futures.emplace_back(
-              tp.schedule_task([groups, thread, groupsPerThread,
-                                &kernel = *kernel](size_t threadId) {
-                for (unsigned i = 0; i < groupsPerThread; i++) {
-                  auto index = thread * groupsPerThread + i;
-                  groups[index](threadId, kernel);
-                }
-              }));
-        }
-      }
-
-      // schedule the remaining tasks
-      auto remainder = numGroups % numParallelThreads;
-      if (remainder) {
-        futures.emplace_back(
-            tp.schedule_task([groups, remainder,
-                              scheduled = numParallelThreads * groupsPerThread,
-                              &kernel = *kernel](size_t threadId) {
-              for (unsigned i = 0; i < remainder; i++) {
-                auto index = scheduled + i;
-                groups[index](threadId, kernel);
               }
-            }));
-      }
-    }
+            }
+#endif
+            if (++g0 == numWG0) {
+              g0 = 0;
+              if (++g1 == numWG1) {
+                g1 = 0;
+                ++g2;
+              }
+            }
+          }
+        }));
+    rangeStart = rangeEnd;
   }
-
-#endif // NATIVECPU_USE_OCK
   event->set_futures(futures);
 
   if (phEvent) {
