 //===----------- enqueue.cpp - NATIVE CPU Adapter -------------------------===//
 //
-// Copyright (C) 2023 Intel Corporation
-//
-// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
-// Exceptions. See LICENSE.TXT
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 #include <array>
+#include <cstddef>
 #include <cstdint>
+#include <vector>

 #include "ur_api.h"

 #include "common.hpp"
 #include "kernel.hpp"
 #include "memory.hpp"
+#include "queue.hpp"
+#include "threadpool.hpp"

 namespace native_cpu {
 struct NDRDescT {
@@ -37,9 +39,29 @@ struct NDRDescT {
3739 GlobalOffset[I] = 0 ;
3840 }
3941 }
42+
43+ void dump (std::ostream &os) const {
44+ os << " GlobalSize: " << GlobalSize[0 ] << " " << GlobalSize[1 ] << " "
45+ << GlobalSize[2 ] << " \n " ;
46+ os << " LocalSize: " << LocalSize[0 ] << " " << LocalSize[1 ] << " "
47+ << LocalSize[2 ] << " \n " ;
48+ os << " GlobalOffset: " << GlobalOffset[0 ] << " " << GlobalOffset[1 ] << " "
49+ << GlobalOffset[2 ] << " \n " ;
50+ }
4051};
4152} // namespace native_cpu
4253
54+ #ifdef NATIVECPU_USE_OCK
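+// Builds a state identical to the one constructed in urEnqueueKernelLaunch,
+// except that the dim-0 local size is replaced by itemsPerThread, so each
+// scheduled task processes a contiguous chunk of itemsPerThread work items.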
+static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
+                                         size_t itemsPerThread) {
+  native_cpu::state resized_state(
+      ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
+      ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
+      ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
+  return resized_state;
+}
+#endif
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -63,23 +85,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // TODO: add proper event dep management
   native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
                            pLocalWorkSize);
-  hKernel->handleLocalArgs();
-
+  auto &tp = hQueue->device->tp;
+  const size_t numParallelThreads = tp.num_threads();
+  hKernel->updateMemPool(numParallelThreads);
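+  // updateMemPool presumably reserves per-thread storage for work-group-local
+  // memory, one slice per worker, matching the
+  // handleLocalArgs(numParallelThreads, threadId) calls below.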
+  std::vector<std::future<void>> futures;
+  std::vector<std::function<void(size_t, ur_kernel_handle_t_)>> groups;
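+  // `futures` collects the threadpool's completion handles; `groups` is only
+  // used on the nd_range path below to batch per-work-group closures before
+  // distributing them across threads.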
+  auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
+  auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
+  auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
   native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
                           ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
                           ndr.LocalSize[2], ndr.GlobalOffset[0],
                           ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
-
-  auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
-  auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
-  auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
+#ifndef NATIVECPU_USE_OCK
+  hKernel->handleLocalArgs(1, 0);
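+  // Without the OCK, the kernel runs serially, so local arguments are set up
+  // once for a single worker with thread index 0.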
   for (unsigned g2 = 0; g2 < numWG2; g2++) {
     for (unsigned g1 = 0; g1 < numWG1; g1++) {
       for (unsigned g0 = 0; g0 < numWG0; g0++) {
-#ifdef NATIVECPU_USE_OCK
-        state.update(g0, g1, g2);
-        hKernel->_subhandler(hKernel->_args.data(), &state);
-#else
         for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
           for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
             for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
@@ -88,10 +110,118 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
             }
           }
         }
-#endif
       }
     }
   }
+#else
+  bool isLocalSizeOne =
+      ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
+  if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads) {
+    // If the local size is one, we make the assumption that we are running a
+    // parallel_for over a sycl::range.
+    // TODO: we could add compiler checks and kernel properties for this
+    // (e.g. check that no barriers are called and that there are no local
+    // memory arguments).
+
+    // TODO: this assumes that dim 0 is the best dimension over which we want
+    // to parallelize.
+
+    // Since we also vectorize the kernel, and vectorization happens within the
+    // work-group loop, it's better to have a large-ish local size. We can
+    // divide the global range by the number of threads, set that as the local
+    // size, and peel everything else.
+
+    size_t new_num_work_groups_0 = numParallelThreads;
+    size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;
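+    // Hypothetical example: with GlobalSize[0] == 1030 and 8 worker threads,
+    // itemsPerThread == 128, so the 8 scheduled tasks cover work items
+    // 0..1023 and the peeling loop below handles the remaining 1024..1029.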
+
+    for (unsigned g2 = 0; g2 < numWG2; g2++) {
+      for (unsigned g1 = 0; g1 < numWG1; g1++) {
+        for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
+          futures.emplace_back(
+              tp.schedule_task([&ndr = std::as_const(ndr), itemsPerThread,
+                                hKernel, g0, g1, g2](size_t) {
+                native_cpu::state resized_state =
+                    getResizedState(ndr, itemsPerThread);
+                resized_state.update(g0, g1, g2);
+                hKernel->_subhandler(hKernel->_args.data(), &resized_state);
+              }));
+        }
+        // Peel the remaining work items. Since the local size is 1, we iterate
+        // over the work groups.
+        for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
+             g0++) {
+          state.update(g0, g1, g2);
+          hKernel->_subhandler(hKernel->_args.data(), &state);
+        }
+      }
+    }
+
+  } else {
+    // We are running a parallel_for over an nd_range.
+
+    if (numWG1 * numWG2 >= numParallelThreads) {
+      // Dimensions 1 and 2 have enough work; split them across the threadpool.
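+      // Each task captures the kernel by value (kernel = *hKernel), presumably
+      // so that per-thread handleLocalArgs updates don't interfere across
+      // workers.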
+      for (unsigned g2 = 0; g2 < numWG2; g2++) {
+        for (unsigned g1 = 0; g1 < numWG1; g1++) {
+          futures.emplace_back(
+              tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2,
+                                numParallelThreads](size_t threadId) mutable {
+                for (unsigned g0 = 0; g0 < numWG0; g0++) {
+                  kernel.handleLocalArgs(numParallelThreads, threadId);
+                  state.update(g0, g1, g2);
+                  kernel._subhandler(kernel._args.data(), &state);
+                }
+              }));
+        }
+      }
+    } else {
+      // Split dimension 0 across the threadpool.
+      // Here we try to create groups of workgroups in order to reduce
+      // synchronization overhead.
+      for (unsigned g2 = 0; g2 < numWG2; g2++) {
+        for (unsigned g1 = 0; g1 < numWG1; g1++) {
+          for (unsigned g0 = 0; g0 < numWG0; g0++) {
+            groups.push_back(
+                [state, g0, g1, g2, numParallelThreads](
+                    size_t threadId, ur_kernel_handle_t_ kernel) mutable {
+                  kernel.handleLocalArgs(numParallelThreads, threadId);
+                  state.update(g0, g1, g2);
+                  kernel._subhandler(kernel._args.data(), &state);
+                });
+          }
+        }
+      }
+      auto numGroups = groups.size();
+      auto groupsPerThread = numGroups / numParallelThreads;
+      auto remainder = numGroups % numParallelThreads;
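+      // Hypothetical example: 100 work groups on 8 threads give
+      // groupsPerThread == 12 and remainder == 4; the loop below schedules 8
+      // tasks of 12 groups each, and one extra task runs the final 4 groups.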
+      for (unsigned thread = 0; thread < numParallelThreads; thread++) {
+        futures.emplace_back(tp.schedule_task(
+            [&groups, thread, groupsPerThread, hKernel](size_t threadId) {
+              for (unsigned i = 0; i < groupsPerThread; i++) {
+                auto index = thread * groupsPerThread + i;
+                groups[index](threadId, *hKernel);
+              }
+            }));
+      }
+
+      // schedule the remaining tasks
+      if (remainder) {
+        futures.emplace_back(
+            tp.schedule_task([&groups, remainder,
+                              scheduled = numParallelThreads * groupsPerThread,
+                              hKernel](size_t threadId) {
+              for (unsigned i = 0; i < remainder; i++) {
+                auto index = scheduled + i;
+                groups[index](threadId, *hKernel);
+              }
+            }));
+      }
+    }
+  }
+
+  for (auto &f : futures)
+    f.get();
+#endif // NATIVECPU_USE_OCK
   // TODO: we should avoid calling clear here by avoiding using push_back
   // in setKernelArgs.
   hKernel->_args.clear();