@@ -49,44 +49,21 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
   return WGSize;
 }

-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-// Inline this helper:
-#endif
-uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) {
+// Returns the estimated number of physical threads on the device associated
+// with the given queue.
+__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) {
+  const device_impl &Dev = getSyclObjImpl(cgh)->get_device();
   uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
   // TODO: The heuristics here require additional tuning for various devices
   // and vendors. Also, it would be better to check vendor/generation/etc.
   if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
     NumThreads *= 8;
   return NumThreads;
 }
-// Returns the estimated number of physical threads on the device associated
-// with the given queue.
-__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) {
-  return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device());
-}

-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
-    std::shared_ptr<sycl::detail::queue_impl> Queue) {
-  // TODO: Graphs extension explicit API uses a handler with no queue attached,
-  // so return some value here. In the future we should have access to the
-  // device so can remove this.
-  //
-  // The 8 value was chosen as the hardcoded value as it is the returned
-  // value for sycl::info::device::max_compute_units on
-  // Intel HD Graphics devices used as a L0 backend during development.
-  if (Queue == nullptr) {
-    return 8;
-  }
-  return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl());
-}
-#endif
-
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-// Inline this helper:
-#endif
-size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) {
+__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
+                                      size_t LocalMemBytesPerWorkItem) {
+  const device_impl &Dev = getSyclObjImpl(cgh)->get_device();
   size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();

   size_t WGSizePerMem = MaxWGSize * 2;
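For context, the handler-based reduGetMaxNumConcurrentWorkGroups above keeps the pre-existing heuristic and only changes how the device_impl is obtained. A minimal standalone sketch of that heuristic follows; the function name and the example device figures are illustrative, not taken from this patch:

#include <cstdint>

// Mirror of the heuristic in reduGetMaxNumConcurrentWorkGroups: start from
// max_compute_units and assume roughly 8 concurrent work-groups per compute
// unit on GPUs with host-unified (integrated) memory.
uint32_t estimateMaxConcurrentWorkGroups(uint32_t MaxComputeUnits, bool IsGpu,
                                         bool HasHostUnifiedMemory) {
  uint32_t NumThreads = MaxComputeUnits;
  if (IsGpu && HasHostUnifiedMemory)
    NumThreads *= 8;
  return NumThreads;
}

// For example: a discrete GPU reporting 512 compute units yields 512, while an
// integrated GPU reporting 24 compute units with host-unified memory yields
// 24 * 8 = 192.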
@@ -123,24 +100,9 @@ size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) {

   return WGSize;
 }
-__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
-                                      size_t LocalMemBytesPerWorkItem) {
-  return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(),
-                          LocalMemBytesPerWorkItem);
-}
-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT
-size_t reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
-                        size_t LocalMemBytesPerWorkItem) {
-  return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem);
-}
-#endif

-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-// Inline this helper:
-#endif
-size_t reduGetPreferredWGSize(device_impl &Dev,
-                              size_t LocalMemBytesPerWorkItem) {
+__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
+                                            size_t LocalMemBytesPerWorkItem) {
   // The maximum WGSize returned by CPU devices is very large and does not
   // help the reduction implementation: since all work associated with a
   // work-group is typically assigned to one CPU thread, selecting a large
@@ -150,6 +112,7 @@ size_t reduGetPreferredWGSize(device_impl &Dev,
   // behavior.
   using PrefWGConfig = sycl::detail::SYCLConfig<
       sycl::detail::SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE>;
+  const device_impl &Dev = getSyclObjImpl(cgh)->get_device();
   if (Dev.is_cpu()) {
     size_t CPUMaxWGSize = PrefWGConfig::get(sycl::info::device_type::cpu);
     if (CPUMaxWGSize == 0)
@@ -177,46 +140,8 @@ size_t reduGetPreferredWGSize(device_impl &Dev,
   }

   // Use the maximum work-group size otherwise.
-  return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem);
-}
-__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
-                                            size_t LocalMemBytesPerWorkItem) {
-  return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(),
-                                LocalMemBytesPerWorkItem);
-}
-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
-                                            size_t LocalMemBytesPerWorkItem) {
-  // TODO: Graphs extension explicit API uses a handler with a null queue to
-  // process CGFs, in future we should have access to the device so we can
-  // correctly calculate this.
-  //
-  // The 32 value was chosen as the hardcoded value as it is the returned
-  // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
-  // Intel HD Graphics devices used as a L0 backend during development.
-  if (Queue == nullptr) {
-    return 32;
-  }
-  device_impl &Dev = Queue->getDeviceImpl();
-
-  return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem);
-}
-#endif
-
-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT void
-addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
-               std::shared_ptr<int> &Counter) {
-  auto EventImpl = detail::event_impl::create_device_event(*Queue);
-  EventImpl->setContextImpl(Queue->getContextImpl());
-  EventImpl->setStateIncomplete();
-  ur_event_handle_t UREvent = nullptr;
-  MemoryManager::fill_usm(Counter.get(), *Queue, sizeof(int), {0}, {},
-                          &UREvent);
-  EventImpl->setHandle(UREvent);
-  CGH.depends_on(createSyclObjFromImpl<event>(EventImpl));
+  return reduGetMaxWGSize(cgh, LocalMemBytesPerWorkItem);
 }
-#endif

 __SYCL_EXPORT void verifyReductionProps(const property_list &Props) {
   auto CheckDataLessProperties = [](int PropertyKind) {
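With the queue_impl overloads and addCounterInit gone, the handler-based overloads above are the remaining exported entry points. A hypothetical call site, assuming the helpers are reachable as sycl::detail::reduGet* (the wrapper function below and that namespace qualification are assumptions, not shown in this diff):

#include <cstdint>
#include <sycl/sycl.hpp>

// Hypothetical sketch: size a reduction from inside a command-group function
// using only the handler. Not part of the patch.
static void sizeReduction(sycl::handler &CGH, size_t LocalMemBytesPerWorkItem) {
  uint32_t MaxConcurrentWGs =
      sycl::detail::reduGetMaxNumConcurrentWorkGroups(CGH);
  size_t MaxWGSize =
      sycl::detail::reduGetMaxWGSize(CGH, LocalMemBytesPerWorkItem);
  size_t PrefWGSize =
      sycl::detail::reduGetPreferredWGSize(CGH, LocalMemBytesPerWorkItem);
  // A real caller would use these values to shape the reduction's nd_range;
  // the casts below only silence unused-variable warnings in this sketch.
  (void)MaxConcurrentWGs;
  (void)MaxWGSize;
  (void)PrefWGSize;
}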