File tree Expand file tree Collapse file tree 1 file changed +4
-1
lines changed
dpctl/tensor/libtensor/include/utils Expand file tree Collapse file tree 1 file changed +4
-1
lines changed Original file line number Diff line number Diff line change @@ -166,7 +166,10 @@ T custom_reduce_over_group(const GroupT &wg,
166166 const T &local_val,
167167 const OpT &op)
168168{
169+ // low_sz was tuned experimentally for the best runtime on Iris Xe and
170+ // Arc A140V integrated Intel GPUs, and on the discrete Intel Max GPU.
169171 constexpr std::uint32_t low_sz = 8u ;
172+ // maximal work-group size
170173 constexpr std::uint32_t high_sz = 1024u ;
171174 const std::uint32_t wgs = wg.get_local_linear_range ();
172175 const std::uint32_t lid = wg.get_local_linear_id ();
@@ -192,7 +195,7 @@ T custom_reduce_over_group(const GroupT &wg,
192195#pragma unroll
193196 for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1 ) {
194197 if (n_witems >= sz) {
195- n_witems = (n_witems + 1 ) >> 1 ;
198+ n_witems >>= 1 ;
196199 _fold (local_mem_acc, lid, n_witems, op);
197200 sycl::group_barrier (wg, sycl::memory_scope::work_group);
198201 }
You can’t perform that action at this time.
0 commit comments