Fixed add_contig_impl and add_matrix_vector_broadcasting_contig_impl

oleksandr-pavlyk · oleksandr-pavlyk · commit d7b42382390a · 2023-05-09T09:43:39.000-05:00
Corrected/added checks for the validity of sub-group reads/writes.

Added the -fno-finite-math-only flag, alongside the existing
-fno-approx-func flag, when compiling element-wise functions.

Fixed the test_cos_order test to account for NumPy using float16 for intermediate
computations on inputs of type "i1", whereas the CPU runtime does not support float16.
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
@@ -53,7 +53,7 @@ if (WIN32)
 endif()
 set_source_files_properties(
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
-  PROPERTIES COMPILE_OPTIONS "${_clang_prefx}-fno-approx-func")
+  PROPERTIES COMPILE_OPTIONS "${_clang_prefx}-fno-approx-func;${_clang_prefx}-fno-finite-math-only")
 target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
 target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
 if(UNIX)
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -66,7 +66,8 @@ struct AddContigFunctor
                           (ndit.get_group(0) * ndit.get_local_range(0) +
                            sg.get_group_id()[0] * maxsgSize);
 
-            if (base + n_vecs * vec_sz < nelems_) {
+            if ((base + n_vecs * vec_sz * sgSize < nelems_) &&
+                (sgSize == maxsgSize)) {
                 using in_ptrT1 =
                     sycl::multi_ptr<const argT1,
                                     sycl::access::address_space::global_space>;
@@ -428,7 +429,8 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl(
         cgh.depends_on(make_padded_vec_ev);
 
         auto lwsRange = sycl::range<1>(lws);
-        size_t n_groups = (n0 * n1 + lws - 1) / lws;
+        size_t n_elems = n0 * n1;
+        size_t n_groups = (n_elems + lws - 1) / lws;
         auto gwsRange = sycl::range<1>(n_groups * lws);
 
             cgh.parallel_for<class add_matrix_vector_broadcast_sg_krn<argT1, argT2, resT>>(
@@ -438,24 +440,31 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl(
                 auto sg = ndit.get_sub_group();
                 size_t gid = ndit.get_global_linear_id();
 
+                std::uint8_t sgSize = sg.get_local_range()[0];
                 size_t base = gid - sg.get_local_id()[0];
 
-                using in_ptrT1 =
-                    sycl::multi_ptr<const argT1,
-                                    sycl::access::address_space::global_space>;
-                using in_ptrT2 =
-                    sycl::multi_ptr<const argT2,
-                                    sycl::access::address_space::global_space>;
-                using res_ptrT =
-                    sycl::multi_ptr<resT,
-                                    sycl::access::address_space::global_space>;
+                if (base + sgSize < n_elems) {
+                    using in_ptrT1 = sycl::multi_ptr<
+                        const argT1, sycl::access::address_space::global_space>;
+                    using in_ptrT2 = sycl::multi_ptr<
+                        const argT2, sycl::access::address_space::global_space>;
+                    using res_ptrT = sycl::multi_ptr<
+                        resT, sycl::access::address_space::global_space>;
 
-                const argT1 mat_el = sg.load(in_ptrT1(&mat[base]));
-                const argT2 vec_el = sg.load(in_ptrT2(&padded_vec[base % n1]));
+                    const argT1 mat_el = sg.load(in_ptrT1(&mat[base]));
+                    const argT2 vec_el =
+                        sg.load(in_ptrT2(&padded_vec[base % n1]));
 
-                resT res_el = mat_el + vec_el;
+                    resT res_el = mat_el + vec_el;
 
-                sg.store(res_ptrT(&res[base]), res_el);
+                    sg.store(res_ptrT(&res[base]), res_el);
+                }
+                else {
+                    for (size_t k = base + sg.get_local_id()[0]; k < n_elems;
+                         k += sgSize) {
+                        res[k] = mat[k] + padded_vec[k % n1];
+                    }
+                }
                 }
             );
     });
diff --git a/dpctl/tests/test_tensor_elementwise.py b/dpctl/tests/test_tensor_elementwise.py
@@ -290,7 +290,9 @@ def test_cos_usm_type(usm_type):
     expected_Y = np.empty(input_shape, dtype=arg_dt)
     expected_Y[..., 0::2] = np.cos(np.float32(np.pi / 6))
     expected_Y[..., 1::2] = np.cos(np.float32(np.pi / 3))
-    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -309,7 +311,13 @@ def test_cos_order(dtype):
             U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
             Y = dpt.cos(U, order=ord)
             expected_Y = np.cos(dpt.asnumpy(U))
-            assert np.allclose(dpt.asnumpy(Y), expected_Y)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            np.testing.assert_allclose(
+                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
+            )
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)