
Commit 37c4fbf

Merge branch 'master' into diff-count-nonzero-array-api

2 parents: eb42463 + 7b64374

10 files changed: +163 −34 lines

.github/workflows/openssf-scorecard.yml

Lines changed: 2 additions & 2 deletions

```diff
@@ -38,7 +38,7 @@ jobs:
           persist-credentials: false

       - name: "Run analysis"
-        uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3
+        uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0
         with:
           results_file: results.sarif
           results_format: sarif
@@ -68,6 +68,6 @@ jobs:

       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@2d790406f505036ef40ecba973cc774a50395aac # v3.25.13
+        uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a # v3.25.15
         with:
           sarif_file: results.sarif
```

.github/workflows/os-llvm-sycl-build.yml

Lines changed: 2 additions & 7 deletions

```diff
@@ -15,7 +15,6 @@ jobs:
       DOWNLOAD_URL_PREFIX: https://github.com/intel/llvm/releases/download
       DRIVER_PATH: 2023-WW27
       OCLCPUEXP_FN: oclcpuexp-2023.16.6.0.28_rel.tar.gz
-      FPGAEMU_FN: fpgaemu-2023.16.6.0.28_rel.tar.gz
       TBB_URL: https://github.com/oneapi-src/oneTBB/releases/download/v2021.9.0/
       TBB_INSTALL_DIR: oneapi-tbb-2021.9.0
       TBB_FN: oneapi-tbb-2021.9.0-lin.tgz
@@ -83,15 +82,12 @@ jobs:
             rm -rf ${ARTIFACT_NAME}.tar.gz
             wget ${DOWNLOAD_URL_PREFIX}/${DEPLOY_NIGHTLY_TAG}/${ARTIFACT_NAME}.tar.gz && echo ${DEPLOY_LLVM_TAG_SHA} > bundle_id.txt || rm -rf bundle_id.txt
             [ -f ${OCLCPUEXP_FN} ] || wget ${DOWNLOAD_URL_PREFIX}/${DRIVER_PATH}/${OCLCPUEXP_FN} || rm -rf bundle_id.txt
-            [ -f ${FPGAEMU_FN} ] || wget ${DOWNLOAD_URL_PREFIX}/${DRIVER_PATH}/${FPGAEMU_FN} || rm -rf bundle_id.txt
             [ -f ${TBB_FN} ] || wget ${TBB_URL}/${TBB_FN} || rm -rf bundle_id.txt
             rm -rf dpcpp_compiler
             mkdir -p dpcpp_compiler
             tar xf ${ARTIFACT_NAME}.tar.gz -C dpcpp_compiler
             mkdir -p oclcpuexp
-            mkdir -p fpgaemu
             [ -d oclcpuexp/x64 ] || tar xf ${OCLCPUEXP_FN} -C oclcpuexp
-            [ -d fpgaemu/x64 ] || tar xf ${FPGAEMU_FN} -C fpgaemu
             [ -d ${TBB_INSTALL_DIR}/lib ] || tar xf ${TBB_FN}
             cp oclcpuexp/x64/libOpenCL.so* dpcpp_compiler/lib/
           fi
@@ -110,7 +106,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy"<1.26.0" cython setuptools pytest scikit-build cmake ninja versioneer[toml]==0.29
+          pip install numpy cython setuptools pytest scikit-build cmake ninja versioneer[toml]==0.29

      - name: Checkout repo
        uses: actions/[email protected]
@@ -126,10 +122,9 @@ jobs:
           export PATH=${SYCL_BUNDLE_FOLDER}/dpcpp_compiler/bin:${PATH}
           export LD_LIBRARY_PATH=${SYCL_BUNDLE_FOLDER}/dpcpp_compiler/lib:${LD_LIBRARY_PATH}
           export LD_LIBRARY_PATH=${SYCL_BUNDLE_FOLDER}/oclcpuexp/x64:${LD_LIBRARY_PATH}
-          export LD_LIBRARY_PATH=${SYCL_BUNDLE_FOLDER}/fpgaemu/x64:${LD_LIBRARY_PATH}
           export LD_LIBRARY_PATH=${SYCL_BUNDLE_FOLDER}/${TBB_INSTALL_DIR}/lib/intel64/gcc4.8:${LD_LIBRARY_PATH}
           export OCL_ICD_VENDORS=
-          export OCL_ICD_FILENAMES=libintelocl.so:libintelocl_emu.so
+          export OCL_ICD_FILENAMES=libintelocl.so
           EOF
           chmod +x set_allvars.sh
           cat set_allvars.sh
```

dpctl/tensor/_clip.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -295,6 +295,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
         )
     if order not in ["K", "C", "F", "A"]:
         order = "K"
+    if x.dtype.kind in "iu":
+        if isinstance(min, int) and min <= dpt.iinfo(x.dtype).min:
+            min = None
+        if isinstance(max, int) and max >= dpt.iinfo(x.dtype).max:
+            max = None
     if min is None and max is None:
         exec_q = x.sycl_queue
         orig_out = out
```
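
Note: the user-visible effect of this change, mirroring the new `test_clip_gh_1744` test added later in this commit (a minimal sketch, assuming a default-selected device is available):

```python
import dpctl.tensor as dpt

x = dpt.asarray([0, 255], dtype=dpt.uint8)
# Python int bounds at or beyond the uint8 limits are now dropped rather
# than cast to uint8 (where -300 would otherwise wrap around), so this
# out-of-range clip is a no-op
y = dpt.clip(x, -300, 300)
print(dpt.to_numpy(y))  # [  0 255]
```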

dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -635,7 +635,7 @@ void copy_and_cast_from_host_impl(

     // perform explicit synchronization. Implicit synchronization would be
     // performed by sycl::buffer destructor.
-    copy_and_cast_from_host_ev.wait_and_throw();
+    copy_and_cast_from_host_ev.wait();

     return;
 }
```

dpctl/tensor/libtensor/source/accumulators.cpp

Lines changed: 21 additions & 10 deletions

```diff
@@ -160,10 +160,14 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
             ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
             : mask_positions_contig_i64_dispatch_vector[mask_typeid];

-    size_t total_set = fn(exec_q, mask_size, mask_data, cumsum_data,
-                          host_task_events, depends);
+    size_t total_set;
+
     {
         py::gil_scoped_release release;
+
+        total_set = fn(exec_q, mask_size, mask_data, cumsum_data,
+                       host_task_events, depends);
+
         sycl::event::wait(host_task_events);
     }
     return total_set;
@@ -198,12 +202,13 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);

     if (2 * static_cast<size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
-        copy_shape_ev.wait();
         {
             py::gil_scoped_release release;
+
+            copy_shape_ev.wait();
             sycl::event::wait(host_task_events);
+            sycl::free(shape_strides, exec_q);
         }
-        sycl::free(shape_strides, exec_q);
         throw std::runtime_error("Unexpected error");
     }

@@ -213,15 +218,17 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     dependent_events.insert(dependent_events.end(), depends.begin(),
                             depends.end());

-    size_t total_set =
-        strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data,
-                   host_task_events, dependent_events);
+    size_t total_set;

     {
         py::gil_scoped_release release;
+
+        total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides,
+                               cumsum_data, host_task_events, dependent_events);
+
         sycl::event::wait(host_task_events);
+        sycl::free(shape_strides, exec_q);
     }
-    sycl::free(shape_strides, exec_q);

     return total_set;
 }
@@ -352,8 +359,12 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
     sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);

     if (2 * static_cast<size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
-        copy_shape_ev.wait();
-        sycl::event::wait(host_task_events);
+        {
+            py::gil_scoped_release release;
+
+            copy_shape_ev.wait();
+            sycl::event::wait(host_task_events);
+        }
         sycl::free(shape_strides, exec_q);
         throw std::runtime_error("Unexpected error");
     }
```
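
Note: moving the blocking waits inside the `py::gil_scoped_release` scope lets other Python threads, including host tasks that need the interpreter, make progress while this thread blocks on SYCL events. A hypothetical sketch of the multi-threaded usage this protects, assuming boolean-mask extraction routes through `py_mask_positions` (thread count and array size are illustrative):

```python
import threading

import dpctl.tensor as dpt

x = dpt.ones(10**6, dtype=dpt.int32)
mask = x > 0

def extract():
    # each call waits on SYCL events; with the GIL released during the
    # wait, these threads no longer serialize on the interpreter lock
    return x[mask]

threads = [threading.Thread(target=extract) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```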

dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp

Lines changed: 51 additions & 14 deletions

```diff
@@ -116,21 +116,29 @@ void copy_numpy_ndarray_into_usm_ndarray(

     // check for applicability of special cases:
     // (same type && (both C-contiguous || both F-contiguous)
-    bool both_c_contig =
+    const bool both_c_contig =
         ((src_flags & py::array::c_style) && dst.is_c_contiguous());
-    bool both_f_contig =
+    const bool both_f_contig =
         ((src_flags & py::array::f_style) && dst.is_f_contiguous());
+
+    const bool same_data_types = (src_type_id == dst_type_id);
+
     if (both_c_contig || both_f_contig) {
-        if (src_type_id == dst_type_id) {
+        if (same_data_types) {
             int src_elem_size = npy_src.itemsize();

             sycl::event copy_ev =
                 exec_q.memcpy(static_cast<void *>(dst_data),
                               static_cast<const void *>(src_data),
                               src_nelems * src_elem_size, depends);

-            // wait for copy_ev to complete
-            copy_ev.wait_and_throw();
+            {
+                // wait for copy_ev to complete
+                // release GIL to allow other threads (host_tasks)
+                // a chance to acquire GIL
+                py::gil_scoped_release lock{};
+                copy_ev.wait();
+            }

             return;
         }
@@ -202,6 +210,30 @@ void copy_numpy_ndarray_into_usm_ndarray(
         simplified_dst_strides.push_back(1);
     }

+    const bool can_use_memcpy =
+        (same_data_types && (nd == 1) && (src_offset == 0) &&
+         (dst_offset == 0) && (simplified_src_strides[0] == 1) &&
+         (simplified_dst_strides[0] == 1));
+
+    if (can_use_memcpy) {
+        int src_elem_size = npy_src.itemsize();
+
+        sycl::event copy_ev = exec_q.memcpy(
+            static_cast<void *>(dst_data), static_cast<const void *>(src_data),
+            src_nelems * src_elem_size, depends);
+
+        {
+            // wait for copy_ev to complete
+            // release GIL to allow other threads (host_tasks)
+            // a chance to acquire GIL
+            py::gil_scoped_release lock{};
+
+            copy_ev.wait();
+        }
+
+        return;
+    }
+
     // Minimum and maximum element offsets for source np.ndarray
     py::ssize_t npy_src_min_nelem_offset(src_offset);
     py::ssize_t npy_src_max_nelem_offset(src_offset);
@@ -230,17 +262,22 @@ void copy_numpy_ndarray_into_usm_ndarray(
     }
     const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);

-    // Get implementation function pointer
-    auto copy_and_cast_from_host_blocking_fn =
-        copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
-                                                       [src_type_id];
+    {
+        // release GIL for the blocking call
+        py::gil_scoped_release lock{};
+
+        // Get implementation function pointer
+        auto copy_and_cast_from_host_blocking_fn =
+            copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
+                                                           [src_type_id];

-    copy_and_cast_from_host_blocking_fn(
-        exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
-        npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
-        dst_offset, depends, {copy_shape_ev});
+        copy_and_cast_from_host_blocking_fn(
+            exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
+            npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
+            dst_offset, depends, {copy_shape_ev});

-    sycl::free(shape_strides, exec_q);
+        sycl::free(shape_strides, exec_q);
+    }

     return;
 }
```
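
Note: with the new `can_use_memcpy` branch, a same-dtype copy from NumPy takes the single `memcpy` path even for strided inputs, as long as their simplified layouts collapse to one unit-stride dimension. A minimal sketch mirroring the new `test_tofrom_numpy_permuted` test added below (dtype is illustrative):

```python
import numpy as np

import dpctl.tensor as dpt

shape, perm = (3, 5, 7), (1, 2, 0)
# identically permuted arrays are not C-contiguous, but their flat
# layouts match, so stride simplification yields nd == 1 with unit
# strides and the copy reduces to a single memcpy
Xusm = dpt.permute_dims(dpt.zeros(shape, dtype="i4"), perm)
Ynp = np.transpose(np.ones(shape, dtype="i4"), perm)
ind = (slice(None, None, None),) * Ynp.ndim
Xusm[ind] = Ynp
assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
```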

dpctl/tests/test_tensor_clip.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -767,3 +767,11 @@ def test_clip_readonly_out():

     with pytest.raises(ValueError):
         dpt.clip(x, out=r)
+
+
+def test_clip_gh_1744():
+    get_queue_or_skip()
+    x = dpt.asarray([0, 255], dtype=dpt.uint8)
+    y = dpt.clip(x, -300, 300)
+
+    assert dpt.all(x == y)
```

dpctl/tests/test_usm_ndarray_ctor.py

Lines changed: 23 additions & 0 deletions

```diff
@@ -1063,11 +1063,34 @@ def test_tofrom_numpy(shape, dtype, usm_type):
     skip_if_dtype_not_supported(dtype, q)
     Xusm = dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q)
     Ynp = np.ones(shape, dtype=dtype)
+    Ynp[(0,) * len(shape)] = 0
     ind = (slice(None, None, None),) * Ynp.ndim
     Xusm[ind] = Ynp
     assert np.array_equal(dpt.to_numpy(Xusm), Ynp)


+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+def test_tofrom_numpy_permuted(dtype, usm_type):
+    shape = (3, 5, 7)
+    perm = (1, 2, 0)
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    Xusm = dpt.permute_dims(
+        dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q), perm
+    )
+    Ynp = np.transpose(np.ones(shape, dtype=dtype), perm)
+    Ynp[:, ::2, ::2] = 0
+    ind = (slice(None, None, None),) * Ynp.ndim
+    # even though Xusm and Ynp are strided, simple memcpy could be done.
+    # This test validates that it is being done correctly
+    Xusm[ind] = Ynp
+    assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
+
+
 @pytest.mark.parametrize(
     "dtype",
     _all_dtypes,
```

dpctl/utils/_order_manager.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -1,3 +1,4 @@
+import weakref
 from collections import defaultdict
 from contextvars import ContextVar

@@ -88,7 +89,16 @@ def __getitem__(self, q: SyclQueue) -> _SequentialOrderManager:
     def clear(self):
         """Clear content of internal dictionary"""
         _local = self._map.get()
+        for v in _local.values():
+            v.wait()
         _local.clear()


 SequentialOrderManager = SyclQueueToOrderManagerMap()
+
+
+def _callback(som):
+    som.clear()
+
+
+f = weakref.finalize(SequentialOrderManager, _callback, SequentialOrderManager)
```
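
Note: `weakref.finalize` stores strong references to the callback and its arguments, so passing `SequentialOrderManager` as the argument keeps the object alive; the registered `clear` therefore effectively becomes an at-exit hook that waits on outstanding events before interpreter shutdown. A standalone sketch of the idiom (the `Manager` class is illustrative):

```python
import weakref

class Manager:
    def clear(self):
        print("outstanding work drained")

manager = Manager()

# the finalizer's argument tuple keeps `manager` alive, so the callback
# cannot fire early; pending finalizers run at interpreter exit
weakref.finalize(manager, Manager.clear, manager)
```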

libsyclinterface/source/dpctl_sycl_device_interface.cpp

Lines changed: 40 additions & 0 deletions

```diff
@@ -598,6 +598,16 @@ DPCTLDevice_CreateSubDevicesEqually(__dpctl_keep const DPCTLSyclDeviceRef DRef,
         return nullptr;
     }
     auto D = unwrap<device>(DRef);
+    const auto &supported_properties =
+        D->get_info<info::device::partition_properties>();
+    const auto &beg_it = supported_properties.begin();
+    const auto &end_it = supported_properties.end();
+    if (std::find(beg_it, end_it,
+                  info::partition_property::partition_equally) == end_it)
+    {
+        // device does not support partition equally
+        return nullptr;
+    }
     try {
         auto subDevices = D->create_sub_devices<
             info::partition_property::partition_equally>(count);
@@ -631,6 +641,16 @@ DPCTLDevice_CreateSubDevicesByCounts(__dpctl_keep const DPCTLSyclDeviceRef DRef,
     }
     if (DRef) {
         auto D = unwrap<device>(DRef);
+        const auto &supported_properties =
+            D->get_info<info::device::partition_properties>();
+        const auto &beg_it = supported_properties.begin();
+        const auto &end_it = supported_properties.end();
+        if (std::find(beg_it, end_it,
+                      info::partition_property::partition_by_counts) == end_it)
+        {
+            // device does not support partition by counts
+            return nullptr;
+        }
         std::vector<std::remove_pointer<decltype(D)>::type> subDevices;
         try {
             subDevices = D->create_sub_devices<
@@ -661,9 +681,29 @@ __dpctl_give DPCTLDeviceVectorRef DPCTLDevice_CreateSubDevicesByAffinity(
     vecTy *Devices = nullptr;
     auto D = unwrap<device>(DRef);
     if (D) {
+        const auto &supported_properties =
+            D->get_info<info::device::partition_properties>();
+        const auto &beg_it = supported_properties.begin();
+        const auto &end_it = supported_properties.end();
+        if (std::find(beg_it, end_it,
+                      info::partition_property::partition_by_affinity_domain) ==
+            end_it)
+        {
+            // device does not support partition by affinity domain
+            return nullptr;
+        }
         try {
             auto domain = DPCTL_DPCTLPartitionAffinityDomainTypeToSycl(
                 PartitionAffinityDomainTy);
+            const auto &supported_affinity_domains =
+                D->get_info<info::device::partition_affinity_domains>();
+            const auto &beg_it = supported_affinity_domains.begin();
+            const auto &end_it = supported_affinity_domains.end();
+            if (std::find(beg_it, end_it, domain) == end_it) {
+                // device does not support partitioning by this particular
+                // affinity domain
+                return nullptr;
+            }
             auto subDevices = D->create_sub_devices<
                 info::partition_property::partition_by_affinity_domain>(domain);
             Devices = new vecTy();
```
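
Note: with these guards, each `DPCTLDevice_CreateSubDevices*` entry point returns `nullptr` up front when the device does not advertise the requested partition property, rather than relying solely on the `try`/catch around `create_sub_devices`. From Python this surfaces through `dpctl.SyclDevice.create_sub_devices`; a hedged sketch (the broad `except` is deliberate, since the exact exception type may vary across dpctl versions):

```python
import dpctl

dev = dpctl.SyclDevice()
try:
    # request sub-devices with two compute units each; devices that do
    # not support equal partitioning (e.g. most GPUs) fail cleanly
    sub_devices = dev.create_sub_devices(partition=2)
except Exception as exc:
    sub_devices = []
    print(f"partitioning not supported: {exc}")
```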
