
Commit 37c4fbf

Merge branch 'master' into diff-count-nonzero-array-api

2 parents: eb42463 + 7b64374

10 files changed: +163 −34 lines

.github/workflows/openssf-scorecard.yml

Lines changed: 2 additions & 2 deletions

```diff
@@ -38,7 +38,7 @@ jobs:
           persist-credentials: false

       - name: "Run analysis"
-        uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3
+        uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0
         with:
           results_file: results.sarif
           results_format: sarif
@@ -68,6 +68,6 @@ jobs:

       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@2d790406f505036ef40ecba973cc774a50395aac # v3.25.13
+        uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a # v3.25.15
         with:
           sarif_file: results.sarif
```

.github/workflows/os-llvm-sycl-build.yml

Lines changed: 2 additions & 7 deletions

```diff
@@ -15,7 +15,6 @@ jobs:
       DOWNLOAD_URL_PREFIX: https://github.com/intel/llvm/releases/download
       DRIVER_PATH: 2023-WW27
       OCLCPUEXP_FN: oclcpuexp-2023.16.6.0.28_rel.tar.gz
-      FPGAEMU_FN: fpgaemu-2023.16.6.0.28_rel.tar.gz
       TBB_URL: https://github.com/oneapi-src/oneTBB/releases/download/v2021.9.0/
       TBB_INSTALL_DIR: oneapi-tbb-2021.9.0
       TBB_FN: oneapi-tbb-2021.9.0-lin.tgz
@@ -83,15 +82,12 @@ jobs:
             rm -rf ${ARTIFACT_NAME}.tar.gz
             wget ${DOWNLOAD_URL_PREFIX}/${DEPLOY_NIGHTLY_TAG}/${ARTIFACT_NAME}.tar.gz && echo ${DEPLOY_LLVM_TAG_SHA} > bundle_id.txt || rm -rf bundle_id.txt
             [ -f ${OCLCPUEXP_FN} ] || wget ${DOWNLOAD_URL_PREFIX}/${DRIVER_PATH}/${OCLCPUEXP_FN} || rm -rf bundle_id.txt
-            [ -f ${FPGAEMU_FN} ] || wget ${DOWNLOAD_URL_PREFIX}/${DRIVER_PATH}/${FPGAEMU_FN} || rm -rf bundle_id.txt
             [ -f ${TBB_FN} ] || wget ${TBB_URL}/${TBB_FN} || rm -rf bundle_id.txt
             rm -rf dpcpp_compiler
             mkdir -p dpcpp_compiler
             tar xf ${ARTIFACT_NAME}.tar.gz -C dpcpp_compiler
             mkdir -p oclcpuexp
-            mkdir -p fpgaemu
             [ -d oclcpuexp/x64 ] || tar xf ${OCLCPUEXP_FN} -C oclcpuexp
-            [ -d fpgaemu/x64 ] || tar xf ${FPGAEMU_FN} -C fpgaemu
             [ -d ${TBB_INSTALL_DIR}/lib ] || tar xf ${TBB_FN}
             cp oclcpuexp/x64/libOpenCL.so* dpcpp_compiler/lib/
           fi
@@ -110,7 +106,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy"<1.26.0" cython setuptools pytest scikit-build cmake ninja versioneer[toml]==0.29
+          pip install numpy cython setuptools pytest scikit-build cmake ninja versioneer[toml]==0.29

      - name: Checkout repo
        uses: actions/[email protected]
@@ -126,10 +122,9 @@ jobs:
           export PATH=${SYCL_BUNDLE_FOLDER}/dpcpp_compiler/bin:${PATH}
           export LD_LIBRARY_PATH=${SYCL_BUNDLE_FOLDER}/dpcpp_compiler/lib:${LD_LIBRARY_PATH}
           export LD_LIBRARY_PATH=${SYCL_BUNDLE_FOLDER}/oclcpuexp/x64:${LD_LIBRARY_PATH}
-          export LD_LIBRARY_PATH=${SYCL_BUNDLE_FOLDER}/fpgaemu/x64:${LD_LIBRARY_PATH}
           export LD_LIBRARY_PATH=${SYCL_BUNDLE_FOLDER}/${TBB_INSTALL_DIR}/lib/intel64/gcc4.8:${LD_LIBRARY_PATH}
           export OCL_ICD_VENDORS=
-          export OCL_ICD_FILENAMES=libintelocl.so:libintelocl_emu.so
+          export OCL_ICD_FILENAMES=libintelocl.so
           EOF
           chmod +x set_allvars.sh
           cat set_allvars.sh
```

dpctl/tensor/_clip.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -295,6 +295,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
         )
     if order not in ["K", "C", "F", "A"]:
         order = "K"
+    if x.dtype.kind in "iu":
+        if isinstance(min, int) and min <= dpt.iinfo(x.dtype).min:
+            min = None
+        if isinstance(max, int) and max >= dpt.iinfo(x.dtype).max:
+            max = None
     if min is None and max is None:
         exec_q = x.sycl_queue
         orig_out = out
```
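
Note: the user-visible effect of this change, mirroring the new `test_clip_gh_1744` test added later in this commit (a minimal sketch, assuming a default-selected device is available):

```python
import dpctl.tensor as dpt

x = dpt.asarray([0, 255], dtype=dpt.uint8)
# Python int bounds at or beyond the uint8 limits are now dropped rather
# than cast to uint8 (where -300 would otherwise wrap around), so this
# out-of-range clip is a no-op
y = dpt.clip(x, -300, 300)
print(dpt.to_numpy(y))  # [  0 255]
```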

dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -635,7 +635,7 @@ void copy_and_cast_from_host_impl(

     // perform explicit synchronization. Implicit synchronization would be
     // performed by sycl::buffer destructor.
-    copy_and_cast_from_host_ev.wait_and_throw();
+    copy_and_cast_from_host_ev.wait();

     return;
 }
```

dpctl/tensor/libtensor/source/accumulators.cpp

Lines changed: 21 additions & 10 deletions

```diff
@@ -160,10 +160,14 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
             ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
             : mask_positions_contig_i64_dispatch_vector[mask_typeid];

-    size_t total_set = fn(exec_q, mask_size, mask_data, cumsum_data,
-                          host_task_events, depends);
+    size_t total_set;
+
     {
         py::gil_scoped_release release;
+
+        total_set = fn(exec_q, mask_size, mask_data, cumsum_data,
+                       host_task_events, depends);
+
         sycl::event::wait(host_task_events);
     }
     return total_set;
@@ -198,12 +202,13 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);

     if (2 * static_cast<size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
-        copy_shape_ev.wait();
         {
             py::gil_scoped_release release;
+
+            copy_shape_ev.wait();
             sycl::event::wait(host_task_events);
+            sycl::free(shape_strides, exec_q);
         }
-        sycl::free(shape_strides, exec_q);
         throw std::runtime_error("Unexpected error");
     }

@@ -213,15 +218,17 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     dependent_events.insert(dependent_events.end(), depends.begin(),
                             depends.end());

-    size_t total_set =
-        strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data,
-                   host_task_events, dependent_events);
+    size_t total_set;

     {
         py::gil_scoped_release release;
+
+        total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides,
+                               cumsum_data, host_task_events, dependent_events);
+
         sycl::event::wait(host_task_events);
+        sycl::free(shape_strides, exec_q);
     }
-    sycl::free(shape_strides, exec_q);

     return total_set;
 }
@@ -352,8 +359,12 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
     sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);

     if (2 * static_cast<size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
-        copy_shape_ev.wait();
-        sycl::event::wait(host_task_events);
+        {
+            py::gil_scoped_release release;
+
+            copy_shape_ev.wait();
+            sycl::event::wait(host_task_events);
+        }
         sycl::free(shape_strides, exec_q);
         throw std::runtime_error("Unexpected error");
     }
```
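
Note: moving the blocking waits inside the `py::gil_scoped_release` scope lets other Python threads, including host tasks that need the interpreter, make progress while this thread blocks on SYCL events. A hypothetical sketch of the multi-threaded usage this protects, assuming boolean-mask extraction routes through `py_mask_positions` (thread count and array size are illustrative):

```python
import threading

import dpctl.tensor as dpt

x = dpt.ones(10**6, dtype=dpt.int32)
mask = x > 0

def extract():
    # each call waits on SYCL events; with the GIL released during the
    # wait, these threads no longer serialize on the interpreter lock
    return x[mask]

threads = [threading.Thread(target=extract) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```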

dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp

Lines changed: 51 additions & 14 deletions

```diff
@@ -116,21 +116,29 @@ void copy_numpy_ndarray_into_usm_ndarray(

     // check for applicability of special cases:
     // (same type && (both C-contiguous || both F-contiguous)
-    bool both_c_contig =
+    const bool both_c_contig =
         ((src_flags & py::array::c_style) && dst.is_c_contiguous());
-    bool both_f_contig =
+    const bool both_f_contig =
         ((src_flags & py::array::f_style) && dst.is_f_contiguous());
+
+    const bool same_data_types = (src_type_id == dst_type_id);
+
     if (both_c_contig || both_f_contig) {
-        if (src_type_id == dst_type_id) {
+        if (same_data_types) {
             int src_elem_size = npy_src.itemsize();

             sycl::event copy_ev =
                 exec_q.memcpy(static_cast<void *>(dst_data),
                               static_cast<const void *>(src_data),
                               src_nelems * src_elem_size, depends);

-            // wait for copy_ev to complete
-            copy_ev.wait_and_throw();
+            {
+                // wait for copy_ev to complete
+                // release GIL to allow other threads (host_tasks)
+                // a chance to acquire GIL
+                py::gil_scoped_release lock{};
+                copy_ev.wait();
+            }

             return;
         }
@@ -202,6 +210,30 @@ void copy_numpy_ndarray_into_usm_ndarray(
         simplified_dst_strides.push_back(1);
     }

+    const bool can_use_memcpy =
+        (same_data_types && (nd == 1) && (src_offset == 0) &&
+         (dst_offset == 0) && (simplified_src_strides[0] == 1) &&
+         (simplified_dst_strides[0] == 1));
+
+    if (can_use_memcpy) {
+        int src_elem_size = npy_src.itemsize();
+
+        sycl::event copy_ev = exec_q.memcpy(
+            static_cast<void *>(dst_data), static_cast<const void *>(src_data),
+            src_nelems * src_elem_size, depends);
+
+        {
+            // wait for copy_ev to complete
+            // release GIL to allow other threads (host_tasks)
+            // a chance to acquire GIL
+            py::gil_scoped_release lock{};
+
+            copy_ev.wait();
+        }
+
+        return;
+    }
+
     // Minimum and maximum element offsets for source np.ndarray
     py::ssize_t npy_src_min_nelem_offset(src_offset);
     py::ssize_t npy_src_max_nelem_offset(src_offset);
@@ -230,17 +262,22 @@ void copy_numpy_ndarray_into_usm_ndarray(
     }
     const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);

-    // Get implementation function pointer
-    auto copy_and_cast_from_host_blocking_fn =
-        copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
-                                                       [src_type_id];
+    {
+        // release GIL for the blocking call
+        py::gil_scoped_release lock{};
+
+        // Get implementation function pointer
+        auto copy_and_cast_from_host_blocking_fn =
+            copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
+                                                           [src_type_id];

-    copy_and_cast_from_host_blocking_fn(
-        exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
-        npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
-        dst_offset, depends, {copy_shape_ev});
+        copy_and_cast_from_host_blocking_fn(
+            exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
+            npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
+            dst_offset, depends, {copy_shape_ev});

-    sycl::free(shape_strides, exec_q);
+        sycl::free(shape_strides, exec_q);
+    }

     return;
 }
```
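
Note: with the new `can_use_memcpy` branch, a same-dtype copy from NumPy takes the single `memcpy` path even for strided inputs, as long as their simplified layouts collapse to one unit-stride dimension. A minimal sketch mirroring the new `test_tofrom_numpy_permuted` test added below (dtype is illustrative):

```python
import numpy as np

import dpctl.tensor as dpt

shape, perm = (3, 5, 7), (1, 2, 0)
# identically permuted arrays are not C-contiguous, but their flat
# layouts match, so stride simplification yields nd == 1 with unit
# strides and the copy reduces to a single memcpy
Xusm = dpt.permute_dims(dpt.zeros(shape, dtype="i4"), perm)
Ynp = np.transpose(np.ones(shape, dtype="i4"), perm)
ind = (slice(None, None, None),) * Ynp.ndim
Xusm[ind] = Ynp
assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
```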

dpctl/tests/test_tensor_clip.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -767,3 +767,11 @@ def test_clip_readonly_out():

     with pytest.raises(ValueError):
         dpt.clip(x, out=r)
+
+
+def test_clip_gh_1744():
+    get_queue_or_skip()
+    x = dpt.asarray([0, 255], dtype=dpt.uint8)
+    y = dpt.clip(x, -300, 300)
+
+    assert dpt.all(x == y)
```

dpctl/tests/test_usm_ndarray_ctor.py

Lines changed: 23 additions & 0 deletions

```diff
@@ -1063,11 +1063,34 @@ def test_tofrom_numpy(shape, dtype, usm_type):
     skip_if_dtype_not_supported(dtype, q)
     Xusm = dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q)
     Ynp = np.ones(shape, dtype=dtype)
+    Ynp[(0,) * len(shape)] = 0
     ind = (slice(None, None, None),) * Ynp.ndim
     Xusm[ind] = Ynp
     assert np.array_equal(dpt.to_numpy(Xusm), Ynp)


+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
+def test_tofrom_numpy_permuted(dtype, usm_type):
+    shape = (3, 5, 7)
+    perm = (1, 2, 0)
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    Xusm = dpt.permute_dims(
+        dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q), perm
+    )
+    Ynp = np.transpose(np.ones(shape, dtype=dtype), perm)
+    Ynp[:, ::2, ::2] = 0
+    ind = (slice(None, None, None),) * Ynp.ndim
+    # even though Xusm and Ynp are strided, simple memcpy could be done.
+    # This test validates that it is being done correctly
+    Xusm[ind] = Ynp
+    assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
+
+
 @pytest.mark.parametrize(
     "dtype",
     _all_dtypes,
```

dpctl/utils/_order_manager.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -1,3 +1,4 @@
+import weakref
 from collections import defaultdict
 from contextvars import ContextVar

@@ -88,7 +89,16 @@ def __getitem__(self, q: SyclQueue) -> _SequentialOrderManager:
     def clear(self):
         """Clear content of internal dictionary"""
         _local = self._map.get()
+        for v in _local.values():
+            v.wait()
         _local.clear()


 SequentialOrderManager = SyclQueueToOrderManagerMap()
+
+
+def _callback(som):
+    som.clear()
+
+
+f = weakref.finalize(SequentialOrderManager, _callback, SequentialOrderManager)
```
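
Note: `weakref.finalize` stores strong references to the callback and its arguments, so passing `SequentialOrderManager` as the argument keeps the object alive; the registered `clear` therefore effectively becomes an at-exit hook that waits on outstanding events before interpreter shutdown. A standalone sketch of the idiom (the `Manager` class is illustrative):

```python
import weakref

class Manager:
    def clear(self):
        print("outstanding work drained")

manager = Manager()

# the finalizer's argument tuple keeps `manager` alive, so the callback
# cannot fire early; pending finalizers run at interpreter exit
weakref.finalize(manager, Manager.clear, manager)
```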

libsyclinterface/source/dpctl_sycl_device_interface.cpp

Lines changed: 40 additions & 0 deletions

```diff
@@ -598,6 +598,16 @@ DPCTLDevice_CreateSubDevicesEqually(__dpctl_keep const DPCTLSyclDeviceRef DRef,
         return nullptr;
     }
     auto D = unwrap<device>(DRef);
+    const auto &supported_properties =
+        D->get_info<info::device::partition_properties>();
+    const auto &beg_it = supported_properties.begin();
+    const auto &end_it = supported_properties.end();
+    if (std::find(beg_it, end_it,
+                  info::partition_property::partition_equally) == end_it)
+    {
+        // device does not support partition equally
+        return nullptr;
+    }
     try {
         auto subDevices = D->create_sub_devices<
             info::partition_property::partition_equally>(count);
@@ -631,6 +641,16 @@ DPCTLDevice_CreateSubDevicesByCounts(__dpctl_keep const DPCTLSyclDeviceRef DRef,
     }
     if (DRef) {
         auto D = unwrap<device>(DRef);
+        const auto &supported_properties =
+            D->get_info<info::device::partition_properties>();
+        const auto &beg_it = supported_properties.begin();
+        const auto &end_it = supported_properties.end();
+        if (std::find(beg_it, end_it,
+                      info::partition_property::partition_by_counts) == end_it)
+        {
+            // device does not support partition by counts
+            return nullptr;
+        }
         std::vector<std::remove_pointer<decltype(D)>::type> subDevices;
         try {
             subDevices = D->create_sub_devices<
@@ -661,9 +681,29 @@ __dpctl_give DPCTLDeviceVectorRef DPCTLDevice_CreateSubDevicesByAffinity(
     vecTy *Devices = nullptr;
     auto D = unwrap<device>(DRef);
     if (D) {
+        const auto &supported_properties =
+            D->get_info<info::device::partition_properties>();
+        const auto &beg_it = supported_properties.begin();
+        const auto &end_it = supported_properties.end();
+        if (std::find(beg_it, end_it,
+                      info::partition_property::partition_by_affinity_domain) ==
+            end_it)
+        {
+            // device does not support partition by affinity domain
+            return nullptr;
+        }
         try {
             auto domain = DPCTL_DPCTLPartitionAffinityDomainTypeToSycl(
                 PartitionAffinityDomainTy);
+            const auto &supported_affinity_domains =
+                D->get_info<info::device::partition_affinity_domains>();
+            const auto &beg_it = supported_affinity_domains.begin();
+            const auto &end_it = supported_affinity_domains.end();
+            if (std::find(beg_it, end_it, domain) == end_it) {
+                // device does not support partitioning by this particular
+                // affinity domain
+                return nullptr;
+            }
             auto subDevices = D->create_sub_devices<
                 info::partition_property::partition_by_affinity_domain>(domain);
             Devices = new vecTy();
```
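
Note: with these guards, each `DPCTLDevice_CreateSubDevices*` entry point returns `nullptr` up front when the device does not advertise the requested partition property, rather than relying solely on the `try`/catch around `create_sub_devices`. From Python this surfaces through `dpctl.SyclDevice.create_sub_devices`; a hedged sketch (the broad `except` is deliberate, since the exact exception type may vary across dpctl versions):

```python
import dpctl

dev = dpctl.SyclDevice()
try:
    # request sub-devices with two compute units each; devices that do
    # not support equal partitioning (e.g. most GPUs) fail cleanly
    sub_devices = dev.create_sub_devices(partition=2)
except Exception as exc:
    sub_devices = []
    print(f"partitioning not supported: {exc}")
```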
