
Commit 9218f1c

Indexing routine to work with huge arrays (#2721)
The PR enables compiling the indexing extension with the `-fno-sycl-id-queries-fit-in-int` option to support huge arrays. Without this option, indexing such arrays raises an exception:

```
> h_ev, choose_ev = indexing_ext._choose(inds, chcs, out, mode, q, dep_evs)
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E RuntimeError: Provided range and/or offset does not fit in int. Pass `-fno-sycl-id-queries-fit-in-int' to remove this limit.
```
1 parent: dd9976d
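To make the failure mode concrete, here is a minimal Python sketch of the kind of huge-array indexing call this change is meant to support. It is modeled on the new `TestHugeArrays` tests added below; the exact size and the assumption of a SYCL device with several GiB of free memory are illustrative, not part of the PR.

```python
import dpnp

# Minimal sketch, assuming a SYCL device with several GiB of free memory;
# the size below is illustrative and mirrors the new TestHugeArrays tests.
n = 2**31 + 1024  # more elements than fit in a 32-bit int

arr = dpnp.ones((1, n), dtype=dpnp.int8)    # ~2 GiB of int8 data
res = arr.take(dpnp.array([0, 0]), axis=0)  # indexing routine over the huge axis

# Without -fno-sycl-id-queries-fit-in-int, calls like this raised:
#   RuntimeError: Provided range and/or offset does not fit in int.
assert res.shape == (2, n)
```

With the extension now built with the flag, the `take` call above is the sort of operation that previously hit the `RuntimeError` quoted in the commit message.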

File tree

14 files changed (+304, -164 lines)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Aligned the signature of `dpnp.reshape` function with Python array API by making `shape` a required argument [#2673](https://github.com/IntelPython/dpnp/pull/2673)
 * Unified `dpnp` public API exports by consolidating function exports in `__init__.py` and removing wildcard imports [#2665](https://github.com/IntelPython/dpnp/pull/2665) [#2666](https://github.com/IntelPython/dpnp/pull/2666)
 * Updated tests to reflect the new scalar conversion rules for non-0D `usm_ndarray` [#2694](https://github.com/IntelPython/dpnp/pull/2694)
+* Compile indexing extension with `-fno-sycl-id-queries-fit-in-int` to support huge arrays [#2721](https://github.com/IntelPython/dpnp/pull/2721)
 
 ### Deprecated

dpnp/backend/extensions/indexing/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -84,6 +84,7 @@ else()
   )
 endif()
 
+target_compile_options(${python_module_name} PUBLIC -fno-sycl-id-queries-fit-in-int)
 target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
 
 if(DPNP_GENERATE_COVERAGE)

dpnp/tests/third_party/cupy/core_tests/test_carray.py

Lines changed: 20 additions & 21 deletions

@@ -1,4 +1,4 @@
-import unittest
+from __future__ import annotations
 
 import pytest
 
@@ -8,7 +8,7 @@
 pytest.skip("CArray is not supported", allow_module_level=True)
 
 
-class TestCArray(unittest.TestCase):
+class TestCArray:
 
     def test_size(self):
         x = cupy.arange(3).astype("i")
@@ -63,39 +63,38 @@ def test_getitem_idx(self):
         testing.assert_array_equal(y, x)
 
 
-@testing.parameterize(
-    {"size": 2**31 - 1024},
-    {"size": 2**31},
-    {"size": 2**31 + 1024},
-    {"size": 2**32 - 1024},
-    {"size": 2**32},
-    {"size": 2**32 + 1024},
+@pytest.mark.parametrize(
+    "size",
+    [2**31 - 1024, 2**31, 2**31 + 1024, 2**32 - 1024, 2**32, 2**32 + 1024],
 )
-@testing.slow
-class TestCArray32BitBoundary(unittest.TestCase):
+@pytest.mark.slow
+@pytest.mark.thread_unsafe(reason="too large allocations")
+class TestCArray32BitBoundary:
     # This test case is intended to confirm CArray indexing work correctly
    # with input/output arrays whose size is so large that it crosses the
     # 32-bit boundary (in terms of both number of elements and size in bytes).
     # This test requires approx. 8 GiB GPU memory to run.
     # See https://github.com/cupy/cupy/pull/882 for detailed discussions.
-
-    def tearDown(self):
-        # Free huge memory for slow test
+    def teardown_method(self):
         cupy.get_default_memory_pool().free_all_blocks()
 
     # HIP is known to fail with sizes > 2**32-1024
-    @unittest.skipIf(cupy.cuda.runtime.is_hip, "HIP does not support this")
-    def test(self):
+    @pytest.mark.skipif(
+        cupy.cuda.runtime.is_hip, reason="HIP does not support this"
+    )
+    def test(self, size):
         # Elementwise
-        a = cupy.full((1, self.size), 7, dtype=cupy.int8)
+        a = cupy.full((1, size), 7, dtype=cupy.int8)
         # Reduction
         result = a.sum(axis=0, dtype=cupy.int8)
         # Explicitly specify the dtype to absorb Linux/Windows difference.
-        assert result.sum(dtype=cupy.int64) == self.size * 7
+        assert result.sum(dtype=cupy.int64) == size * 7
 
     # HIP is known to fail with sizes > 2**32-1024
-    @unittest.skipIf(cupy.cuda.runtime.is_hip, "HIP does not support this")
-    def test_assign(self):
-        a = cupy.zeros(self.size, dtype=cupy.int8)
+    @pytest.mark.skipif(
+        cupy.cuda.runtime.is_hip, reason="HIP does not support this"
+    )
+    def test_assign(self, size):
+        a = cupy.zeros(size, dtype=cupy.int8)
         a[-1] = 1.0
         assert a.sum() == 1

dpnp/tests/third_party/cupy/core_tests/test_core.py

Lines changed: 29 additions & 13 deletions

@@ -8,6 +8,10 @@
 
 import dpnp as cupy
 from dpnp.tests.third_party.cupy import testing
+from dpnp.tests.third_party.cupy.testing._protocol_helpers import (
+    DummyObjectWithCudaArrayInterface,
+    DummyObjectWithCuPyGetNDArray,
+)
 
 
 class TestSize(unittest.TestCase):
@@ -37,6 +41,7 @@ def test_size_axis_error(self, dtype):
 
     @testing.numpy_cupy_equal()
     @testing.slow
+    # @pytest.mark.thread_unsafe(reason="Allocation too large.")
    def test_size_huge(self, xp):
         a = xp.ndarray(2**32, "b")  # 4 GiB
         return xp.size(a)
@@ -95,33 +100,44 @@ def test_cupy_ndarray(self, dtype):
         for v in (arr, (arr, arr)):
             assert cupy.min_scalar_type(v) is arr.dtype
 
-
-@testing.parameterize(
-    *testing.product(
-        {
-            "cxx": (None, "--std=c++14"),
-        }
+    @pytest.mark.parametrize(
+        "cupy_like",
+        [
+            DummyObjectWithCuPyGetNDArray,
+            DummyObjectWithCudaArrayInterface,
+        ],
     )
-)
-@pytest.mark.skip("compiling cupy headers are not supported")
-class TestCuPyHeaders(unittest.TestCase):
+    def test_cupy_likes_and_nested(self, cupy_like):
+        arr = cupy.array([[-1, 1]], dtype="int8")
 
-    def setUp(self):
+        obj = cupy_like(arr)
+        assert cupy.min_scalar_type(obj) is arr.dtype
+        if cupy_like is DummyObjectWithCuPyGetNDArray:
+            # __cupy_get_ndarray__ path currently assumes .shape and .dtype
+            obj.shape = arr.shape
+            obj.dtype = arr.dtype
+        assert cupy.min_scalar_type([obj, obj]) is arr.dtype
+
+
+@pytest.mark.skip("compiling cupy headers are not supported")
+class TestCuPyHeaders:
+    def setup_method(self):
         self.temporary_cache_dir_context = test_raw.use_temporary_cache_dir()
         self.cache_dir = self.temporary_cache_dir_context.__enter__()
         self.header = "\n".join(
             ["#include <" + h + ">" for h in core._cupy_header_list]
         )
 
-    def tearDown(self):
+    def teardown_method(self):
         self.temporary_cache_dir_context.__exit__(*sys.exc_info())
 
-    def test_compiling_core_header(self):
+    @pytest.mark.parametrize("cxx", (None, "--std=c++17"))
+    def test_compiling_core_header(self, cxx):
         code = r"""
         extern "C" __global__ void _test_ker_() { }
         """
         code = self.header + code
-        options = () if self.cxx is None else (self.cxx,)
+        options = () if cxx is None else (cxx,)
         ker = cupy.RawKernel(
             code, "_test_ker_", options=options, backend="nvrtc"
         )

dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py

Lines changed: 5 additions & 2 deletions

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import sys
 import unittest
 from itertools import combinations
@@ -19,17 +21,18 @@
 # This test class and its children below only test if CUB backend can be used
 # or not; they don't verify its correctness as it's already extensively covered
 # by existing tests
-@unittest.skipIf(_environment.get_cub_path() is None, "CUB not found")
 class CubReductionTestBase(unittest.TestCase):
     """
     Note: call self.can_use() when arrays are already allocated, otherwise
     call self._test_can_use().
     """
 
     def setUp(self):
+        if _environment.get_cub_path() is None:
+            pytest.skip("CUB not found")
         if cupy.cuda.runtime.is_hip:
             if _environment.get_hipcc_path() is None:
-                self.skipTest("hipcc is not found")
+                pytest.skip("hipcc is not found")
 
         self.can_use = cupy._core._cub_reduction._can_use_cub_block_reduction

dpnp/tests/third_party/cupy/core_tests/test_dlpack.py

Lines changed: 15 additions & 8 deletions

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import dpctl
 import dpctl.tensor._dlpack as dlp
 import numpy
@@ -60,6 +62,8 @@ class TestNewDLPackConversion:
     def pool(self, request):
         self.memory = request.param
         if self.memory == "managed":
+            # if cuda.runtime.is_hip:
+            #     pytest.skip("HIP does not support managed memory")
             old_pool = cupy.get_default_memory_pool()
             new_pool = cuda.MemoryPool(cuda.malloc_managed)
             cuda.set_allocator(new_pool.malloc)
@@ -201,6 +205,8 @@ def test_conversion_device_to_cpu(self):
     @pytest.mark.skip("due to dpctl-2213")
     def test_stream(self):
         allowed_streams = ["null", True]
+        # if not cuda.runtime.is_hip:
+        #     allowed_streams.append("ptds")
 
         # stream order is automatically established via DLPack protocol
         for src_s in [self._get_stream(s) for s in allowed_streams]:
@@ -226,18 +232,18 @@ class TestDLTensorMemory:
 
     @pytest.fixture
     def pool(self):
-        pass
+        # old_pool = cupy.get_default_memory_pool()
+        # pool = cupy.cuda.MemoryPool()
+        # cupy.cuda.set_allocator(pool.malloc)
 
-        # old_pool = cupy.get_default_memory_pool()
-        # pool = cupy.cuda.MemoryPool()
-        # cupy.cuda.set_allocator(pool.malloc)
+        # yield pool
 
-        # yield pool
-
-        # pool.free_all_blocks()
-        # cupy.cuda.set_allocator(old_pool.malloc)
+        # pool.free_all_blocks()
+        # cupy.cuda.set_allocator(old_pool.malloc)
+        pass
 
     @pytest.mark.parametrize("max_version", [None, (1, 0)])
+    # @pytest.mark.thread_unsafe(reason="modifies pool and tracks allocations")
     def test_deleter(self, pool, max_version):
         # memory is freed when tensor is deleted, as it's not consumed
         array = cupy.empty(10)
@@ -252,6 +258,7 @@ def test_deleter(self, pool, max_version):
         # assert pool.n_free_blocks() == 1
 
     @pytest.mark.parametrize("max_version", [None, (1, 0)])
+    # @pytest.mark.thread_unsafe(reason="modifies pool and tracks allocations")
    def test_deleter2(self, pool, max_version):
         # memory is freed when array2 is deleted, as tensor is consumed
         array = cupy.empty(10)

dpnp/tests/third_party/cupy/core_tests/test_ndarray.py

Lines changed: 5 additions & 15 deletions

@@ -246,6 +246,11 @@ def test_copy_multi_device_non_contiguous_K(self):
     # See cupy/cupy#5004
     @pytest.mark.skip("RawKernel() is not supported")
     @testing.multi_gpu(2)
+    # @pytest.mark.xfail(
+    #     runtime.is_hip,
+    #     reason='ROCm may work differently in async D2D copy with streams')
+    # @pytest.mark.thread_unsafe(
+    #     reason="order is unclear multithread. Also, hard crash in threaded!")
     def test_copy_multi_device_with_stream(self):
         # Kernel that takes long enough then finally writes values.
         src = _test_copy_multi_device_with_stream_src
@@ -430,21 +435,6 @@ def test_cuda_array_interface_stream(self):
         assert iface["stream"] == stream.ptr
 
 
-@pytest.mark.skip("CUDA interface is not supported")
-class TestNdarrayCudaInterfaceNoneCUDA(unittest.TestCase):
-
-    def setUp(self):
-        self.arr = cupy.zeros(shape=(2, 3), dtype=cupy.float64)
-
-    def test_cuda_array_interface_hasattr(self):
-        assert not hasattr(self.arr, "__cuda_array_interface__")
-
-    def test_cuda_array_interface_getattr(self):
-        with pytest.raises(AttributeError) as e:
-            getattr(self.arr, "__cuda_array_interface__")
-        assert "HIP" in str(e.value)
-
-
 @testing.parameterize(
     *testing.product(
         {

dpnp/tests/third_party/cupy/core_tests/test_ndarray_adv_indexing.py

Lines changed: 66 additions & 1 deletion

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import itertools
 
 import numpy
@@ -435,6 +437,15 @@ def test_invalid_adv_getitem(self):
             a[self.indexes]
 
 
+class TestArrayBadDTypeIndexAdvGetitem:
+    @pytest.mark.parametrize("dtype", [object, "i,i", "float32", "str"])
+    def test_bad_dtype_adv_getitem(self, dtype):
+        # Test various bad dtypes, supported by CuPy or not.
+        a = cupy.arange(10)
+        with pytest.raises(IndexError, match="arrays used as indices"):
+            a[numpy.array([1, 2], dtype=dtype)]
+
+
 @testing.parameterize(
     {"shape": (0,), "indexes": ([False],)},
     {
@@ -950,6 +961,60 @@ class TestArrayAdvancedIndexingSetitemTranspose:
     def test_adv_setitem_transp(self, xp):
         shape = (2, 3, 4)
         a = xp.zeros(shape).transpose(0, 2, 1)
-        slices = (xp.array([1, 0]), slice(None), xp.array([2, 1]))
+        slices = (numpy.array([1, 0]), slice(None), numpy.array([2, 1]))
         a[slices] = 1
         return a
+
+
+class TestHugeArrays:
+    # These tests require a lot of memory
+    @testing.slow
+    def test_advanced(self):
+        try:
+            arr = cupy.ones((1, 2**30), dtype=cupy.int8)
+            idx = cupy.zeros(3, dtype=cupy.int32)
+            res = arr[idx, :]
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 3 * 2**30
+            del res
+
+            arr[idx, :] = cupy.array([[3], [3], [3]], dtype=cupy.int8)
+            # Check 3 got written (order may not be strictly guaranteed)
+            assert arr.sum() == 2**30 * 3
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_take_array(self):
+        try:
+            arr = cupy.ones((1, 2**32), dtype=cupy.int8)
+            arr[0, 2**30] = 0  # We should see each of these once
+            arr[0, -1] = 0
+            res = arr.take(cupy.array([0, 0]), axis=0)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2 * (2**32 - 2)
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_take_scalar(self):
+        try:
+            arr = cupy.ones((1, 2**32), dtype=cupy.int8)
+            arr[0, 2**30] = 0  # We should see each of these once
+            arr[0, -1] = 0
+            res = arr.take(0, axis=0)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2**32 - 2
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_choose(self):
+        try:
+            choices = cupy.zeros((2, 2**31), dtype=cupy.int8)
+            choices[1, :] = 1
+            res = choices[1, :].choose(choices)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2**31
+        except MemoryError:
+            pytest.skip("out of memory in test.")
