Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions numba_cuda/numba/cuda/tests/test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import numpy as np

from numba import cuda
from numba.cuda.testing import CUDATestCase, skip_on_cudasim

@cuda.jit
def warp_size_kernel(out):
    """Write the device warp size into every element of ``out``.

    Uses ``cuda.warpsize``, the documented device-side constant exposed by
    Numba inside kernels (``cuda.config.WARP_SIZE`` does not exist).
    """
    i = cuda.grid(1)
    if i < out.size:
        out[i] = cuda.warpsize
# NOTE(review): cuda.config.WARP_SIZE is not implemented anywhere in the
# codebase — there is no device-side constant definition for it, the cuda
# module's CudaModuleTemplate in cudadecl.py has no resolve_config method,
# and no mechanism exposes config values inside kernels. The documented
# device-side constant is cuda.warpsize.
# There is no device-side intrinsic for MAX_THREADS_PER_BLOCK. Query the
# host-side device attribute once at import time; Numba freezes the value of
# this global into the kernel when the kernel is first compiled. The probe is
# guarded so that merely importing this module without a usable CUDA context
# (e.g. under the simulator, where these tests are skipped anyway) does not
# raise — hence the deliberately broad except.
try:
    MAX_THREADS_PER_BLOCK = cuda.get_current_device().MAX_THREADS_PER_BLOCK
except Exception:  # pragma: no cover - only hit when no CUDA device exists
    MAX_THREADS_PER_BLOCK = 0


@cuda.jit
def max_threads_kernel(out):
    """Write the device's MAX_THREADS_PER_BLOCK attribute into ``out``.

    The value is a host-side device attribute captured as a compile-time
    constant (``cuda.config.MAX_THREADS_PER_BLOCK`` does not exist).
    """
    i = cuda.grid(1)
    if i < out.size:
        out[i] = MAX_THREADS_PER_BLOCK
# NOTE(review): cuda.config.MAX_THREADS_PER_BLOCK is not implemented. While
# driver.get_device().MAX_THREADS_PER_BLOCK exists for host-side access (see
# kernels/transpose.py:35), there is no device-side constant accessible
# within kernels — the value must be captured on the host.
@cuda.jit
def config_control_flow_kernel(inp, out):
    """Branch on the device warp size: double ``inp`` when warps are >= 32 wide.

    ``cuda.warpsize`` is the documented device-side constant; the branch is
    resolved against it per-thread (``cuda.config`` does not exist).
    """
    i = cuda.grid(1)
    if i < inp.size:
        if cuda.warpsize >= 32:
            out[i] = inp[i] * 2
        else:
            out[i] = inp[i]

@skip_on_cudasim("CUDA config values are backend-specific")
class TestCudaConfig(CUDATestCase):
    """Checks that device attributes are visible and usable inside kernels.

    Device-side values come from ``cuda.warpsize`` / host-captured device
    attributes; host-side values come from ``cuda.get_current_device()``.
    """

    def _launch_1d(self, kernel, args, size):
        """Launch ``kernel`` with enough 128-thread blocks to cover ``size``."""
        threadsperblock = 128
        blockspergrid = (size + threadsperblock - 1) // threadsperblock
        kernel[blockspergrid, threadsperblock](*args)
        cuda.synchronize()

    def test_warp_size_visible_in_kernel(self):
        out = np.zeros(8, dtype=np.int32)
        d_out = cuda.to_device(out)
        self._launch_1d(warp_size_kernel, (d_out,), out.size)
        result = d_out.copy_to_host()
        # Warp size is expected to be consistent across all threads.
        self.assertTrue(np.all(result == result[0]))
        self.assertGreater(result[0], 0)

    def test_max_threads_visible_in_kernel(self):
        out = np.zeros(4, dtype=np.int32)
        d_out = cuda.to_device(out)
        self._launch_1d(max_threads_kernel, (d_out,), out.size)
        result = d_out.copy_to_host()
        self.assertTrue(np.all(result == result[0]))
        # All known CUDA devices support at least 64 threads per block.
        self.assertGreaterEqual(result[0], 64)

    def test_config_used_in_control_flow(self):
        inp = np.arange(6, dtype=np.int32)
        out = np.zeros_like(inp)
        d_inp = cuda.to_device(inp)
        d_out = cuda.to_device(out)
        self._launch_1d(
            config_control_flow_kernel,
            (d_inp, d_out),
            inp.size,
        )
        # Host-side access goes through the device object returned by
        # cuda.get_current_device(); cuda.config.* does not exist.
        warp_size = cuda.get_current_device().WARP_SIZE
        expected = inp * 2 if warp_size >= 32 else inp
        np.testing.assert_array_equal(
            d_out.copy_to_host(),
            expected,
        )