diff --git a/doc/source/vocabulary.rst b/doc/source/vocabulary.rst index 084d09cb..a35db6d6 100644 --- a/doc/source/vocabulary.rst +++ b/doc/source/vocabulary.rst @@ -13,19 +13,24 @@ In general, it is best to avoid using these parameter names for purposes other t .. code-block:: python - kernel_tuner #is inserted by Kernel Tuner to signal the code is compiled using the tuner + kernel_tuner # is inserted by Kernel Tuner to signal the code is compiled using the tuner - block_size_* #reserved for thread block dimensions - grid_size_* #reserved for grid dimensions, if you want to tune these use problem_size + block_size_* # reserved for thread block dimensions + grid_size_* # reserved for grid dimensions, if you want to tune these use problem_size - compiler_opt_* #reserved for future support for tuning compiler options + compiler_opt_* # reserved for future support for tuning compiler options + + loop_unroll_factor_* # reserved for tunable parameters that specify loop unrolling factors + + nvml_* # reserved for tunable parameters and outputs related to NVML + nvml_pwr_limit # use NVML to set power limit + nvml_gr_clock # use NVML to set graphics clock + nvml_mem_clock # use NVML to set memory clock + + cuda_* # reserved for setting parameters related to CUDA kernel execution + cuda_sm_percentage # set the percentage of active SMs (requires cuda-python) - loop_unroll_factor_* #reserved for tunable parameters that specify loop unrolling factors - nvml_* #reserved for tunable parameters and outputs related to NVML - nvml_pwr_limit #use NVML to set power limit - nvml_gr_clock #use NVML to set graphics clock - nvml_mem_clock #use NVML to set memory clock There are also a number of names that Kernel Tuner uses for reporting benchmarking results. @@ -33,7 +38,7 @@ Because these are reported along with the tunable parameters, it is generally a .. 
def __del__(self):
    """Release the CUDA resources owned by this backend instance.

    Destroys every cached green-context stream and green context, then
    frees all device allocations tracked in ``self.allocations``.
    """
    # __init__ may have raised before these attributes were assigned; fall
    # back to empty containers so the finalizer does not raise a secondary
    # AttributeError that hides the original failure.
    cached_contexts = getattr(self, "green_ctx_cache", None) or {}
    tracked_allocations = getattr(self, "allocations", None) or ()

    # Destroy each stream before the green context it was created from.
    for green_ctx, stream, _ in cached_contexts.values():
        _check(driver.cuStreamDestroy(stream))
        _check(driver.cuGreenCtxDestroy(green_ctx))

    # Only driver-allocated device pointers are freed here; any other
    # entries in the list are left alone.
    for device_memory in tracked_allocations:
        if isinstance(device_memory, driver.CUdeviceptr):
            _check(driver.cuMemFree(device_memory))


def set_sm_percentage(self, sm_percentage):
    """Set the active SM percentage.

    Create a CUDA green context owning ~`sm_percentage` of the device's SMs
    and a stream bound to it, and make that stream ``self.stream``. Green
    contexts are cached in ``self.green_ctx_cache`` (keyed by the requested
    percentage) and reused on later requests. The actual number of SMs in
    the partition may not exactly match the requested percentage. An
    observer may be used to query:

    * Currently assigned number of SMs: self.assigned_sm_count
    * Currently requested SM percentage: self.current_sm_percentage

    Requires: CUDA >= 12.4 and a GPU that supports SM partitioning.

    :param sm_percentage: Requested share of SMs, in the range (0, 100].
    :raises ValueError: If ``sm_percentage`` is outside (0, 100].
    """
    if not 0 < sm_percentage <= 100:
        raise ValueError("sm_percentage must be in (0, 100]")

    # Nothing to do when the requested percentage is already applied.
    if sm_percentage == self.current_sm_percentage:
        return

    # Reuse a previously created partition for this percentage.
    cached = self.green_ctx_cache.get(sm_percentage)
    if cached is not None:
        self.green_ctx, self.stream, self.assigned_sm_count = cached
        self.current_sm_percentage = sm_percentage
        return

    # Translate the percentage into an SM count, never going below one SM.
    total_sms = _check(driver.cuDeviceGetAttribute(
        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, self.device))
    want = max(1, round(total_sms * sm_percentage / 100.0))

    # Full SM resource pool of the device.
    sm_resource = _check(driver.cuDeviceGetDevResource(
        self.device, driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM))

    # Split off one group of at least `want` SMs. The driver rounds up to the
    # device's partitioning granularity, so the actual count may be larger.
    groups, _nb, _remaining = _check(driver.cuDevSmResourceSplitByCount(
        1,            # number of groups requested
        sm_resource,  # input resource
        0,            # useFlags (0 = default)
        want,         # minCount of SMs per group
    ))
    group = groups[0]
    assigned = group.sm.smCount

    # Descriptor -> green context.
    desc = _check(driver.cuDevResourceGenerateDesc([group], 1))
    green_ctx = _check(driver.cuGreenCtxCreate(
        desc, self.device, driver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM))

    # A stream from the green context confines launches to its SMs.
    try:
        stream = _check(driver.cuGreenCtxStreamCreate(
            green_ctx,
            driver.CUstream_flags.CU_STREAM_NON_BLOCKING,
            0,  # priority
        ))
    except Exception:
        # The context is not in the cache yet, so __del__ would never see it.
        # Destroy it here (best effort, result ignored) so a failed stream
        # creation does not leak it, then surface the original error.
        driver.cuGreenCtxDestroy(green_ctx)
        raise

    # Publish the new partition only after every driver call has succeeded,
    # so a failure above leaves the previous configuration untouched.
    self.green_ctx_cache[sm_percentage] = (green_ctx, stream, assigned)
    self.green_ctx = green_ctx
    self.stream = stream
    self.assigned_sm_count = assigned
    self.current_sm_percentage = sm_percentage
cuda-python (CUresult, ...) return tuple and raise on error.""" + err, *rest = call_result + cuda_error_check(err) + if not rest: + return None + return rest[0] if len(rest) == 1 else tuple(rest) + + def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str: """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.""" return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default="75") diff --git a/test/test_cuda_functions.py b/test/test_cuda_functions.py index 1fe509c1..f6bd3e8b 100644 --- a/test/test_cuda_functions.py +++ b/test/test_cuda_functions.py @@ -56,6 +56,23 @@ def test_compile(): dev = nvcuda.CudaFunctions(0) dev.compile(kernel_instance) + +@skip_if_no_cuda +def test_set_sm_percentage(): + + dev = nvcuda.CudaFunctions(0) + default_stream = dev.stream + + test_value = 50 + dev.set_sm_percentage(test_value) + + assert dev.current_sm_percentage == test_value + assert test_value in dev.green_ctx_cache + assert dev.green_ctx is not None + assert not dev.stream == default_stream + assert dev.assigned_sm_count + + @skip_if_no_cuda def test_compile_template():