From a67453aa37ca95012b38b5aea13a7e79fbc17fc0 Mon Sep 17 00:00:00 2001
From: Georgi Mirazchiyski
Date: Wed, 20 Nov 2024 18:08:20 +0000
Subject: [PATCH] [Cuda] Implement device info ext properties for
 HW_THREADS_PER_EU and EU_SIMD_WIDTH on Cuda

Report the warp size for UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH, and the maximum
number of resident warps per multiprocessor (the maximum number of threads
per multiprocessor divided by the warp size) for
UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU, instead of returning
UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION for both queries.

The warp size is queried once when the device handle is constructed and is
exposed through ur_device_handle_t_::getWarpSize().
---
 source/adapters/cuda/device.cpp | 19 +++++++++++++++++--
 source/adapters/cuda/device.hpp |  7 +++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index cb6b757dd3..7709972ca0 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -1082,13 +1082,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_COMPOSITE_DEVICE:
   case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS:
   case UR_DEVICE_INFO_GPU_EU_COUNT:
-  case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
   case UR_DEVICE_INFO_GPU_EU_SLICES:
   case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
   case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
-  case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
     return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
 
+  case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: {
+    // Nvidia's GPU SIMD units are warp-size wide.
+    return ReturnValue(hDevice->getWarpSize());
+  }
+  case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: {
+    int MaxHwThreads{0};
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        reinterpret_cast<int *>(&MaxHwThreads),
+        CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, hDevice->get()));
+    detail::ur::assertion(MaxHwThreads > 0);
+    // calculate the maximum number of resident warps per SM.
+    const uint32_t WarpSize = hDevice->getWarpSize();
+    detail::ur::assertion(WarpSize > 0);
+    uint32_t ResidentWarpCount = static_cast<uint32_t>(MaxHwThreads) / WarpSize;
+    return ReturnValue(ResidentWarpCount);
+  }
+
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP:
   case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP:
     return ReturnValue(true);
diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp
index 3654f2bb36..b4b4c78fe7 100644
--- a/source/adapters/cuda/device.hpp
+++ b/source/adapters/cuda/device.hpp
@@ -33,6 +33,7 @@ struct ur_device_handle_t_ {
   int MaxChosenLocalMem{0};
   bool MaxLocalMemSizeChosen{false};
   uint32_t NumComputeUnits{0};
+  uint32_t WarpSize{0};
 
 public:
   ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
@@ -59,6 +60,10 @@ struct ur_device_handle_t_ {
         reinterpret_cast<int *>(&NumComputeUnits),
         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice));
 
+    UR_CHECK_ERROR(cuDeviceGetAttribute(reinterpret_cast<int *>(&WarpSize),
+                                        CU_DEVICE_ATTRIBUTE_WARP_SIZE,
+                                        cuDevice));
+
     // Set local mem max size if env var is present
     static const char *LocalMemSizePtrUR =
         std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE");
@@ -114,6 +119,8 @@ struct ur_device_handle_t_ {
   bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; };
 
   uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };
+
+  uint32_t getWarpSize() const noexcept { return WarpSize; };
 };
 
 int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute);