diff --git a/.github/workflows/reusable_gpu.yml b/.github/workflows/reusable_gpu.yml
index 1a5d54230a..815de5ef97 100644
--- a/.github/workflows/reusable_gpu.yml
+++ b/.github/workflows/reusable_gpu.yml
@@ -19,7 +19,7 @@ jobs:
     name: Level-Zero
     env:
       VCPKG_PATH: "${{github.workspace}}/../../../../vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/../../../../vcpkg/packages/tbb_x64-windows;${{github.workspace}}/../../../../vcpkg/packages/jemalloc_x64-windows"
-      COVERAGE_NAME : "exports-coverage-gpu"
+      COVERAGE_NAME : "exports-coverage-gpu-L0"
     # run only on upstream; forks will not have the HW
     if: github.repository == 'oneapi-src/unified-memory-framework'
     strategy:
@@ -130,18 +130,26 @@ jobs:
     name: CUDA
     env:
       COVERAGE_NAME : "exports-coverage-gpu-CUDA"
+      VCPKG_PATH: "${{github.workspace}}/build/vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/build/vcpkg/packages/tbb_x64-windows;${{github.workspace}}/build/vcpkg/packages/jemalloc_x64-windows;"
+      CUDA_PATH: "c:/cuda"
+
     # run only on upstream; forks will not have the HW
     if: github.repository == 'oneapi-src/unified-memory-framework'
     strategy:
       matrix:
         shared_library: ['ON', 'OFF']
         build_type: ['Debug', 'Release']
-        # TODO add windows
-        os: ['Ubuntu']
+        os: ['Ubuntu', 'Windows']
         include:
+          - os: 'Windows'
+            compiler: {c: cl, cxx: cl}
+            number_of_processors: '$Env:NUMBER_OF_PROCESSORS'
           - os: 'Ubuntu'
             compiler: {c: gcc, cxx: g++}
            number_of_processors: '$(nproc)'
+        exclude:
+          - os: 'Windows'
+            build_type: 'Debug'

     runs-on: ["DSS-CUDA", "DSS-${{matrix.os}}"]
     steps:
@@ -154,10 +162,47 @@ jobs:
         if: matrix.os == 'Ubuntu'
         run: .github/scripts/get_system_info.sh

+      - name: Initialize vcpkg
+        if: matrix.os == 'Windows'
+        uses: lukka/run-vcpkg@5e0cab206a5ea620130caf672fce3e4a6b5666a1 # v11.5
+        with:
+          vcpkgGitCommitId: 3dd44b931481d7a8e9ba412621fa810232b66289
+          vcpkgDirectory: ${{env.BUILD_DIR}}/vcpkg
+          vcpkgJsonGlob: '**/vcpkg.json'
+
+      - name: Install dependencies (windows-latest)
+        if: matrix.os == 'Windows'
+        run: vcpkg install
+        shell: pwsh # Specifies PowerShell as the shell for running the script.
+
+      - name: Configure build for Win
+        if: matrix.os == 'Windows'
+        run: >
+          cmake
+          -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}${{env.CUDA_PATH}}"
+          -B ${{env.BUILD_DIR}}
+          -DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
+          -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
+          -DCMAKE_C_COMPILER=${{matrix.compiler.c}}
+          -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
+          -DUMF_BUILD_SHARED_LIBRARY=${{matrix.shared_library}}
+          -DUMF_BUILD_BENCHMARKS=ON
+          -DUMF_BUILD_TESTS=ON
+          -DUMF_BUILD_GPU_TESTS=ON
+          -DUMF_BUILD_GPU_EXAMPLES=ON
+          -DUMF_FORMAT_CODE_STYLE=OFF
+          -DUMF_DEVELOPER_MODE=ON
+          -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON
+          -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON
+          -DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF
+          -DUMF_BUILD_CUDA_PROVIDER=ON
+          -DUMF_TESTS_FAIL_ON_SKIP=ON
+
       - name: Configure build for Ubuntu
         if: matrix.os == 'Ubuntu'
         run: >
-          cmake -B ${{env.BUILD_DIR}}
+          cmake
+          -B ${{env.BUILD_DIR}}
           -DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
           -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
           -DCMAKE_C_COMPILER=${{matrix.compiler.c}}
diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake
index 92ef5c830a..5e4e2eeada 100644
--- a/cmake/FindCUDA.cmake
+++ b/cmake/FindCUDA.cmake
@@ -11,7 +11,7 @@ get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY)
 set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR})

 if(WINDOWS)
-    find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll")
+    find_file(CUDA_DLL NAMES "nvcuda.dll")
     get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY)
     set(CUDA_DLL_DIRS ${CUDA_DLL_DIR})
 endif()
diff --git a/examples/cmake/FindCUDA.cmake b/examples/cmake/FindCUDA.cmake
index 92ef5c830a..5e4e2eeada 100644
--- a/examples/cmake/FindCUDA.cmake
+++ b/examples/cmake/FindCUDA.cmake
@@ -11,7 +11,7 @@ get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY)
 set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR})

 if(WINDOWS)
-    find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll")
+    find_file(CUDA_DLL NAMES "nvcuda.dll")
     get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY)
     set(CUDA_DLL_DIRS ${CUDA_DLL_DIR})
 endif()
diff --git a/examples/cuda_shared_memory/cuda_shared_memory.c b/examples/cuda_shared_memory/cuda_shared_memory.c
index 4b30935226..55a7dd12f1 100644
--- a/examples/cuda_shared_memory/cuda_shared_memory.c
+++ b/examples/cuda_shared_memory/cuda_shared_memory.c
@@ -14,8 +14,18 @@
 #include
 #include

+// disable warning 4201: nonstandard extension used: nameless struct/union
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4201)
+#endif // _MSC_VER
+
 #include

+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif // _MSC_VER
+
 int main(void) {
     // A result object for storing UMF API result status
     umf_result_t res;
diff --git a/src/provider/provider_cuda.c b/src/provider/provider_cuda.c
index 715e6e790e..a1f9df0344 100644
--- a/src/provider/provider_cuda.c
+++ b/src/provider/provider_cuda.c
@@ -21,8 +21,18 @@ umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void) {

 #else // !defined(UMF_NO_CUDA_PROVIDER)

+// disable warning 4201: nonstandard extension used: nameless struct/union
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4201)
+#endif // _MSC_VER
+
 #include "cuda.h"

+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif // _MSC_VER
+
 #include "base_alloc_global.h"
 #include "utils_assert.h"
 #include "utils_common.h"
@@ -100,7 +110,7 @@ static umf_result_t cu2umf_result(CUresult result) {

 static void init_cu_global_state(void) {
 #ifdef _WIN32
-    const char *lib_name = "cudart.dll";
+    const char *lib_name = "nvcuda.dll";
 #else
     const char *lib_name = "libcuda.so";
 #endif
@@ -159,6 +169,7 @@ static umf_result_t cu_memory_provider_initialize(void *params,

     if (cu_params->memory_type == UMF_MEMORY_TYPE_UNKNOWN ||
         cu_params->memory_type > UMF_MEMORY_TYPE_SHARED) {
+        LOG_ERR("Invalid memory type value");
         return UMF_RESULT_ERROR_INVALID_ARGUMENT;
     }

diff --git a/src/utils/utils_load_library.c b/src/utils/utils_load_library.c
index 2c13acc8d2..cbe7be445d 100644
--- a/src/utils/utils_load_library.c
+++ b/src/utils/utils_load_library.c
@@ -16,15 +16,18 @@
 #include
 // clang-format on
-#else
+#else // _WIN32
 #define _GNU_SOURCE 1
 #include // forces linking with libdl on Linux
-#endif
+#endif // !_WIN32
+
+#include

 #include "utils_load_library.h"
+#include "utils_log.h"

 #ifdef _WIN32
@@ -47,7 +50,13 @@ void *utils_get_symbol_addr(void *handle, const char *symbol,
         }
         handle = GetModuleHandle(libname);
     }
-    return (void *)GetProcAddress((HMODULE)handle, symbol);
+
+    void *addr = (void *)GetProcAddress((HMODULE)handle, symbol);
+    if (addr == NULL) {
+        LOG_ERR("Required symbol not found: %s", symbol);
+    }
+
+    return addr;
 }

 #else /* Linux */
@@ -68,7 +77,13 @@ void *utils_get_symbol_addr(void *handle, const char *symbol,
     if (!handle) {
         handle = RTLD_DEFAULT;
     }
-    return dlsym(handle, symbol);
+
+    void *addr = dlsym(handle, symbol);
+    if (addr == NULL) {
+        LOG_ERR("Required symbol not found: %s", symbol);
+    }
+
+    return addr;
 }

 #endif
diff --git a/test/providers/cuda_helpers.cpp b/test/providers/cuda_helpers.cpp
index 734f287e00..37e71bd6a8 100644
--- a/test/providers/cuda_helpers.cpp
+++ b/test/providers/cuda_helpers.cpp
@@ -18,6 +18,7 @@ struct libcu_ops {
     CUresult (*cuCtxCreate)(CUcontext *pctx, unsigned int flags, CUdevice dev);
     CUresult (*cuCtxDestroy)(CUcontext ctx);
     CUresult (*cuCtxGetCurrent)(CUcontext *pctx);
+    CUresult (*cuCtxSetCurrent)(CUcontext ctx);
     CUresult (*cuDeviceGet)(CUdevice *device, int ordinal);
     CUresult (*cuMemAlloc)(CUdeviceptr *dptr, size_t size);
     CUresult (*cuMemFree)(CUdeviceptr dptr);
@@ -34,6 +35,7 @@ struct libcu_ops {
                                        CUpointer_attribute *attributes,
                                        void **data, CUdeviceptr ptr);
     CUresult (*cuStreamSynchronize)(CUstream hStream);
+    CUresult (*cuCtxSynchronize)(void);
 } libcu_ops;

 #if USE_DLOPEN
@@ -48,7 +50,7 @@ struct DlHandleCloser {
 std::unique_ptr cuDlHandle = nullptr;
 int InitCUDAOps() {
 #ifdef _WIN32
-    const char *lib_name = "cudart.dll";
+    const char *lib_name = "nvcuda.dll";
 #else
     const char *lib_name = "libcuda.so";
 #endif
@@ -84,6 +86,12 @@ int InitCUDAOps() {
         fprintf(stderr, "cuCtxGetCurrent symbol not found in %s\n", lib_name);
         return -1;
     }
+    *(void **)&libcu_ops.cuCtxSetCurrent =
+        utils_get_symbol_addr(cuDlHandle.get(), "cuCtxSetCurrent", lib_name);
+    if (libcu_ops.cuCtxSetCurrent == nullptr) {
+        fprintf(stderr, "cuCtxSetCurrent symbol not found in %s\n", lib_name);
+        return -1;
+    }
     *(void **)&libcu_ops.cuDeviceGet =
         utils_get_symbol_addr(cuDlHandle.get(), "cuDeviceGet", lib_name);
     if (libcu_ops.cuDeviceGet == nullptr) {
@@ -153,6 +161,12 @@ int InitCUDAOps() {
                 lib_name);
         return -1;
     }
+    *(void **)&libcu_ops.cuCtxSynchronize =
+        utils_get_symbol_addr(cuDlHandle.get(), "cuCtxSynchronize", lib_name);
+    if (libcu_ops.cuCtxSynchronize == nullptr) {
+        fprintf(stderr, "cuCtxSynchronize symbol not found in %s\n", lib_name);
+        return -1;
+    }

     return 0;
 }
@@ -165,6 +179,7 @@ int InitCUDAOps() {
     libcu_ops.cuCtxCreate = cuCtxCreate;
     libcu_ops.cuCtxDestroy = cuCtxDestroy;
     libcu_ops.cuCtxGetCurrent = cuCtxGetCurrent;
+    libcu_ops.cuCtxSetCurrent = cuCtxSetCurrent;
     libcu_ops.cuDeviceGet = cuDeviceGet;
     libcu_ops.cuMemAlloc = cuMemAlloc;
     libcu_ops.cuMemAllocHost = cuMemAllocHost;
@@ -176,11 +191,31 @@ int InitCUDAOps() {
     libcu_ops.cuPointerGetAttribute = cuPointerGetAttribute;
     libcu_ops.cuPointerGetAttributes = cuPointerGetAttributes;
     libcu_ops.cuStreamSynchronize = cuStreamSynchronize;
+    libcu_ops.cuCtxSynchronize = cuCtxSynchronize;
     return 0;
 }

 #endif // USE_DLOPEN

+static CUresult set_context(CUcontext required_ctx, CUcontext *restore_ctx) {
+    CUcontext current_ctx = NULL;
+    CUresult cu_result = libcu_ops.cuCtxGetCurrent(&current_ctx);
+    if (cu_result != CUDA_SUCCESS) {
+        fprintf(stderr, "cuCtxGetCurrent() failed.\n");
+        return cu_result;
+    }
+
+    *restore_ctx = current_ctx;
+    if (current_ctx != required_ctx) {
+        cu_result = libcu_ops.cuCtxSetCurrent(required_ctx);
+        if (cu_result != CUDA_SUCCESS) {
+            fprintf(stderr, "cuCtxSetCurrent() failed.\n");
+        }
+    }
+
+    return cu_result;
+}
+
 static int init_cuda_lib(void) {
     CUresult result = libcu_ops.cuInit(0);
     if (result != CUDA_SUCCESS) {
@@ -191,8 +226,6 @@ static int init_cuda_lib(void) {

 int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
               const void *pattern, size_t pattern_size) {
-
-    (void)context;
     (void)device;
     (void)pattern_size;

@@ -202,23 +235,40 @@ int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
         return -1;
     }

+    // set required context
+    CUcontext curr_context = nullptr;
+    set_context(context, &curr_context);
+
     int ret = 0;
     CUresult res =
         libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern,
                               size / sizeof(unsigned int));
     if (res != CUDA_SUCCESS) {
-        fprintf(stderr, "cuMemsetD32() failed!\n");
+        fprintf(stderr, "cuMemsetD32(%llu, %u, %zu) failed!\n",
+                (CUdeviceptr)ptr, *(unsigned int *)pattern,
+                size / pattern_size);
+        return -1;
+    }
+
+    res = libcu_ops.cuCtxSynchronize();
+    if (res != CUDA_SUCCESS) {
+        fprintf(stderr, "cuCtxSynchronize() failed!\n");
         return -1;
     }

+    // restore context
+    set_context(curr_context, &curr_context);
     return ret;
 }

-int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
-              size_t size) {
-    (void)context;
+int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
+              const void *src_ptr, size_t size) {
     (void)device;

+    // set required context
+    CUcontext curr_context = nullptr;
+    set_context(context, &curr_context);
+
     int ret = 0;
     CUresult res = libcu_ops.cuMemcpy((CUdeviceptr)dst_ptr,
                                       (CUdeviceptr)src_ptr, size);
@@ -227,12 +277,14 @@ int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
         return -1;
     }

-    res = libcu_ops.cuStreamSynchronize(0);
+    res = libcu_ops.cuCtxSynchronize();
     if (res != CUDA_SUCCESS) {
-        fprintf(stderr, "cuStreamSynchronize() failed!\n");
+        fprintf(stderr, "cuCtxSynchronize() failed!\n");
         return -1;
     }

+    // restore context
+    set_context(curr_context, &curr_context);
     return ret;
 }

diff --git a/test/providers/cuda_helpers.h b/test/providers/cuda_helpers.h
index 5e42153bb7..fc349fc143 100644
--- a/test/providers/cuda_helpers.h
+++ b/test/providers/cuda_helpers.h
@@ -10,8 +10,18 @@
 #include

+// disable warning 4201: nonstandard extension used: nameless struct/union
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4201)
+#endif // _MSC_VER
+
 #include "cuda.h"

+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif // _MSC_VER
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -21,8 +31,8 @@ int destroy_context(CUcontext context);

 int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
               const void *pattern, size_t pattern_size);

-int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
-              size_t size);
+int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
+              const void *src_ptr, size_t size);

 umf_usm_memory_type_t get_mem_type(CUcontext context, void *ptr);
diff --git a/test/providers/provider_cuda.cpp b/test/providers/provider_cuda.cpp
index c0173bb82d..58e3beb9e4 100644
--- a/test/providers/provider_cuda.cpp
+++ b/test/providers/provider_cuda.cpp
@@ -114,7 +114,7 @@ TEST_P(umfCUDAProviderTest, basic) {
     // check if the pattern was successfully applied
     uint32_t *hostMemory = (uint32_t *)calloc(1, size);
     memAccessor->copy(hostMemory, ptr, size);
-    for (size_t i = 0; i < size / sizeof(int); i++) {
+    for (size_t i = 0; i < size / sizeof(uint32_t); i++) {
         ASSERT_EQ(hostMemory[i], pattern);
     }
     free(hostMemory);
@@ -171,15 +171,18 @@ TEST_P(umfCUDAProviderTest, allocInvalidSize) {
     ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
     ASSERT_NE(provider, nullptr);

-    // try to alloc (int)-1
     void *ptr = nullptr;
-    umf_result = umfMemoryProviderAlloc(provider, -1, 0, &ptr);
-    ASSERT_EQ(umf_result, UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY);

-    // in case of size == 0 we should got INVALID_ARGUMENT error
-    // NOTE: this is invalid only for the DEVICE or SHARED allocations
-    if (params.memory_type != UMF_MEMORY_TYPE_HOST) {
+    // NOTE: some scenarios are invalid only for the DEVICE allocations
+    if (params.memory_type == UMF_MEMORY_TYPE_DEVICE) {
+        // try to alloc SIZE_MAX
+        umf_result = umfMemoryProviderAlloc(provider, SIZE_MAX, 0, &ptr);
+        ASSERT_EQ(ptr, nullptr);
+        ASSERT_EQ(umf_result, UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY);
+
+        // in case of size == 0 we should got INVALID_ARGUMENT error
         umf_result = umfMemoryProviderAlloc(provider, 0, 0, &ptr);
+        ASSERT_EQ(ptr, nullptr);
         ASSERT_EQ(umf_result, UMF_RESULT_ERROR_INVALID_ARGUMENT);
     }