53 changes: 49 additions & 4 deletions .github/workflows/reusable_gpu.yml
@@ -19,7 +19,7 @@ jobs:
name: Level-Zero
env:
VCPKG_PATH: "${{github.workspace}}/../../../../vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/../../../../vcpkg/packages/tbb_x64-windows;${{github.workspace}}/../../../../vcpkg/packages/jemalloc_x64-windows"
COVERAGE_NAME : "exports-coverage-gpu"
COVERAGE_NAME : "exports-coverage-gpu-L0"
# run only on upstream; forks will not have the HW
if: github.repository == 'oneapi-src/unified-memory-framework'
strategy:
@@ -130,18 +130,26 @@ jobs:
name: CUDA
env:
COVERAGE_NAME : "exports-coverage-gpu-CUDA"
VCPKG_PATH: "${{github.workspace}}/build/vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/build/vcpkg/packages/tbb_x64-windows;${{github.workspace}}/build/vcpkg/packages/jemalloc_x64-windows;"
CUDA_PATH: "c:/cuda"

# run only on upstream; forks will not have the HW
if: github.repository == 'oneapi-src/unified-memory-framework'
strategy:
matrix:
shared_library: ['ON', 'OFF']
build_type: ['Debug', 'Release']
# TODO add windows
os: ['Ubuntu']
os: ['Ubuntu', 'Windows']
include:
- os: 'Windows'
compiler: {c: cl, cxx: cl}
number_of_processors: '$Env:NUMBER_OF_PROCESSORS'
- os: 'Ubuntu'
compiler: {c: gcc, cxx: g++}
number_of_processors: '$(nproc)'
exclude:
- os: 'Windows'
build_type: 'Debug'

runs-on: ["DSS-CUDA", "DSS-${{matrix.os}}"]
steps:
@@ -154,10 +162,47 @@ jobs:
if: matrix.os == 'Ubuntu'
run: .github/scripts/get_system_info.sh

- name: Initialize vcpkg
if: matrix.os == 'Windows'
uses: lukka/run-vcpkg@5e0cab206a5ea620130caf672fce3e4a6b5666a1 # v11.5
with:
vcpkgGitCommitId: 3dd44b931481d7a8e9ba412621fa810232b66289
vcpkgDirectory: ${{env.BUILD_DIR}}/vcpkg
vcpkgJsonGlob: '**/vcpkg.json'

- name: Install dependencies (windows-latest)
if: matrix.os == 'Windows'
run: vcpkg install
shell: pwsh # Specifies PowerShell as the shell for running the script.

- name: Configure build for Win
if: matrix.os == 'Windows'
run: >
cmake
-DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}${{env.CUDA_PATH}}"
-B ${{env.BUILD_DIR}}
-DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
-DCMAKE_C_COMPILER=${{matrix.compiler.c}}
-DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
-DUMF_BUILD_SHARED_LIBRARY=${{matrix.shared_library}}
-DUMF_BUILD_BENCHMARKS=ON
-DUMF_BUILD_TESTS=ON
-DUMF_BUILD_GPU_TESTS=ON
-DUMF_BUILD_GPU_EXAMPLES=ON
-DUMF_FORMAT_CODE_STYLE=OFF
-DUMF_DEVELOPER_MODE=ON
-DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON
-DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON
-DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF
-DUMF_BUILD_CUDA_PROVIDER=ON
-DUMF_TESTS_FAIL_ON_SKIP=ON

- name: Configure build for Ubuntu
if: matrix.os == 'Ubuntu'
run: >
cmake -B ${{env.BUILD_DIR}}
cmake
-B ${{env.BUILD_DIR}}
-DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
-DCMAKE_C_COMPILER=${{matrix.compiler.c}}
2 changes: 1 addition & 1 deletion cmake/FindCUDA.cmake
@@ -11,7 +11,7 @@ get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY)
set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR})

if(WINDOWS)
find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll")
find_file(CUDA_DLL NAMES "nvcuda.dll")
get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY)
set(CUDA_DLL_DIRS ${CUDA_DLL_DIR})
endif()
2 changes: 1 addition & 1 deletion examples/cmake/FindCUDA.cmake
@@ -11,7 +11,7 @@ get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY)
set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR})

if(WINDOWS)
find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll")
find_file(CUDA_DLL NAMES "nvcuda.dll")
get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY)
set(CUDA_DLL_DIRS ${CUDA_DLL_DIR})
endif()
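A note on the rename above: on Windows the CUDA driver API (cuInit, cuMemAlloc, and the other cu* entry points used here) is exported by nvcuda.dll, while cudart.dll and cuda.dll belong to the runtime API, so nvcuda.dll is the library the driver-API loading below actually needs. The following is an illustrative sketch of the platform-specific library selection, not part of this change; the helper name open_cuda_driver_lib is made up for the example.

/*
 * Illustrative sketch, not part of this change: pick and load the CUDA
 * driver library (nvcuda.dll on Windows, libcuda.so on Linux).
 */
#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif
#include <stdio.h>

static void *open_cuda_driver_lib(void) {
#ifdef _WIN32
    void *handle = (void *)LoadLibraryA("nvcuda.dll");
#else
    void *handle = dlopen("libcuda.so", RTLD_LAZY);
#endif
    if (handle == NULL) {
        fprintf(stderr, "failed to load the CUDA driver library\n");
    }
    return handle;
}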
10 changes: 10 additions & 0 deletions examples/cuda_shared_memory/cuda_shared_memory.c
@@ -14,8 +14,18 @@
#include <umf/pools/pool_disjoint.h>
#include <umf/providers/provider_cuda.h>

// disable warning 4201: nonstandard extension used: nameless struct/union
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4201)
#endif // _MSC_VER

#include <cuda.h>

#if defined(_MSC_VER)
#pragma warning(pop)
#endif // _MSC_VER

int main(void) {
// A result object for storing UMF API result status
umf_result_t res;
13 changes: 12 additions & 1 deletion src/provider/provider_cuda.c
@@ -21,8 +21,18 @@ umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void) {

#else // !defined(UMF_NO_CUDA_PROVIDER)

// disable warning 4201: nonstandard extension used: nameless struct/union
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4201)
#endif // _MSC_VER

#include "cuda.h"

#if defined(_MSC_VER)
#pragma warning(pop)
#endif // _MSC_VER

#include "base_alloc_global.h"
#include "utils_assert.h"
#include "utils_common.h"
@@ -100,7 +110,7 @@ static umf_result_t cu2umf_result(CUresult result) {

static void init_cu_global_state(void) {
#ifdef _WIN32
const char *lib_name = "cudart.dll";
const char *lib_name = "nvcuda.dll";
#else
const char *lib_name = "libcuda.so";
#endif
@@ -159,6 +169,7 @@ static umf_result_t cu_memory_provider_initialize(void *params,

if (cu_params->memory_type == UMF_MEMORY_TYPE_UNKNOWN ||
cu_params->memory_type > UMF_MEMORY_TYPE_SHARED) {
LOG_ERR("Invalid memory type value");
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
}

23 changes: 19 additions & 4 deletions src/utils/utils_load_library.c
@@ -16,15 +16,18 @@
#include <libloaderapi.h>
// clang-format on

#else
#else // _WIN32

#define _GNU_SOURCE 1

#include <dlfcn.h> // forces linking with libdl on Linux

#endif
#endif // !_WIN32

#include <stddef.h>

#include "utils_load_library.h"
#include "utils_log.h"

#ifdef _WIN32

@@ -47,7 +50,13 @@ void *utils_get_symbol_addr(void *handle, const char *symbol,
}
handle = GetModuleHandle(libname);
}
return (void *)GetProcAddress((HMODULE)handle, symbol);

void *addr = (void *)GetProcAddress((HMODULE)handle, symbol);
if (addr == NULL) {
LOG_ERR("Required symbol not found: %s", symbol);
}

return addr;
}

#else /* Linux */
@@ -68,7 +77,13 @@ void *utils_get_symbol_addr(void *handle, const char *symbol,
if (!handle) {
handle = RTLD_DEFAULT;
}
return dlsym(handle, symbol);

void *addr = dlsym(handle, symbol);
if (addr == NULL) {
LOG_ERR("Required symbol not found: %s", symbol);
}

return addr;
}

#endif
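For context, a minimal sketch (not part of this change) of how a caller uses utils_get_symbol_addr() after this patch: the helper now logs the missing symbol itself, so the caller only needs the NULL check. The lib_handle is assumed to come from an earlier dlopen()/LoadLibrary() of nvcuda.dll or libcuda.so; resolve_and_init and cu_init_fn_t are illustrative names.

// Illustrative sketch, not part of this change: resolve a driver-API entry
// point and call it. A failed lookup is already logged by the helper.
#include <stddef.h>

#include "utils_load_library.h"

typedef int (*cu_init_fn_t)(unsigned int flags); // CUresult is an int-sized enum

static int resolve_and_init(void *lib_handle, const char *lib_name) {
    cu_init_fn_t cu_init = NULL;
    *(void **)&cu_init =
        utils_get_symbol_addr(lib_handle, "cuInit", lib_name);
    if (cu_init == NULL) {
        return -1; // missing symbol was logged by utils_get_symbol_addr()
    }
    return (cu_init(0) == 0 /* CUDA_SUCCESS */) ? 0 : -1;
}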
70 changes: 61 additions & 9 deletions test/providers/cuda_helpers.cpp
@@ -18,6 +18,7 @@ struct libcu_ops {
CUresult (*cuCtxCreate)(CUcontext *pctx, unsigned int flags, CUdevice dev);
CUresult (*cuCtxDestroy)(CUcontext ctx);
CUresult (*cuCtxGetCurrent)(CUcontext *pctx);
CUresult (*cuCtxSetCurrent)(CUcontext ctx);
CUresult (*cuDeviceGet)(CUdevice *device, int ordinal);
CUresult (*cuMemAlloc)(CUdeviceptr *dptr, size_t size);
CUresult (*cuMemFree)(CUdeviceptr dptr);
@@ -34,6 +35,7 @@ struct libcu_ops {
CUpointer_attribute *attributes,
void **data, CUdeviceptr ptr);
CUresult (*cuStreamSynchronize)(CUstream hStream);
CUresult (*cuCtxSynchronize)(void);
} libcu_ops;

#if USE_DLOPEN
@@ -48,7 +50,7 @@ struct DlHandleCloser {
std::unique_ptr<void, DlHandleCloser> cuDlHandle = nullptr;
int InitCUDAOps() {
#ifdef _WIN32
const char *lib_name = "cudart.dll";
const char *lib_name = "nvcuda.dll";
#else
const char *lib_name = "libcuda.so";
#endif
@@ -84,6 +86,12 @@ int InitCUDAOps() {
fprintf(stderr, "cuCtxGetCurrent symbol not found in %s\n", lib_name);
return -1;
}
*(void **)&libcu_ops.cuCtxSetCurrent =
utils_get_symbol_addr(cuDlHandle.get(), "cuCtxSetCurrent", lib_name);
if (libcu_ops.cuCtxSetCurrent == nullptr) {
fprintf(stderr, "cuCtxSetCurrent symbol not found in %s\n", lib_name);
return -1;
}
*(void **)&libcu_ops.cuDeviceGet =
utils_get_symbol_addr(cuDlHandle.get(), "cuDeviceGet", lib_name);
if (libcu_ops.cuDeviceGet == nullptr) {
@@ -153,6 +161,12 @@ int InitCUDAOps() {
lib_name);
return -1;
}
*(void **)&libcu_ops.cuCtxSynchronize =
utils_get_symbol_addr(cuDlHandle.get(), "cuCtxSynchronize", lib_name);
if (libcu_ops.cuCtxSynchronize == nullptr) {
fprintf(stderr, "cuCtxSynchronize symbol not found in %s\n", lib_name);
return -1;
}

return 0;
}
@@ -165,6 +179,7 @@ int InitCUDAOps() {
libcu_ops.cuCtxCreate = cuCtxCreate;
libcu_ops.cuCtxDestroy = cuCtxDestroy;
libcu_ops.cuCtxGetCurrent = cuCtxGetCurrent;
libcu_ops.cuCtxSetCurrent = cuCtxSetCurrent;
libcu_ops.cuDeviceGet = cuDeviceGet;
libcu_ops.cuMemAlloc = cuMemAlloc;
libcu_ops.cuMemAllocHost = cuMemAllocHost;
@@ -176,11 +191,31 @@ int InitCUDAOps() {
libcu_ops.cuPointerGetAttribute = cuPointerGetAttribute;
libcu_ops.cuPointerGetAttributes = cuPointerGetAttributes;
libcu_ops.cuStreamSynchronize = cuStreamSynchronize;
libcu_ops.cuCtxSynchronize = cuCtxSynchronize;

return 0;
}
#endif // USE_DLOPEN

static CUresult set_context(CUcontext required_ctx, CUcontext *restore_ctx) {
CUcontext current_ctx = NULL;
CUresult cu_result = libcu_ops.cuCtxGetCurrent(&current_ctx);
if (cu_result != CUDA_SUCCESS) {
fprintf(stderr, "cuCtxGetCurrent() failed.\n");
return cu_result;
}

*restore_ctx = current_ctx;
if (current_ctx != required_ctx) {
cu_result = libcu_ops.cuCtxSetCurrent(required_ctx);
if (cu_result != CUDA_SUCCESS) {
fprintf(stderr, "cuCtxSetCurrent() failed.\n");
}
}

return cu_result;
}

static int init_cuda_lib(void) {
CUresult result = libcu_ops.cuInit(0);
if (result != CUDA_SUCCESS) {
@@ -191,8 +226,6 @@ static int init_cuda_lib(void) {

int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
const void *pattern, size_t pattern_size) {

(void)context;
(void)device;
(void)pattern_size;

@@ -202,23 +235,40 @@ int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
return -1;
}

// set required context
CUcontext curr_context = nullptr;
set_context(context, &curr_context);

int ret = 0;
CUresult res =
libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern,
size / sizeof(unsigned int));
if (res != CUDA_SUCCESS) {
fprintf(stderr, "cuMemsetD32() failed!\n");
fprintf(stderr, "cuMemsetD32(%llu, %u, %zu) failed!\n",
(CUdeviceptr)ptr, *(unsigned int *)pattern,
size / pattern_size);
return -1;
}

res = libcu_ops.cuCtxSynchronize();
if (res != CUDA_SUCCESS) {
fprintf(stderr, "cuCtxSynchronize() failed!\n");
return -1;
}

// restore context
set_context(curr_context, &curr_context);
return ret;
}

int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
size_t size) {
(void)context;
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
const void *src_ptr, size_t size) {
(void)device;

// set required context
CUcontext curr_context = nullptr;
set_context(context, &curr_context);

int ret = 0;
CUresult res =
libcu_ops.cuMemcpy((CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size);
@@ -227,12 +277,14 @@ int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
return -1;
}

res = libcu_ops.cuStreamSynchronize(0);
res = libcu_ops.cuCtxSynchronize();
if (res != CUDA_SUCCESS) {
fprintf(stderr, "cuStreamSynchronize() failed!\n");
fprintf(stderr, "cuCtxSynchronize() failed!\n");
return -1;
}

// restore context
set_context(curr_context, &curr_context);
return ret;
}

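The new set_context() helper implements a save/set/restore pattern around driver-API work, as used by cuda_fill() and cuda_copy() above. Below is a self-contained sketch of the same pattern written directly against the CUDA driver API; it is illustrative only and not part of this change, run_on_context is a made-up name, and cuInit() plus creation of target_ctx are assumed to have happened already.

// Illustrative sketch, not part of this change: remember the caller's
// context, switch to the required one, do the work, then restore.
#include <stddef.h>

#include <cuda.h>

static CUresult run_on_context(CUcontext target_ctx) {
    CUcontext saved = NULL;
    CUresult res = cuCtxGetCurrent(&saved); // remember the caller's context
    if (res != CUDA_SUCCESS) {
        return res;
    }

    if (saved != target_ctx) {
        res = cuCtxSetCurrent(target_ctx); // switch to the required context
        if (res != CUDA_SUCCESS) {
            return res;
        }
    }

    res = cuCtxSynchronize(); // stand-in for the actual work on target_ctx

    (void)cuCtxSetCurrent(saved); // restore the caller's context
    return res;
}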