Skip to content

Commit 60f5e3b

Browse files
committed
enable CUDA provider on Windows
1 parent fa08100 commit 60f5e3b

File tree

8 files changed

+165
-25
lines changed

8 files changed

+165
-25
lines changed

.github/workflows/reusable_gpu.yml

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
name: Level-Zero
2020
env:
2121
VCPKG_PATH: "${{github.workspace}}/../../../../vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/../../../../vcpkg/packages/tbb_x64-windows;${{github.workspace}}/../../../../vcpkg/packages/jemalloc_x64-windows"
22-
COVERAGE_NAME : "exports-coverage-gpu"
22+
COVERAGE_NAME : "exports-coverage-gpu-L0"
2323
# run only on upstream; forks will not have the HW
2424
if: github.repository == 'oneapi-src/unified-memory-framework'
2525
strategy:
@@ -130,18 +130,26 @@ jobs:
130130
name: CUDA
131131
env:
132132
COVERAGE_NAME : "exports-coverage-gpu-CUDA"
133+
VCPKG_PATH: "${{github.workspace}}/build/vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/build/vcpkg/packages/tbb_x64-windows;${{github.workspace}}/build/vcpkg/packages/jemalloc_x64-windows;"
134+
CUDA_PATH: "c:/cuda"
135+
133136
# run only on upstream; forks will not have the HW
134137
if: github.repository == 'oneapi-src/unified-memory-framework'
135138
strategy:
136139
matrix:
137140
shared_library: ['ON', 'OFF']
138141
build_type: ['Debug', 'Release']
139-
# TODO add windows
140-
os: ['Ubuntu']
142+
os: ['Ubuntu', 'Windows']
141143
include:
144+
- os: 'Windows'
145+
compiler: {c: cl, cxx: cl}
146+
number_of_processors: '$Env:NUMBER_OF_PROCESSORS'
142147
- os: 'Ubuntu'
143148
compiler: {c: gcc, cxx: g++}
144149
number_of_processors: '$(nproc)'
150+
exclude:
151+
- os: 'Windows'
152+
build_type: 'Debug'
145153

146154
runs-on: ["DSS-CUDA", "DSS-${{matrix.os}}"]
147155
steps:
@@ -154,10 +162,53 @@ jobs:
154162
if: matrix.os == 'Ubuntu'
155163
run: .github/scripts/get_system_info.sh
156164

165+
- name: Initialize vcpkg
166+
if: matrix.os == 'Windows'
167+
uses: lukka/run-vcpkg@5e0cab206a5ea620130caf672fce3e4a6b5666a1 # v11.5
168+
with:
169+
vcpkgGitCommitId: 3dd44b931481d7a8e9ba412621fa810232b66289
170+
vcpkgDirectory: ${{env.BUILD_DIR}}/vcpkg
171+
vcpkgJsonGlob: '**/vcpkg.json'
172+
173+
- name: Install dependencies (windows-latest)
174+
if: matrix.os == 'Windows'
175+
run: vcpkg install
176+
shell: pwsh # Specifies PowerShell as the shell for running the script.
177+
178+
# Installs the Linux build dependencies (cmake, jemalloc, hwloc, numa, tbb) via apt.
# NOTE(review): the job matrix in this workflow sets os to 'Ubuntu' or 'Windows',
# so the comparison against 'ubuntu-latest' below looks like it can never be true
# and this step would always be skipped - confirm whether that is intended.
- name: Install dependencies (ubuntu-latest)
179+
if: matrix.os == 'ubuntu-latest'
180+
run: |
181+
sudo apt-get update
182+
sudo apt-get install -y cmake libjemalloc-dev libhwloc-dev libnuma-dev libtbb-dev
183+
184+
# Configures the Windows build with CMake: CUDA provider ON, Level Zero provider OFF,
# tests, benchmarks, GPU tests and GPU examples enabled.
# CMAKE_PREFIX_PATH is formed by concatenating VCPKG_PATH and CUDA_PATH with no
# separator in between, so it relies on the job-level VCPKG_PATH value ending in ';'.
- name: Configure build for Win
185+
if: matrix.os == 'Windows'
186+
run: >
187+
cmake
188+
-DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}${{env.CUDA_PATH}}"
189+
-B ${{env.BUILD_DIR}}
190+
-DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
191+
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
192+
-DCMAKE_C_COMPILER=${{matrix.compiler.c}}
193+
-DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
194+
-DUMF_BUILD_SHARED_LIBRARY=${{matrix.shared_library}}
195+
-DUMF_BUILD_BENCHMARKS=ON
196+
-DUMF_BUILD_TESTS=ON
197+
-DUMF_BUILD_GPU_TESTS=ON
198+
-DUMF_BUILD_GPU_EXAMPLES=ON
199+
-DUMF_FORMAT_CODE_STYLE=OFF
200+
-DUMF_DEVELOPER_MODE=ON
201+
-DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON
202+
-DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON
203+
-DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF
204+
-DUMF_BUILD_CUDA_PROVIDER=ON
205+
-DUMF_TESTS_FAIL_ON_SKIP=ON
206+
157207
- name: Configure build for Ubuntu
158208
if: matrix.os == 'Ubuntu'
159209
run: >
160-
cmake -B ${{env.BUILD_DIR}}
210+
cmake
211+
-B ${{env.BUILD_DIR}}
161212
-DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
162213
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
163214
-DCMAKE_C_COMPILER=${{matrix.compiler.c}}

cmake/FindCUDA.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY)
1111
set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR})
1212

1313
if(WINDOWS)
14-
find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll")
14+
find_file(CUDA_DLL NAMES "nvcuda.dll")
1515
get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY)
1616
set(CUDA_DLL_DIRS ${CUDA_DLL_DIR})
1717
endif()

examples/cmake/FindCUDA.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY)
1111
set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR})
1212

1313
if(WINDOWS)
14-
find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll")
14+
find_file(CUDA_DLL NAMES "nvcuda.dll")
1515
get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY)
1616
set(CUDA_DLL_DIRS ${CUDA_DLL_DIR})
1717
endif()

examples/cuda_shared_memory/cuda_shared_memory.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,18 @@
1414
#include <umf/pools/pool_disjoint.h>
1515
#include <umf/providers/provider_cuda.h>
1616

17+
// disable warning 4201: nonstandard extension used: nameless struct/union
18+
#if defined(_MSC_VER)
19+
#pragma warning(push)
20+
#pragma warning(disable : 4201)
21+
#endif // _MSC_VER
22+
1723
#include <cuda.h>
1824

25+
#if defined(_MSC_VER)
26+
#pragma warning(pop)
27+
#endif // _MSC_VER
28+
1929
int main(void) {
2030
// A result object for storing UMF API result status
2131
umf_result_t res;

src/provider/provider_cuda.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,18 @@ umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void) {
2121

2222
#else // !defined(UMF_NO_CUDA_PROVIDER)
2323

24+
// disable warning 4201: nonstandard extension used: nameless struct/union
25+
#if defined(_MSC_VER)
26+
#pragma warning(push)
27+
#pragma warning(disable : 4201)
28+
#endif // _MSC_VER
29+
2430
#include "cuda.h"
2531

32+
#if defined(_MSC_VER)
33+
#pragma warning(pop)
34+
#endif // _MSC_VER
35+
2636
#include "base_alloc_global.h"
2737
#include "utils_assert.h"
2838
#include "utils_common.h"
@@ -100,7 +110,7 @@ static umf_result_t cu2umf_result(CUresult result) {
100110

101111
static void init_cu_global_state(void) {
102112
#ifdef _WIN32
103-
const char *lib_name = "cudart.dll";
113+
const char *lib_name = "nvcuda.dll";
104114
#else
105115
const char *lib_name = "libcuda.so";
106116
#endif
@@ -159,6 +169,7 @@ static umf_result_t cu_memory_provider_initialize(void *params,
159169

160170
if (cu_params->memory_type == UMF_MEMORY_TYPE_UNKNOWN ||
161171
cu_params->memory_type > UMF_MEMORY_TYPE_SHARED) {
172+
LOG_ERR("Invalid memory type value");
162173
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
163174
}
164175

test/providers/cuda_helpers.cpp

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ struct libcu_ops {
1818
CUresult (*cuCtxCreate)(CUcontext *pctx, unsigned int flags, CUdevice dev);
1919
CUresult (*cuCtxDestroy)(CUcontext ctx);
2020
CUresult (*cuCtxGetCurrent)(CUcontext *pctx);
21+
CUresult (*cuCtxSetCurrent)(CUcontext ctx);
2122
CUresult (*cuDeviceGet)(CUdevice *device, int ordinal);
2223
CUresult (*cuMemAlloc)(CUdeviceptr *dptr, size_t size);
2324
CUresult (*cuMemFree)(CUdeviceptr dptr);
@@ -34,6 +35,7 @@ struct libcu_ops {
3435
CUpointer_attribute *attributes,
3536
void **data, CUdeviceptr ptr);
3637
CUresult (*cuStreamSynchronize)(CUstream hStream);
38+
CUresult (*cuCtxSynchronize)(void);
3739
} libcu_ops;
3840

3941
#if USE_DLOPEN
@@ -48,7 +50,7 @@ struct DlHandleCloser {
4850
std::unique_ptr<void, DlHandleCloser> cuDlHandle = nullptr;
4951
int InitCUDAOps() {
5052
#ifdef _WIN32
51-
const char *lib_name = "cudart.dll";
53+
const char *lib_name = "nvcuda.dll";
5254
#else
5355
const char *lib_name = "libcuda.so";
5456
#endif
@@ -84,6 +86,12 @@ int InitCUDAOps() {
8486
fprintf(stderr, "cuCtxGetCurrent symbol not found in %s\n", lib_name);
8587
return -1;
8688
}
89+
*(void **)&libcu_ops.cuCtxSetCurrent =
90+
utils_get_symbol_addr(cuDlHandle.get(), "cuCtxSetCurrent", lib_name);
91+
if (libcu_ops.cuCtxSetCurrent == nullptr) {
92+
fprintf(stderr, "cuCtxSetCurrent symbol not found in %s\n", lib_name);
93+
return -1;
94+
}
8795
*(void **)&libcu_ops.cuDeviceGet =
8896
utils_get_symbol_addr(cuDlHandle.get(), "cuDeviceGet", lib_name);
8997
if (libcu_ops.cuDeviceGet == nullptr) {
@@ -153,6 +161,13 @@ int InitCUDAOps() {
153161
lib_name);
154162
return -1;
155163
}
164+
*(void **)&libcu_ops.cuCtxSynchronize = utils_get_symbol_addr(
165+
cuDlHandle.get(), "cuCtxSynchronize", lib_name);
166+
if (libcu_ops.cuCtxSynchronize == nullptr) {
167+
fprintf(stderr, "cuCtxSynchronize symbol not found in %s\n",
168+
lib_name);
169+
return -1;
170+
}
156171

157172
return 0;
158173
}
@@ -165,6 +180,7 @@ int InitCUDAOps() {
165180
libcu_ops.cuCtxCreate = cuCtxCreate;
166181
libcu_ops.cuCtxDestroy = cuCtxDestroy;
167182
libcu_ops.cuCtxGetCurrent = cuCtxGetCurrent;
183+
libcu_ops.cuCtxSetCurrent = cuCtxSetCurrent;
168184
libcu_ops.cuDeviceGet = cuDeviceGet;
169185
libcu_ops.cuMemAlloc = cuMemAlloc;
170186
libcu_ops.cuMemAllocHost = cuMemAllocHost;
@@ -176,6 +192,7 @@ int InitCUDAOps() {
176192
libcu_ops.cuPointerGetAttribute = cuPointerGetAttribute;
177193
libcu_ops.cuPointerGetAttributes = cuPointerGetAttributes;
178194
libcu_ops.cuStreamSynchronize = cuStreamSynchronize;
195+
libcu_ops.cuCtxSynchronize = cuCtxSynchronize;
179196

180197
return 0;
181198
}
@@ -191,8 +208,6 @@ static int init_cuda_lib(void) {
191208

192209
int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
193210
const void *pattern, size_t pattern_size) {
194-
195-
(void)context;
196211
(void)device;
197212
(void)pattern_size;
198213

@@ -202,23 +217,40 @@ int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
202217
return -1;
203218
}
204219

220+
// set required context
221+
CUcontext curr_context = nullptr;
222+
set_context(context, &curr_context);
223+
205224
int ret = 0;
206225
CUresult res =
207226
libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern,
208227
size / sizeof(unsigned int));
209228
if (res != CUDA_SUCCESS) {
210-
fprintf(stderr, "cuMemsetD32() failed!\n");
229+
fprintf(stderr, "cuMemsetD32(%llu, %u, %zu) failed!\n",
230+
(CUdeviceptr)ptr, *(unsigned int *)pattern,
231+
size / pattern_size);
211232
return -1;
212233
}
213234

235+
res = libcu_ops.cuCtxSynchronize();
236+
if (res != CUDA_SUCCESS) {
237+
fprintf(stderr, "cuCtxSynchronize() failed!\n");
238+
return -1;
239+
}
240+
241+
// restore context
242+
set_context(curr_context, &curr_context);
214243
return ret;
215244
}
216245

217-
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
218-
size_t size) {
219-
(void)context;
246+
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
247+
const void *src_ptr, size_t size) {
220248
(void)device;
221249

250+
// set required context
251+
CUcontext curr_context = nullptr;
252+
set_context(context, &curr_context);
253+
222254
int ret = 0;
223255
CUresult res =
224256
libcu_ops.cuMemcpy((CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size);
@@ -227,12 +259,14 @@ int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
227259
return -1;
228260
}
229261

230-
res = libcu_ops.cuStreamSynchronize(0);
262+
res = libcu_ops.cuCtxSynchronize();
231263
if (res != CUDA_SUCCESS) {
232-
fprintf(stderr, "cuStreamSynchronize() failed!\n");
264+
fprintf(stderr, "cuCtxSynchronize() failed!\n");
233265
return -1;
234266
}
235267

268+
// restore context
269+
set_context(curr_context, &curr_context);
236270
return ret;
237271
}
238272

@@ -287,6 +321,25 @@ CUcontext get_current_context() {
287321
return context;
288322
}
289323

324+
// Makes required_ctx the current CUDA context unless it already is.
//
// required_ctx - context that should be current when this function returns.
// restore_ctx  - output; receives the context that was current on entry, so the
//                caller can switch back to it later. It is dereferenced without a
//                null check, and it is left unwritten if cuCtxGetCurrent() fails.
//
// Returns the failing CUresult from cuCtxGetCurrent() or cuCtxSetCurrent();
// otherwise CUDA_SUCCESS (including the case where no switch was needed).
// Failures are also reported on stderr.
CUresult set_context(CUcontext required_ctx, CUcontext *restore_ctx) {
325+
CUcontext current_ctx = NULL;
326+
CUresult cu_result = libcu_ops.cuCtxGetCurrent(&current_ctx);
327+
if (cu_result != CUDA_SUCCESS) {
328+
fprintf(stderr, "cuCtxGetCurrent() failed.\n");
329+
return cu_result;
330+
}
331+
332+
// Hand the previously current context back to the caller before any switch.
*restore_ctx = current_ctx;
333+
// Only call into the driver when the current context actually differs.
if (current_ctx != required_ctx) {
334+
cu_result = libcu_ops.cuCtxSetCurrent(required_ctx);
335+
if (cu_result != CUDA_SUCCESS) {
336+
fprintf(stderr, "cuCtxSetCurrent() failed.\n");
337+
}
338+
}
339+
340+
return cu_result;
341+
}
342+
290343
UTIL_ONCE_FLAG cuda_init_flag;
291344
int InitResult;
292345
void init_cuda_once() {

test/providers/cuda_helpers.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,18 @@
1010

1111
#include <umf/providers/provider_cuda.h>
1212

13+
// disable warning 4201: nonstandard extension used: nameless struct/union
14+
#if defined(_MSC_VER)
15+
#pragma warning(push)
16+
#pragma warning(disable : 4201)
17+
#endif // _MSC_VER
18+
1319
#include "cuda.h"
1420

21+
#if defined(_MSC_VER)
22+
#pragma warning(pop)
23+
#endif // _MSC_VER
24+
1525
#ifdef __cplusplus
1626
extern "C" {
1727
#endif
@@ -21,15 +31,17 @@ int destroy_context(CUcontext context);
2131
int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
2232
const void *pattern, size_t pattern_size);
2333

24-
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
25-
size_t size);
34+
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
35+
const void *src_ptr, size_t size);
2636

2737
umf_usm_memory_type_t get_mem_type(CUcontext context, void *ptr);
2838

2939
CUcontext get_mem_context(void *ptr);
3040

3141
CUcontext get_current_context();
3242

43+
CUresult set_context(CUcontext required_ctx, CUcontext *restore_ctx);
44+
3345
cuda_memory_provider_params_t
3446
create_cuda_prov_params(umf_usm_memory_type_t memory_type);
3547

0 commit comments

Comments
 (0)