Skip to content

Commit 59ab7d4

Browse files
committed
enable CUDA provider on Windows
1 parent fa08100 commit 59ab7d4

File tree

6 files changed

+123
-83
lines changed

6 files changed

+123
-83
lines changed

.github/workflows/pr_push.yml

Lines changed: 1 addition & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ concurrency:
1515
permissions:
1616
contents: read
1717

18-
jobs:
18+
jobs:
1919
CodeChecks:
2020
uses: ./.github/workflows/reusable_checks.yml
2121
DocsBuild:
@@ -24,56 +24,5 @@ jobs:
2424
name: Fast builds
2525
needs: [CodeChecks, DocsBuild]
2626
uses: ./.github/workflows/reusable_fast.yml
27-
Build:
28-
name: Basic builds
29-
needs: [FastBuild]
30-
uses: ./.github/workflows/reusable_basic.yml
31-
DevDax:
32-
needs: [FastBuild]
33-
uses: ./.github/workflows/reusable_dax.yml
34-
Sanitizers:
35-
needs: [FastBuild]
36-
uses: ./.github/workflows/reusable_sanitizers.yml
37-
Qemu:
38-
needs: [FastBuild]
39-
uses: ./.github/workflows/reusable_qemu.yml
40-
Benchmarks:
41-
needs: [Build]
42-
uses: ./.github/workflows/reusable_benchmarks.yml
43-
ProxyLib:
44-
needs: [Build]
45-
uses: ./.github/workflows/reusable_proxy_lib.yml
4627
GPU:
47-
needs: [Build]
4828
uses: ./.github/workflows/reusable_gpu.yml
49-
Valgrind:
50-
needs: [Build]
51-
uses: ./.github/workflows/reusable_valgrind.yml
52-
MultiNuma:
53-
needs: [Build]
54-
uses: ./.github/workflows/reusable_multi_numa.yml
55-
Coverage:
56-
# total coverage (on upstream only)
57-
if: github.repository == 'oneapi-src/unified-memory-framework'
58-
needs: [Build, DevDax, GPU, MultiNuma, Qemu, ProxyLib]
59-
uses: ./.github/workflows/reusable_coverage.yml
60-
secrets: inherit
61-
with:
62-
trigger: "${{github.event_name}}"
63-
Coverage_partial:
64-
# partial coverage (on forks)
65-
if: github.repository != 'oneapi-src/unified-memory-framework'
66-
needs: [Build, Qemu, ProxyLib]
67-
uses: ./.github/workflows/reusable_coverage.yml
68-
CodeQL:
69-
needs: [Build]
70-
permissions:
71-
contents: read
72-
security-events: write
73-
uses: ./.github/workflows/reusable_codeql.yml
74-
Trivy:
75-
needs: [Build]
76-
permissions:
77-
contents: read
78-
security-events: write
79-
uses: ./.github/workflows/reusable_trivy.yml

.github/workflows/reusable_gpu.yml

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -126,22 +126,28 @@ jobs:
126126
name: ${{env.COVERAGE_NAME}}-shared-${{matrix.shared_library}}
127127
path: ${{env.COVERAGE_DIR}}
128128

129+
# TODO merge with above
129130
gpu-CUDA:
130131
name: CUDA
131132
env:
132133
COVERAGE_NAME : "exports-coverage-gpu-CUDA"
134+
VCPKG_PATH: "${{github.workspace}}/build/vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/build/vcpkg/packages/tbb_x64-windows;${{github.workspace}}/build/vcpkg/packages/jemalloc_x64-windows;"
135+
CUDA_PATH: "c:/cuda"
136+
133137
# run only on upstream; forks will not have the HW
134138
if: github.repository == 'oneapi-src/unified-memory-framework'
135139
strategy:
136140
matrix:
137141
shared_library: ['ON', 'OFF']
138142
build_type: ['Debug', 'Release']
139-
# TODO add windows
140-
os: ['Ubuntu']
143+
os: ['Windows']
141144
include:
142-
- os: 'Ubuntu'
143-
compiler: {c: gcc, cxx: g++}
144-
number_of_processors: '$(nproc)'
145+
- os: 'Windows'
146+
compiler: {c: cl, cxx: cl}
147+
number_of_processors: '$Env:NUMBER_OF_PROCESSORS'
148+
exclude:
149+
- os: 'Windows'
150+
build_type: 'Debug'
145151

146152
runs-on: ["DSS-CUDA", "DSS-${{matrix.os}}"]
147153
steps:
@@ -154,10 +160,53 @@ jobs:
154160
if: matrix.os == 'Ubuntu'
155161
run: .github/scripts/get_system_info.sh
156162

163+
- name: Initialize vcpkg
164+
if: matrix.os == 'Windows'
165+
uses: lukka/run-vcpkg@5e0cab206a5ea620130caf672fce3e4a6b5666a1 # v11.5
166+
with:
167+
vcpkgGitCommitId: 3dd44b931481d7a8e9ba412621fa810232b66289
168+
vcpkgDirectory: ${{env.BUILD_DIR}}/vcpkg
169+
vcpkgJsonGlob: '**/vcpkg.json'
170+
171+
- name: Install dependencies (windows-latest)
172+
if: matrix.os == 'Windows'
173+
run: vcpkg install
174+
shell: pwsh # Specifies PowerShell as the shell for running the script.
175+
176+
- name: Install dependencies (ubuntu-latest)
177+
if: matrix.os == 'ubuntu-latest'
178+
run: |
179+
sudo apt-get update
180+
sudo apt-get install -y cmake libjemalloc-dev libhwloc-dev libnuma-dev libtbb-dev
181+
182+
- name: Configure build for Win
183+
if: matrix.os == 'Windows'
184+
run: >
185+
cmake
186+
-DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}${{env.CUDA_PATH}}"
187+
-B ${{env.BUILD_DIR}}
188+
-DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
189+
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
190+
-DCMAKE_C_COMPILER=${{matrix.compiler.c}}
191+
-DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
192+
-DUMF_BUILD_SHARED_LIBRARY=${{matrix.shared_library}}
193+
-DUMF_BUILD_BENCHMARKS=ON
194+
-DUMF_BUILD_TESTS=ON
195+
-DUMF_BUILD_GPU_TESTS=ON
196+
-DUMF_BUILD_GPU_EXAMPLES=ON
197+
-DUMF_FORMAT_CODE_STYLE=OFF
198+
-DUMF_DEVELOPER_MODE=ON
199+
-DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON
200+
-DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON
201+
-DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF
202+
-DUMF_BUILD_CUDA_PROVIDER=ON
203+
-DUMF_TESTS_FAIL_ON_SKIP=ON
204+
157205
- name: Configure build for Ubuntu
158206
if: matrix.os == 'Ubuntu'
159207
run: >
160-
cmake -B ${{env.BUILD_DIR}}
208+
cmake
209+
-B ${{env.BUILD_DIR}}
161210
-DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
162211
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
163212
-DCMAKE_C_COMPILER=${{matrix.compiler.c}}
@@ -179,6 +228,10 @@ jobs:
179228
- name: Build UMF
180229
run: cmake --build ${{env.BUILD_DIR}} --config ${{matrix.build_type}} -j ${{matrix.number_of_processors}}
181230

231+
- name: aa
232+
if: matrix.os == 'Windows'
233+
run: ${{env.BUILD_DIR}}/test/${{matrix.build_type}}/umf_test-provider_cuda
234+
182235
- name: Run tests
183236
working-directory: ${{env.BUILD_DIR}}
184237
run: ctest -C ${{matrix.build_type}} --output-on-failure --test-dir test

src/provider/provider_cuda.c

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,18 @@ umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void) {
2121

2222
#else // !defined(UMF_NO_CUDA_PROVIDER)
2323

24+
// disable warning 4201: nonstandard extension used: nameless struct/union
25+
#if defined(_MSC_VER)
26+
#pragma warning(push)
27+
#pragma warning(disable : 4201)
28+
#endif // _MSC_VER
29+
2430
#include "cuda.h"
2531

32+
#if defined(_MSC_VER)
33+
#pragma warning(pop)
34+
#endif // _MSC_VER
35+
2636
#include "base_alloc_global.h"
2737
#include "utils_assert.h"
2838
#include "utils_common.h"
@@ -100,7 +110,7 @@ static umf_result_t cu2umf_result(CUresult result) {
100110

101111
static void init_cu_global_state(void) {
102112
#ifdef _WIN32
103-
const char *lib_name = "cudart.dll";
113+
const char *lib_name = "nvcuda.dll";
104114
#else
105115
const char *lib_name = "libcuda.so";
106116
#endif
@@ -159,6 +169,7 @@ static umf_result_t cu_memory_provider_initialize(void *params,
159169

160170
if (cu_params->memory_type == UMF_MEMORY_TYPE_UNKNOWN ||
161171
cu_params->memory_type > UMF_MEMORY_TYPE_SHARED) {
172+
LOG_ERR("Invalid memory type value");
162173
return UMF_RESULT_ERROR_INVALID_ARGUMENT;
163174
}
164175

@@ -252,12 +263,12 @@ static umf_result_t cu_memory_provider_alloc(void *provider, size_t size,
252263
}
253264

254265
// Remember current context and set the one from the provider
255-
CUcontext restore_ctx = NULL;
266+
/*CUcontext restore_ctx = NULL;
256267
umf_result_t umf_result = set_context(cu_provider->context, &restore_ctx);
257268
if (umf_result != UMF_RESULT_SUCCESS) {
258269
LOG_ERR("Failed to set CUDA context, ret = %d", umf_result);
259270
return umf_result;
260-
}
271+
}*/
261272

262273
CUresult cu_result = CUDA_SUCCESS;
263274
switch (cu_provider->memory_type) {
@@ -282,12 +293,12 @@ static umf_result_t cu_memory_provider_alloc(void *provider, size_t size,
282293
return UMF_RESULT_ERROR_UNKNOWN;
283294
}
284295

285-
umf_result = set_context(restore_ctx, &restore_ctx);
296+
/*umf_result = set_context(restore_ctx, &restore_ctx);
286297
if (umf_result != UMF_RESULT_SUCCESS) {
287298
LOG_ERR("Failed to restore CUDA context, ret = %d", umf_result);
288-
}
299+
}*/
289300

290-
umf_result = cu2umf_result(cu_result);
301+
umf_result_t umf_result = cu2umf_result(cu_result);
291302
if (umf_result != UMF_RESULT_SUCCESS) {
292303
LOG_ERR("Failed to allocate memory, cu_result = %d, ret = %d",
293304
cu_result, umf_result);

test/providers/cuda_helpers.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ struct DlHandleCloser {
4848
std::unique_ptr<void, DlHandleCloser> cuDlHandle = nullptr;
4949
int InitCUDAOps() {
5050
#ifdef _WIN32
51-
const char *lib_name = "cudart.dll";
51+
const char *lib_name = "nvcuda.dll";
5252
#else
5353
const char *lib_name = "libcuda.so";
5454
#endif
@@ -204,18 +204,25 @@ int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
204204

205205
int ret = 0;
206206
CUresult res =
207-
libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern,
208-
size / sizeof(unsigned int));
207+
libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern, 4);
209208
if (res != CUDA_SUCCESS) {
210-
fprintf(stderr, "cuMemsetD32() failed!\n");
209+
fprintf(stderr, "cuMemsetD32(%llu, %u, %zu) failed!\n",
210+
(CUdeviceptr)ptr, *(unsigned int *)pattern,
211+
size / pattern_size);
212+
return -1;
213+
}
214+
215+
res = libcu_ops.cuStreamSynchronize(0);
216+
if (res != CUDA_SUCCESS) {
217+
fprintf(stderr, "cuStreamSynchronize() failed!\n");
211218
return -1;
212219
}
213220

214221
return ret;
215222
}
216223

217-
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
218-
size_t size) {
224+
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
225+
const void *src_ptr, size_t size) {
219226
(void)context;
220227
(void)device;
221228

test/providers/cuda_helpers.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,18 @@
1010

1111
#include <umf/providers/provider_cuda.h>
1212

13+
// disable warning 4201: nonstandard extension used: nameless struct/union
14+
#if defined(_MSC_VER)
15+
#pragma warning(push)
16+
#pragma warning(disable : 4201)
17+
#endif // _MSC_VER
18+
1319
#include "cuda.h"
1420

21+
#if defined(_MSC_VER)
22+
#pragma warning(pop)
23+
#endif // _MSC_VER
24+
1525
#ifdef __cplusplus
1626
extern "C" {
1727
#endif
@@ -21,8 +31,8 @@ int destroy_context(CUcontext context);
2131
int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
2232
const void *pattern, size_t pattern_size);
2333

24-
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
25-
size_t size);
34+
int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
35+
const void *src_ptr, size_t size);
2636

2737
umf_usm_memory_type_t get_mem_type(CUcontext context, void *ptr);
2838

test/providers/provider_cuda.cpp

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ struct umfCUDAProviderTest
7272
};
7373

7474
TEST_P(umfCUDAProviderTest, basic) {
75-
const size_t size = 1024 * 8;
7675
const uint32_t pattern = 0xAB;
7776
CUcontext expected_current_context = get_current_context();
7877

@@ -83,6 +82,11 @@ TEST_P(umfCUDAProviderTest, basic) {
8382
ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
8483
ASSERT_NE(provider, nullptr);
8584

85+
fprintf(stderr,
86+
"cuda_context_handle %p, cuda_device_handle, %d memory_type %d\n",
87+
params.cuda_context_handle, params.cuda_device_handle,
88+
params.memory_type);
89+
8690
size_t pageSize = 0;
8791
umf_result = umfMemoryProviderGetMinPageSize(provider, 0, &pageSize);
8892
ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
@@ -94,23 +98,26 @@ TEST_P(umfCUDAProviderTest, basic) {
9498
ASSERT_GE(pageSize, 0);
9599

96100
void *ptr = nullptr;
97-
umf_result = umfMemoryProviderAlloc(provider, size, 128, &ptr);
101+
const size_t size = pageSize * 8;
102+
umf_result = umfMemoryProviderAlloc(provider, size, 0, &ptr);
98103
ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
99104
ASSERT_NE(ptr, nullptr);
100105

101-
// use the allocated memory - fill it with a 0xAB pattern
102-
memAccessor->fill(ptr, size, &pattern, sizeof(pattern));
103-
106+
/*
104107
CUcontext actual_mem_context = get_mem_context(ptr);
105108
ASSERT_EQ(actual_mem_context, (CUcontext)params.cuda_context_handle);
106109
107110
CUcontext actual_current_context = get_current_context();
108111
ASSERT_EQ(actual_current_context, expected_current_context);
112+
*/
109113

110114
umf_usm_memory_type_t memoryTypeActual =
111115
get_mem_type((CUcontext)params.cuda_context_handle, ptr);
112116
ASSERT_EQ(memoryTypeActual, params.memory_type);
113117

118+
// use the allocated memory - fill it with a 0xAB pattern
119+
memAccessor->fill(ptr, size, &pattern, sizeof(pattern));
120+
114121
// check if the pattern was successfully applied
115122
uint32_t *hostMemory = (uint32_t *)calloc(1, size);
116123
memAccessor->copy(hostMemory, ptr, size);
@@ -171,15 +178,18 @@ TEST_P(umfCUDAProviderTest, allocInvalidSize) {
171178
ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
172179
ASSERT_NE(provider, nullptr);
173180

174-
// try to alloc (int)-1
175181
void *ptr = nullptr;
176-
umf_result = umfMemoryProviderAlloc(provider, -1, 0, &ptr);
177-
ASSERT_EQ(umf_result, UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY);
178182

179-
// in case of size == 0 we should got INVALID_ARGUMENT error
180-
// NOTE: this is invalid only for the DEVICE or SHARED allocations
181-
if (params.memory_type != UMF_MEMORY_TYPE_HOST) {
183+
// NOTE: the sizes below are treated as invalid only for DEVICE allocations
184+
if (params.memory_type == UMF_MEMORY_TYPE_DEVICE) {
185+
// try to alloc SIZE_MAX
186+
umf_result = umfMemoryProviderAlloc(provider, SIZE_MAX, 0, &ptr);
187+
ASSERT_EQ(ptr, nullptr);
188+
ASSERT_EQ(umf_result, UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY);
189+
190+
// in case of size == 0 we should get an INVALID_ARGUMENT error
182191
umf_result = umfMemoryProviderAlloc(provider, 0, 0, &ptr);
192+
ASSERT_EQ(ptr, nullptr);
183193
ASSERT_EQ(umf_result, UMF_RESULT_ERROR_INVALID_ARGUMENT);
184194
}
185195

0 commit comments

Comments
 (0)