enable CUDA provider on Windows

bratpiorka · bratpiorka · commit 59ab7d4938cc · 2024-11-03T17:13:27.000+01:00
diff --git a/.github/workflows/pr_push.yml b/.github/workflows/pr_push.yml
@@ -15,7 +15,7 @@ concurrency:
 permissions:
   contents: read
 
-jobs:
+jobs:  
   CodeChecks:
     uses: ./.github/workflows/reusable_checks.yml
   DocsBuild:
@@ -24,56 +24,5 @@ jobs:
     name: Fast builds
     needs: [CodeChecks, DocsBuild]
     uses: ./.github/workflows/reusable_fast.yml
-  Build:
-    name: Basic builds
-    needs: [FastBuild]
-    uses: ./.github/workflows/reusable_basic.yml
-  DevDax:
-    needs: [FastBuild]
-    uses: ./.github/workflows/reusable_dax.yml
-  Sanitizers:
-    needs: [FastBuild]
-    uses: ./.github/workflows/reusable_sanitizers.yml
-  Qemu:
-    needs: [FastBuild]
-    uses: ./.github/workflows/reusable_qemu.yml
-  Benchmarks:
-    needs: [Build]
-    uses: ./.github/workflows/reusable_benchmarks.yml
-  ProxyLib:
-    needs: [Build]
-    uses: ./.github/workflows/reusable_proxy_lib.yml
   GPU:
-    needs: [Build]
     uses: ./.github/workflows/reusable_gpu.yml
-  Valgrind:
-    needs: [Build]
-    uses: ./.github/workflows/reusable_valgrind.yml
-  MultiNuma:
-    needs: [Build]
-    uses: ./.github/workflows/reusable_multi_numa.yml
-  Coverage:
-    # total coverage (on upstream only)
-    if: github.repository == 'oneapi-src/unified-memory-framework'
-    needs: [Build, DevDax, GPU, MultiNuma, Qemu, ProxyLib]
-    uses: ./.github/workflows/reusable_coverage.yml
-    secrets: inherit
-    with:
-      trigger: "${{github.event_name}}"
-  Coverage_partial:
-    # partial coverage (on forks)
-    if: github.repository != 'oneapi-src/unified-memory-framework'
-    needs: [Build, Qemu, ProxyLib]
-    uses: ./.github/workflows/reusable_coverage.yml
-  CodeQL:
-    needs: [Build]
-    permissions:
-      contents: read
-      security-events: write
-    uses: ./.github/workflows/reusable_codeql.yml
-  Trivy:
-    needs: [Build]
-    permissions:
-      contents: read
-      security-events: write
-    uses: ./.github/workflows/reusable_trivy.yml
diff --git a/.github/workflows/reusable_gpu.yml b/.github/workflows/reusable_gpu.yml
@@ -126,22 +126,28 @@ jobs:
           name: ${{env.COVERAGE_NAME}}-shared-${{matrix.shared_library}}
           path: ${{env.COVERAGE_DIR}}
 
+  # TODO merge with above
   gpu-CUDA:
     name: CUDA
     env:
       COVERAGE_NAME : "exports-coverage-gpu-CUDA"
+      VCPKG_PATH: "${{github.workspace}}/build/vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/build/vcpkg/packages/tbb_x64-windows;${{github.workspace}}/build/vcpkg/packages/jemalloc_x64-windows;"
+      CUDA_PATH: "c:/cuda"
+
     # run only on upstream; forks will not have the HW
     if: github.repository == 'oneapi-src/unified-memory-framework'
     strategy:
       matrix:
         shared_library: ['ON', 'OFF']
         build_type: ['Debug', 'Release']
-        # TODO add windows
-        os: ['Ubuntu']
+        os: ['Windows']
         include:
-        - os: 'Ubuntu'
-          compiler: {c: gcc, cxx: g++}
-          number_of_processors: '$(nproc)'
+        - os: 'Windows'
+          compiler: {c: cl, cxx: cl}
+          number_of_processors: '$Env:NUMBER_OF_PROCESSORS'
+        exclude:
+        - os: 'Windows'
+          build_type: 'Debug'
 
     runs-on: ["DSS-CUDA", "DSS-${{matrix.os}}"]
     steps:
@@ -154,10 +160,53 @@ jobs:
       if: matrix.os == 'Ubuntu'
       run: .github/scripts/get_system_info.sh
 
+    - name: Initialize vcpkg
+      if: matrix.os == 'Windows'
+      uses: lukka/run-vcpkg@5e0cab206a5ea620130caf672fce3e4a6b5666a1 # v11.5
+      with:
+        vcpkgGitCommitId: 3dd44b931481d7a8e9ba412621fa810232b66289
+        vcpkgDirectory: ${{env.BUILD_DIR}}/vcpkg
+        vcpkgJsonGlob: '**/vcpkg.json'
+
+    - name: Install dependencies (windows-latest)
+      if: matrix.os == 'Windows'
+      run: vcpkg install
+      shell: pwsh # Specifies PowerShell as the shell for running the script.
+
+    - name: Install dependencies (ubuntu-latest)
+      if: matrix.os == 'ubuntu-latest'
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y cmake libjemalloc-dev libhwloc-dev libnuma-dev libtbb-dev
+
+    - name: Configure build for Win
+      if: matrix.os == 'Windows'
+      run: >
+        cmake
+        -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}${{env.CUDA_PATH}}"
+        -B ${{env.BUILD_DIR}}
+        -DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
+        -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
+        -DCMAKE_C_COMPILER=${{matrix.compiler.c}}
+        -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
+        -DUMF_BUILD_SHARED_LIBRARY=${{matrix.shared_library}}
+        -DUMF_BUILD_BENCHMARKS=ON
+        -DUMF_BUILD_TESTS=ON
+        -DUMF_BUILD_GPU_TESTS=ON
+        -DUMF_BUILD_GPU_EXAMPLES=ON
+        -DUMF_FORMAT_CODE_STYLE=OFF
+        -DUMF_DEVELOPER_MODE=ON
+        -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON
+        -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON
+        -DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF
+        -DUMF_BUILD_CUDA_PROVIDER=ON
+        -DUMF_TESTS_FAIL_ON_SKIP=ON
+
     - name: Configure build for Ubuntu
       if: matrix.os == 'Ubuntu'
       run: >
-        cmake -B ${{env.BUILD_DIR}}
+        cmake 
+        -B ${{env.BUILD_DIR}}
         -DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
         -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
         -DCMAKE_C_COMPILER=${{matrix.compiler.c}}
@@ -179,6 +228,10 @@ jobs:
     - name: Build UMF
       run: cmake --build ${{env.BUILD_DIR}} --config ${{matrix.build_type}} -j ${{matrix.number_of_processors}}
 
+    - name: aa
+      if: matrix.os == 'Windows'
+      run: ${{env.BUILD_DIR}}/test/${{matrix.build_type}}/umf_test-provider_cuda
+
     - name: Run tests
       working-directory: ${{env.BUILD_DIR}}
       run: ctest -C ${{matrix.build_type}} --output-on-failure --test-dir test
diff --git a/src/provider/provider_cuda.c b/src/provider/provider_cuda.c
@@ -21,8 +21,18 @@ umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void) {
 
 #else // !defined(UMF_NO_CUDA_PROVIDER)
 
+// disable warning 4201: nonstandard extension used: nameless struct/union
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4201)
+#endif // _MSC_VER
+
 #include "cuda.h"
 
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif // _MSC_VER
+
 #include "base_alloc_global.h"
 #include "utils_assert.h"
 #include "utils_common.h"
@@ -100,7 +110,7 @@ static umf_result_t cu2umf_result(CUresult result) {
 
 static void init_cu_global_state(void) {
 #ifdef _WIN32
-    const char *lib_name = "cudart.dll";
+    const char *lib_name = "nvcuda.dll";
 #else
     const char *lib_name = "libcuda.so";
 #endif
@@ -159,6 +169,7 @@ static umf_result_t cu_memory_provider_initialize(void *params,
 
     if (cu_params->memory_type == UMF_MEMORY_TYPE_UNKNOWN ||
         cu_params->memory_type > UMF_MEMORY_TYPE_SHARED) {
+        LOG_ERR("Invalid memory type value");
         return UMF_RESULT_ERROR_INVALID_ARGUMENT;
     }
 
@@ -252,12 +263,12 @@ static umf_result_t cu_memory_provider_alloc(void *provider, size_t size,
     }
 
     // Remember current context and set the one from the provider
-    CUcontext restore_ctx = NULL;
+    /*CUcontext restore_ctx = NULL;
     umf_result_t umf_result = set_context(cu_provider->context, &restore_ctx);
     if (umf_result != UMF_RESULT_SUCCESS) {
         LOG_ERR("Failed to set CUDA context, ret = %d", umf_result);
         return umf_result;
-    }
+    }*/
 
     CUresult cu_result = CUDA_SUCCESS;
     switch (cu_provider->memory_type) {
@@ -282,12 +293,12 @@ static umf_result_t cu_memory_provider_alloc(void *provider, size_t size,
         return UMF_RESULT_ERROR_UNKNOWN;
     }
 
-    umf_result = set_context(restore_ctx, &restore_ctx);
+    /*umf_result = set_context(restore_ctx, &restore_ctx);
     if (umf_result != UMF_RESULT_SUCCESS) {
         LOG_ERR("Failed to restore CUDA context, ret = %d", umf_result);
-    }
+    }*/
 
-    umf_result = cu2umf_result(cu_result);
+    umf_result_t umf_result = cu2umf_result(cu_result);
     if (umf_result != UMF_RESULT_SUCCESS) {
         LOG_ERR("Failed to allocate memory, cu_result = %d, ret = %d",
                 cu_result, umf_result);
diff --git a/test/providers/cuda_helpers.cpp b/test/providers/cuda_helpers.cpp
@@ -48,7 +48,7 @@ struct DlHandleCloser {
 std::unique_ptr<void, DlHandleCloser> cuDlHandle = nullptr;
 int InitCUDAOps() {
 #ifdef _WIN32
-    const char *lib_name = "cudart.dll";
+    const char *lib_name = "nvcuda.dll";
 #else
     const char *lib_name = "libcuda.so";
 #endif
@@ -204,18 +204,25 @@ int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
 
     int ret = 0;
     CUresult res =
-        libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern,
-                              size / sizeof(unsigned int));
+        libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern, 4);
     if (res != CUDA_SUCCESS) {
-        fprintf(stderr, "cuMemsetD32() failed!\n");
+        fprintf(stderr, "cuMemsetD32(%llu, %u, %zu) failed!\n",
+                (CUdeviceptr)ptr, *(unsigned int *)pattern,
+                size / pattern_size);
+        return -1;
+    }
+
+    res = libcu_ops.cuStreamSynchronize(0);
+    if (res != CUDA_SUCCESS) {
+        fprintf(stderr, "cuStreamSynchronize() failed!\n");
         return -1;
     }
 
     return ret;
 }
 
-int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
-              size_t size) {
+int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
+              const void *src_ptr, size_t size) {
     (void)context;
     (void)device;
 
diff --git a/test/providers/cuda_helpers.h b/test/providers/cuda_helpers.h
@@ -10,8 +10,18 @@
 
 #include <umf/providers/provider_cuda.h>
 
+// disable warning 4201: nonstandard extension used: nameless struct/union
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4201)
+#endif // _MSC_VER
+
 #include "cuda.h"
 
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif // _MSC_VER
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -21,8 +31,8 @@ int destroy_context(CUcontext context);
 int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
               const void *pattern, size_t pattern_size);
 
-int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
-              size_t size);
+int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
+              const void *src_ptr, size_t size);
 
 umf_usm_memory_type_t get_mem_type(CUcontext context, void *ptr);
 
diff --git a/test/providers/provider_cuda.cpp b/test/providers/provider_cuda.cpp
@@ -72,7 +72,6 @@ struct umfCUDAProviderTest
 };
 
 TEST_P(umfCUDAProviderTest, basic) {
-    const size_t size = 1024 * 8;
     const uint32_t pattern = 0xAB;
     CUcontext expected_current_context = get_current_context();
 
@@ -83,6 +82,11 @@ TEST_P(umfCUDAProviderTest, basic) {
     ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
     ASSERT_NE(provider, nullptr);
 
+    fprintf(stderr,
+            "cuda_context_handle %p, cuda_device_handle, %d memory_type %d\n",
+            params.cuda_context_handle, params.cuda_device_handle,
+            params.memory_type);
+
     size_t pageSize = 0;
     umf_result = umfMemoryProviderGetMinPageSize(provider, 0, &pageSize);
     ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
@@ -94,23 +98,26 @@ TEST_P(umfCUDAProviderTest, basic) {
     ASSERT_GE(pageSize, 0);
 
     void *ptr = nullptr;
-    umf_result = umfMemoryProviderAlloc(provider, size, 128, &ptr);
+    const size_t size = pageSize * 8;
+    umf_result = umfMemoryProviderAlloc(provider, size, 0, &ptr);
     ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
     ASSERT_NE(ptr, nullptr);
 
-    // use the allocated memory - fill it with a 0xAB pattern
-    memAccessor->fill(ptr, size, &pattern, sizeof(pattern));
-
+    /*
     CUcontext actual_mem_context = get_mem_context(ptr);
     ASSERT_EQ(actual_mem_context, (CUcontext)params.cuda_context_handle);
 
     CUcontext actual_current_context = get_current_context();
     ASSERT_EQ(actual_current_context, expected_current_context);
+*/
 
     umf_usm_memory_type_t memoryTypeActual =
         get_mem_type((CUcontext)params.cuda_context_handle, ptr);
     ASSERT_EQ(memoryTypeActual, params.memory_type);
 
+    // use the allocated memory - fill it with a 0xAB pattern
+    memAccessor->fill(ptr, size, &pattern, sizeof(pattern));
+
     // check if the pattern was successfully applied
     uint32_t *hostMemory = (uint32_t *)calloc(1, size);
     memAccessor->copy(hostMemory, ptr, size);
@@ -171,15 +178,18 @@ TEST_P(umfCUDAProviderTest, allocInvalidSize) {
     ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS);
     ASSERT_NE(provider, nullptr);
 
-    // try to alloc (int)-1
     void *ptr = nullptr;
-    umf_result = umfMemoryProviderAlloc(provider, -1, 0, &ptr);
-    ASSERT_EQ(umf_result, UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY);
 
-    // in case of size == 0 we should got INVALID_ARGUMENT error
-    // NOTE: this is invalid only for the DEVICE or SHARED allocations
-    if (params.memory_type != UMF_MEMORY_TYPE_HOST) {
+    // NOTE: some scenarios are invalid only for the DEVICE allocations
+    if (params.memory_type == UMF_MEMORY_TYPE_DEVICE) {
+        // try to alloc SIZE_MAX
+        umf_result = umfMemoryProviderAlloc(provider, SIZE_MAX, 0, &ptr);
+        ASSERT_EQ(ptr, nullptr);
+        ASSERT_EQ(umf_result, UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY);
+
+        // in case of size == 0 we should get an INVALID_ARGUMENT error
         umf_result = umfMemoryProviderAlloc(provider, 0, 0, &ptr);
+        ASSERT_EQ(ptr, nullptr);
         ASSERT_EQ(umf_result, UMF_RESULT_ERROR_INVALID_ARGUMENT);
     }