diff --git a/include/umf/providers/provider_cuda.h b/include/umf/providers/provider_cuda.h index 5f1d5a6e2..e3b81858b 100644 --- a/include/umf/providers/provider_cuda.h +++ b/include/umf/providers/provider_cuda.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2024-2025 Intel Corporation * * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -53,6 +53,13 @@ umf_result_t umfCUDAMemoryProviderParamsSetMemoryType( umf_cuda_memory_provider_params_handle_t hParams, umf_usm_memory_type_t memoryType); +/// @brief Set the allocation flags in the parameters struct. +/// @param hParams handle to the parameters of the CUDA Memory Provider. +/// @param flags valid combination of CUDA allocation flags. +/// @return UMF_RESULT_SUCCESS on success or appropriate error code on failure. +umf_result_t umfCUDAMemoryProviderParamsSetAllocFlags( + umf_cuda_memory_provider_params_handle_t hParams, unsigned int flags); + umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void); #ifdef __cplusplus diff --git a/src/libumf.def b/src/libumf.def index f93553e90..98226dace 100644 --- a/src/libumf.def +++ b/src/libumf.def @@ -118,6 +118,7 @@ EXPORTS umfScalablePoolParamsSetGranularity umfScalablePoolParamsSetKeepAllMemory ; Added in UMF_0.11 + umfCUDAMemoryProviderParamsSetAllocFlags umfFixedMemoryProviderOps umfFixedMemoryProviderParamsCreate umfFixedMemoryProviderParamsDestroy diff --git a/src/libumf.map b/src/libumf.map index 7a7ac5ad3..bbf664dcf 100644 --- a/src/libumf.map +++ b/src/libumf.map @@ -116,6 +116,7 @@ UMF_0.10 { }; UMF_0.11 { + umfCUDAMemoryProviderParamsSetAllocFlags; umfFixedMemoryProviderOps; umfFixedMemoryProviderParamsCreate; umfFixedMemoryProviderParamsDestroy; diff --git a/src/provider/provider_cuda.c b/src/provider/provider_cuda.c index a9b6e88e9..a0f963fdd 100644 --- a/src/provider/provider_cuda.c +++ b/src/provider/provider_cuda.c @@ -55,6 +55,14 @@ umf_result_t 
umfCUDAMemoryProviderParamsSetMemoryType( return UMF_RESULT_ERROR_NOT_SUPPORTED; } +umf_result_t umfCUDAMemoryProviderParamsSetAllocFlags( + umf_cuda_memory_provider_params_handle_t hParams, unsigned int flags) { + (void)hParams; + (void)flags; + LOG_ERR("CUDA provider is disabled (UMF_BUILD_CUDA_PROVIDER is OFF)!"); + return UMF_RESULT_ERROR_NOT_SUPPORTED; +} + umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void) { // not supported LOG_ERR("CUDA provider is disabled (UMF_BUILD_CUDA_PROVIDER is OFF)!"); @@ -89,13 +97,22 @@ typedef struct cu_memory_provider_t { CUdevice device; umf_usm_memory_type_t memory_type; size_t min_alignment; + unsigned int alloc_flags; } cu_memory_provider_t; // CUDA Memory Provider settings struct typedef struct umf_cuda_memory_provider_params_t { - void *cuda_context_handle; ///< Handle to the CUDA context - int cuda_device_handle; ///< Handle to the CUDA device - umf_usm_memory_type_t memory_type; ///< Allocation memory type + // Handle to the CUDA context + void *cuda_context_handle; + + // Handle to the CUDA device + int cuda_device_handle; + + // Allocation memory type + umf_usm_memory_type_t memory_type; + + // Allocation flags for cuMemHostAlloc/cuMemAllocManaged + unsigned int alloc_flags; } umf_cuda_memory_provider_params_t; typedef struct cu_ops_t { @@ -103,7 +120,7 @@ typedef struct cu_ops_t { size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); CUresult (*cuMemAlloc)(CUdeviceptr *dptr, size_t bytesize); - CUresult (*cuMemAllocHost)(void **pp, size_t bytesize); + CUresult (*cuMemHostAlloc)(void **pp, size_t bytesize, unsigned int flags); CUresult (*cuMemAllocManaged)(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); CUresult (*cuMemFree)(CUdeviceptr dptr); @@ -172,8 +189,8 @@ static void init_cu_global_state(void) { utils_get_symbol_addr(0, "cuMemGetAllocationGranularity", lib_name); *(void **)&g_cu_ops.cuMemAlloc = utils_get_symbol_addr(0, "cuMemAlloc_v2", lib_name); - 
*(void **)&g_cu_ops.cuMemAllocHost = - utils_get_symbol_addr(0, "cuMemAllocHost_v2", lib_name); + *(void **)&g_cu_ops.cuMemHostAlloc = + utils_get_symbol_addr(0, "cuMemHostAlloc", lib_name); *(void **)&g_cu_ops.cuMemAllocManaged = utils_get_symbol_addr(0, "cuMemAllocManaged", lib_name); *(void **)&g_cu_ops.cuMemFree = @@ -196,7 +213,7 @@ static void init_cu_global_state(void) { utils_get_symbol_addr(0, "cuIpcCloseMemHandle", lib_name); if (!g_cu_ops.cuMemGetAllocationGranularity || !g_cu_ops.cuMemAlloc || - !g_cu_ops.cuMemAllocHost || !g_cu_ops.cuMemAllocManaged || + !g_cu_ops.cuMemHostAlloc || !g_cu_ops.cuMemAllocManaged || !g_cu_ops.cuMemFree || !g_cu_ops.cuMemFreeHost || !g_cu_ops.cuGetErrorName || !g_cu_ops.cuGetErrorString || !g_cu_ops.cuCtxGetCurrent || !g_cu_ops.cuCtxSetCurrent || @@ -225,6 +242,7 @@ umf_result_t umfCUDAMemoryProviderParamsCreate( params_data->cuda_context_handle = NULL; params_data->cuda_device_handle = -1; params_data->memory_type = UMF_MEMORY_TYPE_UNKNOWN; + params_data->alloc_flags = 0; *hParams = params_data; @@ -275,6 +293,18 @@ umf_result_t umfCUDAMemoryProviderParamsSetMemoryType( return UMF_RESULT_SUCCESS; } +umf_result_t umfCUDAMemoryProviderParamsSetAllocFlags( + umf_cuda_memory_provider_params_handle_t hParams, unsigned int flags) { + if (!hParams) { + LOG_ERR("CUDA Memory Provider params handle is NULL"); + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + hParams->alloc_flags = flags; + + return UMF_RESULT_SUCCESS; +} + static umf_result_t cu_memory_provider_initialize(void *params, void **provider) { if (params == NULL) { @@ -325,6 +355,17 @@ static umf_result_t cu_memory_provider_initialize(void *params, cu_provider->memory_type = cu_params->memory_type; cu_provider->min_alignment = min_alignment; + // If the memory type is shared (CUDA managed), the allocation flags must + // be set. NOTE: we do not check here if the flags are valid - + // this will be done by CUDA runtime. 
+ if (cu_params->memory_type == UMF_MEMORY_TYPE_SHARED && + cu_params->alloc_flags == 0) { + // the default setting is CU_MEM_ATTACH_GLOBAL + cu_provider->alloc_flags = CU_MEM_ATTACH_GLOBAL; + } else { + cu_provider->alloc_flags = cu_params->alloc_flags; + } + *provider = cu_provider; return UMF_RESULT_SUCCESS; @@ -381,7 +422,8 @@ static umf_result_t cu_memory_provider_alloc(void *provider, size_t size, CUresult cu_result = CUDA_SUCCESS; switch (cu_provider->memory_type) { case UMF_MEMORY_TYPE_HOST: { - cu_result = g_cu_ops.cuMemAllocHost(resultPtr, size); + cu_result = + g_cu_ops.cuMemHostAlloc(resultPtr, size, cu_provider->alloc_flags); break; } case UMF_MEMORY_TYPE_DEVICE: { @@ -390,7 +432,7 @@ static umf_result_t cu_memory_provider_alloc(void *provider, size_t size, } case UMF_MEMORY_TYPE_SHARED: { cu_result = g_cu_ops.cuMemAllocManaged((CUdeviceptr *)resultPtr, size, - CU_MEM_ATTACH_GLOBAL); + cu_provider->alloc_flags); break; } default: diff --git a/test/providers/cuda_helpers.cpp b/test/providers/cuda_helpers.cpp index c8bca6166..aa0647080 100644 --- a/test/providers/cuda_helpers.cpp +++ b/test/providers/cuda_helpers.cpp @@ -22,7 +22,7 @@ struct libcu_ops { CUresult (*cuDeviceGet)(CUdevice *device, int ordinal); CUresult (*cuMemAlloc)(CUdeviceptr *dptr, size_t size); CUresult (*cuMemFree)(CUdeviceptr dptr); - CUresult (*cuMemAllocHost)(void **pp, size_t size); + CUresult (*cuMemHostAlloc)(void **pp, size_t size, unsigned int flags); CUresult (*cuMemAllocManaged)(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); CUresult (*cuMemFreeHost)(void *p); @@ -34,6 +34,7 @@ struct libcu_ops { CUresult (*cuPointerGetAttributes)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); + CUresult (*cuMemHostGetFlags)(unsigned int *pFlags, void *p); CUresult (*cuStreamSynchronize)(CUstream hStream); CUresult (*cuCtxSynchronize)(void); } libcu_ops; @@ -69,7 +70,7 @@ struct DlHandleCloser { libcu_ops.cuMemFree = [](auto... 
args) { return noop_stub(args...); }; - libcu_ops.cuMemAllocHost = [](auto... args) { + libcu_ops.cuMemHostAlloc = [](auto... args) { return noop_stub(args...); }; libcu_ops.cuMemAllocManaged = [](auto... args) { @@ -90,6 +91,9 @@ struct DlHandleCloser { libcu_ops.cuPointerGetAttributes = [](auto... args) { return noop_stub(args...); }; + libcu_ops.cuMemHostGetFlags = [](auto... args) { + return noop_stub(args...); + }; libcu_ops.cuStreamSynchronize = [](auto... args) { return noop_stub(args...); }; @@ -164,10 +168,10 @@ int InitCUDAOps() { fprintf(stderr, "cuMemFree_v2 symbol not found in %s\n", lib_name); return -1; } - *(void **)&libcu_ops.cuMemAllocHost = - utils_get_symbol_addr(cuDlHandle.get(), "cuMemAllocHost_v2", lib_name); - if (libcu_ops.cuMemAllocHost == nullptr) { - fprintf(stderr, "cuMemAllocHost_v2 symbol not found in %s\n", lib_name); + *(void **)&libcu_ops.cuMemHostAlloc = + utils_get_symbol_addr(cuDlHandle.get(), "cuMemHostAlloc", lib_name); + if (libcu_ops.cuMemHostAlloc == nullptr) { + fprintf(stderr, "cuMemHostAlloc symbol not found in %s\n", lib_name); return -1; } *(void **)&libcu_ops.cuMemAllocManaged = @@ -208,6 +212,12 @@ int InitCUDAOps() { lib_name); return -1; } + *(void **)&libcu_ops.cuMemHostGetFlags = + utils_get_symbol_addr(cuDlHandle.get(), "cuMemHostGetFlags", lib_name); + if (libcu_ops.cuMemHostGetFlags == nullptr) { + fprintf(stderr, "cuMemHostGetFlags symbol not found in %s\n", lib_name); + return -1; + } *(void **)&libcu_ops.cuStreamSynchronize = utils_get_symbol_addr( cuDlHandle.get(), "cuStreamSynchronize", lib_name); if (libcu_ops.cuStreamSynchronize == nullptr) { @@ -236,7 +246,7 @@ int InitCUDAOps() { libcu_ops.cuCtxSetCurrent = cuCtxSetCurrent; libcu_ops.cuDeviceGet = cuDeviceGet; libcu_ops.cuMemAlloc = cuMemAlloc; - libcu_ops.cuMemAllocHost = cuMemAllocHost; + libcu_ops.cuMemHostAlloc = cuMemHostAlloc; libcu_ops.cuMemAllocManaged = cuMemAllocManaged; libcu_ops.cuMemFree = cuMemFree; libcu_ops.cuMemFreeHost = 
cuMemFreeHost; @@ -244,6 +254,7 @@ int InitCUDAOps() { libcu_ops.cuMemcpy = cuMemcpy; libcu_ops.cuPointerGetAttribute = cuPointerGetAttribute; libcu_ops.cuPointerGetAttributes = cuPointerGetAttributes; + libcu_ops.cuMemHostGetFlags = cuMemHostGetFlags; libcu_ops.cuStreamSynchronize = cuStreamSynchronize; libcu_ops.cuCtxSynchronize = cuCtxSynchronize; @@ -373,6 +384,17 @@ umf_usm_memory_type_t get_mem_type(CUcontext context, void *ptr) { return UMF_MEMORY_TYPE_UNKNOWN; } +unsigned int get_mem_host_alloc_flags(void *ptr) { + unsigned int flags; + CUresult res = libcu_ops.cuMemHostGetFlags(&flags, ptr); + if (res != CUDA_SUCCESS) { + fprintf(stderr, "cuMemHostGetFlags() failed!\n"); + return 0; + } + + return flags; +} + CUcontext get_mem_context(void *ptr) { CUcontext context; CUresult res = libcu_ops.cuPointerGetAttribute( diff --git a/test/providers/cuda_helpers.h b/test/providers/cuda_helpers.h index 3d6572209..e7deb9064 100644 --- a/test/providers/cuda_helpers.h +++ b/test/providers/cuda_helpers.h @@ -44,6 +44,8 @@ int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, umf_usm_memory_type_t get_mem_type(CUcontext context, void *ptr); +unsigned int get_mem_host_alloc_flags(void *ptr); + CUcontext get_mem_context(void *ptr); CUcontext get_current_context(); diff --git a/test/providers/provider_cuda.cpp b/test/providers/provider_cuda.cpp index bacaacd6c..9c7f76dd1 100644 --- a/test/providers/provider_cuda.cpp +++ b/test/providers/provider_cuda.cpp @@ -60,7 +60,7 @@ CUDATestHelper::CUDATestHelper() { umf_cuda_memory_provider_params_handle_t create_cuda_prov_params(CUcontext context, CUdevice device, - umf_usm_memory_type_t memory_type) { + umf_usm_memory_type_t memory_type, unsigned int flags) { umf_cuda_memory_provider_params_handle_t params = nullptr; umf_result_t res = umfCUDAMemoryProviderParamsCreate(&params); @@ -86,6 +86,12 @@ create_cuda_prov_params(CUcontext context, CUdevice device, return nullptr; } + res = 
umfCUDAMemoryProviderParamsSetAllocFlags(params, flags); + if (res != UMF_RESULT_SUCCESS) { + umfCUDAMemoryProviderParamsDestroy(params); + return nullptr; + } + return params; } @@ -138,7 +144,7 @@ struct umfCUDAProviderTest expected_context = cudaTestHelper.get_test_context(); params = create_cuda_prov_params(cudaTestHelper.get_test_context(), cudaTestHelper.get_test_device(), - memory_type); + memory_type, 0 /* alloc flags */); ASSERT_NE(expected_context, nullptr); switch (memory_type) { @@ -350,7 +356,7 @@ TEST_P(umfCUDAProviderTest, multiContext) { ASSERT_EQ(ret, 0); umf_cuda_memory_provider_params_handle_t params1 = - create_cuda_prov_params(ctx1, device, UMF_MEMORY_TYPE_HOST); + create_cuda_prov_params(ctx1, device, UMF_MEMORY_TYPE_HOST, 0); ASSERT_NE(params1, nullptr); umf_memory_provider_handle_t provider1; umf_result_t umf_result = umfMemoryProviderCreate( @@ -361,7 +367,7 @@ TEST_P(umfCUDAProviderTest, multiContext) { ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); umf_cuda_memory_provider_params_handle_t params2 = - create_cuda_prov_params(ctx2, device, UMF_MEMORY_TYPE_HOST); + create_cuda_prov_params(ctx2, device, UMF_MEMORY_TYPE_HOST, 0); ASSERT_NE(params2, nullptr); umf_memory_provider_handle_t provider2; umf_result = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), params2, @@ -406,6 +412,115 @@ TEST_P(umfCUDAProviderTest, multiContext) { ASSERT_EQ(ret, 0); } +struct umfCUDAProviderAllocFlagsTest + : umf_test::test, + ::testing::WithParamInterface< + std::tuple<umf_usm_memory_type_t, unsigned int>> { + + void SetUp() override { + test::SetUp(); + + get_cuda_device(&device); + create_context(device, &context); + } + + void TearDown() override { + destroy_context(context); + + test::TearDown(); + } + + CUdevice device; + CUcontext context; +}; + +TEST_P(umfCUDAProviderAllocFlagsTest, cudaAllocFlags) { + auto [memory_type, test_flags] = this->GetParam(); + + umf_cuda_memory_provider_params_handle_t test_params = + create_cuda_prov_params(context, device, memory_type, test_flags); + + 
umf_memory_provider_handle_t provider = nullptr; + umf_result_t umf_result = umfMemoryProviderCreate( + umfCUDAMemoryProviderOps(), test_params, &provider); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(provider, nullptr); + + void *ptr = nullptr; + umf_result = umfMemoryProviderAlloc(provider, 128, 0, &ptr); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(ptr, nullptr); + + if (memory_type == UMF_MEMORY_TYPE_HOST) { + // check if the memory allocation flag is set correctly + unsigned int flags = get_mem_host_alloc_flags(ptr); + ASSERT_TRUE(flags & test_flags); + } + + umf_result = umfMemoryProviderFree(provider, ptr, 128); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + + umfMemoryProviderDestroy(provider); + umfCUDAMemoryProviderParamsDestroy(test_params); +} + +TEST_P(umfCUDAProviderAllocFlagsTest, reuseParams) { + auto [memory_type, test_flags] = this->GetParam(); + + // first, create a provider for SHARED memory type with empty alloc flags, + // and then reuse the test_params to create a provider for test params + umf_cuda_memory_provider_params_handle_t test_params = + create_cuda_prov_params(context, device, UMF_MEMORY_TYPE_SHARED, 0); + + umf_memory_provider_handle_t provider = nullptr; + + umf_result_t umf_result = umfMemoryProviderCreate( + umfCUDAMemoryProviderOps(), test_params, &provider); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(provider, nullptr); + + void *ptr = nullptr; + umf_result = umfMemoryProviderAlloc(provider, 128, 0, &ptr); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(ptr, nullptr); + + umf_result = umfMemoryProviderFree(provider, ptr, 128); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + + umfMemoryProviderDestroy(provider); + + // reuse the test_params to create a provider for test params + umf_result = + umfCUDAMemoryProviderParamsSetMemoryType(test_params, memory_type); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + + umf_result = + umfCUDAMemoryProviderParamsSetAllocFlags(test_params, 
test_flags); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + + umf_result = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + test_params, &provider); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(provider, nullptr); + + umf_result = umfMemoryProviderAlloc(provider, 128, 0, &ptr); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(ptr, nullptr); + + if (memory_type == UMF_MEMORY_TYPE_HOST) { + // check if the memory allocation flag is set correctly + unsigned int flags = get_mem_host_alloc_flags(ptr); + ASSERT_TRUE(flags & test_flags); + } + + umf_result = umfMemoryProviderFree(provider, ptr, 128); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + + umfMemoryProviderDestroy(provider); + + umfCUDAMemoryProviderParamsDestroy(test_params); +} + // TODO add tests that mixes CUDA Memory Provider and Disjoint Pool INSTANTIATE_TEST_SUITE_P(umfCUDAProviderTestSuite, umfCUDAProviderTest, @@ -413,6 +528,15 @@ INSTANTIATE_TEST_SUITE_P(umfCUDAProviderTestSuite, umfCUDAProviderTest, UMF_MEMORY_TYPE_SHARED, UMF_MEMORY_TYPE_HOST)); +INSTANTIATE_TEST_SUITE_P( + umfCUDAProviderAllocFlagsTestSuite, umfCUDAProviderAllocFlagsTest, + ::testing::Values( + std::make_tuple(UMF_MEMORY_TYPE_SHARED, CU_MEM_ATTACH_GLOBAL), + std::make_tuple(UMF_MEMORY_TYPE_SHARED, CU_MEM_ATTACH_HOST), + std::make_tuple(UMF_MEMORY_TYPE_HOST, CU_MEMHOSTALLOC_PORTABLE), + std::make_tuple(UMF_MEMORY_TYPE_HOST, CU_MEMHOSTALLOC_DEVICEMAP), + std::make_tuple(UMF_MEMORY_TYPE_HOST, CU_MEMHOSTALLOC_WRITECOMBINED))); + // TODO: add IPC API GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(umfIpcTest); /* diff --git a/test/providers/provider_cuda_not_impl.cpp b/test/providers/provider_cuda_not_impl.cpp index 30fc373ca..4054c26a8 100644 --- a/test/providers/provider_cuda_not_impl.cpp +++ b/test/providers/provider_cuda_not_impl.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -26,6 +26,9 @@ TEST_F(test, cuda_provider_not_implemented) { UMF_MEMORY_TYPE_DEVICE); ASSERT_EQ(result, UMF_RESULT_ERROR_NOT_SUPPORTED); + result = umfCUDAMemoryProviderParamsSetAllocFlags(hParams, 0); + ASSERT_EQ(result, UMF_RESULT_ERROR_NOT_SUPPORTED); + umf_memory_provider_ops_t *ops = umfCUDAMemoryProviderOps(); ASSERT_EQ(ops, nullptr); }