|
 #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
 #define cudaHostUnregister hipHostUnregister
 #define cudaLaunchHostFunc hipLaunchHostFunc
-#ifdef GGML_HIP_UMA
-#define cudaMalloc hipMallocManaged
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
-#else
-#define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#endif
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
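Note: the compile-time `GGML_HIP_UMA` switch removed above is superseded by a runtime fallback; the new `ggml_cuda_device_malloc` helper in the next hunk tries VRAM first and only falls back to unified memory when the allocation fails.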
@@ -10866,6 +10860,25 @@ int ggml_cuda_get_device() { |
     return id;
 }

+static inline cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+#if defined(GGML_USE_HIPBLAS)
+    auto res = hipMalloc(ptr, size);
+    // if there is not enough VRAM, fall back to unified memory (UMA)
+    if (res == hipErrorOutOfMemory) {
+        GGML_CUDA_LOG_INFO(" Device %d: cannot allocate %u MB of VRAM, trying HMM\n", device, (uint32_t)(size / 1024 / 1024));
+        res = hipMallocManaged(ptr, size);
+        if (res == hipSuccess) {
+            // configure the memory for best performance (this is not expected to fail)
+            CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+            GGML_CUDA_LOG_INFO(" => success\n");
+        }
+    }
+    return res;
+#else
+    return cudaMalloc(ptr, size);
+#endif
+}
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
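For anyone reading the hunk in isolation, here is a minimal standalone sketch of the fallback pattern the new helper implements, assuming only the HIP runtime headers; `alloc_with_uma_fallback` is a hypothetical name, not part of the PR:

```cpp
#include <hip/hip_runtime.h>

// Sketch: try dedicated VRAM first, fall back to unified (HMM) memory on OOM.
static hipError_t alloc_with_uma_fallback(void ** ptr, size_t size, int device) {
    hipError_t res = hipMalloc(ptr, size);
    if (res == hipErrorOutOfMemory) {
        res = hipMallocManaged(ptr, size);
        if (res == hipSuccess) {
            // coarse-grained coherence generally performs better for
            // device-side access patterns on AMD GPUs
            res = hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
        }
    }
    return res;
}
```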
@@ -11020,7 +11033,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { |
         size_t look_ahead_size = (size_t) (1.05 * size);
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
-        CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
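As an aside (not part of the diff), the surrounding pool code over-allocates by 5% and rounds up to a 256-byte multiple; a quick worked example:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    size_t size = 1000;                                    // requested bytes
    size_t look_ahead = (size_t) (1.05 * size);            // 1050
    look_ahead = 256 * ((look_ahead + 255) / 256);         // rounds up to 1280
    printf("%zu -> %zu\n", size, look_ahead);
    return 0;
}
```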
@@ -11286,7 +11299,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe |
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

     void * dev_ptr;
-    cudaError_t err = cudaMalloc(&dev_ptr, size);
+    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
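Context for the `cudaGetLastError()` call above: the CUDA runtime keeps the last error in per-thread state, so a failed allocation must be cleared before the next status check. A small sketch of the same pattern, assuming the standard CUDA runtime API:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    void * p = nullptr;
    cudaError_t err = cudaMalloc(&p, 1ull << 40);   // 1 TiB, expected to fail
    if (err != cudaSuccess) {
        cudaGetLastError();                         // reset the sticky error state
        printf("alloc failed: %s\n", cudaGetErrorString(err));
    }
    return 0;
}
```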
@@ -11547,7 +11560,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu |
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
         char * buf;
-        CUDA_CHECK(cudaMalloc(&buf, size));
+        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));

         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
|