Commit a28250b

Update GGML_HIP_UMA (#473)

Add UMA config for higher speed, as in ggml-org/llama.cpp#7414, but with two changes:

- Remove the UMA build option.
- Fall back to UMA in every case where hipMalloc fails with "not enough memory".

A further change: look for 'hipcc' on Linux instead of 'amdclang++'.
1 parent c38feb4 commit a28250b
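
For context, the fallback this commit introduces can be sketched in isolation. The program below is a hypothetical standalone example, not part of the diff: assuming a ROCm/HIP toolchain, it first tries hipMalloc and, on hipErrorOutOfMemory, retries with hipMallocManaged plus a hipMemAdvise hint, which is the same pattern the new ggml_cuda_device_malloc follows.

    // Hypothetical standalone sketch (not from this commit) of the VRAM -> UMA
    // allocation fallback. Build with e.g.: hipcc -o uma_fallback uma_fallback.cpp
    #include <hip/hip_runtime.h>
    #include <cstdio>

    static hipError_t malloc_with_uma_fallback(void **ptr, size_t size, int device) {
        hipError_t res = hipMalloc(ptr, size);          // dedicated VRAM first
        if (res == hipErrorOutOfMemory) {
            // VRAM exhausted: retry with managed (unified) memory in host RAM.
            res = hipMallocManaged(ptr, size);
            if (res == hipSuccess) {
                // Coarse-grain coherence tends to be faster for GPU-only access.
                res = hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
            }
        }
        return res;
    }

    int main() {
        void *buf = nullptr;
        if (malloc_with_uma_fallback(&buf, 64u << 20, 0) == hipSuccess) {
            std::printf("allocated 64 MiB\n");
            hipFree(buf);  // hipFree releases both hipMalloc and managed memory
        }
        return 0;
    }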

File tree

2 files changed: +38 −19 lines changed

llama.cpp/ggml-cuda.cu

Lines changed: 22 additions & 9 deletions

@@ -159,13 +159,7 @@
 #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
 #define cudaHostUnregister hipHostUnregister
 #define cudaLaunchHostFunc hipLaunchHostFunc
-#ifdef GGML_HIP_UMA
-#define cudaMalloc hipMallocManaged
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
-#else
-#define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#endif
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
@@ -10866,6 +10860,25 @@ int ggml_cuda_get_device() {
     return id;
 }
 
+static inline cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
+#if defined(GGML_USE_HIPBLAS)
+    auto res = hipMalloc(ptr, size);
+    // if not enough space on VRAM => try with UMA
+    if (res == hipErrorOutOfMemory) {
+        GGML_CUDA_LOG_INFO(" Device %d: cannot alloc %d MB on VRAM, try alloc on HMM\n", device, (uint32_t)(size / 1024 / 1024));
+        res = hipMallocManaged(ptr, size);
+        if (res == hipSuccess) {
+            // Configure the memory for best speed (it is not supposed to fail)
+            CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+            GGML_CUDA_LOG_INFO(" => success\n");
+        }
+    }
+    return res;
+#else
+    return cudaMalloc(ptr, size);
+#endif
+}
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -11020,7 +11033,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         size_t look_ahead_size = (size_t) (1.05 * size);
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
-        CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
@@ -11286,7 +11299,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
     void * dev_ptr;
-    cudaError_t err = cudaMalloc(&dev_ptr, size);
+    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
@@ -11547,7 +11560,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
         char * buf;
-        CUDA_CHECK(cudaMalloc(&buf, size));
+        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
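
A usage note on the hunks above: after this change every device allocation in ggml-cuda.cu funnels through ggml_cuda_device_malloc, so the fallback needs no build flag and call sites keep their existing error handling; the device index parameter exists so the HIP path can direct the hipMemAdvise hint at the right GPU. The fragment below is illustrative, echoing the alloc_buffer hunk, and shows the unchanged caller-side pattern:

    // Caller-side pattern (fragment, echoing the diff above): the helper
    // returns cudaError_t, so failure handling stays the same at call sites.
    void * dev_ptr;
    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
    if (err != cudaSuccess) {
        cudaGetLastError();  // clear the error before reporting failure
        return nullptr;
    }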

llamafile/cuda.c

Lines changed: 16 additions & 10 deletions

@@ -778,9 +778,9 @@ static bool import_cuda_impl(void) {
 
     char dso[PATH_MAX];
     char bindir[PATH_MAX];
-    const char *compiler_path;
+    const char *compiler_path = NULL;
     char compiler_path_buf[PATH_MAX];
-    const char *library_path;
+    const char *library_path = NULL;
     char library_path_buf[PATH_MAX];
 
     // Attempt to load AMD GPU support.
@@ -791,15 +791,21 @@
 
     // Get some essential paths.
     // ROCm SDK puts BLAS DLLs in same folder as clang++
-    if (get_rocm_bin_path(compiler_path_buf, "amdclang++") ||
-        get_rocm_bin_path(compiler_path_buf, "clang++")) {
-        strcpy(library_path_buf, compiler_path_buf);
-        dirname(library_path_buf);
-        compiler_path = compiler_path_buf;
-        library_path = library_path_buf;
+    if (!IsWindows()) {
+        if (get_rocm_bin_path(compiler_path_buf, "hipcc")) {
+            strcpy(library_path_buf, compiler_path_buf);
+            dirname(library_path_buf);
+            compiler_path = compiler_path_buf;
+            library_path = library_path_buf;
+        }
     } else {
-        compiler_path = 0;
-        library_path = 0;
+        if (get_rocm_bin_path(compiler_path_buf, "amdclang++") ||
+            get_rocm_bin_path(compiler_path_buf, "clang++")) {
+            strcpy(library_path_buf, compiler_path_buf);
+            dirname(library_path_buf);
+            compiler_path = compiler_path_buf;
+            library_path = library_path_buf;
+        }
     }
 
     // Get path of GGML DSO for AMD.
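
As for the hipcc lookup itself, here is a minimal, hypothetical sketch of what probing for a ROCm binary on Linux can look like. This is not llamafile's get_rocm_bin_path, whose exact search order is outside this diff; the helper name and the search roots below are assumptions for illustration.

    // Hypothetical probe (not llamafile's get_rocm_bin_path): look for a ROCm
    // binary such as "hipcc" under $ROCM_PATH/bin, then /opt/rocm/bin.
    #include <climits>
    #include <cstdio>
    #include <cstdlib>
    #include <unistd.h>

    static bool find_rocm_bin(char path[PATH_MAX], const char *name) {
        const char *roots[] = { getenv("ROCM_PATH"), "/opt/rocm" };
        for (size_t i = 0; i < sizeof(roots) / sizeof(roots[0]); ++i) {
            if (!roots[i]) continue;                     // env var may be unset
            snprintf(path, PATH_MAX, "%s/bin/%s", roots[i], name);
            if (access(path, X_OK) == 0)                 // exists and executable
                return true;
        }
        return false;
    }

    int main() {
        char path[PATH_MAX];
        if (find_rocm_bin(path, "hipcc"))
            printf("found compiler: %s\n", path);
        else
            printf("hipcc not found\n");
        return 0;
    }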
