Skip to content

Commit 7eb9f89

Browse files
raffenethzhou
authored andcommitted
mpl/hip: Use IPC handle validation instead of free hooks
The HIP memory hooks are unreliable because they rely on specific link order at application build time. Implement IPC handle validation instead which should always work.
1 parent e757dea commit 7eb9f89

File tree

2 files changed

+18
-87
lines changed

2 files changed

+18
-87
lines changed

src/mpl/include/mpl_gpu_hip.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
#include "hip/hip_runtime.h"
1414
#include "hip/hip_runtime_api.h"
1515

16-
typedef hipIpcMemHandle_t MPL_gpu_ipc_mem_handle_t;
16+
typedef struct {
17+
hipIpcMemHandle_t handle;
18+
uint32_t id;
19+
} MPL_gpu_ipc_mem_handle_t;
1720
typedef int MPL_gpu_device_handle_t;
1821
typedef struct hipPointerAttribute_t MPL_gpu_device_attr;
1922
typedef int MPL_gpu_request;

src/mpl/src/gpu/mpl_gpu_hip.c

Lines changed: 14 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@
1111
#define HIP_ERR_CHECK(ret) if (unlikely((ret) != hipSuccess)) goto fn_fail
1212
#define HI_ERR_CHECK(ret) if (unlikely((ret) != HIP_SUCCESS)) goto fn_fail
1313

14-
typedef struct gpu_free_hook {
15-
void (*free_hook) (void *dptr);
16-
struct gpu_free_hook *next;
17-
} gpu_free_hook_s;
18-
1914
static int gpu_initialized = 0;
2015
static int device_count = -1;
2116
static int max_dev_id = -1;
@@ -26,13 +21,6 @@ static char affinity_env[MAX_GPU_STR_LEN] = { 0 };
2621
static int *local_to_global_map; /* [device_count] */
2722
static int *global_to_local_map; /* [max_dev_id + 1] */
2823

29-
static gpu_free_hook_s *free_hook_chain = NULL;
30-
31-
static hipError_t(*sys_hipFree) (void *dptr);
32-
33-
static int gpu_mem_hook_init();
34-
static MPL_initlock_t free_hook_mutex = MPL_INITLOCK_INITIALIZER;
35-
3624
int MPL_gpu_get_dev_count(int *dev_cnt, int *dev_id, int *subdevice_id)
3725
{
3826
int ret = MPL_SUCCESS;
@@ -187,7 +175,11 @@ int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr,
187175
int mpl_err = MPL_SUCCESS;
188176
hipError_t ret;
189177

190-
ret = hipIpcGetMemHandle(ipc_handle, (void *) ptr);
178+
ret = hipIpcGetMemHandle(&ipc_handle->handle, (void *) ptr);
179+
HIP_ERR_CHECK(ret);
180+
181+
ret = hipPointerGetAttribute(&ipc_handle->id, HIP_POINTER_ATTRIBUTE_BUFFER_ID,
182+
(hipDeviceptr_t) ptr);
191183
HIP_ERR_CHECK(ret);
192184

193185
fn_exit:
@@ -210,7 +202,7 @@ int MPL_gpu_ipc_handle_map(MPL_gpu_ipc_mem_handle_t * ipc_handle, int dev_id, vo
210202

211203
hipGetDevice(&prev_devid);
212204
hipSetDevice(dev_id);
213-
ret = hipIpcOpenMemHandle(ptr, *ipc_handle, hipIpcMemLazyEnablePeerAccess);
205+
ret = hipIpcOpenMemHandle(ptr, ipc_handle->handle, hipIpcMemLazyEnablePeerAccess);
214206
HIP_ERR_CHECK(ret);
215207

216208
fn_exit:
@@ -237,7 +229,13 @@ int MPL_gpu_ipc_handle_unmap(void *ptr)
237229

238230
bool MPL_gpu_ipc_handle_is_valid(MPL_gpu_ipc_mem_handle_t * handle, void *ptr)
239231
{
240-
return true;
232+
hipError_t ret;
233+
uint32_t buffer_id;
234+
235+
ret = hipPointerGetAttribute(&buffer_id, HIP_POINTER_ATTRIBUTE_BUFFER_ID, (hipDeviceptr_t) ptr);
236+
assert(ret == hipSuccess);
237+
238+
return buffer_id == handle->id;
241239
}
242240

243241
int MPL_gpu_malloc_host(void **ptr, size_t size)
@@ -376,12 +374,6 @@ int MPL_gpu_init(int debug_summary)
376374
max_dev_id = device_count - 1;
377375
}
378376

379-
/* gpu shm module would cache gpu handle to accelerate intra-node
380-
* communication; we must register hooks for memory-related functions
381-
* in hip, such as hipFree, to track user behaviors on
382-
* the memory buffer and invalidate cached handle/buffer respectively
383-
* for result correctness. */
384-
gpu_mem_hook_init();
385377
gpu_initialized = 1;
386378

387379
if (MPL_gpu_info.debug_summary) {
@@ -409,16 +401,6 @@ int MPL_gpu_finalize(void)
409401
MPL_free(local_to_global_map);
410402
MPL_free(global_to_local_map);
411403

412-
gpu_free_hook_s *prev;
413-
MPL_initlock_lock(&free_hook_mutex);
414-
while (free_hook_chain) {
415-
prev = free_hook_chain;
416-
free_hook_chain = free_hook_chain->next;
417-
MPL_free(prev);
418-
}
419-
free_hook_chain = NULL;
420-
MPL_initlock_unlock(&free_hook_mutex);
421-
422404
/* Reset initialization state */
423405
gpu_initialized = 0;
424406

@@ -463,66 +445,12 @@ int MPL_gpu_get_buffer_bounds(const void *ptr, void **pbase, uintptr_t * len)
463445
goto fn_exit;
464446
}
465447

466-
static void gpu_free_hooks_cb(void *dptr)
467-
{
468-
gpu_free_hook_s *current = free_hook_chain;
469-
if (dptr != NULL) {
470-
/* we call gpu hook only when dptr != NULL */
471-
while (current) {
472-
current->free_hook(dptr);
473-
current = current->next;
474-
}
475-
}
476-
return;
477-
}
478-
479-
static int gpu_mem_hook_init()
480-
{
481-
void *libhip_handle;
482-
void *libhiprt_handle;
483-
484-
libhip_handle = dlopen("libamdhip64.so", RTLD_LAZY | RTLD_GLOBAL);
485-
assert(libhip_handle);
486-
sys_hipFree = (void *) dlsym(libhip_handle, "hipFree");
487-
assert(sys_hipFree);
488-
return MPL_SUCCESS;
489-
}
490-
491448
int MPL_gpu_free_hook_register(void (*free_hook) (void *dptr))
492449
{
493-
gpu_free_hook_s *hook_obj = MPL_malloc(sizeof(gpu_free_hook_s), MPL_MEM_OTHER);
494-
assert(hook_obj);
495-
hook_obj->free_hook = free_hook;
496-
hook_obj->next = NULL;
497-
498-
MPL_initlock_lock(&free_hook_mutex);
499-
if (!free_hook_chain)
500-
free_hook_chain = hook_obj;
501-
else {
502-
hook_obj->next = free_hook_chain;
503-
free_hook_chain = hook_obj;
504-
}
505-
MPL_initlock_unlock(&free_hook_mutex);
506-
450+
/* free hooks not used */
507451
return MPL_SUCCESS;
508452
}
509453

510-
hipError_t hipFree(void *dptr)
511-
{
512-
hipError_t result;
513-
MPL_initlock_lock(&free_hook_mutex);
514-
515-
if (!sys_hipFree) {
516-
gpu_mem_hook_init();
517-
}
518-
519-
gpu_free_hooks_cb(dptr);
520-
result = sys_hipFree(dptr);
521-
522-
MPL_initlock_unlock(&free_hook_mutex);
523-
return result;
524-
}
525-
526454
int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
527455
MPL_pointer_attr_t * dest_attr, size_t size)
528456
{

0 commit comments

Comments
 (0)