Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions src/tools/perf/cuda/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ libucx_perftest_cuda_la_CPPFLAGS = $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS)
libucx_perftest_cuda_la_CFLAGS = $(BASE_CFLAGS) $(CUDA_CFLAGS) $(LT_CFLAGS)
libucx_perftest_cuda_la_LDFLAGS = $(CUDA_LDFLAGS) -version-info $(SOVERSION)
libucx_perftest_cuda_la_LIBADD = $(CUDART_LIBS)
libucx_perftest_cuda_la_SOURCES = cuda_alloc.c
libucx_perftest_cuda_la_SOURCES = cuda_alloc.c ucp_cuda_impl.c

if HAVE_NVCC

Expand All @@ -22,11 +22,30 @@ libucx_perftest_cuda_la_LIBADD += \
-lstdc++

libucx_perftest_cuda_la_SOURCES += \
ucp_cuda_kernel.cu
ucp_cuda_kernel_bw.cu \
ucp_cuda_kernel_bw_thread_fc.cu \
ucp_cuda_kernel_bw_thread_nofc.cu \
ucp_cuda_kernel_bw_warp_fc.cu \
ucp_cuda_kernel_bw_warp_nofc.cu \
ucp_cuda_kernel_latency.cu \
ucp_cuda_kernel_latency_thread.cu \
ucp_cuda_kernel_latency_warp.cu \
ucp_cuda_kernel_wait.cu \
ucp_cuda_host.cu

noinst_HEADERS = \
cuda_common.h \
cuda_kernel.cuh
cuda_context.h \
cuda_kernel.cuh \
ucp_cuda_impl.h \
ucp_cuda_kernel_bw.cuh \
ucp_cuda_kernel_bw_dispatch.cuh \
ucp_cuda_kernel_bw_impl.cuh \
ucp_cuda_kernel_common.cuh \
ucp_cuda_kernel_latency.cuh \
ucp_cuda_kernel_latency_dispatch.cuh \
ucp_cuda_kernel_latency_impl.cuh \
ucp_cuda_kernel_wait.cuh

include $(top_srcdir)/config/cuda.am
endif # HAVE_NVCC
Expand Down
1 change: 1 addition & 0 deletions src/tools/perf/cuda/cuda_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#define CUDA_COMMON_H_

#include <ucs/debug/log_def.h>
#include <cuda_runtime.h>

BEGIN_C_DECLS

Expand Down
28 changes: 28 additions & 0 deletions src/tools/perf/cuda/cuda_context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/**
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*
* C-compatible header for CUDA perf context.
*/

#ifndef CUDA_CONTEXT_H_
#define CUDA_CONTEXT_H_

#include <tools/perf/api/libperf.h>
#include <ucs/type/status.h>

typedef unsigned long long ucx_perf_cuda_time_t;

typedef struct ucx_perf_cuda_context {
ucx_perf_channel_mode_t channel_mode;
unsigned long long channel_rand_seed;
unsigned max_outstanding;
unsigned device_fc_window;
ucx_perf_counter_t max_iters;
ucx_perf_cuda_time_t report_interval_ns;
ucx_perf_counter_t completed_iters;
ucs_status_t status;
} ucx_perf_cuda_context_t;

#endif /* CUDA_CONTEXT_H_ */
25 changes: 6 additions & 19 deletions src/tools/perf/cuda/cuda_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,12 @@
#define CUDA_KERNEL_CUH_

#include "cuda_common.h"
#include "cuda_context.h"

#include <tools/perf/lib/libperf_int.h>
#include <ucs/sys/device_code.h>
#include <cuda_runtime.h>


typedef unsigned long long ucx_perf_cuda_time_t;

struct ucx_perf_cuda_context {
ucx_perf_channel_mode_t channel_mode;
unsigned long long channel_rand_seed;
unsigned max_outstanding;
unsigned device_fc_window;
ucx_perf_counter_t max_iters;
ucx_perf_cuda_time_t report_interval_ns;
ucx_perf_counter_t completed_iters;
ucs_status_t status;
};

UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
{
ucx_perf_cuda_time_t globaltimer;
Expand All @@ -41,7 +28,7 @@ public:
static const unsigned UPDATES_PER_INTERVAL = 5;

__device__
ucx_perf_cuda_reporter(ucx_perf_cuda_context &ctx) :
ucx_perf_cuda_reporter(ucx_perf_cuda_context_t &ctx) :
m_ctx(ctx),
m_max_iters(ctx.max_iters),
m_next_report_iter(1),
Expand Down Expand Up @@ -71,7 +58,7 @@ public:
}

private:
ucx_perf_cuda_context &m_ctx;
ucx_perf_cuda_context_t &m_ctx;
ucx_perf_counter_t m_max_iters;
ucx_perf_counter_t m_next_report_iter;
ucx_perf_counter_t m_last_completed;
Expand Down Expand Up @@ -213,14 +200,14 @@ public:

protected:
ucx_perf_context_t &m_perf;
ucx_perf_cuda_context *m_cpu_ctx;
ucx_perf_cuda_context *m_gpu_ctx;
ucx_perf_cuda_context_t *m_cpu_ctx;
ucx_perf_cuda_context_t *m_gpu_ctx;

private:
void init_ctx()
{
CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostAlloc, &m_cpu_ctx,
sizeof(ucx_perf_cuda_context), cudaHostAllocMapped);
sizeof(ucx_perf_cuda_context_t), cudaHostAllocMapped);
CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
&m_gpu_ctx, m_cpu_ctx, 0);
}
Expand Down
51 changes: 51 additions & 0 deletions src/tools/perf/cuda/ucp_cuda_host.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/**
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*
* Pure host dispatcher: C++ runner + registration only.
* Kernel launches are in ucp_cuda_kernel_*.cu (each has kernel + host entry point).
*/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include "cuda_kernel.cuh"
#include "ucp_cuda_impl.h"

class ucp_perf_cuda_test_runner : public ucx_perf_cuda_test_runner {
public:
ucp_perf_cuda_test_runner(ucx_perf_context_t &perf) :
ucx_perf_cuda_test_runner(perf)
{
size_t length = ucx_perf_get_message_size(&m_perf.params) + ONESIDED_SIGNAL_SIZE;

m_perf.send_allocator->memset(m_perf.send_buffer, 0, length);
m_perf.recv_allocator->memset(m_perf.recv_buffer, 0, length);
}

ucs_status_t run_pingpong()
{
return ucp_perf_cuda_run_pingpong(&m_perf, m_cpu_ctx, m_gpu_ctx);
}

ucs_status_t run_stream_uni()
{
return ucp_perf_cuda_run_stream_uni(&m_perf, m_cpu_ctx, m_gpu_ctx);
}
};

ucx_perf_device_dispatcher_t ucx_perf_cuda_dispatcher;

UCS_STATIC_INIT {
ucx_perf_cuda_dispatcher.ucp_dispatch = ucx_perf_cuda_dispatch<ucp_perf_cuda_test_runner>;

ucx_perf_mem_type_device_dispatchers[UCS_MEMORY_TYPE_CUDA] = &ucx_perf_cuda_dispatcher;
ucx_perf_mem_type_device_dispatchers[UCS_MEMORY_TYPE_CUDA_MANAGED] = &ucx_perf_cuda_dispatcher;
}

UCS_STATIC_CLEANUP {
ucx_perf_mem_type_device_dispatchers[UCS_MEMORY_TYPE_CUDA] = NULL;
ucx_perf_mem_type_device_dispatchers[UCS_MEMORY_TYPE_CUDA_MANAGED] = NULL;
}
Loading
Loading