6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/acc.cu
@@ -3,6 +3,9 @@
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
A collaborator commented on the guard above:
It might be better to define a dedicated macro and use it wherever needed. For example:

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
#define XXX_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
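As a sketch of that suggestion (the macro name below is hypothetical; the reviewer leaves it as XXX_AVAILABLE), every kernel's entry and exit guard would collapse to a single feature test:

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
#    define GGML_CUDA_PDL_AVAILABLE // hypothetical name for the reviewer's XXX_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER

// inside each kernel:
#ifdef GGML_CUDA_PDL_AVAILABLE
    cudaGridDependencySynchronize();
#endif // GGML_CUDA_PDL_AVAILABLE
    // ... kernel body ...
#ifdef GGML_CUDA_PDL_AVAILABLE
    cudaTriggerProgrammaticLaunchCompletion();
#endif // GGML_CUDA_PDL_AVAILABLE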

cudaGridDependencySynchronize();
#endif
const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;

if (i >= ne) {
@@ -25,6 +28,9 @@ static __global__ void acc_f32(const float * x, const float * y, float * dst, co
val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
}
dst[i] = val;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
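The pair of calls added to acc_f32 above is the device-side half of Programmatic Dependent Launch (PDL), available from Hopper (compute capability 9.0), and the same entry/exit pattern recurs in every kernel in this diff: cudaGridDependencySynchronize() blocks until the grids this kernel depends on have made their writes visible, and cudaTriggerProgrammaticLaunchCompletion() signals that a dependent grid may begin launching before this one exits. The !defined(GGML_USE_HIP) half of the guard excludes HIP builds, which do not provide these intrinsics. A minimal sketch on an illustrative kernel that is not part of this PR (GGML_CUDA_CC_HOPPER is the constant this PR adds to common.cuh):

// Illustrative only: the device-side PDL pattern, guarded the same way as in the PR.
__global__ void scale_f32(const float * x, float * dst, const float s, const int n) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
    cudaGridDependencySynchronize(); // wait until the upstream grid's writes to x are visible
#endif
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i < n) {
        dst[i] = s*x[i];
    }
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
    cudaTriggerProgrammaticLaunchCompletion(); // this block is done writing; a dependent grid may start
#endif
}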
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/arange.cu
@@ -1,12 +1,18 @@
#include "arange.cuh"

static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
// blockIdx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
dst[nidx] = start + step * nidx;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/argmax.cu
@@ -6,6 +6,9 @@
#include "sum.cuh"

static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int64_t row = blockIdx.x;

float maxval = -FLT_MAX;
@@ -64,6 +67,9 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest
if (warp_id == 0 && lane_id == 0) {
dst[row] = argmax;
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/argsort.cu
@@ -9,6 +9,9 @@ static inline __device__ void ggml_cuda_swap(T & a, T & b) {

template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
// bitonic sort
int col = threadIdx.x;
int row = blockIdx.y;
@@ -55,6 +58,9 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
if (col < ncols) {
dst[row * ncols + col] = dst_row[col];
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static int next_power_of_2(int x) {
18 changes: 18 additions & 0 deletions ggml/src/ggml-cuda/binbcast.cu
@@ -29,6 +29,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
@@ -54,6 +57,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
@@ -63,6 +69,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif

const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -89,13 +98,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s

const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template <typename T>
static __global__ void k_repeat_back(
const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const size_t s00, const size_t s01, const size_t s02, const size_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif

const int64_t tid0 = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
const int64_t tid1 = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
@@ -118,6 +133,9 @@ static __global__ void k_repeat_back(
}
}
dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template<float (*bin_op)(const float, const float)>
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/clamp.cu
@@ -6,13 +6,19 @@ static __device__ __forceinline__ float op_clamp(float x, float min, float max)

template <class T>
static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int k) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= k) {
return;
}

dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max);
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template <class T>
13 changes: 13 additions & 0 deletions ggml/src/ggml-cuda/common.cuh
@@ -47,6 +47,7 @@
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_ADA_LOVELACE 890
#define GGML_CUDA_CC_HOPPER 900
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
@@ -414,6 +415,9 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
template<bool norm>
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int row = blockIdx.x;
const int col = threadIdx.x;

@@ -425,10 +429,16 @@ static __global__ void reduce_rows_f32(const float * x, float * dst, const int n
sum = warp_reduce_sum(sum);

if (col != 0) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
return;
}

dst[row] = norm ? sum / ncols : sum;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template<int width = WARP_SIZE>
@@ -832,6 +842,9 @@ struct ggml_cuda_graph {
// Index to allow each cpy kernel to be aware of its position within the graph
// relative to other cpy nodes.
int graph_cpynode_index = -1;
std::vector<cudaGraphNode_t> graph_nodes;
std::vector<cudaGraphNode_t> graph_dependencies;
bool allow_pdl = true; // whether Programmatic Dependent Launch can be used within CUDA graph
#endif
};

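The common.cuh hunks supply the pieces the kernels rely on: the GGML_CUDA_CC_HOPPER constant used in every architecture guard, and new per-graph bookkeeping (graph_nodes, graph_dependencies, allow_pdl), presumably so PDL can be wired into, or disabled for, captured CUDA graphs. For orientation only, this is how a PDL launch is enabled for an ordinary stream launch with the stock CUDA runtime API (CUDA 11.8+); it is not code from this PR, and how the PR applies the equivalent attribute inside graphs is not shown in this excerpt. The kernel is the illustrative scale_f32 from the sketch above; stream, num_blocks, block_size, d_x, d_dst, and n are placeholders.

// Not code from this PR: minimal host-side PDL launch via the extended launch API.
cudaLaunchConfig_t cfg = {};
cfg.gridDim  = dim3(num_blocks); // placeholder grid size
cfg.blockDim = dim3(block_size); // placeholder block size
cfg.stream   = stream;

cudaLaunchAttribute attr[1];
attr[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attr[0].val.programmaticStreamSerializationAllowed = 1;
cfg.attrs    = attr;
cfg.numAttrs = 1;

// This launch may overlap the tail of the previous grid on the same stream;
// the device-side sync/trigger calls are what make that overlap correct.
cudaLaunchKernelEx(&cfg, scale_f32, d_x, d_dst, 2.0f, n);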
24 changes: 24 additions & 0 deletions ggml/src/ggml-cuda/concat.cu
@@ -2,6 +2,9 @@

// contiguous kernels
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
@@ -25,9 +28,15 @@ static __global__ void concat_f32_dim0(const float * x, const float * y, float *
blockIdx.z * (ne0 - ne00) * gridDim.y;
dst[offset_dst] = y[offset_src];
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
@@ -51,9 +60,15 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
blockIdx.z * ne0 * (gridDim.y - ne01);
dst[offset_dst] = y[offset_src];
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
@@ -77,6 +92,9 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
(blockIdx.z - ne02) * ne0 * gridDim.y;
dst[offset_dst] = y[offset_src];
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
@@ -124,6 +142,9 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

const int64_t i3 = blockIdx.z;
@@ -151,6 +172,9 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)

*y = *x;
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}


6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/conv-transpose-1d.cu
@@ -6,6 +6,9 @@ static __global__ void conv_transpose_1d_kernel(
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
if (global_index >= output_size) {
return;
@@ -38,6 +41,9 @@ static __global__ void conv_transpose_1d_kernel(
GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static void conv_transpose_1d_f32_f32_cuda(
9 changes: 9 additions & 0 deletions ggml/src/ggml-cuda/conv2d-dw.cu
@@ -84,10 +84,16 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr
const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
const int channels, const int batches) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int total_elements = batches * channels * out_h * out_w;

if (global_idx >= total_elements) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
return;
}

@@ -114,6 +120,9 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr
}

output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
9 changes: 9 additions & 0 deletions ggml/src/ggml-cuda/conv2d-transpose.cu
@@ -7,11 +7,17 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const
float * __restrict__ output, const int in_w, const int in_h, const int out_w,
const int out_h, const int kernel_w, const int kernel_h, const int stride,
const int c_in, const int c_out, const int batches) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;

const int total_elements = out_w * out_h * c_out * batches;

if (global_idx >= total_elements) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
return;
}

@@ -49,6 +55,9 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const
}

output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) *out_y_idx + out_x_idx] = accumulator;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

//input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in)
15 changes: 15 additions & 0 deletions ggml/src/ggml-cuda/convert.cu
@@ -9,9 +9,15 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t s01, const int64_t s02, const int64_t s03) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);

if (i00 >= ne00) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
return;
}

@@ -33,10 +39,16 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
y[iy0 + 0] = float(v.x);
y[iy0 + y_offset] = float(v.y);
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template <bool need_check>
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

@@ -592,6 +604,9 @@ template <typename src_t, typename dst_t>
static __global__ void convert_unary(
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t s01, const int64_t s02, const int64_t s03) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

if (i00 >= ne00) {