diff --git a/ggml/src/ggml-cuda/acc.cu b/ggml/src/ggml-cuda/acc.cu index e084607c029a6..c282c138ad7c4 100644 --- a/ggml/src/ggml-cuda/acc.cu +++ b/ggml/src/ggml-cuda/acc.cu @@ -3,6 +3,9 @@ static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i = blockDim.x * blockIdx.x + threadIdx.x; if (i >= ne) { @@ -25,6 +28,9 @@ static __global__ void acc_f32(const float * x, const float * y, float * dst, co val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10]; } dst[i] = val; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements, diff --git a/ggml/src/ggml-cuda/arange.cu b/ggml/src/ggml-cuda/arange.cu index b5e495a246227..74e07a0d63050 100644 --- a/ggml/src/ggml-cuda/arange.cu +++ b/ggml/src/ggml-cuda/arange.cu @@ -1,12 +1,18 @@ #include "arange.cuh" static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // blockIDx.x: idx of ne0 / BLOCK_SIZE int nidx = threadIdx.x + blockIdx.x * blockDim.x; if (nidx >= ne0) { return; } dst[nidx] = start + step * nidx; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu index 5340eedc08916..23706249631f6 100644 --- a/ggml/src/ggml-cuda/argmax.cu +++ b/ggml/src/ggml-cuda/argmax.cu @@ -6,6 +6,9 @@ #include "sum.cuh" static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t row = blockIdx.x; float maxval = -FLT_MAX; @@ -64,6 +67,9 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest if (warp_id == 0 && lane_id == 0) { dst[row] = argmax; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 607ded8558b45..a3c595bb4addc 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -9,6 +9,9 @@ static inline __device__ void ggml_cuda_swap(T & a, T & b) { template static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // bitonic sort int col = threadIdx.x; int row = blockIdx.y; @@ -55,6 +58,9 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n if (col < ncols) { dst[row * ncols + col] = dst_row[col]; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static int 
next_power_of_2(int x) { diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index e1fbf0e13665d..96ba8726d00cc 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -29,6 +29,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst /*int s0, */ int s1, int s2, int s3, /*int s00,*/ int s01, int s02, int s03, /*int s10,*/ int s11, int s12, int s13) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i0s = blockDim.x*blockIdx.x + threadIdx.x; const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3; @@ -54,6 +57,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst const int i10 = i0 % ne10; dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -63,6 +69,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s /*int s0, */ int s1, int s2, int s3, /*int s00,*/ int s01, int s02, int s03, /*int s10,*/ int s11, int s12, int s13) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -89,6 +98,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s const int i10 = i0 % ne10; dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -96,6 +108,9 @@ static __global__ void k_repeat_back( const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, const size_t s00, const size_t s01, const size_t s02, const size_t s03, const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t tid0 = int64_t(blockIdx.x)*blockDim.x + threadIdx.x; const int64_t tid1 = int64_t(blockIdx.y)*blockDim.y + threadIdx.y; @@ -118,6 +133,9 @@ static __global__ void k_repeat_back( } } dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/clamp.cu b/ggml/src/ggml-cuda/clamp.cu index fe415e7f78dd6..5bda480d1fcb9 100644 --- a/ggml/src/ggml-cuda/clamp.cu +++ b/ggml/src/ggml-cuda/clamp.cu @@ -6,6 +6,9 @@ static __device__ __forceinline__ float op_clamp(float x, float min, float max) template static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -13,6 +16,9 @@ static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const } dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh 
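Every kernel touched by these hunks gets the same two-call instrumentation: cudaGridDependencySynchronize() on entry and cudaTriggerProgrammaticLaunchCompletion() on exit, compiled in only for NVIDIA targets of Hopper or newer (GGML_CUDA_CC_HOPPER, defined as 900 in the common.cuh hunk that follows). A minimal sketch of the pattern on a toy kernel that is not part of the diff:

// Illustrative only: a toy kernel (not from the diff) instrumented the same way as the
// ggml kernels in the hunks above. GGML_CUDA_CC_HOPPER is 900, per the common.cuh hunk below.
__global__ void example_scale_f32(const float * x, float * dst, const float s, const int64_t ne) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
    // Wait until the grid this launch programmatically depends on has finished writing its results.
    cudaGridDependencySynchronize();
#endif
    const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;
    if (i < ne) {
        dst[i] = s*x[i];
    }
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
    // Signal that grids depending on this one may start launching.
    cudaTriggerProgrammaticLaunchCompletion();
#endif
}

The trigger is only a hint: a block that returns without calling it implicitly counts as complete when it exits, so the explicit calls that the hunks place before early returns should only release dependent kernels a little sooner. Likewise, calling either function in a kernel that was not launched with a programmatic dependency should be harmless, which is what makes a blanket per-kernel change like this viable.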
index 7fb04d51b770f..45446887028ca 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -47,6 +47,7 @@ #define GGML_CUDA_CC_TURING 750 #define GGML_CUDA_CC_AMPERE 800 #define GGML_CUDA_CC_ADA_LOVELACE 890 +#define GGML_CUDA_CC_HOPPER 900 #define GGML_CUDA_CC_OFFSET_AMD 0x1000000 #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000 #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS) @@ -414,6 +415,9 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { // Row reduction kernel template - compute sum (norm=false) or mean (norm=true) template static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int row = blockIdx.x; const int col = threadIdx.x; @@ -425,10 +429,16 @@ static __global__ void reduce_rows_f32(const float * x, float * dst, const int n sum = warp_reduce_sum(sum); if (col != 0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } dst[row] = norm ? sum / ncols : sum; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -832,6 +842,9 @@ struct ggml_cuda_graph { // Index to allow each cpy kernel to be aware of it's position within the graph // relative to other cpy nodes. int graph_cpynode_index = -1; + std::vector graph_nodes; + std::vector graph_dependencies; + bool allow_pdl = true; // whether Programmatic Dependent Launch can be used within CUDA graph #endif }; diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index e9ffd274b9966..2665dcb2b84a4 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -2,6 +2,9 @@ // contiguous kernels static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif int nidx = threadIdx.x + blockIdx.x * blockDim.x; if (nidx >= ne0) { return; @@ -25,9 +28,15 @@ static __global__ void concat_f32_dim0(const float * x, const float * y, float * blockIdx.z * (ne0 - ne00) * gridDim.y; dst[offset_dst] = y[offset_src]; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif int nidx = threadIdx.x + blockIdx.x * blockDim.x; if (nidx >= ne0) { return; @@ -51,9 +60,15 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float * blockIdx.z * ne0 * (gridDim.y - ne01); dst[offset_dst] = y[offset_src]; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif int nidx = threadIdx.x + blockIdx.x * blockDim.x; if (nidx >= ne0) { return; @@ -77,6 +92,9 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float * (blockIdx.z - ne02) * ne0 * gridDim.y; dst[offset_dst] = 
y[offset_src]; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) { @@ -124,6 +142,9 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) uint64_t nb1, uint64_t nb2, uint64_t nb3){ +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]"); const int64_t i3 = blockIdx.z; @@ -151,6 +172,9 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) *y = *x; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } diff --git a/ggml/src/ggml-cuda/conv-transpose-1d.cu b/ggml/src/ggml-cuda/conv-transpose-1d.cu index fe4caf674d4d9..fa859047dfb5d 100644 --- a/ggml/src/ggml-cuda/conv-transpose-1d.cu +++ b/ggml/src/ggml-cuda/conv-transpose-1d.cu @@ -6,6 +6,9 @@ static __global__ void conv_transpose_1d_kernel( const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3, const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3, const float * src0, const float * src1, float * dst) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif int global_index = threadIdx.x + blockIdx.x * blockDim.x; if (global_index >= output_size) { return; @@ -38,6 +41,9 @@ static __global__ void conv_transpose_1d_kernel( GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3); GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1); GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void conv_transpose_1d_f32_f32_cuda( diff --git a/ggml/src/ggml-cuda/conv2d-dw.cu b/ggml/src/ggml-cuda/conv2d-dw.cu index 7583233b1b7cd..1d291837c552b 100644 --- a/ggml/src/ggml-cuda/conv2d-dw.cu +++ b/ggml/src/ggml-cuda/conv2d-dw.cu @@ -84,10 +84,16 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr const int kernel_w, const int kernel_h, const int stride_x, const int stride_y, const int padding_x, const int padding_y, const int dilation_x, const int dilation_y, const int channels, const int batches) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = batches * channels * out_h * out_w; if (global_idx >= total_elements) { + #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -114,6 +120,9 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr } output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cu b/ggml/src/ggml-cuda/conv2d-transpose.cu index 03224e404d32d..3e6c14c8ff83a 100644 --- a/ggml/src/ggml-cuda/conv2d-transpose.cu +++ b/ggml/src/ggml-cuda/conv2d-transpose.cu @@ -7,11 +7,17 @@ __global__ void conv2d_transpose_kernel(const float * 
__restrict__ input, const float * __restrict__ output, const int in_w, const int in_h, const int out_w, const int out_h, const int kernel_w, const int kernel_h, const int stride, const int c_in, const int c_out, const int batches) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = out_w * out_h * c_out * batches; if (global_idx >= total_elements) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -49,6 +55,9 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const } output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) *out_y_idx + out_x_idx] = accumulator; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } //input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 15c927861f03d..2491fe0b43f0d 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -9,9 +9,15 @@ template static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x); if (i00 >= ne00) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -33,10 +39,16 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs; y[iy0 + 0] = float(v.x); y[iy0 + y_offset] = float(v.y); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif #if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; @@ -592,6 +604,9 @@ template static __global__ void convert_unary( const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i00 >= ne00) { diff --git a/ggml/src/ggml-cuda/count-equal.cu b/ggml/src/ggml-cuda/count-equal.cu index 08898115daed2..0b02557b6885c 100644 --- a/ggml/src/ggml-cuda/count-equal.cu +++ b/ggml/src/ggml-cuda/count-equal.cu @@ -5,6 +5,9 @@ template static __global__ void count_equal(const T * __restrict__ x, const T * __restrict__ y, int64_t * __restrict__ dst, const int64_t dk, const int64_t k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i0 = (int64_t) blockIdx.x*dk; const int64_t i1 = min(i0 + dk, k); @@ -23,6 +26,9 @@ static __global__ void count_equal(const 
T * __restrict__ x, const T * __restric } atomicAdd((int *) dst, nequal); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index f9bb025643ca2..3ed577c659536 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -12,9 +12,15 @@ static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -35,6 +41,9 @@ static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; cpy_1(cx + x_offset, cdst + dst_offset); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { @@ -67,9 +76,15 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -88,6 +103,9 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; cpy_blck(cx + x_offset, cdst + dst_offset); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -95,6 +113,9 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu index 0c8b0819724e4..a083f2f4851e1 100644 --- a/ggml/src/ggml-cuda/cross-entropy-loss.cu +++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu @@ -8,6 +8,9 @@ template static __global__ void cross_entropy_loss_f32( const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses, 
const int k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif extern __shared__ float tmp[]; logits += int64_t(blockIdx.x)*nclasses; @@ -47,12 +50,18 @@ static __global__ void cross_entropy_loss_f32( } dst[blockIdx.x] = loss; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template static __global__ void cross_entropy_loss_back_f32( const float * __restrict__ grad, const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif extern __shared__ float tmp[]; logits += int64_t(blockIdx.x)*nclasses; @@ -89,6 +98,9 @@ static __global__ void cross_entropy_loss_back_f32( const float val = use_shared ? tmp[i] : dst[i]; dst[i] = (val*sm_scale - labels[i])*d_by_nrows; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/diagmask.cu b/ggml/src/ggml-cuda/diagmask.cu index 4b713ba22eb53..aaf7a9cb0aa96 100644 --- a/ggml/src/ggml-cuda/diagmask.cu +++ b/ggml/src/ggml-cuda/diagmask.cu @@ -1,6 +1,9 @@ #include "diagmask.cuh" static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int col = blockDim.y*blockIdx.y + threadIdx.y; const int row = blockDim.x*blockIdx.x + threadIdx.x; @@ -12,6 +15,9 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int //dst[i] = col > (n_past + row % rows_per_channel) ? 
-INFINITY : x[i]; //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index b6db446c6feaf..6eb2f7ed8cde7 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -505,6 +505,9 @@ template __launch_bounds__(FATTN_KQ_STRIDE/2, 1) static __global__ void flash_attn_mask_to_KV_max( const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int s31, const int s33) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int ne31 = gridDim.x; const int tid = threadIdx.x; const int sequence = blockIdx.y; @@ -544,16 +547,25 @@ static __global__ void flash_attn_mask_to_KV_max( } if (threadIdx.x != 0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } KV_max[sequence*ne31 + jt] = KV_max_sj; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template // D == head size __launch_bounds__(D, 1) static __global__ void flash_attn_stream_k_fixup( float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif constexpr int ncols = ncols1*ncols2; const int bidx0 = blockIdx.x; @@ -574,6 +586,9 @@ static __global__ void flash_attn_stream_k_fixup( const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; const bool did_not_write_last = kbc0/iter_k == kbc0_stop/iter_k && kbc0_stop % iter_k != 0; if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) { + #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -582,6 +597,9 @@ static __global__ void flash_attn_stream_k_fixup( const int jt = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile. 
if (jt*ncols1 + j >= ne01) { + #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -639,6 +657,9 @@ static __global__ void flash_attn_stream_k_fixup( // Write back final result: *dst = dst_val / rowsum; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template // D == head size @@ -672,6 +693,9 @@ static __global__ void flash_attn_combine_results( const int tid = threadIdx.x; __builtin_assume(tid < D); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif extern __shared__ float2 meta[]; for (int i = tid; i < 2*parallel_blocks; i += D) { ((float *) meta)[i] = ((const float *)VKQ_meta) [i]; @@ -697,6 +721,9 @@ static __global__ void flash_attn_combine_results( } dst[tid] = VKQ_numerator / VKQ_denominator; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } [[noreturn]] diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index a86b95428f6ff..8c8175f677094 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1222,16 +1222,25 @@ static __global__ void flash_attn_ext_f16( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif #if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) { NO_DEVICE_CODE; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } #if __CUDA_ARCH__ == GGML_CUDA_CC_TURING if (ncols1*ncols2 > 32) { NO_DEVICE_CODE; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } #endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING @@ -1310,6 +1319,9 @@ static __global__ void flash_attn_ext_f16( } if (kbc >= kbc_stop) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -1353,6 +1365,9 @@ static __global__ void flash_attn_ext_f16( GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 9d0b24ae7ec73..9aa1ec9aa95fa 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -29,6 +29,9 @@ static __global__ void flash_attn_tile_ext_f16( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif #if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) // Skip unused kernel variants for faster compilation: @@ -285,6 +288,9 @@ static __global__ void flash_attn_tile_ext_f16( 
GGML_UNUSED(nb23); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index be72f76fb6538..4554e737e6327 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -29,6 +29,9 @@ static __global__ void flash_attn_tile_ext_f32( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif #ifdef FLASH_ATTN_AVAILABLE // Skip unused kernel variants for faster compilation: @@ -296,6 +299,9 @@ static __global__ void flash_attn_tile_ext_f32( GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); NO_DEVICE_CODE; #endif // FLASH_ATTN_AVAILABLE +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index a2df2f66be0c4..c619eb2592d42 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -32,16 +32,25 @@ static __global__ void flash_attn_vec_ext_f16( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif #if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(D == 128 || D == 256)) { NO_DEVICE_CODE; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) if (ncols > 1) { NO_DEVICE_CODE; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) @@ -326,6 +335,9 @@ static __global__ void flash_attn_vec_ext_f16( GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE) +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } #ifdef __clang__ #pragma clang diagnostic pop diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh index 9ab0fc133b7a2..4311e7daeeb1f 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -112,6 +112,9 @@ static __global__ void flash_attn_vec_ext_f32( __syncthreads(); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers: float2 Q_f2[ncols][D/(2*WARP_SIZE)]; int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 
1 : D >= D/(sizeof(int)*QK8_1)]; @@ -308,6 +311,9 @@ static __global__ void flash_attn_vec_ext_f32( if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]); } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index 40554204b62f3..53faf810e4642 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -45,10 +45,16 @@ static __global__ void flash_attn_ext_f16( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif #if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE))) // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(D == 128 || D == 256)) { NO_DEVICE_CODE; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -435,6 +441,9 @@ static __global__ void flash_attn_ext_f16( GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23); NO_DEVICE_CODE; #endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE))) +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } constexpr int get_max_power_of_2(int x) { diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index f77b2629a19b0..a3fc03fe1fc25 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -9,6 +9,9 @@ static __global__ void k_get_rows( /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. 
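Note that the flash_attn_vec_ext_f32 hunk above places cudaGridDependencySynchronize() after the kernel's shared-memory prologue and __syncthreads() rather than at the first instruction. That placement is where PDL pays off: a dependent grid may start early, run any work that does not consume the upstream kernel's output, and only block when it reaches the synchronize call. A hedged sketch of that idea, with illustrative names and the assumption blockDim.x == 256 and ne0 <= 256:

// Illustrative only: prologue work overlaps with the tail of the upstream kernel;
// only the read of prev_out (produced by the upstream grid) has to wait.
__global__ void example_bias_add_f32(const float * __restrict__ prev_out, const float * __restrict__ bias,
                                     float * __restrict__ dst, const int ne0, const int64_t ne) {
    __shared__ float bias_s[256]; // assumes blockDim.x == 256 and ne0 <= 256

    // Prologue: stage data that does NOT depend on the upstream kernel.
    if (threadIdx.x < ne0) {
        bias_s[threadIdx.x] = bias[threadIdx.x];
    }
    __syncthreads();

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
    cudaGridDependencySynchronize(); // block only where upstream results are first consumed
#endif
    const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;
    if (i < ne) {
        dst[i] = prev_out[i] + bias_s[i % ne0];
    }
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
    cudaTriggerProgrammaticLaunchCompletion();
#endif
}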
const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2; @@ -17,6 +20,9 @@ static __global__ void k_get_rows( const int i12 = blockIdx.z % ne12; if (i00 >= ne00) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -36,6 +42,9 @@ static __global__ void k_get_rows( dst_row[iybs + iqs + 0] = float(v.x); dst_row[iybs + iqs + y_offset] = float(v.y); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -46,6 +55,9 @@ static __global__ void k_get_rows_float( /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. const int i00 = blockIdx.y * blockDim.x + threadIdx.x; @@ -54,6 +66,9 @@ static __global__ void k_get_rows_float( const int i12 = blockIdx.z % ne12; if (i00 >= ne00) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -63,14 +78,23 @@ static __global__ void k_get_rows_float( const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); dst_row[i00] = float(src0_row[i00]); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template static __global__ void k_get_rows_back_float( const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int col = blockIdx.x*blockDim.x + threadIdx.x; if (col >= ncols) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -86,6 +110,9 @@ static __global__ void k_get_rows_back_float( } dst[dst_row*ncols + col] = sum; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8885fb7fbdd2f..08fab5a2583b6 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1761,10 +1761,16 @@ static __global__ void k_compute_batched_ptrs( size_t nb12, size_t nb13, size_t nbd2, size_t nbd3, int64_t r2, int64_t r3) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; if (i13 >= ne13 || i12 >= ne12) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -1774,6 +1780,9 @@ static __global__ void k_compute_batched_ptrs( ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13; ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER 
+        cudaTriggerProgrammaticLaunchCompletion();
+#endif
 }

 // Type traits for mapping ggml types to CUDA/cuBLAS types
@@ -2063,6 +2072,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
             && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
+        ctx.cuda_graph->allow_pdl = false; // disable PDL if any library kernels will be in CUDA graph
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
@@ -2071,6 +2081,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (use_mul_mat_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
     } else {
+        ctx.cuda_graph->allow_pdl = false; // disable PDL if any library kernels will be in CUDA graph
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
 }
@@ -2901,6 +2912,69 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         }
         CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+
+        // Set programmatic dependent launch (PDL) properties for all edges.
+        // This will only have an effect on Hopper and later GPUs, but is harmless on older GPUs.
+#if CUDA_VERSION >= 12000
+        // Only allow PDL if it hasn't been disabled due to the presence of library kernels in the
+        // CUDA graph, since we can't add the corresponding CUDA API sync calls to those kernels.
+        // TODO: identify the graph nodes that contain such library kernels and skip setting PDL
+        //       launch properties on just those nodes (non-trivial).
+        if (cuda_ctx->cuda_graph->allow_pdl) {
+
+            size_t num_nodes = 0;
+            // First call with null arg gives number of nodes
+            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &num_nodes));
+
+            if (num_nodes > cuda_ctx->cuda_graph->graph_nodes.size()) {
+                cuda_ctx->cuda_graph->graph_nodes.resize(num_nodes);
+            }
+            if (num_nodes > 0) {
+                // This call gives actual nodes
+                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->graph_nodes.data(), &num_nodes));
+            }
+
+            size_t max_dependencies = 0;
+            for (size_t i = 0; i < num_nodes; i++) {
+                size_t num_dependencies = 0;
+                // First call with null arg gives number of dependencies
+                CUDA_CHECK(cudaGraphNodeGetDependencies(cuda_ctx->cuda_graph->graph_nodes[i], nullptr, &num_dependencies));
+                if (num_dependencies > max_dependencies)
+                    max_dependencies = num_dependencies;
+            }
+            if (max_dependencies > cuda_ctx->cuda_graph->graph_dependencies.size()) {
+                cuda_ctx->cuda_graph->graph_dependencies.resize(max_dependencies);
+            }
+
+            if (num_nodes > 0) {
+                cudaGraphNodeType prev_node_type = cudaGraphNodeTypeKernel;
+                for (size_t i = 0; i < num_nodes; i++) {
+                    cudaGraphNodeType node_type;
+                    CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->graph_nodes[i], &node_type));
+                    if (node_type == cudaGraphNodeTypeKernel && prev_node_type == cudaGraphNodeTypeKernel) {
+                        size_t num_dependencies = 0;
+                        // First call with null arg gives number of dependencies
+                        CUDA_CHECK(cudaGraphNodeGetDependencies(cuda_ctx->cuda_graph->graph_nodes[i], nullptr, &num_dependencies));
+                        if (num_dependencies > 0) {
+                            // This call gives actual dependencies
+                            CUDA_CHECK(cudaGraphNodeGetDependencies(cuda_ctx->cuda_graph->graph_nodes[i], cuda_ctx->cuda_graph->graph_dependencies.data(), &num_dependencies));
+                            for (size_t j = 0; j < num_dependencies; j++) {
+                                cudaGraphEdgeData edge_data = {};
+                                edge_data.type      = cudaGraphDependencyTypeProgrammatic;
+                                edge_data.from_port = cudaGraphKernelNodePortProgrammatic;
+                                edge_data.to_port   = 0;
+                                // Remove existing dependency and add it back with PDL edge properties
+                                CUDA_CHECK(cudaGraphRemoveDependencies(cuda_ctx->cuda_graph->graph, &cuda_ctx->cuda_graph->graph_dependencies[j], &cuda_ctx->cuda_graph->graph_nodes[i], 1));
+                                CUDA_CHECK(cudaGraphAddDependencies_v2(cuda_ctx->cuda_graph->graph, &cuda_ctx->cuda_graph->graph_dependencies[j], &cuda_ctx->cuda_graph->graph_nodes[i], &edge_data, 1));
+                            }
+                        }
+                    }
+                    prev_node_type = node_type;
+                }
+            }
+        }
+#endif
+
         graph_evaluated_or_captured = true; // CUDA graph has been captured

         std::lock_guard<std::mutex> lock(ggml_cuda_lock);
diff --git a/ggml/src/ggml-cuda/gla.cu b/ggml/src/ggml-cuda/gla.cu
index f7d615a8282fc..f97b37d91b261 100644
--- a/ggml/src/ggml-cuda/gla.cu
+++ b/ggml/src/ggml-cuda/gla.cu
@@ -4,6 +4,9 @@ template
 static __global__ void gated_linear_attn_f32(const int B, const int T, const int C, const int H, const float scale, const float * k, const float * v, const float * r, const float * td, const float * s, float * dst) {
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+    cudaGridDependencySynchronize();
+#endif
     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

@@ -59,6 +62,9 @@ static __global__ void gated_linear_attn_f32(const int B, const int T, const int
     for (int i = 0; i < head_size; i++) {
         dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
     }
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+    cudaTriggerProgrammaticLaunchCompletion();
+#endif
 }

 void
ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu index 73b98133438fc..60b8fb936f3e4 100644 --- a/ggml/src/ggml-cuda/im2col.cu +++ b/ggml/src/ggml-cuda/im2col.cu @@ -10,6 +10,9 @@ static __global__ void im2col_kernel( int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t IC_IH_IW, int64_t IH_IW, int64_t N_OH, int64_t KH_KW, int64_t IC_KH_KW, int s0, int s1, int p0, int p1, int d0, int d1) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= IC_KH_KW) { return; @@ -38,6 +41,9 @@ static __global__ void im2col_kernel( dst[offset_dst] = x[offset_src + iih * IW + iiw]; } } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 04a8d80e1211b..db2c3c45a68fc 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -3060,10 +3060,16 @@ static __global__ void mul_mat_q( const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst, const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) { NO_DEVICE_CODE; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3116,6 +3122,9 @@ static __global__ void mul_mat_q( offset_dst = 0; if (jt*mmq_x >= col_diff) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3145,6 +3154,9 @@ static __global__ void mul_mat_q( mul_mat_q_process_tile (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, tile_x_max_i, tile_y_max_j, 0, ncols_x/qk); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } #endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA @@ -3232,6 +3244,9 @@ static __global__ void mul_mat_q( } if (kbc >= kbc_stop) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3260,6 +3275,9 @@ static __global__ void mul_mat_q( offset_dst = 0; if (jt*mmq_x >= col_diff) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3290,6 +3308,9 @@ static __global__ void mul_mat_q( mul_mat_q_process_tile (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } 
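None of this changes how ggml launches kernels: they still go through <<<...>>> into a stream that is being captured, and the programmatic dependency is added afterwards by rewriting the captured graph's edges (the evaluate_and_capture_cuda_graph hunk above). For an ordinary, non-captured stream the same overlap is requested per launch with the programmatic-stream-serialization attribute; a hedged sketch, not code from the PR, assuming Hopper+ and CUDA 11.8 or newer:

// Illustrative only: the per-launch (non-graph) way to enable PDL for a plain stream launch.
__global__ void example_scale_f32(const float * x, float * dst, float s, int64_t ne); // toy kernel sketched near the top of this section

static void example_launch_with_pdl(cudaStream_t stream, const float * x, float * dst, int64_t ne) {
    cudaLaunchConfig_t cfg = {};
    cfg.gridDim  = dim3((unsigned int) ((ne + 255)/256));
    cfg.blockDim = dim3(256);
    cfg.stream   = stream;

    cudaLaunchAttribute attr = {};
    attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1; // allow overlap with the preceding kernel in the stream
    cfg.attrs    = &attr;
    cfg.numAttrs = 1;

    // CUDA_CHECK is ggml's error-checking macro.
    CUDA_CHECK(cudaLaunchKernelEx(&cfg, example_scale_f32, x, dst, 2.0f, ne));
}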
@@ -3298,6 +3319,9 @@ static __global__ void mul_mat_q_stream_k_fixup( const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst, const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; @@ -3324,6 +3348,9 @@ static __global__ void mul_mat_q_stream_k_fixup( const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0; const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0; if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3366,6 +3393,9 @@ static __global__ void mul_mat_q_stream_k_fixup( } if (!any_fixup) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3390,6 +3420,9 @@ static __global__ void mul_mat_q_stream_k_fixup( const int j = j0 + threadIdx.y; if (j > j_max) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3404,6 +3437,9 @@ static __global__ void mul_mat_q_stream_k_fixup( dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size]; } } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3428,6 +3464,9 @@ static __global__ void mul_mat_q_stream_k_fixup( const int j = j0 + threadIdx.y; if (j > j_max) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -3442,6 +3481,9 @@ static __global__ void mul_mat_q_stream_k_fixup( dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size]; } } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } struct mmq_args { diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index e14c93516bddf..ba89e8a044695 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -8,6 +8,9 @@ static __global__ void mul_mat_vec( const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst, const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int row = blockIdx.x; const int channel_dst = blockIdx.y; const int channel_x = ids ? 
ids[channel_dst] : channel_dst / channel_ratio; @@ -118,11 +121,19 @@ static __global__ void mul_mat_vec( } } + if (tid >= ncols_dst) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } dst[tid*stride_col_dst + row] = sumf[tid]; + +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index dc7adf509fac0..93d9d52a439ab 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -142,6 +142,9 @@ static __global__ void mul_mat_vec_q( const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst, const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; @@ -172,6 +175,10 @@ static __global__ void mul_mat_vec_q( const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif + for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx @@ -200,6 +207,9 @@ static __global__ void mul_mat_vec_q( } __syncthreads(); if (threadIdx.y > 0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -221,6 +231,9 @@ static __global__ void mul_mat_vec_q( dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x]; } } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static std::pair calc_launch_params( diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index bddcca51b7bfc..fdc5943c01745 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -5,6 +5,9 @@ template static __global__ void norm_f32( const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int nrows = gridDim.x; const int nchannels = gridDim.y; @@ -46,10 +49,16 @@ static __global__ void norm_f32( for (int col = tid; col < ncols; col += block_size) { dst[col] = (x[col] - mean) * inv_std; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // blockIdx.x: num_groups idx // threadIdx.x: block_size idx const int start = blockIdx.x*group_size + threadIdx.x; @@ -110,6 +119,9 @@ static __global__ void rms_norm_f32( const int64_t stride_sample, const float 
eps, const float * mul = nullptr, const int64_t mul_stride_row = 0, const int64_t mul_stride_channel = 0, const int64_t mul_stride_sample = 0, const int mul_ncols = 0, const int mul_nrows = 0, const int mul_nchannels = 0, const int mul_nsamples = 0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int nrows = gridDim.x; const int nchannels = gridDim.y; @@ -130,6 +142,10 @@ static __global__ void rms_norm_f32( float tmp = 0.0f; // partial sum for thread in warp +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif + for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; tmp += xi * xi; @@ -161,11 +177,17 @@ static __global__ void rms_norm_f32( dst[col] = scale * x[col]; } } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template static __global__ void rms_norm_back_f32( const float * grad, const float * xf, float * dst, const int ncols, const float eps) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; @@ -253,6 +275,9 @@ template static __global__ void l2_norm_f32( const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int nrows = gridDim.x; const int nchannels = gridDim.y; diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu index 35154f2996652..58393a8b1e6c1 100644 --- a/ggml/src/ggml-cuda/opt-step-adamw.cu +++ b/ggml/src/ggml-cuda/opt-step-adamw.cu @@ -6,10 +6,16 @@ static __global__ void opt_step_adamw_f32( float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v, const float * __restrict__ pars, const int64_t k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x; if (i >= k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -32,6 +38,9 @@ static __global__ void opt_step_adamw_f32( const float vh = sqrtf(gvi*beta2h) + eps; x[i] = x[i]*(1.0f - alpha*wd) - alpha*mh/vh; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void opt_step_adamw_f32_cuda( diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index 77432b04689be..231a1bd23a5f5 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -1,6 +1,9 @@ #include "pad.cuh" static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 // blockIdx.y: idx of ne1 // blockIDx.x: idx of ne0 / BLOCK_SIZE @@ -23,6 +26,9 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons } else { dst[offset_dst] = 0.0f; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + 
cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void pad_f32_cuda(const float * x, float * dst, diff --git a/ggml/src/ggml-cuda/pool2d.cu b/ggml/src/ggml-cuda/pool2d.cu index c6d51e4d655a3..905073f13a7ac 100644 --- a/ggml/src/ggml-cuda/pool2d.cu +++ b/ggml/src/ggml-cuda/pool2d.cu @@ -6,6 +6,9 @@ static __global__ void pool2d_nchw_kernel( const int kh, const int kw, const int sh, const int sw, const int ph, const int pw, const int parallel_elements, const Ti* src, To* dst, const enum ggml_op_pool op) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= parallel_elements) { return; @@ -48,6 +51,9 @@ static __global__ void pool2d_nchw_kernel( } } o_ptr[cur_oh * ow + cur_ow] = res; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void pool2d_nchw_kernel_f32_f32_cuda( diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index a0b03a740d74c..cf955c471fe74 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -5,6 +5,9 @@ static __global__ void quantize_q8_1( const float * __restrict__ x, void * __restrict__ vy, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, const int64_t ne0, const int ne1, const int ne2) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= ne0) { @@ -45,6 +48,9 @@ static __global__ void quantize_q8_1( reinterpret_cast(y[ib].ds.x) = d; reinterpret_cast(y[ib].ds.y) = sum; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -52,6 +58,9 @@ static __global__ void quantize_mmq_q8_1( const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, const int64_t ne0, const int ne1, const int ne2) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32; constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 
16 : 32; @@ -59,6 +68,9 @@ static __global__ void quantize_mmq_q8_1( const int64_t i0 = ((int64_t)blockDim.x*blockIdx.y + threadIdx.x)*4; if (i0 >= ne0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -116,12 +128,18 @@ static __global__ void quantize_mmq_q8_1( if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6) { if (iqs % 16 != 0 || iqs >= 96) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } y[ib].d2s6[2 + iqs/16] = sum; if (iqs % 64 != 0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -129,10 +147,16 @@ static __global__ void quantize_mmq_q8_1( y[ib].d2s6[iqs/64] = d; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } if (iqs % 32 != 0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -143,6 +167,9 @@ static __global__ void quantize_mmq_q8_1( } else { y[ib].d4[iqs/32] = d; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } void quantize_row_q8_1_cuda( diff --git a/ggml/src/ggml-cuda/roll.cu b/ggml/src/ggml-cuda/roll.cu index a339dfc1ae0ba..4046c48abbf3b 100644 --- a/ggml/src/ggml-cuda/roll.cu +++ b/ggml/src/ggml-cuda/roll.cu @@ -21,10 +21,16 @@ static __global__ void roll_f32_cuda(const float * __restrict__ src, const int s1, const int s2, const int s3) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t idx = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; const int64_t n_elements = ne00 * ne01 * ne02 * ne03; if (idx >= n_elements) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -40,6 +46,9 @@ static __global__ void roll_f32_cuda(const float * __restrict__ src, dst[i3 * (ne00 * ne01 * ne02) + i2 * (ne01 * ne00) + i1 * ne00 + i0] = src[d3 * (ne00 * ne01 * ne02) + d2 * (ne01 * ne00) + d1 * ne00 + d0]; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } void ggml_cuda_op_roll(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index d058504cd6cc0..1fbf119fa7a0c 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -42,9 +42,16 @@ static __global__ void rope_norm( const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif + const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -60,6 +67,9 @@ static __global__ void rope_norm( dst[idst + 0] = x[ix + 0]; dst[idst + 1] = x[ix + 1]; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ 
-77,6 +87,9 @@ static __global__ void rope_norm( dst[idst + 0] = x0*cos_theta - x1*sin_theta; dst[idst + 1] = x0*sin_theta + x1*cos_theta; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -84,9 +97,15 @@ static __global__ void rope_neox( const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -102,6 +121,9 @@ static __global__ void rope_neox( dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; dst[idst + i0/2 + 1] = x[ix + i0/2 + 1]; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -119,6 +141,9 @@ static __global__ void rope_neox( dst[idst + 0] = x0*cos_theta - x1*sin_theta; dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -126,6 +151,9 @@ static __global__ void rope_multi( const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { @@ -184,6 +212,9 @@ static __global__ void rope_vision( const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu index 2ee9e588992f4..0279262b53ce8 100644 --- a/ggml/src/ggml-cuda/scale.cu +++ b/ggml/src/ggml-cuda/scale.cu @@ -1,13 +1,21 @@ #include "scale.cuh" + static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { return; } + dst[i] = scale * x[i] + bias; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) { diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 07983436459d4..7bb7496204018 100644 --- 
a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -17,11 +17,17 @@ static __global__ void k_set_rows_quant( const int64_t s01, const int64_t s02, const int64_t s03, const int64_t s10, const int64_t s11, const int64_t s12, const int64_t s1, const int64_t s2, const int64_t s3) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk; if (i >= ne_total) { + #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -47,6 +53,9 @@ static __global__ void k_set_rows_quant( GGML_UNUSED(ne10); GGML_UNUSED(ne13); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } // Template dispatch function for quantized set_rows @@ -95,11 +104,17 @@ static __global__ void k_set_rows( const int64_t s01, const int64_t s02, const int64_t s03, const int64_t s10, const int64_t s11, const int64_t s12, const int64_t s1, const int64_t s2, const int64_t s3) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; const int64_t ne_total = ne00 * ne01 * ne02 * ne03; if (i >= ne_total) { + #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } @@ -123,6 +138,9 @@ static __global__ void k_set_rows( GGML_UNUSED(ne10); GGML_UNUSED(ne13); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/softcap.cu b/ggml/src/ggml-cuda/softcap.cu index 40dfe45d65cf6..0ac788947ce7e 100644 --- a/ggml/src/ggml-cuda/softcap.cu +++ b/ggml/src/ggml-cuda/softcap.cu @@ -1,13 +1,22 @@ #include "softcap.cuh" static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } dst[i] = tanhf(scale * x[i]) * softcap; +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) { diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu index 14543e978cf0f..0367f9c1429be 100644 --- a/ggml/src/ggml-cuda/softmax.cu +++ b/ggml/src/ggml-cuda/softmax.cu @@ -45,7 +45,11 @@ struct soft_max_params { #endif // __clang__ template static __global__ void soft_max_f32( + const float * x, const T * mask, float * dst, const soft_max_params p) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int ncols = ncols_template == 0 ? 
p.ncols : ncols_template; const int tid = threadIdx.x; @@ -155,6 +159,9 @@ static __global__ void soft_max_f32( dst[col] = vals[col] * inv_sum; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } #ifdef __clang__ #pragma clang diagnostic pop @@ -162,6 +169,9 @@ static __global__ void soft_max_f32( static __global__ void soft_max_back_f32( const float * grad, const float * dstf, float * dst, const int ncols, const float scale) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int tid = threadIdx.x; const int rowx = blockIdx.x; @@ -180,6 +190,9 @@ static __global__ void soft_max_back_f32( for (int col = tid; col < ncols; col += WARP_SIZE) { dst[col] = scale * (grad[col] - dgf_dot) * dstf[col]; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 41979733601d2..234ae912d2c4b 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -5,6 +5,9 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, const int64_t n_t) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif GGML_UNUSED(src0_nb0); const int tid = threadIdx.x; const int bidx = blockIdx.x; @@ -43,6 +46,9 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float } y_block[i * stride_y + tid] = sumf; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -50,6 +56,9 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, const int64_t n_t) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int tid = threadIdx.x; const int bidx = blockIdx.x; const int bidy = blockIdx.y; @@ -93,6 +102,9 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, y_block[i * stride_y + tid] = sumf; } } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1, diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index c9184398b422c..75c6a1fd11eea 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -9,6 +9,9 @@ __global__ void __launch_bounds__(splitD, 2) const int src2_nb1, const int src2_nb2, const int src3_nb1, const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3, const int64_t s_off, const int64_t d_inner, const int64_t L) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif constexpr int warp_size = ggml_cuda_get_physical_warp_size(); const int bidx = blockIdx.x; // split along B (sequences) @@ -81,6 +84,9 @@ __global__ void __launch_bounds__(splitD, 2) __syncthreads(); y_block[i * stride_y 
+ tid] = sumf; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } // assumes as many threads as d_state @@ -94,6 +100,9 @@ __global__ void __launch_bounds__(d_state, 1) const int src2_nb1, const int src2_nb2, const int src3_nb1, const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3, const int64_t s_off, const int64_t n_head, const int64_t d_head, const int64_t n_group, const int64_t n_tok) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int head_idx = (blockIdx.x * splitH) / d_head; const int head_off = ((blockIdx.x * splitH) % d_head) * sizeof(float); diff --git a/ggml/src/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu index 153ddbcda92dc..d4d460d8bf3e1 100644 --- a/ggml/src/ggml-cuda/tsembd.cu +++ b/ggml/src/ggml-cuda/tsembd.cu @@ -1,6 +1,9 @@ #include "tsembd.cuh" static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // blockIDx.y: idx of timesteps->ne[0] // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE int i = blockIdx.y; @@ -21,6 +24,9 @@ static __global__ void timestep_embedding_f32(const float * timesteps, float * d float arg = timestep * freq; embed_data[j] = cosf(arg); embed_data[j + half] = sinf(arg); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1, diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 91c830c4dacc3..29799890e9e8d 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -89,13 +89,22 @@ static __device__ __forceinline__ float op_elu(float x) { template static __global__ void unary_op_kernel(const T * x, T * dst, const int k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } dst[i] = (T)op((float)x[i]); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -207,6 +216,9 @@ void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { template static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x; if (i >= k) { @@ -309,13 +321,22 @@ static __device__ __forceinline__ float op_silu_back(float grad, float x) { template static __global__ void silu_back_kernel(const T * grad, const T * xf, T * dst, const int k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } dst[i] = (T)op_silu_back((float)grad[i], (float)xf[i]); +#if 
!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template @@ -355,13 +376,22 @@ static __device__ __forceinline__ float op_leaky_relu(float x, const float negat template static __global__ void leaky_relu_kernel(const T * x, T * dst, const int k, const float negative_slope) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif return; } dst[i] = (T)op_leaky_relu((float)x[i], negative_slope); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index ef48aa5f97bcd..84d385ad31658 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -4,6 +4,9 @@ static __global__ void upscale_f32(const float * x, float * dst, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int ne13, const float sf0, const float sf1, const float sf2, const float sf3) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif int index = threadIdx.x + blockIdx.x * blockDim.x; if (index >= ne10 * ne11 * ne12 * ne13) { return; @@ -20,6 +23,9 @@ static __global__ void upscale_f32(const float * x, float * dst, int i03 = i13 / sf3; dst[index] = *( (const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00) ); +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } static __global__ void upscale_f32_bilinear(const float * x, float * dst, @@ -28,6 +34,9 @@ static __global__ void upscale_f32_bilinear(const float * x, float * dst, const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst, const float sf0, const float sf1, const float sf2, const float sf3, const float pixel_offset) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int64_t index = threadIdx.x + blockIdx.x * blockDim.x; const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; diff --git a/ggml/src/ggml-cuda/wkv.cu b/ggml/src/ggml-cuda/wkv.cu index d2fced705e095..3b67534d46f7e 100644 --- a/ggml/src/ggml-cuda/wkv.cu +++ b/ggml/src/ggml-cuda/wkv.cu @@ -3,6 +3,9 @@ template static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -63,10 +66,16 @@ static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const for (int i = 0; i < head_size; i++) { dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i]; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } template static __global__ void rwkv_wkv7_f32(const int B, const int T, const int C, const int H, const float * r, const float * w, const float * k, const float * v, const 
float * a, const float * b, const float * s, float * dst) { +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif const int tid = threadIdx.x; const int bid = blockIdx.x; @@ -139,6 +148,9 @@ static __global__ void rwkv_wkv7_f32(const int B, const int T, const int C, cons for (int i = 0; i < head_size; i++) { dst[T * C + batch_i * state_size + head_i * head_size * head_size + tid * head_size + i] = state[i]; } +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif } void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
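Note on the pattern applied throughout the hunks above: every kernel gains the same guarded pair of calls. cudaGridDependencySynchronize() at kernel entry makes a programmatically launched grid wait until the grid(s) it depends on have finished and their writes are visible, and cudaTriggerProgrammaticLaunchCompletion() on each exit path lets the next dependent grid start as soon as every block has either triggered or exited; kernels that only gain the entry sync rely on the implicit trigger at block exit. Both intrinsics exist only on NVIDIA devices of compute capability >= 9.0 (Hopper) and are unavailable under HIP, hence the preprocessor guard. The sketch below is a hypothetical consolidation of that boilerplate into two helper macros, not something this patch does (the names GGML_CUDA_PDL_SYNC, GGML_CUDA_PDL_TRIGGER and example_scale_f32 are illustrative only; the patch open-codes the guards in each kernel):

// Hypothetical helpers (NOT part of this patch) that would centralize the guarded
// PDL calls repeated verbatim in the kernels above, e.g. in common.cuh.
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
// Wait until the grids this kernel depends on have completed and their results are visible.
#define GGML_CUDA_PDL_SYNC()    cudaGridDependencySynchronize()
// Signal that this block is done producing output a dependent grid may consume,
// allowing a programmatically launched follow-up kernel to start earlier.
#define GGML_CUDA_PDL_TRIGGER() cudaTriggerProgrammaticLaunchCompletion()
#else
#define GGML_CUDA_PDL_SYNC()
#define GGML_CUDA_PDL_TRIGGER()
#endif

// Usage mirroring the diff's convention: sync on entry, trigger on every exit path,
// including the early return taken by out-of-range threads.
static __global__ void example_scale_f32(const float * x, float * dst, const float scale, const int k) {
    GGML_CUDA_PDL_SYNC();
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        GGML_CUDA_PDL_TRIGGER();
        return;
    }
    dst[i] = scale * x[i];
    GGML_CUDA_PDL_TRIGGER();
}

Placing the trigger on every early return, as the diff does, means each block signals completion as soon as it has written its last element rather than waiting for the slowest branch.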
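The device-side intrinsics only take effect when the consuming kernel is launched with programmatic dependent launch enabled; how ggml-cuda opts in on the host side (a per-launch attribute or a programmatic edge in its CUDA graphs) is not shown in this part of the diff. For reference, a minimal host-side sketch using the stock CUDA 12 runtime API; launch_with_pdl is an illustrative helper name, not an existing function in the backend:

#include <cuda_runtime.h>

// Launch a kernel as a programmatic dependent launch: the launch may begin while the
// preceding kernel in the stream is still running, and cudaGridDependencySynchronize()
// inside the kernel is what enforces the actual data dependency on its inputs.
// Note: the argument types passed here must match the kernel signature exactly,
// since they are deduced for cudaLaunchKernelEx.
template <typename... Args>
static cudaError_t launch_with_pdl(void (*kernel)(Args...), dim3 grid, dim3 block,
                                   size_t shared_mem, cudaStream_t stream, Args... args) {
    cudaLaunchConfig_t cfg = {};
    cfg.gridDim          = grid;
    cfg.blockDim         = block;
    cfg.dynamicSmemBytes = shared_mem;
    cfg.stream           = stream;

    cudaLaunchAttribute attr = {};
    attr.id                                         = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1;

    cfg.attrs    = &attr;
    cfg.numAttrs = 1;

    return cudaLaunchKernelEx(&cfg, kernel, args...);
}

When launches are captured into CUDA graphs, which this backend also uses, the same relationship would instead be expressed with a programmatic dependency edge between kernel nodes (available since CUDA 12.3) rather than the per-launch attribute above.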