6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/acc.cu
@@ -3,6 +3,9 @@
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
A collaborator commented on the guard above:
It might be better to define a dedicated macro and use it wherever needed. For example:

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
#define XXX_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
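As a sketch of that suggestion (the macro name below is hypothetical; the reviewer leaves it as XXX_AVAILABLE), every kernel's entry and exit guard would collapse to a single feature test:

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
#    define GGML_CUDA_PDL_AVAILABLE // hypothetical name for the reviewer's XXX_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER

// inside each kernel:
#ifdef GGML_CUDA_PDL_AVAILABLE
    cudaGridDependencySynchronize();
#endif // GGML_CUDA_PDL_AVAILABLE
    // ... kernel body ...
#ifdef GGML_CUDA_PDL_AVAILABLE
    cudaTriggerProgrammaticLaunchCompletion();
#endif // GGML_CUDA_PDL_AVAILABLE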

cudaGridDependencySynchronize();
#endif
const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;

if (i >= ne) {
@@ -25,6 +28,9 @@ static __global__ void acc_f32(const float * x, const float * y, float * dst, co
val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
}
dst[i] = val;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
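The pair of calls added to acc_f32 above is the device-side half of Programmatic Dependent Launch (PDL), available from Hopper (compute capability 9.0), and the same entry/exit pattern recurs in every kernel in this diff: cudaGridDependencySynchronize() blocks until the grids this kernel depends on have made their writes visible, and cudaTriggerProgrammaticLaunchCompletion() signals that a dependent grid may begin launching before this one exits. The !defined(GGML_USE_HIP) half of the guard excludes HIP builds, which do not provide these intrinsics. A minimal sketch on an illustrative kernel that is not part of this PR (GGML_CUDA_CC_HOPPER is the constant this PR adds to common.cuh):

// Illustrative only: the device-side PDL pattern, guarded the same way as in the PR.
__global__ void scale_f32(const float * x, float * dst, const float s, const int n) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
    cudaGridDependencySynchronize(); // wait until the upstream grid's writes to x are visible
#endif
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i < n) {
        dst[i] = s*x[i];
    }
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
    cudaTriggerProgrammaticLaunchCompletion(); // this block is done writing; a dependent grid may start
#endif
}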
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/arange.cu
@@ -1,12 +1,18 @@
#include "arange.cuh"

static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
// blockIdx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
dst[nidx] = start + step * nidx;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/argmax.cu
@@ -6,6 +6,9 @@
#include "sum.cuh"

static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int64_t row = blockIdx.x;

float maxval = -FLT_MAX;
@@ -64,6 +67,9 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest
if (warp_id == 0 && lane_id == 0) {
dst[row] = argmax;
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/argsort.cu
@@ -9,6 +9,9 @@ static inline __device__ void ggml_cuda_swap(T & a, T & b) {

template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
// bitonic sort
int col = threadIdx.x;
int row = blockIdx.y;
@@ -55,6 +58,9 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
if (col < ncols) {
dst[row * ncols + col] = dst_row[col];
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static int next_power_of_2(int x) {
18 changes: 18 additions & 0 deletions ggml/src/ggml-cuda/binbcast.cu
@@ -29,6 +29,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
@@ -54,6 +57,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
@@ -63,6 +69,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif

const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -89,13 +98,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s

const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template <typename T>
static __global__ void k_repeat_back(
const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const size_t s00, const size_t s01, const size_t s02, const size_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif

const int64_t tid0 = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
const int64_t tid1 = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
@@ -118,6 +133,9 @@ static __global__ void k_repeat_back(
}
}
dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template<float (*bin_op)(const float, const float)>
6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/clamp.cu
@@ -6,13 +6,19 @@ static __device__ __forceinline__ float op_clamp(float x, float min, float max)

template <class T>
static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int k) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= k) {
return;
}

dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max);
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template <class T>
13 changes: 13 additions & 0 deletions ggml/src/ggml-cuda/common.cuh
@@ -47,6 +47,7 @@
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_ADA_LOVELACE 890
#define GGML_CUDA_CC_HOPPER 900
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
@@ -414,6 +415,9 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
template<bool norm>
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int row = blockIdx.x;
const int col = threadIdx.x;

@@ -425,10 +429,16 @@ static __global__ void reduce_rows_f32(const float * x, float * dst, const int n
sum = warp_reduce_sum(sum);

if (col != 0) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
return;
}

dst[row] = norm ? sum / ncols : sum;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template<int width = WARP_SIZE>
@@ -832,6 +842,9 @@ struct ggml_cuda_graph {
// Index to allow each cpy kernel to be aware of its position within the graph
// relative to other cpy nodes.
int graph_cpynode_index = -1;
std::vector<cudaGraphNode_t> graph_nodes;
std::vector<cudaGraphNode_t> graph_dependencies;
bool allow_pdl = true; // whether Programmatic Dependent Launch can be used within CUDA graph
#endif
};

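The common.cuh hunks supply the pieces the kernels rely on: the GGML_CUDA_CC_HOPPER constant used in every architecture guard, and new per-graph bookkeeping (graph_nodes, graph_dependencies, allow_pdl), presumably so PDL can be wired into, or disabled for, captured CUDA graphs. For orientation only, this is how a PDL launch is enabled for an ordinary stream launch with the stock CUDA runtime API (CUDA 11.8+); it is not code from this PR, and how the PR applies the equivalent attribute inside graphs is not shown in this excerpt. The kernel is the illustrative scale_f32 from the sketch above; stream, num_blocks, block_size, d_x, d_dst, and n are placeholders.

// Not code from this PR: minimal host-side PDL launch via the extended launch API.
cudaLaunchConfig_t cfg = {};
cfg.gridDim  = dim3(num_blocks); // placeholder grid size
cfg.blockDim = dim3(block_size); // placeholder block size
cfg.stream   = stream;

cudaLaunchAttribute attr[1];
attr[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attr[0].val.programmaticStreamSerializationAllowed = 1;
cfg.attrs    = attr;
cfg.numAttrs = 1;

// This launch may overlap the tail of the previous grid on the same stream;
// the device-side sync/trigger calls are what make that overlap correct.
cudaLaunchKernelEx(&cfg, scale_f32, d_x, d_dst, 2.0f, n);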
24 changes: 24 additions & 0 deletions ggml/src/ggml-cuda/concat.cu
@@ -2,6 +2,9 @@

// contiguous kernels
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
@@ -25,9 +28,15 @@ static __global__ void concat_f32_dim0(const float * x, const float * y, float *
blockIdx.z * (ne0 - ne00) * gridDim.y;
dst[offset_dst] = y[offset_src];
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
@@ -51,9 +60,15 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
blockIdx.z * ne0 * (gridDim.y - ne01);
dst[offset_dst] = y[offset_src];
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
@@ -77,6 +92,9 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
(blockIdx.z - ne02) * ne0 * gridDim.y;
dst[offset_dst] = y[offset_src];
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
@@ -124,6 +142,9 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

const int64_t i3 = blockIdx.z;
@@ -151,6 +172,9 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)

*y = *x;
}
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}


6 changes: 6 additions & 0 deletions ggml/src/ggml-cuda/conv-transpose-1d.cu
@@ -6,6 +6,9 @@ static __global__ void conv_transpose_1d_kernel(
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
if (global_index >= output_size) {
return;
@@ -38,6 +41,9 @@ static __global__ void conv_transpose_1d_kernel(
GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

static void conv_transpose_1d_f32_f32_cuda(
9 changes: 9 additions & 0 deletions ggml/src/ggml-cuda/conv2d-dw.cu
@@ -84,10 +84,16 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr
const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
const int channels, const int batches) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int total_elements = batches * channels * out_h * out_w;

if (global_idx >= total_elements) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
return;
}

@@ -114,6 +120,9 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr
}

output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
9 changes: 9 additions & 0 deletions ggml/src/ggml-cuda/conv2d-transpose.cu
@@ -7,11 +7,17 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const
float * __restrict__ output, const int in_w, const int in_h, const int out_w,
const int out_h, const int kernel_w, const int kernel_h, const int stride,
const int c_in, const int c_out, const int batches) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;

const int total_elements = out_w * out_h * c_out * batches;

if (global_idx >= total_elements) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
return;
}

@@ -49,6 +55,9 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const
}

output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) *out_y_idx + out_x_idx] = accumulator;
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

//input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in)
15 changes: 15 additions & 0 deletions ggml/src/ggml-cuda/convert.cu
@@ -9,9 +9,15 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t s01, const int64_t s02, const int64_t s03) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);

if (i00 >= ne00) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
return;
}

@@ -33,10 +39,16 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
y[iy0 + 0] = float(v.x);
y[iy0 + y_offset] = float(v.y);
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaTriggerProgrammaticLaunchCompletion();
#endif
}

template <bool need_check>
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

@@ -592,6 +604,9 @@ template <typename src_t, typename dst_t>
static __global__ void convert_unary(
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t s01, const int64_t s02, const int64_t s03) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
cudaGridDependencySynchronize();
#endif
const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

if (i00 >= ne00) {