@@ -29,6 +29,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
2929 /* int s0, */ int s1, int s2, int s3,
3030 /* int s00,*/ int s01, int s02, int s03,
3131 /* int s10,*/ int s11, int s12, int s13) {
32+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
33+ cudaGridDependencySynchronize ();
34+ #endif
3235 const int i0s = blockDim .x *blockIdx .x + threadIdx .x ;
3336 const int i1 = (blockDim .y *blockIdx .y + threadIdx .y );
3437 const int i2 = (blockDim .z *blockIdx .z + threadIdx .z ) / ne3;
@@ -54,6 +57,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
5457 const int i10 = i0 % ne10;
5558 dst_row[i0] = (dst_t )bin_op (src0 ? (float )src0_row[i0] : 0 .0f , (float )src1_row[i10]);
5659 }
60+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
61+ cudaTriggerProgrammaticLaunchCompletion ();
62+ #endif
5763}
5864
5965template <float (*bin_op)(const float , const float ), typename src0_t , typename src1_t , typename dst_t >
@@ -63,6 +69,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
6369 /* int s0, */ int s1, int s2, int s3,
6470 /* int s00,*/ int s01, int s02, int s03,
6571 /* int s10,*/ int s11, int s12, int s13) {
72+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
73+ cudaGridDependencySynchronize ();
74+ #endif
6675
6776 const int i = blockDim .x *blockIdx .x + threadIdx .x ;
6877
@@ -89,13 +98,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
8998
9099 const int i10 = i0 % ne10;
91100 dst_row[i0] = (dst_t )bin_op (src0 ? (float )src0_row[i0] : 0 .0f , (float )src1_row[i10]);
101+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
102+ cudaTriggerProgrammaticLaunchCompletion ();
103+ #endif
92104}
93105
94106template <typename T>
95107static __global__ void k_repeat_back (
96108 const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
97109 const size_t s00, const size_t s01, const size_t s02, const size_t s03,
98110 const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
111+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
112+ cudaGridDependencySynchronize ();
113+ #endif
99114
100115 const int64_t tid0 = int64_t (blockIdx .x )*blockDim .x + threadIdx .x ;
101116 const int64_t tid1 = int64_t (blockIdx .y )*blockDim .y + threadIdx .y ;
@@ -118,6 +133,9 @@ static __global__ void k_repeat_back(
118133 }
119134 }
120135 dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
136+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
137+ cudaTriggerProgrammaticLaunchCompletion ();
138+ #endif
121139}
122140
123141template <float (*bin_op)(const float , const float )>
0 commit comments