@@ -793,6 +793,8 @@ void launch_fattn(
     GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
         "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
 
+    GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
+
     ggml_cuda_pool & pool        = ctx.pool();
     cudaStream_t    main_stream  = ctx.stream();
     const int id = ggml_cuda_get_device();
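Note: the added assertion makes the KV-cache padding requirement explicit at the top of launch_fattn: K->ne[1] (the number of KV rows) must be a multiple of FATTN_KQ_STRIDE. A minimal, self-contained sketch of how a caller satisfies this by rounding the KV length up; the local GGML_PAD and FATTN_KQ_STRIDE definitions are repeated here only to keep the sketch compilable, and 256 is assumed to match the value in fattn-common.cuh:

    #include <cstdint>
    #include <cstdio>

    #define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n)) // round x up to a multiple of n
    #define FATTN_KQ_STRIDE 256                          // assumed to match fattn-common.cuh

    int main() {
        const int64_t n_kv        = 1000;                            // unpadded KV length
        const int64_t n_kv_padded = GGML_PAD(n_kv, FATTN_KQ_STRIDE); // 1024, satisfies the new assert
        printf("%lld -> %lld\n", (long long) n_kv, (long long) n_kv_padded);
        return 0;
    }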
@@ -876,7 +878,7 @@ void launch_fattn(
     // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
     // Only worth the overhead if there is at least one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
     // multiple sequences of possibly different lengths.
-    if (mask && K->ne[1] % FATTN_KQ_STRIDE == 0 && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
+    if (mask && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
         const int s31 = mask->nb[1] / sizeof(half2);
         const int s33 = mask->nb[3] / sizeof(half2);
 
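Note: with the assert above guaranteeing K->ne[1] % FATTN_KQ_STRIDE == 0 for every call, the divisibility clause in this condition is always true and can be dropped, so the mask-scan optimization is now gated only on the query count and the number of sequences. A standalone sketch of the simplified gate; use_mask_scan and its parameter names are invented here for illustration:

    #include <cstdint>

    // Sketch only: enable the mask scan when it is likely to pay off, i.e. for large
    // query batches or when multiple sequences (of possibly different lengths) are present.
    bool use_mask_scan(bool has_mask, int64_t n_queries, int64_t n_sequences) {
        return has_mask && (n_queries >= 1024 || n_sequences > 1);
    }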
@@ -915,7 +917,8 @@ void launch_fattn(
 
         dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
     } else {
-        const int ntiles_KQ = (K->ne[1] + KQ_row_granularity - 1) / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
+        GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
+        const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
 
         // parallel_blocks must not be larger than what the tensor size allows:
         parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
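Note: the added GGML_ASSERT turns the previous implicit rounding into a documented precondition; once K->ne[1] is an exact multiple of KQ_row_granularity, ceiling division and plain division produce the same ntiles_KQ. A small worked example with hypothetical values (4096 KV rows, granularity 64):

    #include <cassert>

    int main() {
        const int kv_len      = 4096; // hypothetical padded KV length
        const int granularity = 64;   // hypothetical KQ_row_granularity
        const int ntiles_old  = (kv_len + granularity - 1) / granularity; // previous: ceiling division
        const int ntiles_new  = kv_len / granularity;                     // new: exact division under the assert
        assert(kv_len % granularity == 0 && ntiles_old == ntiles_new);    // both are 64
        return 0;
    }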
@@ -944,7 +947,7 @@ void launch_fattn(
 
         blocks_num.x = ntiles_x;
         blocks_num.y = parallel_blocks;
-        blocks_num.z = (Q->ne[2]/ncols2)*Q->ne[3];
+        blocks_num.z = Q->ne[2]*Q->ne[3];
 
         if (parallel_blocks > 1) {
             dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));