File tree: 2 files changed (+13, −2 lines changed)
33#include " common.cuh"
44
5+
// Converts a generic device pointer to the 32 bit shared memory address used by
// inline PTX instructions such as cp.async (shared memory uses 32 bit addressing).
// Only callable when cp.async is available (guarded by CP_ASYNC_AVAILABLE);
// on other targets NO_DEVICE_CODE makes the body unreachable and the return
// value exists only to keep the compiler happy.
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
#ifdef CP_ASYNC_AVAILABLE
    return __cvta_generic_to_shared(generic_ptr);
#else
    GGML_UNUSED(generic_ptr);
    NO_DEVICE_CODE;
    // Explicit cast: avoids the implicit signed->unsigned conversion of a bare
    // `return -1;` in a function returning unsigned int (same value, no warning).
    return (unsigned int) -1;
#endif // CP_ASYNC_AVAILABLE
}
15+
516// Copies data from global to shared memory, cg == cache global.
// Both the src and dst pointers must be aligned to 16 bytes.
718// Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
Original file line number Diff line number Diff line change @@ -112,7 +112,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_tile(
112112 // The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes.
113113
114114 if (use_cp_async) {
115- const unsigned int tile_KV_32 = __cvta_generic_to_shared (tile_KV);
115+ const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared (tile_KV);
116116
117117 constexpr int preload = 64 ;
118118 constexpr int h2_per_chunk = 16 /sizeof (half2);
@@ -186,7 +186,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
186186 constexpr int cols_per_warp = 8 *WARP_SIZE/nbatch_fa;
187187 constexpr int stride_j = nwarps * cols_per_warp;
188188
189- const unsigned int tile_mask_32 = __cvta_generic_to_shared (tile_mask);
189+ const unsigned int tile_mask_32 = ggml_cuda_cvta_generic_to_shared (tile_mask);
190190
191191#pragma unroll
192192 for (int j0 = 0 ; j0 < ncols1; j0 += stride_j) {
You can’t perform that action at this time.
0 commit comments