Merged (changes from all commits)
2 changes: 1 addition & 1 deletion ci/README.md

@@ -60,7 +60,7 @@ docker run --privileged -it \
 Inside the container, execute the following commands:
 
 ```bash
-apt update -y && apt install -y bc cmake git python3.10-venv time unzip wget
+apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
 git config --global --add safe.directory /ws
 GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
 ```
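Note: adding ccache to the container lets repeated CI runs reuse previous compiler output; as of this writing ggml's CMake build picks ccache up automatically when the binary is present (the GGML_CCACHE option defaults to ON), so no further configuration is needed here.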
2 changes: 1 addition & 1 deletion ci/run.sh

@@ -69,7 +69,7 @@ fi
 if [ ! -z ${GG_BUILD_MUSA} ]; then
     # Use qy1 by default (MTT S80)
     MUSA_ARCH=${MUSA_ARCH:-21}
-    CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
 ## helpers
 
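Appending to CMAKE_EXTRA rather than assigning it outright preserves whatever flags earlier GG_BUILD_* blocks in run.sh have already accumulated, so the MUSA options can be combined with other build toggles instead of silently discarding them.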
18 changes: 12 additions & 6 deletions ggml/src/ggml-common.h

@@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     ggml_half d; // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
 
 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
 
 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
 
 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d; // delta
             ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
         struct {
             ggml_half d;    // super-block scale for quantized scales
             ggml_half dmin; // super-block scale for quantized mins
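For readers unfamiliar with the macro: anonymous struct members inside a union are a compiler extension, and GCC/Clang warn about them under -Wpedantic unless the declaration is marked __extension__, which MSVC does not recognize. A minimal sketch of the pattern, with ggml_half stubbed out by uint16_t so the snippet is self-contained:

```cpp
// Minimal sketch of the GGML_EXTENSION pattern above; ggml_half is stubbed
// out with uint16_t for illustration.
#include <stdint.h>

#ifdef _MSC_VER
#define GGML_EXTENSION
#else // _MSC_VER
#define GGML_EXTENSION __extension__
#endif // _MSC_VER

typedef uint16_t ggml_half_stub;

typedef struct {
    GGML_EXTENSION union {
        struct {
            ggml_half_stub d; // delta
            ggml_half_stub m; // min
        };
        uint32_t dm; // both scales, addressable as a single 32-bit value
    };
    uint8_t qs[16]; // packed quants
} block_sketch;
```

The union is worth the extension: it lets the two half-precision scales be read either individually or as one 32-bit load.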
4 changes: 4 additions & 0 deletions ggml/src/ggml-cuda/common.cuh

@@ -288,6 +288,10 @@ static __device__ void no_device_code(
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
+
+#if defined(GGML_USE_MUSA)
+    __builtin_unreachable();
+#endif // defined(GGML_USE_MUSA)
 }
 
 #ifdef __CUDA_ARCH__
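The hint matters because a compiler that does not model __trap() as noreturn can emit missing-return or spurious dead-code diagnostics for paths that trap and then fall off the end. A host-side sketch of the idiom, assuming GCC/Clang builtins:

```cpp
// Host-side sketch of the same idiom. __builtin_trap() is already modeled as
// noreturn by GCC/Clang, so the marker is redundant here; it is exactly the
// hint that a toolchain without that model (per the diff, MUSA's) needs
// after __trap().
static int get_or_die(bool ok) {
    if (ok) {
        return 42;
    }
    __builtin_trap();        // analogous to CUDA's __trap(): abort immediately
    __builtin_unreachable(); // control provably never reaches this point
}
```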
4 changes: 2 additions & 2 deletions ggml/src/ggml-cuda/concat.cu

@@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.y < ne01) { // src0
+    if (blockIdx.y < (unsigned)ne01) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
@@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
         blockIdx.y * ne0 +
         blockIdx.z * ne0 * gridDim.y;
 
-    if (blockIdx.z < ne02) { // src0
+    if (blockIdx.z < (unsigned)ne02) { // src0
         int offset_src =
             nidx +
             blockIdx.y * ne0 +
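Both casts fix the same -Wsign-compare warning: the blockIdx components are unsigned int, while the extents ne01/ne02 are signed. The conversion is value-preserving because tensor extents are never negative. A hypothetical kernel (not from the diff) showing the shape of the fix:

```cpp
// ne is a signed extent known to be non-negative, so converting it to
// unsigned before the comparison is value-preserving and warning-free.
__global__ void copy_prefix(const float * src, float * dst, const int ne) {
    // launched with one thread per block; surplus blocks do nothing
    if (blockIdx.x < (unsigned)ne) {
        dst[blockIdx.x] = src[blockIdx.x];
    }
}
```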
6 changes: 4 additions & 2 deletions ggml/src/ggml-cuda/conv-transpose-1d.cu

@@ -34,6 +34,10 @@ static __global__ void conv_transpose_1d_kernel(
         }
     }
     dst[global_index] = accumulator;
+    GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
+    GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
+    GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
+    GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
 }
 
 static void conv_transpose_1d_f32_f32_cuda(
@@ -75,8 +79,6 @@ void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor
     const int p0 = 0;//opts[3];
     const int d0 = 1;//opts[4];
 
-    const int64_t kernel_size = ggml_nelements(src0);
-    const int64_t input_size = ggml_nelements(src1);
     const int64_t output_size = ggml_nelements(dst);
 
     conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
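The two deleted locals in the host function were never read, so dropping them is the cleaner fix; the kernel parameters, by contrast, are part of a fixed signature and are silenced with GGML_UNUSED instead. The macro (defined in ggml.h) is a cast to void; a minimal sketch:

```cpp
// Sketch of the idiom; the real macro lives in ggml.h.
#define GGML_UNUSED(x) (void)(x)

static int scale_only(int value, int debug_level) {
    GGML_UNUSED(debug_level); // evaluate-and-discard: silences
                              // -Wunused-parameter without touching the ABI
    return 2 * value;
}
```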
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/convert.cu

@@ -577,7 +577,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
         return;
     }
 
-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;
 
     y[i] = x[i];
 }
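vx points at read-only data, so the intermediate cast has to preserve the qualifier or the compiler flags it as casting away const (-Wcast-qual). Sketch of the rule:

```cpp
// The cast must keep the const qualifier of the source pointer.
static float first_element(const void * vx) {
    const float * x = (const float *) vx; // qualifier preserved, no warning
    return x[0];
}
```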
9 changes: 5 additions & 4 deletions ggml/src/ggml-cuda/fattn-common.cuh

@@ -315,14 +315,14 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
 
     float vals[sizeof(int)] = {0.0f};
 #pragma unroll
-    for (int l = 0; l < sizeof(int); ++l) {
+    for (int l = 0; l < int(sizeof(int)); ++l) {
         vals[l] = scale * x[4*threadIdx.x + l];
     }
 
     float amax = fabsf(vals[0]);
     float sum = vals[0];
 #pragma unroll
-    for (int l = 1; l < sizeof(int); ++l) {
+    for (int l = 1; l < int(sizeof(int)); ++l) {
         amax = fmaxf(amax, fabsf(vals[l]));
         sum += vals[l];
     }
@@ -338,7 +338,7 @@
 
     if (d != 0.0f) {
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             q8[l] = roundf(vals[l] / d);
         }
     }
@@ -638,7 +638,7 @@ static __global__ void flash_attn_combine_results(
     float VKQ_denominator = 0.0f;
     for (int l = 0; l < parallel_blocks; ++l) {
         const float diff = meta[l].x - kqmax;
-        const float KQ_max_scale = expf(diff);
+        float KQ_max_scale = expf(diff);
         const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
         *((uint32_t *) &KQ_max_scale) &= ftz_mask;
 
@@ -649,6 +649,7 @@
     dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
 }
 
+[[noreturn]]
 static void on_no_fattn_vec_case(const int D) {
     if (D == 64) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
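Three distinct warning fixes land in this file. The int(sizeof(int)) casts silence -Wsign-compare (sizeof yields an unsigned size_t while the loop counter is a signed int), and the new [[noreturn]] documents that on_no_fattn_vec_case always aborts after printing its diagnostic, so callers do not trip missing-return warnings. The KQ_max_scale change is subtler: the loop rewrites the float's bits in place through a uint32_t alias, which is incompatible with a const object. A standalone host sketch of that flush-to-zero masking, with memcpy replacing the kernel's pointer cast to keep the sketch strict-aliasing-safe and threshold standing in for SOFTMAX_FTZ_THRESHOLD:

```cpp
// Standalone host sketch of the flush-to-zero masking in
// flash_attn_combine_results.
#include <cmath>
#include <cstdint>
#include <cstring>

static float ftz_exp(float diff, float threshold) {
    float scale = std::exp(diff); // cannot be const: its bits change below
    // all-ones when diff exceeds the threshold, all-zeros otherwise
    const uint32_t ftz_mask = 0xFFFFFFFFu * (diff > threshold);
    uint32_t bits;
    std::memcpy(&bits, &scale, sizeof(bits));
    bits &= ftz_mask; // branchless: negligible contributions become +0.0f
    std::memcpy(&scale, &bits, sizeof(scale));
    return scale;
}
```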
91 changes: 58 additions & 33 deletions ggml/src/ggml-cuda/fattn-mma-f16.cuh

@@ -406,6 +406,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #endif // CP_ASYNC_AVAILABLE
 
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
+    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
+    GGML_UNUSED(kb0);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -797,6 +806,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
         __syncthreads();
     }
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
+    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask);
+    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -931,6 +946,16 @@ static __global__ void flash_attn_ext_f16(
         (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
          ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
@@ -985,38 +1010,38 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \
     extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \
 
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32);
-
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32)
+
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64)
 
 // Kernels with ncols == 128 are only 4% faster due to register pressure.
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128); // Needs too much shared memory.
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory.
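The dropped semicolons are not cosmetic: each DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2 expansion appears to end in ';' already, so writing another one after the invocation leaves an empty declaration at namespace scope, which pedantic builds (-Wextra-semi and friends) flag. A hypothetical macro showing the shape of the problem:

```cpp
// Hypothetical macro for illustration: its expansion already supplies the ';'.
#define DECLARE_COUNTER(name) static int name = 0;

DECLARE_COUNTER(n_calls)      // OK: exactly one ';' in the expansion
// DECLARE_COUNTER(n_misses); // the extra ';' is an empty declaration at
                              // namespace scope, warned about under -Wpedantic
```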
14 changes: 13 additions & 1 deletion ggml/src/ggml-cuda/fattn-tile-f16.cu

@@ -282,7 +282,19 @@ static __global__ void flash_attn_tile_ext_f16(
         }
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
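The same fallback pattern repeats in the three files below: when the flash-attention path is compiled out, the kernel body reduces to the NO_DEVICE_CODE trap and every kernel parameter goes unread, so each one is marked with GGML_UNUSED to keep -Wunused-parameter quiet.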
12 changes: 12 additions & 0 deletions ggml/src/ggml-cuda/fattn-tile-f32.cu

@@ -281,6 +281,18 @@ static __global__ void flash_attn_tile_ext_f32(
         }
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
14 changes: 13 additions & 1 deletion ggml/src/ggml-cuda/fattn-vec-f16.cuh

@@ -292,7 +292,19 @@ static __global__ void flash_attn_vec_ext_f16(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
-    NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }
 
10 changes: 10 additions & 0 deletions ggml/src/ggml-cuda/fattn-vec-f32.cuh

@@ -277,6 +277,16 @@ static __global__ void flash_attn_vec_ext_f32(
         dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
     }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }