
Commit b7578a4
Merge branch 'concedo_experimental' into crokeso
2 parents: a982c99 + 4403503

32 files changed: +1677 -1007 lines

common/arg.cpp
Lines changed: 8 additions & 7 deletions

@@ -1447,6 +1447,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2059,13 +2067,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
        }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",

common/common.cpp
Lines changed: 1 addition & 75 deletions

@@ -1144,6 +1144,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn   = params.flash_attn;
     cparams.no_perf      = params.no_perf;
     cparams.op_offload   = !params.no_op_offload;
+    cparams.swa_full     = params.swa_full;

     if (params.reranking) {
         cparams.embeddings = true;
@@ -1336,81 +1337,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }

-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
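The single added line is all the plumbing needed on the common side. A small hedged sketch of the propagation path, using the common_context_params_to_llama function named in the hunk header (the printf check is illustrative, not part of the commit, and it assumes llama_context_params carries the swa_full field introduced by the referenced PR):

// hypothetical snippet showing how --swa-full reaches the llama context params
#include "common.h"   // common_params, common_context_params_to_llama
#include "llama.h"    // llama_context_params

#include <cstdio>

int main() {
    common_params params;
    params.swa_full = true;  // what --swa-full / LLAMA_ARG_SWA_FULL sets

    llama_context_params cparams = common_context_params_to_llama(params);
    printf("cparams.swa_full = %s\n", cparams.swa_full ? "true" : "false");
    // a context created from cparams would then allocate a full-size SWA KV cache
    return 0;
}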

common/common.h
Lines changed: 1 addition & 11 deletions

@@ -319,13 +319,13 @@ struct common_params {
     bool flash_attn        = false; // flash attention
     bool no_perf           = false; // disable performance metrics
     bool ctx_shift         = true;  // context shift on inifinite text generation
+    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
-    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload     = false; // disable KV offloading
     bool warmup            = true;  // warmup run
     bool check_tensors     = false; // validate tensor data
@@ -617,16 +617,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);

-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //

ggml/src/ggml-cuda/cpy.cu
Lines changed: 11 additions & 1 deletion

@@ -1,5 +1,8 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
+#ifdef GGML_USE_MUSA
+#include "ggml-musa/mudnn.cuh"
+#endif // GGML_USE_MUSA

 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

@@ -672,7 +675,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+#ifdef GGML_USE_MUSA
+        if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
+            CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
+        } else
+#endif // GGML_USE_MUSA
+        {
+            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
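The MUSA branch relies on a slightly unusual preprocessor construction: the "} else" immediately before the "#endif" makes the braced cudaMemcpyAsync block the else-branch when GGML_USE_MUSA is defined, while without MUSA the same braces form an ordinary block, so the fallback always compiles. A standalone sketch of that pattern (hypothetical names, not llama.cpp code):

#include <cstdio>

// prints which copy path would be taken for a given tensor type
static void copy_contiguous(bool is_f32_or_f16) {
#ifdef GGML_USE_MUSA
    if (is_f32_or_f16) {
        printf("mudnn copy path\n");          // MUSA-accelerated copy
    } else
#endif // GGML_USE_MUSA
    {                                         // else-branch with MUSA, plain block without it
        printf("generic async copy path\n");
    }
}

int main() {
    copy_contiguous(true);
    copy_contiguous(false);
    return 0;
}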

ggml/src/ggml-cuda/fattn-mma-f16.cuh
Lines changed: 1 addition & 1 deletion

@@ -414,7 +414,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
     GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
     GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
-    GGML_UNUSED(kb0);
+    GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }

ggml/src/ggml-cuda/fattn-vec-f16.cuh
Lines changed: 48 additions & 4 deletions

@@ -2,9 +2,9 @@
 #include "fattn-common.cuh"

 template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#ifndef GGML_USE_HIP
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // GGML_USE_HIP
 static __global__ void flash_attn_vec_ext_f16(
     const char * __restrict__ Q,
     const char * __restrict__ K,
@@ -48,6 +48,12 @@ static __global__ void flash_attn_vec_ext_f16(
         NO_DEVICE_CODE;
         return;
     }
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    if (ncols > 1) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

@@ -91,6 +97,13 @@ static __global__ void flash_attn_vec_ext_f16(
             kqsum_shared[j][threadIdx.x] = 0.0f;
         }
     }
+
+    __shared__ half maskh_shared[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        maskh_shared[j*D + tid] = 0.0f;
+    }
+
     __syncthreads();

     // Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -174,6 +187,35 @@ static __global__ void flash_attn_vec_ext_f16(
     for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
         // Calculate KQ tile and keep track of new maximum KQ values:

+        if (mask) {
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid];
+            }
+
+            __syncthreads();
+
+            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
+            // In such cases, skip the KV slice.
+            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
+#ifndef GGML_USE_HIP
+            bool skip = true;
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+#pragma unroll
+                for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+                    const int i = i0 + threadIdx.x;
+
+                    const float2 tmp = __half22float2(((const half2 *) maskh_shared)[j*(D/2) + i]);
+                    skip = skip && isinf(tmp.x) && isinf(tmp.y);
+                }
+            }
+            if (__all_sync(0xFFFFFFFF, skip)) {
+                continue;
+            }
+#endif // GGML_USE_HIP
+        }
+
         // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
         // see https://github.com/ggerganov/llama.cpp/pull/7061 .
         // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
@@ -201,7 +243,7 @@ static __global__ void flash_attn_vec_ext_f16(
                 sum = logit_softcap*tanhf(sum);
             }

-            sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
+            sum += maskh_shared[j*D + i_KQ];

             if (ncols == 1) {
                 kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
@@ -334,7 +376,9 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
     float logit_softcap;
     memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    if (Q->ne[1] == 1) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+
+    if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
         constexpr int cols_per_block = 1;
         if (logit_softcap == 0.0f) {
             constexpr bool use_logit_softcap = false;
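The new skip logic above stages the mask tile in shared memory and then takes a warp vote to decide whether the whole KV slice can be skipped. Below is a standalone CUDA sketch of the same idea; it is a hypothetical kernel, not the llama.cpp one, and it assumes NVIDIA's 32-thread warps, which is exactly why the real code guards the vote with #ifndef GGML_USE_HIP.

#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>

// One warp scans each KV "slice" of the mask; if every element a thread saw is
// -inf, the warp-wide vote (__all_sync) lets the whole warp skip that slice.
__global__ void count_unmasked_slices(const float * mask, int n_slices, int slice_len, int * n_processed) {
    for (int s = 0; s < n_slices; ++s) {
        bool skip = true;
        for (int i = threadIdx.x; i < slice_len; i += 32) {
            skip = skip && isinf(mask[s*slice_len + i]);   // -inf marks a masked-out position
        }
        if (__all_sync(0xFFFFFFFF, skip)) {
            continue;                                      // entire slice masked -> nothing to compute
        }
        if (threadIdx.x == 0) {
            atomicAdd(n_processed, 1);                     // this slice would actually be processed
        }
    }
}

int main() {
    const int n_slices = 4, slice_len = 64;
    float h_mask[n_slices*slice_len];
    for (int s = 0; s < n_slices; ++s) {
        for (int i = 0; i < slice_len; ++i) {
            h_mask[s*slice_len + i] = (s % 2 == 0) ? -INFINITY : 0.0f;  // mask out slices 0 and 2
        }
    }

    float * d_mask  = nullptr;
    int   * d_count = nullptr;
    int     h_count = 0;
    cudaMalloc(&d_mask, sizeof(h_mask));
    cudaMalloc(&d_count, sizeof(int));
    cudaMemcpy(d_mask, h_mask, sizeof(h_mask), cudaMemcpyHostToDevice);
    cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);

    count_unmasked_slices<<<1, 32>>>(d_mask, n_slices, slice_len, d_count);
    cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);

    printf("slices processed: %d (expected 2)\n", h_count);

    cudaFree(d_mask);
    cudaFree(d_count);
    return 0;
}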

ggml/src/ggml-cuda/fattn-vec-f32.cuh
Lines changed: 47 additions & 4 deletions

@@ -2,9 +2,9 @@
 #include "fattn-common.cuh"

 template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#ifndef GGML_USE_HIP
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // GGML_USE_HIP
 static __global__ void flash_attn_vec_ext_f32(
     const char * __restrict__ Q,
     const char * __restrict__ K,
@@ -60,6 +60,12 @@ static __global__ void flash_attn_vec_ext_f32(
         NO_DEVICE_CODE;
         return;
     }
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+    if (ncols > 1) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

@@ -104,6 +110,13 @@ static __global__ void flash_attn_vec_ext_f32(
             kqsum_shared[j][threadIdx.x] = 0.0f;
         }
     }
+
+    __shared__ float maskf_shared[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        maskf_shared[j*D + tid] = 0.0f;
+    }
+
     __syncthreads();

     // Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -181,6 +194,34 @@ static __global__ void flash_attn_vec_ext_f32(
     for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
         // Calculate KQ tile and keep track of new maximum KQ values:

+        if (mask) {
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]);
+            }
+
+            __syncthreads();
+
+            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
+            // In such cases, skip the KV slice.
+            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
+#ifndef GGML_USE_HIP
+            bool skip = true;
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+#pragma unroll
+                for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
+                    const int i = i0 + threadIdx.x;
+
+                    skip = skip && isinf(maskf_shared[j*D + i]);
+                }
+            }
+            if (__all_sync(0xFFFFFFFF, skip)) {
+                continue;
+            }
+#endif // GGML_USE_HIP
+        }
+
         float kqmax_new_arr[ncols];
 #pragma unroll
         for (int j = 0; j < ncols; ++j) {
@@ -204,7 +245,7 @@ static __global__ void flash_attn_vec_ext_f32(
                 sum = logit_softcap*tanhf(sum);
             }

-            sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
+            sum += maskf_shared[j*D + i_KQ];

             kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);

@@ -326,7 +367,9 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
     float logit_softcap;
     memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    if (Q->ne[1] == 1) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+
+    if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
         constexpr int cols_per_block = 1;
         if (logit_softcap == 0.0f) {
             constexpr bool use_logit_softcap = false;

ggml/src/ggml-metal/ggml-metal.metal
Lines changed: 2 additions & 2 deletions

@@ -3159,7 +3159,7 @@ template<
     typename kd4x4_t, // key type in device memory
     short nl_k,
     void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
-    typename vd4x4_t, // key type in device memory
+    typename vd4x4_t, // value type in device memory
     short nl_v,
     void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
     short DK, // K head size
@@ -3680,7 +3680,7 @@ template<
     typename kd4_t, // key type in device memory
     short nl_k,
     void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
-    typename vd4_t, // key type in device memory
+    typename vd4_t, // value type in device memory
     short nl_v,
     void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
     short DK, // K head size
