Skip to content

Commit da7fd4a

Browse files
committed
Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/musa.Dockerfile
#	.github/workflows/build.yml
#	README.md
#	ci/README.md
#	docs/docker.md
#	examples/lookahead/lookahead.cpp
#	examples/lookup/lookup.cpp
#	examples/parallel/parallel.cpp
#	ggml/src/ggml-musa/CMakeLists.txt
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	tests/test-arg-parser.cpp
2 parents 9f976e9 + 3398305 commit da7fd4a

File tree

22 files changed

+270
-384
lines changed

22 files changed

+270
-384
lines changed

common/arg.cpp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1453,7 +1453,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14531453
[](common_params & params) {
14541454
params.swa_full = true;
14551455
}
1456-
));
1456+
).set_env("LLAMA_ARG_SWA_FULL"));
14571457
add_opt(common_arg(
14581458
{"--no-context-shift"},
14591459
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2066,13 +2066,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
20662066
params.grp_attn_w = value;
20672067
}
20682068
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
2069-
add_opt(common_arg(
2070-
{"-dkvc", "--dump-kv-cache"},
2071-
"verbose print of the KV cache",
2072-
[](common_params & params) {
2073-
params.dump_kv_cache = true;
2074-
}
2075-
));
20762069
add_opt(common_arg(
20772070
{"-nkvo", "--no-kv-offload"},
20782071
"disable KV offload",

common/common.cpp

Lines changed: 0 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1337,81 +1337,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
13371337
return text;
13381338
}
13391339

1340-
//
1341-
// KV cache utils
1342-
//
1343-
1344-
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
1345-
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
1346-
1347-
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
1348-
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1349-
1350-
llama_kv_cache_view_cell * c_curr = view.cells;
1351-
llama_seq_id * cs_curr = view.cells_sequences;
1352-
1353-
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1354-
if (i % row_size == 0) {
1355-
printf("\n%5d: ", i);
1356-
}
1357-
int seq_count = 0;
1358-
for (int j = 0; j < view.n_seq_max; j++) {
1359-
if (cs_curr[j] >= 0) { seq_count++; }
1360-
}
1361-
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
1362-
}
1363-
1364-
printf("\n=== Done dumping\n");
1365-
}
1366-
1367-
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
1368-
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
1369-
1370-
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
1371-
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1372-
1373-
std::unordered_map<llama_seq_id, size_t> seqs;
1374-
llama_kv_cache_view_cell * c_curr = view.cells;
1375-
llama_seq_id * cs_curr = view.cells_sequences;
1376-
1377-
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1378-
for (int j = 0; j < view.n_seq_max; j++) {
1379-
if (cs_curr[j] < 0) { continue; }
1380-
if (seqs.find(cs_curr[j]) == seqs.end()) {
1381-
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1382-
const size_t sz = seqs.size();
1383-
seqs[cs_curr[j]] = sz;
1384-
}
1385-
}
1386-
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1387-
}
1388-
1389-
printf("=== Sequence legend: ");
1390-
for (const auto & it : seqs) {
1391-
printf("%zu=%d, ", it.second, it.first);
1392-
}
1393-
printf("'+'=other sequence ids");
1394-
1395-
c_curr = view.cells;
1396-
cs_curr = view.cells_sequences;
1397-
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1398-
if (i % row_size == 0) {
1399-
printf("\n%5d: ", i);
1400-
}
1401-
for (int j = 0; j < view.n_seq_max; j++) {
1402-
if (cs_curr[j] >= 0) {
1403-
const auto & it = seqs.find(cs_curr[j]);
1404-
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
1405-
} else {
1406-
putchar('.');
1407-
}
1408-
}
1409-
putchar(' ');
1410-
}
1411-
1412-
printf("\n=== Done dumping\n");
1413-
}
1414-
14151340
//
14161341
// Embedding utils
14171342
//

common/common.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,6 @@ struct common_params {
326326
bool use_mlock = false; // use mlock to keep model in memory
327327
bool verbose_prompt = false; // print prompt tokens before generation
328328
bool display_prompt = true; // print prompt before generation
329-
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
330329
bool no_kv_offload = false; // disable KV offloading
331330
bool warmup = true; // warmup run
332331
bool check_tensors = false; // validate tensor data
@@ -618,16 +617,6 @@ std::string common_detokenize(
618617
const std::vector<llama_token> & tokens,
619618
bool special = true);
620619

621-
//
622-
// KV cache utils
623-
//
624-
625-
// Dump the KV cache view with the number of sequences per cell.
626-
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
627-
628-
// Dump the KV cache view showing individual sequences in each cell (long output).
629-
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
630-
631620
//
632621
// Embedding utils
633622
//

ggml/src/ggml-cuda/cpy.cu

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#include "cpy.cuh"
22
#include "dequantize.cuh"
3+
#ifdef GGML_USE_MUSA
4+
#include "ggml-musa/mudnn.cuh"
5+
#endif // GGML_USE_MUSA
36

47
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
58

@@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
597600
#endif
598601
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
599602
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
600-
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
603+
#ifdef GGML_USE_MUSA
604+
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
605+
CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
606+
} else
607+
#endif // GGML_USE_MUSA
608+
{
609+
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
610+
}
601611
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
602612
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
603613
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
772772
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
773773
GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
774774
GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
775-
GGML_UNUSED(kb0);
775+
GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
776776
NO_DEVICE_CODE;
777777
#endif // NEW_MMA_AVAILABLE
778778
}

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
#include "fattn-common.cuh"
33

44
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
5-
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
5+
#ifndef GGML_USE_HIP
66
__launch_bounds__(D, 1)
7-
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
7+
#endif // GGML_USE_HIP
88
static __global__ void flash_attn_vec_ext_f16(
99
const char * __restrict__ Q,
1010
const char * __restrict__ K,
@@ -48,6 +48,12 @@ static __global__ void flash_attn_vec_ext_f16(
4848
NO_DEVICE_CODE;
4949
return;
5050
}
51+
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
52+
if (ncols > 1) {
53+
NO_DEVICE_CODE;
54+
return;
55+
}
56+
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
5157

5258
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
5359

@@ -91,6 +97,13 @@ static __global__ void flash_attn_vec_ext_f16(
9197
kqsum_shared[j][threadIdx.x] = 0.0f;
9298
}
9399
}
100+
101+
__shared__ half maskh_shared[ncols*D];
102+
#pragma unroll
103+
for (int j = 0; j < ncols; ++j) {
104+
maskh_shared[j*D + tid] = 0.0f;
105+
}
106+
94107
__syncthreads();
95108

96109
// Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -175,6 +188,35 @@ static __global__ void flash_attn_vec_ext_f16(
175188
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
176189
// Calculate KQ tile and keep track of new maximum KQ values:
177190

191+
if (mask) {
192+
#pragma unroll
193+
for (int j = 0; j < ncols; ++j) {
194+
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid];
195+
}
196+
197+
__syncthreads();
198+
199+
// When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
200+
// In such cases, skip the KV slice.
201+
// On AMD __all_sync would not work correctly because it assumes a warp size of 64.
202+
#ifndef GGML_USE_HIP
203+
bool skip = true;
204+
#pragma unroll
205+
for (int j = 0; j < ncols; ++j) {
206+
#pragma unroll
207+
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
208+
const int i = i0 + threadIdx.x;
209+
210+
const float2 tmp = __half22float2(((const half2 *) maskh_shared)[j*(D/2) + i]);
211+
skip = skip && isinf(tmp.x) && isinf(tmp.y);
212+
}
213+
}
214+
if (__all_sync(0xFFFFFFFF, skip)) {
215+
continue;
216+
}
217+
#endif // GGML_USE_HIP
218+
}
219+
178220
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
179221
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
180222
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
@@ -202,7 +244,7 @@ static __global__ void flash_attn_vec_ext_f16(
202244
sum = logit_softcap*tanhf(sum);
203245
}
204246

205-
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
247+
sum += maskh_shared[j*D + i_KQ];
206248

207249
if (ncols == 1) {
208250
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
@@ -335,7 +377,9 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
335377
float logit_softcap;
336378
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
337379

338-
if (Q->ne[1] == 1) {
380+
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
381+
382+
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
339383
constexpr int cols_per_block = 1;
340384
if (logit_softcap == 0.0f) {
341385
constexpr bool use_logit_softcap = false;

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
#include "fattn-common.cuh"
33

44
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
5-
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
5+
#ifndef GGML_USE_HIP
66
__launch_bounds__(D, 1)
7-
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
7+
#endif // GGML_USE_HIP
88
static __global__ void flash_attn_vec_ext_f32(
99
const char * __restrict__ Q,
1010
const char * __restrict__ K,
@@ -60,6 +60,12 @@ static __global__ void flash_attn_vec_ext_f32(
6060
NO_DEVICE_CODE;
6161
return;
6262
}
63+
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
64+
if (ncols > 1) {
65+
NO_DEVICE_CODE;
66+
return;
67+
}
68+
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
6369

6470
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
6571

@@ -104,6 +110,13 @@ static __global__ void flash_attn_vec_ext_f32(
104110
kqsum_shared[j][threadIdx.x] = 0.0f;
105111
}
106112
}
113+
114+
__shared__ float maskf_shared[ncols*D];
115+
#pragma unroll
116+
for (int j = 0; j < ncols; ++j) {
117+
maskf_shared[j*D + tid] = 0.0f;
118+
}
119+
107120
__syncthreads();
108121

109122
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
@@ -181,6 +194,34 @@ static __global__ void flash_attn_vec_ext_f32(
181194
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
182195
// Calculate KQ tile and keep track of new maximum KQ values:
183196

197+
if (mask) {
198+
#pragma unroll
199+
for (int j = 0; j < ncols; ++j) {
200+
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]);
201+
}
202+
203+
__syncthreads();
204+
205+
// When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
206+
// In such cases, skip the KV slice.
207+
// On AMD __all_sync would not work correctly because it assumes a warp size of 64.
208+
#ifndef GGML_USE_HIP
209+
bool skip = true;
210+
#pragma unroll
211+
for (int j = 0; j < ncols; ++j) {
212+
#pragma unroll
213+
for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
214+
const int i = i0 + threadIdx.x;
215+
216+
skip = skip && isinf(maskf_shared[j*D + i]);
217+
}
218+
}
219+
if (__all_sync(0xFFFFFFFF, skip)) {
220+
continue;
221+
}
222+
#endif // GGML_USE_HIP
223+
}
224+
184225
float kqmax_new_arr[ncols];
185226
#pragma unroll
186227
for (int j = 0; j < ncols; ++j) {
@@ -204,7 +245,7 @@ static __global__ void flash_attn_vec_ext_f32(
204245
sum = logit_softcap*tanhf(sum);
205246
}
206247

207-
sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
248+
sum += maskf_shared[j*D + i_KQ];
208249

209250
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
210251

@@ -326,7 +367,9 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
326367
float logit_softcap;
327368
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
328369

329-
if (Q->ne[1] == 1) {
370+
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
371+
372+
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
330373
constexpr int cols_per_block = 1;
331374
if (logit_softcap == 0.0f) {
332375
constexpr bool use_logit_softcap = false;

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3255,7 +3255,7 @@ template<
32553255
typename kd4x4_t, // key type in device memory
32563256
short nl_k,
32573257
void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
3258-
typename vd4x4_t, // key type in device memory
3258+
typename vd4x4_t, // value type in device memory
32593259
short nl_v,
32603260
void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
32613261
short DK, // K head size
@@ -3776,7 +3776,7 @@ template<
37763776
typename kd4_t, // key type in device memory
37773777
short nl_k,
37783778
void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
3779-
typename vd4_t, // key type in device memory
3779+
typename vd4_t, // value type in device memory
37803780
short nl_v,
37813781
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
37823782
short DK, // K head size

0 commit comments

Comments (0)