Skip to content

Commit 7fe4678

Browse files
authored
llama : fix session save/load with quantized KV (ggml-org#5649)
1 parent ba2135c commit 7fe4678

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

llama.cpp

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -12176,18 +12176,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
1217612176
data_ctx->write(&kv_used, sizeof(kv_used));
1217712177

1217812178
if (kv_buf_size) {
12179-
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
12180-
1218112179
std::vector<uint8_t> tmp_buf;
1218212180
for (int il = 0; il < (int) n_layer; ++il) {
12183-
tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
12181+
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
12182+
tmp_buf.resize(k_size);
1218412183
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
1218512184
data_ctx->write(tmp_buf.data(), tmp_buf.size());
1218612185

1218712186
// v is not contiguous, copy row by row
12188-
tmp_buf.resize(elt_size*kv_head);
12187+
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12188+
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12189+
tmp_buf.resize(v_row_size);
1218912190
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12190-
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
12191+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
1219112192
data_ctx->write(tmp_buf.data(), tmp_buf.size());
1219212193
}
1219312194
}
@@ -12289,17 +12290,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
1228912290
if (kv_buf_size) {
1229012291
GGML_ASSERT(kv_self.total_size() == kv_buf_size);
1229112292

12292-
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
12293-
1229412293
for (int il = 0; il < (int) n_layer; ++il) {
12295-
size_t k_size = elt_size*n_embd_k_gqa*kv_head;
12294+
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
1229612295
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
1229712296
inp += k_size;
1229812297

1229912298
// v is not contiguous, copy row by row
12300-
size_t v_row_size = elt_size*kv_head;
12299+
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12300+
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
1230112301
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12302-
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
12302+
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
1230312303
inp += v_row_size;
1230412304
}
1230512305
}

0 commit comments

Comments (0)