@@ -12176,18 +12176,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(&kv_used, sizeof(kv_used));

         if (kv_buf_size) {
-            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
             std::vector<uint8_t> tmp_buf;
             for (int il = 0; il < (int) n_layer; ++il) {
-                tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+                size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+                tmp_buf.resize(k_size);
                 ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                 data_ctx->write(tmp_buf.data(), tmp_buf.size());

                 // v is not contiguous, copy row by row
-                tmp_buf.resize(elt_size*kv_head);
+                size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+                size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+                tmp_buf.resize(v_row_size);
                 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
                     data_ctx->write(tmp_buf.data(), tmp_buf.size());
                 }
             }
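
The switch from ggml_element_size() to ggml_row_size() is what makes this dump correct for quantized KV cache types: quantized ggml types pack a fixed block of elements into one struct, so the byte size of n elements is ggml_type_size(type)*n/ggml_blck_size(type), not element_size*n. For F16 the two computations agree (block size 1), which is why the old code worked until quantized K/V types were allowed. A minimal sketch of the distinction, assuming the ggml headers from this tree (row_bytes() is an illustrative helper, not part of the patch; the Q8_0 case assumes its block size of 32):

    // Sketch: why ggml_row_size() differs from ggml_element_size()*n for
    // quantized types. row_bytes() is a hypothetical helper.
    #include "ggml.h"
    #include <cassert>
    #include <cstdint>

    static size_t row_bytes(enum ggml_type type, int64_t n_elements) {
        // ggml_row_size(type, n) == ggml_type_size(type)*n/ggml_blck_size(type)
        return ggml_row_size(type, n_elements);
    }

    int main() {
        // F16: block size 1, so row size == n * type size.
        assert(row_bytes(GGML_TYPE_F16, 64) == 64*ggml_type_size(GGML_TYPE_F16));
        // Q8_0: 32 elements per block, so 64 elements occupy exactly
        // 2 block structs -- far fewer bytes than 64 * "element size".
        assert(row_bytes(GGML_TYPE_Q8_0, 64) == 2*ggml_type_size(GGML_TYPE_Q8_0));
        return 0;
    }

Indexing per layer (k_l[il]->type, v_l[il]->type) instead of k_l[0] also allows K and V to use different types.
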
@@ -12289,17 +12290,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         if (kv_buf_size) {
             GGML_ASSERT(kv_self.total_size() == kv_buf_size);

-            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
             for (int il = 0; il < (int) n_layer; ++il) {
-                size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+                size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
                 ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
                 inp += k_size;

                 // v is not contiguous, copy row by row
-                size_t v_row_size = elt_size*kv_head;
+                size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+                size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
                 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                     inp += v_row_size;
                 }
             }
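
Both hunks rely on the same V-cache layout: each of the n_embd_v_gqa rows spans n_ctx element slots, of which only the first kv_head are populated, so a dense dump copies row_size bytes at row_stride intervals. A sketch of the read side under that assumed layout (read_v_rows() is an illustrative name, not part of llama.cpp):

    // Sketch of the strided V-cache copy pattern used above. Assumed layout:
    // n_embd_v_gqa rows of n_ctx element slots, only the first kv_head of
    // each row in use. read_v_rows() is hypothetical.
    #include "ggml.h"
    #include "ggml-backend.h"
    #include <cstdint>
    #include <vector>

    static std::vector<uint8_t> read_v_rows(const struct ggml_tensor * v,
            uint32_t n_ctx, uint32_t kv_head, uint32_t n_embd_v_gqa) {
        const size_t row_size   = ggml_row_size(v->type, kv_head); // live bytes per row
        const size_t row_stride = ggml_row_size(v->type, n_ctx);   // bytes between row starts
        std::vector<uint8_t> out(row_size*n_embd_v_gqa);
        for (uint32_t ir = 0; ir < n_embd_v_gqa; ++ir) {
            // read only the populated prefix of row ir; the unused tail
            // up to n_ctx is skipped via the stride
            ggml_backend_tensor_get(v, out.data() + ir*row_size, ir*row_stride, row_size);
        }
        return out;
    }

Restoring state in llama_set_state_data() is the mirror image: ggml_backend_tensor_set() with the same per-row offsets, advancing the input pointer by v_row_size per row.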