Skip to content

Commit de9df67

Browse files
committed
Merge branch 'master' into esocrok
2 parents 3c38456 + b617cfd commit de9df67

25 files changed

+1593
-290
lines changed

ggml/src/ggml-alloc.c

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -598,6 +598,26 @@ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor
598598
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
599599
}
600600

601+
// free the extra space at the end if the new tensor is smaller
602+
static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
603+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
604+
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
605+
606+
size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
607+
size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
608+
609+
GGML_ASSERT(parent_size >= node_size);
610+
611+
if (parent_size > node_size) {
612+
struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
613+
struct buffer_address p_addr = p_hn->addr;
614+
p_addr.offset += node_size;
615+
size_t extra_size = parent_size - node_size;
616+
AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
617+
ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
618+
}
619+
}
620+
601621
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
602622
GGML_ASSERT(buffer_id >= 0);
603623
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
@@ -643,13 +663,15 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
643663
hn->addr = p_hn->addr;
644664
p_hn->allocated = false; // avoid freeing the parent
645665
view_src_hn->allocated = false;
666+
ggml_gallocr_free_extra_space(galloc, node, view_src);
646667
return;
647668
}
648669
} else {
649670
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
650671
hn->buffer_id = p_hn->buffer_id;
651672
hn->addr = p_hn->addr;
652673
p_hn->allocated = false; // avoid freeing the parent
674+
ggml_gallocr_free_extra_space(galloc, node, parent);
653675
return;
654676
}
655677
}

src/llama-batch.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -123,7 +123,7 @@ class llama_batch_allocr {
123123
uint32_t n_seq_max;
124124
uint32_t n_outputs;
125125

126-
std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
126+
std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id
127127

128128
std::vector<llama_pos> pos;
129129
std::vector<int32_t> n_seq_id;

src/llama-context.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2346,7 +2346,8 @@ llama_context * llama_init_from_model(
23462346
return nullptr;
23472347
}
23482348

2349-
if (params.pooling_type != model->hparams.pooling_type) {
2349+
if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
2350+
params.pooling_type != model->hparams.pooling_type) {
23502351
//user-specified pooling-type is different from the model default
23512352
LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
23522353
model->hparams.pooling_type, params.pooling_type);

tools/server/public/index.html.gz

4.43 KB
Binary file not shown.

0 commit comments

Comments
 (0)