Nexesenex
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
Lines changed: 22 additions & 0 deletions
diff --git a/src/llama-batch.h b/src/llama-batch.h
Lines changed: 1 addition & 1 deletion
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
4.43 KB
@@ -598,6 +598,26 @@ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor
     return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 
+// free the extra space at the end if the new tensor is smaller: when node reuses parent's allocation in place, hand the unused tail (parent_size - node_size bytes) back to the parent's allocator
+static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
+    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); // record of the reusing tensor; hn->buffer_id is read below, so it must already be set
+    struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); // record of the tensor whose allocation is being reused
+
+    size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent); // size reported by the parent's buffer type for parent
+    size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); // size reported by node's buffer type for node
+
+    GGML_ASSERT(parent_size >= node_size); // this helper only handles a node that is no larger than the parent it reuses
+
+    if (parent_size > node_size) {
+        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id]; // the tail is released through the parent's allocator
+        struct buffer_address p_addr = p_hn->addr; // local copy: p_hn->addr itself is left unchanged
+        p_addr.offset += node_size; // start of the unused tail, right after the bytes node occupies
+        size_t extra_size = parent_size - node_size;
+        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
+        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent); // NOTE(review): offset and size are not rounded to any allocator alignment here - confirm the freed chunk stays aligned
+    }
+}
+
 static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
     GGML_ASSERT(buffer_id >= 0);
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
@@ -643,13 +663,15 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                             hn->addr = p_hn->addr;
                             p_hn->allocated = false; // avoid freeing the parent
                             view_src_hn->allocated = false;
+                            ggml_gallocr_free_extra_space(galloc, node, view_src);
                             return;
                         }
                     } else {
                         AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                         hn->buffer_id = p_hn->buffer_id;
                         hn->addr = p_hn->addr;
                         p_hn->allocated = false; // avoid freeing the parent
+                        ggml_gallocr_free_extra_space(galloc, node, parent);
                         return;
                     }
                 }
 
@@ -123,7 +123,7 @@ class llama_batch_allocr {
     uint32_t n_seq_max;
     uint32_t n_outputs;
 
-    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+    std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id
 
     std::vector<llama_pos>      pos;
     std::vector<int32_t>        n_seq_id;
 
@@ -2346,7 +2346,8 @@ llama_context * llama_init_from_model(
         return nullptr;
     }
 
-    if (params.pooling_type != model->hparams.pooling_type) {
+    if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
+        params.pooling_type != model->hparams.pooling_type) {
         //user-specified pooling-type is different from the model default
         LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
                        model->hparams.pooling_type, params.pooling_type);
Original file line number	Diff line number	Diff line change
`@@ -2346,7 +2346,8 @@ llama_context * llama_init_from_model(`
`2346`	`2346`	`return nullptr;`
`2347`	`2347`	`}`
`2348`	`2348`
`2349`		`- if (params.pooling_type != model->hparams.pooling_type) {`
	`2349`	`+ if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&`
	`2350`	`+ params.pooling_type != model->hparams.pooling_type) {`
`2350`	`2351`	`//user-specified pooling-type is different from the model default`
`2351`	`2352`	`LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,`
`2352`	`2353`	`model->hparams.pooling_type, params.pooling_type);`