Commit e729ace

Merge branch 'ggml-org:master' into glm45v-2

2 parents: deb1399 + a5c07dc


48 files changed: +4638 additions, -4690 deletions

.github/labeler.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -76,6 +76,10 @@ ggml:
   - changed-files:
     - any-glob-to-any-file:
       - ggml/**
+model:
+  - changed-files:
+    - any-glob-to-any-file:
+      - src/models/**
 nix:
   - changed-files:
     - any-glob-to-any-file:
```

README.md

Lines changed: 3 additions & 4 deletions
```diff
@@ -17,14 +17,13 @@ LLM inference in C/C++
 
 ## Hot topics
 
-- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
-- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
+- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
+- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
+- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
 - Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
-- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
-- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
```

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -2108,6 +2108,7 @@ extern "C" {
     enum ggml_scale_mode {
         GGML_SCALE_MODE_NEAREST = 0,
         GGML_SCALE_MODE_BILINEAR = 1,
+        GGML_SCALE_MODE_BICUBIC = 2,
 
         GGML_SCALE_MODE_COUNT
     };
```
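The new enum value is consumed by the CPU upscale kernel below, which decodes the interpolation mode from the low byte of a single packed op parameter (`mode_flags & 0xFF`) and treats higher bits as flags. A small standalone sketch of that packing scheme, assuming only what the decoding in ops.cpp implies; the enum names and the flag bit here are illustrative stand-ins, not copied from ggml.h:

```cpp
#include <cstdint>
#include <cstdio>

// Sketch: a scale mode and extra flags sharing one int32 op parameter,
// consistent with the `mode_flags & 0xFF` decoding in ggml-cpu/ops.cpp.
// The flag value is an assumption for illustration, not taken from ggml.h.
enum scale_mode { SCALE_NEAREST = 0, SCALE_BILINEAR = 1, SCALE_BICUBIC = 2 };
enum scale_flag { SCALE_FLAG_ALIGN_CORNERS = 1 << 8 };  // assumed: flags live above the mode byte

int main() {
    const int32_t packed = SCALE_BICUBIC | SCALE_FLAG_ALIGN_CORNERS;

    const scale_mode mode          = (scale_mode)(packed & 0xFF);
    const bool       align_corners = (packed & SCALE_FLAG_ALIGN_CORNERS) != 0;

    printf("mode = %d, align_corners = %d\n", (int) mode, (int) align_corners);
    return 0;
}
```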

ggml/src/ggml-cpu/ops.cpp

Lines changed: 52 additions & 7 deletions
```diff
@@ -7507,10 +7507,17 @@ static void ggml_compute_forward_upscale_f32(
     float sf1 = (float)ne1/src0->ne[1];
     float sf2 = (float)ne2/src0->ne[2];
     float sf3 = (float)ne3/src0->ne[3];
+    float pixel_offset = 0.5f;
 
     const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
     const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
 
+    if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+        pixel_offset = 0.0f;
+        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
+        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
+    }
+
    if (mode == GGML_SCALE_MODE_NEAREST) {
        for (int64_t i3 = 0; i3 < ne3; i3++) {
            const int64_t i03 = i3 / sf3;
```
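This hunk hoists `pixel_offset` and the align-corners rescaling out of the bilinear branch so they apply to every interpolation mode (the bilinear-local copy is removed in the next hunk). A standalone sketch of the two destination-to-source coordinate mappings this enables; it is not the committed code, and the sizes are illustrative:

```cpp
#include <cstdint>
#include <cstdio>

// Half-pixel centers: x_src = (x_dst + 0.5) / sf - 0.5
// Align-corners:      x_src = x_dst / sf_ac, with sf_ac = (ne_dst - 1) / (ne_src - 1),
//                     so the first/last destination samples land on the first/last source samples.
int main() {
    const int64_t ne_src = 4, ne_dst = 8;

    // default (half-pixel) mapping, pixel_offset = 0.5
    float sf = (float) ne_dst / ne_src;
    float pixel_offset = 0.5f;
    for (int64_t i = 0; i < ne_dst; i++) {
        const float x = ((float) i + pixel_offset) / sf - pixel_offset;
        printf("half-pixel:    dst %lld -> src %.3f\n", (long long) i, x);
    }

    // align-corners mapping, pixel_offset = 0
    sf = (float) (ne_dst - 1) / (ne_src - 1);
    pixel_offset = 0.0f;
    for (int64_t i = 0; i < ne_dst; i++) {
        const float x = ((float) i + pixel_offset) / sf - pixel_offset;
        printf("align-corners: dst %lld -> src %.3f\n", (long long) i, x);
    }
    return 0;
}
```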
```diff
@@ -7530,13 +7537,6 @@ static void ggml_compute_forward_upscale_f32(
            }
        }
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
-        float pixel_offset = 0.5f;
-        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
-            pixel_offset = 0.0f;
-            sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
-            sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
-        }
-
        for (int64_t i3 = 0; i3 < ne3; i3++) {
            const int64_t i03 = i3 / sf3;
            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
```
```diff
@@ -7571,6 +7571,51 @@ static void ggml_compute_forward_upscale_f32(
 
                        const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
 
+                        float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                        *y_dst = val;
+                    }
+                }
+            }
+        }
+    } else if (mode == GGML_SCALE_MODE_BICUBIC) {
+        // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+        const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
+        auto weight1 = [a](float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
+        auto weight2 = [a](float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
+        auto bicubic = [=](float p0, float p1, float p2, float p3, float x) {
+            const float w0 = weight2(x + 1);
+            const float w1 = weight1(x + 0);
+            const float w2 = weight1(1 - x);
+            const float w3 = weight2(2 - x);
+            return p0*w0 + p1*w1 + p2*w2 + p3*w3;
+        };
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
+                    const int64_t y0 = (int64_t)floorf(y);
+                    const float dy = y - (float)y0;
+
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
+                        const int64_t x0 = (int64_t)floorf(x);
+                        const float dx = x - (float)x0;
+
+                        auto p = [=](int64_t x_off, int64_t y_off) -> float {
+                            int64_t i00 = std::max(int64_t(0), std::min(x0 + x_off, ne00 - 1));
+                            int64_t i01 = std::max(int64_t(0), std::min(y0 + y_off, ne01 - 1));
+                            return *(const float *)((const char *)src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        };
+
+                        const float val = bicubic(
+                            bicubic(p(-1,-1), p(0,-1), p(1,-1), p(2,-1), dx),
+                            bicubic(p(-1, 0), p(0, 0), p(1, 0), p(2, 0), dx),
+                            bicubic(p(-1, 1), p(0, 1), p(1, 1), p(2, 1), dx),
+                            bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy);
+
                        float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
                        *y_dst = val;
                    }
```
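The new `GGML_SCALE_MODE_BICUBIC` branch is a separable application of Keys' cubic convolution kernel with alpha = -0.75 (the same value PyTorch uses), evaluated first along x and then along y via the nested `bicubic(...)` calls. A minimal standalone sketch of the 1-D kernel, using only the formulas visible in the diff; `cubic_weight` and `cubic_interp` are illustrative names, not part of the commit:

```cpp
#include <cmath>
#include <cstdio>

// Keys cubic convolution kernel with alpha = -0.75, matching weight1/weight2 above:
//   |t| <= 1    : (alpha + 2)|t|^3 - (alpha + 3)|t|^2 + 1
//   1 < |t| < 2 : alpha|t|^3 - 5*alpha|t|^2 + 8*alpha|t| - 4*alpha
static float cubic_weight(float t) {
    const float a = -0.75f;
    t = std::fabs(t);
    if (t <= 1.0f) {
        return ((a + 2.0f) * t - (a + 3.0f)) * t * t + 1.0f;
    }
    if (t < 2.0f) {
        return ((a * t - 5.0f * a) * t + 8.0f * a) * t - 4.0f * a;
    }
    return 0.0f;
}

// Interpolate between p1 and p2 at fractional position x in [0,1),
// using the two extra neighbours p0 and p3.
static float cubic_interp(float p0, float p1, float p2, float p3, float x) {
    return p0 * cubic_weight(x + 1.0f) +
           p1 * cubic_weight(x) +
           p2 * cubic_weight(1.0f - x) +
           p3 * cubic_weight(2.0f - x);
}

int main() {
    // The four weights always sum to 1, so constant signals pass through unchanged.
    for (float x = 0.0f; x < 1.0f; x += 0.25f) {
        const float s = cubic_weight(x + 1) + cubic_weight(x) + cubic_weight(1 - x) + cubic_weight(2 - x);
        printf("x = %.2f  interp(1,2,4,3) = %.4f  weight sum = %.4f\n",
               x, cubic_interp(1, 2, 4, 3, x), s);
    }
    return 0;
}
```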

ggml/src/ggml-cpu/repack.cpp

Lines changed: 25 additions & 0 deletions
```diff
@@ -1678,10 +1678,24 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
        int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
        int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
 
+        // Ensure minimum chunk size to avoid alignment issues with high thread counts
+        // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
+        const int64_t min_chunk_size = NB_COLS;
+        if (nchunk > 0 && (nr / nchunk) < min_chunk_size && nr >= min_chunk_size) {
+            nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
+        }
+
        if (nth == 1 || nchunk < nth || disable_chunking) {
            nchunk = nth;
        }
 
+        // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
+        // This prevents creating too many tiny chunks that could overlap after alignment
+        const int64_t max_nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
+        if (nchunk > max_nchunk) {
+            nchunk = max_nchunk;
+        }
+
        if (ith == 0) {
            // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
            ggml_threadpool_chunk_set(params->threadpool, nth);
@@ -1695,8 +1709,15 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
        while (current_chunk < nchunk) {
            int64_t src0_start = (current_chunk * ne01) / nchunk;
            int64_t src0_end = ((current_chunk + 1) * ne01) / nchunk;
+
+            // Align boundaries to NB_COLS - round up to ensure all data is included
+            // The chunk size limiting above ensures chunks are large enough to prevent overlaps
            src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
            src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
+            if (src0_end > ne01) {
+                src0_end = ne01;
+            }
+
            if (src0_start >= src0_end) {
                break;
            }
@@ -1808,8 +1829,12 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
    int64_t src0_cur_start = (ith * ne01) / nth;
    int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
 
+    // Align boundaries to NB_COLS - round up to ensure all data is included
    src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
    src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
+    if (src0_cur_end > ne01) {
+        src0_cur_end = ne01;
+    }
 
    if (src0_cur_start >= src0_cur_end) {
        return;
```
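These repack changes cap the chunk count so every chunk holds at least `NB_COLS` rows, then round each chunk boundary up to a multiple of `NB_COLS` and clamp the end to `ne01`. A standalone sketch of that boundary arithmetic (the values of `NB_COLS`, `ne01`, and the candidate chunk count are illustrative, and `ne01` is used for both sizing and boundaries here for simplicity):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Chunk boundaries rounded up to a multiple of NB_COLS and clamped:
// chunks never overlap and together cover all ne01 rows.
int main() {
    const int64_t NB_COLS = 8;    // interleave width (illustrative)
    const int64_t ne01    = 100;  // rows to split (illustrative)
    int64_t       nchunk  = 16;   // candidate chunk count, e.g. thread count

    // limit nchunk so each chunk holds at least NB_COLS rows
    const int64_t max_nchunk = (ne01 + NB_COLS - 1) / NB_COLS;
    nchunk = std::min(nchunk, max_nchunk);

    for (int64_t chunk = 0; chunk < nchunk; chunk++) {
        int64_t start = (chunk * ne01) / nchunk;
        int64_t end   = ((chunk + 1) * ne01) / nchunk;

        // round both boundaries up to the next multiple of NB_COLS
        start = (start % NB_COLS) ? start + NB_COLS - (start % NB_COLS) : start;
        end   = (end   % NB_COLS) ? end   + NB_COLS - (end   % NB_COLS) : end;
        end   = std::min(end, ne01);  // clamp: the last chunk may be shorter

        if (start >= end) {
            continue;  // chunk became empty after alignment; nothing to do
        }
        printf("chunk %2lld: rows [%3lld, %3lld)\n",
               (long long) chunk, (long long) start, (long long) end);
    }
    return 0;
}
```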

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 17 additions & 0 deletions
```diff
@@ -2115,6 +2115,14 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
 
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
+                       ggml_backend_buft_is_cuda_split(src1->buffer->buft);
+
+    //TODO: add support for fusion for split buffers
+    if (split) {
+        return false;
+    }
+
    //we only support fusion for ncols_dst = 1
    if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
        return false;
@@ -2154,6 +2162,15 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
        return false;
    }
 
+
+    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
+                       ggml_backend_buft_is_cuda_split(src1->buffer->buft);
+
+    //TODO: add support for fusion for split buffers
+    if (split) {
+        return false;
+    }
+
    return use_mul_mat_vec_q;
 }
 
```
ggml/src/ggml-hexagon/htp/ops-utils.h

Lines changed: 17 additions & 17 deletions
```diff
@@ -43,46 +43,46 @@ static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_s
 }
 
 static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) {
-    char str[1024], *p = str;
-    p += sprintf(p, "%s: ", pref);
-    for (int i = 0; i < 16; i++) {
-        p += sprintf(p, "%d, ", x[i]);
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
+    for (int i = 0; i < n && p < p_end; i++) {
+        p += snprintf(p, p_end - p, "%d, ", x[i]);
     }
     FARF(HIGH, "%s\n", str);
 }
 
 static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) {
-    char str[1024], *p = str;
-    p += sprintf(p, "%s: ", pref);
-    for (int i = 0; i < n; i++) {
-        p += sprintf(p, "%d, ", x[i]);
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
+    for (int i = 0; i < n && p < p_end; i++) {
+        p += snprintf(p, p_end - p, "%d, ", x[i]);
     }
     FARF(HIGH, "%s\n", str);
 }
 
 static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
-    char str[1024], *p = str;
-    p += sprintf(p, "%s: ", pref);
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
     for (int i = 0; i < n; i++) {
-        p += sprintf(p, "%d, ", (int) x[i]);
+        p += snprintf(p, p_end - p, "%d, ", (int) x[i]);
     }
     FARF(HIGH, "%s\n", str);
 }
 
 static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) {
-    char str[1024], *p = str;
-    p += sprintf(p, "%s: ", pref);
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
     for (int i = 0; i < n; i++) {
-        p += sprintf(p, "%.6f, ", (float) x[i]);
+        p += snprintf(p, p_end - p, "%.6f, ", (float) x[i]);
     }
     FARF(HIGH, "%s\n", str);
 }
 
 static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) {
-    char str[1024], *p = str;
-    p += sprintf(p, "%s: ", pref);
+    char str[1024], *p = str, *p_end = str + sizeof(str);
+    p += snprintf(p, p_end - p, "%s: ", pref);
     for (int i = 0; i < n; i++) {
-        p += sprintf(p, "%.6f, ", x[i]);
+        p += snprintf(p, p_end - p, "%.6f, ", x[i]);
     }
     FARF(HIGH, "%s\n", str);
 }
```
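The Hexagon dump helpers move from unbounded `sprintf` to `snprintf` with an explicit end pointer. A standalone sketch of that bounded-append idiom, not taken from the commit; it adds an extra clamp, since `snprintf` returns the length it would have written and can push the cursor past the end of the buffer on truncation:

```cpp
#include <cstdio>

// Bounded string building: each append gets only the remaining space, and the
// cursor is clamped so the remaining size never goes negative after truncation.
int main() {
    char   buf[32];
    char * p   = buf;
    char * end = buf + sizeof(buf);

    p += snprintf(p, end - p, "%s: ", "fp32");
    for (int i = 0; i < 100 && p < end; i++) {
        p += snprintf(p, end - p, "%.6f, ", 0.125f * i);
        if (p > end) {
            p = end;  // snprintf reported truncation; stop appending
        }
    }
    printf("%s\n", buf);  // always NUL-terminated, never overflows
    return 0;
}
```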
