Skip to content

Commit a7f1c7a

Browse files
authored
Merge b3509 (2 parents: 8942475 + ecf6b7f) — commit a7f1c7a

File tree

16 files changed

+224
-75
lines changed

16 files changed

+224
-75
lines changed

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def prepare_tensors(self):
316316
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
317317
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
318318
data = gguf.quantize_bf16(data)
319-
assert data.dtype == np.int16
319+
assert data.dtype == np.uint16
320320
data_qtype = gguf.GGMLQuantizationType.BF16
321321

322322
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):

docs/build.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,11 @@ For Jetson users, if you have Jetson Orin, you can try this: [Official Support](ht
178178
cmake --build build --config Release
179179
```
180180
181-
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
181+
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
182+
183+
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
184+
185+
The following compilation options are also available to tweak performance:
182186
183187
| Option | Legal values | Default | Description |
184188
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

examples/baby-llama/baby-llama.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#include "ggml.h"
22
#include "train.h"
33

4-
#include <vector>
54
#include <cassert>
65
#include <cstdlib>
76
#include <cstring>

examples/batched-bench/batched-bench.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
6969
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
7070

7171
// ensure enough sequences are available
72-
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
72+
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
7373

7474
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
7575

flake.lock

Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ extern "C" {
349349
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
350350
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
351351
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
352+
GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
352353
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
353354

354355
struct ggml_object;

ggml/src/ggml-aarch64.c

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
384384
UNUSED(blocklen);
385385

386386
#if defined(__ARM_FEATURE_SVE)
387-
if (svcntw() == 8) {
388-
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
387+
if (ggml_sve_cnt_b == QK8_0) {
388+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
389389
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
390390
}
391391
#endif
@@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
496496
UNUSED(blocklen);
497497

498498
#if defined(__ARM_FEATURE_SVE)
499-
if (svcntw() == 8) {
500-
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
499+
if (ggml_sve_cnt_b == QK8_0) {
500+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
501501
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
502502
}
503503
#endif
@@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
614614
UNUSED(blocklen);
615615

616616
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
617-
if (svcntw() == 8) {
617+
if (ggml_sve_cnt_b == QK8_0) {
618618
const void * b_ptr = vx;
619619
const void * a_ptr = vy;
620620
float * res_ptr = s;
@@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
680680
return;
681681
}
682682
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
683-
GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
683+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
684684
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
685685
"performance");
686686
}
687687
else if (ggml_cpu_has_neon()) {
688-
GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
688+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
689689
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
690690
"quantization format for optimal performance");
691691
}
@@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
745745
UNUSED(blocklen);
746746

747747
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
748-
if (svcntw() == 8) {
749-
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
748+
if (ggml_sve_cnt_b == QK8_0) {
749+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
750750
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
751751
}
752752
#endif
@@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
12661266
UNUSED(blocklen);
12671267

12681268
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1269-
if (svcntw() == 8) {
1270-
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
1269+
if (ggml_sve_cnt_b == QK8_0) {
1270+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
12711271
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
12721272
}
12731273
#endif
@@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
17281728
UNUSED(blocklen);
17291729

17301730
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1731-
if (svcntw() == 8) {
1731+
if (ggml_sve_cnt_b == QK8_0) {
17321732
const void * b_ptr = vx;
17331733
const void * a_ptr = vy;
17341734
float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
21392139
return;
21402140
}
21412141
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2142-
GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
2142+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
21432143
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
21442144
"performance");
21452145
}
21462146
else if (ggml_cpu_has_neon()) {
2147-
GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
2147+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
21482148
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
21492149
"quantization format for optimal performance");
21502150
}

0 commit comments

Comments (0)