Skip to content

Commit a7f1c7a

Browse files
authored
Merge b3509 (2 parents: 8942475 + ecf6b7f) — commit a7f1c7a

File tree

16 files changed

+224
-75
lines changed

16 files changed

+224
-75
lines changed

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def prepare_tensors(self):
316316
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
317317
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
318318
data = gguf.quantize_bf16(data)
319-
assert data.dtype == np.int16
319+
assert data.dtype == np.uint16
320320
data_qtype = gguf.GGMLQuantizationType.BF16
321321

322322
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):

docs/build.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,11 @@ For Jetson users, if you have Jetson Orin, you can try this: [Official Support](ht
178178
cmake --build build --config Release
179179
```
180180
181-
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
181+
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
182+
183+
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
184+
185+
The following compilation options are also available to tweak performance:
182186
183187
| Option | Legal values | Default | Description |
184188
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

examples/baby-llama/baby-llama.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#include "ggml.h"
22
#include "train.h"
33

4-
#include <vector>
54
#include <cassert>
65
#include <cstdlib>
76
#include <cstring>

examples/batched-bench/batched-bench.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
6969
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
7070

7171
// ensure enough sequences are available
72-
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
72+
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
7373

7474
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
7575

flake.lock

Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ extern "C" {
349349
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
350350
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
351351
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
352+
GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
352353
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
353354

354355
struct ggml_object;

ggml/src/ggml-aarch64.c

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
384384
UNUSED(blocklen);
385385

386386
#if defined(__ARM_FEATURE_SVE)
387-
if (svcntw() == 8) {
388-
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
387+
if (ggml_sve_cnt_b == QK8_0) {
388+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
389389
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
390390
}
391391
#endif
@@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
496496
UNUSED(blocklen);
497497

498498
#if defined(__ARM_FEATURE_SVE)
499-
if (svcntw() == 8) {
500-
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
499+
if (ggml_sve_cnt_b == QK8_0) {
500+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
501501
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
502502
}
503503
#endif
@@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
614614
UNUSED(blocklen);
615615

616616
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
617-
if (svcntw() == 8) {
617+
if (ggml_sve_cnt_b == QK8_0) {
618618
const void * b_ptr = vx;
619619
const void * a_ptr = vy;
620620
float * res_ptr = s;
@@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
680680
return;
681681
}
682682
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
683-
GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
683+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
684684
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
685685
"performance");
686686
}
687687
else if (ggml_cpu_has_neon()) {
688-
GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
688+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
689689
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
690690
"quantization format for optimal performance");
691691
}
@@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
745745
UNUSED(blocklen);
746746

747747
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
748-
if (svcntw() == 8) {
749-
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
748+
if (ggml_sve_cnt_b == QK8_0) {
749+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
750750
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
751751
}
752752
#endif
@@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
12661266
UNUSED(blocklen);
12671267

12681268
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1269-
if (svcntw() == 8) {
1270-
GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
1269+
if (ggml_sve_cnt_b == QK8_0) {
1270+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
12711271
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
12721272
}
12731273
#endif
@@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
17281728
UNUSED(blocklen);
17291729

17301730
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1731-
if (svcntw() == 8) {
1731+
if (ggml_sve_cnt_b == QK8_0) {
17321732
const void * b_ptr = vx;
17331733
const void * a_ptr = vy;
17341734
float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
21392139
return;
21402140
}
21412141
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2142-
GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
2142+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
21432143
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
21442144
"performance");
21452145
}
21462146
else if (ggml_cpu_has_neon()) {
2147-
GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
2147+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
21482148
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
21492149
"quantization format for optimal performance");
21502150
}

0 commit comments

Comments (0)