Skip to content

Commit da5dad1

Browse files
authored
feat: sync llama.cpp (#73)
* feat: sync llama.cpp * feat: sync llama.cpp
1 parent 1d6a492 commit da5dad1

19 files changed

+906
-858
lines changed

cpp/common.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1330,6 +1330,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
13301330
else { invalid_param = true; }
13311331
return true;
13321332
}
1333+
if (arg == "--no-warmup") {
1334+
params.warmup = false;
1335+
return true;
1336+
}
13331337
#ifndef LOG_DISABLE_LOGS
13341338
// Parse args for logging parameters
13351339
if (log_param_single_parse(argv[i])) {
@@ -1452,6 +1456,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
14521456
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
14531457
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
14541458
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
1459+
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
14551460
options.push_back({ "server infill",
14561461
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
14571462

@@ -1635,7 +1640,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
16351640
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
16361641
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
16371642
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
1638-
options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
1643+
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
16391644
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
16401645
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
16411646
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });

cpp/ggml-aarch64.c

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -384,8 +384,8 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
384384
UNUSED(blocklen);
385385

386386
#if defined(__ARM_FEATURE_SVE)
387-
if (svcntw() == 8) {
388-
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
387+
if (lm_ggml_sve_cnt_b == QK8_0) {
388+
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
389389
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
390390
}
391391
#endif
@@ -496,8 +496,8 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
496496
UNUSED(blocklen);
497497

498498
#if defined(__ARM_FEATURE_SVE)
499-
if (svcntw() == 8) {
500-
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
499+
if (lm_ggml_sve_cnt_b == QK8_0) {
500+
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
501501
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
502502
}
503503
#endif
@@ -614,7 +614,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
614614
UNUSED(blocklen);
615615

616616
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
617-
if (svcntw() == 8) {
617+
if (lm_ggml_sve_cnt_b == QK8_0) {
618618
const void * b_ptr = vx;
619619
const void * a_ptr = vy;
620620
float * res_ptr = s;
@@ -680,12 +680,12 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
680680
return;
681681
}
682682
else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
683-
LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
683+
LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
684684
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
685685
"performance");
686686
}
687687
else if (lm_ggml_cpu_has_neon()) {
688-
LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
688+
LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
689689
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
690690
"quantization format for optimal performance");
691691
}
@@ -745,8 +745,8 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
745745
UNUSED(blocklen);
746746

747747
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
748-
if (svcntw() == 8) {
749-
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
748+
if (lm_ggml_sve_cnt_b == QK8_0) {
749+
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
750750
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
751751
}
752752
#endif
@@ -1266,8 +1266,8 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
12661266
UNUSED(blocklen);
12671267

12681268
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1269-
if (svcntw() == 8) {
1270-
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
1269+
if (lm_ggml_sve_cnt_b == QK8_0) {
1270+
LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
12711271
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
12721272
}
12731273
#endif
@@ -1728,7 +1728,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
17281728
UNUSED(blocklen);
17291729

17301730
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1731-
if (svcntw() == 8) {
1731+
if (lm_ggml_sve_cnt_b == QK8_0) {
17321732
const void * b_ptr = vx;
17331733
const void * a_ptr = vy;
17341734
float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
21392139
return;
21402140
}
21412141
else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
2142-
LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (svcntw() == 8)) &&
2142+
LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
21432143
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
21442144
"performance");
21452145
}
21462146
else if (lm_ggml_cpu_has_neon()) {
2147-
LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (svcntw() == 8)) || lm_ggml_cpu_has_matmul_int8()) &&
2147+
LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
21482148
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
21492149
"quantization format for optimal performance");
21502150
}

cpp/ggml-common.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@ typedef half2 lm_ggml_half2;
1919

2020
#define LM_GGML_COMMON_DECL
2121
#elif defined(LM_GGML_COMMON_DECL_CUDA)
22+
#if defined(LM_GGML_COMMON_DECL_MUSA)
23+
#include <musa_fp16.h>
24+
#else
2225
#include <cuda_fp16.h>
26+
#endif
2327
#include <cstdint>
2428

2529
typedef half lm_ggml_half;
@@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) +
415419
#define LM_GGML_TABLE_END() };
416420

417421
#define LM_GGML_COMMON_IMPL
418-
#elif defined(LM_GGML_COMMON_IMPL_CUDA) || defined(LM_GGML_COMMON_IMPL_HIP)
422+
#elif defined(LM_GGML_COMMON_IMPL_CUDA) || defined(LM_GGML_COMMON_IMPL_HIP) || defined(LM_GGML_COMMON_IMPL_MUSA)
419423
#include <cstdint>
420424

421425
#define LM_GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {

cpp/ggml-impl.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,9 @@ static inline float lm_ggml_compute_bf16_to_fp32(lm_ggml_bf16_t h) {
8080
/**
8181
* Converts float32 to brain16.
8282
*
83-
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
84-
* Subnormals shall be flushed to zero, and NANs will be quiet.
83+
* This is binary identical with Google Brain float conversion.
84+
* Floats shall round to nearest even, and NANs shall be quiet.
85+
* Subnormals aren't flushed to zero, except perhaps when used.
8586
* This code should vectorize nicely if using modern compilers.
8687
*/
8788
static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
9596
h.bits = (u.i >> 16) | 64; /* force to quiet */
9697
return h;
9798
}
98-
if (!(u.i & 0x7f800000)) { /* subnormal */
99-
h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
100-
return h;
101-
}
10299
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
103100
return h;
104101
}
@@ -146,6 +143,7 @@ extern "C" {
146143

147144
#if defined(__ARM_FEATURE_SVE)
148145
#include <arm_sve.h>
146+
#include <sys/prctl.h>
149147
#endif
150148

151149
// 16-bit float

cpp/ggml-quants.c

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
38183818
float sumf = 0;
38193819

38203820
#if defined(__ARM_FEATURE_SVE)
3821-
if (svcntb() == QK8_0) {
3821+
if (lm_ggml_sve_cnt_b == QK8_0) {
38223822
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
38233823
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
38243824

@@ -4190,15 +4190,18 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
41904190
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
41914191
#endif
41924192
for (; ib < nb; ++ib) {
4193-
int sumi = 0;
4193+
int sumi0 = 0;
4194+
int sumi1 = 0;
41944195

41954196
for (int j = 0; j < qk/2; ++j) {
41964197
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
41974198
const int v1 = (x[ib].qs[j] >> 4) - 8;
41984199

4199-
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
4200+
sumi0 += (v0 * y[ib].qs[j]);
4201+
sumi1 += (v1 * y[ib].qs[j + qk/2]);
42004202
}
42014203

4204+
int sumi = sumi0 + sumi1;
42024205
sumf += sumi*LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d);
42034206
}
42044207

@@ -4474,15 +4477,18 @@ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void
44744477
sumf = hsum_float_8(acc) + summs;
44754478
#endif
44764479
for (; ib < nb; ++ib) {
4477-
int sumi = 0;
4480+
int sumi0 = 0;
4481+
int sumi1 = 0;
44784482

44794483
for (int j = 0; j < qk/2; ++j) {
44804484
const int v0 = (x[ib].qs[j] & 0x0F);
44814485
const int v1 = (x[ib].qs[j] >> 4);
44824486

4483-
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
4487+
sumi0 += (v0 * y[ib].qs[j]);
4488+
sumi1 += (v1 * y[ib].qs[j + qk/2]);
44844489
}
44854490

4491+
int sumi = sumi0 + sumi1;
44864492
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_FP16_TO_FP32(x[ib].m)*LM_GGML_FP16_TO_FP32(y[ib].s);
44874493
}
44884494

@@ -4823,18 +4829,21 @@ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void
48234829
uint32_t qh;
48244830
memcpy(&qh, x[ib].qh, sizeof(qh));
48254831

4826-
int sumi = 0;
4832+
int sumi0 = 0;
4833+
int sumi1 = 0;
48274834

48284835
for (int j = 0; j < qk/2; ++j) {
48294836
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
48304837
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
48314838

4832-
const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16;
4833-
const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16;
4839+
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
4840+
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
48344841

4835-
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
4842+
sumi0 += (x0 * y[ib].qs[j]);
4843+
sumi1 += (x1 * y[ib].qs[j + qk/2]);
48364844
}
48374845

4846+
int sumi = sumi0 + sumi1;
48384847
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d)) * sumi;
48394848
}
48404849

@@ -5194,7 +5203,8 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void
51945203
uint32_t qh;
51955204
memcpy(&qh, x[ib].qh, sizeof(qh));
51965205

5197-
int sumi = 0;
5206+
int sumi0 = 0;
5207+
int sumi1 = 0;
51985208

51995209
for (int j = 0; j < qk/2; ++j) {
52005210
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
@@ -5203,9 +5213,11 @@ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void
52035213
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
52045214
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
52055215

5206-
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
5216+
sumi0 += (x0 * y[ib].qs[j]);
5217+
sumi1 += (x1 * y[ib].qs[j + qk/2]);
52075218
}
52085219

5220+
int sumi = sumi0 + sumi1;
52095221
sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d))*sumi + LM_GGML_FP16_TO_FP32(x[ib].m)*LM_GGML_FP16_TO_FP32(y[ib].s);
52105222
}
52115223

@@ -5291,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
52915303
float sumf = 0;
52925304

52935305
#if defined(__ARM_FEATURE_SVE)
5294-
if (svcntb() == QK8_0) {
5306+
if (lm_ggml_sve_cnt_b == QK8_0) {
52955307
svfloat32_t sumv0 = svdup_n_f32(0.0f);
52965308
svfloat32_t sumv1 = svdup_n_f32(0.0f);
52975309

@@ -6437,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
64376449
// compute mask for subtraction
64386450
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
64396451
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
6440-
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
6452+
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
64416453
m <<= 1;
64426454

64436455
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
64446456
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
6445-
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
6457+
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
64466458
m <<= 1;
64476459

64486460
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
64496461
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
6450-
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
6462+
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
64516463
m <<= 1;
64526464

64536465
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
64546466
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
6455-
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
6467+
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
64566468
m <<= 1;
64576469

64586470
// load Q8 and take product with Q3
@@ -7708,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
77087720
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
77097721
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
77107722
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
7711-
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
7723+
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
77127724
m <<= 1;
77137725

77147726
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
77157727
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
77167728
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
7717-
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
7729+
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
77187730
m <<= 1;
77197731

77207732
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);

cpp/ggml-quants.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
127127
void iq3xs_init_impl(int grid_size);
128128
void iq3xs_free_impl(int grid_size);
129129

130+
#if defined(__ARM_FEATURE_SVE)
131+
extern int lm_ggml_sve_cnt_b;
132+
#endif
133+
130134
#ifdef __cplusplus
131135
}
132136
#endif

0 commit comments

Comments (0)