Commit c7efdc6

Latest commits, makefile fixes for vulkan shader compilation
1 parent 8aead97 commit c7efdc6

45 files changed: +150439 -145493 lines changed

Makefile

Lines changed: 3 additions & 6 deletions

@@ -106,10 +106,7 @@ endif
 base = base
 ggmlsrc_f = $(base)/ggml
 llamacpp_f = $(base)/llama
-#common_f = $(base)/common
 common_f = $(base)
-# ggmlsrc_f = base/ggml_llamacpp
-# llamacpp_f = base/ggml_llamacpp


 # IMGUI_DIR = imgui
@@ -319,12 +316,12 @@ endif

 ifndef SDL2
 LIBS += -lshell32 -lvulkan
-LDFLAGS_VK = -lvulkan
+LDFLAGS_VK = $(shell pkg-config --libs vulkan)
 #LDFLAGS_VK+ = -lclblast
 else
 #LDFLAGS_VK =
 #LDFLAGS_VK+ = -lvulkan-1 -lclblast
-LDFLAGS_VK+ = -lvulkan
+LDFLAGS_VK+ = $(shell pkg-config --libs vulkan)
 #CXXFLAGS_UI += -I$(VULKAN_DIR)/include
 endif

@@ -672,7 +669,7 @@ OBJS_VK = \
 $(TMP)vkt_sgemm.o

 GLSLC_CMD = glslc
-_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
+_ggml_vk_genshaders_cmd = $(shell pwd)/vkt-shaders-gen
 _ggml_vk_header = $(ggmlsrc_f)/ggml-vulkan-shaders.hpp
 _ggml_vk_source = $(ggmlsrc_f)/ggml-vulkan-shaders.cpp
 _ggml_vk_input_dir = $(ggmlsrc_f)/vulkan-shaders
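Note: replacing the hard-coded -lvulkan with $(shell pkg-config --libs vulkan) lets the link flags follow whatever Vulkan SDK is actually installed. A quick sanity check, assuming the Vulkan loader ships a vulkan.pc file (the output shown is the usual case, not guaranteed):

    $ pkg-config --libs vulkan
    -lvulkan

If pkg-config cannot locate vulkan.pc, the $(shell ...) call expands to an empty string and the Vulkan objects fail to link; pointing PKG_CONFIG_PATH at the SDK's pkgconfig directory is the usual fix.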

base/common.cpp

Lines changed: 10 additions & 7 deletions

@@ -1108,13 +1108,14 @@ void llama_batch_add(
     batch.n_tokens++;
 }

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);

     llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }

     auto cparams = llama_context_params_from_gpt_params(params);
@@ -1123,7 +1124,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }

     if (!params.control_vectors.empty()) {
@@ -1134,7 +1135,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }

         int err = llama_control_vector_apply(lctx,
@@ -1146,7 +1147,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }

@@ -1164,7 +1165,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         // fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
         // llama_free(lctx);
         // llama_free_model(model);
-        // return std::make_tuple(nullptr, nullptr);
+        // return iparams;
         // }
     // }

@@ -1200,7 +1201,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         llama_reset_timings(lctx);
     }

-    return std::make_tuple(model, lctx);
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
 }

 //

base/common.h

Lines changed: 6 additions & 1 deletion

@@ -163,7 +163,12 @@ void process_escapes(std::string& input);
 //

 // TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);

 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
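For downstream callers, the switch away from std::tuple means call sites stop using std::tie / std::get and read the two named members instead. A minimal call-site sketch under the new API, assuming common.h is available; the function and variable names here are illustrative, not from this commit:

    #include "common.h"

    static int init_example(gpt_params & params) {
        // before this commit:
        //   llama_model * model; llama_context * ctx;
        //   std::tie(model, ctx) = llama_init_from_gpt_params(params);

        // after this commit:
        llama_init_result init = llama_init_from_gpt_params(params);
        if (init.model == nullptr || init.context == nullptr) {
            // on failure the default-initialized struct is returned,
            // so both members remain nullptr
            return 1;
        }
        llama_model   * model = init.model;
        llama_context * ctx   = init.context;
        // ... use model/ctx, then free as before ...
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }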

base/ggml/ggml-aarch64.c

Lines changed: 14 additions & 14 deletions

@@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }

base/ggml/ggml-common.h

Lines changed: 5 additions & 1 deletion

@@ -19,7 +19,11 @@ typedef half2 ggml_half2;

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
+#if defined(GGML_COMMON_DECL_MUSA)
+#include <musa_fp16.h>
+#else
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>

 typedef half ggml_half;
@@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_END() };

 #define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
+#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
 #include <cstdint>

 #define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {

base/ggml/ggml-impl.h

Lines changed: 4 additions & 6 deletions

@@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
 /**
  * Converts float32 to brain16.
  *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
  */
 static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
         h.bits = (u.i >> 16) | 64; /* force to quiet */
         return h;
     }
-    if (!(u.i & 0x7f800000)) { /* subnormal */
-        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-        return h;
-    }
     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
     return h;
 }
@@ -146,6 +143,7 @@ extern "C" {

 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
+#include <sys/prctl.h>
 #endif

 // 16-bit float
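The retained line h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; is what gives the round-to-nearest-even behaviour the updated comment describes: adding 0x7fff plus the lowest kept mantissa bit biases the truncation so that exact ties round to the even bf16 value. A small standalone check of that rounding, not part of the commit:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* same rounding as the retained path of ggml_compute_fp32_to_bf16,
       without the NaN special case */
    static uint16_t fp32_to_bf16_rne(float f) {
        uint32_t u;
        memcpy(&u, &f, sizeof(u));
        return (uint16_t) ((u + (0x7fff + ((u >> 16) & 1))) >> 16);
    }

    int main(void) {
        /* 1.0f maps to 0x3f80; 1.00390625f (1 + 2^-8) is exactly halfway
           between 0x3f80 and 0x3f81 and rounds to the even result 0x3f80 */
        printf("%04x %04x\n", fp32_to_bf16_rne(1.0f), fp32_to_bf16_rne(1.00390625f));
        return 0;
    }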

base/ggml/ggml-quants.c

Lines changed: 8 additions & 8 deletions

@@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);

@@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);

@@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         // compute mask for subtraction
         vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-        vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+        vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
         m <<= 1;

         vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-        vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+        vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
         m <<= 1;

         vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-        vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+        vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
         m <<= 1;

         vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-        vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+        vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
         m <<= 1;

         // load Q8 and take product with Q3
@@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
         vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-        vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+        vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
         m <<= 1;

         vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
         vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-        vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+        vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
         m <<= 1;

         vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);

base/ggml/ggml-quants.h

Lines changed: 4 additions & 0 deletions

@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);

+#if defined(__ARM_FEATURE_SVE)
+extern int ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
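ggml_sve_cnt_b caches the SVE vector length in bytes, so the kernels above compare a plain global against QK8_0 (32, i.e. a 256-bit vector) instead of calling svcntw()/svcntb() on every dot product. The variable's definition and initialization are not shown in this commit; a plausible sketch, assuming it is filled in once during backend startup (the helper name below is illustrative):

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>

    int ggml_sve_cnt_b = 0;   /* declared extern in ggml-quants.h */

    static void ggml_sve_init(void) {
        /* svcntb() returns the runtime SVE register width in bytes;
           32 bytes == 256 bits, which is the case QK8_0 == 32 selects above.
           The <sys/prctl.h> include added in ggml-impl.h suggests the real
           initializer may instead read the length via prctl(PR_SVE_GET_VL). */
        ggml_sve_cnt_b = (int) svcntb();
    }
    #endif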
