
Commit 04d6ff1
Merge branch 'ggml-org:master' into master
2 parents: 03c2d46 + acd6cb1

32 files changed (+1159, −201 lines)

README.md
Lines changed: 0 additions & 1 deletion

@@ -270,7 +270,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
-
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 
 ## Obtaining and quantizing models

common/arg.cpp
Lines changed: 8 additions & 1 deletion

@@ -1612,7 +1612,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.antiprompt.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sp", "--special"},
         string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2655,6 +2655,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--show-statistics"},
+        string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+        [](common_params & params) {
+            params.show_statistics = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--parse-special"},
         string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),

common/common.h
Lines changed: 4 additions & 3 deletions

@@ -432,9 +432,10 @@ struct common_params {
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
 
-    bool process_output = false; // collect data for the output tensor
-    bool compute_ppl = true; // whether to compute perplexity
-    bool parse_special = false; // whether to parse special tokens during imatrix tokenization
+    bool process_output  = false; // collect data for the output tensor
+    bool compute_ppl     = true;  // whether to compute perplexity
+    bool show_statistics = false; // show imatrix statistics per tensor
+    bool parse_special   = false; // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
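Together, the common/arg.cpp and common/common.h changes wire a new --show-statistics flag into the imatrix example. The imatrix tool itself is not part of this diff, so the following is only a sketch of how a consumer might honor the new field; common_params_parse is the real parser entry point declared in common/arg.h, while print_statistics is a hypothetical helper:

    // Sketch only (not from this commit): honoring --show-statistics after parsing.
    #include "arg.h"
    #include "common.h"

    // hypothetical helper, assumed to exist elsewhere
    void print_statistics(const common_params & params);

    int main(int argc, char ** argv) {
        common_params params;
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX)) {
            return 1;
        }
        if (params.show_statistics) {
            print_statistics(params); // hypothetical: dump per-tensor imatrix stats
            return 0;                 // "show ... and then exit", per the help text
        }
        // ... otherwise run the normal imatrix computation ...
        return 0;
    }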

ggml/src/ggml-cpu/CMakeLists.txt
Lines changed: 2 additions & 2 deletions

@@ -494,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.9.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.11.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
+        set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)

ggml/src/ggml-cpu/kleidiai/kernels.cpp
Lines changed: 109 additions & 12 deletions

@@ -22,9 +22,94 @@
 
 #include "kai_common.h"
 
+#include "simd-mappings.h"
+
 #include "kernels.h"
 
 #define NELEMS(x) sizeof(x) / sizeof(*x)
+
+static const size_t INT4_PER_BYTE = 2;
+static const size_t INT4_BITS = 4;
+static const int Q4_0_ZERO_POINT = 8;
+const size_t INT4_PER_UINT16 = 4;
+
+static void dequantize_row_qsi4c32pscalef16(
+    const void *packed_data,
+    int32_t row_idx,
+    int64_t nc,
+    float *out,
+    size_t nr_pack,
+    size_t packed_row_stride,
+    size_t kr,
+    size_t bl,
+    size_t num_bytes_multiplier
+) {
+    size_t group_idx = row_idx / nr_pack;
+    size_t row_in_group = row_idx % nr_pack;
+    const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
+    size_t num_blocks = nc / bl;
+    const uint8_t *block_ptr = packed_group;
+
+    for (size_t b = 0; b < num_blocks; ++b) {
+        uint16_t scale_f16 = *((const uint16_t *)(block_ptr + row_in_group * num_bytes_multiplier));
+        float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
+
+        const uint8_t *segment_ptr = block_ptr + nr_pack * num_bytes_multiplier;
+        size_t num_segments = bl / kr;
+        size_t num_bytes_per_segment = kr / INT4_PER_BYTE;
+
+        for (size_t s = 0; s < num_segments; ++s) {
+            const uint8_t *seg_base = segment_ptr + s * nr_pack * num_bytes_per_segment;
+            const uint8_t *qbytes = seg_base + row_in_group * num_bytes_per_segment;
+            for (size_t k = 0; k < num_bytes_per_segment; ++k) {
+                uint8_t byte = qbytes[k] ^ 0x88;
+                int x0 = (byte & 0x0F) - Q4_0_ZERO_POINT;
+                int x1 = (byte >> INT4_BITS) - Q4_0_ZERO_POINT;
+                out[b * bl + s * num_bytes_per_segment + k] = x0 * scale;
+                out[b * bl + s * num_bytes_per_segment + k + bl/2] = x1 * scale;
+            }
+        }
+        block_ptr += nr_pack * num_bytes_multiplier + num_segments * nr_pack * num_bytes_per_segment;
+    }
+}
+
+static void dequantize_row_qsi4c32ps1s0scalef16(
+    const void *packed_data,
+    int32_t row_idx,
+    int64_t k,
+    float *out,
+    size_t nr,
+    size_t packed_row_stride,
+    size_t kr,
+    size_t bl,
+    size_t num_bytes_multiplier
+) {
+    const size_t num_blocks = k / bl;
+    const size_t bl4 = bl / INT4_PER_UINT16;
+
+    size_t group_idx = row_idx / nr;
+    size_t row_in_group = row_idx % nr;
+
+    const uint8_t *packed_group = (const uint8_t *)packed_data + group_idx * packed_row_stride;
+    const uint16_t *qdata = (const uint16_t *)packed_group;
+    const uint16_t *scales = (const uint16_t *)(packed_group + packed_row_stride - (nr * num_blocks * num_bytes_multiplier));
+
+    for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        uint16_t scale_f16 = scales[row_in_group + block_idx * nr];
+        float scale = GGML_CPU_FP16_TO_FP32(scale_f16);
+
+        for (size_t bl4_idx = 0; bl4_idx < bl4; ++bl4_idx) {
+            uint16_t q = qdata[(block_idx * bl4 + bl4_idx) * nr + row_in_group];
+
+            for (size_t qidx = 0; qidx < INT4_PER_UINT16; ++qidx) {
+                int v = ((q >> (qidx * 4)) & 0xF) - Q4_0_ZERO_POINT;
+                out[block_idx * bl + bl4_idx * INT4_BITS + qidx] = v * scale;
+            }
+        }
+    }
+    GGML_UNUSED(kr);
+}
+
 static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
 #if defined(__ARM_FEATURE_SME)
     {
@@ -63,8 +148,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
     },
     /* .rhs_info = */ {
-        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
-        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+        /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
+        /* .to_float = */ dequantize_row_qsi4c32ps1s0scalef16,
     },
     /* .required_cpu = */ CPU_FEATURE_SME,
     /* .lhs_type = */ GGML_TYPE_F32,
@@ -107,8 +194,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
     },
     /* .rhs_info = */ {
-        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
-        /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+        /* .packed_stride = */ NULL,
+        /* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+        /* .to_float = */ NULL,
     },
     /* .required_cpu = */ CPU_FEATURE_SME,
     /* .lhs_type = */ GGML_TYPE_F32,
@@ -154,8 +243,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
     },
     /* .rhs_info = */ {
-        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .to_float = */ dequantize_row_qsi4c32pscalef16,
     },
     /* .required_cpu = */ CPU_FEATURE_DOTPROD,
     /* .lhs_type = */ GGML_TYPE_F32,
@@ -200,8 +291,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
     },
     /* .rhs_info = */ {
-        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .to_float = */ dequantize_row_qsi4c32pscalef16,
     },
     /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
     /* .lhs_type = */ GGML_TYPE_F32,
@@ -247,8 +340,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
     },
     /* .rhs_info = */ {
-        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .to_float = */ dequantize_row_qsi4c32pscalef16,
    },
     /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
     /* .lhs_type = */ GGML_TYPE_F32,
@@ -293,8 +388,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
     },
     /* .rhs_info = */ {
-        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
-        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
+        /* .to_float = */ dequantize_row_qsi4c32pscalef16,
     },
     /* .required_cpu = */ CPU_FEATURE_DOTPROD,
     /* .lhs_type = */ GGML_TYPE_F32,
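The new packed_stride and to_float entries give each kernel descriptor a way to recover float rows from a packed RHS buffer. Below is a rough usage sketch under assumed names (kernels, packed, and the nr/kr/bl metadata are illustrative, not from this commit; sizeof(uint16_t) matches the f16 per-block scales these formats store):

    // Sketch: dequantize every row of a packed RHS matrix back to float.
    void unpack_rhs_to_float(const ggml_kleidiai_kernels * kernels,
                             const void * packed, int32_t n_rows, int64_t k,
                             size_t nr, size_t kr, size_t bl, float * out) {
        // Both members may be NULL (the bf16 SME entry above stores no
        // dequantization path), so callers have to check first.
        if (!kernels->rhs_info.to_float || !kernels->rhs_info.packed_stride) {
            return;
        }
        const size_t stride = kernels->rhs_info.packed_stride(k, nr, kr, bl);
        for (int32_t row = 0; row < n_rows; ++row) {
            kernels->rhs_info.to_float(packed, row, k, out + row * k,
                                       nr, stride, kr, bl, sizeof(uint16_t));
        }
    }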

ggml/src/ggml-cpu/kleidiai/kernels.h
Lines changed: 3 additions & 0 deletions

@@ -71,12 +71,15 @@ struct rhs_packing_info {
         std::function<size_t(size_t n, size_t k, size_t nr, size_t kr, size_t bl)>,
         std::function<size_t(size_t n, size_t k)>
     > packed_size;
+    size_t (*packed_stride)(size_t k, size_t nr, size_t kr, size_t bl);
     std::variant<
         std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
                 const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params)>,
         std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
                 const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params)>
     > pack_func;
+    void (*to_float)(const void *packed_data, int32_t row_idx, int64_t nc, float *out, size_t nr_pack, size_t packed_row_stride,
+                     size_t kr, size_t bl, size_t num_bytes_multiplier);
 };
 
 struct ggml_kleidiai_kernels {
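Note the mixed style this introduces: packed_size and pack_func stay std::variant-wrapped std::function members, presumably because two incompatible KleidiAI signatures coexist, while the new packed_stride and to_float members are plain C function pointers. That lets kernel entries without a dequantization path (the bf16 SME kernels above) store NULL, so a caller-side capability check reduces to a pointer test, e.g.:

    // Sketch: plain function pointers make "no dequant path" a simple NULL test.
    static bool rhs_supports_dequant(const rhs_packing_info & info) {
        return info.packed_stride != NULL && info.to_float != NULL;
    }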
