Skip to content

Commit 3f10612

Browse files
authored
Merge branch 'ggml-org:master' into lovedheart-vulkan-mxfp4-optimization
2 parents 31efd0c + 6424594 commit 3f10612

File tree

14 files changed

+113
-114
lines changed

14 files changed

+113
-114
lines changed

CODEOWNERS

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
/tools/server/ @ngxson
66
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
77
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
8-
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
98
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
109
/ggml/src/ggml-opt.cpp @JohannesGaessler
1110
/ggml/src/gguf.cpp @JohannesGaessler

common/chat.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
632632
case COMMON_REASONING_FORMAT_AUTO: return "auto";
633633
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
634634
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
635-
case COMMON_REASONING_FORMAT_GRANITE: return "granite";
636635
default:
637636
throw std::runtime_error("Unknown reasoning format");
638637
}

common/common.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,12 +239,15 @@ struct common_params_diffusion {
239239
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
240240
};
241241

242+
// reasoning API response format (not to be confused with the chat template's reasoning format)
242243
enum common_reasoning_format {
243244
COMMON_REASONING_FORMAT_NONE,
244-
COMMON_REASONING_FORMAT_AUTO,
245+
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
245246
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
246247
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
247-
COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
248+
// do not extend this enum unless you absolutely have to
249+
// in most cases, use COMMON_REASONING_FORMAT_AUTO
250+
// see: https://github.com/ggml-org/llama.cpp/pull/15408
248251
};
249252

250253

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@
7373
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
7474
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
7575
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
76-
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
7776
// repack.cpp
7877
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
7978
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8

ggml/src/ggml-cpu/arch/powerpc/quants.c

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,72 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
278278
#endif
279279
}
280280

281+
// Dot product of an MXFP4-quantized row (vx) with a Q8_0-quantized row (vy).
//
//   n      - number of elements; must be a multiple of QK_MXFP4 (asserted)
//   s      - output: receives the single float result
//   vx     - pointer to the block_mxfp4 blocks
//   vy     - pointer to the block_q8_0 blocks
//   nrc    - must be 1 (asserted)
//   bs, bx, by - unused by this implementation
//
// When __POWER9_VECTOR__ is defined the blocks are processed with vector
// intrinsics; otherwise the call is forwarded to the _generic implementation.
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
282+
assert(nrc == 1);
283+
UNUSED(nrc);
284+
UNUSED(bx);
285+
UNUSED(by);
286+
UNUSED(bs);
287+
assert(n % QK_MXFP4 == 0);
288+
// Blocks of x and y are walked with the same index, so both formats must
// hold the same number of elements per block.
static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
289+
290+
const block_mxfp4 * GGML_RESTRICT x = vx;
291+
const block_q8_0 * GGML_RESTRICT y = vy;
292+
293+
// number of blocks to process
const int nb = n / QK_MXFP4;
294+
295+
int ib = 0;
296+
float sumf = 0;
297+
298+
#if defined(__POWER9_VECTOR__)
299+
// 0x0F in every byte lane: isolates the low nibble of each packed byte
const vector signed char lowMask = vec_splats((signed char)0xF);
300+
// per-lane shift count used to bring the high nibble down to bits 0..3
const vector unsigned char vshift4 = vec_splats((unsigned char)4);
301+
// per-lane float accumulator, reduced to a scalar after the loop
vector float vsumf0 = vec_splats(0.0f);
302+
303+
// One vector load from kvalues_mxfp4; used below as the vec_perm lookup
// table that turns a 4-bit code into its signed 8-bit value.
vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
304+
305+
#pragma GCC unroll 8
306+
for (; ib < nb; ++ib) {
307+
// read prefetch hints (rw = 0) with low temporal locality (locality = 1)
__builtin_prefetch(x[ib].qs, 0, 1);
308+
__builtin_prefetch(y[ib].qs, 0, 1);
309+
310+
// combined scale for this block pair: y's fp16 scale converted to fp32,
// multiplied by x's e field decoded via GGML_E8M0_TO_FP32_HALF
vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
311+
GGML_E8M0_TO_FP32_HALF(x[ib].e));
312+
313+
// the two 16-byte halves of y's quants (byte offsets 0 and 16)
vector signed char q8y0 = vec_xl( 0, y[ib].qs);
314+
vector signed char q8y1 = vec_xl(16, y[ib].qs);
315+
316+
// 16 packed bytes of x, each holding two 4-bit codes
vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
317+
318+
vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
319+
vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
320+
321+
// Table lookup: both vec_perm sources are kv, so each nibble index selects
// the matching kv byte. Low nibbles are paired with q8y0, high with q8y1.
vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
322+
vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
323+
324+
// widening 8-bit x 8-bit multiplies of even and odd lanes, added together
// so each 16-bit lane holds the sum of two adjacent products
vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
325+
vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
326+
327+
// fold the 16-bit partial sums of both halves into 32-bit lanes
vector signed int vsumi0 = vec_splats((int32_t)0);
328+
vsumi0 = vec_sum4s(qv0, vsumi0);
329+
vsumi0 = vec_sum4s(qv1, vsumi0);
330+
331+
// vsumf0 += (float)vsumi0 * vyd
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
332+
}
333+
334+
// Horizontal reduction: after adding the vector to itself rotated by 4 and
// then by 8 bytes, every lane holds the total of the four lanes.
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
335+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
336+
sumf = vec_extract(vsumf0, 0);
337+
*s = sumf;
338+
#else
339+
// No POWER9 vector support: silence unused-variable warnings and defer to
// the generic implementation, which writes the result to *s.
UNUSED(x);
340+
UNUSED(y);
341+
UNUSED(ib);
342+
UNUSED(sumf);
343+
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
344+
#endif
345+
}
346+
281347
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
282348
const int qk = QK8_0;
283349
const int nb = n / qk;

scripts/sync-ggml-am.sh

Lines changed: 3 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -74,21 +74,7 @@ while read c; do
7474
cmake/common.cmake \
7575
cmake/ggml-config.cmake.in \
7676
src/ggml-cpu/cmake/FindSIMD.cmake \
77-
src/ggml*.h \
78-
src/ggml*.c \
79-
src/ggml*.cpp \
80-
src/gguf*.cpp \
81-
src/ggml-blas/* \
82-
src/ggml-cann/* \
83-
src/ggml-cpu/* \
84-
src/ggml-cuda/* \
85-
src/ggml-hip/* \
86-
src/ggml-metal/* \
87-
src/ggml-musa/* \
88-
src/ggml-opencl/* \
89-
src/ggml-rpc/* \
90-
src/ggml-sycl/* \
91-
src/ggml-vulkan/* \
77+
src/ggml* \
9278
include/ggml*.h \
9379
include/gguf*.h \
9480
tests/test-opt.cpp \
@@ -131,21 +117,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
131117
# cmake/ggml-config.cmake.in -> ggml/cmake/ggml-config.cmake.in
132118
# src/ggml-cpu/cmake/FindSIMD.cmake -> ggml/src/ggml-cpu/cmake/FindSIMD.cmake
133119
#
134-
# src/ggml*.c -> ggml/src/ggml*.c
135-
# src/ggml*.cpp -> ggml/src/ggml*.cpp
136-
# src/ggml*.h -> ggml/src/ggml*.h
137-
# src/gguf*.cpp -> ggml/src/gguf*.cpp
138-
# src/ggml-blas/* -> ggml/src/ggml-blas/*
139-
# src/ggml-cann/* -> ggml/src/ggml-cann/*
140-
# src/ggml-cpu/* -> ggml/src/ggml-cpu/*
141-
# src/ggml-cuda/* -> ggml/src/ggml-cuda/*
142-
# src/ggml-hip/* -> ggml/src/ggml-hip/*
143-
# src/ggml-metal/* -> ggml/src/ggml-metal/*
144-
# src/ggml-musa/* -> ggml/src/ggml-musa/*
145-
# src/ggml-opencl/* -> ggml/src/ggml-opencl/*
146-
# src/ggml-rpc/* -> ggml/src/ggml-rpc/*
147-
# src/ggml-sycl/* -> ggml/src/ggml-sycl/*
148-
# src/ggml-vulkan/* -> ggml/src/ggml-vulkan/*
120+
# src/ggml* -> ggml/src/ggml*
149121
#
150122
# include/ggml*.h -> ggml/include/ggml*.h
151123
# include/gguf*.h -> ggml/include/gguf*.h
@@ -163,20 +135,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
163135
-e 's/([[:space:]]| [ab]\/)cmake\/common.cmake/\1ggml\/cmake\/common.cmake/g' \
164136
-e 's/([[:space:]]| [ab]\/)cmake\/ggml-config.cmake.in/\1ggml\/cmake\/ggml-config.cmake.in/g' \
165137
-e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\/cmake\/FindSIMD.cmake/\1ggml\/src\/ggml-cpu\/cmake\/FindSIMD.cmake/g' \
166-
-e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
167-
-e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
168-
-e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
169-
-e 's/([[:space:]]| [ab]\/)src\/gguf(.*)\.cpp/\1ggml\/src\/gguf\2.cpp/g' \
170-
-e 's/([[:space:]]| [ab]\/)src\/ggml-blas\//\1ggml\/src\/ggml-blas\//g' \
171-
-e 's/([[:space:]]| [ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
172-
-e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \
173-
-e 's/([[:space:]]| [ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
174-
-e 's/([[:space:]]| [ab]\/)src\/ggml-hip\//\1ggml\/src\/ggml-hip\//g' \
175-
-e 's/([[:space:]]| [ab]\/)src\/ggml-metal\//\1ggml\/src\/ggml-metal\//g' \
176-
-e 's/([[:space:]]| [ab]\/)src\/ggml-opencl\//\1ggml\/src\/ggml-opencl\//g' \
177-
-e 's/([[:space:]]| [ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \
178-
-e 's/([[:space:]]| [ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
179-
-e 's/([[:space:]]| [ab]\/)src\/ggml-vulkan\//\1ggml\/src\/ggml-vulkan\//g' \
138+
-e 's/([[:space:]]| [ab]\/)src\/ggml(.*)/\1ggml\/src\/ggml\2/g' \
180139
-e 's/([[:space:]]| [ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
181140
-e 's/([[:space:]]| [ab]\/)include\/gguf(.*)\.h/\1ggml\/include\/gguf\2.h/g' \
182141
-e 's/([[:space:]]| [ab]\/)tests\/(.*)\.cpp/\1tests\/\2.cpp/g' \

scripts/sync-ggml.last

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
b141fc226b68e4af383101c39da90b54ede98850
1+
323951f1bdcdfbd5b5ff3a9a7c3770e63b1a560e

scripts/sync-ggml.sh

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,7 @@ cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt
66
cp -rpv ../ggml/cmake/* ./ggml/cmake/
77
cp -rpv ../ggml/src/ggml-cpu/cmake/* ./ggml/src/ggml-cpu/cmake/
88

9-
cp -rpv ../ggml/src/ggml*.c ./ggml/src/
10-
cp -rpv ../ggml/src/ggml*.cpp ./ggml/src/
11-
cp -rpv ../ggml/src/ggml*.h ./ggml/src/
12-
cp -rpv ../ggml/src/gguf*.cpp ./ggml/src/
13-
cp -rpv ../ggml/src/ggml-blas/* ./ggml/src/ggml-blas/
14-
cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/
15-
cp -rpv ../ggml/src/ggml-cpu/* ./ggml/src/ggml-cpu/
16-
cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/
17-
cp -rpv ../ggml/src/ggml-hip/* ./ggml/src/ggml-hip/
18-
cp -rpv ../ggml/src/ggml-metal/* ./ggml/src/ggml-metal/
19-
cp -rpv ../ggml/src/ggml-musa/* ./ggml/src/ggml-musa/
20-
cp -rpv ../ggml/src/ggml-opencl/* ./ggml/src/ggml-opencl/
21-
cp -rpv ../ggml/src/ggml-rpc/* ./ggml/src/ggml-rpc/
22-
cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/
23-
cp -rpv ../ggml/src/ggml-vulkan/* ./ggml/src/ggml-vulkan/
9+
cp -rpv ../ggml/src/ggml* ./ggml/src/
2410

2511
cp -rpv ../ggml/include/ggml*.h ./ggml/include/
2612
cp -rpv ../ggml/include/gguf*.h ./ggml/include/

src/llama-context.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,11 +145,6 @@ llama_context::llama_context(
145145
__func__, n_ctx_per_seq, hparams.n_ctx_train);
146146
}
147147

148-
if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) {
149-
LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
150-
__func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
151-
}
152-
153148
if (!hparams.vocab_only) {
154149
// GPU backends
155150
for (auto * dev : model.devices) {

tests/test-chat.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1408,7 +1408,7 @@ static void test_template_output_parsers() {
14081408
/* is_partial= */ false,
14091409
{
14101410
/* .format = */ COMMON_CHAT_FORMAT_GRANITE,
1411-
/* .reasoning_format = */ COMMON_REASONING_FORMAT_GRANITE,
1411+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
14121412
}));
14131413

14141414
// Test parsing tool calls

0 commit comments

Comments
 (0)