Commit 1c41c38

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .devops/cuda.Dockerfile
#   CODEOWNERS
#   README.md
#   ggml/src/ggml-cann/aclnn_ops.cpp
#   ggml/src/ggml-cann/common.h
#   ggml/src/ggml-opencl/ggml-opencl.cpp
#   scripts/sync-ggml-am.sh
#   scripts/sync-ggml.last
#   scripts/sync-ggml.sh
#   tests/test-chat.cpp
#   tools/batched-bench/batched-bench.cpp
#   tools/mtmd/clip.h

2 parents (eb33467 + a094f38), commit 1c41c38

40 files changed: +427 / −275 lines

common/arg.cpp

Lines changed: 8 additions & 1 deletion

@@ -1532,6 +1532,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ctx_shift = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    add_opt(common_arg(
+        {"--context-shift"},
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        [](common_params & params) {
+            params.ctx_shift = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),

@@ -1825,7 +1832,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.sampling.top_n_sigma = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
+    ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),

common/chat.cpp

Lines changed: 0 additions & 1 deletion

@@ -632,7 +632,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
         case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
-        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }

common/common.h

Lines changed: 6 additions & 3 deletions

@@ -235,12 +235,15 @@ struct common_params_diffusion {
     bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };
 
+// reasoning API response format (not to be confused as chat template's reasoning format)
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
+    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
-    COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    // do not extend this enum unless you absolutely have to
+    // in most cases, use COMMON_REASONING_FORMAT_AUTO
+    // see: https://github.com/ggml-org/llama.cpp/pull/15408
 };
 
 

@@ -368,7 +371,7 @@ struct common_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = true; // context shift on inifinite text generation
+    bool ctx_shift = false; // context shift on inifinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 0 additions & 1 deletion

@@ -73,7 +73,6 @@
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8

ggml/src/ggml-cpu/arch/powerpc/quants.c

Lines changed: 66 additions & 0 deletions

@@ -278,6 +278,72 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char vshift4 = vec_splats((unsigned char)4);
+    vector float vsumf0 = vec_splats(0.0f);
+
+    vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
+
+#pragma GCC unroll 8
+    for (; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
+                                      GGML_E8M0_TO_FP32_HALF(x[ib].e));
+
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
+
+        vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
+        vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
+
+        vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
+        vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi0 = vec_sum4s(qv1, vsumi0);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+    sumf = vec_extract(vsumf0, 0);
+    *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
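For reference, the scalar computation the POWER9 VSX path above accelerates can be sketched as follows. This is a hedged reconstruction inferred from the vector code (kvalues_mxfp4 lookup on low/high nibbles, Q8_0 fp16 delta, E8M0 block exponent), not code taken from this commit; it assumes the usual ggml block layout, with block_mxfp4.qs holding 32 four-bit indices as 16 unsigned bytes.

```cpp
// Hedged scalar sketch of ggml_vec_dot_mxfp4_q8_0 (illustration, not from the diff).
// Assumes: low nibbles of x[ib].qs hold elements 0..15, high nibbles elements 16..31,
// qs bytes are unsigned, and the same ggml headers as the file above are included.
static void vec_dot_mxfp4_q8_0_scalar_sketch(int n, float * s,
                                             const block_mxfp4 * x,
                                             const block_q8_0  * y) {
    const int nb = n / QK_MXFP4;
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        // per-block scale: Q8_0 fp16 delta times the MXFP4 E8M0 block exponent
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_E8M0_TO_FP32_HALF(x[ib].e);
        int sumi = 0;
        for (int j = 0; j < QK_MXFP4/2; ++j) {
            sumi += kvalues_mxfp4[x[ib].qs[j] & 0x0F] * y[ib].qs[j];              // low nibble  * q8 elements 0..15
            sumi += kvalues_mxfp4[x[ib].qs[j] >>   4] * y[ib].qs[j + QK_MXFP4/2]; // high nibble * q8 elements 16..31
        }
        sumf += d * (float) sumi;
    }
    *s = sumf;
}
```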

ggml/src/ggml-cuda/add-id.cu

Lines changed: 3 additions & 3 deletions

@@ -11,14 +11,14 @@ static __global__ void add_id_kernel(
     const int64_t i1 = blockIdx.x;
     const int64_t i2 = blockIdx.y;
 
-    const int i11 = *(int32_t *) ((char *) src2 + i1*sizeof(int32_t) + i2*nb21);
+    const int i11 = *(const int32_t *) ((const char *) src2 + i1*sizeof(int32_t) + i2*nb21);
 
     const size_t nb1 = ne0 * sizeof(float);
     const size_t nb2 = ne1 * nb1;
 
     float * dst_row = (float *)((char *)dst + i1*nb1 + i2*nb2);
-    const float * src0_row = (const float *)((char *)src0 + i1*nb01 + i2*nb02);
-    const float * src1_row = (const float *)((char *)src1 + i11*nb11);
+    const float * src0_row = (const float *)((const char *)src0 + i1*nb01 + i2*nb02);
+    const float * src1_row = (const float *)((const char *)src1 + i11*nb11);
 
     for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
         dst_row[i0] = src0_row[i0] + src1_row[i0];

ggml/src/ggml-cuda/common.cuh

Lines changed: 5 additions & 2 deletions

@@ -78,6 +78,8 @@
 #define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
+#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons
+
 #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
 #define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
 #define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

@@ -494,13 +496,14 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
 }
 
-#if CUDART_VERSION < CUDART_HMASK
+#if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \
+    (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)
 static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
     const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
     const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
     return mask_low | mask_high;
 }
-#endif // CUDART_VERSION < CUDART_HMASK
+#endif // (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIP)
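The widened guard means the __hgt2_mask fallback shown above is now also compiled for HIP builds and for MUSA toolchains older than rc4.3 (per the MUSART_HMASK comment), not only for old CUDA toolkits. Its semantics can be checked with a plain host-side C++ reconstruction; this is a sketch under the assumption that the half values involved convert exactly to float, and it is not part of the commit:

```cpp
// Host-side sketch of the __hgt2_mask fallback semantics (illustration only).
#include <cassert>
#include <cstdint>

// stand-in for half2: a pair of "half" lanes represented as floats
struct half2_sketch { float lo, hi; };

static uint32_t hgt2_mask_sketch(half2_sketch a, half2_sketch b) {
    const uint32_t mask_low  = 0x0000FFFFu * (a.lo > b.lo); // per-lane greater-than -> 16-bit mask
    const uint32_t mask_high = 0xFFFF0000u * (a.hi > b.hi);
    return mask_low | mask_high;
}

int main() {
    // only the high lane of a exceeds b, so only the high 16 bits are set
    assert(hgt2_mask_sketch({1.0f, 3.0f}, {2.0f, 2.0f}) == 0xFFFF0000u);
    // both lanes greater -> all 32 bits set, matching the intrinsic on newer CUDA
    assert(hgt2_mask_sketch({5.0f, 5.0f}, {2.0f, 2.0f}) == 0xFFFFFFFFu);
    return 0;
}
```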

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 7 additions & 5 deletions

@@ -1237,10 +1237,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
     }
 #else
     GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
-    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(sinks_f);
+    GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
     GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
-    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
-    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(stride_Q1); GGML_UNUSED(stride_Q2);
+    GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask);
     GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
     NO_DEVICE_CODE;
 #endif // TURING_MMA_AVAILABLE

@@ -1395,8 +1397,8 @@ static __global__ void flash_attn_ext_f16(
         (Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
          ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
     GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
     GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);

ggml/src/ggml-cuda/fattn-tile-f16.cu

Lines changed: 10 additions & 10 deletions

@@ -299,17 +299,17 @@ static __global__ void flash_attn_tile_ext_f16(
         }
     }
 #else
-    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
-    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
-    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
-    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
-    GGML_UNUSED(nb23);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }

ggml/src/ggml-cuda/fattn-tile-f32.cu

Lines changed: 1 addition & 12 deletions

@@ -38,17 +38,6 @@ static __global__ void flash_attn_tile_ext_f32(
     return;
 #endif // FP16_MMA_AVAILABLE
     if (use_logit_softcap && !(D == 128 || D == 256)) {
-        GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
-        GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
-        GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
-        GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
-        GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
-        GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
-        GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
-        GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
-        GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
-        GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
         NO_DEVICE_CODE;
         return;
     }

@@ -313,7 +302,7 @@ static __global__ void flash_attn_tile_ext_f32(
     }
 #else
     GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(sinks); GGML_UNUSED(KV_max); GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
     GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
     GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
