
Commit 0fcfbdb (2 parents: 0d72c79 + 64bf1c3)

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#    .devops/musa.Dockerfile
#    .github/workflows/build.yml
#    .github/workflows/close-issue.yml
#    ci/README.md
#    docs/build.md
#    docs/docker.md
#    ggml/CMakeLists.txt
#    ggml/cmake/ggml-config.cmake.in
#    ggml/src/ggml-cann/aclnn_ops.cpp
#    ggml/src/ggml-cann/aclnn_ops.h
#    ggml/src/ggml-cann/ggml-cann.cpp
#    ggml/src/ggml-cpu/CMakeLists.txt
#    ggml/src/ggml-cuda/fattn-wmma-f16.cu
#    ggml/src/ggml-musa/CMakeLists.txt
#    ggml/src/ggml-rpc/ggml-rpc.cpp
#    ggml/src/ggml-sycl/ggml-sycl.cpp
#    ggml/src/ggml-sycl/vecdotq.hpp
#    scripts/sync-ggml.last
#    tests/test-backend-ops.cpp
#    tools/imatrix/README.md
#    tools/imatrix/imatrix.cpp

33 files changed, +501 -348 lines

common/arg.cpp (7 additions, 0 deletions)

@@ -2657,6 +2657,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--show-statistics"},
+        string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+        [](common_params & params) {
+            params.show_statistics = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--parse-special"},
         string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),

common/common.h (4 additions, 3 deletions)

@@ -428,9 +428,10 @@ struct common_params {
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk     = 0; // start processing from this chunk

-    bool process_output = false; // collect data for the output tensor
-    bool compute_ppl    = true;  // whether to compute perplexity
-    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization
+    bool process_output  = false; // collect data for the output tensor
+    bool compute_ppl     = true;  // whether to compute perplexity
+    bool show_statistics = false; // show imatrix statistics per tensor
+    bool parse_special   = false; // whether to parse special tokens during imatrix tokenization

     // cvector-generator params
     int n_pca_batch = 100;

convert_hf_to_gguf.py (10 additions, 2 deletions)

@@ -6486,7 +6486,7 @@ def prepare_tensors(self):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


-@ModelBase.register("Glm4ForCausalLM")
+@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 class Glm4Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4

@@ -6508,14 +6508,22 @@ def set_vocab(self):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        rope_dim = self.hparams["head_dim"]
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         rope_scaling = self.hparams.get("rope_scaling") or {}
         if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part of Glm4v
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for Glm4v
+        return super().modify_tensors(data_torch, name, bid)
+

 @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(TextModel):

ggml/src/ggml-backend.cpp (8 additions, 5 deletions)

@@ -647,6 +647,7 @@ struct ggml_backend_sched {
     // pipeline parallelism support
     int n_copies;
     int cur_copy;
+    int next_copy;
     ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
     struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_graph_inputs;
@@ -1439,8 +1440,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         }
     }

-    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
-
     return GGML_STATUS_SUCCESS;
 }

@@ -1541,10 +1540,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

-    ggml_backend_sched_split_graph(sched, measure_graph);
-
     ggml_backend_sched_synchronize(sched);

+    ggml_backend_sched_split_graph(sched, measure_graph);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
@@ -1556,6 +1555,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+    GGML_ASSERT(!sched->is_alloc);
+
+    sched->cur_copy  = sched->next_copy;
+    sched->next_copy = (sched->next_copy + 1) % sched->n_copies;

     ggml_backend_sched_split_graph(sched, graph);

@@ -1596,7 +1599,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
         // if the graph is not already allocated, always use copy 0 after a synchronization
         // this ensures that during generation the same copy is used every time,
         // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
-        sched->cur_copy = 0;
+        sched->next_copy = 0;
     }
 }
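
As a side note, a minimal standalone sketch (assumed names, not the ggml API) of the copy-index rotation this change introduces: the copy used by the current graph is now fixed at allocation time, and a synchronization resets the next copy to 0 so generation keeps reusing the same copy:

    struct sched_sketch {
        int n_copies  = 2;
        int cur_copy  = 0;
        int next_copy = 0;

        // analogous to ggml_backend_sched_alloc_graph(): pick the copy for this graph
        // and advance the index that the next allocation will use
        void alloc_graph() {
            cur_copy  = next_copy;
            next_copy = (next_copy + 1) % n_copies;
        }

        // analogous to ggml_backend_sched_synchronize(): fall back to copy 0 so that
        // subsequent graphs stay identical and device graph capture is not invalidated
        void synchronize() {
            next_copy = 0;
        }
    };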

ggml/src/ggml-cpu/arch/loongarch/quants.c (1 addition, 1 deletion)

@@ -544,7 +544,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
     __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
     max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
     __m128 tmp = max4;
-    max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
+    max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x1 ));
     const float max_scalar = ((v4f32)max4)[0];

     // Quantize these floats
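
For reference, a plain scalar sketch (illustration only, not the LSX code path) of the horizontal reduction this sequence computes, i.e. the maximum over the four lanes; the corrected immediate makes the final vfmax compare the right lane pair so lane 0 ends up holding the true maximum:

    #include <algorithm>

    static float hmax4(const float v[4]) {
        // reduce four lanes to a single scalar maximum
        return std::max(std::max(v[0], v[1]), std::max(v[2], v[3]));
    }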

ggml/src/ggml-cpu/repack.cpp (0 additions, 1 deletion)

@@ -14,7 +14,6 @@
 #include <cmath>
 #include <cstring>
 #include <cassert>
-#include <cstdlib> // for qsort
 #include <cstdio>  // for GGML_ASSERT

 #include "repack.h"

ggml/src/ggml-cuda/common.cuh (1 addition, 1 deletion)

@@ -769,7 +769,7 @@ struct ggml_tensor_extra_gpu {
 };


-#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
+#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif

ggml/src/ggml-cuda/convert.cu (64 additions, 17 deletions)

@@ -6,24 +6,33 @@
 #define CUDA_Q8_0_NE_ALIGN 2048

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t s01, const int64_t s02, const int64_t s03) {
+    const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);

-    if (i >= k) {
+    if (i00 >= ne00) {
         return;
     }

-    const int64_t ib = i/qk; // block index
-    const int64_t iqs = (i%qk)/qr; // quant index
-    const int64_t iybs = i - i%qk; // y block start index
+    const int64_t i01 = blockIdx.y;
+    const int64_t i02 = blockIdx.z % ne02;
+    const int64_t i03 = blockIdx.z / ne02;
+
+    const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
+
+    const int64_t ib = ibx0 + i00/qk; // block index
+    const int64_t iqs = (i00%qk)/qr; // quant index
+    const int64_t iybs = i00 - i00%qk; // y block start index
     const int64_t y_offset = qr == 1 ? 1 : qk/2;

     // dequantize
     dfloat2 v;
     dequantize_kernel(vx, ib, iqs, v);

-    y[iybs + iqs + 0]        = v.x;
-    y[iybs + iqs + y_offset] = v.y;
+    const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
+    y[iy0 + 0]        = float(v.x);
+    y[iy0 + y_offset] = float(v.y);
 }

 template <bool need_check>
@@ -457,9 +466,17 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
-    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
-    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+static void dequantize_block_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03);
+    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
+        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
+    dequantize_block_cuda<qk, qr, dequantize_kernel, dst_t>(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream);
 }

 static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
@@ -624,14 +641,14 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+            return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+            return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
             if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
                 return dequantize_block_q8_0_f16_cuda;
             }
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+            return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
             return dequantize_row_q2_K_cuda;
         case GGML_TYPE_Q3_K:
@@ -676,11 +693,11 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+            return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+            return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+            return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
             return dequantize_row_q2_K_cuda;
         case GGML_TYPE_Q3_K:
@@ -722,6 +739,16 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
             return convert_unary_cuda<float>;
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_BF16:
             return convert_unary_cuda<nv_bfloat16>;
         default:
@@ -733,6 +760,16 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
             return convert_unary_cuda<float, nv_bfloat16>;
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half, nv_bfloat16>;
         default:
@@ -744,6 +781,16 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F16:
             return convert_unary_cuda<half, float>;
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_BF16:
             return convert_unary_cuda<nv_bfloat16, float>;
         default:
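
To make the new indexing easier to follow, here is a host-side sketch (plain C++, assumed to mirror the kernel's arithmetic; s01/s02/s03 are source strides in quant blocks and the destination is contiguous) of how a logical element (i00, i01, i02, i03) maps to a source quant block and a destination offset:

    #include <cstdint>

    // qk = values per quant block, qr = values produced per quant index
    static void dequant_indices(int64_t i00, int64_t i01, int64_t i02, int64_t i03,
                                int64_t ne00, int64_t ne01, int64_t ne02,
                                int64_t s01, int64_t s02, int64_t s03,
                                int64_t qk, int64_t qr,
                                int64_t & ib, int64_t & iqs, int64_t & iy0) {
        ib  = i03*s03 + i02*s02 + i01*s01 + i00/qk;        // source quant block index
        iqs = (i00 % qk) / qr;                             // quant index inside the block
        iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00           // contiguous destination row start
            + (i00 - i00 % qk) + iqs;                      // plus y block start and quant index
    }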

ggml/src/ggml-cuda/cpy.cu (7 additions, 7 deletions)

@@ -1,9 +1,9 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
 #include "cpy-utils.cuh"
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
 #include "ggml-musa/mudnn.cuh"
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY

 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 // Copy destination pointers to GPU to be available when pointer indirection is in use

 void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
         CUDA_CHECK(cudaStreamSynchronize(stream));
         if (cuda_graph->dest_ptrs_d != nullptr) {
@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg

     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
         if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
             CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
         } else
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }