
Commit 9e182b3

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .github/workflows/build.yml
#   README.md
#   docs/backend/SYCL.md
#   ggml/src/ggml-sycl/CMakeLists.txt
#   ggml/src/ggml-vulkan/CMakeLists.txt
#   ggml/src/ggml-vulkan/ggml-vulkan.cpp
#   scripts/sync-ggml.last
#   tests/test-chat-template.cpp
2 parents: 0fd94e1 + 8293970


44 files changed: +2394 −830 lines

convert_hf_to_gguf.py

Lines changed: 110 additions & 2 deletions
@@ -708,6 +708,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
             # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
             res = "superbpe"
+        if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
+            # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
+            res = "trillion"
+        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
+            # ref: https://huggingface.co/inclusionAI/Ling-lite
+            res = "bailingmoe"

         if res is None:
             logger.warning("\n")
@@ -3551,8 +3557,8 @@ def set_gguf_parameters(self):
         head_size = hidden_size // num_attention_heads
         rms_norm_eps = self.hparams["rms_norm_eps"]
         intermediate_size = self.hparams["intermediate_size"]
-        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
-        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+        time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
+        time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)

         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -5130,6 +5136,108 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return super().modify_tensors(data_torch, name, bid)


+@Model.register("BailingMoeForCausalLM")
+class BailingMoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if hparams.get("head_dim"):
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_embd = self.hparams["hidden_size"]
+        head_dim = self.hparams.get("head_dim", n_embd // n_head)
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+
+        if name.endswith("attention.dense.weight"):
+            return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
+        elif name.endswith("query_key_value.weight"):
+            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
+
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
+            ]
+        elif name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == output_name and self.hparams.get("norm_head"):
+            data_torch = data_torch.float()
+            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("ChameleonForConditionalGeneration")
 @Model.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(Model):
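Note on the expert-merging step in BailingMoeModel.modify_tensors: per-expert 2D projection weights are buffered per layer until all experts have been seen, then stacked into one 3D tensor whose leading dimension is the expert index, and written out under the merged tensor name. Below is a minimal standalone sketch of that stacking step; the sizes, the layer index, and the tensor names are illustrative only, not taken from a real Ling-lite checkpoint.

import torch

n_experts, n_ff, n_embd = 4, 16, 8   # illustrative sizes
bid = 0                              # layer (block) index

# per-expert up_proj weights keyed by their original HF tensor names
experts = {
    f"model.layers.{bid}.mlp.experts.{xid}.up_proj.weight": torch.randn(n_ff, n_embd)
    for xid in range(n_experts)
}

# stack in expert order -> one 3D tensor of shape (n_experts, n_ff, n_embd),
# mirroring the torch.stack(datas, dim=0) call in the diff above
stacked = torch.stack(
    [experts[f"model.layers.{bid}.mlp.experts.{xid}.up_proj.weight"] for xid in range(n_experts)],
    dim=0,
)
assert stacked.shape == (n_experts, n_ff, n_embd)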

convert_hf_to_gguf_update.py

Lines changed: 2 additions & 0 deletions
@@ -111,6 +111,8 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
     {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
     {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
+    {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
+    {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
 ]
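These two entries extend the tokenizer list that convert_hf_to_gguf_update.py walks to regenerate the chkhsh checks in get_vocab_base_pre. As a rough sketch of the idea (the exact probe string and flow live in the scripts themselves, so treat the helper below as an assumption rather than the script's API): each tokenizer is fetched from its repo, a fixed probe text is encoded, and the SHA-256 of the resulting token IDs becomes the hash that the converter later matches against names such as "trillion" or "bailingmoe".

import hashlib
from transformers import AutoTokenizer  # assumed dependency, as in the update script

def tokenizer_checksum(repo: str, probe_text: str) -> str:
    # hash the token IDs produced for a fixed probe string; two tokenizers that
    # pre-tokenize identically yield the same checksum
    tokenizer = AutoTokenizer.from_pretrained(repo)
    token_ids = tokenizer.encode(probe_text)
    return hashlib.sha256(str(token_ids).encode()).hexdigest()

# e.g. tokenizer_checksum("inclusionAI/Ling-lite", probe_text) would have to equal
# "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224" for the
# converter to pick res = "bailingmoe"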

examples/llava/clip.cpp

Lines changed: 5 additions & 3 deletions
@@ -1517,14 +1517,16 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
     const int n_kv = gguf_get_n_kv(ctx);
     const int ftype = get_u32(ctx, KEY_FTYPE);
     const std::string ftype_str = get_ftype(ftype);
-    const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
-    const std::string description = gguf_get_val_str(ctx, idx_desc);
     const int idx_name = gguf_find_key(ctx, KEY_NAME);
     if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
         const std::string name = gguf_get_val_str(ctx, idx_name);
         LOG_INF("%s: model name: %s\n", __func__, name.c_str());
     }
-    LOG_INF("%s: description: %s\n", __func__, description.c_str());
+    const int idx_desc = gguf_find_key(ctx, KEY_DESCRIPTION);
+    if (idx_desc != -1) { // ditto
+        const std::string description = gguf_get_val_str(ctx, idx_desc);
+        LOG_INF("%s: description: %s\n", __func__, description.c_str());
+    }
     LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
     LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
     LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);

examples/tts/tts.cpp

Lines changed: 5 additions & 3 deletions
@@ -699,11 +699,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     const std::string voice_data = audio_data;

     auto tmp = common_tokenize(vocab, voice_data, false, true);
-    printf("\n\n");
+
+    std::ostringstream tokens_oss;
     for (size_t i = 0; i < tmp.size(); ++i) {
-        printf("%d, ", tmp[i]);
+        tokens_oss << tmp[i] << ", ";
     }
-    printf("\n\n");
+    LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
+
     prompt_add(prompt_inp, tmp);
 #else
     prompt_add(prompt_inp, llama_tokens {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 10 additions & 0 deletions
@@ -33,6 +33,8 @@ bool g_mul_mat_q = true;
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/scale.cuh"
 #include "ggml-cuda/softmax.cuh"
+#include "ggml-cuda/ssm-conv.cuh"
+#include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/tsembd.cuh"
@@ -2301,6 +2303,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
+        case GGML_OP_SSM_CONV:
+            ggml_cuda_op_ssm_conv(ctx, dst);
+            break;
+        case GGML_OP_SSM_SCAN:
+            ggml_cuda_op_ssm_scan(ctx, dst);
+            break;
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
@@ -3198,6 +3206,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
        case GGML_OP_LOG:
+        case GGML_OP_SSM_SCAN:
+        case GGML_OP_SSM_CONV:
            return true;
        case GGML_OP_CONT:
            return op->src[0]->type != GGML_TYPE_BF16;

ggml/src/ggml-cuda/ssm-conv.cu

Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
+#include "ssm-conv.cuh"
+
+template <size_t split_d_inner, size_t d_conv>
+static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1,
+                                    const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
+                                    float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
+                                    const int nc, const int ncs, const int nr, const int n_t, const int n_s) {
+    const int tid  = threadIdx.x;
+    const int bidx = blockIdx.x;
+    const int bidy = blockIdx.y;
+
+    const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1);
+    const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1);
+    float *       y_block = (float *) ((char *) dst + bidx * dst_nb2 + bidy * split_d_inner * dst_nb0);
+
+    const int stride_x = src0_nb1 / sizeof(float);
+    const int stride_w = src1_nb1 / sizeof(float);
+    const int stride_y = dst_nb1 / sizeof(float);
+
+    float x[d_conv] = { 0.0f };
+    float w[d_conv] = { 0.0f };
+
+#pragma unroll
+    for (int j = 0; j < d_conv; j++) {
+        w[j] = w_block[tid * stride_w + j];
+    }
+
+    for (int i = 0; i < n_t; i++) {
+        float sumf = 0.0f;
+
+        if (i == 0) {
+            for (int j = 0; j < d_conv; j++) {
+                x[j] = x_block[tid * stride_x + j];
+            }
+        } else {
+            x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
+        }
+
+#pragma unroll
+        for (int j = 0; j < d_conv; j++) {
+            sumf += x[(i + j) % d_conv] * w[j];
+        }
+        y_block[i * stride_y + tid] = sumf;
+    }
+}
+
+template <size_t split_d_inner, size_t d_conv, size_t split_n_t>
+static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const float * __restrict__ src1,
+                                               const int src0_nb0, const int src0_nb1, const int src0_nb2,
+                                               const int src1_nb1, float * __restrict__ dst, const int dst_nb0,
+                                               const int dst_nb1, const int dst_nb2, const int nc, const int ncs,
+                                               const int nr, const int n_t, const int n_s) {
+    const int tid  = threadIdx.x;
+    const int bidx = blockIdx.x;
+    const int bidy = blockIdx.y;
+    const int bidz = blockIdx.z;
+
+    const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1 +
+                                             bidz * split_n_t * src0_nb0);
+    const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1);
+    float *       y_block =
+        (float *) ((char *) dst + bidx * dst_nb2 + bidz * split_n_t * dst_nb1 + bidy * split_d_inner * dst_nb0);
+
+    const int stride_x = src0_nb1 / sizeof(float);
+    const int stride_w = src1_nb1 / sizeof(float);
+    const int stride_y = dst_nb1 / sizeof(float);
+
+    float x[d_conv] = { 0.0f };
+    float w[d_conv] = { 0.0f };
+
+#pragma unroll
+    for (int j = 0; j < d_conv; j++) {
+        w[j] = w_block[tid * stride_w + j];
+    }
+
+#pragma unroll
+    for (int i = 0; i < split_n_t; i++) {
+        if (bidz * split_n_t + i < n_t) {
+            float sumf = 0.0f;
+
+            if (i == 0) {
+                for (int j = 0; j < d_conv; j++) {
+                    x[j] = x_block[tid * stride_x + j];
+                }
+            } else {
+                x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
+            }
+
+#pragma unroll
+            for (int j = 0; j < d_conv; j++) {
+                sumf += x[(i + j) % d_conv] * w[j];
+            }
+            y_block[i * stride_y + tid] = sumf;
+        }
+    }
+}
+
+static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1,
+                              const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
+                              const int dst_nb2, const int nc, const int ncs, const int nr, const int n_t,
+                              const int n_s, cudaStream_t stream) {
+    const int threads = 128;
+    GGML_ASSERT(nr % threads == 0);
+
+    if (n_t <= 32) {
+        const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
+        if (nc == 4) {
+            ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+                                                                     dst, dst_nb0, dst_nb1, dst_nb2, nc, ncs, nr, n_t,
+                                                                     n_s);
+        } else {
+            GGML_ABORT("Only support kernel size = 4 now.");
+        }
+    } else {
+        if (nc == 4) {
+            const int split_n_t = 32;
+            dim3      blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
+            ssm_conv_long_token_f32<threads, 4, split_n_t>
+                <<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0,
+                                                 dst_nb1, dst_nb2, nc, ncs, nr, n_t, n_s);
+        } else {
+            GGML_ABORT("Only support kernel size = 4 right now.");
+        }
+    }
+}
+
+void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0]; // conv_x
+    const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
+
+    const int nc  = src1->ne[0]; // d_conv
+    const int ncs = src0->ne[0]; // d_conv - 1 + n_t
+    const int nr  = src0->ne[1]; // d_inner
+    const int n_t = dst->ne[1];  // tokens per sequence
+    const int n_s = dst->ne[2];  // number of sequences in the batch
+
+    GGML_ASSERT(dst->ne[0] == nr);
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float *       dst_d  = (float *) dst->data;
+    cudaStream_t  stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    ssm_conv_f32_cuda(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, dst->nb[0], dst->nb[1],
+                      dst->nb[2], nc, ncs, nr, n_t, n_s, stream);
+}
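As a sanity-check reference for what the kernels above compute, here is a plain PyTorch sketch (not part of the commit; tensor layouts are written row-major, so ggml's ne[0] becomes the last axis here): each inner channel r of each sequence is convolved causally with its own d_conv-tap filter over a pre-padded input of length d_conv - 1 + n_t, which is exactly the sliding-window dot product accumulated into sumf in ssm_conv_f32.

import torch

def ssm_conv_ref(conv_x: torch.Tensor, conv_w: torch.Tensor) -> torch.Tensor:
    # conv_x: (n_s, d_inner, d_conv - 1 + n_t)  -- pre-padded input (conv_x in the kernel)
    # conv_w: (d_inner, d_conv)                 -- per-channel filter (conv1d.weight)
    # returns (n_s, n_t, d_inner), matching dst's layout in the CUDA path
    n_s, d_inner, ncs = conv_x.shape
    d_conv = conv_w.shape[1]
    n_t = ncs - d_conv + 1
    out = torch.empty(n_s, n_t, d_inner)
    for s in range(n_s):
        for t in range(n_t):
            for r in range(d_inner):
                # sliding-window dot product over d_conv taps
                out[s, t, r] = torch.dot(conv_x[s, r, t:t + d_conv], conv_w[r])
    return out

# tiny smoke test with illustrative sizes (d_conv = 4, n_t = 5, d_inner = 8)
y = ssm_conv_ref(torch.randn(1, 8, 3 + 5), torch.randn(8, 4))
assert y.shape == (1, 5, 8)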

ggml/src/ggml-cuda/ssm-conv.cuh

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
