
Commit 9d364b8

Authored by ikawrakow and Iwan Kawrakow
Adding Ling/Ring (a.k.a., Bailing-MoE2) support (ikawrakow#833)
* Adding Ling/Ring (a.k.a., Bailing-MoE2)
* Add expert group selection (not working, so turned off)
* BailingMoE2 conversion
* WIP
* Bits and pieces

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 8d0d01a commit 9d364b8

25 files changed: +1295 −56 lines

convert_hf_to_gguf.py

Lines changed: 100 additions & 0 deletions
@@ -654,6 +654,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
             # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
             res = "kimi-k2"
+        if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
+            # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0
+            res = "bailingmoe2"
 
         if res is None:
             logger.warning("\n")
@@ -4461,6 +4464,103 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]
 
+@Model.register("BailingMoeV2ForCausalLM")
+class BailingMoeV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0):
+            self.block_count = self.hparams["num_hidden_layers"] + nextn_layers
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["moe_shared_expert_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_group_count(hparams["n_group"])
+        self.gguf_writer.add_expert_group_used_count(hparams["topk_group"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["score_function"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["score_function"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported score_function value: {hparams['score_function']}")
+
+        if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(nextn_layers)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "mlp.experts" in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 ###### CONVERSION LOGIC ######
 
 
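Note on the expert merge in modify_tensors() above: the converter buffers the per-expert down_proj/gate_proj/up_proj weights and, once all experts of a layer have been seen, stacks them into a single 3D tensor per projection. The reason is that on the ggml side a whole expert bank is consumed as one tensor, with expert slices selected by id at run time. A hedged sketch of that consumer side (not part of this commit; the helper name and the shape comments are illustrative assumptions):

    #include "ggml.h"

    // One stacked tensor per projection lets the MoE matmul pick expert slices by
    // router id in a single op instead of looping over per-expert 2D tensors.
    static struct ggml_tensor * moe_up_proj(
            struct ggml_context * ctx,
            struct ggml_tensor  * ffn_up_exps,        // stacked experts, ne = [n_embd, n_ff_exp, n_expert]
            struct ggml_tensor  * cur,                // token activations
            struct ggml_tensor  * selected_experts) { // per-token expert ids from the router (I32)
        // applies, for every token, only the expert slices named in selected_experts
        return ggml_mul_mat_id(ctx, ffn_up_exps, cur, selected_experts);
    }
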
ggml/include/ggml.h

Lines changed: 31 additions & 0 deletions
@@ -630,6 +630,7 @@ extern "C" {
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
         GGML_OP_GET_ROWS_BACK,
+        GGML_OP_SET_ROWS,
         GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_DIAG_MASK_ZERO,
@@ -1559,6 +1560,19 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 s);
 
+    // x = s * a + b
+    GGML_API struct ggml_tensor * ggml_scale_bias(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
+    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
     GGML_API struct ggml_tensor * ggml_softcap(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1781,6 +1795,23 @@ extern "C" {
             struct ggml_tensor * b,
             struct ggml_tensor * c);
 
+    // a TD  [n_embd, ne1,    ne2,  ne3]
+    // b TS  [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
+    // c I64 [n_rows, ne11,   ne12, 1]    | c[i] in [0, ne1)
+    //
+    // undefined behavior if destination rows overlap
+    //
+    // broadcast:
+    //   ne2 % ne11 == 0
+    //   ne3 % ne12 == 0
+    //
+    // return view(a)
+    GGML_API struct ggml_tensor * ggml_set_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // destination
+            struct ggml_tensor  * b,  // source
+            struct ggml_tensor  * c); // row indices
+
     GGML_API struct ggml_tensor * ggml_diag(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
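For orientation, a minimal usage sketch of the two new public ops declared above, ggml_scale_bias() and ggml_set_rows(). This is not code from the commit: the context size, tensor shapes, and index count are assumptions picked for illustration, and backend setup and graph evaluation are elided.

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 64*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // destination [n_embd = 64, ne1 = 32], source [64, 4], indices [4] with values in [0, 32)
        struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32);
        struct ggml_tensor * src = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64,  4);
        struct ggml_tensor * idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I64,  4);

        // scatter the 4 source rows into dst at the row positions given by idx; returns a view of dst
        struct ggml_tensor * scattered = ggml_set_rows(ctx, dst, src, idx);

        // y = 0.5f * x + 1.0f in one fused op
        struct ggml_tensor * out = ggml_scale_bias(ctx, scattered, 0.5f, 1.0f);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        // ... fill dst/src/idx with data and evaluate the graph on the chosen backend ...

        ggml_free(ctx);
        return 0;
    }
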

ggml/src/ggml-cuda.cu

Lines changed: 18 additions & 0 deletions
@@ -44,6 +44,8 @@
 #include "ggml-cuda/topk-moe.cuh"
 #include "ggml-cuda/conv2d.cuh"
 #include "ggml-cuda/conv2d-dw.cuh"
+#include "ggml-cuda/set-rows.cuh"
+#include "ggml-cuda/argmax.cuh"
 
 #include <algorithm>
 #include <array>
@@ -3105,12 +3107,18 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
     auto next = i < cgraph->n_nodes - 1 ? cgraph->nodes[i+1] : nullptr;
 
     switch (dst->op) {
+        case GGML_OP_ARGMAX:
+            ggml_cuda_argmax(ctx, dst);
+            break;
         case GGML_OP_REPEAT:
             ggml_cuda_op_repeat(ctx, dst);
             break;
         case GGML_OP_GET_ROWS:
             ggml_cuda_op_get_rows(ctx, dst);
             break;
+        case GGML_OP_SET_ROWS:
+            ggml_cuda_op_set_rows(ctx, dst);
+            break;
         case GGML_OP_DUP:
             ggml_cuda_dup(ctx, dst);
             break;
@@ -4204,6 +4212,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 return false;
             }
         } break;
+        case GGML_OP_SET_ROWS:
+            {
+                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
+                        op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
+                        op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
+                       op->src[0]->type == GGML_TYPE_F32 &&
+                       (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
+            } break;
         case GGML_OP_CPY:
             {
                 ggml_type src0_type = op->src[0]->type;
@@ -4260,6 +4276,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             }
             return false;
         } break;
+        case GGML_OP_ARGMAX:
+            return true;
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
         case GGML_OP_CONCAT:

ggml/src/ggml-cuda/argmax.cu

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+#include <algorithm>
+#include <cstdint>
+
+#include "argmax.cuh"
+#include "common.cuh"
+
+static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
+    const int64_t row = blockIdx.x;
+
+    float maxval = -FLT_MAX;
+    int   argmax = -1;
+    const float * rowx = x + row * ncols;
+
+    for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) {
+        const float val = rowx[col];
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
+        }
+    }
+
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+        const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
+        }
+    }
+
+    const int n_warps = blockDim.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    if (n_warps > 1) {
+        constexpr int    max_warps = 1024 / WARP_SIZE;
+        __shared__ float shared_maxval[max_warps];
+        __shared__ int   shared_argmax[max_warps];
+        if (lane_id == 0) {
+            shared_maxval[warp_id] = maxval;
+            shared_argmax[warp_id] = argmax;
+        }
+
+        __syncthreads();
+
+        if (warp_id == 0) {
+            if (lane_id < n_warps) {
+                maxval = shared_maxval[lane_id];
+                argmax = shared_argmax[lane_id];
+            }
+#pragma unroll
+            for (int offset = 16; offset > 0; offset >>= 1) {
+                const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+                const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+                if (val > maxval) {
+                    maxval = val;
+                    argmax = col;
+                }
+            }
+        }
+    }
+
+    if (warp_id == 0 && lane_id == 0) {
+        dst[row] = argmax;
+    }
+}
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00  = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const float * src0_d = (const float *) src0->data;
+    int32_t     * dst_d  = (int32_t     *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t num_blocks  = nrows;
+    const int64_t num_threads = std::min<int64_t>(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE);
+    const dim3 blocks_dim (num_threads, 1, 1);
+    const dim3 blocks_num (num_blocks,  1, 1);
+
+    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00);
+}

ggml/src/ggml-cuda/argmax.cuh

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
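The two new files above implement the pre-existing graph-level op GGML_OP_ARGMAX on the CUDA backend: one thread block per row, a per-thread scan over the columns, a warp-shuffle reduction, and, when more than one warp is used, a final reduction through shared memory. Nothing changes on the graph side; a hedged sketch of the call this kernel serves (the helper name is invented for illustration, ggml_argmax() itself is existing ggml API):

    // logits: a 2D F32 tensor of shape [n_vocab, n_tokens]; the result is an I32
    // tensor of shape [n_tokens] holding, per row, the column index of the maximum.
    static struct ggml_tensor * greedy_pick(struct ggml_context * ctx, struct ggml_tensor * logits) {
        return ggml_argmax(ctx, logits); // with this commit, offloadable to the CUDA backend
    }
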

ggml/src/ggml-cuda/argsort.cu

Lines changed: 44 additions & 10 deletions
@@ -13,9 +13,20 @@ static inline __device__ void ggml_cuda_swap(T & a, T & b) {
     b = tmp;
 }
 
-template<ggml_sort_order order>
-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad,
-        int min_experts, float thresh_experts) {
+struct store_ser {
+    constexpr static bool has_thresh = true;
+    int min_experts;
+    float thresh_experts;
+    store_ser(int min, float thresh) : min_experts(min), thresh_experts(thresh) {}
+};
+
+struct store {
+    constexpr static bool has_thresh = false;
+};
+
+template<ggml_sort_order order, typename Store>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad, Store s) {
+        // int min_experts, float thresh_experts) {
     // bitonic sort
     int col = threadIdx.x;
     int row = blockIdx.y;
@@ -58,19 +69,30 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
         }
     }
 
-    if (min_experts >= 0 && min_experts < ncols && thresh_experts > 0) {
+    if constexpr (Store::has_thresh) {
         __syncthreads();
         float max_val = x_row[dst_row[0]];
         if (col < ncols) {
-            dst[row * ncols + col] = col < min_experts || x_row[dst_row[col]] >= thresh_experts*max_val ? dst_row[col] : -1;
+            dst[row * ncols + col] = col < s.min_experts || x_row[dst_row[col]] >= s.thresh_experts*max_val ? dst_row[col] : -1;
         }
-    }
-    else {
-        // copy the result to dst without the padding
+    } else {
         if (col < ncols) {
             dst[row * ncols + col] = dst_row[col];
         }
     }
+    //if (min_experts >= 0 && min_experts < ncols && thresh_experts > 0) {
+    //    __syncthreads();
+    //    float max_val = x_row[dst_row[0]];
+    //    if (col < ncols) {
+    //        dst[row * ncols + col] = col < min_experts || x_row[dst_row[col]] >= thresh_experts*max_val ? dst_row[col] : -1;
+    //    }
+    //}
+    //else {
+    //    // copy the result to dst without the padding
+    //    if (col < ncols) {
+    //        dst[row * ncols + col] = dst_row[col];
+    //    }
+    //}
 }
 
 static int next_power_of_2(int x) {
@@ -94,9 +116,21 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
     GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
 
     if (order == GGML_SORT_ORDER_ASC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad, min_experts, thresh_experts);
+        if (min_experts >= 0 && min_experts < ncols && thresh_experts > 0) {
+            k_argsort_f32_i32<GGML_SORT_ORDER_ASC, store_ser><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad,
+                    {min_experts, thresh_experts});
+        } else {
+            k_argsort_f32_i32<GGML_SORT_ORDER_ASC, store><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad, {});
+        }
+        //k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad, min_experts, thresh_experts);
     } else if (order == GGML_SORT_ORDER_DESC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad, min_experts, thresh_experts);
+        if (min_experts >= 0 && min_experts < ncols && thresh_experts > 0) {
+            k_argsort_f32_i32<GGML_SORT_ORDER_DESC, store_ser><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad,
+                    {min_experts, thresh_experts});
+        } else {
+            k_argsort_f32_i32<GGML_SORT_ORDER_DESC, store><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad, {});
+        }
+        //k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad, min_experts, thresh_experts);
     } else {
         GGML_ABORT("fatal error");
     }
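The argsort change above swaps the two runtime kernel arguments (min_experts, thresh_experts) for a Store template parameter whose has_thresh member is a compile-time constant, so if constexpr drops the unused branch from each kernel instantiation instead of evaluating the condition per thread; the host side then picks which instantiation to launch. A standalone, host-only sketch of the same tag-dispatch pattern (all names are invented for illustration, not from the commit):

    #include <cstdio>

    struct with_threshold {
        constexpr static bool has_thresh = true;
        float thresh;
    };

    struct plain {
        constexpr static bool has_thresh = false;
    };

    template <typename Store>
    static void write_indices(const float * vals, int * out, int n, Store s) {
        for (int i = 0; i < n; ++i) {
            if constexpr (Store::has_thresh) {
                out[i] = vals[i] >= s.thresh ? i : -1;   // thresholded variant, compiled only for with_threshold
            } else {
                out[i] = i;                              // plain variant, compiled only for plain
            }
        }
    }

    int main() {
        const float vals[4] = {0.1f, 0.9f, 0.4f, 0.7f};
        int out[4];
        write_indices(vals, out, 4, with_threshold{0.5f}); // keeps indices 1 and 3, masks the rest with -1
        std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
        write_indices(vals, out, 4, plain{});              // plain copy of the indices 0..3
        std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
        return 0;
    }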

0 comments