
Commit e043815

Merge branch 'master' into modern-bert-support
2 parents: 2522ce8 + b907255

25 files changed: +508 additions, -551 deletions

common/common.h

Lines changed: 3 additions & 3 deletions
@@ -288,9 +288,9 @@ struct common_params {
     float   rope_freq_base   =  0.0f; // RoPE base frequency
     float   rope_freq_scale  =  0.0f; // RoPE frequency scaling factor
     float   yarn_ext_factor  = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor =  1.0f; // YaRN magnitude scaling factor
-    float   yarn_beta_fast   = 32.0f; // YaRN low correction dim
-    float   yarn_beta_slow   =  1.0f; // YaRN high correction dim
+    float   yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
+    float   yarn_beta_fast   = -1.0f; // YaRN low correction dim
+    float   yarn_beta_slow   = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx    = 0;     // YaRN original context length

     // offload params
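Note: the switch to -1.0f defaults presumably turns these three parameters into "unset" sentinels, so that YaRN values stored in a model's GGUF metadata (as the Grok converter below now writes) take effect unless the user overrides them on the command line; the diff itself does not spell this out. A minimal Python sketch of that assumed precedence, using hypothetical helper and key names:

    # Hypothetical sketch of "-1 means unset" resolution; only the precedence
    # idea (CLI override > model metadata > built-in fallback) is implied by the diff.
    LEGACY_DEFAULTS = {"yarn_attn_factor": 1.0, "yarn_beta_fast": 32.0, "yarn_beta_slow": 1.0}

    def resolve_yarn_param(name: str, cli_value: float, model_value: float | None) -> float:
        if cli_value >= 0.0:          # user passed an explicit value
            return cli_value
        if model_value is not None:   # value found in the model's metadata
            return model_value
        return LEGACY_DEFAULTS[name]  # fall back to the old hard-coded default

    resolve_yarn_param("yarn_beta_fast", -1.0, 8.0)   # -> 8.0 (model metadata wins)
    resolve_yarn_param("yarn_beta_fast", -1.0, None)  # -> 32.0 (old default)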

convert_hf_to_gguf.py

Lines changed: 78 additions & 23 deletions
@@ -735,6 +735,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
             # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
             res = "qwen2"
+        if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
+            # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
+            res = "grok-2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"

@@ -2685,57 +2688,109 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield (new_name, data_torch)


-@ModelBase.register("GrokForCausalLM")
+@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
 class GrokModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GROK

     def set_vocab(self):
-        self._set_vocab_sentencepiece()
+        if (self.dir_model / 'tokenizer.model').is_file():
+            self._set_vocab_sentencepiece()
+            return
+
+        if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
+            logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
+            sys.exit(1)
+
+        self._set_vocab_gpt2()

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

     def set_gguf_parameters(self):
         super().set_gguf_parameters()

-    _experts: list[dict[str, Tensor]] | None = None
+        self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
+        self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
+        if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
+            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
+
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+
+        # Treat "original" as "yarn", seems to have been a mistake
+        if self.hparams.get("rope_type") in ("yarn", "original"):
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
+            self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
+            self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
+
+        if temp_len := self.hparams.get("attn_temperature_len"):
+            self.gguf_writer.add_attn_temperature_length(temp_len)
+
+        self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
+        self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
+        self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
+
+    _experts: list[dict[str, list[Tensor]]] | None = None
+    _cur_expert = ""

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors: list[tuple[str, Tensor]] = []
+        is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
+
+        if not is_expert:
+            tensors.append((self.map_tensor_name(name), data_torch))
+
         # process the experts separately
-        if name.find(".moe.") != -1:
+        if is_expert or self._cur_expert:
             n_experts = self.hparams["num_local_experts"]

             assert bid is not None

             if self._experts is None:
                 self._experts = [{} for _ in range(self.block_count)]

-            self._experts[bid][name] = data_torch
+            # concatenate split tensors
+            if name in self._experts[bid]:
+                self._cur_expert = name
+                self._experts[bid][name].append(data_torch)
+                return []
+            elif is_expert:
+                self._cur_expert = name
+                self._experts[bid][name] = [data_torch]
+                return []
+            else:
+                self._cur_expert = ""

-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
+            for bid in range(self.block_count):
+                if len(self._experts[bid]) >= n_experts * 3:
+                    # merge the experts into a single 3d tensor
+                    for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
+                        datas: list[Tensor] = []

-                # merge the experts into a single 3d tensor
-                for wid in ["linear", "linear_1", "linear_v"]:
-                    datas: list[Tensor] = []
+                        for xid in range(n_experts):
+                            ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
+                            if ename not in self._experts[bid]:
+                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
+                            tensor_list = self._experts[bid][ename]
+                            datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
+                            del self._experts[bid][ename]

-                    for xid in range(n_experts):
-                        ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
+                        data_torch = torch.stack(datas, dim=0)

-                    data_torch = torch.stack(datas, dim=0)
+                        merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"

-                    merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
+                        new_name = self.map_tensor_name(merged_name)

-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
+                        yield (new_name, data_torch)

-        return [(self.map_tensor_name(name), data_torch)]
+        yield from tensors


 @ModelBase.register("DbrxForCausalLM")
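Note: a standalone sketch of the expert-merging flow the new modify_tensors implements: shards of a split expert weight are first concatenated along their split dimension, then the per-expert matrices of one projection are stacked into a single 3-D tensor. Tensor names and shapes below are illustrative, not taken from a real Grok checkpoint.

    import torch

    n_experts, n_ff, n_embd = 4, 8, 6

    # pretend each expert's w1 arrived split into two shards along dim 0
    shards = {f"expert.{x}.w1": [torch.randn(n_ff // 2, n_embd) for _ in range(2)]
              for x in range(n_experts)}

    # 1) concatenate the shards of each expert back into a full matrix
    full = {name: torch.cat(parts, dim=0) for name, parts in shards.items()}

    # 2) stack all experts into one 3-D tensor of shape (n_expert, n_ff, n_embd)
    merged = torch.stack([full[f"expert.{x}.w1"] for x in range(n_experts)], dim=0)

    assert merged.shape == (n_experts, n_ff, n_embd)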

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -159,6 +159,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
     {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
+    {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
 ]
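Note: each chkhsh is a checksum of how the referenced tokenizer encodes a fixed probe text; the converter matches the checksum of the model being converted against this table to choose a pre-tokenizer name. A rough sketch of the idea, with a placeholder probe string rather than the script's actual one:

    import hashlib

    def tokenizer_checksum(tokenizer, probe_text: str) -> str:
        # hash the token IDs produced for a fixed probe string, so tokenizers
        # that split text identically yield the same checksum
        ids = tokenizer.encode(probe_text)
        return hashlib.sha256(str(ids).encode()).hexdigest()

    KNOWN = {
        "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273": "grok-2",
        "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": "qwen2",
    }

    def detect_pretokenizer(tokenizer, probe_text: str) -> str | None:
        return KNOWN.get(tokenizer_checksum(tokenizer, probe_text))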

ggml/src/ggml-cuda/mmf.cuh

Lines changed: 23 additions & 35 deletions
@@ -57,31 +57,33 @@ static __global__ void mul_mat_f(
     T * tile_xy = (T *) compute_base + threadIdx.y*(tile_A::I * tile_k_padded);

     if constexpr (has_ids) {
-        __shared__ int has_any;
-        if (threadIdx.y == 0) {
-            int local_has_any = 0;
-            for (int j = threadIdx.x; j < cols_per_block; j += warp_size) {
-                int slot = -1;
-                for (int k = 0; k < nchannels_dst; ++k) {
-                    const int idv = ids[j*stride_row_id + k*stride_col_id];
-                    if (idv == expert_idx) {
-                        slot = k;
-                        break;
-                    }
-                }
-                if (j < cols_per_block) {
-                    local_has_any |= (slot >= 0);
-                    slot_map[j] = slot;
+        int found = 0;
+
+        for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+            const int32_t * __restrict__ id_row = ids + j*stride_row_id;
+
+            if (threadIdx.x == 0) {
+                slot_map[j] = -1;
+            }
+
+            for (int k = threadIdx.x; k < nchannels_dst; k += warp_size) {
+                int match = id_row[k*stride_col_id] == expert_idx;
+
+                if (match) {
+                    slot_map[j] = k;
+                    found = 1;
+                    break;
                 }
             }
-            has_any = warp_reduce_any(local_has_any);
         }
-        __syncthreads();
-        if (has_any == 0) {
+
+        if (!__syncthreads_or(found)) {
             return;
         }
     }

+
     for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) {
         tile_A A[ntA][warp_size / tile_A::J];
 #pragma unroll

@@ -106,14 +108,7 @@ static __global__ void mul_mat_f(
             if constexpr (!has_ids) {
                 tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
             } else {
-                float val = 0.0f;
-                if (j < cols_per_block) {
-                    const int slot = slot_map[j];
-                    if (slot >= 0) {
-                        val = y[slot*stride_channel_y + j*stride_col_y + col];
-                    }
-                }
-                tile_xy[j0*tile_k_padded + threadIdx.x] = val;
+                tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
             }
         }
     } else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {

@@ -125,14 +120,7 @@ static __global__ void mul_mat_f(
                 const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
                 tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
             } else {
-                float2 tmp = make_float2(0.0f, 0.0f);
-                if (j < cols_per_block) {
-                    const int slot = slot_map[j];
-                    if (slot >= 0) {
-                        const float2 * y2_slot = (const float2 *)(y + slot*stride_channel_y);
-                        tmp = y2_slot[j*stride_col_y + col];
-                    }
-                }
+                float2 tmp = j < cols_per_block && slot_map[j] >= 0 ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
                 tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
             }
         }

@@ -221,7 +209,7 @@ static inline void mul_mat_f_switch_ids(
         const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) {
     if (ids) {
         mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums, block_dims, nbytes_shared_total, stream>>>
-            (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
+            (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
             stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
             sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
     } else {
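Note: a plain-Python model of what the rewritten id search leaves in slot_map: for each column j of the block, slot_map[j] is the first channel k whose id equals the current expert, or -1 if that column does not use it; when no column matches at all, the block exits early (the __syncthreads_or). Shapes and values below are made up for illustration.

    import numpy as np

    expert_idx = 2
    cols_per_block, nchannels_dst = 4, 3

    # ids[j, k]: which expert serves channel k of column j
    ids = np.array([[0, 2, 1],
                    [1, 0, 0],
                    [2, 2, 1],
                    [0, 1, 0]])

    slot_map = np.full(cols_per_block, -1)
    for j in range(cols_per_block):
        for k in range(nchannels_dst):
            if ids[j, k] == expert_idx:
                slot_map[j] = k   # first matching channel wins
                break

    print(slot_map)                     # [ 1 -1  0 -1]
    print(bool((slot_map >= 0).any()))  # True -> this block has work to do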
