
Commit dfb96fb

Merge branch 'concedo_experimental' into crokeso

2 parents 6942dbb + abf527a

27 files changed: +1093 −165 lines

convert_hf_to_gguf.py

Lines changed: 153 additions & 5 deletions
@@ -2230,6 +2230,7 @@ def prepare_tensors(self):
     "MixtralForCausalLM",
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
+    "VoxtralForConditionalGeneration",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -2242,6 +2243,11 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

     def set_vocab(self):
+        path_tekken_json = self.dir_model / "tekken.json"
+        path_tokenizer_json = self.dir_model / "tokenizer.json"
+        if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
+            return self.set_vocab_tekken()
+
         try:
             self._set_vocab_sentencepiece()
         except FileNotFoundError:
@@ -2274,6 +2280,52 @@ def set_vocab(self):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)

+    def set_vocab_tekken(self):
+        vocab = gguf.vocab.MistralVocab(self.dir_model)
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
+        with open(template_path, "r", encoding="utf-8") as f:
+            template = f.read()
+        self.gguf_writer.add_chat_template(template)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
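For context, the new set_vocab() branch above only fires for model folders that ship Mistral's tekken.json but no Hugging Face tokenizer.json. A minimal stand-alone sketch of that detection (the model path in the usage comment is hypothetical; only pathlib is used):

```python
from pathlib import Path

def uses_tekken_vocab(model_dir: str) -> bool:
    # Mirrors the check added to LlamaModel.set_vocab(): take the tekken
    # path only when no tokenizer.json is available.
    d = Path(model_dir)
    return (d / "tekken.json").is_file() and not (d / "tokenizer.json").is_file()

# Hypothetical usage:
# uses_tekken_vocab("models/Voxtral-Mini-3B-2507")  # True for a tekken-only export
```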
@@ -2301,12 +2353,13 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-        is_vision_tensor = "vision_tower" in name \
+        is_multimodal_tensor = "vision_tower" in name \
             or "vision_model" in name \
+            or "audio_tower" in name \
             or "model.connector" in name \
             or "multi_modal_projector" in name

-        if is_vision_tensor:
+        if is_multimodal_tensor:
             return [] # skip vision tensors
         elif self.hf_arch == "LlamaModel":
             name = "model." + name
@@ -7561,9 +7614,10 @@ class WhisperEncoderModel(MmprojModel):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["hidden_size"] = self.hparams["d_model"]
-        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
-        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
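The new guard keeps the Whisper-style key remapping from overwriting configs that already define the generic names, presumably so encoder configs that ship hidden_size/intermediate_size directly pass through untouched. A small stand-alone sketch with made-up values:

```python
# Made-up Whisper-style encoder config; the numbers are illustrative only.
hparams = {"d_model": 1280, "encoder_ffn_dim": 5120, "encoder_attention_heads": 20}

# Same idea as the guarded remap in WhisperEncoderModel.__init__():
# only fill in the generic keys when they are missing.
if "hidden_size" not in hparams and "intermediate_size" not in hparams:
    hparams["hidden_size"] = hparams["d_model"]
    hparams["intermediate_size"] = hparams["encoder_ffn_dim"]
    hparams["num_attention_heads"] = hparams["encoder_attention_heads"]
```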
@@ -7602,9 +7656,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])


+@ModelBase.register("VoxtralForConditionalGeneration")
+class VoxtralWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False # no vision encoder
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
+        self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
+
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1
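The hard-coded stack factor in VoxtralWhisperEncoderModel is tied to the encoder geometry by the inline comment. A quick worked check, using Whisper-large-style dimensions as an assumed example (not read from any Voxtral config):

```python
hidden_size = 1280        # assumed Whisper-large-style encoder width
intermediate_size = 5120  # assumed encoder FFN width
assert intermediate_size // hidden_size == 4  # matches add_audio_stack_factor(4)
```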
@@ -7919,6 +7985,88 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("SmallThinkerForCausalLM")
+class SmallThinkerModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.SMALLTHINKER
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+        if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            self.gguf_writer.add_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (self.hparams.get('moe_primary_router_apply_softmax')):
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+        sliding_window_layout = self.hparams.get("sliding_window_layout")
+        if sliding_window_layout:
+            for i in sliding_window_layout:
+                if i != 0:
+                    sliding_window = self.hparams.get("sliding_window_size")
+                    if sliding_window:
+                        self.gguf_writer.add_sliding_window(sliding_window)
+                    break
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down", "gate", "up"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
 ###### CONVERSION LOGIC ######

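The SmallThinker expert handling above buffers each layer's per-expert 2D weights and, once all of them have arrived, stacks them into one 3D tensor per projection. A minimal sketch of that merge with dummy tensors (the shapes are illustrative, not taken from any real checkpoint):

```python
import torch

n_experts, n_ff, n_embd = 4, 16, 8  # illustrative sizes only

# One 2D weight per expert, as they stream out of the checkpoint shards.
expert_weights = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# Merge into a single 3D tensor with the expert index as the leading dim,
# which is what modify_tensors() emits under the merged tensor name.
merged = torch.stack(expert_weights, dim=0)
assert merged.shape == (n_experts, n_ff, n_embd)
```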
expose.cpp

Lines changed: 4 additions & 0 deletions
@@ -272,6 +272,10 @@ extern "C"
     {
         return audio_multimodal_supported;
     }
+    bool has_vision_support()
+    {
+        return vision_multimodal_supported;
+    }
     float get_last_eval_time() {
         return last_eval_time;
     }
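The new export lets a frontend ask the loaded backend whether vision multimodal support is available, alongside the existing audio query. A hedged ctypes sketch of how a Python caller could probe it (the shared-library filename is hypothetical and depends on how the project was built):

```python
import ctypes

# Hypothetical artifact name; load whatever shared library the build produced.
lib = ctypes.CDLL("./koboldcpp_default.so")

# has_vision_support() is the extern "C" function added in expose.cpp.
lib.has_vision_support.restype = ctypes.c_bool
print("vision multimodal supported:", lib.has_vision_support())
```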

expose.h

Lines changed: 1 addition & 0 deletions
@@ -295,6 +295,7 @@ extern std::string draftmodel_filename;
 extern std::vector<std::string> generated_tokens;
 extern bool generation_finished;
 extern bool audio_multimodal_supported;
+extern bool vision_multimodal_supported;
 extern float last_eval_time;
 extern float last_process_time;
 extern int last_token_count;

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 4 additions & 5 deletions
@@ -174,7 +174,10 @@ static __global__ void flash_attn_vec_ext_f16(
     K += blockIdx.y*D * nb11;
     V += blockIdx.y*D * nb21;
     maskh += blockIdx.y*D;
-    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
+    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D,
+             // Increment pointers after each loop:
+             K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
+
         // Calculate KQ tile and keep track of new maximum KQ values:

         if (mask) {
@@ -291,10 +294,6 @@ static __global__ void flash_attn_vec_ext_f16(
             }
         }

-        K += gridDim.y*D * nb11;
-        V += gridDim.y*D * nb21;
-        maskh += gridDim.y*D;
-
         __syncthreads();
     }

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 4 additions & 5 deletions
@@ -180,7 +180,10 @@ static __global__ void flash_attn_vec_ext_f32(
     K += blockIdx.y*D * nb11;
     V += blockIdx.y*D * nb21;
     maskh += blockIdx.y*D;
-    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
+    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D,
+             // Increment pointers after each loop:
+             K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
+
         // Calculate KQ tile and keep track of new maximum KQ values:

         if (mask) {
@@ -286,10 +289,6 @@ static __global__ void flash_attn_vec_ext_f32(
             }
         }

-        K += gridDim.y*D * nb11;
-        V += gridDim.y*D * nb21;
-        maskh += gridDim.y*D;
-
         __syncthreads();
     }


ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 26 additions & 5 deletions
@@ -500,6 +500,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_rwkv_wkv7_f32;
     vk_pipeline pipeline_opt_step_adamw_f32;
     vk_pipeline pipeline_conv2d_f32;
+    vk_pipeline pipeline_conv2d_f16_f32;
     vk_pipeline pipeline_conv2d_dw_whcn_f32;
     vk_pipeline pipeline_conv2d_dw_cwhn_f32;

@@ -3090,12 +3091,21 @@ static void ggml_vk_load_shaders(vk_device& device) {
             device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
             sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
             { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
+        ggml_vk_create_pipeline(
+            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
     } else {
         ggml_vk_create_pipeline(
             device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
             sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
             { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
             false);
+        ggml_vk_create_pipeline(
+            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
+            false);
     }

     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@@ -6982,9 +6992,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
         return nullptr;
     case GGML_OP_CONV_2D:
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+        if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
             ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
-            return ctx->device->pipeline_conv2d_f32;
+            if (src0->type == GGML_TYPE_F32) {
+                return ctx->device->pipeline_conv2d_f32;
+            } else if (src0->type == GGML_TYPE_F16) {
+                return ctx->device->pipeline_conv2d_f16_f32;
+            }
         }
         return nullptr;
     case GGML_OP_CONV_2D_DW:
@@ -7906,6 +7920,13 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);

+    // Skip empty skip_rows operations. For most ops the empty check at the start
+    // of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst
+    // with empty srcs.
+    if (ggml_is_empty(src0) || ggml_is_empty(src1)) {
+        return;
+    }
+
     ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, {
         (uint32_t)ggml_nelements(src0),
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
@@ -8202,13 +8223,13 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c

 static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
                             const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

     GGML_TENSOR_BINARY_OP_LOCALS

-    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
     GGML_ASSERT(nb0 == sizeof(float));

@@ -10891,7 +10912,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            const vk_device& device = ggml_vk_get_device(ctx->device);
            bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE;
            // Channel-contiguous format is not supported yet.
-           return (op->src[0]->type == GGML_TYPE_F32 &&
+           return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                    op->src[1]->type == GGML_TYPE_F32 &&
                    op->type == GGML_TYPE_F32 &&
                    ggml_is_contiguous(op->src[0]) &&

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 1 addition & 0 deletions
@@ -670,6 +670,7 @@ void process_shaders() {
     string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));

     string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
+    string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});

     string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
     string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
