
Commit d7c3049

Author: prima (committed)
Merge branch 'remoteManagement' of https://github.com/esolithe/koboldcpp into remoteManagement
2 parents: ffc7210 + 0b83819

33 files changed (+722, -181 lines)

Makefile

Lines changed: 30 additions & 28 deletions
Large diffs are not rendered by default.

convert_hf_to_gguf.py

Lines changed: 123 additions & 7 deletions
@@ -3328,7 +3328,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
+        assert self.hparams_vision is not None
+        if isinstance(self.hparams_vision['image_size'], list):
+            self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0]
+        if isinstance(self.hparams_vision['patch_size'], list):
+            self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0]
         super().set_gguf_parameters()
+
         hparams = self.hparams
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
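Note: the list-to-scalar reduction above can be exercised in isolation with a tiny standalone sketch; the hparams values below are hypothetical and only illustrate the shape of the data, not any particular model config.

    # Hypothetical vision hparams for illustration; real configs may store scalars or lists.
    hparams_vision = {"image_size": [448, 448], "patch_size": [14, 14]}

    # Same reduction as in the hunk above: if a dimension comes as a list, keep the first entry.
    for key in ("image_size", "patch_size"):
        if isinstance(hparams_vision[key], list):
            hparams_vision[key] = hparams_vision[key][0]

    print(hparams_vision)  # {'image_size': 448, 'patch_size': 14}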
@@ -3352,14 +3358,30 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
             return gguf.GGMLQuantizationType.F32
         return False

+    def _mapping_interns1_name(self, name):
+        names_map = {
+            "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
+            "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
+            "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
+            "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
+            "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
+            "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
+        }
+        if name in names_map:
+            name = names_map[name]
+        return name
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
-        if name.startswith("vision_model") or name.startswith("mlp"):
+        vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector']
+        # deal with intern-s1 special case
+        name = self._mapping_interns1_name(name)
+        if any([name.startswith(prefix) for prefix in vision_prefix]):
             # process visual tensors
             # correct name
             if name.startswith("vision_model"):
                 name = "vision_tower." + name
-            if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
+            if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"):
                 name += ".weight"
             # split QKV tensors if needed
             if ".qkv." in name:
@@ -3445,6 +3467,10 @@ def set_gguf_parameters(self):

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
+        name = name.replace("language_model.", "")  # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
+            # skip visual tensors
+            return []
         if name.find("experts") != -1:
             n_experts = self.hparams["num_experts"]
             assert bid is not None
@@ -3498,6 +3524,85 @@ class Qwen3Model(Qwen2Model):
 class Qwen3MoeModel(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3MOE

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # deal with intern-s1
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
+        additional_special_tokens = []
+        if special_tokens_map_file.is_file():
+            with open(special_tokens_map_file, encoding = 'utf-8') as f:
+                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
+        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
+        if tokenizer_cfg_file.is_file():
+            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
+                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
+                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
+                for token in additional_special_tokens:
+                    if token in token2ids_map:
+                        special_vocab._set_special_token(token, token2ids_map[token])
+        special_vocab._set_special_token('eos', 151645)
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+

 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
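The normalization round-trip used above for non-normalized added tokens can be reproduced on its own; a minimal sketch, assuming a local Hugging Face tokenizer directory (the path is a placeholder).

    from transformers import AutoTokenizer

    # Placeholder path; any tokenizer with non-normalized added tokens behaves the same way.
    tokenizer = AutoTokenizer.from_pretrained("/path/to/model", trust_remote_code=True)

    def normalize_added_token(token: str) -> str:
        # Encode then decode the token so downstream consumers (here: the GGUF vocab
        # consumed by llama.cpp) see the normalized surface form, mirroring
        # _set_vocab_interns1 above.
        return tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))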
@@ -7997,15 +8102,13 @@ def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         blocks0: Tensor = torch.zeros(1)
         blocks1: Tensor = torch.zeros(1)
-        found_mxfp4_tensors = False
         # we assume that tensors are loaded in the correct order
         for name, data_torch in self.get_tensors():
             if "mlp.experts.down_proj_blocks" in name:
                 blocks0 = data_torch
             elif "mlp.experts.down_proj_scales" in name:
                 new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
                 self.repack_mxfp4(new_name, blocks0, data_torch)
-                found_mxfp4_tensors = True
             elif "mlp.experts.gate_up_proj_blocks" in name:
                 blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
             elif "mlp.experts.gate_up_proj_scales" in name:
@@ -8014,9 +8117,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
                 self.repack_mxfp4(new_name_gate, blocks0, scales0)
                 self.repack_mxfp4(new_name_up, blocks1, scales1)
-                found_mxfp4_tensors = True
-        if not found_mxfp4_tensors:
-            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
         return []

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -8029,7 +8129,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if "down_proj" in name:
             if name.endswith("_bias"):
                 name = name.replace("down_proj_bias", "down_proj.bias")
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name = name.replace("down_proj", "down_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []

         # split the gate_up into gate and up
@@ -8042,7 +8147,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                     (self.map_tensor_name(name_gate), gate_proj_bias),
                     (self.map_tensor_name(name_up), up_proj_bias)
                 ]
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name_up = name.replace("gate_up_proj", "up_proj.weight")
+                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
+                gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
+                return [
+                    (self.map_tensor_name(name_gate), gate_proj_weight),
+                    (self.map_tensor_name(name_up), up_proj_weight)
+                ]
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []

         return [(self.map_tensor_name(name), data_torch)]
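The [:, ::2, :] and [:, 1::2, :] slices above de-interleave a fused gate_up projection into separate gate and up weights. A small standalone check with a dummy tensor; the shapes are invented for illustration and do not correspond to a real gpt-oss checkpoint.

    import torch

    # Dummy fused tensor with gate and up rows interleaved along dim 1.
    n_experts, n_ff, n_embd = 2, 4, 8
    gate_up = torch.arange(n_experts * 2 * n_ff * n_embd, dtype=torch.float32)
    gate_up = gate_up.reshape(n_experts, 2 * n_ff, n_embd)

    # Even rows -> gate_proj, odd rows -> up_proj, matching the conversion above.
    gate, up = gate_up[:, ::2, :], gate_up[:, 1::2, :]
    assert gate.shape == up.shape == (n_experts, n_ff, n_embd)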

ggml/src/ggml-blas/ggml-blas.cpp

Lines changed: 4 additions & 4 deletions
@@ -281,10 +281,10 @@ ggml_backend_t ggml_backend_blas_init(void) {
     ggml_backend_blas_context * ctx = new ggml_backend_blas_context;

     ggml_backend_t backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_blas_guid(),
-        /* .interface = */ blas_backend_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
-        /* .context   = */ ctx,
+        /* .guid      = */ ggml_backend_blas_guid(),
+        /* .iface     = */ blas_backend_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
+        /* .context   = */ ctx,
     };

 #if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 4 additions & 4 deletions
@@ -214,10 +214,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     ctx->abort_callback_data = NULL;

     ggml_backend_t cpu_backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_cpu_guid(),
-        /* .interface = */ ggml_backend_cpu_i,
-        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context   = */ ctx,
+        /* .guid      = */ ggml_backend_cpu_guid(),
+        /* .iface     = */ ggml_backend_cpu_i,
+        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context   = */ ctx,
     };

     if (cpu_backend == NULL) {

ggml/src/ggml-cpu/repack.cpp

Lines changed: 8 additions & 3 deletions
@@ -1707,8 +1707,13 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     // instance for IQ4
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;

+    bool permit_repack = true;
+#if defined(GGML_USE_CLBLAST)
+    permit_repack = false; //kcpp: clblast cannot handle repacking
+#endif
+
     if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if ((ggml_cpu_has_avx2() && permit_repack) || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_0_8x8_q8_0;
             }
@@ -1724,13 +1729,13 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
             }
         }
     } else if (cur->type == GGML_TYPE_Q4_K) {
-        if (ggml_cpu_has_avx2()) {
+        if (ggml_cpu_has_avx2() && permit_repack) {
             if (cur->ne[1] % 8 == 0) {
                 return &q4_K_8x8_q8_K;
             }
         }
     } else if (cur->type == GGML_TYPE_Q2_K) {
-        if (ggml_cpu_has_avx512()) {
+        if (ggml_cpu_has_avx512() && permit_repack) {
             if (cur->ne[1] % 8 == 0) {
                 return &q2_K_8x8_q8_K;
             }
