Commit 57ce374

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/ISSUE_TEMPLATE/010-bug-compilation.yml
#	.github/ISSUE_TEMPLATE/011-bug-results.yml
#	.github/labeler.yml
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	.gitmodules
#	CMakeLists.txt
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/softmax_4_f16.cl
#	ggml/src/ggml-opencl/kernels/softmax_4_f32.cl
#	ggml/src/ggml-opencl/kernels/softmax_f16.cl
#	ggml/src/ggml-opencl/kernels/softmax_f32.cl
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/element_wise.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.last
#	scripts/sync-ggml.sh
#	tests/test-backend-ops.cpp
#	tests/test-c.c
2 parents ac0366a + ef797db commit 57ce374


64 files changed: +2950, -985 lines

convert_hf_to_gguf.py

Lines changed: 111 additions & 4 deletions
@@ -4408,9 +4408,6 @@ def __init__(self, *args, **kwargs):
         ]

     def set_vocab(self):
-        with open(self.dir_model / "chat_template.jinja") as f:
-            # quick hack to make sure chat template is added
-            self.gguf_writer.add_chat_template(f.read())
         super().set_vocab()

     def set_gguf_parameters(self):
@@ -4781,6 +4778,14 @@ def set_gguf_parameters(self):
 class MambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MAMBA

+    def __init__(self, dir_model: Path, *args, **kwargs):
+        # Avoid using AutoConfig for hparams
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                hparams = json.load(f)
+        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+
     def set_vocab(self):
         vocab_size = self.hparams["vocab_size"]
         # Round vocab size to next multiple of 8
@@ -4855,6 +4860,100 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return [(new_name, data_torch)]


+@ModelBase.register("Mamba2ForCausalLM")
+class Mamba2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MAMBA2
+
+    def __init__(self, dir_model: Path, *args, **kwargs):
+        # Avoid using AutoConfig for hparams
+        # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                hparams = json.load(f)
+        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+
+    def set_vocab(self):
+        vocab_size = self.hparams["vocab_size"]
+        # Round vocab size to next multiple of 16
+        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
+        # pad using ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+        self.hparams["vocab_size"] = vocab_size
+
+        if (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+        elif (self.dir_model / "tokenizer.model.v3").is_file():
+            # mamba-codestral
+            raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
+        elif (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # Use the GPT-NeoX tokenizer when no tokenizer files are present
+            self._set_vocab_builtin("gpt-neox", vocab_size)
+
+    def set_gguf_parameters(self):
+        d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+        head_dim = self.find_hparam(["head_dim"], optional=True) or 64
+        n_group = self.find_hparam(["n_groups"], optional=True) or 1
+
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+
+        # Fail early for models which don't have a block expansion factor of 2
+        # TODO: does this really matter?
+        assert d_inner == 2 * d_model
+        assert d_inner % head_dim == 0
+
+        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
+        self.gguf_writer.add_embedding_length(d_model)
+        self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(d_inner // head_dim)
+        self.gguf_writer.add_ssm_group_count(n_group)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+            # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
+            name = name.removeprefix("model.")
+
+        if name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+        elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+            gguf.MODEL_TENSOR.SSM_A,
+            gguf.MODEL_TENSOR.SSM_D,
+        ]):
+            # unsqueeze A to use similar shape semantics as Mamba-1
+            # (D is also unsqueezed, but for more straightforward broadcast internally)
+            data_torch = data_torch.reshape((*data_torch.shape, 1))
+        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+            d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+            d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+            n_group = self.hparams.get("n_groups", 1)
+            data_torch = data_torch.reshape((n_group, d_inner // n_group))
+
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
+
 @ModelBase.register("CohereForCausalLM")
 class CommandR2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.COMMAND_R
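
For reference, the "pad using ceiling division" trick in Mamba2Model.set_vocab rounds the vocabulary size up to the next multiple of pad_vocab without reaching for math.ceil. A minimal standalone sketch of just that arithmetic (the example sizes below are made up and only illustrate the rounding):

def pad_vocab_size(vocab_size: int, pad_to: int = 16) -> int:
    # -(a // -b) is ceiling division for positive ints, i.e. ceil(a / b)
    # ref: https://stackoverflow.com/a/17511341/22827863
    return -(vocab_size // -pad_to) * pad_to

assert pad_vocab_size(50277, 16) == 50288  # rounded up to the next multiple of 16
assert pad_vocab_size(50288, 16) == 50288  # already aligned, unchanged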
@@ -6615,12 +6714,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
     # maybe we should fallback to text model's arch in that case, since not many models have both
     text_config = hparams.get("text_config", {})
     vision_config = hparams.get("vision_config", {})
-    arch = hparams["architectures"][0]
+    arch = None
+    if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
+        arch = arches[0]
+    elif "ssm_cfg" in hparams:
+        # For non-hf Mamba and Mamba2 models
+        arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
+
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
     elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
         arch = vision_config["architectures"][0]
+    if arch is None:
+        raise ValueError("Failed to detect model architecture")
     return arch

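The practical effect of this hunk is that a config.json without an "architectures" list but with an "ssm_cfg" block now resolves to a Mamba/Mamba2 architecture instead of failing with a KeyError. A simplified sketch of just that fallback branch (it omits the text/vision sub-config handling, and the example configs are invented, not tied to any real checkpoint):

def detect_arch(hparams: dict) -> str:
    arch = None
    if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
        arch = arches[0]
    elif "ssm_cfg" in hparams:
        # non-HF Mamba/Mamba2 configs name the layer type rather than the architecture
        arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
    if arch is None:
        raise ValueError("Failed to detect model architecture")
    return arch

print(detect_arch({"ssm_cfg": {"layer": "Mamba2"}}))          # Mamba2ForCausalLM
print(detect_arch({"ssm_cfg": {}}))                           # MambaForCausalLM
print(detect_arch({"architectures": ["LlamaForCausalLM"]}))   # LlamaForCausalLM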

ggml/include/ggml-kompute.h

Lines changed: 0 additions & 50 deletions
This file was deleted.

ggml/include/ggml.h

Lines changed: 51 additions & 7 deletions
@@ -563,6 +563,8 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_GEGLU_ERF,
+        GGML_GLU_OP_GEGLU_QUICK,

         GGML_GLU_OP_COUNT,
     };
@@ -659,6 +661,9 @@ extern "C" {

     // misc

+    GGML_API const char * ggml_version(void);
+    GGML_API const char * ggml_commit(void);
+
     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
     GGML_API int64_t ggml_time_ms(void);
     GGML_API int64_t ggml_time_us(void);
@@ -1157,6 +1162,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_geglu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // A: n columns, r rows,
     // B: n columns, r rows,
     GGML_API struct ggml_tensor * ggml_glu_split(
@@ -1180,6 +1201,16 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_geglu_erf_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
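
As a rough reference for what the two new GLU variants compute element-wise, here is a sketch of the math only. It assumes the same gate-times-value convention as the existing GEGLU op; the erf formulation and the 1.702 constant are common GELU expressions and are not taken from this diff:

import math

def gelu_erf(x: float) -> float:
    # exact GELU, via the error function
    return 0.5 * x * (1.0 + math.erf(x / math.sqrt(2.0)))

def gelu_quick(x: float) -> float:
    # cheap sigmoid-based GELU approximation
    return x / (1.0 + math.exp(-1.702 * x))

def geglu_erf(gate: float, up: float) -> float:
    return gelu_erf(gate) * up

def geglu_quick(gate: float, up: float) -> float:
    return gelu_quick(gate) * up

# the two formulations agree closely for moderate activations
print(geglu_erf(0.5, 2.0), geglu_quick(0.5, 2.0))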
@@ -1523,8 +1554,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    // a    [ne0, ne01, ne02, ne03]
+    // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
+    //
+    // broadcast:
+    //   ne02 % ne12 == 0
+    //   ne03 % ne13 == 0
+    //
     // fused soft_max(a*scale + mask*(ALiBi slope))
-    // mask is optional
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
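
The new comment spells out when a mask can be broadcast across the batch dimensions of ggml_soft_max_ext. A small standalone checker of just those conditions, written as plain Python (shapes follow ggml's [ne0, ne1, ne2, ne3] order; the example shapes are invented):

def softmax_mask_compatible(a_shape, mask_shape) -> bool:
    ne0, ne01, ne02, ne03 = a_shape
    mne0, ne11, ne12, ne13 = mask_shape
    return (
        mne0 == ne0           # same row length
        and ne11 >= ne01      # mask must cover all rows of a
        and ne02 % ne12 == 0  # broadcast over dim 2
        and ne03 % ne13 == 0  # broadcast over dim 3
    )

print(softmax_mask_compatible((128, 32, 8, 4), (128, 32, 4, 1)))  # True: 8 % 4 == 0, 4 % 1 == 0
print(softmax_mask_compatible((128, 32, 8, 4), (128, 32, 3, 1)))  # False: 8 % 3 != 0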
@@ -1987,11 +2024,17 @@ extern "C" {

 #define GGML_KQ_MASK_PAD 64

-    // q:    [n_embd_k, n_batch,     n_head,    1]
-    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
-    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
+    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
+    //
+    // broadcast:
+    //   n_head % n_head_kv == 0
+    //   n_head % ne32 == 0
+    //   ne3    % ne33 == 0
+    //
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
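
Similarly, the updated ggml_flash_attn_ext comment allows a fourth dimension (ne3) and a mask that is broadcast over heads and sequences. A small sketch that computes the padded batch size like the GGML_PAD macro and checks the listed divisibility rules (the concrete values below are illustrative only):

def ggml_pad(x: int, n: int) -> int:
    # round x up to a multiple of n, mirroring the GGML_PAD macro
    return ((x + n - 1) // n) * n

def flash_attn_broadcast_ok(n_head: int, n_head_kv: int, ne3: int, ne32: int, ne33: int) -> bool:
    return n_head % n_head_kv == 0 and n_head % ne32 == 0 and ne3 % ne33 == 0

GGML_KQ_MASK_PAD = 64
print(ggml_pad(100, GGML_KQ_MASK_PAD))           # 128: mask rows are padded to this size
print(flash_attn_broadcast_ok(32, 8, 4, 1, 1))   # True: e.g. grouped-query layout, shared mask
print(flash_attn_broadcast_ok(32, 8, 4, 3, 1))   # False: 32 % 3 != 0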
@@ -2030,7 +2073,8 @@ extern "C" {
             struct ggml_tensor  * dt,
             struct ggml_tensor  * A,
             struct ggml_tensor  * B,
-            struct ggml_tensor  * C);
+            struct ggml_tensor  * C,
+            struct ggml_tensor  * ids);

     // partition into non-overlapping windows with padding if needed
     // example:

ggml/src/ggml-backend-reg.cpp

Lines changed: 0 additions & 8 deletions
@@ -61,10 +61,6 @@
 #include "ggml-cann.h"
 #endif

-#ifdef GGML_USE_KOMPUTE
-#include "ggml-kompute.h"
-#endif
-
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -189,9 +185,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
-#ifdef GGML_USE_KOMPUTE
-        register_backend(ggml_backend_kompute_reg());
-#endif
 #ifdef GGML_USE_CPU
         register_backend(ggml_backend_cpu_reg());
 #endif
@@ -576,7 +569,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("cann", silent, dir_path);
     ggml_backend_load_best("cuda", silent, dir_path);
     ggml_backend_load_best("hip", silent, dir_path);
-    ggml_backend_load_best("kompute", silent, dir_path);
     ggml_backend_load_best("metal", silent, dir_path);
     ggml_backend_load_best("rpc", silent, dir_path);
     ggml_backend_load_best("sycl", silent, dir_path);

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 2 additions & 0 deletions
@@ -2186,6 +2186,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             case GGML_GLU_OP_REGLU:
             case GGML_GLU_OP_GEGLU:
             case GGML_GLU_OP_SWIGLU:
+            case GGML_GLU_OP_GEGLU_ERF:
+            case GGML_GLU_OP_GEGLU_QUICK:
                 {
                     n_tasks = n_threads;
                 } break;
