
Commit 71c7d07

Merge branch 'ggml-org:master' into mradermacher
2 parents: 268a78d + bea0452

130 files changed: +14622 −13794 lines


.github/workflows/docker.yml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ jobs:
           # https://github.com/ggml-org/llama.cpp/issues/11888
           #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
           - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
           - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }

convert_hf_to_gguf.py

Lines changed: 61 additions & 0 deletions
@@ -1064,6 +1064,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
             # ref: https://huggingface.co/ibm-granite/granite-docling-258M
             res = "granite-docling"
+        if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
+            # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
+            res = "minimax-m2"

         if res is None:
             logger.warning("\n")
@@ -7136,6 +7139,64 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("MiniMaxM2ForCausalLM")
+class MiniMaxM2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MINIMAXM2
+    _experts_cache: dict[int, dict[str, Tensor]] = {}
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["num_local_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif self.hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+        self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # merge expert weights
+        if 'experts' in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            expert_cache = self._experts_cache.setdefault(bid, {})
+            expert_cache[name] = data_torch
+            expert_weights = ["w1", "w2", "w3"]
+
+            # not enough expert weights to merge
+            if len(expert_cache) < n_experts * len(expert_weights):
+                return []
+
+            tensors: list[tuple[str, Tensor]] = []
+            for w_name in expert_weights:
+                datas: list[Tensor] = []
+
+                for xid in range(n_experts):
+                    ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                    datas.append(expert_cache[ename])
+                    del expert_cache[ename]
+
+                data_torch = torch.stack(datas, dim=0)
+                merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+                new_name = self.map_tensor_name(merged_name)
+                tensors.append((new_name, data_torch))
+
+            del self._experts_cache[bid]
+            return tensors
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Dots1ForCausalLM")
 class Dots1Model(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.DOTS1

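Aside (not part of the commit): the core of MiniMaxM2Model.modify_tensors above is the expert-merge step, where per-expert 2D weight matrices are buffered until a layer is complete and then stacked along a new leading dimension, so each layer exports one 3D tensor per projection (w1/w2/w3) instead of one tensor per expert. A minimal standalone sketch of that step, with made-up shapes:

import torch

n_experts, n_ff, n_embd = 8, 1536, 3072  # illustrative sizes only

# one (n_ff, n_embd) matrix per expert, as the tensors arrive one by one
per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# stack into a single (n_experts, n_ff, n_embd) tensor stored under one merged name
merged = torch.stack(per_expert, dim=0)
assert merged.shape == (n_experts, n_ff, n_embd)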
convert_hf_to_gguf_update.py

Lines changed: 2 additions & 1 deletion
@@ -141,6 +141,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
     {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
     {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
+    {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -435,7 +436,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
         else:
             tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
+    except (OSError, TypeError) as e:
         logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
         continue  # Skip this model and continue with the next one in the loop

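Aside (not part of the commit): the chkhsh value that the new "minimax-m2" branch matches in convert_hf_to_gguf.py is a checksum of the tokenizer's output on a fixed probe string. A simplified sketch of that idea, assuming a sha256-over-token-IDs scheme; the actual probe text is omitted and the path below is hypothetical:

from hashlib import sha256
from transformers import AutoTokenizer

def pretokenizer_hash(model_dir: str, probe_text: str) -> str:
    # hash the token IDs the tokenizer produces for a fixed probe string
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    token_ids = tokenizer.encode(probe_text)
    return sha256(str(token_ids).encode()).hexdigest()

# e.g. pretokenizer_hash("models/tokenizers/minimax-m2", "<probe text>")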
ggml/src/ggml-cuda/common.cuh

Lines changed: 9 additions & 1 deletion
@@ -224,6 +224,11 @@ static const char * cu_get_error_str(CUresult err) {
 #define AMD_MFMA_AVAILABLE
 #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

+// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+#define VOLTA_MMA_AVAILABLE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define TURING_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -278,7 +283,10 @@ static bool amd_mfma_available(const int cc) {
 #endif //!defined(GGML_HIP_NO_MMQ_MFMA)
 }

-// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
+static bool volta_mma_available(const int cc) {
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
+}
+
 static bool turing_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 26 additions & 0 deletions
@@ -27,6 +27,7 @@
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmvf.cuh"
 #include "ggml-cuda/mmvq.cuh"
+#include "ggml-cuda/moe-expert-reduce.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
 #include "ggml-cuda/opt-step-sgd.cuh"
@@ -3184,6 +3185,31 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }

+        if (node->op == GGML_OP_MUL) {
+            int current_node = i + 1;
+            int num_views = 0;
+            int num_adds = 0;
+            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
+                num_views++;
+                current_node++;
+            }
+
+            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
+                   num_adds < num_views - 1) {
+                num_adds++;
+                current_node++;
+            }
+
+            if (num_adds == num_views - 1 && num_views > 0) {
+                ggml_tensor * dst_node = cgraph->nodes[current_node - 1];
+                if (ggml_cuda_should_use_moe_expert_reduce(cgraph, i, current_node)) {
+                    ggml_cuda_op_moe_expert_reduce(*cuda_ctx, node->src[0], node->src[1], dst_node);
+                    i += num_views + num_adds;
+                    continue;
+                }
+            }
+        }
+
         if (node->op == GGML_OP_ADD) {
             int n_fuse = 0;
             ggml_op ops[8];

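Aside (not part of the commit): the new block in evaluate_and_capture_cuda_graph only hands a MUL node to the fused MoE expert reduce when it is followed by a run of VIEW nodes and then exactly one fewer ADD node. A plain-Python rendition of that scan, illustrative only and not the ggml API:

def moe_reduce_candidate(ops: list[str], i: int) -> tuple[bool, int]:
    # starting at a MUL at index i, count the trailing VIEWs, then the ADDs
    cur = i + 1
    num_views = 0
    while cur < len(ops) and ops[cur] == "VIEW":
        num_views += 1
        cur += 1
    num_adds = 0
    while cur < len(ops) and ops[cur] == "ADD" and num_adds < num_views - 1:
        num_adds += 1
        cur += 1
    # fusion candidate only when the pattern MUL, VIEW*n, ADD*(n-1) is complete
    ok = num_views > 0 and num_adds == num_views - 1
    return ok, num_views + num_adds  # nodes the fused kernel lets the loop skip

# moe_reduce_candidate(["MUL", "VIEW", "VIEW", "VIEW", "ADD", "ADD"], 0) -> (True, 5)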