
Commit 5910f24

Merge branch 'master' into esocrok
2 parents: e1fa23b + 2f68ce7

File tree: 135 files changed (+15732, -14242 lines)

Large commits have some content hidden by default; only a subset of the changed files is shown below.

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -2053,7 +2053,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.system_prompt.pop_back();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",

convert_hf_to_gguf.py

Lines changed: 61 additions & 0 deletions
@@ -1054,6 +1054,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
             # ref: https://huggingface.co/ibm-granite/granite-docling-258M
             res = "granite-docling"
+        if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
+            # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
+            res = "minimax-m2"

         if res is None:
             logger.warning("\n")
@@ -7126,6 +7129,64 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("MiniMaxM2ForCausalLM")
+class MiniMaxM2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MINIMAXM2
+    _experts_cache: dict[int, dict[str, Tensor]] = {}
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["num_local_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif self.hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+        self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # merge expert weights
+        if 'experts' in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            expert_cache = self._experts_cache.setdefault(bid, {})
+            expert_cache[name] = data_torch
+            expert_weights = ["w1", "w2", "w3"]
+
+            # not enough expert weights to merge
+            if len(expert_cache) < n_experts * len(expert_weights):
+                return []
+
+            tensors: list[tuple[str, Tensor]] = []
+            for w_name in expert_weights:
+                datas: list[Tensor] = []
+
+                for xid in range(n_experts):
+                    ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                    datas.append(expert_cache[ename])
+                    del expert_cache[ename]
+
+                data_torch = torch.stack(datas, dim=0)
+                merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+                new_name = self.map_tensor_name(merged_name)
+                tensors.append((new_name, data_torch))
+
+            del self._experts_cache[bid]
+            return tensors
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Dots1ForCausalLM")
 class Dots1Model(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.DOTS1

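A brief illustration of the merge step that MiniMaxM2Model.modify_tensors performs above: once all per-expert w1/w2/w3 matrices of a layer have been cached, they are stacked along a new leading dimension so the GGUF file stores one 3D tensor per projection instead of num_experts separate 2D tensors. The sketch below uses placeholder shapes, not MiniMax-M2's real dimensions.

```python
import torch

n_experts, n_ff, n_embd = 4, 8, 6

# stand-ins for model.layers.{bid}.block_sparse_moe.experts.{xid}.w1.weight
per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# merged tensor corresponding to ...block_sparse_moe.experts.w1.weight
merged = torch.stack(per_expert, dim=0)
assert merged.shape == (n_experts, n_ff, n_embd)
```
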
convert_hf_to_gguf_update.py

Lines changed: 2 additions & 1 deletion
@@ -141,6 +141,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
     {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
     {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
+    {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -435,7 +436,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
         else:
             tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
+    except (OSError, TypeError) as e:
         logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
         continue  # Skip this model and continue with the next one in the loop

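For context on the chkhsh values used by these two scripts: convert_hf_to_gguf_update.py downloads each listed tokenizer and hashes the token IDs it produces for a fixed probe string, and convert_hf_to_gguf.py later matches that hash to select the pre-tokenizer ("minimax-m2" above). A rough sketch of the idea, with a placeholder probe string rather than the script's real one:

```python
from hashlib import sha256
from transformers import AutoTokenizer

# assumes the MiniMax-M2 tokenizer can be fetched from the Hub or a local copy
tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-M2")

probe_text = "Hello, world! \n\t 3.14"  # placeholder; the real script uses a long fixed probe string
token_ids = tokenizer.encode(probe_text)

# hashing the token ID list fingerprints the pre-tokenizer behaviour; with the
# script's real probe text this would reproduce the f4f37b6c... value added above
chkhsh = sha256(str(token_ids).encode()).hexdigest()
print(chkhsh)
```
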
ggml/src/ggml-cuda/common.cuh

Lines changed: 9 additions & 1 deletion
@@ -229,6 +229,11 @@ static const char * cu_get_error_str(CUresult err) {
 #define AMD_MFMA_AVAILABLE
 #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

+// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+#define VOLTA_MMA_AVAILABLE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define TURING_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -283,7 +288,10 @@ static bool amd_mfma_available(const int cc) {
 #endif //!defined(GGML_HIP_NO_MMQ_MFMA)
 }

-// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
+static bool volta_mma_available(const int cc) {
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
+}
+
 static bool turing_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 26 additions & 0 deletions
@@ -29,6 +29,7 @@ bool g_mul_mat_q = true;
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmvf.cuh"
 #include "ggml-cuda/mmvq.cuh"
+#include "ggml-cuda/moe-expert-reduce.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
 #include "ggml-cuda/opt-step-sgd.cuh"
@@ -3182,6 +3183,31 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }

+        if (node->op == GGML_OP_MUL) {
+            int current_node = i + 1;
+            int num_views = 0;
+            int num_adds = 0;
+            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
+                num_views++;
+                current_node++;
+            }
+
+            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
+                   num_adds < num_views - 1) {
+                num_adds++;
+                current_node++;
+            }
+
+            if (num_adds == num_views - 1 && num_views > 0) {
+                ggml_tensor * dst_node = cgraph->nodes[current_node - 1];
+                if (ggml_cuda_should_use_moe_expert_reduce(cgraph, i, current_node)) {
+                    ggml_cuda_op_moe_expert_reduce(*cuda_ctx, node->src[0], node->src[1], dst_node);
+                    i += num_views + num_adds;
+                    continue;
+                }
+            }
+        }
+
         if (node->op == GGML_OP_ADD) {
             int n_fuse = 0;
             ggml_op ops[8];

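The fused path added above matches a MUL of the per-expert outputs with their routing weights, followed by per-expert VIEWs and a chain of ADDs. Numerically, that node pattern is a weighted sum over the experts selected for each token; below is a small PyTorch sketch of the equivalent computation, with placeholder shapes and axis order chosen only for illustration, not the actual ggml tensor layout.

```python
import torch

n_expert_used, n_tokens, n_embd = 4, 3, 8

expert_out = torch.randn(n_expert_used, n_tokens, n_embd)  # outputs of the selected experts
weights = torch.rand(n_expert_used, n_tokens, 1)           # routing weight per expert and token

# MUL + per-expert VIEWs + (n_expert_used - 1) chained ADDs collapse into one weighted reduction
reduced = (expert_out * weights).sum(dim=0)                # shape: (n_tokens, n_embd)

# reference: the unfused chain, accumulated expert by expert
ref = torch.zeros(n_tokens, n_embd)
for e in range(n_expert_used):
    ref += expert_out[e] * weights[e]
assert torch.allclose(reduced, ref)
```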