Commit 67cf20b

Merge branch 'ggml-org:master' into master
2 parents 8152481 + 3b337b0 commit 67cf20b

26 files changed: +5025 -1914 lines changed

.devops/musa.Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.2.0
+ARG MUSA_VERSION=rc4.3.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_DEV_CONTAINER=sh-harbor.mthreads.com/haive/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_RUN_CONTAINER=sh-harbor.mthreads.com/haive/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -475,7 +475,7 @@ jobs:

   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
+    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64

     steps:
       - name: Clone

CODEOWNERS

Lines changed: 1 addition & 0 deletions
@@ -103,4 +103,5 @@
 /LICENSE @ggerganov
 /README.md @ggerganov
 /SECURITY.md @ggerganov
+/build-xcframework.sh @danbev
 requirements*.txt @CISC

README.md

Lines changed: 1 addition & 0 deletions
@@ -178,6 +178,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
+- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)

ci/README-MUSA.md

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
+    mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
 ```

 Inside the container, execute the following commands:

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -738,7 +738,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //

-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

 static std::string llm_ffn_exps_block_regex(int idx) {
     return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
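
The widened pattern keeps matching the regular routed-expert tensors and now also matches GroveMoE's chunk-expert tensors (`ffn_*_chexps`). A minimal sketch of what the combined pattern from `llm_ffn_exps_block_regex()` accepts, written in Python for illustration (the tensor names below are assumed examples, not taken from a real model):

```python
import re

# Same pattern that llm_ffn_exps_block_regex(3) would produce:
# "blk\.3" + LLM_FFN_EXPS_REGEX
pattern = re.compile(r"blk\.3\.ffn_(up|down|gate)_(ch|)exps")

# Assumed example tensor names (illustrative only):
print(bool(pattern.search("blk.3.ffn_up_exps.weight")))      # True  - regular experts
print(bool(pattern.search("blk.3.ffn_gate_chexps.weight")))  # True  - chunk experts (new)
print(bool(pattern.search("blk.3.ffn_up.weight")))           # False - dense FFN tensor
```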

convert_hf_to_gguf.py

Lines changed: 115 additions & 0 deletions
@@ -7995,6 +7995,121 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
+class GroveMoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GROVEMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
+        self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
+        self.gguf_writer.add_experts_per_group(2)
+        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
+        self.gguf_writer.add_expert_group_scale(0.05)
+        # YaRN is not enabled by default
+        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+    _chunk_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith(".expert_bias"):
+            # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
+            return []
+
+        # process the experts separately
+        if name.find("chunk_experts") != -1:
+            n_experts = self.hparams["num_experts"] // 2  # see add_experts_per_group
+            assert bid is not None
+
+            if self._chunk_experts is None:
+                self._chunk_experts = [{} for _ in range(self.block_count)]
+
+            self._chunk_experts[bid][name] = data_torch
+
+            if len(self._chunk_experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
+                        datas.append(self._chunk_experts[bid][ename])
+                        del self._chunk_experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+        elif name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._chunk_experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
+            if len(chunk_experts) > 0:
+                raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("ChameleonForConditionalGeneration")
 @ModelBase.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(TextModel):
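
The expert-merging logic above is the part most easily misread: the HF checkpoint stores each expert's `up_proj`/`gate_proj`/`down_proj` as a separate 2D tensor, and the converter buffers them per block until all experts have arrived, then stacks them into one 3D tensor per projection. A minimal standalone sketch of that stacking step (the shapes and the single tensor-name pattern are illustrative assumptions, not values from an actual checkpoint):

```python
import torch

# Made-up sizes for illustration: 4 experts, hidden size 8, expert FFN size 16.
n_experts, n_embd, n_ff = 4, 8, 16

# One 2D weight per expert, keyed the way the HF checkpoint names them.
per_expert = {
    f"model.layers.0.mlp.experts.{xid}.up_proj.weight": torch.randn(n_ff, n_embd)
    for xid in range(n_experts)
}

# Collect the per-expert weights in expert order and stack them along a new
# leading dimension, mirroring the torch.stack(datas, dim=0) call above.
stacked = torch.stack(
    [per_expert[f"model.layers.0.mlp.experts.{xid}.up_proj.weight"]
     for xid in range(n_experts)],
    dim=0,
)

print(stacked.shape)  # torch.Size([4, 16, 8]) -> (n_expert, n_ff, n_embd)
```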

docs/docker.md

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment

 The defaults are:

-- `MUSA_VERSION` set to `rc4.2.0`
+- `MUSA_VERSION` set to `rc4.3.0`

 The resulting images, are essentially the same as the non-MUSA images:

ggml/src/ggml-cuda/binbcast.cu

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
     const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
     const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);

-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
+    if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
         return;
     }

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 55 additions & 0 deletions
@@ -45,6 +45,7 @@
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
+#include "ggml-cuda/topk-moe.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/wkv.cuh"
@@ -2825,6 +2826,44 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     GGML_ASSERT(unary_ops.size() == num_unary);
 #endif

+    //TODO: remove special case once ggml_can_fuse can handle empty nodes
+    std::initializer_list<enum ggml_op> topk_moe_ops           = ggml_cuda_topk_moe_ops(false);
+    std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);
+
+    if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
+
+        if (node_idx + topk_moe_ops_with_norm.size() > (size_t)cgraph->n_nodes) {
+            return false;
+        }
+
+        for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
+            if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
+        }
+        ggml_tensor * softmax = cgraph->nodes[node_idx];
+        ggml_tensor * weights = cgraph->nodes[node_idx+8];
+
+        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+            return true;
+        }
+    }
+
+    if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
+
+        if (node_idx + topk_moe_ops.size() > (size_t)cgraph->n_nodes) {
+            return false;
+        }
+
+        for (size_t i = 0; i < topk_moe_ops.size(); i++) {
+            if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
+        }
+
+        ggml_tensor * softmax = cgraph->nodes[node_idx];
+        ggml_tensor * weights = cgraph->nodes[node_idx+4];
+        if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
+            return true;
+        }
+    }
+
     if (!ggml_can_fuse(cgraph, node_idx, ops)) {
         return false;
     }
@@ -2915,6 +2954,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
         if (!disable_fusion) {

+            if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
+                ggml_tensor * weights = cgraph->nodes[i+8];
+                ggml_tensor * selected_experts = cgraph->nodes[i+3];
+                ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true);
+                i += 8;
+                continue;
+            }
+
+            if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
+                ggml_tensor * weights = cgraph->nodes[i+4];
+                ggml_tensor * selected_experts = cgraph->nodes[i+3];
+                ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false);
+                i += 4;
+                continue;
+            }
+
             if (node->op == GGML_OP_ADD) {
                 int n_fuse = 0;
                 ggml_op ops[8];
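
The fusion added here works in two steps: `ggml_cuda_can_fuse()` checks that the ops starting at `node_idx` match one of the two top-k MoE sequences (with or without weight normalization) and that the softmax/weights tensors qualify, and `evaluate_and_capture_cuda_graph()` then launches one fused kernel and skips past the nodes it consumed. A language-agnostic sketch of that pattern-match-and-skip idea, written in Python with made-up op names (the real sequences come from `ggml_cuda_topk_moe_ops()` and operate on `ggml_cgraph` nodes):

```python
# Made-up op sequence standing in for ggml_cuda_topk_moe_ops(); illustrative only.
TOPK_MOE_OPS = ["SOFT_MAX", "RESHAPE", "ARGSORT", "VIEW", "GET_ROWS"]

def can_fuse(nodes: list[str], idx: int, pattern: list[str]) -> bool:
    # Analogous to the bounds check and per-op comparison in ggml_cuda_can_fuse().
    if idx + len(pattern) > len(nodes):
        return False
    return all(nodes[idx + i] == op for i, op in enumerate(pattern))

def evaluate(nodes: list[str]) -> None:
    i = 0
    while i < len(nodes):
        if can_fuse(nodes, i, TOPK_MOE_OPS):
            print(f"launch fused top-k MoE kernel at node {i}")
            i += len(TOPK_MOE_OPS)  # skip the nodes the fused kernel replaced
            continue
        print(f"run {nodes[i]} individually")
        i += 1

evaluate(["MUL_MAT", "SOFT_MAX", "RESHAPE", "ARGSORT", "VIEW", "GET_ROWS", "MUL_MAT"])
```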
