Skip to content

Commit f037995

Browse files
Merge pull request #89 from menloresearch/update-dev-from-master-2025-05-14-00-08
Sync master with upstream release b5371
2 parents b527a69 + e5c834f commit f037995

33 files changed: +651 additions, -1962 deletions

convert_hf_to_gguf.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5746,11 +5746,20 @@ def set_gguf_parameters(self):
57465746
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
57475747

57485748

5749-
@ModelBase.register("GraniteMoeForCausalLM")
5749+
@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
57505750
class GraniteMoeModel(GraniteModel):
57515751
"""Conversion for IBM's GraniteMoeForCausalLM"""
57525752
model_arch = gguf.MODEL_ARCH.GRANITE_MOE
57535753

5754+
def set_gguf_parameters(self):
5755+
"""GraniteMoeShared uses GraniteMoe parameters plus the following:
5756+
- shared_intermediate_size
5757+
"""
5758+
super().set_gguf_parameters()
5759+
if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
5760+
self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
5761+
logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
5762+
57545763
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
57555764
"""In modeling_granitemoe, the JetMoe implementation of parallel experts
57565765
is used. This essentially merges w1 and w3 into a single tensor with 2x
@@ -5761,12 +5770,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
57615770
if name.endswith("block_sparse_moe.input_linear.weight"):
57625771
ffn_dim = self.hparams["intermediate_size"]
57635772
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
5764-
gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
5773+
gate, up = data_torch.split(ffn_dim, dim=-2)
57655774
return [
57665775
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
57675776
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
57685777
]
57695778

5779+
if name.endswith("shared_mlp.input_linear.weight"):
5780+
ffn_dim = self.hparams["shared_intermediate_size"]
5781+
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
5782+
gate, up = data_torch.split(ffn_dim, dim=-2)
5783+
return [
5784+
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
5785+
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
5786+
]
5787+
57705788
return super().modify_tensors(data_torch, name, bid)
57715789

57725790

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -385,9 +385,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
385385

386386
# Fetch KleidiAI sources:
387387
include(FetchContent)
388-
set(KLEIDIAI_COMMIT_TAG "v1.5.0")
388+
set(KLEIDIAI_COMMIT_TAG "v1.6.0")
389389
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
390-
set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
390+
set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")
391391

392392
if (POLICY CMP0135)
393393
cmake_policy(SET CMP0135 NEW)

ggml/src/ggml-cpu/kleidiai/kernels.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#pragma once
66

77
#include <functional>
8+
#include <variant>
89
#include "ggml.h"
910

1011
enum cpu_feature {

ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
//
44
#include <arm_neon.h>
55
#include <assert.h>
6+
#include <atomic>
67
#include <cfloat>
8+
#include <stdexcept>
79
#include <stdint.h>
810
#include <string.h>
911
#if defined(__linux__)

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4358,7 +4358,7 @@ static bool ggml_metal_encode_node(
43584358
// TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
43594359
// for now avoiding mainly to keep the number of templates/kernels a bit lower
43604360
// these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
4361-
if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
4361+
if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
43624362
switch (src1->type) {
43634363
case GGML_TYPE_F16:
43644364
{

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3887,6 +3887,11 @@ kernel void kernel_flash_attn_ext_vec(
38873887
sm[tiisg] = pm[ic + tiisg];
38883888
}
38893889

3890+
// skip -INF blocks
3891+
if (simd_max(sm[tiisg]) == -INFINITY) {
3892+
continue;
3893+
}
3894+
38903895
// Q*K^T
38913896
{
38923897
// each simdgroup processes 1 query and NE (NW/NL) head elements

gguf-py/gguf/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1905,6 +1905,9 @@ class MODEL_TENSOR(IntEnum):
19051905
MODEL_TENSOR.FFN_GATE_EXP,
19061906
MODEL_TENSOR.FFN_DOWN_EXP,
19071907
MODEL_TENSOR.FFN_UP_EXP,
1908+
MODEL_TENSOR.FFN_GATE_SHEXP,
1909+
MODEL_TENSOR.FFN_UP_SHEXP,
1910+
MODEL_TENSOR.FFN_DOWN_SHEXP,
19081911
],
19091912
MODEL_ARCH.CHAMELEON: [
19101913
MODEL_TENSOR.TOKEN_EMBD,

gguf-py/gguf/tensor_mapping.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ class TensorNameMap:
428428
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
429429
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
430430
"language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
431+
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
431432
),
432433

433434
MODEL_TENSOR.ATTN_Q_NORM: (

Comments (0)