Commit a1f6913

Merge pull request #181 from menloresearch/update-dev-from-master-2025-07-28-00-13
Sync master with upstream release b6002
2 parents: 02af438 + 89d1029

File tree: 10 files changed (+14812 / -13105 lines)

convert_hf_to_gguf.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -3791,7 +3791,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
 
         # Mamba parameters
         self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@@ -3802,7 +3802,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_ssm_group_count(0)
 
         # MLP feed forward parameters (for attention layers)
-        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
         self.gguf_writer.add_file_type(self.ftype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
```

docs/ops.md

Lines changed: 12 additions & 4 deletions
```diff
@@ -2,6 +2,11 @@
 
 List of GGML operations and backend support status.
 
+## How to add a backend to this table:
+
+1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)
+2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`
+
 Legend:
 - ✅ Fully supported by this backend
 - 🟡 Partially supported by this backend
@@ -18,7 +23,8 @@ Legend:
 | ARGSORT |||||
 | CLAMP |||| 🟡 |
 | CONCAT ||| 🟡 ||
-| CONT ||| 🟡 ||
+| CONT |||||
+| CONV_2D |||||
 | CONV_2D_DW |||||
 | CONV_TRANSPOSE_1D |||||
 | CONV_TRANSPOSE_2D |||||
@@ -30,7 +36,7 @@ Legend:
 | DIAG_MASK_INF |||| 🟡 |
 | DIV |||| 🟡 |
 | DUP ||| 🟡 | 🟡 |
-| ELU ||| | 🟡 |
+| ELU ||| 🟡 | 🟡 |
 | EXP ||| 🟡 ||
 | FLASH_ATTN_EXT ||| 🟡 | 🟡 |
 | GATED_LINEAR_ATTN |||||
@@ -66,14 +72,16 @@ Legend:
 | REPEAT_BACK |||||
 | RMS_NORM |||| 🟡 |
 | RMS_NORM_BACK |||||
-| RMS_NORM_MUL |||||
+| RMS_NORM_MUL |||||
+| RMS_NORM_MUL_ADD |||||
+| ROLL |||||
 | ROPE |||||
 | ROPE_BACK |||||
 | RWKV_WKV6 |||||
 | RWKV_WKV7 |||||
 | SCALE |||||
 | SET |||||
-| SET_ROWS || 🟡 | | 🟡 |
+| SET_ROWS || 🟡 | 🟡 | 🟡 |
 | SGN ||| 🟡 ||
 | SIGMOID ||| 🟡 | 🟡 |
 | SILU ||| 🟡 | 🟡 |
```
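
The two steps above rebuild the table from live backend reports: `test-backend-ops support` asks each backend, op by op, whether it can run a concrete node, and `create_ops_docs.py` renders the collected CSVs. Below is a minimal sketch of that per-cell capability query, assuming the public `ggml-backend` device API (`ggml_backend_dev_by_name`, `ggml_backend_dev_supports_op`); the real harness enumerates many ops, types, and shapes.

```cpp
// Hedged sketch: the per-cell question behind docs/ops.md -- can this backend
// device run this concrete op node? The backend name and tensor shape are
// illustrative; a NULL device simply means that backend is not loaded.
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true, // we only build node metadata, no tensor data
    };
    struct ggml_context * ctx = ggml_init(params);

    // Build a representative ELU node (one of the rows updated above).
    struct ggml_tensor * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
    struct ggml_tensor * op = ggml_elu(ctx, x);

    ggml_backend_dev_t dev = ggml_backend_dev_by_name("Vulkan0");
    if (dev != nullptr) {
        printf("ELU: %s\n", ggml_backend_dev_supports_op(dev, op) ? "supported" : "unsupported");
    }

    ggml_free(ctx);
    return 0;
}
```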

docs/ops/CPU.csv

Lines changed: 7349 additions & 6534 deletions
Large diffs are not rendered by default.

docs/ops/CUDA.csv

Lines changed: 7349 additions & 6534 deletions
Large diffs are not rendered by default.

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 26 additions & 5 deletions
```diff
@@ -484,6 +484,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_rwkv_wkv7_f32;
     vk_pipeline pipeline_opt_step_adamw_f32;
     vk_pipeline pipeline_conv2d_f32;
+    vk_pipeline pipeline_conv2d_f16_f32;
     vk_pipeline pipeline_conv2d_dw_whcn_f32;
     vk_pipeline pipeline_conv2d_dw_cwhn_f32;
 
@@ -3074,12 +3075,21 @@ static void ggml_vk_load_shaders(vk_device& device) {
             device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
             sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
             { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
+        ggml_vk_create_pipeline(
+            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
     } else {
         ggml_vk_create_pipeline(
             device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
             sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
             { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
             false);
+        ggml_vk_create_pipeline(
+            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
+            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
+            false);
     }
 
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@@ -6958,9 +6968,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
         return nullptr;
     case GGML_OP_CONV_2D:
-        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+        if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
             ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
-            return ctx->device->pipeline_conv2d_f32;
+            if (src0->type == GGML_TYPE_F32) {
+                return ctx->device->pipeline_conv2d_f32;
+            } else if (src0->type == GGML_TYPE_F16) {
+                return ctx->device->pipeline_conv2d_f16_f32;
+            }
         }
         return nullptr;
     case GGML_OP_CONV_2D_DW:
@@ -7882,6 +7896,13 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
 
+    // Skip empty set_rows operations. For most ops the empty check at the start
+    // of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst
+    // with empty srcs.
+    if (ggml_is_empty(src0) || ggml_is_empty(src1)) {
+        return;
+    }
+
     ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, {
         (uint32_t)ggml_nelements(src0),
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
@@ -8178,13 +8199,13 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
 
 static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
                             const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
     GGML_ASSERT(nb0 == sizeof(float));
 
@@ -10867,7 +10888,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             const vk_device& device = ggml_vk_get_device(ctx->device);
             bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE;
             // Channel-contiguous format is not supported yet.
-            return (op->src[0]->type == GGML_TYPE_F32 &&
+            return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                     op->src[1]->type == GGML_TYPE_F32 &&
                     op->type == GGML_TYPE_F32 &&
                     ggml_is_contiguous(op->src[0]) &&
```
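
Taken together, these hunks teach the Vulkan backend to run `GGML_OP_CONV_2D` with an F16 kernel and F32 activations — the layout many GGUF models ship — where previously only an all-F32 convolution was accepted. The hedged sketch below builds such a mixed-type convolution through the public ggml API; shapes, strides, and padding are illustrative, and depending on ggml version `ggml_conv_2d` may lower to `IM2COL` + `MUL_MAT` rather than the direct `GGML_OP_CONV_2D` node these pipelines serve.

```cpp
// Hedged sketch: an F16-kernel / F32-input 2D convolution, the combination
// enabled on Vulkan by the diff above. CPU graph execution is shown here;
// dispatch to pipeline_conv2d_f16_f32 happens inside the Vulkan backend.
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 64 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // Kernel: KW x KH x Cin x Cout in F16; input: W x H x Cin x N in F32.
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, 8, 16);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 8, 1);

    // stride 1x1, padding 1x1, dilation 1x1 -> same-size F32 output
    struct ggml_tensor * out = ggml_conv_2d(ctx, kernel, input, 1, 1, 1, 1, 1, 1);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    ggml_free(ctx);
    return 0;
}
```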

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -656,6 +656,7 @@ void process_shaders() {
     string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
 
     string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
+    string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
 
     string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
     string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
```

scripts/create_ops_docs.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -112,6 +112,11 @@ def generate_markdown(self) -> str:
         lines.append("")
         lines.append("List of GGML operations and backend support status.")
         lines.append("")
+        lines.append("## How to add a backend to this table:")
+        lines.append("")
+        lines.append("1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)")
+        lines.append("2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`")
+        lines.append("")
         lines.append("Legend:")
         lines.append("- ✅ Fully supported by this backend")
         lines.append("- 🟡 Partially supported by this backend")
```

src/llama-hparams.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -98,7 +98,7 @@ struct llama_hparams {
     float    rope_freq_scale_train;
     float    rope_freq_scale_train_swa;
     uint32_t n_ctx_orig_yarn;
-    float    rope_yarn_log_mul;
+    float    rope_yarn_log_mul = 0.0f;
 
     std::array<int, 4> rope_sections;
```
src/llama-model.cpp

Lines changed: 9 additions & 8 deletions
```diff
@@ -1369,7 +1369,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // that have no expert_gating_func model parameter set
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                 }
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
 
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
```
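
The `false` added here makes the key optional, which is why the `= 0.0f` default in src/llama-hparams.h above matters: when the GGUF file lacks the key, `get_key` returns without writing the destination, so the member must already hold a well-defined value. A hedged sketch of the pattern follows; the loader type and key string are illustrative stand-ins, not the real `llama_model_loader`.

```cpp
// Hedged sketch: optional-key lookup that leaves the destination untouched on
// a miss -- the reason the field needs an in-class default.
#include <map>
#include <stdexcept>
#include <string>

struct toy_loader {
    std::map<std::string, float> kv; // stand-in for GGUF metadata

    bool get_key(const std::string & key, float & dst, bool required = true) {
        auto it = kv.find(key);
        if (it == kv.end()) {
            if (required) { throw std::runtime_error("missing key: " + key); }
            return false; // optional miss: dst is left unchanged
        }
        dst = it->second;
        return true;
    }
};

int main() {
    toy_loader ml; // no yarn_log_multiplier entry present
    float rope_yarn_log_mul = 0.0f; // default from llama-hparams.h
    ml.get_key("rope.scaling.yarn_log_multiplier", rope_yarn_log_mul, /*required=*/false);
    // rope_yarn_log_mul is still 0.0f rather than uninitialized garbage
    return 0;
}
```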
```diff
@@ -16191,7 +16191,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
         {
             // PLaMo-2 uses combined QKV tensor
             ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-            cb(qkv, "qkv", il);
+            cb(qkv, "wqkv", il);
 
             // split QKV tensor into Q, K, V
             const int64_t n_embd_head_q = hparams.n_embd_head_k;
@@ -16231,7 +16231,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
 
-            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
+            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }
 
         cb(cur, "attn_out", il);
```
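
The last numeric argument of `build_attn` is the attention scale; passing `1.0f` skipped the usual normalization. Reading the change, with $d$ = `n_embd_head_v`, the fix restores standard scaled dot-product attention:

$$\operatorname{Attn}(Q, K, V) = \operatorname{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d}}\right) V, \qquad \text{scale} = \frac{1}{\sqrt{d}}$$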
```diff
@@ -16306,8 +16306,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
             ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0, last_conv,
                     ggml_view_1d(ctx0, conv_states_all,
-                        (d_conv - 1)*(d_inner)*(n_seqs),
-                        kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+            cb(conv_states_all, "mamba_conv1d_state", il);
 
             // 1D convolution
             x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
```
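
Reading off the new expression, the rolling convolution buffer per sequence now covers every convolved channel, not just the inner activations — in a Mamba-2-style block that includes the $B$ and $C$ projections:

$$\underbrace{(d_{\text{conv}} - 1)}_{\text{history steps}} \times \underbrace{\bigl(d_{\text{inner}} + 2\, n_{\text{group}}\, d_{\text{state}}\bigr)}_{\text{channels: } x,\ B,\ C} \ \text{elements per sequence}$$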
```diff
@@ -16370,9 +16371,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
             // store last states
             ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
-                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
-                        kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+                    ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
+                    ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
+            cb(ssm_states_all, "mamba_ssm_states", il);
 
             ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
             cb(y, "mamba_y_view", il);
```
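
The state copy is now expressed head-granularly. Both quantities read directly off the new code: the copied view holds the per-sequence recurrent states, and the source offset skips the per-token outputs that precede them inside `y_ssm`:

$$\text{copied elements} = n_{\text{heads}} \cdot h_{\text{dim}} \cdot d_{\text{state}} \cdot n_{\text{seqs}}, \qquad \text{src offset (elements)} = n_{\text{heads}} \cdot h_{\text{dim}} \cdot n_{\text{seq\_tokens}} \cdot n_{\text{seqs}}$$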
