@@ -12965,7 +12965,9 @@ struct llm_build_hybrid_mamba : public llm_graph_context {
1296512965 }
1296612966
1296712967 // For Granite architectures - scale residual
12968- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12968+ if (hparams.f_residual_scale) {
12969+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12970+ }
1296912971 ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
1297012972 cb(ffn_inp, "ffn_inp", il);
1297112973
@@ -13023,7 +13025,9 @@ struct llm_build_hybrid_mamba : public llm_graph_context {
1302313025 }
1302413026
1302513027 // For Granite architectures - scale residual
13026- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
13028+ if (hparams.f_residual_scale) {
13029+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
13030+ }
1302713031 cur = ggml_add(ctx0, cur, ffn_inp);
1302813032 cb(cur, "ffn_out", il);
1302913033
@@ -13047,7 +13051,9 @@ struct llm_build_hybrid_mamba : public llm_graph_context {
1304713051 cur = build_lora_mm(model.output, cur);
1304813052
1304913053 // For Granite architectures - scale logits
13050- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
13054+ if (hparams.f_logit_scale) {
13055+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
13056+ }
1305113057 cb(cur, "result_output", -1);
1305213058 res->t_logits = cur;
1305313059
0 commit comments