graph : use fill instead of scale_bias in grouped expert selection (ggml-org#17867)

CISC · web-flow · commit c8554b66e0ed · 2025-12-08T21:29:59.000+01:00
* use ggml_fill instead of ggml_scale_bias (scale 0.0f, bias -INFINITY) to build the -INFINITY mask in grouped expert selection

* do not explicitly use the _inplace variant (call plain ggml_fill)
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -973,7 +973,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
         // mask out the other groups
         selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
-        selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+        selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
         selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
         cb(selection_probs, "ffn_moe_probs_masked", il);
     }

Original file line number	Diff line number	Diff line change
`@@ -973,7 +973,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(`
`973`	`973`
`974`	`974`	`// mask out the other groups`
`975`	`975`	`selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]`
`976`		`- selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]`
	`976`	`+ selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]`
`977`	`977`	`selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]`
`978`	`978`	`cb(selection_probs, "ffn_moe_probs_masked", il);`
`979`	`979`	`}`