@@ -122,9 +122,7 @@ static __global__ void rms_norm_f32(

     const float * mul_ptr = nullptr;
     if constexpr (do_multiply) {
-        if (mul != nullptr) {
-            mul_ptr = mul + sample*mul_stride_sample + channel*mul_stride_channel + row*mul_stride_row;
-        }
+        mul_ptr = mul + sample*mul_stride_sample + channel*mul_stride_channel + row*mul_stride_row;
     }

     float tmp = 0.0f; // partial sum for thread in warp
@@ -154,7 +152,7 @@ static __global__ void rms_norm_f32(

     for (int col = tid; col < ncols; col += block_size) {
         if constexpr (do_multiply) {
-            dst[col] = scale * x[col] * (mul_ptr ? mul_ptr[col] : 1.0f);
+            dst[col] = scale * x[col] * mul_ptr[col];
         } else {
             dst[col] = scale * x[col];
         }
@@ -335,6 +333,10 @@ static void rms_norm_mul_f32_cuda(
         const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample,
         const float eps, cudaStream_t stream) {
     const dim3 blocks_num(nrows, nchannels, nsamples);
+    if (mul == nullptr) {
+        rms_norm_f32_cuda(x, dst, ncols, nrows, nchannels, nsamples, stride_row, stride_channel, stride_sample, eps, stream);
+        return;
+    }
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
         rms_norm_f32<WARP_SIZE, true><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample);
@@ -443,7 +445,7 @@ void ggml_cuda_op_rms_norm_fused(ggml_backend_cuda_context & ctx, ggml_tensor *
     const float * src0_d = (const float *) rms_norm_src->data;
     const float * mul_d = nullptr;

-    if (mul_tensor->src[0] == dst) {
+    if (mul_tensor->src[0] == dst) {
         mul_d = (float *) mul_tensor->src[1]->data;
     } else if (mul_tensor->src[1] == dst) {
         mul_d = (float *) mul_tensor->src[0]->data;
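
Taken together, the change moves the null handling for `mul` out of the device code: `rms_norm_mul_f32_cuda` now falls back to the plain `rms_norm_f32_cuda` launch when `mul == nullptr`, so the `do_multiply` instantiation of the kernel can dereference `mul_ptr` unconditionally instead of branching per element. A minimal standalone sketch of that dispatch pattern (hypothetical `scale_rows` names, and a constant stand-in for the per-row RMS factor, not the actual ggml code):

```cuda
#include <cuda_runtime.h>

// Compile-time flag: when do_multiply is true, the kernel may assume
// `mul` is non-null, because the host launcher has already screened
// out the nullptr case before choosing this instantiation.
template <bool do_multiply>
static __global__ void scale_rows(const float * x, const float * mul, float * dst, const int ncols) {
    const int   row   = blockIdx.x;
    const float scale = 2.0f; // stand-in for the per-row RMS factor
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        const int i = row*ncols + col;
        if constexpr (do_multiply) {
            dst[i] = scale * x[i] * mul[i]; // no runtime null check in the hot loop
        } else {
            dst[i] = scale * x[i];
        }
    }
}

// Host-side dispatch: decide once per launch rather than once per element.
static void scale_rows_cuda(
        const float * x, const float * mul, float * dst,
        const int ncols, const int nrows, cudaStream_t stream) {
    if (mul == nullptr) {
        scale_rows<false><<<nrows, 256, 0, stream>>>(x, nullptr, dst, ncols);
        return;
    }
    scale_rows<true><<<nrows, 256, 0, stream>>>(x, mul, dst, ncols);
}
```

Because `do_multiply` is a template parameter, the branch is resolved at compile time and each instantiation carries only its own code path; the only runtime check left is the single host-side one per kernel launch.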