
Commit 6256f9a

feat(ggml-cpu): Add partial implementation of scale for f16
This is used to zero-out the state in build_rs, so it's required to support F16 cache states for recurrent models. The bias route does not get hit in that case, but would need to be implemented if used elsewhere.

Branch: Mamba2SSD

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 7ad0f37 commit 6256f9a
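
For context on how this kernel gets exercised: zeroing a recurrent state comes down to a plain ggml_scale with a factor of 0.0f on an F16 tensor, which before this commit fell through to the GGML_ABORT in the default case of ggml_compute_forward_scale. Below is a minimal, hypothetical sketch of that pattern against the public ggml API, assuming the classic CPU path via ggml_graph_compute_with_ctx; the context size, tensor shape, and thread count are made up for illustration, and this is not the actual build_rs code.

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    // small scratch context (size is arbitrary for this sketch)
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // hypothetical F16 recurrent-state tensor (contents left uninitialized for brevity)
    struct ggml_tensor * state = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 128, 4);

    // scaling by 0.0f zeroes the state; with this commit the F16 case
    // dispatches to ggml_compute_forward_scale_f16 instead of aborting
    struct ggml_tensor * zeroed = ggml_scale(ctx, state, 0.0f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, zeroed);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    ggml_free(ctx);
    return 0;
}

Since no bias is set, only the b == 0.0f branch of the new kernel is hit, which matches the note above that the bias route does not get exercised in this use case.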

1 file changed: +58 -0 lines changed

ggml/src/ggml-cpu/ops.cpp

Lines changed: 58 additions & 0 deletions
@@ -4558,6 +4558,60 @@ static void ggml_compute_forward_scale_f32(
     }
 }
 
+static void ggml_compute_forward_scale_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    float s; // scale factor
+    float b; // bias
+
+    memcpy(&s, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&b, (float *) dst->op_params + 1, sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb1 = dst->nb[1];
+
+    if (b == 0.0f) {
+        for (int i1 = ir0; i1 < ir1; i1++) {
+            if (dst->data != src0->data) {
+                // src0 is same shape as dst => same indices
+                // TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy
+                memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(ggml_fp16_t));
+            }
+            ggml_vec_scale_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*nb1), s);
+        }
+    } else {
+        // TODO: support bias!
+        GGML_ABORT("fatal error");
+        // for (int i1 = ir0; i1 < ir1; i1++) {
+        //     ggml_vec_mad1_f16(nc,
+        //             (ggml_fp16_t *) ((char *) dst->data + i1*nb1),
+        //             (ggml_fp16_t *) ((char *) src0->data + i1*nb1),
+        //             s, b);
+        // }
+    }
+}
+
 void ggml_compute_forward_scale(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
@@ -4569,6 +4623,10 @@ void ggml_compute_forward_scale(
             {
                 ggml_compute_forward_scale_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_scale_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
