@@ -73,8 +73,87 @@ static inline float op_log(float x) {
     return logf(x);
 }
 
+template <float (*op)(float), typename src0_t, typename dst_t>
+static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t>::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+    }
+}
+
+template <float (*op)(float), typename src0_t, typename dst_t>
+static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        dst_t * dst_ptr  = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+        vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float)>
+static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /*  */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+template <float (*op)(float, ggml_tensor *)>
+static void unary_op_params(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /*  */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+// Extend vec_unary_op to support functors
 template <typename Op, typename src0_t, typename dst_t>
-static inline void vec_unary_op(const Op & op, int64_t n, dst_t * y, const src0_t * x) {
+static inline void vec_unary_op_functor(int64_t n, dst_t * y, const src0_t * x, Op op) {
     constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
     constexpr auto f32_to_dst  = type_conversion_table<dst_t>::from_f32;
 
@@ -83,8 +162,9 @@ static inline void vec_unary_op(const Op & op, int64_t n, dst_t * y, const src0_
     }
 }
 
+// Extend apply_unary_op to support functors
 template <typename Op, typename src0_t, typename dst_t>
-static void apply_unary_op(const Op& op, const ggml_compute_params * params, ggml_tensor * dst) {
+static void apply_unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
     const ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
@@ -104,25 +184,25 @@ static void apply_unary_op(const Op& op, const ggml_compute_params * params, ggm
         dst_t * dst_ptr  = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
     const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
 
-        vec_unary_op<decltype(op), src0_t, dst_t>(op, ne0, dst_ptr, src0_ptr);
+        vec_unary_op_functor(ne0, dst_ptr, src0_ptr, op);
     }
 }
 
-// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+// Generic dispatcher for functors
 template <typename Op>
-static void unary_op(const Op& op, const ggml_compute_params * params, ggml_tensor * dst) {
+static void unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
     const ggml_tensor * src0 = dst->src[0];
 
     /*  */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
-        apply_unary_op<decltype(op), float, float>(op, params, dst);
+        apply_unary_op_functor<Op, float, float>(params, dst, op);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
-        apply_unary_op<decltype(op), ggml_fp16_t, ggml_fp16_t>(op, params, dst);
+        apply_unary_op_functor<Op, ggml_fp16_t, ggml_fp16_t>(params, dst, op);
     } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
-        apply_unary_op<decltype(op), ggml_bf16_t, ggml_bf16_t>(op, params, dst);
+        apply_unary_op_functor<Op, ggml_bf16_t, ggml_bf16_t>(params, dst, op);
     } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
-        apply_unary_op<decltype(op), ggml_bf16_t, float>(op, params, dst);
+        apply_unary_op_functor<Op, ggml_bf16_t, float>(params, dst, op);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        apply_unary_op<decltype(op), ggml_fp16_t, float>(op, params, dst);
+        apply_unary_op_functor<Op, ggml_fp16_t, float>(params, dst, op);
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
             ggml_type_name(dst->type), ggml_type_name(src0->type));
@@ -131,80 +211,79 @@ static void unary_op(const Op& op, const ggml_compute_params * params, ggml_tens
 }
 
 void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_abs, params, dst);
+    unary_op<op_abs>(params, dst);
 }
 
 void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_sgn, params, dst);
+    unary_op<op_sgn>(params, dst);
 }
 
 void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_neg, params, dst);
+    unary_op<op_neg>(params, dst);
 }
 
 void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_step, params, dst);
+    unary_op<op_step>(params, dst);
 }
 
 void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_tanh, params, dst);
+    unary_op<op_tanh>(params, dst);
 }
 
 void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_elu, params, dst);
+    unary_op<op_elu>(params, dst);
 }
 
 void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_relu, params, dst);
+    unary_op<op_relu>(params, dst);
 }
 
 void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_sigmoid, params, dst);
+    unary_op<op_sigmoid>(params, dst);
 }
 
 void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_hardsigmoid, params, dst);
+    unary_op<op_hardsigmoid>(params, dst);
 }
 
 void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_exp, params, dst);
+    unary_op<op_exp>(params, dst);
 }
 
 void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_hardswish, params, dst);
+    unary_op<op_hardswish>(params, dst);
 }
 
 void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_sqr, params, dst);
+    unary_op<op_sqr>(params, dst);
 }
 
 void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_sqrt, params, dst);
+    unary_op<op_sqrt>(params, dst);
 }
 
 void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_sin, params, dst);
+    unary_op<op_sin>(params, dst);
 }
 
 void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_cos, params, dst);
+    unary_op<op_cos>(params, dst);
 }
 
 void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
-    unary_op(op_log, params, dst);
+    unary_op<op_log>(params, dst);
 }
 
 void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
-    // Get the XIELU parameters from the operation
-    float alpha_n = ggml_get_op_params_f32(dst, 1);
-    float alpha_p = ggml_get_op_params_f32(dst, 2);
+    const float alpha_n = ggml_get_op_params_f32(dst, 1);
+    const float alpha_p = ggml_get_op_params_f32(dst, 2);
     const float beta    = ggml_get_op_params_f32(dst, 3);
     const float eps     = ggml_get_op_params_f32(dst, 4);
 
     const auto xielu_op_params = [alpha_n, alpha_p, beta, eps](float f) {
         return op_xielu(f, alpha_n, alpha_p, beta, eps);
     };
 
-    unary_op(xielu_op_params, params, dst);
+    unary_op_functor(params, dst, xielu_op_params);
 }
 
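For readers skimming the diff: the change splits unary dispatch into two styles, a non-type template parameter for stateless ops (`unary_op<op_abs>`) and a by-value functor argument for ops carrying runtime parameters (`unary_op_functor` with the xielu lambda). The standalone sketch below illustrates that pattern outside ggml; every name in it (`vec_apply`, `vec_apply_functor`, `alpha`) is illustrative only and not part of the ggml API.

// Minimal, self-contained sketch of the two dispatch styles (not ggml code).
#include <cmath>
#include <cstdio>

static inline float op_relu(float x) { return x > 0.0f ? x : 0.0f; }

// Stateless dispatch: the op is a non-type template parameter, so each
// instantiation bakes the function into the loop with no per-element indirection.
template <float (*op)(float)>
static void vec_apply(int n, float * y, const float * x) {
    for (int i = 0; i < n; i++) {
        y[i] = op(x[i]);
    }
}

// Functor dispatch: the op is passed by value, so a capturing lambda can carry
// runtime parameters with it (the role unary_op_functor plays for xielu).
template <typename Op>
static void vec_apply_functor(int n, float * y, const float * x, Op op) {
    for (int i = 0; i < n; i++) {
        y[i] = op(x[i]);
    }
}

int main() {
    const float x[4] = { -2.0f, -0.5f, 0.5f, 2.0f };
    float y[4];

    vec_apply<op_relu>(4, y, x);                  // compile-time op selection
    printf("relu: %g %g %g %g\n", y[0], y[1], y[2], y[3]);

    const float alpha = 0.1f;                     // illustrative runtime parameter
    vec_apply_functor(4, y, x, [alpha](float f) { // capturing lambda, like xielu's
        return f > 0.0f ? f : alpha * (std::exp(f) - 1.0f);
    });
    printf("elu:  %g %g %g %g\n", y[0], y[1], y[2], y[3]);
    return 0;
}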