@@ -211,30 +211,28 @@ void ggml_cuda_op_unary_gated(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
 }
 
-// Functor for XIELU operation with parameters
 struct op_xielu_functor {
     float alpha_n, alpha_p, beta, eps;
 
     __host__ __device__ __forceinline__ op_xielu_functor(float a_n, float a_p, float b, float e)
         : alpha_n(a_n), alpha_p(a_p), beta(b), eps(e) {}
 
     __device__ __forceinline__ float operator()(float x) const {
-        float gate_pos = (x > 0.0f); // positive branch gate
+        const float gate_pos = (x > 0.0f); // positive branch gate
 
         // Positive branch: alpha_p * v^2 + beta * v
-        float y_pos = alpha_p * x * x + beta * x;
+        const float y_pos = alpha_p * x * x + beta * x;
 
         // Negative branch:
-        float min_v_eps = fminf(x, eps); // works fine even if eps < 0
-        float y_neg = (expm1f(min_v_eps) - x) * alpha_n + beta * x;
+        const float min_v_eps = fminf(x, eps); // works fine even if eps < 0
+        const float y_neg = (expm1f(min_v_eps) - x) * alpha_n + beta * x;
 
         // Select the appropriate branch based on the gate
         return gate_pos * y_pos + (1.0f - gate_pos) * y_neg;
     }
 };
 
 // swiglu_oai
-
 template <typename T>
 static __global__ void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, float alpha, float limit) {
     const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;
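For a quick sanity check of the functor's math, here is a minimal host-side sketch (not part of this patch; all names and parameter values are illustrative). It repeats the same XIELU formula with an explicit branch instead of the branchless gate, so the CUDA output can be compared against a CPU reference on a few values.

```cpp
// Hypothetical host-side reference -- not part of this patch.
// Mirrors op_xielu_functor: alpha_p*x^2 + beta*x for x > 0,
// alpha_n*(expm1(min(x, eps)) - x) + beta*x otherwise.
#include <math.h>
#include <stdio.h>

static float xielu_ref(float x, float alpha_n, float alpha_p, float beta, float eps) {
    if (x > 0.0f) {
        // positive branch
        return alpha_p * x * x + beta * x;
    }
    // negative branch (min with eps works even if eps < 0)
    const float min_v_eps = fminf(x, eps);
    return (expm1f(min_v_eps) - x) * alpha_n + beta * x;
}

int main() {
    // example parameter values only -- the real ones come from the model
    const float alpha_n = 0.8f, alpha_p = 0.8f, beta = 0.5f, eps = -1e-6f;
    const float xs[] = { -2.0f, -0.5f, 0.0f, 0.5f, 2.0f };
    for (float x : xs) {
        printf("xielu(%6.2f) = %.6f\n", x, xielu_ref(x, alpha_n, alpha_p, beta, eps));
    }
    return 0;
}
```

Since gate_pos evaluates to exactly 0.0f or 1.0f, the branchless blend in the functor and the explicit if/else above give the same result for finite inputs.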