@@ -38,148 +38,168 @@ void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst, cons
     }
 }
 
+static __device__ __forceinline__ float op_abs(float x) {
+    return fabsf(x);
+}
+
 void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return fabsf(x);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_abs);
+}
+
+static __device__ __forceinline__ float op_sgn(float x) {
+    return (x > 0.f ? 1.f : ((x < 0.f ? -1.f : 0.f)));
 }
 
 void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return (x > 0.f ? 1.f : ((x < 0.f ? -1.f : 0.f)));
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_sgn);
+}
+
+static __device__ __forceinline__ float op_neg(float x) {
+    return -x;
 }
 
 void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return -x;
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_neg);
+}
+
+static __device__ __forceinline__ float op_step(float x) {
+    return x > 0.0f;
 }
 
 void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return x > 0.0f;
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_step);
+}
+
+static __device__ __forceinline__ float op_gelu(float x) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }
 
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        const float GELU_COEF_A    = 0.044715f;
-        const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-        return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_gelu);
+}
+
+static __device__ __forceinline__ float op_gelu_erf(float x) {
+    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
+    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
 }
 
 void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        const float SQRT_2_INV = 0.70710678118654752440084436210484f;
-        return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_gelu_erf);
+}
+
+static __device__ __forceinline__ float op_gelu_quick(float x) {
+    const float GELU_QUICK_COEF = -1.702f;
+    return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
 }
 
 void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        const float GELU_QUICK_COEF = -1.702f;
-        return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_gelu_quick);
+}
+
+static __device__ __forceinline__ float op_silu(float x) {
+    return x / (1.0f + expf(-x));
 }
 
 void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return x / (1.0f + expf(-x));
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_silu);
+}
+
+static __device__ __forceinline__ float op_tanh(float x) {
+    return tanhf(x);
 }
 
 void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return tanhf(x);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_tanh);
+}
+
+static __device__ __forceinline__ float op_relu(float x) {
+    return fmaxf(x, 0);
 }
 
 void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return fmaxf(x, 0);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_relu);
+}
+
+static __device__ __forceinline__ float op_sigmoid(float x) {
+    return 1.0f / (1.0f + expf(-x));
 }
 
 void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return 1.0f / (1.0f + expf(-x));
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_sigmoid);
+}
+
+static __device__ __forceinline__ float op_hardsigmoid(float x) {
+    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
 }
 
 void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_hardsigmoid);
+}
+
+static __device__ __forceinline__ float op_hardswish(float x) {
+    return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
 }
 
 void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_hardswish);
+}
+
+static __device__ __forceinline__ float op_exp(float x) {
+    return expf(x);
 }
 
 void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return expf(x);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_exp);
+}
+
+static __device__ __forceinline__ float op_sqr(float x) {
+    return x * x;
 }
 
 void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return x * x;
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_sqr);
+}
+
+static __device__ __forceinline__ float op_sqrt(float x) {
+    return sqrtf(x);
 }
 
 void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return sqrtf(x);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_sqrt);
+}
+
+static __device__ __forceinline__ float op_sin(float x) {
+    return sinf(x);
 }
 
 void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return sinf(x);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_sin);
+}
+
+static __device__ __forceinline__ float op_cos(float x) {
+    return cosf(x);
 }
 
 void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return cosf(x);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_cos);
+}
+
+static __device__ __forceinline__ float op_log(float x) {
+    return logf(x);
 }
 
 void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return logf(x);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_log);
+}
+
+static __device__ __forceinline__ float op_elu(float x) {
+    return (x > 0.f) ? x : expm1f(x);
 }
 
 void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return (x > 0.f) ? x : expm1f(x);
-    };
-    ggml_cuda_op_unary(ctx, dst, op);
+    ggml_cuda_op_unary(ctx, dst, op_elu);
 }
 /* gated ops */
 
@@ -258,70 +278,82 @@ void ggml_cuda_op_unary_gated(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
 }
 
+static __device__ __forceinline__ float op_reglu(float x) {
+    return fmaxf(x, 0);
+}
+
 void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return fmaxf(x, 0);
-    };
-    ggml_cuda_op_unary_gated(ctx, dst, op);
+    ggml_cuda_op_unary_gated(ctx, dst, op_reglu);
+}
+
+static __device__ __forceinline__ float op_geglu(float x) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }
 
 void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        const float GELU_COEF_A    = 0.044715f;
-        const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-        return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-    };
-    ggml_cuda_op_unary_gated(ctx, dst, op);
+    ggml_cuda_op_unary_gated(ctx, dst, op_geglu);
+}
+
+static __device__ __forceinline__ float op_swiglu(float x) {
+    return x / (1.0f + expf(-x));
 }
 
 void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        return x / (1.0f + expf(-x));
-    };
-    ggml_cuda_op_unary_gated(ctx, dst, op);
+    ggml_cuda_op_unary_gated(ctx, dst, op_swiglu);
+}
+
+static __device__ __forceinline__ float op_geglu_erf(float x) {
+    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
+    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
 }
 
 void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        const float SQRT_2_INV = 0.70710678118654752440084436210484f;
-        return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
-    };
-    ggml_cuda_op_unary_gated(ctx, dst, op);
+    ggml_cuda_op_unary_gated(ctx, dst, op_geglu_erf);
 }
 
-void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    auto op = [] __device__ (float x) -> float {
-        const float GELU_QUICK_COEF = -1.702f;
-        return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
-    };
-    ggml_cuda_op_unary_gated(ctx, dst, op);
+static __device__ __forceinline__ float op_geglu_quick(float x) {
+    const float GELU_QUICK_COEF = -1.702f;
+    return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
 }
 
-void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    // Get the XIELU parameters from the operation
-    const float * op_params = (const float *)dst->op_params;
-    float alpha_n = op_params[0];
-    float alpha_p = op_params[1];
-    const float beta = op_params[2];
-    const float eps = op_params[3];
+void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary_gated(ctx, dst, op_geglu_quick);
+}
 
-    const auto op = [alpha_n, alpha_p, beta, eps] __device__ (float x) -> float {
-        float out;
+// Functor for XIELU operation with parameters
+struct op_xielu_functor {
+    float alpha_n, alpha_p, beta, eps;
+
+    __host__ __device__ __forceinline__ op_xielu_functor(float a_n, float a_p, float b, float e)
+        : alpha_n(a_n), alpha_p(a_p), beta(b), eps(e) {}
+
+    __device__ __forceinline__ float operator()(float x) const {
         float gate_pos = (x > 0.0f); // positive branch gate
-        float gate_neg = 1.0f - gate_pos; // negative branch gate
 
         // Positive branch: alpha_p * v^2 + beta * v
         float y_pos = alpha_p * x * x + beta * x;
 
         // Negative branch:
         float min_v_eps = fminf(x, eps); // works fine even if eps < 0
         float y_neg = (expm1f(min_v_eps) - x) * alpha_n + beta * x;
-        out = y_pos * gate_pos + y_neg * gate_neg;
 
-        return out;
-    };
+        // Select the appropriate branch based on the gate
+        return gate_pos * y_pos + (1.0f - gate_pos) * y_neg;
+    }
+};
 
-    ggml_cuda_op_unary_gated(ctx, dst, op);
+void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    // Get the XIELU parameters from the operation
+    const float * op_params = (const float *)dst->op_params;
+    float alpha_n = op_params[0];
+    float alpha_p = op_params[1];
+    const float beta = op_params[2];
+    const float eps  = op_params[3];
+
+    op_xielu_functor op(alpha_n, alpha_p, beta, eps);
+    ggml_cuda_op_unary(ctx, dst, op);
 }
 
 // swiglu_oai
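
For reference, a minimal sketch (not from this commit) of one way a shared launcher can consume named __device__ ops like op_silu: binding the op as a non-type template parameter of the kernel. Unlike the extended __device__ lambdas removed above, named functions work with this scheme and do not require nvcc's --extended-lambda flag; parameterized ops like op_xielu_functor are instead passed to the kernel by value. The kernel name unary_op_kernel_sketch and the src_d/dst_d/k/stream identifiers are illustrative, not ggml API.

// Sketch only: a unary kernel templated on a float(float) __device__ function.
template <float (*op)(float)>
static __global__ void unary_op_kernel_sketch(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = op(x[i]); // op is resolved and inlined at compile time
}

// Hypothetical launch over k contiguous floats on the backend stream:
//   const int num_blocks = (k + 255) / 256;
//   unary_op_kernel_sketch<op_silu><<<num_blocks, 256, 0, stream>>>(src_d, dst_d, k);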