Skip to content

Commit 1606a3c

Browse files
committed
Revert stupid LLM refactorings
1 parent 1b18472 commit 1606a3c

File tree

1 file changed

+155
-123
lines changed

1 file changed

+155
-123
lines changed

ggml/src/ggml-cuda/unary.cu

Lines changed: 155 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -38,148 +38,168 @@ void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst, cons
3838
}
3939
}
4040

41+
// Elementwise absolute value: |x|.
static __device__ __forceinline__ float op_abs(float x) {
    return fabsf(x);
}

void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_abs);
}
48+
49+
// Sign function: -1 for negative, +1 for positive, 0 for zero.
static __device__ __forceinline__ float op_sgn(float x) {
    return (x > 0.f ? 1.f : ((x < 0.f ? -1.f : 0.f)));
}

void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_sgn);
}
56+
57+
// Elementwise negation: -x.
static __device__ __forceinline__ float op_neg(float x) {
    return -x;
}

void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_neg);
}
64+
65+
// Heaviside step: 1.0f when x > 0, else 0.0f (bool implicitly converts to float).
static __device__ __forceinline__ float op_step(float x) {
    return x > 0.0f;
}

void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_step);
}
72+
73+
// GELU activation, tanh approximation:
// 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))).
static __device__ __forceinline__ float op_gelu(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_gelu);
}
82+
83+
// Exact GELU via the error function: 0.5*x*(1 + erf(x/sqrt(2))).
static __device__ __forceinline__ float op_gelu_erf(float x) {
    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
}

void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_gelu_erf);
}
91+
92+
// "Quick" GELU: sigmoid approximation x * sigmoid(1.702 * x).
static __device__ __forceinline__ float op_gelu_quick(float x) {
    const float GELU_QUICK_COEF = -1.702f;
    return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
}

void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_gelu_quick);
}
100+
101+
// SiLU (swish): x * sigmoid(x).
static __device__ __forceinline__ float op_silu(float x) {
    return x / (1.0f + expf(-x));
}

void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_silu);
}
108+
109+
// Hyperbolic tangent.
static __device__ __forceinline__ float op_tanh(float x) {
    return tanhf(x);
}

void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_tanh);
}
116+
117+
// ReLU: max(x, 0).
static __device__ __forceinline__ float op_relu(float x) {
    return fmaxf(x, 0.0f);
}

void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_relu);
}
124+
125+
// Logistic sigmoid: 1 / (1 + exp(-x)).
static __device__ __forceinline__ float op_sigmoid(float x) {
    return 1.0f / (1.0f + expf(-x));
}

void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_sigmoid);
}
132+
133+
// Hard sigmoid: clamp((x + 3) / 6, 0, 1).
static __device__ __forceinline__ float op_hardsigmoid(float x) {
    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}

void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_hardsigmoid);
}
140+
141+
// Hard swish: x * hardsigmoid(x).
static __device__ __forceinline__ float op_hardswish(float x) {
    return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}

void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_hardswish);
}
148+
149+
// Elementwise exponential: e^x.
static __device__ __forceinline__ float op_exp(float x) {
    return expf(x);
}

void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_exp);
}
156+
157+
// Elementwise square: x^2.
static __device__ __forceinline__ float op_sqr(float x) {
    return x * x;
}

void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_sqr);
}
164+
165+
// Elementwise square root.
static __device__ __forceinline__ float op_sqrt(float x) {
    return sqrtf(x);
}

void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_sqrt);
}
172+
173+
// Elementwise sine.
static __device__ __forceinline__ float op_sin(float x) {
    return sinf(x);
}

void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_sin);
}
180+
181+
// Elementwise cosine.
static __device__ __forceinline__ float op_cos(float x) {
    return cosf(x);
}

void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_cos);
}
188+
189+
// Elementwise natural logarithm.
static __device__ __forceinline__ float op_log(float x) {
    return logf(x);
}

void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_log);
}
196+
197+
// ELU (alpha = 1): x for x > 0, exp(x) - 1 otherwise (expm1f for accuracy near 0).
static __device__ __forceinline__ float op_elu(float x) {
    return (x > 0.f) ? x : expm1f(x);
}

void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary(ctx, dst, op_elu);
}
184204
/* gated ops */
185205

@@ -258,70 +278,82 @@ void ggml_cuda_op_unary_gated(ggml_backend_cuda_context & ctx, ggml_tensor * dst
258278
}
259279
}
260280

281+
// ReGLU activation half: ReLU applied to the gate input of the gated op.
static __device__ __forceinline__ float op_reglu(float x) {
    return fmaxf(x, 0.0f);
}

void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated(ctx, dst, op_reglu);
}
288+
289+
// GEGLU activation half: GELU (tanh approximation) applied in the gated op.
static __device__ __forceinline__ float op_geglu(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated(ctx, dst, op_geglu);
}
298+
299+
// SwiGLU activation half: SiLU applied in the gated op.
static __device__ __forceinline__ float op_swiglu(float x) {
    return x / (1.0f + expf(-x));
}

void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated(ctx, dst, op_swiglu);
}
306+
307+
// GEGLU-erf activation half: exact GELU (via erf) applied in the gated op.
static __device__ __forceinline__ float op_geglu_erf(float x) {
    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
}

void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated(ctx, dst, op_geglu_erf);
}
291315

292-
// GEGLU-quick activation half: sigmoid-approximated GELU applied in the gated op.
static __device__ __forceinline__ float op_geglu_quick(float x) {
    const float GELU_QUICK_COEF = -1.702f;
    return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
}

void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated(ctx, dst, op_geglu_quick);
}
307324

308-
const auto op = [alpha_n, alpha_p, beta, eps] __device__ (float x) -> float {
309-
float out;
325+
// Functor for XIELU operation with parameters
326+
struct op_xielu_functor {
327+
float alpha_n, alpha_p, beta, eps;
328+
329+
__host__ __device__ __forceinline__ op_xielu_functor(float a_n, float a_p, float b, float e)
330+
: alpha_n(a_n), alpha_p(a_p), beta(b), eps(e) {}
331+
332+
__device__ __forceinline__ float operator()(float x) const {
310333
float gate_pos = (x > 0.0f); // positive branch gate
311-
float gate_neg = 1.0f - gate_pos; // negative branch gate
312334

313335
// Positive branch: alpha_p * v^2 + beta * v
314336
float y_pos = alpha_p * x * x + beta * x;
315337

316338
// Negative branch:
317339
float min_v_eps = fminf(x, eps); // works fine even if eps < 0
318340
float y_neg = (expm1f(min_v_eps) - x) * alpha_n + beta * x;
319-
out = y_pos * gate_pos + y_neg * gate_neg;
320341

321-
return out;
322-
};
342+
// Select the appropriate branch based on the gate
343+
return gate_pos * y_pos + (1.0f - gate_pos) * y_neg;
344+
}
345+
};
323346

324-
ggml_cuda_op_unary_gated(ctx, dst, op);
347+
void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
348+
// Get the XIELU parameters from the operation
349+
const float * op_params = (const float*)dst->op_params;
350+
float alpha_n = op_params[0];
351+
float alpha_p = op_params[1];
352+
const float beta = op_params[2];
353+
const float eps = op_params[3];
354+
355+
op_xielu_functor op(alpha_n, alpha_p, beta, eps);
356+
ggml_cuda_op_unary(ctx, dst, op);
325357
}
326358

327359
// swiglu_oai

0 commit comments

Comments
 (0)