
Commit a5d6e65

ggml : remove SVE paths
1 parent d9e0e7c commit a5d6e65


ggml/src/ggml-cpu/vec.h

Lines changed: 7 additions & 331 deletions
@@ -119,111 +119,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
 
 #if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-
-        const int sve_register_length = svcntb() * 8;
-        const int ggml_f16_epr = sve_register_length / 16; // running when 16
-        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
-
-        const int np = (n & ~(ggml_f16_step - 1));
-
-        svfloat16_t sum_00 = svdup_n_f16(0.0f);
-        svfloat16_t sum_01 = svdup_n_f16(0.0f);
-        svfloat16_t sum_02 = svdup_n_f16(0.0f);
-        svfloat16_t sum_03 = svdup_n_f16(0.0f);
-
-        svfloat16_t sum_10 = svdup_n_f16(0.0f);
-        svfloat16_t sum_11 = svdup_n_f16(0.0f);
-        svfloat16_t sum_12 = svdup_n_f16(0.0f);
-        svfloat16_t sum_13 = svdup_n_f16(0.0f);
-
-        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
-        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
-
-        for (int i = 0; i < np; i += ggml_f16_step) {
-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
-
-            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
-            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
-            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
-            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
-
-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
-
-            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
-            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
-            ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
-            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
-
-            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
-
-            ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
-            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
-            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
-
-            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
-
-            ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
-            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
-            ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
-            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
-
-            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
-
-            ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
-
-            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
-            ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
-            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
-
-            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
-
-            ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
-
-            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
-            ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
-            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
-
-            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
-
-            ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
-
-            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
-            ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
-            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
-
-            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
-
-            ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
-
-            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
-            ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
-            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
-        }
-
-        const int np2 = (n & ~(ggml_f16_epr - 1));
-        for (int k = np; k < np2; k += ggml_f16_epr) {
-            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
-
-            svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
-            sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
-            rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
-            sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
-        }
-
-        if (np2 < n) {
-            svbool_t pg = svwhilelt_b16(np2, n);
-            svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
-            svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
-
-            sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
-            sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
-        }
-        GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
-        GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
-    #elif defined(__riscv_v_intrinsic)
+    #if defined(__riscv_v_intrinsic)
         // todo: RVV impl
         for (int i = 0; i < n; ++i) {
             for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
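Both the removed SVE block and the surviving scalar/RVV fallback compute the same thing: GGML_VEC_DOT_UNROLL dot products, one per row x[j], against the shared vector y (the SVE block hard-codes two rows, accumulated into sumf[0] and sumf[1]). A minimal scalar sketch of that contract, with plain float standing in for ggml_fp16_t so it stays self-contained; the names below are illustrative, not ggml's:

    // Reference semantics only: sumf[j] = dot(x[j], y) for each unrolled row.
    #define VEC_DOT_UNROLL 2   // mirrors the two accumulator sets in the removed block

    static void vec_dot_unroll_ref(int n, float *sumf, float **x, const float *y) {
        for (int j = 0; j < VEC_DOT_UNROLL; ++j) {
            sumf[j] = 0.0f;
        }
        for (int i = 0; i < n; ++i) {
            for (int j = 0; j < VEC_DOT_UNROLL; ++j) {
                sumf[j] += x[j][i] * y[i];
            }
        }
    }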
@@ -277,86 +173,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 
 inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-
-        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
-        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
-        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
-        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-        const int np = (n & ~(ggml_f32_step - 1));
-        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
-        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
-        for (int i = 0; i < np; i += ggml_f32_step) {
-
-            ax1 = GGML_F32_VEC_LOAD(x + i);
-            ay1 = GGML_F32_VEC_LOAD(y + i);
-            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
-
-            GGML_F32_VEC_STORE(y + i, ay1);
-
-            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
-            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
-            ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
-
-            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
-
-            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
-            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
-            ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
-
-            GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
-
-            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
-            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
-            ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
-
-            GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
-
-            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
-            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
-            ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
-
-            GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
-
-            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
-            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
-            ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
-
-            GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
-
-            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
-            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
-            ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
-
-            GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
-
-            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
-            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
-            ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
-
-            GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
-        }
-        // leftovers
-        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
-        const int np2 = (n & ~(ggml_f32_epr - 1));
-        for (int i = np; i < np2; i += ggml_f32_epr) {
-            ax1 = GGML_F32_VEC_LOAD(x + i);
-            ay1 = GGML_F32_VEC_LOAD(y + i);
-            ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
-
-            GGML_F32_VEC_STORE(y + i, ay1);
-        }
-        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
-        if (np2 < n) {
-            svbool_t pg =svwhilelt_b32(np2, n);
-            ax1 = svld1_f32(pg, x + np2);
-            ay1 = svld1_f32(pg, y + np2);
-            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
-
-            svst1_f32(pg, y + np2, ay1);
-        }
-    #elif defined(__riscv_v_intrinsic)
+    #if defined(__riscv_v_intrinsic)
         for (int i = 0, avl; i < n; i += avl) {
             avl = __riscv_vsetvl_e32m8(n - i);
             vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
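The block removed above is an 8-register unrolled form of y[i] += x[i]*v, finished off by a predicated SVE tail. A stripped-down, single-vector sketch of that same SVE idiom, assuming arm_sve.h and an SVE target; this is illustrative only, not the code the commit keeps:

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>

    // y[i] += x[i] * v: whole vectors first, then a predicated remainder.
    static void vec_mad_f32_sve_sketch(int n, float *y, const float *x, float v) {
        const svfloat32_t vx  = svdup_n_f32(v);
        const int         epr = (int) svcntw();           // f32 lanes per SVE register
        int i = 0;
        for (; i + epr <= n; i += epr) {
            svfloat32_t ax = svld1_f32(svptrue_b32(), x + i);
            svfloat32_t ay = svld1_f32(svptrue_b32(), y + i);
            ay = svmla_f32_x(svptrue_b32(), ay, ax, vx);   // ay + ax*vx
            svst1_f32(svptrue_b32(), y + i, ay);
        }
        if (i < n) {
            svbool_t pg = svwhilelt_b32(i, n);             // mask only the leftover lanes
            svfloat32_t ax = svld1_f32(pg, x + i);
            svfloat32_t ay = svld1_f32(pg, y + i);
            ay = svmad_f32_m(pg, ax, vx, ay);              // ax*vx + ay
            svst1_f32(pg, y + i, ay);
        }
    }
    #endif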
@@ -397,84 +214,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        const int sve_register_length = svcntb() * 8;
-        const int ggml_f16_epr = sve_register_length / 16;
-        const int ggml_f16_step = 8 * ggml_f16_epr;
-
-        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
-
-        const int np= (n & ~(ggml_f16_step - 1));
-
-        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
-        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
-        for (int i = 0; i < np; i += ggml_f16_step) {
-            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
-
-            GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
-
-            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
-
-            GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
-
-            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
-            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
-            ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
-
-            GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
-
-            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
-            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
-            ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
-
-            GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
-
-            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
-            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
-            ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
-
-            GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
-
-            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
-            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
-            ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
-
-            GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
-
-            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
-            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
-            ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
-
-            GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
-
-            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
-            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
-            ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
-
-            GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
-        }
-        const int np2 = (n & ~(ggml_f16_epr - 1));
-        for (int k = np; k < np2; k += ggml_f16_epr) {
-            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
-            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
-            ry = GGML_F16x_VEC_FMA(ry, rx, vx);
-
-            GGML_F16x_VEC_STORE(y + k, ry, 0);
-        }
-
-        if (np2 < n) {
-            svbool_t pg = svwhilelt_b16(np2, n);
-            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
-            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
-            hy = svmad_f16_x(pg, hx, vx, hy);
-            svst1_f16(pg, (__fp16 *)(y + np2), hy);
-        }
-
-    #elif defined(__riscv_v_intrinsic)
+    #if defined(__riscv_v_intrinsic)
         // todo: RVV impl
         // scalar
         for (int i = 0; i < n; ++i) {
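The fp16 variant keeps the same axpy contract, but each element is widened to fp32 for the multiply-add and narrowed back to fp16 storage. A schematic scalar sketch; fp16_t, fp16_to_f32 and f32_to_fp16 are placeholders for ggml's half-precision type and conversion macros, not real identifiers from this header:

    typedef unsigned short fp16_t;        // storage-only half precision (placeholder)
    float  fp16_to_f32(fp16_t h);         // assumed conversion helpers
    fp16_t f32_to_fp16(float f);

    static void vec_mad_f16_ref(int n, fp16_t *y, const fp16_t *x, float v) {
        for (int i = 0; i < n; ++i) {
            // accumulate in fp32, then narrow the result back to fp16
            y[i] = f32_to_fp16(fp16_to_f32(y[i]) + fp16_to_f32(x[i]) * v);
        }
    }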
@@ -523,14 +263,7 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
     }
 
 #if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        // scalar Route to scalar implementation //TODO: Write SVE code
-        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-            for (int i = 0; i < n; ++i) {
-                y[i] += x[k][i]*v[k][0];
-            }
-        }
-    #elif defined(__riscv_v_intrinsic)
+    #if defined(__riscv_v_intrinsic)
         for (int i = 0, avl; i < n; i += avl) {
             avl = __riscv_vsetvl_e32m8(n - i);
             vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
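The branch that survives strip-mines the loop with the RVV intrinsics visible in the context lines: __riscv_vsetvl_e32m8 decides how many elements the current pass covers and the loop advances by that amount. A minimal sketch of the pattern for a single x/v pair, assuming <riscv_vector.h>; the fused multiply-accumulate chosen here (__riscv_vfmacc_vf_f32m8) is one standard way to write it, not necessarily the intrinsic the file uses, and the unrolled routine above iterates the same idea over GGML_VEC_MAD_UNROLL inputs:

    #if defined(__riscv_v_intrinsic)
    #include <riscv_vector.h>

    // y[i] += x[i] * v, avl elements per pass at LMUL = 8.
    static void vec_mad_f32_rvv_sketch(int n, float *y, const float *x, float v) {
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);                   // elements this pass
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl); // load x slice
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl); // load y slice
            ay = __riscv_vfmacc_vf_f32m8(ay, v, ax, avl);        // ay += v * ax
            __riscv_vse32_v_f32m8(&y[i], ay, avl);               // store back
        }
    }
    #endif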
@@ -586,12 +319,7 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
 #if defined(GGML_USE_ACCELERATE)
     vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
 #elif defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        // scalar ; TODO: Write SVE code
-        for (int i = 0; i < n; ++i) {
-            y[i] = x[i]*s + b;
-        }
-    #elif defined(__riscv_v_intrinsic)
+    #if defined(__riscv_v_intrinsic)
         for (int i = 0, avl; i < n; i += avl) {
             avl = __riscv_vsetvl_e32m8(n - i);
             vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
@@ -634,33 +362,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
     vDSP_vsmul(y, 1, &v, y, 1, n);
 #elif defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
-        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
-        const int ggml_f32_step = 2 * ggml_f32_epr;
-
-        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-        const int np = (n & ~(ggml_f32_step - 1));
-        svfloat32_t ay1;
-        svfloat32_t ay2;
-        for (int i = 0; i < np; i += ggml_f32_step) {
-            ay1 = GGML_F32_VEC_LOAD(y + i);
-            ay1 = GGML_F32_VEC_MUL(ay1, vx);
-            GGML_F32_VEC_STORE(y + i, ay1);
-
-            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
-            ay2 = GGML_F32_VEC_MUL(ay2, vx);
-            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
-        }
-        // leftovers
-        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
-        if (np < n) {
-            svbool_t pg = svwhilelt_b32(np, n);
-            ay1 = svld1_f32(pg, y + np);
-            ay1 = svmul_f32_m(pg, ay1, vx);
-            svst1_f32(pg, y + np, ay1);
-        }
-    #elif defined(__riscv_v_intrinsic)
+    #if defined(__riscv_v_intrinsic)
         for (int i = 0, avl; i < n; i += avl) {
             avl = __riscv_vsetvl_e32m8(n - i);
             vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
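All of the removed blocks size their main loop with np = (n & ~(step - 1)), which rounds n down to the largest multiple of step as long as step is a power of two (here it is a register lane count times 2 or 8). A tiny self-check of that bit trick, with an assumed example step of 32:

    #include <assert.h>

    int main(void) {
        const int step = 32;                    // e.g. 8 registers x 4 f32 lanes (SVE128)
        assert((100 & ~(step - 1)) == 96);      // largest multiple of 32 <= 100
        assert(( 64 & ~(step - 1)) == 64);      // exact multiples are unchanged
        assert(( 31 & ~(step - 1)) ==  0);      // anything below step rounds to 0
        return 0;
    }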
@@ -698,33 +400,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 
 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        const int sve_register_length = svcntb() * 8;
-        const int ggml_f16_epr = sve_register_length / 16;
-        const int ggml_f16_step = 2 * ggml_f16_epr;
-
-        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
-        const int np = (n & ~(ggml_f16_step - 1));
-        svfloat16_t ay1, ay2;
-
-        for (int i = 0; i < np; i += ggml_f16_step) {
-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_MUL(ay1, vx);
-            GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
-
-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_MUL(ay2, vx);
-            GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
-        }
-        // leftovers
-        // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
-        if (np < n) {
-            svbool_t pg = svwhilelt_b16(np, n);
-            svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
-            svfloat16_t out = svmul_f16_m(pg, hy, vx);
-            svst1_f16(pg, (__fp16 *)(y + np), out);
-        }
-    #elif defined(__riscv_v_intrinsic)
+    #if defined(__riscv_v_intrinsic)
         // todo: RVV impl
         // scalar
        for (int i = 0; i < n; ++i) {
