@@ -353,7 +353,144 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
353353
354354#if defined(__VXE__ ) || defined(__VXE2__ )
355355#include <vecintrin.h>
356- #include <ggml-cpu/arch/s390/typedef.h>
356+
// Operator shorthands spelled like the vector intrinsic names. Each one
// expands to a plain C operator applied to its fully parenthesized
// arguments, and every argument is evaluated exactly once.
//
// The parameter list must follow the macro name with no space in between;
// otherwise the preprocessor defines an object-like macro instead.
#define vec_neg(a)    (-(a))                // Vector Negate
#define vec_add(a, b) ((a) + (b))           // Vector Add
#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
#define vec_div(a, b) ((a) / (b))           // Vector Divide
#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
// Both right shifts expand to the same `>>` expression, so the macro name
// alone does not select between an arithmetic and a logical shift.
#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
// vec_slb / vec_srb are not defined in this block (presumably provided by
// <vecintrin.h>).
// NOTE(review): `(b) << 64` relies on the type of `b` being wider than
// 64 bits per element -- confirm against the callers.
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
367+
// Bitwise fallbacks. Each definition is guarded so that a macro of the same
// name that is already visible at this point takes precedence.
// The parameter list must follow the macro name with no space in between;
// otherwise the preprocessor defines an object-like macro instead.
#ifndef vec_and
#define vec_and(a, b) ((a) & (b)) // Vector AND
#endif

#ifndef vec_or
#define vec_or(a, b)  ((a) | (b)) // Vector OR
#endif

#ifndef vec_xor
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
#endif
379+
// 16-byte vector types built with the compiler's vector_size extension.
// The name encodes <element type><element bits>x<lane count>.

// Plain C character element types.
typedef signed char        __attribute__((vector_size(16))) char8x16_t;
typedef unsigned char      __attribute__((vector_size(16))) uchar8x16_t;

// Signed fixed-width integer lanes.
typedef int8_t             __attribute__((vector_size(16))) int8x16_t;
typedef int16_t            __attribute__((vector_size(16))) int16x8_t;
typedef int32_t            __attribute__((vector_size(16))) int32x4_t;

// Unsigned fixed-width integer lanes.
typedef uint8_t            __attribute__((vector_size(16))) uint8x16_t;
typedef uint16_t           __attribute__((vector_size(16))) uint16x8_t;
typedef uint32_t           __attribute__((vector_size(16))) uint32x4_t;

// Floating-point lanes.
typedef float              __attribute__((vector_size(16))) float32x4_t;
typedef double             __attribute__((vector_size(16))) double64x2_t;

// 64-bit integer lanes.
typedef signed long long   __attribute__((vector_size(16))) long64x2_t;
typedef unsigned long long __attribute__((vector_size(16))) ulong64x2_t;
396+
397+ typedef struct ggml_uint8x16x2_t {
398+ uint8x16_t val [2 ];
399+ } ggml_uint8x16x2_t ;
400+
401+ inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2 (const uint8_t * ptr ) {
402+ ggml_uint8x16x2_t res ;
403+
404+ res .val [0 ] = vec_xl ( 0 , ptr );
405+ res .val [1 ] = vec_xl (16 , ptr );
406+
407+ return res ;
408+ }
409+
410+ typedef struct ggml_uint8x16x4_t {
411+ uint8x16_t val [4 ];
412+ } ggml_uint8x16x4_t ;
413+
414+ inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4 (const uint8_t * ptr ) {
415+ ggml_uint8x16x4_t res ;
416+
417+ res .val [0 ] = vec_xl ( 0 , ptr );
418+ res .val [1 ] = vec_xl (16 , ptr );
419+ res .val [2 ] = vec_xl (32 , ptr );
420+ res .val [3 ] = vec_xl (48 , ptr );
421+
422+ return res ;
423+ }
424+
425+ typedef struct ggml_int8x16x4_t {
426+ int8x16_t val [4 ];
427+ } ggml_int8x16x4_t ;
428+
429+ inline static ggml_int8x16x4_t ggml_vec_xl_s8x4 (const int8_t * ptr ) {
430+ ggml_int8x16x4_t res ;
431+
432+ res .val [0 ] = vec_xl ( 0 , ptr );
433+ res .val [1 ] = vec_xl (16 , ptr );
434+ res .val [2 ] = vec_xl (32 , ptr );
435+ res .val [3 ] = vec_xl (48 , ptr );
436+
437+ return res ;
438+ }
439+
440+ typedef struct ggml_int16x8x2_t {
441+ int16x8_t val [2 ];
442+ } ggml_int16x8x2_t ;
443+
444+ inline static ggml_int16x8x2_t ggml_vec_xl_s16x2 (const int16_t * ptr ) {
445+ ggml_int16x8x2_t res ;
446+
447+ res .val [0 ] = vec_xl ( 0 , ptr );
448+ res .val [1 ] = vec_xl (16 , ptr );
449+
450+ return res ;
451+ }
452+
453+ /*
454+ ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
455+ ! or iq4_nl for example implementation.
456+ */
457+ inline static int8x16_t ggml_vec_tbl (int8x16_t a , uint8x16_t b ) {
458+ int8x16_t res ;
459+
460+ res [ 0 ] = a [b [ 0 ]];
461+ res [ 1 ] = a [b [ 1 ]];
462+ res [ 2 ] = a [b [ 2 ]];
463+ res [ 3 ] = a [b [ 3 ]];
464+ res [ 4 ] = a [b [ 4 ]];
465+ res [ 5 ] = a [b [ 5 ]];
466+ res [ 6 ] = a [b [ 6 ]];
467+ res [ 7 ] = a [b [ 7 ]];
468+ res [ 8 ] = a [b [ 8 ]];
469+ res [ 9 ] = a [b [ 9 ]];
470+ res [10 ] = a [b [10 ]];
471+ res [11 ] = a [b [11 ]];
472+ res [12 ] = a [b [12 ]];
473+ res [13 ] = a [b [13 ]];
474+ res [14 ] = a [b [14 ]];
475+ res [15 ] = a [b [15 ]];
476+
477+ return res ;
478+ }
479+
480+ inline static int16x8_t vec_padd_s16 (int16x8_t a , int16x8_t b ) {
481+ const uchar8x16_t v_maske = { 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 ,
482+ 16 , 17 , 20 , 21 , 24 , 25 , 28 , 29 };
483+
484+ const int16x8_t v_abo = vec_pack ((int32x4_t )a , (int32x4_t )b );
485+ const int16x8_t v_abe = vec_perm (a , b , v_maske );
486+ return v_abo + v_abe ;
487+ }
488+
489+ inline static int32x4_t ggml_vec_dot (int32x4_t acc , int8x16_t a , int8x16_t b ) {
490+ const int16x8_t p = vec_mule (a , b ) + vec_mulo (a , b );
491+ return acc + (vec_unpackh (p ) + vec_unpackl (p ));
492+ }
493+
357494#endif
358495
359496#if defined(__loongarch_asx )
0 commit comments