@@ -59,6 +59,15 @@ struct ggml_compute_params {
5959#endif
6060#endif
6161
// When building for s390x with the compiler's vector support enabled
// (__VEC__), make sure both __VXE__ and __VXE2__ are defined. Existing
// definitions are left untouched.
#if defined(__s390x__) && defined(__VEC__)
#ifndef __VXE__
#define __VXE__
#endif
#ifndef __VXE2__
#define __VXE2__
#endif
#endif
70+
6271#if defined(__ARM_FEATURE_SVE )
6372#include <arm_sve.h>
6473#include <sys/prctl.h>
@@ -359,6 +368,148 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
359368#endif
360369#endif
361370
371+ #if defined(__VXE__ ) || defined(__VXE2__ )
372+ #include <vecintrin.h>
373+
// Operator-based shorthands for element-wise vector operations. Each macro
// expands to the plain C operator applied to its (parenthesized) arguments.
#define vec_neg(a)    (-(a))                // Vector Negate
#define vec_add(a, b) ((a) + (b))           // Vector Add
#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
#define vec_div(a, b) ((a) / (b))           // Vector Divide
#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
// vec_sra and vec_sr expand to the same `>>` expression; whether the shift is
// arithmetic or logical is therefore decided by the signedness of the operand's
// element type, not by which macro is used.
#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
// NOTE(review): `(b) << 64` shifts by the full width of a 64-bit operand;
// confirm the operand type callers pass and that this yields the intended
// shift-count vector for vec_slb / vec_srb.
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
384+
// Bitwise helpers. Each one is only defined when no macro of the same name
// already exists, so an earlier definition takes precedence.
#ifndef vec_and
#define vec_and(a, b) ((a) & (b)) // Vector AND
#endif

#ifndef vec_or
#define vec_or(a, b)  ((a) | (b)) // Vector OR
#endif

#ifndef vec_xor
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
#endif
396+
// 16-byte vector types. The name encodes element type, element width in bits
// and lane count (e.g. int16x8_t = 8 lanes of int16_t).
typedef signed char   char8x16_t  __attribute__((vector_size(16)));
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));

typedef int8_t  int8x16_t __attribute__((vector_size(16)));
typedef int16_t int16x8_t __attribute__((vector_size(16)));
typedef int32_t int32x4_t __attribute__((vector_size(16)));

typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));

typedef float  float32x4_t  __attribute__((vector_size(16)));
typedef double double64x2_t __attribute__((vector_size(16)));

typedef signed long long   long64x2_t  __attribute__((vector_size(16)));
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
413+
414+ typedef struct ggml_uint8x16x2_t {
415+ uint8x16_t val [2 ];
416+ } ggml_uint8x16x2_t ;
417+
418+ inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2 (const uint8_t * ptr ) {
419+ ggml_uint8x16x2_t res ;
420+
421+ res .val [0 ] = vec_xl ( 0 , ptr );
422+ res .val [1 ] = vec_xl (16 , ptr );
423+
424+ return res ;
425+ }
426+
427+ typedef struct ggml_uint8x16x4_t {
428+ uint8x16_t val [4 ];
429+ } ggml_uint8x16x4_t ;
430+
431+ inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4 (const uint8_t * ptr ) {
432+ ggml_uint8x16x4_t res ;
433+
434+ res .val [0 ] = vec_xl ( 0 , ptr );
435+ res .val [1 ] = vec_xl (16 , ptr );
436+ res .val [2 ] = vec_xl (32 , ptr );
437+ res .val [3 ] = vec_xl (48 , ptr );
438+
439+ return res ;
440+ }
441+
442+ typedef struct ggml_int8x16x4_t {
443+ int8x16_t val [4 ];
444+ } ggml_int8x16x4_t ;
445+
446+ inline static ggml_int8x16x4_t ggml_vec_xl_s8x4 (const int8_t * ptr ) {
447+ ggml_int8x16x4_t res ;
448+
449+ res .val [0 ] = vec_xl ( 0 , ptr );
450+ res .val [1 ] = vec_xl (16 , ptr );
451+ res .val [2 ] = vec_xl (32 , ptr );
452+ res .val [3 ] = vec_xl (48 , ptr );
453+
454+ return res ;
455+ }
456+
457+ typedef struct ggml_int16x8x2_t {
458+ int16x8_t val [2 ];
459+ } ggml_int16x8x2_t ;
460+
461+ inline static ggml_int16x8x2_t ggml_vec_xl_s16x2 (const int16_t * ptr ) {
462+ ggml_int16x8x2_t res ;
463+
464+ res .val [0 ] = vec_xl ( 0 , ptr );
465+ res .val [1 ] = vec_xl (16 , ptr );
466+
467+ return res ;
468+ }
469+
470+ /*
471+ ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
472+ ! or iq4_nl for example implementation.
473+ */
474+ inline static int8x16_t ggml_vec_tbl (int8x16_t a , uint8x16_t b ) {
475+ int8x16_t res ;
476+
477+ res [ 0 ] = a [b [ 0 ]];
478+ res [ 1 ] = a [b [ 1 ]];
479+ res [ 2 ] = a [b [ 2 ]];
480+ res [ 3 ] = a [b [ 3 ]];
481+ res [ 4 ] = a [b [ 4 ]];
482+ res [ 5 ] = a [b [ 5 ]];
483+ res [ 6 ] = a [b [ 6 ]];
484+ res [ 7 ] = a [b [ 7 ]];
485+ res [ 8 ] = a [b [ 8 ]];
486+ res [ 9 ] = a [b [ 9 ]];
487+ res [10 ] = a [b [10 ]];
488+ res [11 ] = a [b [11 ]];
489+ res [12 ] = a [b [12 ]];
490+ res [13 ] = a [b [13 ]];
491+ res [14 ] = a [b [14 ]];
492+ res [15 ] = a [b [15 ]];
493+
494+ return res ;
495+ }
496+
497+ inline static int16x8_t vec_padd_s16 (int16x8_t a , int16x8_t b ) {
498+ const uchar8x16_t v_maske = { 0 , 1 , 4 , 5 , 8 , 9 , 12 , 13 ,
499+ 16 , 17 , 20 , 21 , 24 , 25 , 28 , 29 };
500+
501+ const int16x8_t v_abo = vec_pack ((int32x4_t )a , (int32x4_t )b );
502+ const int16x8_t v_abe = vec_perm (a , b , v_maske );
503+ return v_abo + v_abe ;
504+ }
505+
506+ inline static int32x4_t ggml_vec_dot (int32x4_t acc , int8x16_t a , int8x16_t b ) {
507+ const int16x8_t p = vec_mule (a , b ) + vec_mulo (a , b );
508+ return acc + (vec_unpackh (p ) + vec_unpackl (p ));
509+ }
510+
511+ #endif
512+
362513#if defined(__loongarch_asx )
363514/* float type data load instructions */
364515static __m128 __lsx_vreplfr2vr_s (const float val ) {
0 commit comments