diff --git a/ggml.c b/ggml.c
index 44c43b4..fedd4ca 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9,7 +9,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__)
 #include <alloca.h>
 #endif
 
@@ -197,10 +197,19 @@ typedef void * thread_ret_t;
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
     void * aligned_memory = NULL;
+#ifdef __MVS__
+    if (size == 0)
+        size = 1;
+    aligned_memory = malloc(size);
+    int result = 0;
+    if (aligned_memory == NULL)
+        result = errno;
+#else
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+#endif
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -299,7 +308,7 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__MVS__)
 #include <immintrin.h>
 #endif
 #endif
@@ -583,7 +592,7 @@ int64_t ggml_cycles_per_ms(void) {
 #if defined(__cpp_lib_hardware_interference_size)
 #define CACHE_LINE_SIZE hardware_destructive_interference_size
 #else
-#if defined(__POWER9_VECTOR__)
+#if defined(__POWER9_VECTOR__) || defined(__MVS__)
 #define CACHE_LINE_SIZE 128
 #else
 #define CACHE_LINE_SIZE 64
@@ -2051,10 +2060,11 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
 
-#elif defined(__POWER9_VECTOR__)
+#elif defined(__POWER9_VECTOR__) || defined(__MVS__)
 
 #define GGML_SIMD
 
+
 // F32 POWER9
 
 #define GGML_F32_STEP 32
@@ -2066,21 +2076,23 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_LOAD(p) vec_xl(0, p)
 #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
 #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD vec_add
-#define GGML_F32x4_MUL vec_mul
+#define GGML_F32x4_ADD(a, b) a+b
+//#define GGML_F32x4_MUL(a, b) vec_madd(a, b, (vector float){0.0, 0.0,0.0, 0.0})
+#define GGML_F32x4_MUL(a, b) a*b
+//#define GGML_F32x4_MUL(a, b) __builtin_s390_vfmasb( a, b, a)
 #define GGML_F32x4_REDUCE(res, x) \
 { \
     int offset = GGML_F32_ARR >> 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vec_add(x[i], x[offset+i]); \
+        x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vec_add(x[i], x[offset+i]); \
+        x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vec_add(x[i], x[offset+i]); \
+        x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
     } \
     res = vec_extract(x[0], 0) + \
           vec_extract(x[0], 1) + \
@@ -2364,7 +2376,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
 
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+    GGML_F32_VEC sum2[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
 
     GGML_F32_VEC ax[GGML_F32_ARR];
     GGML_F32_VEC ay[GGML_F32_ARR];
@@ -2374,12 +2386,12 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
             ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
             ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
 
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+            sum2[j] = GGML_F32_VEC_FMA(sum2[j], ax[j], ay[j]);
         }
     }
 
     // reduce sum0..sum3 to sum0
-    GGML_F32_VEC_REDUCE(sumf, sum);
+    GGML_F32_VEC_REDUCE(sumf, sum2);
 
     // leftovers
     for (int i = np; i < n; ++i) {
@@ -2399,7 +2411,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     ggml_float sumf = 0.0;
 
-#if defined(GGML_SIMD)
+#if defined(GGML_SIMD) && !defined(__MVS__)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
@@ -3437,7 +3449,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
         x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
     }
 
-#if defined(GGML_SIMD)
+#if defined(GGML_SIMD) && !defined(__MVS__)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
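For context, the allocation part of the port works as follows: z/OS (__MVS__) provides neither alloca.h nor posix_memalign, so the patch has ggml_aligned_malloc fall back to plain malloc and report errno on failure. Below is a minimal, self-contained sketch of that behaviour, not the actual ggml.c code: aligned_malloc_sketch, the hard-coded alignment of 16 (standing in for GGML_MEM_ALIGN), and the error message are assumptions made for illustration only.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static void * aligned_malloc_sketch(size_t size) {
    void * aligned_memory = NULL;
#ifdef __MVS__
    // z/OS: no posix_memalign, so fall back to plain malloc.
    // malloc(0) may legally return NULL, so request at least one byte.
    if (size == 0)
        size = 1;
    aligned_memory = malloc(size);
    int result = (aligned_memory == NULL) ? errno : 0;
#else
    // Other platforms: mirror the existing posix_memalign call,
    // with 16 standing in for GGML_MEM_ALIGN.
    int result = posix_memalign(&aligned_memory, 16, size);
#endif
    if (result != 0) {
        fprintf(stderr, "allocation of %zu bytes failed (error %d)\n", size, result);
        return NULL;
    }
    return aligned_memory;
}

int main(void) {
    float * buf = aligned_malloc_sketch(64 * sizeof(float));
    if (buf != NULL) {
        buf[0] = 1.0f;
        free(buf);
    }
    return 0;
}

Note that the __MVS__ branch gives up the explicit alignment guarantee and relies on whatever default alignment malloc provides on z/OS.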