diff --git a/ggml.c b/ggml.c
index 44c43b4..fedd4ca 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9,7 +9,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__)
 #include <alloca.h>
 #endif
 
@@ -197,10 +197,19 @@ typedef void * thread_ret_t;
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
     void * aligned_memory = NULL;
+#ifdef __MVS__
+    if (size == 0)
+        size = 1;
+    aligned_memory = malloc(size);
+    int result = 0;
+    if (aligned_memory == NULL)
+        result = errno;
+#else
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+#endif
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -299,7 +308,7 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__MVS__)
 #include <immintrin.h>
 #endif
 #endif
@@ -583,7 +592,7 @@ int64_t ggml_cycles_per_ms(void) {
 #if defined(__cpp_lib_hardware_interference_size)
 #define CACHE_LINE_SIZE hardware_destructive_interference_size
 #else
-#if defined(__POWER9_VECTOR__)
+#if defined(__POWER9_VECTOR__) || defined(__MVS__)
 #define CACHE_LINE_SIZE 128
 #else
 #define CACHE_LINE_SIZE 64
@@ -2051,10 +2060,11 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
 
-#elif defined(__POWER9_VECTOR__)
+#elif defined(__POWER9_VECTOR__) || defined(__MVS__)
 
 #define GGML_SIMD
 
+
 // F32 POWER9
 
 #define GGML_F32_STEP 32
@@ -2066,21 +2076,23 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_LOAD(p) vec_xl(0, p)
 #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
 #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD vec_add
-#define GGML_F32x4_MUL vec_mul
+#define GGML_F32x4_ADD(a, b) a+b
+//#define GGML_F32x4_MUL(a, b) vec_madd(a, b, (vector float){0.0, 0.0,0.0, 0.0})
+#define GGML_F32x4_MUL(a, b) a*b
+//#define GGML_F32x4_MUL(a, b) __builtin_s390_vfmasb( a, b, a)
 #define GGML_F32x4_REDUCE(res, x) \
 { \
     int offset = GGML_F32_ARR >> 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vec_add(x[i], x[offset+i]); \
+        x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vec_add(x[i], x[offset+i]); \
+        x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vec_add(x[i], x[offset+i]); \
+        x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
     } \
     res = vec_extract(x[0], 0) + \
           vec_extract(x[0], 1) + \
@@ -2364,7 +2376,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
     float sumf = 0.0f;
     const int np = (n & ~(GGML_F32_STEP - 1));
 
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+    GGML_F32_VEC sum2[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
 
     GGML_F32_VEC ax[GGML_F32_ARR];
     GGML_F32_VEC ay[GGML_F32_ARR];
@@ -2374,12 +2386,12 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
             ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
             ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
 
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+            sum2[j] = GGML_F32_VEC_FMA(sum2[j], ax[j], ay[j]);
         }
     }
 
     // reduce sum0..sum3 to sum0
-    GGML_F32_VEC_REDUCE(sumf, sum);
+    GGML_F32_VEC_REDUCE(sumf, sum2);
 
     // leftovers
     for (int i = np; i < n; ++i) {
@@ -2399,7 +2411,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     ggml_float sumf = 0.0;
 
-#if defined(GGML_SIMD)
+#if defined(GGML_SIMD) && !defined(__MVS__)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
@@ -3437,7 +3449,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
         x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
     }
 
-#if defined(GGML_SIMD)
+#if defined(GGML_SIMD) && !defined(__MVS__)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
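For context, the allocation part of the port works as follows: z/OS (__MVS__) provides neither alloca.h nor posix_memalign, so the patch has ggml_aligned_malloc fall back to plain malloc and report errno on failure. Below is a minimal, self-contained sketch of that behaviour, not the actual ggml.c code: aligned_malloc_sketch, the hard-coded alignment of 16 (standing in for GGML_MEM_ALIGN), and the error message are assumptions made for illustration only.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static void * aligned_malloc_sketch(size_t size) {
    void * aligned_memory = NULL;
#ifdef __MVS__
    // z/OS: no posix_memalign, so fall back to plain malloc.
    // malloc(0) may legally return NULL, so request at least one byte.
    if (size == 0)
        size = 1;
    aligned_memory = malloc(size);
    int result = (aligned_memory == NULL) ? errno : 0;
#else
    // Other platforms: mirror the existing posix_memalign call,
    // with 16 standing in for GGML_MEM_ALIGN.
    int result = posix_memalign(&aligned_memory, 16, size);
#endif
    if (result != 0) {
        fprintf(stderr, "allocation of %zu bytes failed (error %d)\n", size, result);
        return NULL;
    }
    return aligned_memory;
}

int main(void) {
    float * buf = aligned_malloc_sketch(64 * sizeof(float));
    if (buf != NULL) {
        buf[0] = 1.0f;
        free(buf);
    }
    return 0;
}

Note that the __MVS__ branch gives up the explicit alignment guarantee and relies on whatever default alignment malloc provides on z/OS.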