Skip to content

Commit 7b896be

Browse files
Add SIMD/openblas code, currently not exercised
1 parent 3215abd commit 7b896be

File tree

2 files changed

+143
-0
lines changed

2 files changed

+143
-0
lines changed

buildenv

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
export ZOPEN_BUILD_LINE="DEV"
12
export ZOPEN_STABLE_DEPS="zoslib make cmake"
23
export ZOPEN_DEV_URL="https://github.com/ggerganov/llama.cpp.git"
34
export ZOPEN_DEV_DEPS="zoslib make cmake"
45
export ZOPEN_DEV_TAG="master-9e232f0"
56
export ZOPEN_BUILD_LINE="DEV"
67
export ZOPEN_NAME="llamacpp-master"
8+
export ZOPEN_RUNTIME_DEPS="ncurses"
79

810
rm -f "llama"
911
ln -s "llama.cpp" "llama"
@@ -21,6 +23,12 @@ export ZOPEN_MAKE_MINIMAL=Y
2123
export ZOPEN_INSTALL="cmake"
2224
export ZOPEN_INSTALL_OPTS="--install ../build"
2325

26+
# Needs more testing
27+
if false; then
28+
export ZOPEN_EXTRA_CPPFLAGS="-DGGML_USE_OPENBLAS -I /home/itodoro/projects/openblas/openblas/include/openblas -mvx -mzvector -march=z15"
29+
export ZOPEN_EXTRA_LIBS="/home/itodoro/projects/openblas/openblas/libopenblas.x"
30+
fi
31+
2432
export ZOPEN_CHECK="skip"
2533

2634
zopen_check_results()

patches/simd.needsmoretestingpatch

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
diff --git a/ggml.c b/ggml.c
2+
index 44c43b4..fedd4ca 100644
3+
--- a/ggml.c
4+
+++ b/ggml.c
5+
@@ -9,7 +9,7 @@
6+
7+
#if defined(_MSC_VER) || defined(__MINGW32__)
8+
#include <malloc.h> // using malloc.h with MSC/MINGW
9+
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
10+
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(__MVS__)
11+
#include <alloca.h>
12+
#endif
13+
14+
@@ -197,10 +197,19 @@ typedef void * thread_ret_t;
15+
#else
16+
inline static void * ggml_aligned_malloc(size_t size) {
17+
void * aligned_memory = NULL;
18+
+#ifdef __MVS__
19+
+ if (size == 0)
20+
+ size = 1;
21+
+ aligned_memory = malloc(size);
22+
+ int result = 0;
23+
+ if (aligned_memory == NULL)
24+
+ result = errno;
25+
+#else
26+
#ifdef GGML_USE_METAL
27+
int result = posix_memalign(&aligned_memory, getpagesize(), size);
28+
#else
29+
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
30+
+#endif
31+
#endif
32+
if (result != 0) {
33+
// Handle allocation failure
34+
@@ -299,7 +308,7 @@ typedef double ggml_float;
35+
#if defined(_MSC_VER) || defined(__MINGW32__)
36+
#include <intrin.h>
37+
#else
38+
-#if !defined(__riscv)
39+
+#if !defined(__riscv) && !defined(__MVS__)
40+
#include <immintrin.h>
41+
#endif
42+
#endif
43+
@@ -583,7 +592,7 @@ int64_t ggml_cycles_per_ms(void) {
44+
#if defined(__cpp_lib_hardware_interference_size)
45+
#define CACHE_LINE_SIZE hardware_destructive_interference_size
46+
#else
47+
-#if defined(__POWER9_VECTOR__)
48+
+#if defined(__POWER9_VECTOR__) || defined(__MVS__)
49+
#define CACHE_LINE_SIZE 128
50+
#else
51+
#define CACHE_LINE_SIZE 64
52+
@@ -2051,10 +2060,11 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
53+
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
54+
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
55+
56+
-#elif defined(__POWER9_VECTOR__)
57+
+#elif defined(__POWER9_VECTOR__) || defined(__MVS__)
58+
59+
#define GGML_SIMD
60+
61+
+
62+
// F32 POWER9
63+
64+
#define GGML_F32_STEP 32
65+
@@ -2066,21 +2076,23 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
66+
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
67+
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
68+
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
69+
-#define GGML_F32x4_ADD vec_add
70+
-#define GGML_F32x4_MUL vec_mul
71+
+#define GGML_F32x4_ADD(a, b) a+b
72+
+//#define GGML_F32x4_MUL(a, b) vec_madd(a, b, (vector float){0.0, 0.0,0.0, 0.0})
73+
+#define GGML_F32x4_MUL(a, b) a*b
74+
+//#define GGML_F32x4_MUL(a, b) __builtin_s390_vfmasb( a, b, a)
75+
#define GGML_F32x4_REDUCE(res, x) \
76+
{ \
77+
int offset = GGML_F32_ARR >> 1; \
78+
for (int i = 0; i < offset; ++i) { \
79+
- x[i] = vec_add(x[i], x[offset+i]); \
80+
+ x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
81+
} \
82+
offset >>= 1; \
83+
for (int i = 0; i < offset; ++i) { \
84+
- x[i] = vec_add(x[i], x[offset+i]); \
85+
+ x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
86+
} \
87+
offset >>= 1; \
88+
for (int i = 0; i < offset; ++i) { \
89+
- x[i] = vec_add(x[i], x[offset+i]); \
90+
+ x[i] = GGML_F32x4_ADD(x[i], x[offset+i]); \
91+
} \
92+
res = vec_extract(x[0], 0) + \
93+
vec_extract(x[0], 1) + \
94+
@@ -2364,7 +2376,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
95+
float sumf = 0.0f;
96+
const int np = (n & ~(GGML_F32_STEP - 1));
97+
98+
- GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
99+
+ GGML_F32_VEC sum2[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
100+
101+
GGML_F32_VEC ax[GGML_F32_ARR];
102+
GGML_F32_VEC ay[GGML_F32_ARR];
103+
@@ -2374,12 +2386,12 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
104+
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
105+
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
106+
107+
- sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
108+
+ sum2[j] = GGML_F32_VEC_FMA(sum2[j], ax[j], ay[j]);
109+
}
110+
}
111+
112+
// reduce sum0..sum3 to sum0
113+
- GGML_F32_VEC_REDUCE(sumf, sum);
114+
+ GGML_F32_VEC_REDUCE(sumf, sum2);
115+
116+
// leftovers
117+
for (int i = np; i < n; ++i) {
118+
@@ -2399,7 +2411,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
119+
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
120+
ggml_float sumf = 0.0;
121+
122+
-#if defined(GGML_SIMD)
123+
+#if defined(GGML_SIMD) && !defined(__MVS__)
124+
const int np = (n & ~(GGML_F16_STEP - 1));
125+
126+
GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
127+
@@ -3437,7 +3449,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
128+
x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
129+
}
130+
131+
-#if defined(GGML_SIMD)
132+
+#if defined(GGML_SIMD) && !defined(__MVS__)
133+
const int np = (n & ~(GGML_F16_STEP - 1));
134+
135+
GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };

0 commit comments

Comments
 (0)