Skip to content

Commit 4e65e11

Browse files
Fix q8_0 test in test-quantize-fns
vec_signed uses an unexpected rounding mode. Explicitly use a different rounding function.
1 parent 4534eef commit 4e65e11

File tree

1 file changed

+4
-2
lines changed

1 file changed

+4
-2
lines changed

ggml/src/ggml-cpu/arch/s390/quants.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
7575

7676
for (int j = 0; j < 8; j++) {
7777
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
78-
const int32x4_t vi = vec_signed(v);
78+
/* vec_signed truncates toward zero and vec_round rounds ties to even; round to nearest with ties away from zero first (VFISB rounding mode 1) */
79+
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 0, 1));
7980

8081
y[i].qs[4*j + 0] = vec_extract(vi, 0);
8182
y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -122,7 +123,8 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
122123

123124
for (int j = 0; j < 8; j++) {
124125
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
125-
const int32x4_t vi = vec_signed(v);
126+
/* vec_signed truncates toward zero and vec_round rounds ties to even; round to nearest with ties away from zero first (VFISB rounding mode 1) */
127+
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 0, 1));
126128

127129
y[i].qs[4*j + 0] = vec_extract(vi, 0);
128130
y[i].qs[4*j + 1] = vec_extract(vi, 1);

0 commit comments

Comments
 (0)