Skip to content

Commit 7b0b7c1

Browse files
authored
Merge pull request #2190 from martin-frbg/zdot-zen
Replace vpermpd with vpermilpd in the Haswell/Zen zdot microkernel
2 parents d14cf1c + 28e9645 commit 7b0b7c1

File tree

1 file changed

+16 / -8 lines changed

1 file changed

+16 / -8 lines changed

kernel/x86_64/zdot_microk_haswell-2.c

Lines changed: 16 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
6666

6767
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
6868
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
69-
"vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
70-
"vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
69+
"vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t"
70+
"vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
71+
// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
72+
// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
7173

7274
"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
7375
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
74-
"vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
75-
"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
76+
"vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t"
77+
"vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
78+
// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
79+
// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
7680

7781
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
7882
"addq $16 , %0 \n\t"
@@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
151155

152156
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
153157
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
154-
"vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
155-
"vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
158+
"vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t"
159+
"vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
160+
// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
161+
// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
156162

157163
"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
158164
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
159-
"vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
160-
"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
165+
"vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t"
166+
"vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
167+
// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
168+
// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
161169

162170
"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
163171
"addq $16 , %0 \n\t"

0 commit comments

Comments (0)