Skip to content

Commit 2359c7c

Browse files
authored
Use .p2align instead of .align for portability
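The OSX assembler apparently treats the argument of a decimal .align as a power-of-two exponent rather than a byte count (so ".align 16" requests far more padding than the intended 16 bytes), leading to a significant loss of performance, as observed in #730, #901 and most recently #1470. The operand of .p2align is always a power-of-two exponent, so it means the same thing on every platform.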
1 parent e3a80e6 commit 2359c7c

17 files changed

+31
-31
lines changed

kernel/x86_64/caxpy_microk_haswell-2.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,11 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
5050
"vmulps (%5), %%ymm0 , %%ymm0 \n\t"
5151
#endif
5252

53-
".align 16 \n\t"
53+
".p2align 4 \n\t"
5454
"1: \n\t"
5555

5656
"vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x
57-
".align 2 \n\t"
57+
".p2align 1 \n\t"
5858
"vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x
5959
"vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x
6060
"vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x
@@ -70,7 +70,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
7070
"vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part
7171

7272
"vfmadd213ps (%3,%0,4), %%ymm0 , %%ymm5 \n\t"
73-
".align 2 \n\t"
73+
".p2align 1 \n\t"
7474
"vfmadd213ps 32(%3,%0,4), %%ymm0 , %%ymm7 \n\t"
7575
"vfmadd213ps 64(%3,%0,4), %%ymm0 , %%ymm9 \n\t"
7676
"vfmadd213ps 96(%3,%0,4), %%ymm0 , %%ymm11 \n\t"
@@ -96,7 +96,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
9696
"vfmadd231ps %%ymm1 , %%ymm10, %%ymm15 \n\t"
9797

9898
"vmovups %%ymm5 , (%3,%0,4) \n\t"
99-
".align 2 \n\t"
99+
".p2align 1 \n\t"
100100
"vmovups %%ymm7 , 32(%3,%0,4) \n\t"
101101
"vmovups %%ymm9 , 64(%3,%0,4) \n\t"
102102
"vmovups %%ymm11, 96(%3,%0,4) \n\t"

kernel/x86_64/cdot_microk_haswell-2.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
4646
"vxorps %%ymm6, %%ymm6, %%ymm6 \n\t"
4747
"vxorps %%ymm7, %%ymm7, %%ymm7 \n\t"
4848

49-
".align 16 \n\t"
49+
".p2align 4 \n\t"
5050
"1: \n\t"
5151
"vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x
5252
"vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x

kernel/x86_64/cscal_microk_haswell-2.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
5454
"subq $16, %0 \n\t"
5555
"jz 2f \n\t"
5656

57-
".align 16 \n\t"
57+
".p2align 4 \n\t"
5858
"1: \n\t"
5959

6060
//"prefetcht0 128(%1) \n\t"
@@ -156,7 +156,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
156156
"subq $16, %0 \n\t"
157157
"jz 2f \n\t"
158158

159-
".align 16 \n\t"
159+
".p2align 4 \n\t"
160160
"1: \n\t"
161161

162162
//"prefetcht0 128(%1) \n\t"
@@ -245,7 +245,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
245245
"subq $16, %0 \n\t"
246246
"jz 2f \n\t"
247247

248-
".align 16 \n\t"
248+
".p2align 4 \n\t"
249249
"1: \n\t"
250250

251251
//"prefetcht0 128(%1) \n\t"
@@ -312,7 +312,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
312312

313313
"addq $128, %1 \n\t"
314314

315-
".align 16 \n\t"
315+
".p2align 4 \n\t"
316316
"1: \n\t"
317317

318318
//"prefetcht0 128(%1) \n\t"

kernel/x86_64/daxpy_microk_haswell-2.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
3838
(
3939
"vbroadcastsd (%4), %%ymm0 \n\t" // alpha
4040

41-
".align 16 \n\t"
41+
".p2align 4 \n\t"
4242
"1: \n\t"
4343

4444
"vmovups (%3,%0,8), %%ymm12 \n\t" // 4 * y

kernel/x86_64/ddot_microk_haswell-2.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
4141
"vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
4242
"vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"
4343

44-
".align 16 \n\t"
44+
".p2align 4 \n\t"
4545
"1: \n\t"
4646
"vmovups (%2,%0,8), %%ymm12 \n\t" // 2 * x
4747
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 2 * x

kernel/x86_64/dscal_microk_haswell-2.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
5858
"subq $1 , %0 \n\t"
5959
"jz 2f \n\t"
6060

61-
".align 16 \n\t"
61+
".p2align 4 \n\t"
6262
"1: \n\t"
6363
// "prefetcht0 640(%1) \n\t"
6464

@@ -156,7 +156,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
156156
"cmpq $0, %0 \n\t"
157157
"je 2f \n\t"
158158

159-
".align 16 \n\t"
159+
".p2align 4 \n\t"
160160
"1: \n\t"
161161

162162
"vmovups %%xmm0 ,-128(%1) \n\t"

kernel/x86_64/dsymv_L_microk_haswell-2.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
4444
"vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[1]
4545
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1]
4646

47-
".align 16 \n\t"
47+
".p2align 4 \n\t"
4848
"1: \n\t"
4949

5050
"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y

kernel/x86_64/dsymv_U_microk_haswell-2.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
4646
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[1]
4747
"xorq %0,%0 \n\t"
4848

49-
".align 16 \n\t"
49+
".p2align 4 \n\t"
5050
"1: \n\t"
5151

5252
"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y

kernel/x86_64/dtrmm_kernel_4x8_haswell.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA
2424
" cmp $0, %1 \n\t"
2525
" jz 2f \n\t"
2626

27-
" .align 16 \n\t"
27+
" .p2align 4 \n\t"
2828
"1: \n\t"
2929
" vmovups (%2,%0,4) , %%ymm0 \n\t"
3030
" vmovups (%3,%0,8) , %%ymm1 \n\t"

kernel/x86_64/dtrsm_kernel_RN_haswell.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
128128
" cmpq %1, %0 \n\t"
129129
" je 21f \n\t"
130130

131-
" .align 16 \n\t"
131+
" .p2align 4 \n\t"
132132
"1: \n\t"
133133

134134
" vmovups (%2,%1,4), %%ymm4 \n\t" // read a

0 commit comments

Comments
 (0)