Skip to content

Commit 9e2f316

Browse files
authored
Power8 inline assembly fixes
Quoting patch author amodra from #1078: Lots of issues here.
- The vsx regs weren't listed as clobbered.
- Poor choice of vsx regs, which along with the lack of clobbers led to trashing v0..v21 and fr14..fr23. Ideally you'd let gcc choose all temp vsx regs, but asms currently have a limit of 30 i/o parms.
- Other regs were clobbered unnecessarily, seemingly in an attempt to clobber inputs, with gcc-7 complaining about the clobber of r2. (Changed inputs should also be listed as outputs or as an i/o.)
- "r" constraint used instead of "b" for gprs used in insns where the r0 encoding means zero rather than r0.
- There were unused asm inputs too.
- All memory was clobbered rather than hooking up memory outputs with proper memory constraints, and that and the lack of proper memory input constraints meant the asms needed to be volatile and their containing function noinline.
- Some parameters were being passed unnecessarily via memory.
- When a copy of a …
1 parent e2489c9 commit 9e2f316

38 files changed

+3314
-3640
lines changed

kernel/power/casum.c

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5353

5454
#ifndef HAVE_KERNEL_16
5555

56-
static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
56+
static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
5757
{
5858

5959
BLASLONG i=0;
@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
9292

9393
}
9494

95-
svec[0] = sum0+sum1+sum2+sum3;
96-
svec[1] = 0.0;
97-
svec[2] = 0.0;
98-
svec[3] = 0.0;
99-
95+
return sum0+sum1+sum2+sum3;
10096
}
10197

10298
#endif
@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
106102
BLASLONG i=0;
107103
BLASLONG ip=0;
108104
FLOAT sumf = 0.0;
109-
FLOAT svec[4] __attribute__ ((aligned (16)));;
110105
BLASLONG n1;
111106
BLASLONG inc_x2;
112107

@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
119114
if ( n1 > 0 )
120115
{
121116

122-
casum_kernel_16(n1, x, svec);
123-
sumf = svec[0] + svec[1]+svec[2]+svec[3];
117+
sumf = casum_kernel_16(n1, x);
124118
i=n1;
125119
ip = 2 * n1;
126120
}

kernel/power/casum_microk_power8.c

Lines changed: 140 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3434
**************************************************************************************/
3535

3636
#define HAVE_KERNEL_16 1
37-
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
3837

39-
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
38+
static float casum_kernel_16 (long n, float *x)
4039
{
41-
42-
43-
BLASLONG i = n;
44-
BLASLONG o16 = 16;
45-
BLASLONG o32 = 32;
46-
BLASLONG o48 = 48;
47-
BLASLONG o64 = 64;
48-
BLASLONG o80 = 80;
49-
BLASLONG o96 = 96;
50-
BLASLONG o112 = 112;
51-
FLOAT *x1=x;
52-
BLASLONG pre = 384;
53-
54-
__asm__ __volatile__
55-
(
56-
57-
"dcbt %2 , %4 \n\t"
58-
59-
"xxlxor 32,32,32 \n\t"
60-
"xxlxor 33,33,33 \n\t"
61-
"xxlxor 34,34,34 \n\t"
62-
"xxlxor 35,35,35 \n\t"
63-
"xxlxor 36,36,36 \n\t"
64-
"xxlxor 37,37,37 \n\t"
65-
"xxlxor 38,38,38 \n\t"
66-
"xxlxor 39,39,39 \n\t"
67-
68-
"lxvw4x 40, 0, %2 \n\t"
69-
"lxvw4x 41, %5, %2 \n\t"
70-
"lxvw4x 42, %6, %2 \n\t"
71-
"lxvw4x 43, %7, %2 \n\t"
72-
"lxvw4x 44, %8, %2 \n\t"
73-
"lxvw4x 45, %9, %2 \n\t"
74-
"lxvw4x 46, %10, %2 \n\t"
75-
"lxvw4x 47, %11, %2 \n\t"
76-
77-
"addi %2, %2, 128 \n\t"
78-
79-
"addic. %0 , %0 , -16 \n\t"
80-
"ble 2f \n\t"
81-
82-
".align 5 \n\t"
83-
"1: \n\t"
84-
85-
"dcbt %2 , %4 \n\t"
86-
87-
"xvabssp 48, 40 \n\t"
88-
"xvabssp 49, 41 \n\t"
89-
"xvabssp 50, 42 \n\t"
90-
"xvabssp 51, 43 \n\t"
91-
92-
"lxvw4x 40, 0, %2 \n\t"
93-
"lxvw4x 41, %5, %2 \n\t"
94-
95-
"xvabssp 52, 44 \n\t"
96-
"xvabssp 53, 45 \n\t"
97-
98-
"lxvw4x 42, %6, %2 \n\t"
99-
"lxvw4x 43, %7, %2 \n\t"
100-
101-
"xvabssp 54, 46 \n\t"
102-
"xvabssp 55, 47 \n\t"
103-
104-
"lxvw4x 44, %8, %2 \n\t"
105-
"lxvw4x 45, %9, %2 \n\t"
106-
107-
"xvaddsp 32, 32, 48 \n\t"
108-
"xvaddsp 33, 33, 49 \n\t"
109-
110-
"lxvw4x 46, %10, %2 \n\t"
111-
"lxvw4x 47, %11, %2 \n\t"
112-
113-
"xvaddsp 34, 34, 50 \n\t"
114-
"xvaddsp 35, 35, 51 \n\t"
115-
"addi %2, %2, 128 \n\t"
116-
"xvaddsp 36, 36, 52 \n\t"
117-
"xvaddsp 37, 37, 53 \n\t"
118-
"addic. %0 , %0 , -16 \n\t"
119-
"xvaddsp 38, 38, 54 \n\t"
120-
"xvaddsp 39, 39, 55 \n\t"
121-
122-
"bgt 1b \n\t"
123-
124-
"2: \n\t"
125-
126-
127-
"xvabssp 48, 40 \n\t"
128-
"xvabssp 49, 41 \n\t"
129-
"xvabssp 50, 42 \n\t"
130-
"xvabssp 51, 43 \n\t"
131-
"xvabssp 52, 44 \n\t"
132-
"xvabssp 53, 45 \n\t"
133-
"xvabssp 54, 46 \n\t"
134-
"xvabssp 55, 47 \n\t"
135-
136-
"xvaddsp 32, 32, 48 \n\t"
137-
"xvaddsp 33, 33, 49 \n\t"
138-
"xvaddsp 34, 34, 50 \n\t"
139-
"xvaddsp 35, 35, 51 \n\t"
140-
"xvaddsp 36, 36, 52 \n\t"
141-
"xvaddsp 37, 37, 53 \n\t"
142-
"xvaddsp 38, 38, 54 \n\t"
143-
"xvaddsp 39, 39, 55 \n\t"
144-
145-
"xvaddsp 32, 32, 33 \n\t"
146-
"xvaddsp 34, 34, 35 \n\t"
147-
"xvaddsp 36, 36, 37 \n\t"
148-
"xvaddsp 38, 38, 39 \n\t"
149-
150-
"xvaddsp 32, 32, 34 \n\t"
151-
"xvaddsp 36, 36, 38 \n\t"
152-
153-
"xvaddsp 32, 32, 36 \n\t"
154-
155-
156-
"stxvw4x 32, 0, %3 \n\t"
157-
158-
:
159-
:
160-
"r" (i), // 0
161-
"r" (n), // 1
162-
"r" (x1), // 2
163-
"r" (svec), // 3
164-
"r" (pre), // 4
165-
"r" (o16), // 5
166-
"r" (o32), // 6
167-
"r" (o48), // 7
168-
"r" (o64), // 8
169-
"r" (o80), // 9
170-
"r" (o96), // 10
171-
"r" (o112) // 11
172-
: "cr0", "%0", "%2", "memory"
173-
);
174-
175-
}
176-
177-
40+
float sum;
41+
__vector float t0;
42+
__vector float t1;
43+
__vector float t2;
44+
__vector float t3;
45+
46+
__asm__
47+
(
48+
"dcbt 0, %2 \n\t"
49+
50+
"xxlxor 32, 32, 32 \n\t"
51+
"xxlxor 33, 33, 33 \n\t"
52+
"xxlxor 34, 34, 34 \n\t"
53+
"xxlxor 35, 35, 35 \n\t"
54+
"xxlxor 36, 36, 36 \n\t"
55+
"xxlxor 37, 37, 37 \n\t"
56+
"xxlxor 38, 38, 38 \n\t"
57+
"xxlxor 39, 39, 39 \n\t"
58+
59+
"lxvw4x 40, 0, %2 \n\t"
60+
"lxvw4x 41, %8, %2 \n\t"
61+
"lxvw4x 42, %9, %2 \n\t"
62+
"lxvw4x 43, %10, %2 \n\t"
63+
"lxvw4x 44, %11, %2 \n\t"
64+
"lxvw4x 45, %12, %2 \n\t"
65+
"lxvw4x 46, %13, %2 \n\t"
66+
"lxvw4x 47, %14, %2 \n\t"
67+
68+
"addi %2, %2, 128 \n\t"
69+
70+
"addic. %1, %1, -16 \n\t"
71+
"ble 2f \n\t"
72+
73+
".p2align 5 \n"
74+
"1: \n\t"
75+
76+
"xvabssp 48, 40 \n\t"
77+
"xvabssp 49, 41 \n\t"
78+
"xvabssp 50, 42 \n\t"
79+
"xvabssp 51, 43 \n\t"
80+
81+
"lxvw4x 40, 0, %2 \n\t"
82+
"lxvw4x 41, %8, %2 \n\t"
83+
84+
"xvabssp %x3, 44 \n\t"
85+
"xvabssp %x4, 45 \n\t"
86+
87+
"lxvw4x 42, %9, %2 \n\t"
88+
"lxvw4x 43, %10, %2 \n\t"
89+
90+
"xvabssp %x5, 46 \n\t"
91+
"xvabssp %x6, 47 \n\t"
92+
93+
"lxvw4x 44, %11, %2 \n\t"
94+
"lxvw4x 45, %12, %2 \n\t"
95+
96+
"xvaddsp 32, 32, 48 \n\t"
97+
"xvaddsp 33, 33, 49 \n\t"
98+
99+
"lxvw4x 46, %13, %2 \n\t"
100+
"lxvw4x 47, %14, %2 \n\t"
101+
102+
"xvaddsp 34, 34, 50 \n\t"
103+
"xvaddsp 35, 35, 51 \n\t"
104+
"addi %2, %2, 128 \n\t"
105+
"xvaddsp 36, 36, %x3 \n\t"
106+
"xvaddsp 37, 37, %x4 \n\t"
107+
"addic. %1, %1, -16 \n\t"
108+
"xvaddsp 38, 38, %x5 \n\t"
109+
"xvaddsp 39, 39, %x6 \n\t"
110+
111+
"bgt 1b \n"
112+
113+
"2: \n\t"
114+
115+
"xvabssp 48, 40 \n\t"
116+
"xvabssp 49, 41 \n\t"
117+
"xvabssp 50, 42 \n\t"
118+
"xvabssp 51, 43 \n\t"
119+
"xvabssp %x3, 44 \n\t"
120+
"xvabssp %x4, 45 \n\t"
121+
"xvabssp %x5, 46 \n\t"
122+
"xvabssp %x6, 47 \n\t"
123+
124+
"xvaddsp 32, 32, 48 \n\t"
125+
"xvaddsp 33, 33, 49 \n\t"
126+
"xvaddsp 34, 34, 50 \n\t"
127+
"xvaddsp 35, 35, 51 \n\t"
128+
"xvaddsp 36, 36, %x3 \n\t"
129+
"xvaddsp 37, 37, %x4 \n\t"
130+
"xvaddsp 38, 38, %x5 \n\t"
131+
"xvaddsp 39, 39, %x6 \n\t"
132+
133+
"xvaddsp 32, 32, 33 \n\t"
134+
"xvaddsp 34, 34, 35 \n\t"
135+
"xvaddsp 36, 36, 37 \n\t"
136+
"xvaddsp 38, 38, 39 \n\t"
137+
138+
"xvaddsp 32, 32, 34 \n\t"
139+
"xvaddsp 36, 36, 38 \n\t"
140+
141+
"xvaddsp 32, 32, 36 \n\t"
142+
143+
"xxsldwi 33, 32, 32, 2 \n\t"
144+
"xvaddsp 32, 32, 33 \n\t"
145+
146+
"xxsldwi 33, 32, 32, 1 \n\t"
147+
"xvaddsp 32, 32, 33 \n\t"
148+
149+
"xscvspdp %0, 32 \n"
150+
151+
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
152+
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
153+
:
154+
"=f" (sum), // 0
155+
"+r" (n), // 1
156+
"+b" (x), // 2
157+
"=wa" (t0), // 3
158+
"=wa" (t1), // 4
159+
"=wa" (t2), // 5
160+
"=wa" (t3) // 6
161+
:
162+
"m" (*x),
163+
"b" (16), // 8
164+
"b" (32), // 9
165+
"b" (48), // 10
166+
"b" (64), // 11
167+
"b" (80), // 12
168+
"b" (96), // 13
169+
"b" (112) // 14
170+
:
171+
"cr0",
172+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
173+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
174+
"vs48","vs49","vs50","vs51"
175+
);
176+
177+
return sum;
178+
}

0 commit comments

Comments
 (0)