Skip to content

Commit 040672e

Browse files
authored
Merge pull request #1098 from martin-frbg/amodra-power8
Power8 inline assembly fixes
2 parents c8ce9e4 + 9e2f316 commit 040672e

38 files changed

+3314
-3640
lines changed

kernel/power/casum.c

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5353

5454
#ifndef HAVE_KERNEL_16
5555

56-
static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
56+
static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
5757
{
5858

5959
BLASLONG i=0;
@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
9292

9393
}
9494

95-
svec[0] = sum0+sum1+sum2+sum3;
96-
svec[1] = 0.0;
97-
svec[2] = 0.0;
98-
svec[3] = 0.0;
99-
95+
return sum0+sum1+sum2+sum3;
10096
}
10197

10298
#endif
@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
106102
BLASLONG i=0;
107103
BLASLONG ip=0;
108104
FLOAT sumf = 0.0;
109-
FLOAT svec[4] __attribute__ ((aligned (16)));;
110105
BLASLONG n1;
111106
BLASLONG inc_x2;
112107

@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
119114
if ( n1 > 0 )
120115
{
121116

122-
casum_kernel_16(n1, x, svec);
123-
sumf = svec[0] + svec[1]+svec[2]+svec[3];
117+
sumf = casum_kernel_16(n1, x);
124118
i=n1;
125119
ip = 2 * n1;
126120
}

kernel/power/casum_microk_power8.c

Lines changed: 140 additions & 139 deletions
Original file line number | Diff line number | Diff line change
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3434
**************************************************************************************/
3535

3636
#define HAVE_KERNEL_16 1
37-
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
3837

39-
static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
38+
static float casum_kernel_16 (long n, float *x)
4039
{
41-
42-
43-
BLASLONG i = n;
44-
BLASLONG o16 = 16;
45-
BLASLONG o32 = 32;
46-
BLASLONG o48 = 48;
47-
BLASLONG o64 = 64;
48-
BLASLONG o80 = 80;
49-
BLASLONG o96 = 96;
50-
BLASLONG o112 = 112;
51-
FLOAT *x1=x;
52-
BLASLONG pre = 384;
53-
54-
__asm__ __volatile__
55-
(
56-
57-
"dcbt %2 , %4 \n\t"
58-
59-
"xxlxor 32,32,32 \n\t"
60-
"xxlxor 33,33,33 \n\t"
61-
"xxlxor 34,34,34 \n\t"
62-
"xxlxor 35,35,35 \n\t"
63-
"xxlxor 36,36,36 \n\t"
64-
"xxlxor 37,37,37 \n\t"
65-
"xxlxor 38,38,38 \n\t"
66-
"xxlxor 39,39,39 \n\t"
67-
68-
"lxvw4x 40, 0, %2 \n\t"
69-
"lxvw4x 41, %5, %2 \n\t"
70-
"lxvw4x 42, %6, %2 \n\t"
71-
"lxvw4x 43, %7, %2 \n\t"
72-
"lxvw4x 44, %8, %2 \n\t"
73-
"lxvw4x 45, %9, %2 \n\t"
74-
"lxvw4x 46, %10, %2 \n\t"
75-
"lxvw4x 47, %11, %2 \n\t"
76-
77-
"addi %2, %2, 128 \n\t"
78-
79-
"addic. %0 , %0 , -16 \n\t"
80-
"ble 2f \n\t"
81-
82-
".align 5 \n\t"
83-
"1: \n\t"
84-
85-
"dcbt %2 , %4 \n\t"
86-
87-
"xvabssp 48, 40 \n\t"
88-
"xvabssp 49, 41 \n\t"
89-
"xvabssp 50, 42 \n\t"
90-
"xvabssp 51, 43 \n\t"
91-
92-
"lxvw4x 40, 0, %2 \n\t"
93-
"lxvw4x 41, %5, %2 \n\t"
94-
95-
"xvabssp 52, 44 \n\t"
96-
"xvabssp 53, 45 \n\t"
97-
98-
"lxvw4x 42, %6, %2 \n\t"
99-
"lxvw4x 43, %7, %2 \n\t"
100-
101-
"xvabssp 54, 46 \n\t"
102-
"xvabssp 55, 47 \n\t"
103-
104-
"lxvw4x 44, %8, %2 \n\t"
105-
"lxvw4x 45, %9, %2 \n\t"
106-
107-
"xvaddsp 32, 32, 48 \n\t"
108-
"xvaddsp 33, 33, 49 \n\t"
109-
110-
"lxvw4x 46, %10, %2 \n\t"
111-
"lxvw4x 47, %11, %2 \n\t"
112-
113-
"xvaddsp 34, 34, 50 \n\t"
114-
"xvaddsp 35, 35, 51 \n\t"
115-
"addi %2, %2, 128 \n\t"
116-
"xvaddsp 36, 36, 52 \n\t"
117-
"xvaddsp 37, 37, 53 \n\t"
118-
"addic. %0 , %0 , -16 \n\t"
119-
"xvaddsp 38, 38, 54 \n\t"
120-
"xvaddsp 39, 39, 55 \n\t"
121-
122-
"bgt 1b \n\t"
123-
124-
"2: \n\t"
125-
126-
127-
"xvabssp 48, 40 \n\t"
128-
"xvabssp 49, 41 \n\t"
129-
"xvabssp 50, 42 \n\t"
130-
"xvabssp 51, 43 \n\t"
131-
"xvabssp 52, 44 \n\t"
132-
"xvabssp 53, 45 \n\t"
133-
"xvabssp 54, 46 \n\t"
134-
"xvabssp 55, 47 \n\t"
135-
136-
"xvaddsp 32, 32, 48 \n\t"
137-
"xvaddsp 33, 33, 49 \n\t"
138-
"xvaddsp 34, 34, 50 \n\t"
139-
"xvaddsp 35, 35, 51 \n\t"
140-
"xvaddsp 36, 36, 52 \n\t"
141-
"xvaddsp 37, 37, 53 \n\t"
142-
"xvaddsp 38, 38, 54 \n\t"
143-
"xvaddsp 39, 39, 55 \n\t"
144-
145-
"xvaddsp 32, 32, 33 \n\t"
146-
"xvaddsp 34, 34, 35 \n\t"
147-
"xvaddsp 36, 36, 37 \n\t"
148-
"xvaddsp 38, 38, 39 \n\t"
149-
150-
"xvaddsp 32, 32, 34 \n\t"
151-
"xvaddsp 36, 36, 38 \n\t"
152-
153-
"xvaddsp 32, 32, 36 \n\t"
154-
155-
156-
"stxvw4x 32, 0, %3 \n\t"
157-
158-
:
159-
:
160-
"r" (i), // 0
161-
"r" (n), // 1
162-
"r" (x1), // 2
163-
"r" (svec), // 3
164-
"r" (pre), // 4
165-
"r" (o16), // 5
166-
"r" (o32), // 6
167-
"r" (o48), // 7
168-
"r" (o64), // 8
169-
"r" (o80), // 9
170-
"r" (o96), // 10
171-
"r" (o112) // 11
172-
: "cr0", "%0", "%2", "memory"
173-
);
174-
175-
}
176-
177-
40+
float sum;
41+
__vector float t0;
42+
__vector float t1;
43+
__vector float t2;
44+
__vector float t3;
45+
46+
__asm__
47+
(
48+
"dcbt 0, %2 \n\t"
49+
50+
"xxlxor 32, 32, 32 \n\t"
51+
"xxlxor 33, 33, 33 \n\t"
52+
"xxlxor 34, 34, 34 \n\t"
53+
"xxlxor 35, 35, 35 \n\t"
54+
"xxlxor 36, 36, 36 \n\t"
55+
"xxlxor 37, 37, 37 \n\t"
56+
"xxlxor 38, 38, 38 \n\t"
57+
"xxlxor 39, 39, 39 \n\t"
58+
59+
"lxvw4x 40, 0, %2 \n\t"
60+
"lxvw4x 41, %8, %2 \n\t"
61+
"lxvw4x 42, %9, %2 \n\t"
62+
"lxvw4x 43, %10, %2 \n\t"
63+
"lxvw4x 44, %11, %2 \n\t"
64+
"lxvw4x 45, %12, %2 \n\t"
65+
"lxvw4x 46, %13, %2 \n\t"
66+
"lxvw4x 47, %14, %2 \n\t"
67+
68+
"addi %2, %2, 128 \n\t"
69+
70+
"addic. %1, %1, -16 \n\t"
71+
"ble 2f \n\t"
72+
73+
".p2align 5 \n"
74+
"1: \n\t"
75+
76+
"xvabssp 48, 40 \n\t"
77+
"xvabssp 49, 41 \n\t"
78+
"xvabssp 50, 42 \n\t"
79+
"xvabssp 51, 43 \n\t"
80+
81+
"lxvw4x 40, 0, %2 \n\t"
82+
"lxvw4x 41, %8, %2 \n\t"
83+
84+
"xvabssp %x3, 44 \n\t"
85+
"xvabssp %x4, 45 \n\t"
86+
87+
"lxvw4x 42, %9, %2 \n\t"
88+
"lxvw4x 43, %10, %2 \n\t"
89+
90+
"xvabssp %x5, 46 \n\t"
91+
"xvabssp %x6, 47 \n\t"
92+
93+
"lxvw4x 44, %11, %2 \n\t"
94+
"lxvw4x 45, %12, %2 \n\t"
95+
96+
"xvaddsp 32, 32, 48 \n\t"
97+
"xvaddsp 33, 33, 49 \n\t"
98+
99+
"lxvw4x 46, %13, %2 \n\t"
100+
"lxvw4x 47, %14, %2 \n\t"
101+
102+
"xvaddsp 34, 34, 50 \n\t"
103+
"xvaddsp 35, 35, 51 \n\t"
104+
"addi %2, %2, 128 \n\t"
105+
"xvaddsp 36, 36, %x3 \n\t"
106+
"xvaddsp 37, 37, %x4 \n\t"
107+
"addic. %1, %1, -16 \n\t"
108+
"xvaddsp 38, 38, %x5 \n\t"
109+
"xvaddsp 39, 39, %x6 \n\t"
110+
111+
"bgt 1b \n"
112+
113+
"2: \n\t"
114+
115+
"xvabssp 48, 40 \n\t"
116+
"xvabssp 49, 41 \n\t"
117+
"xvabssp 50, 42 \n\t"
118+
"xvabssp 51, 43 \n\t"
119+
"xvabssp %x3, 44 \n\t"
120+
"xvabssp %x4, 45 \n\t"
121+
"xvabssp %x5, 46 \n\t"
122+
"xvabssp %x6, 47 \n\t"
123+
124+
"xvaddsp 32, 32, 48 \n\t"
125+
"xvaddsp 33, 33, 49 \n\t"
126+
"xvaddsp 34, 34, 50 \n\t"
127+
"xvaddsp 35, 35, 51 \n\t"
128+
"xvaddsp 36, 36, %x3 \n\t"
129+
"xvaddsp 37, 37, %x4 \n\t"
130+
"xvaddsp 38, 38, %x5 \n\t"
131+
"xvaddsp 39, 39, %x6 \n\t"
132+
133+
"xvaddsp 32, 32, 33 \n\t"
134+
"xvaddsp 34, 34, 35 \n\t"
135+
"xvaddsp 36, 36, 37 \n\t"
136+
"xvaddsp 38, 38, 39 \n\t"
137+
138+
"xvaddsp 32, 32, 34 \n\t"
139+
"xvaddsp 36, 36, 38 \n\t"
140+
141+
"xvaddsp 32, 32, 36 \n\t"
142+
143+
"xxsldwi 33, 32, 32, 2 \n\t"
144+
"xvaddsp 32, 32, 33 \n\t"
145+
146+
"xxsldwi 33, 32, 32, 1 \n\t"
147+
"xvaddsp 32, 32, 33 \n\t"
148+
149+
"xscvspdp %0, 32 \n"
150+
151+
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
152+
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
153+
:
154+
"=f" (sum), // 0
155+
"+r" (n), // 1
156+
"+b" (x), // 2
157+
"=wa" (t0), // 3
158+
"=wa" (t1), // 4
159+
"=wa" (t2), // 5
160+
"=wa" (t3) // 6
161+
:
162+
"m" (*x),
163+
"b" (16), // 8
164+
"b" (32), // 9
165+
"b" (48), // 10
166+
"b" (64), // 11
167+
"b" (80), // 12
168+
"b" (96), // 13
169+
"b" (112) // 14
170+
:
171+
"cr0",
172+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
173+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
174+
"vs48","vs49","vs50","vs51"
175+
);
176+
177+
return sum;
178+
}

0 commit comments

Comments
 (0)