Skip to content

Commit dd71df8

Browse files
authored
Merge pull request #4880 from ChipKerchner/betterPowerGEMVTail
[POWER] Vectorize SGEMV transpose reduce stage
2 parents a8d6b02 + a0aeba6 commit dd71df8

File tree

2 files changed: 82 additions and 26 deletions.

kernel/power/sgemv_t.c

Lines changed: 41 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -79,15 +79,32 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
7979
}
8080

8181

82-
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
83-
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
84-
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
85-
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
86-
87-
y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
88-
y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
89-
y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
90-
y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
82+
register __vector float t0, t1, t2, t3;
83+
register __vector float a = { alpha, alpha, alpha, alpha };
84+
__vector float *v_y = (__vector float*) y;
85+
86+
t0 = vec_mergeh(temp0, temp2);
87+
t1 = vec_mergel(temp0, temp2);
88+
t2 = vec_mergeh(temp1, temp3);
89+
t3 = vec_mergel(temp1, temp3);
90+
temp0 = vec_mergeh(t0, t2);
91+
temp1 = vec_mergel(t0, t2);
92+
temp2 = vec_mergeh(t1, t3);
93+
temp3 = vec_mergel(t1, t3);
94+
temp0 += temp1 + temp2 + temp3;
95+
96+
t0 = vec_mergeh(temp4, temp6);
97+
t1 = vec_mergel(temp4, temp6);
98+
t2 = vec_mergeh(temp5, temp7);
99+
t3 = vec_mergel(temp5, temp7);
100+
temp4 = vec_mergeh(t0, t2);
101+
temp5 = vec_mergel(t0, t2);
102+
temp6 = vec_mergeh(t1, t3);
103+
temp7 = vec_mergel(t1, t3);
104+
temp4 += temp5 + temp6 + temp7;
105+
106+
v_y[0] += a * temp0;
107+
v_y[1] += a * temp4;
91108

92109
}
93110

@@ -116,10 +133,21 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
116133
temp3 += v_x[i] * va3[i];
117134
}
118135

119-
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
120-
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
121-
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
122-
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
136+
register __vector float t0, t1, t2, t3;
137+
register __vector float a = { alpha, alpha, alpha, alpha };
138+
__vector float *v_y = (__vector float*) y;
139+
140+
t0 = vec_mergeh(temp0, temp2);
141+
t1 = vec_mergel(temp0, temp2);
142+
t2 = vec_mergeh(temp1, temp3);
143+
t3 = vec_mergel(temp1, temp3);
144+
temp0 = vec_mergeh(t0, t2);
145+
temp1 = vec_mergel(t0, t2);
146+
temp2 = vec_mergeh(t1, t3);
147+
temp3 = vec_mergel(t1, t3);
148+
temp0 += temp1 + temp2 + temp3;
149+
150+
v_y[0] += a * temp0;
123151

124152
}
125153

kernel/power/sgemv_t_8.c

Lines changed: 41 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -100,15 +100,32 @@ static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
100100
}
101101

102102

103-
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
104-
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
105-
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
106-
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
107-
108-
y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
109-
y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
110-
y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
111-
y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
103+
register __vector float t0, t1, t2, t3;
104+
register __vector float a = { alpha, alpha, alpha, alpha };
105+
__vector float *v_y = (__vector float*) y;
106+
107+
t0 = vec_mergeh(temp0, temp2);
108+
t1 = vec_mergel(temp0, temp2);
109+
t2 = vec_mergeh(temp1, temp3);
110+
t3 = vec_mergel(temp1, temp3);
111+
temp0 = vec_mergeh(t0, t2);
112+
temp1 = vec_mergel(t0, t2);
113+
temp2 = vec_mergeh(t1, t3);
114+
temp3 = vec_mergel(t1, t3);
115+
temp0 += temp1 + temp2 + temp3;
116+
117+
t0 = vec_mergeh(temp4, temp6);
118+
t1 = vec_mergel(temp4, temp6);
119+
t2 = vec_mergeh(temp5, temp7);
120+
t3 = vec_mergel(temp5, temp7);
121+
temp4 = vec_mergeh(t0, t2);
122+
temp5 = vec_mergel(t0, t2);
123+
temp6 = vec_mergeh(t1, t3);
124+
temp7 = vec_mergel(t1, t3);
125+
temp4 += temp5 + temp6 + temp7;
126+
127+
v_y[0] += a * temp0;
128+
v_y[1] += a * temp4;
112129

113130
}
114131

@@ -137,10 +154,21 @@ static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
137154
temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1];
138155
}
139156

140-
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
141-
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
142-
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
143-
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
157+
register __vector float t0, t1, t2, t3;
158+
register __vector float a = { alpha, alpha, alpha, alpha };
159+
__vector float *v_y = (__vector float*) y;
160+
161+
t0 = vec_mergeh(temp0, temp2);
162+
t1 = vec_mergel(temp0, temp2);
163+
t2 = vec_mergeh(temp1, temp3);
164+
t3 = vec_mergel(temp1, temp3);
165+
temp0 = vec_mergeh(t0, t2);
166+
temp1 = vec_mergel(t0, t2);
167+
temp2 = vec_mergeh(t1, t3);
168+
temp3 = vec_mergel(t1, t3);
169+
temp0 += temp1 + temp2 + temp3;
170+
171+
v_y[0] += a * temp0;
144172

145173
}
146174

0 commit comments

Comments
 (0)