@@ -100,15 +100,32 @@ static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
100
100
}
101
101
102
102
103
- y [0 ] += alpha * (temp0 [0 ] + temp0 [1 ]+ temp0 [2 ] + temp0 [3 ]);
104
- y [1 ] += alpha * (temp1 [0 ] + temp1 [1 ]+ temp1 [2 ] + temp1 [3 ]);
105
- y [2 ] += alpha * (temp2 [0 ] + temp2 [1 ]+ temp2 [2 ] + temp2 [3 ]);
106
- y [3 ] += alpha * (temp3 [0 ] + temp3 [1 ]+ temp3 [2 ] + temp3 [3 ]);
107
-
108
- y [4 ] += alpha * (temp4 [0 ] + temp4 [1 ]+ temp4 [2 ] + temp4 [3 ]);
109
- y [5 ] += alpha * (temp5 [0 ] + temp5 [1 ]+ temp5 [2 ] + temp5 [3 ]);
110
- y [6 ] += alpha * (temp6 [0 ] + temp6 [1 ]+ temp6 [2 ] + temp6 [3 ]);
111
- y [7 ] += alpha * (temp7 [0 ] + temp7 [1 ]+ temp7 [2 ] + temp7 [3 ]);
103
+ register __vector float t0 , t1 , t2 , t3 ;
104
+ register __vector float a = { alpha , alpha , alpha , alpha };
105
+ __vector float * v_y = (__vector float * ) y ;
106
+
107
+ t0 = vec_mergeh (temp0 , temp2 );
108
+ t1 = vec_mergel (temp0 , temp2 );
109
+ t2 = vec_mergeh (temp1 , temp3 );
110
+ t3 = vec_mergel (temp1 , temp3 );
111
+ temp0 = vec_mergeh (t0 , t2 );
112
+ temp1 = vec_mergel (t0 , t2 );
113
+ temp2 = vec_mergeh (t1 , t3 );
114
+ temp3 = vec_mergel (t1 , t3 );
115
+ temp0 += temp1 + temp2 + temp3 ;
116
+
117
+ t0 = vec_mergeh (temp4 , temp6 );
118
+ t1 = vec_mergel (temp4 , temp6 );
119
+ t2 = vec_mergeh (temp5 , temp7 );
120
+ t3 = vec_mergel (temp5 , temp7 );
121
+ temp4 = vec_mergeh (t0 , t2 );
122
+ temp5 = vec_mergel (t0 , t2 );
123
+ temp6 = vec_mergeh (t1 , t3 );
124
+ temp7 = vec_mergel (t1 , t3 );
125
+ temp4 += temp5 + temp6 + temp7 ;
126
+
127
+ v_y [0 ] += a * temp0 ;
128
+ v_y [1 ] += a * temp4 ;
112
129
113
130
}
114
131
@@ -137,10 +154,21 @@ static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
137
154
temp3 += v_x [i ] * va3 [i ] + v_x [i + 1 ] * va3 [i + 1 ];
138
155
}
139
156
140
- y [0 ] += alpha * (temp0 [0 ] + temp0 [1 ]+ temp0 [2 ] + temp0 [3 ]);
141
- y [1 ] += alpha * (temp1 [0 ] + temp1 [1 ]+ temp1 [2 ] + temp1 [3 ]);
142
- y [2 ] += alpha * (temp2 [0 ] + temp2 [1 ]+ temp2 [2 ] + temp2 [3 ]);
143
- y [3 ] += alpha * (temp3 [0 ] + temp3 [1 ]+ temp3 [2 ] + temp3 [3 ]);
157
+ register __vector float t0 , t1 , t2 , t3 ;
158
+ register __vector float a = { alpha , alpha , alpha , alpha };
159
+ __vector float * v_y = (__vector float * ) y ;
160
+
161
+ t0 = vec_mergeh (temp0 , temp2 );
162
+ t1 = vec_mergel (temp0 , temp2 );
163
+ t2 = vec_mergeh (temp1 , temp3 );
164
+ t3 = vec_mergel (temp1 , temp3 );
165
+ temp0 = vec_mergeh (t0 , t2 );
166
+ temp1 = vec_mergel (t0 , t2 );
167
+ temp2 = vec_mergeh (t1 , t3 );
168
+ temp3 = vec_mergel (t1 , t3 );
169
+ temp0 += temp1 + temp2 + temp3 ;
170
+
171
+ v_y [0 ] += a * temp0 ;
144
172
145
173
}
146
174
0 commit comments