Skip to content

Commit 627c654

Browse files
committed
Fixing issue with beta == 0 in UserGemm kernels
Related to 1af16a8
1 parent 7385f68 commit 627c654

File tree

3 files changed

+108
-108
lines changed

3 files changed

+108
-108
lines changed

src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp

Lines changed: 36 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -161,47 +161,47 @@ __kernel void sgemm_Col_NN_B0_MX096_NX096_KX16 (
161161
C+= gidy*96*ldc;
162162
C+= idy*ldc;
163163

164-
C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc];
165-
C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc];
166-
C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc];
167-
C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc];
168-
C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc];
169-
C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc];
164+
C[0 *ldc] = alpha*rC[0][0];
165+
C[16*ldc] = alpha*rC[0][1];
166+
C[32*ldc] = alpha*rC[0][2];
167+
C[48*ldc] = alpha*rC[0][3];
168+
C[64*ldc] = alpha*rC[0][4];
169+
C[80*ldc] = alpha*rC[0][5];
170170
C+=16;
171-
C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc];
172-
C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc];
173-
C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc];
174-
C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc];
175-
C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc];
176-
C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc];
171+
C[0 *ldc] = alpha*rC[1][0];
172+
C[16*ldc] = alpha*rC[1][1];
173+
C[32*ldc] = alpha*rC[1][2];
174+
C[48*ldc] = alpha*rC[1][3];
175+
C[64*ldc] = alpha*rC[1][4];
176+
C[80*ldc] = alpha*rC[1][5];
177177
C+=16;
178-
C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc];
179-
C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc];
180-
C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc];
181-
C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc];
182-
C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc];
183-
C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc];
178+
C[0 *ldc] = alpha*rC[2][0];
179+
C[16*ldc] = alpha*rC[2][1];
180+
C[32*ldc] = alpha*rC[2][2];
181+
C[48*ldc] = alpha*rC[2][3];
182+
C[64*ldc] = alpha*rC[2][4];
183+
C[80*ldc] = alpha*rC[2][5];
184184
C+=16;
185-
C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc];
186-
C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc];
187-
C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc];
188-
C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc];
189-
C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc];
190-
C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc];
185+
C[0 *ldc] = alpha*rC[3][0];
186+
C[16*ldc] = alpha*rC[3][1];
187+
C[32*ldc] = alpha*rC[3][2];
188+
C[48*ldc] = alpha*rC[3][3];
189+
C[64*ldc] = alpha*rC[3][4];
190+
C[80*ldc] = alpha*rC[3][5];
191191
C+=16;
192-
C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc];
193-
C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc];
194-
C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc];
195-
C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc];
196-
C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc];
197-
C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc];
192+
C[0 *ldc] = alpha*rC[4][0];
193+
C[16*ldc] = alpha*rC[4][1];
194+
C[32*ldc] = alpha*rC[4][2];
195+
C[48*ldc] = alpha*rC[4][3];
196+
C[64*ldc] = alpha*rC[4][4];
197+
C[80*ldc] = alpha*rC[4][5];
198198
C+=16;
199-
C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc];
200-
C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc];
201-
C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc];
202-
C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc];
203-
C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc];
204-
C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc];
199+
C[0 *ldc] = alpha*rC[5][0];
200+
C[16*ldc] = alpha*rC[5][1];
201+
C[32*ldc] = alpha*rC[5][2];
202+
C[48*ldc] = alpha*rC[5][3];
203+
C[64*ldc] = alpha*rC[5][4];
204+
C[80*ldc] = alpha*rC[5][5];
205205

206206
}
207207
);

src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp

Lines changed: 36 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -163,47 +163,47 @@ __kernel void sgemm_Col_NT_B0_MX096_NX096_KX16 (
163163
C+= gidy*96*ldc;
164164
C+= idy*ldc;
165165

166-
C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc];
167-
C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc];
168-
C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc];
169-
C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc];
170-
C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc];
171-
C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc];
166+
C[0*ldc] = alpha*rC[0][0];
167+
C[16*ldc] = alpha*rC[0][1];
168+
C[32*ldc] = alpha*rC[0][2];
169+
C[48*ldc] = alpha*rC[0][3];
170+
C[64*ldc] = alpha*rC[0][4];
171+
C[80*ldc] = alpha*rC[0][5];
172172
C+=16;
173-
C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc];
174-
C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc];
175-
C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc];
176-
C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc];
177-
C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc];
178-
C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc];
173+
C[0*ldc] = alpha*rC[1][0];
174+
C[16*ldc] = alpha*rC[1][1];
175+
C[32*ldc] = alpha*rC[1][2];
176+
C[48*ldc] = alpha*rC[1][3];
177+
C[64*ldc] = alpha*rC[1][4];
178+
C[80*ldc] = alpha*rC[1][5];
179179
C+=16;
180-
C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc];
181-
C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc];
182-
C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc];
183-
C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc];
184-
C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc];
185-
C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc];
180+
C[0*ldc] = alpha*rC[2][0];
181+
C[16*ldc] = alpha*rC[2][1];
182+
C[32*ldc] = alpha*rC[2][2];
183+
C[48*ldc] = alpha*rC[2][3];
184+
C[64*ldc] = alpha*rC[2][4];
185+
C[80*ldc] = alpha*rC[2][5];
186186
C+=16;
187-
C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc];
188-
C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc];
189-
C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc];
190-
C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc];
191-
C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc];
192-
C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc];
187+
C[0*ldc] = alpha*rC[3][0];
188+
C[16*ldc] = alpha*rC[3][1];
189+
C[32*ldc] = alpha*rC[3][2];
190+
C[48*ldc] = alpha*rC[3][3];
191+
C[64*ldc] = alpha*rC[3][4];
192+
C[80*ldc] = alpha*rC[3][5];
193193
C+=16;
194-
C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc];
195-
C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc];
196-
C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc];
197-
C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc];
198-
C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc];
199-
C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc];
194+
C[0*ldc] = alpha*rC[4][0];
195+
C[16*ldc] = alpha*rC[4][1];
196+
C[32*ldc] = alpha*rC[4][2];
197+
C[48*ldc] = alpha*rC[4][3];
198+
C[64*ldc] = alpha*rC[4][4];
199+
C[80*ldc] = alpha*rC[4][5];
200200
C+=16;
201-
C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc];
202-
C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc];
203-
C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc];
204-
C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc];
205-
C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc];
206-
C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc];
201+
C[0*ldc] = alpha*rC[5][0];
202+
C[16*ldc] = alpha*rC[5][1];
203+
C[32*ldc] = alpha*rC[5][2];
204+
C[48*ldc] = alpha*rC[5][3];
205+
C[64*ldc] = alpha*rC[5][4];
206+
C[80*ldc] = alpha*rC[5][5];
207207

208208
}
209209
);

src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp

Lines changed: 36 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -162,47 +162,47 @@ __kernel void sgemm_Col_TN_B0_MX096_NX096_KX16 (
162162
C+= gidy*96*ldc;
163163
C+= idy*ldc;
164164

165-
C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc];
166-
C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc];
167-
C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc];
168-
C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc];
169-
C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc];
170-
C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc];
165+
C[0*ldc] = alpha*rC[0][0];
166+
C[16*ldc] = alpha*rC[0][1];
167+
C[32*ldc] = alpha*rC[0][2];
168+
C[48*ldc] = alpha*rC[0][3];
169+
C[64*ldc] = alpha*rC[0][4];
170+
C[80*ldc] = alpha*rC[0][5];
171171
C+=16;
172-
C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc];
173-
C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc];
174-
C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc];
175-
C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc];
176-
C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc];
177-
C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc];
172+
C[0*ldc] = alpha*rC[1][0];
173+
C[16*ldc] = alpha*rC[1][1];
174+
C[32*ldc] = alpha*rC[1][2];
175+
C[48*ldc] = alpha*rC[1][3];
176+
C[64*ldc] = alpha*rC[1][4];
177+
C[80*ldc] = alpha*rC[1][5];
178178
C+=16;
179-
C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc];
180-
C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc];
181-
C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc];
182-
C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc];
183-
C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc];
184-
C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc];
179+
C[0*ldc] = alpha*rC[2][0];
180+
C[16*ldc] = alpha*rC[2][1];
181+
C[32*ldc] = alpha*rC[2][2];
182+
C[48*ldc] = alpha*rC[2][3];
183+
C[64*ldc] = alpha*rC[2][4];
184+
C[80*ldc] = alpha*rC[2][5];
185185
C+=16;
186-
C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc];
187-
C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc];
188-
C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc];
189-
C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc];
190-
C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc];
191-
C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc];
186+
C[0*ldc] = alpha*rC[3][0];
187+
C[16*ldc] = alpha*rC[3][1];
188+
C[32*ldc] = alpha*rC[3][2];
189+
C[48*ldc] = alpha*rC[3][3];
190+
C[64*ldc] = alpha*rC[3][4];
191+
C[80*ldc] = alpha*rC[3][5];
192192
C+=16;
193-
C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc];
194-
C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc];
195-
C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc];
196-
C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc];
197-
C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc];
198-
C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc];
193+
C[0*ldc] = alpha*rC[4][0];
194+
C[16*ldc] = alpha*rC[4][1];
195+
C[32*ldc] = alpha*rC[4][2];
196+
C[48*ldc] = alpha*rC[4][3];
197+
C[64*ldc] = alpha*rC[4][4];
198+
C[80*ldc] = alpha*rC[4][5];
199199
C+=16;
200-
C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc];
201-
C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc];
202-
C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc];
203-
C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc];
204-
C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc];
205-
C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc];
200+
C[0*ldc] = alpha*rC[5][0];
201+
C[16*ldc] = alpha*rC[5][1];
202+
C[32*ldc] = alpha*rC[5][2];
203+
C[48*ldc] = alpha*rC[5][3];
204+
C[64*ldc] = alpha*rC[5][4];
205+
C[80*ldc] = alpha*rC[5][5];
206206

207207
}
208208
);

0 commit comments

Comments
 (0)