@@ -131,12 +131,15 @@ function dtrmm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
131
131
var sa1 ;
132
132
var sb0 ;
133
133
var sb1 ;
134
+ var oa ;
134
135
var ob ;
136
+ var ia ;
137
+ var ib ;
135
138
var i ;
136
139
var j ;
137
140
var k ;
138
141
139
- // Note on variable naming convention: sa#, ix#, i # where # corresponds to the loop number, with `0` being the innermost loop...
142
+ // Note on variable naming convention: sa#, sb # where # corresponds to the loop number, with `0` being the innermost loop...
140
143
141
144
isrma = isRowMajor ( [ strideA1 , strideA2 ] ) ;
142
145
nonunit = ( diag === 'non-unit' ) ;
@@ -172,10 +175,10 @@ function dtrmm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
172
175
tmp = alpha * B [ ob2 ] ;
173
176
ia = offsetA + ( k * sa0 ) ;
174
177
for ( i = 0 ; i < k ; i ++ ) {
175
- B [ ib + ( i * sb1 ) ] += ( tmp * A [ ia + ( i * sa1 ) ] ) ;
178
+ B [ ib + ( i * sb1 ) ] += ( tmp * A [ ia + ( i * sa1 ) ] ) ;
176
179
}
177
180
if ( nonunit ) {
178
- tmp *= A [ ia + ( k * sa1 ) ] ;
181
+ tmp *= A [ ia + ( k * sa1 ) ] ;
179
182
}
180
183
B [ ob2 ] = tmp ;
181
184
}
@@ -187,17 +190,17 @@ function dtrmm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
187
190
( ! isrma && side === 'right' && uplo === 'upper' && transa === 'no-transpose' )
188
191
) {
189
192
for ( j = 0 ; j < N ; j ++ ) {
193
+ ib = offsetB + ( j * sb0 ) ;
190
194
for ( k = M - 1 ; k >= 0 ; k -- ) {
191
- ob2 = offsetB + ( k * sb1 ) + ( j * sb0 ) ;
195
+ ob2 = ib + ( k * sb1 ) ;
192
196
tmp = alpha * B [ ob2 ] ;
197
+ ia = offsetA + ( k * sa0 ) ;
193
198
for ( i = k + 1 ; i < M ; i ++ ) {
194
- oa2 = offsetA + ( i * sa1 ) + ( k * sa0 ) ;
195
- ob = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
196
- B [ ob ] += ( tmp * A [ oa2 ] ) ;
199
+ oa2 = ia + ( i * sa1 ) ;
200
+ B [ ib + ( i * sb1 ) ] += ( tmp * A [ oa2 ] ) ;
197
201
}
198
202
if ( nonunit ) {
199
- oa2 = offsetA + ( k * sa1 ) + ( k * sa0 ) ;
200
- tmp *= A [ oa2 ] ;
203
+ tmp *= A [ ia + ( k * sa1 ) ] ;
201
204
}
202
205
B [ ob2 ] = tmp ;
203
206
}
@@ -209,19 +212,20 @@ function dtrmm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
209
212
( ! isrma && side === 'right' && uplo === 'lower' && transa !== 'no-transpose' )
210
213
) {
211
214
for ( j = 0 ; j < N ; j ++ ) {
215
+ ib = offsetB + ( j * sb0 ) ;
212
216
for ( i = M - 1 ; i >= 0 ; i -- ) {
213
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
217
+ ob2 = ib + ( i * sb1 ) ;
214
218
tmp = 0.0 ;
215
- for ( k = 0 ; k < i ; k ++ ) {
216
- oa2 = offsetA + ( k * sa1 ) + ( i * sa0 ) ;
217
- tmp += A [ oa2 ] * B [ offsetB + ( k * sb1 ) + ( j * sb0 ) ] ;
218
- }
219
+ ia = offsetA + ( i * sa0 ) ;
219
220
if ( nonunit ) {
220
- oa2 = offsetA + ( i * sa1 ) + ( i * sa0 ) ;
221
- tmp += ( A [ oa2 ] * B [ ob2 ] ) ;
221
+ tmp += ( A [ ia + ( i * sa1 ) ] * B [ ob2 ] ) ;
222
222
} else {
223
223
tmp += B [ ob2 ] ;
224
224
}
225
+ for ( k = 0 ; k < i ; k ++ ) {
226
+ oa2 = ia + ( k * sa1 ) ;
227
+ tmp += A [ oa2 ] * B [ ib + ( k * sb1 ) ] ;
228
+ }
225
229
B [ ob2 ] = alpha * tmp ;
226
230
}
227
231
}
@@ -232,16 +236,16 @@ function dtrmm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
232
236
( ! isrma && side === 'right' && uplo === 'upper' && transa === 'transpose' )
233
237
) {
234
238
for ( j = 0 ; j < N ; j ++ ) {
239
+ ib = offsetB + ( j * sb0 ) ;
235
240
for ( i = 0 ; i < M ; i ++ ) {
236
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
241
+ ia = offsetA + ( i * sa0 ) ;
242
+ ob2 = ib + ( i * sb1 ) ;
237
243
tmp = 0.0 ;
238
244
for ( k = i + 1 ; k < M ; k ++ ) {
239
- oa2 = offsetA + ( k * sa1 ) + ( i * sa0 ) ;
240
- tmp += A [ oa2 ] * B [ offsetB + ( k * sb1 ) + ( j * sb0 ) ] ;
245
+ tmp += A [ ia + ( k * sa1 ) ] * B [ ib + ( k * sb1 ) ] ;
241
246
}
242
247
if ( nonunit ) {
243
- oa2 = offsetA + ( i * sa1 ) + ( i * sa0 ) ;
244
- tmp += ( A [ oa2 ] * B [ ob2 ] ) ;
248
+ tmp += ( A [ ia + ( i * sa1 ) ] * B [ ob2 ] ) ;
245
249
} else {
246
250
tmp += B [ ob2 ] ;
247
251
}
@@ -255,25 +259,22 @@ function dtrmm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
255
259
( ! isrma && side === 'left' && uplo === 'lower' && transa === 'no-transpose' )
256
260
) {
257
261
for ( j = N - 1 ; j >= 0 ; j -- ) {
258
- if ( nonunit ) {
259
- oa2 = offsetA + ( j * sa1 ) + ( j * sa0 ) ;
260
- tmp = A [ oa2 ] ;
261
- for ( i = 0 ; i < M ; i ++ ) {
262
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
263
- B [ ob2 ] *= tmp ;
264
- }
265
- }
262
+ ia = offsetA + ( j * sa0 ) ;
263
+ ib = offsetB + ( j * sb0 ) ;
266
264
for ( i = 0 ; i < M ; i ++ ) {
267
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
268
- B [ ob2 ] *= alpha ;
269
- }
270
- for ( k = 0 ; k < j ; k ++ ) {
271
- oa2 = offsetA + ( k * sa1 ) + ( j * sa0 ) ;
272
- if ( A [ oa2 ] !== 0.0 ) {
273
- tmp = alpha * A [ oa2 ] ;
274
- for ( i = 0 ; i < M ; i ++ ) {
275
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
276
- B [ ob2 ] += ( tmp * B [ offsetB + ( i * sb1 ) + ( k * sb0 ) ] ) ;
265
+ ob = ib + ( i * sb1 ) ;
266
+ B [ ob ] *= alpha ;
267
+ if ( nonunit ) {
268
+ oa2 = ia + ( j * sa1 ) ;
269
+ tmp = A [ oa2 ] ;
270
+ B [ ob ] *= tmp ;
271
+ }
272
+ for ( k = 0 ; k < j ; k ++ ) {
273
+ oa2 = ia + ( k * sa1 ) ;
274
+ ob2 = offsetB + ( k * sb0 ) ;
275
+ if ( A [ oa2 ] !== 0.0 ) {
276
+ tmp = alpha * A [ oa2 ] ;
277
+ B [ ob ] += ( tmp * B [ ob2 + ( i * sb1 ) ] ) ;
277
278
}
278
279
}
279
280
}
@@ -285,25 +286,21 @@ function dtrmm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
285
286
( ! isrma && side === 'left' && uplo === 'upper' && transa === 'no-transpose' )
286
287
) {
287
288
for ( j = 0 ; j < N ; j ++ ) {
288
- if ( nonunit ) {
289
- oa2 = offsetA + ( j * sa1 ) + ( j * sa0 ) ;
290
- tmp = A [ oa2 ] ;
291
- for ( i = 0 ; i < M ; i ++ ) {
292
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
293
- B [ ob2 ] *= tmp ;
294
- }
295
- }
289
+ ia = offsetA + ( j * sa0 ) ;
296
290
for ( i = 0 ; i < M ; i ++ ) {
297
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
298
- B [ ob2 ] *= alpha ;
299
- }
300
- for ( k = j + 1 ; k < N ; k ++ ) {
301
- oa2 = offsetA + ( k * sa1 ) + ( j * sa0 ) ;
302
- if ( A [ oa2 ] !== 0.0 ) {
303
- tmp = alpha * A [ oa2 ] ;
304
- for ( i = 0 ; i < M ; i ++ ) {
305
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
306
- B [ ob2 ] += ( tmp * B [ offsetB + ( i * sb1 ) + ( k * sb0 ) ] ) ;
291
+ ib = offsetB + ( i * sb1 ) ;
292
+ ob = ib + ( j * sb0 ) ;
293
+ B [ ob ] *= alpha ;
294
+ if ( nonunit ) {
295
+ oa = ia + ( j * sa1 ) ;
296
+ B [ ob ] *= A [ oa ] ;
297
+ }
298
+ for ( k = j + 1 ; k < N ; k ++ ) {
299
+ oa2 = ia + ( k * sa1 ) ;
300
+ ob2 = ib + ( k * sb0 ) ;
301
+ if ( A [ oa2 ] !== 0.0 ) {
302
+ tmp = alpha * A [ oa2 ] ;
303
+ B [ ob ] += ( tmp * B [ ob2 ] ) ;
307
304
}
308
305
}
309
306
}
@@ -315,39 +312,44 @@ function dtrmm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
315
312
( ! isrma && side === 'left' && uplo === 'lower' && transa !== 'no-transpose' )
316
313
) {
317
314
for ( j = 0 ; j < N ; j ++ ) {
315
+ ia = offsetA + ( j * sa1 ) ;
318
316
for ( i = 0 ; i < M ; i ++ ) {
319
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
320
- oa2 = offsetA + ( j * sa1 ) + ( j * sa0 ) ;
317
+ ib = offsetB + ( i * sb1 ) ;
318
+ oa = ia + ( j * sa0 ) ;
319
+ ob = ib + ( j * sb0 ) ;
321
320
if ( nonunit ) {
322
- tmp = B [ ob2 ] * A [ oa2 ] ;
321
+ tmp = B [ ob ] * A [ oa ] ;
323
322
} else {
324
- tmp = B [ ob2 ] ;
323
+ tmp = B [ ob ] ;
325
324
}
326
325
for ( k = j + 1 ; k < N ; k ++ ) {
327
- oa2 = offsetA + ( j * sa1 ) + ( k * sa0 ) ;
328
- tmp += ( B [ offsetB + ( i * sb1 ) + ( k * sb0 ) ] * A [ oa2 ] ) ;
326
+ oa2 = ia + ( k * sa0 ) ;
327
+ ob2 = ib + ( k * sb0 ) ;
328
+ tmp += ( B [ ob2 ] * A [ oa2 ] ) ;
329
329
}
330
- B [ ob2 ] = alpha * tmp ;
330
+ B [ ob ] = alpha * tmp ;
331
331
}
332
332
}
333
333
return B ;
334
334
}
335
335
// ( isrma && side === 'right' && uplo === 'lower' && transa !== 'no-transpose' ) || ( !isrma && side === 'left' && uplo === 'upper' && transa !== 'no-transpose' )
336
336
for ( i = 0 ; i < M ; i ++ ) {
337
+ ib = offsetB + ( i * sb1 ) ;
337
338
for ( j = N - 1 ; j >= 0 ; j -- ) {
338
- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
339
- oa2 = offsetA + ( j * sa1 ) + ( j * sa0 ) ;
339
+ ia = offsetA + ( j * sa1 ) ;
340
+ oa = ia + ( j * sa0 ) ;
341
+ ob = ib + ( j * sb0 ) ;
340
342
if ( nonunit ) {
341
- tmp = B [ ob2 ] * A [ oa2 ] ;
343
+ tmp = B [ ob ] * A [ oa ] ;
342
344
} else {
343
- tmp = B [ ob2 ] ;
345
+ tmp = B [ ob ] ;
344
346
}
345
347
for ( k = 0 ; k < j ; k ++ ) {
346
- oa2 = offsetA + ( j * sa1 ) + ( k * sa0 ) ;
347
- tmp += ( B [ offsetB + ( i * sb1 ) + ( k * sb0 ) ] * A [ oa2 ] ) ;
348
+ oa2 = ia + ( k * sa0 ) ;
349
+ ob2 = ib + ( k * sb0 ) ;
350
+ tmp += ( B [ ob2 ] * A [ oa2 ] ) ;
348
351
}
349
-
350
- B [ ob2 ] = alpha * tmp ;
352
+ B [ ob ] = alpha * tmp ;
351
353
}
352
354
}
353
355
return B ;
0 commit comments