@@ -117,10 +117,10 @@ function zeros( M, N, X, strideX1, strideX2, offsetX ) { // TODO: consider movin
117117* var Float32Array = require( '@stdlib/array/float32' );
118118*
119119* var A = new Float32Array( [ 1.0, 3.0, 0.0, 4.0 ] );
120- * var B = new Float32Array( [ 5.0, 7 .0, 0.0, 8.0 ] );
120+ * var B = new Float32Array( [ 5.0, 6 .0, 0.0, 8.0 ] );
121121*
122122* strsm( 'left', 'upper', 'no-transpose', 'non-unit', 2, 2, 6.0, A, 2, 1, 0, B, 2, 1, 0 );
123- * // B => <Float32Array>[ 30.0, 6 .0, 0.0, 12.0 ]
123+ * // B => <Float32Array>[ 30.0, 0 .0, 0.0, 12.0 ]
124124*/
125125function strsm ( side , uplo , transa , diag , M , N , alpha , A , strideA1 , strideA2 , offsetA , B , strideB1 , strideB2 , offsetB ) { // eslint-disable-line max-params
126126 var nonunit ;
@@ -171,123 +171,146 @@ function strsm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
171171 zeros ( M , N , B , sb0 , sb1 , offsetB ) ;
172172 return B ;
173173 }
174- if ( side === 'left' ) {
175- if ( transa === 'no-transpose' ) {
176- // B := alpha * inv( A ) * B
177- if ( uplo === 'upper' ) {
178- for ( j = 0 ; j < N ; j ++ ) {
179- ob = offsetB + ( j * sb0 ) ;
180- if ( alpha !== 1.0 ) {
181- for ( i = 0 ; i < M ; i ++ ) {
182- B [ ob + ( i * sb1 ) ] = f32 ( B [ ob + ( i * sb1 ) ] * alpha ) ;
183- }
184- }
185- for ( k = M - 1 ; k >= 0 ; k -- ) {
186- oa2 = offsetA + ( k * sa1 ) + ( k * sa0 ) ;
187- ob2 = ob + ( k * sb1 ) ;
188- if ( B [ ob2 ] !== 0.0 ) {
189- if ( nonunit ) {
190- B [ ob2 ] = f32 ( B [ ob2 ] / A [ oa2 ] ) ;
191- }
192- for ( i = 0 ; i < k ; i ++ ) {
193- B [ ob + ( i * sb1 ) ] -= f32 ( B [ ob2 ] * A [ offsetA + ( i * sa1 ) + ( k * sa0 ) ] ) ;
194- }
195- }
196- }
174+ if (
175+ ( isrma && side === 'left' && uplo === 'upper' && transa === 'no-transpose' ) ||
176+ ( ! isrma && side === 'right' && uplo === 'lower' && transa === 'no-transpose' )
177+ ) {
178+ for ( k = N - 1 ; k >= 0 ; k -- ) {
179+ if ( nonunit ) {
180+ oa2 = offsetA + ( k * sa1 ) + ( k * sa0 ) ;
181+ tmp = f32 ( 1.0 / A [ oa2 ] ) ;
182+ for ( i = 0 ; i < M ; i ++ ) {
183+ ob2 = offsetB + ( i * sb0 ) + ( k * sb1 ) ;
184+ B [ ob2 ] = f32 ( B [ ob2 ] * tmp ) ;
197185 }
198- return B ;
199186 }
200- // Lower
201- for ( j = 0 ; j < N ; j ++ ) {
202- ob = offsetB + ( j * sb0 ) ;
203- if ( alpha !== 1.0 ) {
187+ for ( j = 0 ; j < k ; j ++ ) {
188+ oa2 = offsetA + ( j * sa1 ) + ( k * sa0 ) ;
189+ if ( A [ oa2 ] !== 0.0 ) {
204190 for ( i = 0 ; i < M ; i ++ ) {
205- B [ ob + ( i * sb1 ) ] = f32 ( B [ ob + ( i * sb1 ) ] * alpha ) ;
191+ ob = offsetB + ( i * sb0 ) ;
192+ B [ ob + j * sb1 ] -= f32 ( A [ oa2 ] * B [ ob + k * sb1 ] ) ;
206193 }
207194 }
208- for ( k = 0 ; k < M ; k ++ ) {
209- oa2 = offsetA + ( k * sa1 ) + ( k * sa0 ) ;
210- ob2 = ob + ( k * sb1 ) ;
211- if ( B [ ob2 ] !== 0.0 ) {
212- if ( nonunit ) {
213- B [ ob2 ] = f32 ( B [ ob2 ] / A [ oa2 ] ) ;
214- }
215- for ( i = k + 1 ; i < M ; i ++ ) {
216- oa2 = offsetA + i * sa1 + k * sa0 ;
217- B [ ob + ( i * sb1 ) ] -= f32 ( B [ ob2 ] * A [ oa2 ] ) ;
218- }
219- }
195+ }
196+ if ( alpha !== 1.0 ) {
197+ for ( i = 0 ; i < M ; i ++ ) {
198+ ob2 = offsetB + ( i * sb0 ) + ( k * sb1 ) ;
199+ B [ ob2 ] = f32 ( B [ ob2 ] * alpha ) ;
220200 }
221201 }
222- return B ;
223202 }
224- // B := alpha * inv( A**T ) * B
225- if ( uplo === 'upper' ) {
226- for ( j = 0 ; j < N ; j ++ ) {
227- ob = offsetB + ( j * sb0 ) ;
203+ return B ;
204+ }
205+ if (
206+ ( isrma && side === 'left' && uplo === 'lower' && transa === 'no-transpose' ) ||
207+ ( ! isrma && side === 'right' && uplo === 'upper' && transa === 'no-transpose' )
208+ ) {
209+ for ( j = 0 ; j < N ; j ++ ) {
210+ ob = offsetB + ( j * sb0 ) ;
211+ if ( alpha !== 1.0 ) {
228212 for ( i = 0 ; i < M ; i ++ ) {
229- oa2 = offsetA + ( i * sa1 ) + ( i * sa0 ) ;
230- tmp = f32 ( B [ ob + ( i * sb1 ) ] * alpha ) ;
231- for ( k = 0 ; k < i ; k ++ ) {
232- oa = offsetA + ( k * sa1 ) ;
233- tmp -= f32 ( A [ oa + ( i * sa0 ) ] * B [ ob + ( k * sb1 ) ] ) ;
234- }
213+ B [ ob + ( i * sb1 ) ] = f32 ( B [ ob + ( i * sb1 ) ] * alpha ) ;
214+ }
215+ }
216+ for ( k = 0 ; k < M ; k ++ ) {
217+ oa2 = offsetA + ( k * sa1 ) + ( k * sa0 ) ;
218+ ob2 = ob + ( k * sb1 ) ;
219+ if ( B [ ob2 ] !== 0.0 ) {
235220 if ( nonunit ) {
236- tmp = f32 ( tmp / A [ oa2 ] ) ;
221+ B [ ob2 ] = f32 ( B [ ob2 ] / A [ oa2 ] ) ;
222+ }
223+ for ( i = k + 1 ; i < M ; i ++ ) {
224+ oa2 = offsetA + ( i * sa1 ) + ( k * sa0 ) ;
225+ B [ ob + ( i * sb1 ) ] -= f32 ( B [ ob2 ] * A [ oa2 ] ) ;
226+ }
227+ }
228+ }
229+ }
230+ return B ;
231+ }
232+ if (
233+ ( isrma && side === 'left' && uplo === 'upper' && transa !== 'no-transpose' ) ||
234+ ( ! isrma && side === 'right' && uplo === 'lower' && transa !== 'no-transpose' )
235+ ) {
236+ for ( j = 0 ; j < N ; j ++ ) {
237+ for ( i = 0 ; i < M ; i ++ ) {
238+ ob2 = offsetB + ( i * sb0 ) + ( j * sb1 ) ;
239+ if ( alpha !== 1.0 ) {
240+ B [ ob2 ] = f32 ( B [ ob2 ] * alpha ) ;
241+ }
242+ }
243+ for ( k = 0 ; k < j ; k ++ ) {
244+ for ( i = 0 ; i < M ; i ++ ) {
245+ ob = offsetB + ( i * sb0 ) ;
246+ oa2 = offsetA + ( k * sa1 ) + ( j * sa0 ) ;
247+ if ( A [ oa2 ] !== 0.0 ) {
248+ B [ ob + ( j * sb1 ) ] -= f32 ( A [ oa2 ] * B [ ob + ( k * sb1 ) ] ) ;
237249 }
238- B [ ob + ( i * sb1 ) ] = tmp ;
239250 }
240251 }
241- return B ;
252+ if ( nonunit ) {
253+ oa2 = offsetA + ( j * sa1 ) + ( j * sa0 ) ;
254+ tmp = f32 ( 1.0 / A [ oa2 ] ) ;
255+ for ( i = 0 ; i < M ; i ++ ) {
256+ ob2 = offsetB + ( i * sb0 ) + ( j * sb1 ) ;
257+ B [ ob2 ] = f32 ( B [ ob2 ] * tmp ) ;
258+ }
259+ }
242260 }
243- // Lower
261+ return B ;
262+ }
263+ if (
264+ ( isrma && side === 'left' && uplo === 'lower' && transa !== 'no-transpose' ) ||
265+ ( ! isrma && side === 'right' && uplo === 'upper' && transa === 'transpose' )
266+ ) {
244267 for ( j = 0 ; j < N ; j ++ ) {
245268 ob = offsetB + ( j * sb0 ) ;
269+ if ( alpha !== 1.0 ) {
270+ for ( i = 0 ; i < M ; i ++ ) {
271+ B [ ob + ( i * sb1 ) ] = f32 ( B [ ob + ( i * sb1 ) ] * alpha ) ;
272+ }
273+ }
246274 for ( i = M - 1 ; i >= 0 ; i -- ) {
247- oa2 = offsetA + ( i * sa1 ) + ( i * sa0 ) ;
248- tmp = f32 ( B [ ob + ( i * sb1 ) ] * alpha ) ;
275+ ob2 = ob + ( i * sb1 ) ;
249276 for ( k = i + 1 ; k < M ; k ++ ) {
250- tmp -= f32 ( A [ offsetA + ( k * sa1 ) + ( i * sa0 ) ] * B [ ob + ( k * sb1 ) ] ) ;
277+ oa2 = offsetA + ( i * sa0 ) + ( k * sa1 ) ;
278+ B [ ob2 ] -= f32 ( A [ oa2 ] * B [ ob + ( k * sb1 ) ] ) ;
251279 }
252280 if ( nonunit ) {
253- tmp = f32 ( tmp / A [ oa2 ] ) ;
281+ oa2 = offsetA + ( i * sa0 ) + ( i * sa1 ) ;
282+ B [ ob2 ] = f32 ( B [ ob2 ] / A [ oa2 ] ) ;
254283 }
255- B [ ob + ( i * sb1 ) ] = tmp ;
284+ B [ ob + ( i * sb1 ) ] = B [ ob2 ] ;
256285 }
257286 }
258287 return B ;
259288 }
260- // Right
261- if ( transa === 'no-transpose' ) {
262- if ( uplo === 'upper' ) {
263- for ( j = 0 ; j < N ; j ++ ) {
264- for ( i = 0 ; i < M ; i ++ ) {
265- ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
266- if ( alpha !== 1.0 ) {
267- B [ ob2 ] = f32 ( B [ ob2 ] * alpha ) ;
268- }
269- }
270- for ( k = 0 ; k < j ; k ++ ) {
271- for ( i = 0 ; i < M ; i ++ ) {
272- ob2 = offsetB + i * sb1 ;
273- oa2 = offsetA + k * sa1 + j * sa0 ;
274- if ( A [ oa2 ] !== 0.0 ) {
275- B [ ob2 + j * sb0 ] -= f32 ( A [ oa2 ] * B [ ob2 + k * sb0 ] ) ;
276- }
277- }
289+ if (
290+ ( isrma && side === 'right' && uplo === 'upper' && transa === 'no-transpose' ) ||
291+ ( ! isrma && side === 'left' && uplo === 'lower' && transa === 'no-transpose' )
292+ ) {
293+ for ( j = 0 ; j < N ; j ++ ) {
294+ ob = offsetB + ( j * sb1 ) ;
295+ for ( i = 0 ; i < M ; i ++ ) {
296+ oa2 = offsetA + ( i * sa1 ) + ( i * sa0 ) ;
297+ tmp = f32 ( B [ ob + ( i * sb0 ) ] * alpha ) ;
298+ for ( k = 0 ; k < i ; k ++ ) {
299+ oa = offsetA + ( k * sa1 ) ;
300+ tmp -= f32 ( A [ oa + ( i * sa0 ) ] * B [ ob + ( k * sb0 ) ] ) ;
278301 }
279302 if ( nonunit ) {
280- oa2 = offsetA + j * sa1 + j * sa0 ;
281- tmp = f32 ( 1.0 / A [ oa2 ] ) ;
282- for ( i = 0 ; i < M ; i ++ ) {
283- ob2 = offsetB + i * sb1 + j * sb0 ;
284- B [ ob2 ] = f32 ( B [ ob2 ] * tmp ) ;
285- }
303+ tmp = f32 ( tmp / A [ oa2 ] ) ;
286304 }
305+ B [ ob + ( i * sb0 ) ] = tmp ;
287306 }
288- return B ;
289307 }
290- // Lower
308+ return B ;
309+ }
310+ if (
311+ ( isrma && side === 'right' && uplo === 'lower' && transa === 'no-transpose' ) ||
312+ ( ! isrma && side === 'left' && uplo === 'upper' && transa === 'no-transpose' )
313+ ) {
291314 for ( j = N - 1 ; j >= 0 ; j -- ) {
292315 for ( i = 0 ; i < M ; i ++ ) {
293316 ob2 = offsetB + ( i * sb1 ) + ( j * sb0 ) ;
@@ -315,36 +338,34 @@ function strsm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
315338 }
316339 return B ;
317340 }
318- // B := alpha * B * inv( A**T )
319- if ( uplo === 'upper' ) {
320- for ( k = N - 1 ; k >= 0 ; k -- ) {
321- if ( nonunit ) {
322- oa2 = offsetA + k * sa1 + k * sa0 ;
323- tmp = f32 ( 1.0 / A [ oa2 ] ) ;
341+ if (
342+ ( isrma && side === 'right' && uplo === 'upper' && transa !== 'no-transpose' ) ||
343+ ( ! isrma && side === 'left' && uplo === 'lower' && transa !== 'no-transpose' )
344+ ) {
345+ for ( j = 0 ; j < N ; j ++ ) {
346+ ob = offsetB + ( j * sb1 ) ;
347+ if ( alpha !== 1.0 ) {
324348 for ( i = 0 ; i < M ; i ++ ) {
325- ob2 = offsetB + i * sb1 + k * sb0 ;
326- B [ ob2 ] = f32 ( B [ ob2 ] * tmp ) ;
349+ B [ ob + ( i * sb0 ) ] = f32 ( B [ ob + ( i * sb0 ) ] * alpha ) ;
327350 }
328351 }
329- for ( j = 0 ; j < k ; j ++ ) {
330- oa2 = offsetA + ( j * sa1 ) + ( k * sa0 ) ;
331- if ( A [ oa2 ] !== 0.0 ) {
332- for ( i = 0 ; i < M ; i ++ ) {
333- ob = offsetB + i * sb1 ;
334- B [ ob + j * sb0 ] -= f32 ( A [ oa2 ] * B [ ob + k * sb0 ] ) ;
352+ for ( k = M - 1 ; k >= 0 ; k -- ) {
353+ oa2 = offsetA + ( k * sa1 ) + ( k * sa0 ) ;
354+ ob2 = ob + ( k * sb0 ) ;
355+ if ( B [ ob2 ] !== 0.0 ) {
356+ if ( nonunit ) {
357+ B [ ob2 ] = f32 ( B [ ob2 ] / A [ oa2 ] ) ;
358+ }
359+ for ( i = 0 ; i < k ; i ++ ) {
360+ oa = offsetA + ( i * sa1 ) + ( k * sa0 ) ;
361+ B [ ob + ( i * sb0 ) ] -= f32 ( B [ ob2 ] * A [ oa ] ) ;
335362 }
336- }
337- }
338- if ( alpha !== 1.0 ) {
339- for ( i = 0 ; i < M ; i ++ ) {
340- ob2 = offsetB + i * sb1 + k * sb0 ;
341- B [ ob2 ] = f32 ( B [ ob2 ] * alpha ) ;
342363 }
343364 }
344365 }
345366 return B ;
346367 }
347- // Lower
368+ // ( isrma && side === 'right' && uplo === 'lower' && transa !== 'no-transpose' ) || ( !isrma && side === 'left' && uplo === 'upper' && transa !== 'no-transpose' )
348369 for ( k = 0 ; k < N ; k ++ ) {
349370 ob = offsetB + ( k * sb0 ) ;
350371 oa = offsetA + ( k * sa1 ) ;
@@ -360,7 +381,7 @@ function strsm( side, uplo, transa, diag, M, N, alpha, A, strideA1, strideA2, of
360381 oa2 = offsetA + j * sa1 + k * sa0 ;
361382 if ( A [ oa2 ] !== 0.0 ) {
362383 for ( i = 0 ; i < M ; i ++ ) {
363- B [ ob2 + ( i * sb1 ) ] -= f32 ( a_jk * B [ ob + ( i * sb1 ) ] ) ;
384+ B [ ob2 + ( i * sb1 ) ] -= f32 ( A [ oa2 ] * B [ ob + ( i * sb1 ) ] ) ;
364385 }
365386 }
366387 }
0 commit comments