Skip to content

Commit f0aaeee

Browse files
committed
refactor: simplify implementation
---
type: pre_commit_static_analysis_report
description: Results of running static analysis checks when committing changes.
report:
  - task: lint_filenames
    status: passed
  - task: lint_editorconfig
    status: passed
  - task: lint_markdown
    status: na
  - task: lint_package_json
    status: na
  - task: lint_repl_help
    status: na
  - task: lint_javascript_src
    status: passed
  - task: lint_javascript_cli
    status: na
  - task: lint_javascript_examples
    status: na
  - task: lint_javascript_tests
    status: na
  - task: lint_javascript_benchmarks
    status: na
  - task: lint_python
    status: na
  - task: lint_r
    status: na
  - task: lint_c_src
    status: passed
  - task: lint_c_examples
    status: na
  - task: lint_c_benchmarks
    status: na
  - task: lint_c_tests_fixtures
    status: na
  - task: lint_shell
    status: na
  - task: lint_typescript_declarations
    status: na
  - task: lint_typescript_tests
    status: na
  - task: lint_license_headers
    status: passed
---
1 parent 65d7e76 commit f0aaeee

File tree

3 files changed, with 11 additions and 150 deletions.

lib/node_modules/@stdlib/blas/ext/base/sapxsumpw/lib/ndarray.js

Lines changed: 3 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,12 @@
1616
* limitations under the License.
1717
*/
1818

19-
/* eslint-disable max-len */
20-
2119
'use strict';
2220

2321
// MODULES //
2422

25-
var float64ToFloat32 = require( '@stdlib/number/float64/base/to-float32' );
26-
var floor = require( '@stdlib/math/base/special/floor' );
27-
28-
29-
// VARIABLES //
30-
31-
// Blocksize for pairwise summation (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.):
32-
var BLOCKSIZE = 128;
23+
var f32 = require( '@stdlib/number/float64/base/to-float32' );
24+
var ssumpw = require( '@stdlib/blas/ext/base/ssumpw' ).ndarray;
3325

3426

3527
// MAIN //
@@ -61,74 +53,7 @@ var BLOCKSIZE = 128;
6153
* // returns 25.0
6254
*/
6355
function sapxsumpw( N, alpha, x, strideX, offsetX ) {
64-
var ix;
65-
var s0;
66-
var s1;
67-
var s2;
68-
var s3;
69-
var s4;
70-
var s5;
71-
var s6;
72-
var s7;
73-
var M;
74-
var s;
75-
var n;
76-
var i;
77-
78-
if ( N <= 0 ) {
79-
return 0.0;
80-
}
81-
ix = offsetX;
82-
if ( strideX === 0 ) {
83-
return float64ToFloat32( N * float64ToFloat32( alpha + x[ ix ] ) );
84-
}
85-
if ( N < 8 ) {
86-
// Use simple summation...
87-
s = 0.0;
88-
for ( i = 0; i < N; i++ ) {
89-
s = float64ToFloat32( s + float64ToFloat32( alpha + x[ ix ] ) );
90-
ix += strideX;
91-
}
92-
return s;
93-
}
94-
if ( N <= BLOCKSIZE ) {
95-
// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
96-
s0 = float64ToFloat32( alpha + x[ ix ] );
97-
s1 = float64ToFloat32( alpha + x[ ix+strideX ] );
98-
s2 = float64ToFloat32( alpha + x[ ix+(2*strideX) ] );
99-
s3 = float64ToFloat32( alpha + x[ ix+(3*strideX) ] );
100-
s4 = float64ToFloat32( alpha + x[ ix+(4*strideX) ] );
101-
s5 = float64ToFloat32( alpha + x[ ix+(5*strideX) ] );
102-
s6 = float64ToFloat32( alpha + x[ ix+(6*strideX) ] );
103-
s7 = float64ToFloat32( alpha + x[ ix+(7*strideX) ] );
104-
ix += 8 * strideX;
105-
106-
M = N % 8;
107-
for ( i = 8; i < N-M; i += 8 ) {
108-
s0 = float64ToFloat32( s0 + float64ToFloat32( alpha + x[ ix ] ) );
109-
s1 = float64ToFloat32( s1 + float64ToFloat32( alpha + x[ ix+strideX ] ) );
110-
s2 = float64ToFloat32( s2 + float64ToFloat32( alpha + x[ ix+(2*strideX) ] ) );
111-
s3 = float64ToFloat32( s3 + float64ToFloat32( alpha + x[ ix+(3*strideX) ] ) );
112-
s4 = float64ToFloat32( s4 + float64ToFloat32( alpha + x[ ix+(4*strideX) ] ) );
113-
s5 = float64ToFloat32( s5 + float64ToFloat32( alpha + x[ ix+(5*strideX) ] ) );
114-
s6 = float64ToFloat32( s6 + float64ToFloat32( alpha + x[ ix+(6*strideX) ] ) );
115-
s7 = float64ToFloat32( s7 + float64ToFloat32( alpha + x[ ix+(7*strideX) ] ) );
116-
ix += 8 * strideX;
117-
}
118-
// Pairwise sum the accumulators:
119-
s = float64ToFloat32( float64ToFloat32( float64ToFloat32(s0+s1) + float64ToFloat32(s2+s3) ) + float64ToFloat32( float64ToFloat32(s4+s5) + float64ToFloat32(s6+s7) ) );
120-
121-
// Clean-up loop...
122-
for ( i; i < N; i++ ) {
123-
s = float64ToFloat32( s + float64ToFloat32( alpha + x[ ix ] ) );
124-
ix += strideX;
125-
}
126-
return s;
127-
}
128-
// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
129-
n = floor( N/2 );
130-
n -= n % 8;
131-
return float64ToFloat32( sapxsumpw( n, alpha, x, strideX, ix ) + sapxsumpw( N-n, alpha, x, strideX, ix+(n*strideX) ) );
56+
return f32( f32( N * alpha ) + ssumpw( N, x, strideX, offsetX ) );
13257
}
13358

13459

lib/node_modules/@stdlib/blas/ext/base/sapxsumpw/manifest.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
"@stdlib/napi/argv-strided-float32array",
4444
"@stdlib/napi/create-double",
4545
"@stdlib/strided/base/stride2offset",
46-
"@stdlib/blas/base/shared"
46+
"@stdlib/blas/base/shared",
47+
"@stdlib/blas/ext/base/ssumpw"
4748
]
4849
},
4950
{
@@ -58,7 +59,8 @@
5859
"libpath": [],
5960
"dependencies": [
6061
"@stdlib/strided/base/stride2offset",
61-
"@stdlib/blas/base/shared"
62+
"@stdlib/blas/base/shared",
63+
"@stdlib/blas/ext/base/ssumpw"
6264
]
6365
},
6466
{
@@ -73,7 +75,8 @@
7375
"libpath": [],
7476
"dependencies": [
7577
"@stdlib/strided/base/stride2offset",
76-
"@stdlib/blas/base/shared"
78+
"@stdlib/blas/base/shared",
79+
"@stdlib/blas/ext/base/ssumpw"
7780
]
7881
}
7982
]

lib/node_modules/@stdlib/blas/ext/base/sapxsumpw/src/main.c

Lines changed: 2 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "stdlib/blas/ext/base/sapxsumpw.h"
2020
#include "stdlib/strided/base/stride2offset.h"
2121
#include "stdlib/blas/base/shared.h"
22+
#include "stdlib/blas/ext/base/ssumpw.h"
2223

2324
/**
2425
* Adds a scalar constant to each single-precision floating-point strided array element and computes the sum using pairwise summation.
@@ -53,73 +54,5 @@ float API_SUFFIX(stdlib_strided_sapxsumpw)( const CBLAS_INT N, const float alpha
5354
* @return output value
5455
*/
5556
float API_SUFFIX(stdlib_strided_sapxsumpw_ndarray)( const CBLAS_INT N, const float alpha, const float *X, const CBLAS_INT strideX, const CBLAS_INT offsetX ) {
56-
CBLAS_INT ix;
57-
CBLAS_INT M;
58-
CBLAS_INT n;
59-
CBLAS_INT i;
60-
float sum;
61-
float s0;
62-
float s1;
63-
float s2;
64-
float s3;
65-
float s4;
66-
float s5;
67-
float s6;
68-
float s7;
69-
70-
if ( N <= 0 ) {
71-
return 0.0f;
72-
}
73-
ix = offsetX;
74-
if ( strideX == 0 ) {
75-
return N * ( alpha + X[ ix ] );
76-
}
77-
if ( N < 8 ) {
78-
// Use simple summation...
79-
sum = 0.0f;
80-
for ( i = 0; i < N; i++ ) {
81-
sum += alpha + X[ ix ];
82-
ix += strideX;
83-
}
84-
return sum;
85-
}
86-
// Blocksize for pairwise summation: 128 (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.)
87-
if ( N <= 128 ) {
88-
// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
89-
s0 = alpha + X[ ix ];
90-
s1 = alpha + X[ ix+strideX ];
91-
s2 = alpha + X[ ix+(2*strideX) ];
92-
s3 = alpha + X[ ix+(3*strideX) ];
93-
s4 = alpha + X[ ix+(4*strideX) ];
94-
s5 = alpha + X[ ix+(5*strideX) ];
95-
s6 = alpha + X[ ix+(6*strideX) ];
96-
s7 = alpha + X[ ix+(7*strideX) ];
97-
ix += 8 * strideX;
98-
99-
M = N % 8;
100-
for ( i = 8; i < N-M; i += 8 ) {
101-
s0 += alpha + X[ ix ];
102-
s1 += alpha + X[ ix+strideX ];
103-
s2 += alpha + X[ ix+(2*strideX) ];
104-
s3 += alpha + X[ ix+(3*strideX) ];
105-
s4 += alpha + X[ ix+(4*strideX) ];
106-
s5 += alpha + X[ ix+(5*strideX) ];
107-
s6 += alpha + X[ ix+(6*strideX) ];
108-
s7 += alpha + X[ ix+(7*strideX) ];
109-
ix += 8 * strideX;
110-
}
111-
// Pairwise sum the accumulators:
112-
sum = ( (s0+s1) + (s2+s3) ) + ( (s4+s5) + (s6+s7) );
113-
114-
// Clean-up loop...
115-
for (; i < N; i++ ) {
116-
sum += alpha + X[ ix ];
117-
ix += strideX;
118-
}
119-
return sum;
120-
}
121-
// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
122-
n = N / 2;
123-
n -= n % 8;
124-
return API_SUFFIX(stdlib_strided_sapxsumpw_ndarray)( n, alpha, X, strideX, ix ) + API_SUFFIX(stdlib_strided_sapxsumpw_ndarray)( N-n, alpha, X, strideX, ix+(n*strideX) );
57+
return ( N * alpha ) + API_SUFFIX(stdlib_strided_ssumpw_ndarray)( N, X, strideX, offsetX );
12558
}

0 commit comments

Comments
 (0)