Skip to content

Commit b6754fa

Browse files
committed
refactor: simplify implementation
---
type: pre_commit_static_analysis_report
description: Results of running static analysis checks when committing changes.
report:
  - task: lint_filenames
    status: passed
  - task: lint_editorconfig
    status: passed
  - task: lint_markdown
    status: na
  - task: lint_package_json
    status: na
  - task: lint_repl_help
    status: na
  - task: lint_javascript_src
    status: passed
  - task: lint_javascript_cli
    status: na
  - task: lint_javascript_examples
    status: na
  - task: lint_javascript_tests
    status: na
  - task: lint_javascript_benchmarks
    status: na
  - task: lint_python
    status: na
  - task: lint_r
    status: na
  - task: lint_c_src
    status: passed
  - task: lint_c_examples
    status: na
  - task: lint_c_benchmarks
    status: na
  - task: lint_c_tests_fixtures
    status: na
  - task: lint_shell
    status: na
  - task: lint_typescript_declarations
    status: na
  - task: lint_typescript_tests
    status: na
  - task: lint_license_headers
    status: passed
---
1 parent f0aaeee commit b6754fa

File tree

3 files changed

+11
-148
lines changed

3 files changed

+11
-148
lines changed

lib/node_modules/@stdlib/blas/ext/base/sdsapxsumpw/lib/ndarray.js

Lines changed: 3 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,8 @@
2020

2121
// MODULES //
2222

23-
var float64ToFloat32 = require( '@stdlib/number/float64/base/to-float32' );
24-
var floor = require( '@stdlib/math/base/special/floor' );
25-
26-
27-
// VARIABLES //
28-
29-
// Blocksize for pairwise summation (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.):
30-
var BLOCKSIZE = 128;
23+
var f32 = require( '@stdlib/number/float64/base/to-float32' );
24+
var sdssumpw = require( '@stdlib/blas/ext/base/sdssumpw' ).ndarray;
3125

3226

3327
// MAIN //
@@ -59,74 +53,7 @@ var BLOCKSIZE = 128;
5953
* // returns 25.0
6054
*/
6155
function sdsapxsumpw( N, alpha, x, strideX, offsetX ) {
	// An empty (or negative-length) selection must sum to zero, exactly as the pre-refactor implementation did. Without this guard, the `N * alpha` term below would yield a non-zero result for negative `N` and `NaN` for `N === 0` with a non-finite `alpha`...
	if ( N <= 0 ) {
		return 0.0;
	}
	// Adding `alpha` to each of the `N` indexed elements shifts their total by `N * alpha`, so the element summation is delegated to `sdssumpw` and the shift is applied once before rounding to single-precision.
	// NOTE(review): the delegated sum is presumably already rounded to single-precision before `N * alpha` is added, so results may differ in the last bits from accumulating `alpha + x[i]` directly in double-precision; confirm this is acceptable.
	return f32( ( N * alpha ) + sdssumpw( N, x, strideX, offsetX ) );
}
13158

13259

lib/node_modules/@stdlib/blas/ext/base/sdsapxsumpw/manifest.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
"@stdlib/napi/argv-strided-float32array",
4444
"@stdlib/napi/create-double",
4545
"@stdlib/strided/base/stride2offset",
46-
"@stdlib/blas/base/shared"
46+
"@stdlib/blas/base/shared",
47+
"@stdlib/blas/ext/base/sdssumpw"
4748
]
4849
},
4950
{
@@ -58,7 +59,8 @@
5859
"libpath": [],
5960
"dependencies": [
6061
"@stdlib/strided/base/stride2offset",
61-
"@stdlib/blas/base/shared"
62+
"@stdlib/blas/base/shared",
63+
"@stdlib/blas/ext/base/sdssumpw"
6264
]
6365
},
6466
{
@@ -73,7 +75,8 @@
7375
"libpath": [],
7476
"dependencies": [
7577
"@stdlib/strided/base/stride2offset",
76-
"@stdlib/blas/base/shared"
78+
"@stdlib/blas/base/shared",
79+
"@stdlib/blas/ext/base/sdssumpw"
7780
]
7881
}
7982
]

lib/node_modules/@stdlib/blas/ext/base/sdsapxsumpw/src/main.c

Lines changed: 2 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "stdlib/blas/ext/base/sdsapxsumpw.h"
2020
#include "stdlib/strided/base/stride2offset.h"
2121
#include "stdlib/blas/base/shared.h"
22+
#include "stdlib/blas/ext/base/sdssumpw.h"
2223

2324
/**
2425
* Adds a scalar constant to each single-precision floating-point strided array element and computes the sum using pairwise summation with extended accumulation.
@@ -53,73 +54,5 @@ float API_SUFFIX(stdlib_strided_sdsapxsumpw)( const CBLAS_INT N, const float alp
5354
* @return output value
5455
*/
5556
float API_SUFFIX(stdlib_strided_sdsapxsumpw_ndarray)( const CBLAS_INT N, const float alpha, const float *X, const CBLAS_INT strideX, const CBLAS_INT offsetX ) {
56-
CBLAS_INT ix;
57-
CBLAS_INT M;
58-
CBLAS_INT n;
59-
CBLAS_INT i;
60-
double sum;
61-
double s0;
62-
double s1;
63-
double s2;
64-
double s3;
65-
double s4;
66-
double s5;
67-
double s6;
68-
double s7;
69-
70-
if ( N <= 0 ) {
71-
return 0.0;
72-
}
73-
ix = offsetX;
74-
if ( strideX == 0 ) {
75-
return N * ( alpha + X[ ix ] );
76-
}
77-
if ( N < 8 ) {
78-
// Use simple summation...
79-
sum = 0.0;
80-
for ( i = 0; i < N; i++ ) {
81-
sum += alpha + X[ ix ];
82-
ix += strideX;
83-
}
84-
return sum;
85-
}
86-
// Blocksize for pairwise summation: 128 (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.)
87-
if ( N <= 128 ) {
88-
// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
89-
s0 = alpha + X[ ix ];
90-
s1 = alpha + X[ ix+strideX ];
91-
s2 = alpha + X[ ix+(2*strideX) ];
92-
s3 = alpha + X[ ix+(3*strideX) ];
93-
s4 = alpha + X[ ix+(4*strideX) ];
94-
s5 = alpha + X[ ix+(5*strideX) ];
95-
s6 = alpha + X[ ix+(6*strideX) ];
96-
s7 = alpha + X[ ix+(7*strideX) ];
97-
ix += 8 * strideX;
98-
99-
M = N % 8;
100-
for ( i = 8; i < N-M; i += 8 ) {
101-
s0 += alpha + X[ ix ];
102-
s1 += alpha + X[ ix+strideX ];
103-
s2 += alpha + X[ ix+(2*strideX) ];
104-
s3 += alpha + X[ ix+(3*strideX) ];
105-
s4 += alpha + X[ ix+(4*strideX) ];
106-
s5 += alpha + X[ ix+(5*strideX) ];
107-
s6 += alpha + X[ ix+(6*strideX) ];
108-
s7 += alpha + X[ ix+(7*strideX) ];
109-
ix += 8 * strideX;
110-
}
111-
// Pairwise sum the accumulators:
112-
sum = ( (s0+s1) + (s2+s3) ) + ( (s4+s5) + (s6+s7) );
113-
114-
// Clean-up loop...
115-
for (; i < N; i++ ) {
116-
sum += alpha + X[ ix ];
117-
ix += strideX;
118-
}
119-
return sum;
120-
}
121-
// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
122-
n = N / 2;
123-
n -= n % 8;
124-
return API_SUFFIX(stdlib_strided_sdsapxsumpw_ndarray)( n, alpha, X, strideX, ix ) + API_SUFFIX(stdlib_strided_sdsapxsumpw_ndarray)( N-n, alpha, X, strideX, ix+(n*strideX) );
57+
return ( N * (double)alpha ) + (double)API_SUFFIX(stdlib_strided_sdssumpw_ndarray)( N, X, strideX, offsetX );
12558
}

0 commit comments

Comments
 (0)