|
16 | 16 | * limitations under the License.
|
17 | 17 | */
|
18 | 18 |
|
19 |
| -/* eslint-disable max-len */ |
20 |
| - |
21 | 19 | 'use strict';
|
22 | 20 |
|
23 | 21 | // MODULES //
|
24 | 22 |
|
25 |
| -var float64ToFloat32 = require( '@stdlib/number/float64/base/to-float32' ); |
26 |
| -var floor = require( '@stdlib/math/base/special/floor' ); |
27 |
| - |
28 |
| - |
29 |
| -// VARIABLES // |
30 |
| - |
31 |
| -// Blocksize for pairwise summation (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.): |
32 |
| -var BLOCKSIZE = 128; |
| 23 | +var f32 = require( '@stdlib/number/float64/base/to-float32' ); |
| 24 | +var ssumpw = require( '@stdlib/blas/ext/base/ssumpw' ).ndarray; |
33 | 25 |
|
34 | 26 |
|
35 | 27 | // MAIN //
|
@@ -61,74 +53,7 @@ var BLOCKSIZE = 128;
|
61 | 53 | * // returns 25.0
|
62 | 54 | */
|
63 | 55 | function sapxsumpw( N, alpha, x, strideX, offsetX ) {
|
64 |
| - var ix; |
65 |
| - var s0; |
66 |
| - var s1; |
67 |
| - var s2; |
68 |
| - var s3; |
69 |
| - var s4; |
70 |
| - var s5; |
71 |
| - var s6; |
72 |
| - var s7; |
73 |
| - var M; |
74 |
| - var s; |
75 |
| - var n; |
76 |
| - var i; |
77 |
| - |
78 |
| - if ( N <= 0 ) { |
79 |
| - return 0.0; |
80 |
| - } |
81 |
| - ix = offsetX; |
82 |
| - if ( strideX === 0 ) { |
83 |
| - return float64ToFloat32( N * float64ToFloat32( alpha + x[ ix ] ) ); |
84 |
| - } |
85 |
| - if ( N < 8 ) { |
86 |
| - // Use simple summation... |
87 |
| - s = 0.0; |
88 |
| - for ( i = 0; i < N; i++ ) { |
89 |
| - s = float64ToFloat32( s + float64ToFloat32( alpha + x[ ix ] ) ); |
90 |
| - ix += strideX; |
91 |
| - } |
92 |
| - return s; |
93 |
| - } |
94 |
| - if ( N <= BLOCKSIZE ) { |
95 |
| - // Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)... |
96 |
| - s0 = float64ToFloat32( alpha + x[ ix ] ); |
97 |
| - s1 = float64ToFloat32( alpha + x[ ix+strideX ] ); |
98 |
| - s2 = float64ToFloat32( alpha + x[ ix+(2*strideX) ] ); |
99 |
| - s3 = float64ToFloat32( alpha + x[ ix+(3*strideX) ] ); |
100 |
| - s4 = float64ToFloat32( alpha + x[ ix+(4*strideX) ] ); |
101 |
| - s5 = float64ToFloat32( alpha + x[ ix+(5*strideX) ] ); |
102 |
| - s6 = float64ToFloat32( alpha + x[ ix+(6*strideX) ] ); |
103 |
| - s7 = float64ToFloat32( alpha + x[ ix+(7*strideX) ] ); |
104 |
| - ix += 8 * strideX; |
105 |
| - |
106 |
| - M = N % 8; |
107 |
| - for ( i = 8; i < N-M; i += 8 ) { |
108 |
| - s0 = float64ToFloat32( s0 + float64ToFloat32( alpha + x[ ix ] ) ); |
109 |
| - s1 = float64ToFloat32( s1 + float64ToFloat32( alpha + x[ ix+strideX ] ) ); |
110 |
| - s2 = float64ToFloat32( s2 + float64ToFloat32( alpha + x[ ix+(2*strideX) ] ) ); |
111 |
| - s3 = float64ToFloat32( s3 + float64ToFloat32( alpha + x[ ix+(3*strideX) ] ) ); |
112 |
| - s4 = float64ToFloat32( s4 + float64ToFloat32( alpha + x[ ix+(4*strideX) ] ) ); |
113 |
| - s5 = float64ToFloat32( s5 + float64ToFloat32( alpha + x[ ix+(5*strideX) ] ) ); |
114 |
| - s6 = float64ToFloat32( s6 + float64ToFloat32( alpha + x[ ix+(6*strideX) ] ) ); |
115 |
| - s7 = float64ToFloat32( s7 + float64ToFloat32( alpha + x[ ix+(7*strideX) ] ) ); |
116 |
| - ix += 8 * strideX; |
117 |
| - } |
118 |
| - // Pairwise sum the accumulators: |
119 |
| - s = float64ToFloat32( float64ToFloat32( float64ToFloat32(s0+s1) + float64ToFloat32(s2+s3) ) + float64ToFloat32( float64ToFloat32(s4+s5) + float64ToFloat32(s6+s7) ) ); |
120 |
| - |
121 |
| - // Clean-up loop... |
122 |
| - for ( i; i < N; i++ ) { |
123 |
| - s = float64ToFloat32( s + float64ToFloat32( alpha + x[ ix ] ) ); |
124 |
| - ix += strideX; |
125 |
| - } |
126 |
| - return s; |
127 |
| - } |
128 |
| - // Recurse by dividing by two, but avoiding non-multiples of unroll factor... |
129 |
| - n = floor( N/2 ); |
130 |
| - n -= n % 8; |
131 |
| - return float64ToFloat32( sapxsumpw( n, alpha, x, strideX, ix ) + sapxsumpw( N-n, alpha, x, strideX, ix+(n*strideX) ) ); |
| 56 | + return f32( f32( N * alpha ) + ssumpw( N, x, strideX, offsetX ) ); |
132 | 57 | }
|
133 | 58 |
|
134 | 59 |
|
|
0 commit comments