33#include < iostream>
44#include < random>
55#include < cmath>
6+ #include < immintrin.h>
67// no vectorize
7- #pragma GCC optimize("no-tree-vectorize")
8+ // #pragma GCC optimize("no-tree-vectorize")
89/* local NU coord fold+rescale macro: does the following affine transform to x:
910 when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N)
1011 otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N)
@@ -67,7 +68,7 @@ FLT foldRescale03(FLT x, BIGINT N) {
6768 FLT fN = FLT (N);
6869 if constexpr (p) {
6970 static constexpr FLT x2pi = FLT (M_1_2PI);
70- result = x * x2pi + FLT (0.5 );
71+ result = std::fma (x, x2pi, FLT (0.5 ) );
7172 result -= floor (result);
7273 } else {
7374 const FLT invN = FLT (1.0 ) / fN ;
@@ -77,8 +78,22 @@ FLT foldRescale03(FLT x, BIGINT N) {
7778 return result * fN ;
7879}
7980
#ifdef __AVX2__

/// 4-wide AVX2 counterpart of foldRescale03<true>: maps each lane x through
/// x * (1/2pi) + 0.5, keeps the fractional part (fold into [0,1)), then
/// scales by N to land in [0,N).  Only the pirange (p == true) branch is
/// implemented; callers needing the [-N,2N) variant must use the scalars.
inline __attribute__((always_inline))
__m256d foldRescaleVec(__m256d x, BIGINT N) {
    // Plain const instead of function-local static: a local static carries a
    // thread-safe initialization guard (load + branch) on every call, while
    // these broadcasts are trivially cheap and constant-fold anyway.
    const __m256d x2pi = _mm256_set1_pd(FLT(M_1_2PI));
    const __m256d half = _mm256_set1_pd(FLT(0.5));
    const __m256d fN   = _mm256_set1_pd(FLT(N));
    __m256d result = _mm256_fmadd_pd(x, x2pi, half);          // x/(2pi) + 0.5
    result = _mm256_sub_pd(result, _mm256_floor_pd(result));  // frac -> [0,1)
    return _mm256_mul_pd(result, fN);                         // -> [0,N)
}
#endif
94+
// Shared 64-bit Mersenne Twister; seeded in main() from std::random_device.
static std::mt19937_64 gen;
// NU-coordinate samples for the pirange (p == true) benchmarks/tests.
// NOTE(review): (-10, 10) is slightly wider than the documented [-3pi, 3pi)
// fold domain (3pi ~ 9.42) -- confirm out-of-domain inputs are intended.
static std::uniform_real_distribution<> dis(-10, 10);
// Random grid size drawn once at static-init time.  NOTE(review): the
// distribution includes 0 -- confirm downstream code tolerates N == 0.
static const auto N = std::uniform_int_distribution<>{0, 1000}(gen);
// Coordinates for the non-pirange (p == false) benchmarks: spans [-N, 2N).
static std::uniform_real_distribution<> disN(-N, 2 * N);
// volatile keeps the compiler from constant-folding the flag into the
// benchmark bodies.
static volatile auto pirange = true;
@@ -194,26 +209,71 @@ static void BM_FoldRescale05N(benchmark::State &state) {
194209}
195210
196211
197- BENCHMARK (BM_BASELINE);
198-
199- BENCHMARK (BM_FoldRescaleMacro);
200- BENCHMARK (BM_FoldRescale00);
201- BENCHMARK (BM_FoldRescale01);
202- BENCHMARK (BM_FoldRescale02);
203- BENCHMARK (BM_FoldRescale03);
204- BENCHMARK (BM_FoldRescale04);
205- BENCHMARK (BM_FoldRescale05);
206-
207- BENCHMARK (BM_FoldRescaleMacroN);
208- BENCHMARK (BM_FoldRescale00N);
209- BENCHMARK (BM_FoldRescale01N);
210- BENCHMARK (BM_FoldRescale02N);
211- BENCHMARK (BM_FoldRescale03N);
212- BENCHMARK (BM_FoldRescale04N);
213- BENCHMARK (BM_FoldRescale05N);
#ifdef __AVX2__
// Benchmarks the 4-wide AVX2 fold+rescale: each iteration draws four fresh
// coordinates, packs them into one register, and folds all lanes at once.
static void BM_FoldRescaleVec(benchmark::State &state) {
    for (auto _: state) {
        // Four sequential draws (same RNG call order as the scalar cases).
        const double a = dis(gen);
        const double b = dis(gen);
        const double c = dis(gen);
        const double d = dis(gen);
        // Pack the draws into a single AVX vector.
        const __m256d packed = _mm256_set_pd(a, b, c, d);
        // DoNotOptimize keeps the result live so the fold is not elided.
        benchmark::DoNotOptimize(foldRescaleVec(packed, N));
    }
}
#endif
227+
228+
// Register all benchmarks with fixed iteration counts so the baseline
// subtraction done by the reporter in main() compares like with like.
BENCHMARK(BM_BASELINE)->Iterations(10000000);
BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000);
BENCHMARK(BM_FoldRescale00)->Iterations(1000000);
BENCHMARK(BM_FoldRescale01)->Iterations(1000000);
BENCHMARK(BM_FoldRescale02)->Iterations(1000000);
// NOTE(review): 03 runs 10x the iterations of its peers -- confirm this
// asymmetry is intentional, as it skews per-iteration comparisons.
BENCHMARK(BM_FoldRescale03)->Iterations(10000000);
BENCHMARK(BM_FoldRescale04)->Iterations(1000000);
BENCHMARK(BM_FoldRescale05)->Iterations(1000000);
#ifdef __AVX2__
// The vector kernel folds 4 coordinates per iteration, so iterate 1/4 as
// many times to keep the total folded-coordinate count comparable.
BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000 / 4);
#endif
// Non-pirange (p == false) variants.
BENCHMARK(BM_FoldRescaleMacroN)->Iterations(1000000);
BENCHMARK(BM_FoldRescale00N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale01N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale02N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale03N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale04N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale05N)->Iterations(1000000);
247+
248+
#ifdef __AVX2__
/// Cross-checks the AVX2 foldRescaleVec against the scalar reference
/// foldRescale03<true> on four random inputs; throws std::runtime_error on
/// any lane mismatch beyond a 1e-14 relative tolerance.
void testFoldRescaleVec_avx256_vs_foldRescale00() {
    // Draw four random coordinates (same RNG call order as before).
    double x1 = dis(gen);
    double x2 = dis(gen);
    double x3 = dis(gen);
    double x4 = dis(gen);

    // Pack them into an AVX vector; lane order is irrelevant because the
    // same lane is compared on both sides below.
    __m256d xVec = _mm256_set_pd(x1, x2, x3, x4);
    __m256d resultVec = foldRescaleVec(xVec, N);

    // Extract lanes with storeu rather than the non-portable xVec[i]
    // subscript (a GCC/Clang vector extension that MSVC rejects).
    double xs[4], rs[4];
    _mm256_storeu_pd(xs, xVec);
    _mm256_storeu_pd(rs, resultVec);

    for (int i = 0; i < 4; ++i) {
        double ref = foldRescale03<true>(xs[i], N);
        // Magnitude-based relative error: the previous |1 - ref/result|
        // form divides by the lane result, which can fold to exactly 0.
        double denom = std::max(std::abs(ref), std::abs(rs[i]));
        if (std::abs(ref - rs[i]) > denom * 1e-14) {
            std::cout << " input: " << xs[i] << " scalar: " << ref
                      << " result256: " << rs[i] << std::endl;
            // Message fixed: the scalar reference used here is foldRescale03.
            throw std::runtime_error("foldRescaleVec is not equivalent to foldRescale03");
        }
    }
}
#endif
214274
215275void testFoldRescaleFunctions () {
216- for (bool p: {true , false }) {
276+ for (bool p: {true }) {
217277 for (int i = 0 ; i < 1024 ; ++i) { // Run the test 1000 times
218278 FLT x = dis (gen);
219279 FLT resultMacro = FOLDRESCALE (x, N, p);
@@ -223,6 +283,7 @@ void testFoldRescaleFunctions() {
223283 FLT result03 = p ? foldRescale03<true >(x, N) : foldRescale03<false >(x, N);
224284 FLT result04 = FOLDRESCALE04 (x, N, p);
225285 FLT result05 = FOLDRESCALE05 (x, N, p);
286+
226287 // function that compares two floating point number with a tolerance, using relative error
227288 auto compare = [](FLT a, FLT b) {
228289 return std::abs (a - b) > std::max (std::abs (a), std::abs (b)) * 10e-13 ;
@@ -284,9 +345,11 @@ int main(int argc, char **argv) {
284345 static std::random_device rd;
285346 const auto seed = rd ();
286347 std::cout << " Seed: " << seed << " \n " ;
287- // gen.seed(226307105);
288348 gen.seed (seed);
289349 testFoldRescaleFunctions ();
350+ #ifdef __AVX2__
351+ testFoldRescaleVec_avx256_vs_foldRescale00 ();
352+ #endif
290353 ::benchmark::Initialize (&argc, argv);
291354 BaselineSubtractingReporter reporter;
292355 ::benchmark::RunSpecifiedBenchmarks (&reporter);
0 commit comments