Skip to content

Commit 4af801e

Browse files
author
Marco Barbone
committed
added vectorized foldrescale to devel
1 parent bf3994a commit 4af801e

File tree

2 files changed

+86
-22
lines changed

2 files changed

+86
-22
lines changed

devel/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ endif()
1919

2020
add_executable(foldrescale foldrescale.cpp)
2121
target_link_libraries(foldrescale finufft benchmark)
22+
target_compile_options(foldrescale PRIVATE -mavx2)

devel/foldrescale.cpp

Lines changed: 85 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
#include <iostream>
44
#include <random>
55
#include <cmath>
6+
#include <immintrin.h>
67
// no vectorize
7-
#pragma GCC optimize("no-tree-vectorize")
8+
//#pragma GCC optimize("no-tree-vectorize")
89
/* local NU coord fold+rescale macro: does the following affine transform to x:
910
when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N)
1011
otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N)
@@ -67,7 +68,7 @@ FLT foldRescale03(FLT x, BIGINT N) {
6768
FLT fN = FLT(N);
6869
if constexpr (p) {
6970
static constexpr FLT x2pi = FLT(M_1_2PI);
70-
result = x * x2pi + FLT(0.5);
71+
result = std::fma(x, x2pi, FLT(0.5));
7172
result -= floor(result);
7273
} else {
7374
const FLT invN = FLT(1.0) / fN;
@@ -77,8 +78,22 @@ FLT foldRescale03(FLT x, BIGINT N) {
7778
return result * fN;
7879
}
7980

81+
#ifdef __AVX2__
82+
83+
inline __attribute__((always_inline))
84+
__m256d foldRescaleVec(__m256d x, BIGINT N) {
85+
__m256d result;
86+
__m256d fN = _mm256_set1_pd(FLT(N));
87+
static const __m256d x2pi = _mm256_set1_pd(FLT(M_1_2PI));
88+
static const __m256d half = _mm256_set1_pd(FLT(0.5));
89+
result = _mm256_fmadd_pd(x, x2pi, half);
90+
result = _mm256_sub_pd(result, _mm256_floor_pd(result));
91+
return _mm256_mul_pd(result, fN);
92+
}
93+
#endif
94+
8095
static std::mt19937_64 gen;
81-
static std::uniform_real_distribution<> dis(-3 * M_PI, 3 * M_PI);
96+
static std::uniform_real_distribution<> dis(-10, 10);
8297
static const auto N = std::uniform_int_distribution<>{0, 1000}(gen);
8398
static std::uniform_real_distribution<> disN(-N, 2*N);
8499
static volatile auto pirange = true;
@@ -194,26 +209,71 @@ static void BM_FoldRescale05N(benchmark::State &state) {
194209
}
195210

196211

197-
BENCHMARK(BM_BASELINE);
198-
199-
BENCHMARK(BM_FoldRescaleMacro);
200-
BENCHMARK(BM_FoldRescale00);
201-
BENCHMARK(BM_FoldRescale01);
202-
BENCHMARK(BM_FoldRescale02);
203-
BENCHMARK(BM_FoldRescale03);
204-
BENCHMARK(BM_FoldRescale04);
205-
BENCHMARK(BM_FoldRescale05);
206-
207-
BENCHMARK(BM_FoldRescaleMacroN);
208-
BENCHMARK(BM_FoldRescale00N);
209-
BENCHMARK(BM_FoldRescale01N);
210-
BENCHMARK(BM_FoldRescale02N);
211-
BENCHMARK(BM_FoldRescale03N);
212-
BENCHMARK(BM_FoldRescale04N);
213-
BENCHMARK(BM_FoldRescale05N);
212+
#ifdef __AVX2__
213+
static void BM_FoldRescaleVec(benchmark::State &state) {
214+
for (auto _: state) {
215+
// Generate 4 floating point numbers
216+
double x1 = dis(gen);
217+
double x2 = dis(gen);
218+
double x3 = dis(gen);
219+
double x4 = dis(gen);
220+
// Pack them into an AVX vector
221+
__m256d x = _mm256_set_pd(x1, x2, x3, x4);
222+
// Call the foldRescaleVec function
223+
benchmark::DoNotOptimize(foldRescaleVec(x, N));
224+
}
225+
}
226+
#endif
227+
228+
229+
BENCHMARK(BM_BASELINE)->Iterations(10000000);
230+
BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000);
231+
BENCHMARK(BM_FoldRescale00)->Iterations(1000000);
232+
BENCHMARK(BM_FoldRescale01)->Iterations(1000000);
233+
BENCHMARK(BM_FoldRescale02)->Iterations(1000000);
234+
BENCHMARK(BM_FoldRescale03)->Iterations(10000000);
235+
BENCHMARK(BM_FoldRescale04)->Iterations(1000000);
236+
BENCHMARK(BM_FoldRescale05)->Iterations(1000000);
237+
#ifdef __AVX2__
238+
BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000/4);
239+
#endif
240+
BENCHMARK(BM_FoldRescaleMacroN)->Iterations(1000000);
241+
BENCHMARK(BM_FoldRescale00N)->Iterations(1000000);
242+
BENCHMARK(BM_FoldRescale01N)->Iterations(1000000);
243+
BENCHMARK(BM_FoldRescale02N)->Iterations(1000000);
244+
BENCHMARK(BM_FoldRescale03N)->Iterations(1000000);
245+
BENCHMARK(BM_FoldRescale04N)->Iterations(1000000);
246+
BENCHMARK(BM_FoldRescale05N)->Iterations(1000000);
247+
248+
249+
#ifdef __AVX2__
250+
void testFoldRescaleVec_avx256_vs_foldRescale00() {
251+
// Generate 4 floating point numbers
252+
double x1 = dis(gen);
253+
double x2 = dis(gen);
254+
double x3 = dis(gen);
255+
double x4 = dis(gen);
256+
257+
// Pack them into an AVX vector
258+
__m256d xVec = _mm256_set_pd(x1, x2, x3, x4);
259+
260+
// Call the foldRescaleVec function
261+
__m256d resultVec = foldRescaleVec(xVec, N);
262+
263+
// Extract the results from the AVX vector
264+
265+
for (int i = 0; i < 4; ++i) {
266+
double result00 = foldRescale03<true>(xVec[i], N);
267+
if (std::abs(1 - result00 / resultVec[i]) > 1e-14) {
268+
std::cout << "input: " << xVec[i] << " result00: " << result00 << " result256: " << resultVec[i] << std::endl;
269+
throw std::runtime_error("foldRescaleVec is not equivalent to foldRescale00");
270+
}
271+
}
272+
}
273+
#endif
214274

215275
void testFoldRescaleFunctions() {
216-
for (bool p: {true, false}) {
276+
for (bool p: {true}) {
217277
for (int i = 0; i < 1024; ++i) { // Run the test 1000 times
218278
FLT x = dis(gen);
219279
FLT resultMacro = FOLDRESCALE(x, N, p);
@@ -223,6 +283,7 @@ void testFoldRescaleFunctions() {
223283
FLT result03 = p ? foldRescale03<true>(x, N) : foldRescale03<false>(x, N);
224284
FLT result04 = FOLDRESCALE04(x, N, p);
225285
FLT result05 = FOLDRESCALE05(x, N, p);
286+
226287
// function that compares two floating point number with a tolerance, using relative error
227288
auto compare = [](FLT a, FLT b) {
228289
return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13;
@@ -284,9 +345,11 @@ int main(int argc, char **argv) {
284345
static std::random_device rd;
285346
const auto seed = rd();
286347
std::cout << "Seed: " << seed << "\n";
287-
// gen.seed(226307105);
288348
gen.seed(seed);
289349
testFoldRescaleFunctions();
350+
#ifdef __AVX2__
351+
testFoldRescaleVec_avx256_vs_foldRescale00();
352+
#endif
290353
::benchmark::Initialize(&argc, argv);
291354
BaselineSubtractingReporter reporter;
292355
::benchmark::RunSpecifiedBenchmarks(&reporter);

0 commit comments

Comments
 (0)