Skip to content

Commit e810d81

Browse files
authored
[Microbenchmarks] Add benchmark for conditional scalar assignment autovec (#295)
Benchmarks with vs. without autovec for loops containing: * Just a single conditional scalar assignment * Multiple conditional assignments * A single conditional assignment, with extra arithmetic work
1 parent 6241e63 commit e810d81

File tree

2 files changed

+223
-0
lines changed

2 files changed

+223
-0
lines changed

MicroBenchmarks/LoopVectorization/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ endif()
1010
llvm_test_run()
1111

1212
llvm_test_executable(LoopVectorizationBenchmarks
13+
ConditionalScalarAssignment.cpp
1314
main.cpp
1415
MathFunctions.cpp
1516
RuntimeChecks.cpp
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
#include <iostream>
2+
#include <memory>
3+
#include <random>
4+
5+
#include "benchmark/benchmark.h"
6+
7+
// Number of elements allocated for each of the A/B/C arrays and the trip
// count of every loop in this file.
#define ITERATIONS 100000
8+
9+
// Pointer to a benchmark kernel: takes three T arrays followed by a threshold
// of type T and returns a single T.
template <typename T>
10+
using CSAFunc = T (*)(T *, T *, T *, T);
11+
12+
// Find the last element in A above the given threshold,
13+
// with default loop vectorization settings.
14+
template <typename T>
15+
static T run_single_csa_only_autovec(T *A, T *B, T *C, T Threshold) {
16+
// Pick a default value that's out of range of the uniform distribution
17+
// created for 'A' in init_data below.
18+
T Result = 101;
19+
for (unsigned i = 0; i < ITERATIONS; i++)
20+
if (A[i] > Threshold)
21+
Result = A[i];
22+
23+
return Result;
24+
}
25+
26+
// Find the last element in A above the given threshold,
27+
// with loop vectorization disabled.
28+
template <typename T>
29+
static T run_single_csa_only_novec(T *A, T *B, T *C, T Threshold) {
30+
// Pick a default value that's out of range of the uniform distribution
31+
// created for 'A' in init_data below.
32+
T Result = 101;
33+
#pragma clang loop vectorize(disable) interleave(disable)
34+
for (unsigned i = 0; i < ITERATIONS; i++)
35+
if (A[i] > Threshold)
36+
Result = A[i];
37+
38+
return Result;
39+
}
40+
41+
// Find the last elements in A, B, and C above the given threshold,
42+
// with default loop vectorization settings.
43+
template <typename T>
44+
static T run_multi_csa_only_autovec(T *A, T *B, T *C, T Threshold) {
45+
// Pick a default value that's out of range of the uniform distribution
46+
// created for 'A', 'B', and 'C' in init_data below.
47+
T ResultA = 101;
48+
T ResultB = 101;
49+
T ResultC = 101;
50+
for (unsigned i = 0; i < ITERATIONS; i++) {
51+
if (A[i] > Threshold)
52+
ResultA = A[i];
53+
if (B[i] > Threshold)
54+
ResultB = B[i];
55+
if (C[i] > Threshold)
56+
ResultC = C[i];
57+
}
58+
59+
return ResultA ^ ResultB ^ ResultC;
60+
}
61+
62+
// Find the last elements in A, B, and C above the given threshold,
63+
// with loop vectorization disabled.
64+
template <typename T>
65+
static T run_multi_csa_only_novec(T *A, T *B, T *C, T Threshold) {
66+
// Pick a default value that's out of range of the uniform distribution
67+
// created for 'A', 'B', and 'C' in init_data below.
68+
T ResultA = 101;
69+
T ResultB = 101;
70+
T ResultC = 101;
71+
#pragma clang loop vectorize(disable) interleave(disable)
72+
for (unsigned i = 0; i < ITERATIONS; i++) {
73+
if (A[i] > Threshold)
74+
ResultA = A[i];
75+
if (B[i] > Threshold)
76+
ResultB = B[i];
77+
if (C[i] > Threshold)
78+
ResultC = C[i];
79+
}
80+
81+
return ResultA ^ ResultB ^ ResultC;
82+
}
83+
84+
// Find the last element in A above the given threshold,
85+
// with default loop vectorization settings.
86+
template <typename T>
87+
static T run_csa_with_arith_autovec(T *A, T *B, T *C, T Threshold) {
88+
// Pick a default value that's out of range of the uniform distribution
89+
// created for 'A' in init_data below.
90+
T Result = 101;
91+
for (unsigned i = 0; i < ITERATIONS; i++) {
92+
// Do some work to make the difference noticeable
93+
C[i] = A[i] * 13 + B[i] * 5;
94+
if (A[i] > Threshold)
95+
Result = A[i];
96+
}
97+
98+
return Result;
99+
}
100+
101+
// Find the last element in A above the given threshold,
102+
// with loop vectorization disabled.
103+
template <typename T>
104+
static T run_csa_with_arith_novec(T *A, T *B, T* C, T Threshold) {
105+
// Pick a default value that's out of range of the uniform distribution
106+
// created for 'A' in init_data below.
107+
T Result = 101;
108+
#pragma clang loop vectorize(disable) interleave(disable)
109+
for (unsigned i = 0; i < ITERATIONS; i++) {
110+
// Do some work to make the difference noticeable
111+
C[i] = A[i] * 13 + B[i] * 5;
112+
if (A[i] > Threshold)
113+
Result = A[i];
114+
}
115+
116+
return Result;
117+
}
118+
119+
// Initialize arrays A, B, and C with random numbers
120+
template <typename T> static void init_data(T *A, T* B, T *C) {
121+
std::uniform_int_distribution<T> dist(0, 100);
122+
std::mt19937 rng(12345);
123+
for (unsigned i = 0; i < ITERATIONS; i++) {
124+
A[i] = dist(rng);
125+
B[i] = dist(rng);
126+
C[i] = dist(rng);
127+
}
128+
}
129+
130+
// Benchmark auto-vectorized version.
131+
template <typename T>
132+
static void __attribute__((always_inline))
133+
benchmark_csa_autovec(benchmark::State &state, CSAFunc<T> VecFn,
134+
CSAFunc<T> NoVecFn, T Threshold) {
135+
std::unique_ptr<T[]> A(new T[ITERATIONS]);
136+
std::unique_ptr<T[]> B(new T[ITERATIONS]);
137+
std::unique_ptr<T[]> C(new T[ITERATIONS]);
138+
init_data(&A[0], &B[0], &C[0]);
139+
140+
#ifdef BENCH_AND_VERIFY
141+
// Verify the vectorized and un-vectorized versions produce the same results.
142+
{
143+
T VecRes = VecFn(&A[0], &B[0], &C[0], Threshold);
144+
T NoVecRes = NoVecFn(&A[0], &B[0], &C[0], Threshold);
145+
// We're only interested in whether the conditional assignment results
146+
// were the same.
147+
if (VecRes != NoVecRes) {
148+
std::cerr << "ERROR: autovec result different to scalar result; "
149+
<< VecRes << " != " << NoVecRes << "\n";
150+
exit(1);
151+
}
152+
}
153+
#endif
154+
155+
for (auto _ : state) {
156+
VecFn(&A[0], &B[0], &C[0], Threshold);
157+
benchmark::DoNotOptimize(A);
158+
benchmark::DoNotOptimize(B);
159+
benchmark::DoNotOptimize(C);
160+
benchmark::ClobberMemory();
161+
}
162+
}
163+
164+
// Benchmark version with vectorization disabled.
165+
template <typename T>
166+
static void __attribute__((always_inline))
167+
benchmark_csa_novec(benchmark::State &state, CSAFunc<T> NoVecFn, T Threshold) {
168+
std::unique_ptr<T[]> A(new T[ITERATIONS]);
169+
std::unique_ptr<T[]> B(new T[ITERATIONS]);
170+
std::unique_ptr<T[]> C(new T[ITERATIONS]);
171+
init_data(&A[0], &B[0], &C[0]);
172+
173+
for (auto _ : state) {
174+
NoVecFn(&A[0], &B[0], &C[0], Threshold);
175+
benchmark::DoNotOptimize(A);
176+
benchmark::DoNotOptimize(B);
177+
benchmark::DoNotOptimize(C);
178+
}
179+
}
180+
181+
// Add benchmarks with and without auto-vectorization
182+
#define ADD_BENCHMARK(ty, Threshold) \
183+
void BENCHMARK_single_csa_only_autovec_##ty##_(benchmark::State &state) { \
184+
benchmark_csa_autovec<ty>(state, run_single_csa_only_autovec, \
185+
run_single_csa_only_novec, Threshold); \
186+
} \
187+
BENCHMARK(BENCHMARK_single_csa_only_autovec_##ty##_)->Unit( \
188+
benchmark::kNanosecond); \
189+
\
190+
void BENCHMARK_single_csa_only_novec_##ty##_(benchmark::State &state) { \
191+
benchmark_csa_novec<ty>(state, run_single_csa_only_novec, Threshold); \
192+
} \
193+
BENCHMARK(BENCHMARK_single_csa_only_novec_##ty##_)->Unit( \
194+
benchmark::kNanosecond); \
195+
void BENCHMARK_multi_csa_only_autovec_##ty##_(benchmark::State &state) { \
196+
benchmark_csa_autovec<ty>(state, run_multi_csa_only_autovec, \
197+
run_multi_csa_only_novec, Threshold); \
198+
} \
199+
BENCHMARK(BENCHMARK_multi_csa_only_autovec_##ty##_)->Unit( \
200+
benchmark::kNanosecond); \
201+
\
202+
void BENCHMARK_multi_csa_only_novec_##ty##_(benchmark::State &state) { \
203+
benchmark_csa_novec<ty>(state, run_multi_csa_only_novec, Threshold); \
204+
} \
205+
BENCHMARK(BENCHMARK_multi_csa_only_novec_##ty##_)->Unit( \
206+
benchmark::kNanosecond); \
207+
void BENCHMARK_csa_with_arith_autovec_##ty##_(benchmark::State &state) { \
208+
benchmark_csa_autovec<ty>(state, run_csa_with_arith_autovec, \
209+
run_csa_with_arith_novec, Threshold); \
210+
} \
211+
BENCHMARK(BENCHMARK_csa_with_arith_autovec_##ty##_)->Unit( \
212+
benchmark::kNanosecond); \
213+
\
214+
void BENCHMARK_csa_with_arith_novec_##ty##_(benchmark::State &state) { \
215+
benchmark_csa_novec<ty>(state, run_csa_with_arith_novec, Threshold); \
216+
} \
217+
BENCHMARK(BENCHMARK_csa_with_arith_novec_##ty##_)->Unit( \
218+
benchmark::kNanosecond);
219+
220+
ADD_BENCHMARK(int32_t, 75)
221+
ADD_BENCHMARK(uint8_t, 90)
222+
ADD_BENCHMARK(int64_t, 60)

0 commit comments

Comments
 (0)