Commit c0a3b12
Improve: jmp vs cmov
1 parent 8aa9921

1 file changed: less_slow.cpp (161 additions, 12 deletions)

@@ -996,6 +996,129 @@ static void branch_cost(bm::State &state) {
 
 BENCHMARK(branch_cost)->RangeMultiplier(4)->Range(256, 32 * 1024);
 
+/**
+ * It's hard to reason about whether the code above compiles into a conditional move or a jump,
+ * so let's define explicit inline-assembly kernels and compare both.
+ */
+#if defined(__GNUC__) && !defined(__clang__) //! GCC-style inline assembly; Clang and MSVC are excluded here
+
+#if defined(__x86_64__) || defined(__i386__)
+
+static void branch_cost_cmov(bm::State &state) {
+    auto const count = static_cast<std::size_t>(state.range(0));
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
+    std::int32_t variable = 0;
+    std::size_t iteration = 0;
+
+    for (auto _ : state) {
+        std::int32_t const random = random_values[(++iteration) & (count - 1)];
+        std::int32_t sum; // early-clobber temp for the LEA result
+
+        asm volatile(                            //
+            "leal (%[var],%[rnd],1), %[sum]\n\t" // sum := variable + random
+            "imull %[rnd], %[var]\n\t"           // var := variable * random
+            "testl $1, %[rnd]\n\t"               // if (random & 1) var := sum
+            "cmovne %[sum], %[var]\n\t"
+            : [var] "+r"(variable), [sum] "=&r"(sum)
+            : [rnd] "r"(random)
+            : "cc");
+        bm::DoNotOptimize(variable);
+    }
+}
+
+static void branch_cost_jump(bm::State &state) {
+    auto const count = static_cast<std::size_t>(state.range(0));
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
+    std::int32_t variable = 0;
+    std::size_t iteration = 0;
+
+    for (auto _ : state) {
+        std::int32_t const random = random_values[(++iteration) & (count - 1)];
+
+        asm volatile(                      //
+            "testl $1, %[rnd]\n\t"
+            "jnz 1f\n\t"                   // if odd -> jump to add
+            "imull %[rnd], %[var]\n\t"     // even: var *= rnd
+            "jmp 2f\n\t"
+            "1:\n\t"
+            "addl %[rnd], %[var]\n\t"      // odd: var += rnd
+            "2:\n\t"
+            : [var] "+r"(variable)
+            : [rnd] "r"(random)
+            : "cc");
+        bm::DoNotOptimize(variable);
+    }
+}
+
+BENCHMARK(branch_cost_cmov)->RangeMultiplier(4)->Range(256, 32 * 1024);
+BENCHMARK(branch_cost_jump)->RangeMultiplier(4)->Range(256, 32 * 1024);
+
+#elif defined(__aarch64__)
+
+static void branch_cost_csel(bm::State &state) {
+    auto const count = static_cast<std::size_t>(state.range(0));
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
+    std::int32_t variable = 0;
+    std::size_t iteration = 0;
+
+    for (auto _ : state) {
+        std::int32_t const random = random_values[(++iteration) & (count - 1)];
+        std::int32_t sum;
+
+        asm volatile(                               //
+            "add %w[sum], %w[var], %w[rnd]\n\t"     // sum := variable + random
+            "mul %w[var], %w[var], %w[rnd]\n\t"     // var := variable * random
+            "tst %w[rnd], #1\n\t"                   // if (random & 1) var := sum
+            "csel %w[var], %w[sum], %w[var], NE\n\t"
+            : [var] "+r"(variable), [sum] "=&r"(sum)
+            : [rnd] "r"(random)
+            : "cc");
+        bm::DoNotOptimize(variable);
+    }
+}
+
+static void branch_cost_branch(bm::State &state) {
+    auto const count = static_cast<std::size_t>(state.range(0));
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
+    std::int32_t variable = 0;
+    std::size_t iteration = 0;
+
+    for (auto _ : state) {
+        std::int32_t const random = random_values[(++iteration) & (count - 1)];
+
+        asm volatile(                           //
+            "tst %w[rnd], #1\n\t"
+            "b.ne 1f\n\t"                       // if odd -> jump to add
+            "mul %w[var], %w[var], %w[rnd]\n\t" // even: var *= rnd
+            "b 2f\n\t"
+            "1:\n\t"
+            "add %w[var], %w[var], %w[rnd]\n\t" // odd: var += rnd
+            "2:\n\t"
+            : [var] "+r"(variable)
+            : [rnd] "r"(random)
+            : "cc");
+        bm::DoNotOptimize(variable);
+    }
+}
+
+BENCHMARK(branch_cost_csel)->RangeMultiplier(4)->Range(256, 32 * 1024);
+BENCHMARK(branch_cost_branch)->RangeMultiplier(4)->Range(256, 32 * 1024);
+
+#endif
+
+#endif // __GNUC__ && !__clang__
+
+/**
+ * The results are quite interesting. On Intel:
+ * - `branch_cost` runs at @b 0.7ns per iteration up to 4K elements; beyond that it jumps to @b 3.7ns.
+ * - `branch_cost_cmov` consistently runs at @b 1.3ns, regardless of the size.
+ * - `branch_cost_jump` performs similarly to `branch_cost`, but slightly worse.
+ */
+
 #pragma endregion // Branch Prediction
 
 #pragma region Cache Misses
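
For reference, the hand-written kernels above pin the instruction selection, while plain C++ leaves it to the optimizer. Below is a minimal sketch, not part of this commit, of what a portable counterpart might look like; the `branch_cost_portable` name is made up, and whether the ternary lowers to a `cmov`/`csel` or to a conditional branch is entirely the compiler's choice:

static void branch_cost_portable(bm::State &state) {
    auto const count = static_cast<std::size_t>(state.range(0));
    aligned_array<std::int32_t> random_values(count);
    std::generate_n(random_values.begin(), count, &std::rand);
    std::int32_t variable = 0;
    std::size_t iteration = 0;

    for (auto _ : state) {
        std::int32_t const random = random_values[(++iteration) & (count - 1)];
        std::int32_t const sum = variable + random;     // same math as the asm kernels,
        std::int32_t const product = variable * random; // ignoring signed-overflow pedantry
        variable = (random & 1) ? sum : product;        // `cmov`/`csel` or a branch?
        bm::DoNotOptimize(variable);
    }
}

Comparing its timings against `branch_cost_cmov` and `branch_cost_jump` on a given target reveals which of the two strategies the compiler actually picked.
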
@@ -1069,28 +1192,54 @@ BENCHMARK(cache_misses_cost<access_order_t::random>)
  * value. This optimization is crucial for performance, especially when dealing
  * with heavy objects.
  */
-#include <optional> // `std::optional`
+struct heavy_t {
+    std::uint64_t data[8];
+
+    heavy_t() noexcept { std::iota(data, data + 8, 0); }
+
+    heavy_t(heavy_t &&) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+    heavy_t(heavy_t const &) { std::this_thread::sleep_for(std::chrono::milliseconds(2)); }
+    heavy_t &operator=(heavy_t &&) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        return *this;
+    }
+    heavy_t &operator=(heavy_t const &) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(2));
+        return *this;
+    }
+};
 
-std::optional<std::string> make_heavy_object_mutable() {
-    std::string x(1024, 'x');
+heavy_t make_heavy_object() { return heavy_t {}; }
+
+heavy_t make_named_heavy_object() {
+    heavy_t const x; //! Even with `const`, RVO is possible
     return x;
 }
 
-std::optional<std::string> make_heavy_object_immutable() {
-    std::string const x(1024, 'x'); //! `const` is the only difference
-    return x;
+heavy_t make_conditional_heavy_object() {
+    heavy_t x;
+    heavy_t &x1 = x;
+    heavy_t &x2 = x;
+    static std::size_t counter = 0; //! Condition prevents RVO
+    if (counter++ % 2 == 0) { return x1; }
+    else { return x2; }
+}
+
+static void rvo_trivial(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_heavy_object());
 }
 
-static void rvo_friendly(bm::State &state) {
-    for (auto _ : state) bm::DoNotOptimize(make_heavy_object_mutable());
+static void rvo_likely(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_named_heavy_object());
 }
 
-static void rvo_impossible(bm::State &state) {
-    for (auto _ : state) bm::DoNotOptimize(make_heavy_object_immutable());
+static void rvo_banned(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_conditional_heavy_object());
 }
 
-BENCHMARK(rvo_friendly);
-BENCHMARK(rvo_impossible);
+BENCHMARK(rvo_trivial);
+BENCHMARK(rvo_likely);
+BENCHMARK(rvo_banned);
 
 /**
  * Despite intuition, marking a local object as `const` hurts our performance.
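
As a side note, not part of this commit: a quick way to check which of these factory functions actually elides the copy is to count invocations of the special members instead of sleeping in them. The `counted_t` type and the three factories below are made-up illustrations, assuming C++17, where returning a prvalue is guaranteed to be elided:

#include <cstdio>

struct counted_t {
    static inline int copies = 0, moves = 0; // C++17 inline statics
    counted_t() = default;
    counted_t(counted_t const &) { ++copies; }
    counted_t(counted_t &&) noexcept { ++moves; }
};

counted_t by_prvalue() { return counted_t {}; } // guaranteed elision since C++17
counted_t by_name() {                           // NRVO: usually elided, otherwise moved
    counted_t x;
    return x;
}
counted_t by_reference() {                      // returning through a reference forces a copy
    counted_t x;
    counted_t &r = x;
    return r;
}

int main() {
    (void)by_prvalue();
    (void)by_name();
    (void)by_reference();
    std::printf("copies = %d, moves = %d\n", counted_t::copies, counted_t::moves);
    // Mainstream compilers typically print "copies = 1, moves = 0":
    // only the reference-returning version pays for a copy.
}

The same effect is what makes `rvo_banned` above so slow: returning through `x1`/`x2` forces the copy constructor, and with `heavy_t` that means a 2 ms sleep on every call.
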
