@@ -996,6 +996,129 @@ static void branch_cost(bm::State &state) {
996996
997997BENCHMARK (branch_cost)->RangeMultiplier(4 )->Range(256 , 32 * 1024 );
998998
999+ /* *
1000+ * It's hard to predict whether the above code compiles into a conditional move or a jump,
1001+ * so let's define explicit inline assembly kernels for each variant and compare them.
1002+ */
1003+ #if defined(__GNUC__) && !defined(__clang__) // ! GNU-style inline asm below: this guard admits GCC only, keeping both Clang and MSVC out
1004+
1005+ #if defined(__x86_64__) || defined(__i386__)
1006+
// Branch-free x86 kernel: every iteration computes both `variable + random` and
// `variable * random`, then keeps the sum via CMOVNE when the lowest bit of
// `random` is set, and the product otherwise. No jump instruction is issued.
// The input is `state.range(0)` values produced by `std::rand` before the timed loop.
1007+ static void branch_cost_cmov (bm::State &state) {
1008+ auto const count = static_cast <std::size_t >(state.range (0 ));
1009+ aligned_array<std::int32_t > random_values (count);
1010+ std::generate_n (random_values.begin (), count, &std::rand);
1011+ std::int32_t variable = 0 ;
1012+ std::size_t iteration = 0 ;
1013+
1014+ for (auto _ : state) {
// `iteration` is pre-incremented, so the first element read is index 1.
// Masking with `count - 1` wraps correctly only when `count` is a power of two
// NOTE(review): assumed from the registered sizes — confirm.
1015+ std::int32_t const random = random_values[(++iteration) & (count - 1 )];
1016+ std::int32_t sum; // early-clobber temp for LEA result
1017+
// Operands: `var` is read-write in a register, `sum` is a write-only early-clobber
// register (written by LEA before `var` and `rnd` are last read), `rnd` is read-only.
// "cc" is listed because TESTL (and IMULL) modify the flags register.
1018+ asm volatile ( //
1019+ " leal (%[var],%[rnd],1), %[sum]\n\t " // sum := variable + random
1020+ " imull %[rnd], %[var]\n\t " // var := variable * random
1021+ " testl $1, %[rnd]\n\t " // if (random & 1) var := sum
1022+ " cmovne %[sum], %[var]\n\t "
1023+ : [var] " +r" (variable), [sum] " =&r" (sum)
1024+ : [rnd] " r" (random)
1025+ : " cc" );
// Presumably keeps the accumulated result observable so the loop is not discarded.
1026+ bm::DoNotOptimize (variable);
1027+ }
1028+ }
1029+
// Branching x86 kernel: tests the lowest bit of `random` and jumps, so only one
// of the two operations runs per iteration: `variable += random` for odd
// values, `variable *= random` for even ones.
// The input is `state.range(0)` values produced by `std::rand` before the timed loop.
1030+ static void branch_cost_jump (bm::State &state) {
1031+ auto const count = static_cast <std::size_t >(state.range (0 ));
1032+ aligned_array<std::int32_t > random_values (count);
1033+ std::generate_n (random_values.begin (), count, &std::rand);
1034+ std::int32_t variable = 0 ;
1035+ std::size_t iteration = 0 ;
1036+
1037+ for (auto _ : state) {
// `iteration` is pre-incremented, so the first element read is index 1.
// Masking with `count - 1` wraps correctly only when `count` is a power of two
// NOTE(review): assumed from the registered sizes — confirm.
1038+ std::int32_t const random = random_values[(++iteration) & (count - 1 )];
1039+
// `1:` and `2:` are numeric local labels; `1f` / `2f` refer to the next such label forward.
// `var` is a read-write register operand, `rnd` a read-only one; "cc" covers the flag writes.
1040+ asm volatile ( //
1041+ " testl $1, %[rnd]\n\t " // flags := random & 1
1042+ " jnz 1f\n\t " // if odd -> jump to add
1043+ " imull %[rnd], %[var]\n\t " // even: var *= rnd
1044+ " jmp 2f\n\t " // skip the odd path
1045+ " 1:\n\t "
1046+ " addl %[rnd], %[var]\n\t " // odd: var += rnd
1047+ " 2:\n\t "
1048+ : [var] " +r" (variable)
1049+ : [rnd] " r" (random)
1050+ : " cc" );
// Presumably keeps the accumulated result observable so the loop is not discarded.
1051+ bm::DoNotOptimize (variable);
1052+ }
1053+ }
1054+
// Register both x86 kernels with the same argument sweep, so their timings are comparable.
// NOTE(review): with a multiplier of 4 the sweep presumably visits 256, 1K, 4K, 16K and 32K
// elements, all powers of two as the index masking in the kernels requires — confirm.
1055+ BENCHMARK (branch_cost_cmov)->RangeMultiplier(4 )->Range(256 , 32 * 1024 );
1056+ BENCHMARK (branch_cost_jump)->RangeMultiplier(4 )->Range(256 , 32 * 1024 );
1057+
1058+ #elif defined(__aarch64__)
1059+
// Branch-free AArch64 kernel: every iteration computes both `variable + random`
// and `variable * random`, then CSEL keeps the sum when the lowest bit of
// `random` is set (condition NE), and the product otherwise.
// The input is `state.range(0)` values produced by `std::rand` before the timed loop.
1060+ static void branch_cost_csel (bm::State &state) {
1061+ auto const count = static_cast <std::size_t >(state.range (0 ));
1062+ aligned_array<std::int32_t > random_values (count);
1063+ std::generate_n (random_values.begin (), count, &std::rand);
1064+ std::int32_t variable = 0 ;
1065+ std::size_t iteration = 0 ;
1066+
1067+ for (auto _ : state) {
// `iteration` is pre-incremented, so the first element read is index 1.
// Masking with `count - 1` wraps correctly only when `count` is a power of two
// NOTE(review): assumed from the registered sizes — confirm.
1068+ std::int32_t const random = random_values[(++iteration) & (count - 1 )];
1069+ std::int32_t sum; // write-only scratch register holding the sum candidate
1070+
// The `%w` operand modifier selects the 32-bit (W) view of each register.
// `sum` is early-clobber because ADD writes it before `var` and `rnd` are last read.
// "cc" is listed because TST sets the condition flags.
1071+ asm volatile ( //
1072+ " add %w[sum], %w[var], %w[rnd]\n\t " // sum := variable + random
1073+ " mul %w[var], %w[var], %w[rnd]\n\t " // var := variable * random
1074+ " tst %w[rnd], #1\n\t " // if (random & 1) var := sum
1075+ " csel %w[var], %w[sum], %w[var], NE\n\t "
1076+ : [var] " +r" (variable), [sum] " =&r" (sum)
1077+ : [rnd] " r" (random)
1078+ : " cc" );
// Presumably keeps the accumulated result observable so the loop is not discarded.
1079+ bm::DoNotOptimize (variable);
1080+ }
1081+ }
1082+
// Branching AArch64 kernel: tests the lowest bit of `random` and branches, so
// only one of the two operations runs per iteration: `variable += random` for
// odd values, `variable *= random` for even ones.
// The input is `state.range(0)` values produced by `std::rand` before the timed loop.
1083+ static void branch_cost_branch (bm::State &state) {
1084+ auto const count = static_cast <std::size_t >(state.range (0 ));
1085+ aligned_array<std::int32_t > random_values (count);
1086+ std::generate_n (random_values.begin (), count, &std::rand);
1087+ std::int32_t variable = 0 ;
1088+ std::size_t iteration = 0 ;
1089+
1090+ for (auto _ : state) {
// `iteration` is pre-incremented, so the first element read is index 1.
// Masking with `count - 1` wraps correctly only when `count` is a power of two
// NOTE(review): assumed from the registered sizes — confirm.
1091+ std::int32_t const random = random_values[(++iteration) & (count - 1 )];
1092+
// `1:` and `2:` are numeric local labels; `1f` / `2f` refer to the next such label forward.
// The `%w` operand modifier selects the 32-bit (W) view of each register.
1093+ asm volatile ( //
1094+ " tst %w[rnd], #1\n\t " // flags := random & 1
1095+ " b.ne 1f\n\t " // if odd -> jump to add
1096+ " mul %w[var], %w[var], %w[rnd]\n\t " // even: var *= rnd
1097+ " b 2f\n\t " // skip the odd path
1098+ " 1:\n\t "
1099+ " add %w[var], %w[var], %w[rnd]\n\t " // odd: var += rnd
1100+ " 2:\n\t "
1101+ : [var] " +r" (variable)
1102+ : [rnd] " r" (random)
1103+ : " cc" );
// Presumably keeps the accumulated result observable so the loop is not discarded.
1104+ bm::DoNotOptimize (variable);
1105+ }
1106+ }
1107+
// Register both AArch64 kernels with the same argument sweep, so their timings are comparable.
// NOTE(review): with a multiplier of 4 the sweep presumably visits 256, 1K, 4K, 16K and 32K
// elements, all powers of two as the index masking in the kernels requires — confirm.
1108+ BENCHMARK (branch_cost_csel)->RangeMultiplier(4 )->Range(256 , 32 * 1024 );
1109+ BENCHMARK (branch_cost_branch)->RangeMultiplier(4 )->Range(256 , 32 * 1024 );
1110+
1111+ #endif
1112+
1113+ #endif // __GNUC__ && !__clang__
1114+
1115+ /* *
1116+ * Results are quite interesting. On Intel:
1117+ * - `branch_cost` up to 4K runs at @b 0.7ns, beyond that it jumps to @b 3.7ns.
1118+ * - `branch_cost_cmov` consistently runs at @b 1.3ns, regardless of the size.
1119+ * - `branch_cost_jump` has similar, but slightly worse performance than `branch_cost`.
1120+ */
1121+
9991122#pragma endregion // Branch Prediction
10001123
10011124#pragma region Cache Misses
@@ -1069,28 +1192,54 @@ BENCHMARK(cache_misses_cost<access_order_t::random>)
10691192 * value. This optimization is crucial for performance, especially when dealing
10701193 * with heavy objects.
10711194 */
1072- #include < optional> // `std::optional`
/**
 *  @brief A 64-byte payload whose copies and moves are made artificially expensive.
 *
 *  Default construction fills `data` with 0..7. Every copy sleeps for 2 ms and every
 *  move for 1 ms, so a benchmark reveals at a glance whether the compiler elided the
 *  operation, fell back to a move, or had to copy.
 *
 *  The payload is really transferred after the delay: a copied or moved-to object
 *  holds the source's values rather than indeterminate (constructors) or stale
 *  (assignments) contents. The extra 64-byte copy is negligible next to the sleeps.
 */
struct heavy_t {
    std::uint64_t data[8];

    heavy_t() noexcept { std::iota(data, data + 8, 0); }

    /** @brief Move-construct: 1 ms penalty, then take over the payload. */
    heavy_t(heavy_t &&other) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        transfer_from(other);
    }

    /** @brief Copy-construct: 2 ms penalty, then duplicate the payload. */
    heavy_t(heavy_t const &other) {
        std::this_thread::sleep_for(std::chrono::milliseconds(2));
        transfer_from(other);
    }

    /** @brief Move-assign: 1 ms penalty, then take over the payload. */
    heavy_t &operator=(heavy_t &&other) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        transfer_from(other);
        return *this;
    }

    /** @brief Copy-assign: 2 ms penalty, then duplicate the payload. */
    heavy_t &operator=(heavy_t const &other) {
        std::this_thread::sleep_for(std::chrono::milliseconds(2));
        transfer_from(other);
        return *this;
    }

  private:
    // Element-wise loop rather than `std::copy`, so self-assignment stays well-defined.
    void transfer_from(heavy_t const &other) noexcept {
        for (std::size_t i = 0; i != 8; ++i) data[i] = other.data[i];
    }
};
10731211
1074- std::optional<std::string> make_heavy_object_mutable () {
1075- std::string x (1024 , ' x' );
1212+ heavy_t make_heavy_object () { return heavy_t {}; }
1213+
1214+ heavy_t make_named_heavy_object () {
1215+ heavy_t const x; // ! Even with `const`, RVO is possible
10761216 return x;
10771217}
10781218
1079- std::optional<std::string> make_heavy_object_immutable () {
1080- std::string const x (1024 , ' x' ); // ! `const` is the only difference
1081- return x;
1219+ heavy_t make_conditional_heavy_object () {
1220+ heavy_t x;
1221+ heavy_t &x1 = x;
1222+ heavy_t &x2 = x;
1223+ static std::size_t counter = 0 ; // ! Condition prevents RVO
1224+ if (counter++ % 2 == 0 ) { return x1; }
1225+ else { return x2; }
1226+ }
1227+
1228+ static void rvo_trivial (bm::State &state) {
1229+ for (auto _ : state) bm::DoNotOptimize (make_heavy_object ());
10821230}
10831231
1084- static void rvo_friendly (bm::State &state) {
1085- for (auto _ : state) bm::DoNotOptimize (make_heavy_object_mutable ());
1232+ static void rvo_likely (bm::State &state) {
1233+ for (auto _ : state) bm::DoNotOptimize (make_named_heavy_object ());
10861234}
10871235
1088- static void rvo_impossible (bm::State &state) {
1089- for (auto _ : state) bm::DoNotOptimize (make_heavy_object_immutable ());
1236+ static void rvo_banned (bm::State &state) {
1237+ for (auto _ : state) bm::DoNotOptimize (make_conditional_heavy_object ());
10901238}
10911239
1092- BENCHMARK (rvo_friendly);
1093- BENCHMARK (rvo_impossible);
// Register the three return-value benchmarks, ordered from the plain temporary
// (`rvo_trivial`) through the named local (`rvo_likely`) to the reference return
// that always copies (`rvo_banned`). No argument sweep is configured.
1240+ BENCHMARK (rvo_trivial);
1241+ BENCHMARK (rvo_likely);
1242+ BENCHMARK (rvo_banned);
10941243
10951244/* *
10961245 * Despite intuition, marking a local object as `const` hurts our performance.
0 commit comments