|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +#include <stdexcept> |
| 19 | +#include <vector> |
| 20 | + |
| 21 | +#include <benchmark/benchmark.h> |
| 22 | + |
| 23 | +#include "arrow/testing/util.h" |
| 24 | +#include "arrow/util/bpacking_internal.h" |
| 25 | + |
| 26 | +#if defined(ARROW_HAVE_RUNTIME_AVX2) |
| 27 | +# include "arrow/util/bpacking_avx2_internal.h" |
| 28 | +# include "arrow/util/cpu_info.h" |
| 29 | +#endif |
| 30 | +#if defined(ARROW_HAVE_RUNTIME_AVX512) |
| 31 | +# include "arrow/util/bpacking_avx512_internal.h" |
| 32 | +#endif |
| 33 | +#if defined(ARROW_HAVE_NEON) |
| 34 | +# include "arrow/util/bpacking_neon_internal.h" |
| 35 | +#endif |
| 36 | + |
| 37 | +namespace arrow::internal { |
| 38 | +namespace { |
| 39 | + |
/// Function pointer type shared by the unpacking kernels benchmarked in this file.
///
/// As invoked by BM_Unpack, the arguments are, in order: a pointer to the packed
/// input bytes, a pointer to the output buffer of `Int` values, the number of
/// values and the bit width.
/// NOTE(review): the meaning of the returned int is not visible from this file;
/// the benchmark ignores it.
template <typename Int>
using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
| 42 | + |
/// Get the number of bytes associated with a packing.
///
/// \param num_values number of packed values, must not be negative
/// \param bit_width number of bits used by each packed value, must not be negative
/// \return the size in bytes of `num_values` values packed on `bit_width` bits each
/// \throws std::invalid_argument if an argument is negative, or if the total
///   number of bits is not a multiple of 8
/// \throws std::overflow_error if the number of bytes does not fit in an int32_t
constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
  if (num_values < 0 || bit_width < 0) {
    throw std::invalid_argument("Must pack a non-negative number of bits.");
  }
  // Multiply in 64 bits: the product of two int32_t always fits in an int64_t,
  // whereas overflowing a 32-bit signed multiplication is undefined behavior.
  const int64_t num_bits = static_cast<int64_t>(num_values) * bit_width;
  if (num_bits % 8 != 0) {
    throw std::invalid_argument("Must pack a multiple of 8 bits.");
  }
  const int64_t num_bytes = num_bits / 8;
  // Round-trip through the return type to detect a byte count that is too large.
  const auto narrowed = static_cast<int32_t>(num_bytes);
  if (narrowed != num_bytes) {
    throw std::overflow_error("Number of packed bytes does not fit in 32 bits.");
  }
  return narrowed;
}
| 51 | + |
| 52 | +/// Generate random bytes as packed integers. |
| 53 | +std::vector<uint8_t> GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { |
| 54 | + constexpr uint32_t kSeed = 3214; |
| 55 | + const auto num_bytes = GetNumBytes(num_values, bit_width); |
| 56 | + |
| 57 | + std::vector<uint8_t> out(num_bytes); |
| 58 | + random_bytes(num_bytes, kSeed, out.data()); |
| 59 | + |
| 60 | + return out; |
| 61 | +} |
| 62 | + |
/// Return the first address at or after `ptr` that is a multiple of `alignment`.
///
/// `ptr` itself is returned when it is already a multiple of `alignment`.
/// `alignment` is used as a divisor, so it must not be zero.
const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) {
  // Number of bytes by which `ptr` overshoots the previous aligned address.
  const auto misalignment = reinterpret_cast<std::uintptr_t>(ptr) % alignment;
  return (misalignment == 0) ? ptr : ptr + (alignment - misalignment);
}
| 75 | + |
| 76 | +template <typename Int> |
| 77 | +void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bool skip, |
| 78 | + std::string skip_msg) { |
| 79 | + if (skip) { |
| 80 | + state.SkipWithMessage(skip_msg); |
| 81 | + } |
| 82 | + |
| 83 | + const auto bit_width = static_cast<int32_t>(state.range(0)); |
| 84 | + const auto num_values = static_cast<int32_t>(state.range(1)); |
| 85 | + |
| 86 | + // Assume std::vector allocation is likely be aligned for greater than a byte. |
| 87 | + // So we allocate more values than necessary and skip to the next byte with the |
| 88 | + // desired (non) alignment to test the proper condition. |
| 89 | + constexpr int32_t kExtraValues = sizeof(Int) * 8; |
| 90 | + const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); |
| 91 | + const uint8_t* packed_ptr = |
| 92 | + GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1); |
| 93 | + |
| 94 | + std::vector<Int> unpacked(num_values, 0); |
| 95 | + |
| 96 | + for (auto _ : state) { |
| 97 | + unpack(packed_ptr, unpacked.data(), num_values, bit_width); |
| 98 | + benchmark::ClobberMemory(); |
| 99 | + } |
| 100 | + state.SetItemsProcessed(num_values * state.iterations()); |
| 101 | +} |
| 102 | + |
| 103 | +constexpr int32_t kMinRange = 64; |
| 104 | +constexpr int32_t kMaxRange = 32768; |
| 105 | +constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20}; |
| 106 | +constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47}; |
| 107 | +static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = { |
| 108 | + kBitWidths32, |
| 109 | + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), |
| 110 | +}; |
| 111 | +static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = { |
| 112 | + kBitWidths64, |
| 113 | + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), |
| 114 | +}; |
| 115 | + |
| 116 | +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. |
| 117 | +void BM_UnpackUint32(benchmark::State& state, bool aligned, UnpackFunc<uint32_t> unpack, |
| 118 | + bool skip = false, std::string skip_msg = "") { |
| 119 | + return BM_Unpack<uint32_t>(state, aligned, unpack, skip, std::move(skip_msg)); |
| 120 | +} |
| 121 | +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. |
| 122 | +void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t> unpack, |
| 123 | + bool skip = false, std::string skip_msg = "") { |
| 124 | + return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg)); |
| 125 | +} |
| 126 | + |
// Scalar kernels, registered unconditionally. Both cases pass `aligned` as false
// and leave `skip` to its default, and run over the product of the bit widths
// and numbers of values.
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar)
    ->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar)
    ->ArgsProduct(kBitWidthsNumValues64);
| 131 | + |
#if defined(ARROW_HAVE_RUNTIME_AVX2)
// AVX2 kernel, only compiled in when ARROW_HAVE_RUNTIME_AVX2 is defined. The
// `skip` argument is true when CpuInfo reports that AVX2 is not supported, in
// which case the benchmark is skipped with the given message.
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2,
                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
                  "Avx2 not available")
    ->ArgsProduct(kBitWidthsNumValues32);
#endif
| 138 | + |
#if defined(ARROW_HAVE_RUNTIME_AVX512)
// AVX512 kernel, only compiled in when ARROW_HAVE_RUNTIME_AVX512 is defined. The
// `skip` argument is true when CpuInfo reports that AVX512 is not supported, in
// which case the benchmark is skipped with the given message.
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, unpack32_avx512,
                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
                  "Avx512 not available")
    ->ArgsProduct(kBitWidthsNumValues32);
#endif
| 145 | + |
#if defined(ARROW_HAVE_NEON)
// NEON kernel, only compiled in when ARROW_HAVE_NEON is defined. Unlike the AVX
// cases, no `skip` argument is passed, so it keeps its default of false.
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, unpack32_neon)
    ->ArgsProduct(kBitWidthsNumValues32);
#endif
| 150 | + |
// `unpack32` and `unpack64` entry points, each measured with an aligned and an
// unaligned input.
// NOTE(review): presumably these select an implementation at runtime, as the
// "Dynamic" case names suggest -- confirm in arrow/util/bpacking_internal.h.
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, unpack32)
    ->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, unpack32)
    ->ArgsProduct(kBitWidthsNumValues32);

BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, unpack64)
    ->ArgsProduct(kBitWidthsNumValues64);
BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, unpack64)
    ->ArgsProduct(kBitWidthsNumValues64);
| 160 | + |
| 161 | +} // namespace |
| 162 | +} // namespace arrow::internal |
0 commit comments