Skip to content

Commit 931acd8

Browse files
AntoinePrv and pitrou authored
GH-47514: [C++][Parquet] Add unpack tests and benchmarks (#47515)
### Rationale for this change

Tests and benchmarks make it easier to iterate on and compare improvements.

### What changes are included in this PR?

- New tests
- New benchmarks
- Uniform API between the ``unpackDD_XXX`` functions, for genericity in tests/benchmarks. This also results in a safer API, suggesting that the data may not be aligned.

### Are these changes tested?

Yes, very much.

### Are there any user-facing changes?

No.

* GitHub Issue: #47514
* GitHub Issue: #39594

Lead-authored-by: AntoinePrv <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 0648128 commit 931acd8

13 files changed

+480
-66
lines changed

cpp/src/arrow/util/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ add_arrow_test(bit-utility-test
9898
SOURCES
9999
bit_block_counter_test.cc
100100
bit_util_test.cc
101+
bpacking_test.cc
101102
rle_encoding_test.cc)
102103

103104
add_arrow_test(threading-utility-test
@@ -117,6 +118,7 @@ add_arrow_test(crc32-test
117118

118119
add_arrow_benchmark(bit_block_counter_benchmark)
119120
add_arrow_benchmark(bit_util_benchmark)
121+
add_arrow_benchmark(bpacking_benchmark)
120122
add_arrow_benchmark(bitmap_reader_benchmark)
121123
add_arrow_benchmark(cache_benchmark)
122124
add_arrow_benchmark(compression_benchmark)

cpp/src/arrow/util/bit_stream_utils_internal.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "arrow/util/bit_util.h"
2727
#include "arrow/util/bpacking_internal.h"
28+
#include "arrow/util/endian.h"
2829
#include "arrow/util/logging.h"
2930
#include "arrow/util/macros.h"
3031
#include "arrow/util/ubsan.h"
@@ -339,8 +340,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
339340

340341
if (sizeof(T) == 4) {
341342
int num_unpacked =
342-
internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
343-
reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
343+
internal::unpack32(buffer + byte_offset, reinterpret_cast<uint32_t*>(v + i),
344+
batch_size - i, num_bits);
344345
i += num_unpacked;
345346
byte_offset += num_unpacked * num_bits / 8;
346347
} else if (sizeof(T) == 8 && num_bits > 32) {
@@ -360,8 +361,7 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
360361
while (i < batch_size) {
361362
int unpack_size = std::min(buffer_size, batch_size - i);
362363
int num_unpacked =
363-
internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
364-
unpack_buffer, unpack_size, num_bits);
364+
internal::unpack32(buffer + byte_offset, unpack_buffer, unpack_size, num_bits);
365365
if (num_unpacked == 0) {
366366
break;
367367
}

cpp/src/arrow/util/bpacking.cc

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,9 @@
3636
namespace arrow {
3737
namespace internal {
3838

39-
namespace {
39+
int unpack32_scalar(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) {
40+
const uint32_t* in = reinterpret_cast<const uint32_t*>(in_);
4041

41-
int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
4242
batch_size = batch_size / 32 * 32;
4343
int num_loops = batch_size / 32;
4444

@@ -149,11 +149,13 @@ int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_
149149
return batch_size;
150150
}
151151

152+
namespace {
153+
152154
struct Unpack32DynamicFunction {
153-
using FunctionType = decltype(&unpack32_default);
155+
using FunctionType = decltype(&unpack32_scalar);
154156

155157
static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
156-
return {{DispatchLevel::NONE, unpack32_default}
158+
return {{DispatchLevel::NONE, unpack32_scalar}
157159
#if defined(ARROW_HAVE_RUNTIME_AVX2)
158160
,
159161
{DispatchLevel::AVX2, unpack32_avx2}
@@ -168,7 +170,7 @@ struct Unpack32DynamicFunction {
168170

169171
} // namespace
170172

171-
int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
173+
int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
172174
#if defined(ARROW_HAVE_NEON)
173175
return unpack32_neon(in, out, batch_size, num_bits);
174176
#else
@@ -177,9 +179,7 @@ int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
177179
#endif
178180
}
179181

180-
namespace {
181-
182-
int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
182+
int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
183183
batch_size = batch_size / 32 * 32;
184184
int num_loops = batch_size / 32;
185185

@@ -386,11 +386,9 @@ int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_b
386386
return batch_size;
387387
}
388388

389-
} // namespace
390-
391389
int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
392390
// TODO: unpack64_neon, unpack64_avx2 and unpack64_avx512
393-
return unpack64_default(in, out, batch_size, num_bits);
391+
return unpack64_scalar(in, out, batch_size, num_bits);
394392
}
395393

396394
} // namespace internal

cpp/src/arrow/util/bpacking64_default_internal.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,10 @@
2626

2727
#pragma once
2828

29-
#include "arrow/util/bit_util.h"
29+
#include "arrow/util/endian.h"
3030
#include "arrow/util/ubsan.h"
3131

32-
namespace arrow {
33-
namespace internal {
32+
namespace arrow::internal {
3433

3534
inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) {
3635
for (int k = 0; k < 32; k += 1) {
@@ -5638,5 +5637,4 @@ inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out) {
56385637
return in;
56395638
}
56405639

5641-
} // namespace internal
5642-
} // namespace arrow
5640+
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_avx2.cc

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,11 @@
1919
#include "arrow/util/bpacking_simd256_generated_internal.h"
2020
#include "arrow/util/bpacking_simd_internal.h"
2121

22-
namespace arrow {
23-
namespace internal {
22+
namespace arrow::internal {
2423

25-
int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
26-
return unpack32_specialized<UnpackBits256<DispatchLevel::AVX2>>(in, out, batch_size,
27-
num_bits);
24+
int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
25+
return unpack32_specialized<UnpackBits256<DispatchLevel::AVX2>>(
26+
reinterpret_cast<const uint32_t*>(in), out, batch_size, num_bits);
2827
}
2928

30-
} // namespace internal
31-
} // namespace arrow
29+
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_avx2_internal.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717

1818
#pragma once
1919

20-
#include <stdint.h>
20+
#include "arrow/util/visibility.h"
2121

22-
namespace arrow {
23-
namespace internal {
22+
#include <cstdint>
2423

25-
int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits);
24+
namespace arrow::internal {
2625

27-
} // namespace internal
28-
} // namespace arrow
26+
ARROW_EXPORT int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size,
27+
int num_bits);
28+
29+
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_avx512.cc

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,11 @@
1919
#include "arrow/util/bpacking_simd512_generated_internal.h"
2020
#include "arrow/util/bpacking_simd_internal.h"
2121

22-
namespace arrow {
23-
namespace internal {
22+
namespace arrow::internal {
2423

25-
int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
26-
return unpack32_specialized<UnpackBits512<DispatchLevel::AVX512>>(in, out, batch_size,
27-
num_bits);
24+
int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
25+
return unpack32_specialized<UnpackBits512<DispatchLevel::AVX512>>(
26+
reinterpret_cast<const uint32_t*>(in), out, batch_size, num_bits);
2827
}
2928

30-
} // namespace internal
31-
} // namespace arrow
29+
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_avx512_internal.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717

1818
#pragma once
1919

20-
#include <stdint.h>
20+
#include "arrow/util/visibility.h"
2121

22-
namespace arrow {
23-
namespace internal {
22+
#include <cstdint>
2423

25-
int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits);
24+
namespace arrow::internal {
2625

27-
} // namespace internal
28-
} // namespace arrow
26+
ARROW_EXPORT int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size,
27+
int num_bits);
28+
29+
} // namespace arrow::internal
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <stdexcept>
19+
#include <vector>
20+
21+
#include <benchmark/benchmark.h>
22+
23+
#include "arrow/testing/util.h"
24+
#include "arrow/util/bpacking_internal.h"
25+
26+
#if defined(ARROW_HAVE_RUNTIME_AVX2)
27+
# include "arrow/util/bpacking_avx2_internal.h"
28+
# include "arrow/util/cpu_info.h"
29+
#endif
30+
#if defined(ARROW_HAVE_RUNTIME_AVX512)
31+
# include "arrow/util/bpacking_avx512_internal.h"
32+
#endif
33+
#if defined(ARROW_HAVE_NEON)
34+
# include "arrow/util/bpacking_neon_internal.h"
35+
#endif
36+
37+
namespace arrow::internal {
38+
namespace {
39+
40+
/// Pointer to an unpacking function taking `(const uint8_t*, Int*, int, int)` and
/// returning `int`. This is the signature of the `unpack32*` / `unpack64*` functions
/// (`in`, `out`, `batch_size`, `num_bits`) that the benchmarks in this file receive.
template <typename Int>
41+
using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
42+
43+
/// Get the number of bytes associated with a packing.
///
/// \param num_values number of packed values
/// \param bit_width number of bits used by each packed value
/// \return `num_values * bit_width / 8`
/// \throws std::invalid_argument if `num_values * bit_width` is not a multiple of 8
44+
constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
45+
// Both operands are int32_t, so the product is computed in int32_t.
const auto num_bits = num_values * bit_width;
46+
// Only whole bytes are supported: reject packings that end mid-byte.
if (num_bits % 8 != 0) {
47+
throw std::invalid_argument("Must pack a multiple of 8 bits.");
48+
}
49+
return num_bits / 8;
50+
}
51+
52+
/// Generate random bytes as packed integers.
///
/// The returned buffer holds `GetNumBytes(num_values, bit_width)` bytes, so
/// `num_values * bit_width` must be a multiple of 8 (GetNumBytes throws otherwise).
53+
std::vector<uint8_t> GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) {
54+
// Fixed seed handed to random_bytes; presumably this makes the generated data
// reproducible from run to run -- TODO confirm against random_bytes.
constexpr uint32_t kSeed = 3214;
55+
const auto num_bytes = GetNumBytes(num_values, bit_width);
56+
57+
std::vector<uint8_t> out(num_bytes);
58+
random_bytes(num_bytes, kSeed, out.data());
59+
60+
return out;
61+
}
62+
63+
/// Return the first pointer at or after `ptr` whose address is a multiple of
/// `alignment`.
///
/// `ptr` itself is returned when it is already aligned. `alignment` is used as a
/// modulo divisor, so it must be non-zero. The function only does pointer
/// arithmetic; it is the caller's job to ensure the result stays inside the buffer.
const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) {
64+
auto addr = reinterpret_cast<std::uintptr_t>(ptr);
65+
66+
// Already aligned: nothing to skip.
if (addr % alignment == 0) {
67+
return ptr;
68+
}
69+
70+
// Advance by the distance to the next multiple of `alignment`.
auto remainder = addr % alignment;
71+
auto bytes_to_add = alignment - remainder;
72+
73+
return ptr + bytes_to_add;
74+
}
75+
76+
/// Benchmark body shared by all unpack benchmarks.
///
/// \param state benchmark state; `state.range(0)` is read as the bit width and
///        `state.range(1)` as the number of values to unpack
/// \param aligned whether the packed input pointer is aligned to `sizeof(Int)`
///        (true) or deliberately offset by one byte from such an address (false)
/// \param unpack the unpacking function under test
/// \param skip when true, the benchmark is reported as skipped with `skip_msg`
/// \param skip_msg message passed to `SkipWithMessage` when `skip` is true
template <typename Int>
77+
void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bool skip,
78+
std::string skip_msg) {
79+
// NOTE(review): there is no early return after SkipWithMessage, so the setup
// below still executes when `skip` is true -- confirm this is intended.
if (skip) {
80+
state.SkipWithMessage(skip_msg);
81+
}
82+
83+
const auto bit_width = static_cast<int32_t>(state.range(0));
84+
const auto num_values = static_cast<int32_t>(state.range(1));
85+
86+
// Assume std::vector allocation is likely to be aligned to more than a byte.
87+
// So we allocate more values than necessary and skip to the next byte with the
88+
// desired (non) alignment to test the proper condition.
89+
constexpr int32_t kExtraValues = sizeof(Int) * 8;
90+
const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width);
91+
// Aligned case: first address that is a multiple of sizeof(Int).
// Unaligned case: one byte past that address.
const uint8_t* packed_ptr =
92+
GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1);
93+
94+
std::vector<Int> unpacked(num_values, 0);
95+
96+
for (auto _ : state) {
97+
unpack(packed_ptr, unpacked.data(), num_values, bit_width);
98+
benchmark::ClobberMemory();
99+
}
100+
state.SetItemsProcessed(num_values * state.iterations());
101+
}
102+
103+
// Bounds handed to benchmark::CreateRange for the second benchmark argument,
// which BM_Unpack reads as the number of values (state.range(1)).
constexpr int32_t kMinRange = 64;
104+
constexpr int32_t kMaxRange = 32768;
105+
// Candidate values for the first benchmark argument, which BM_Unpack reads as the
// bit width (state.range(0)); the 64-bit list additionally covers a width above 32.
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
106+
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};
107+
// Argument lists passed to ArgsProduct by the 32-bit benchmarks:
// {bit widths, numbers of values}.
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = {
108+
kBitWidths32,
109+
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
110+
};
111+
// Argument lists passed to ArgsProduct by the 64-bit benchmarks:
// {bit widths, numbers of values}.
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = {
112+
kBitWidths64,
113+
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
114+
};
115+
116+
/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
///
/// Non-template wrapper that forwards all arguments to `BM_Unpack<uint32_t>`.
/// `skip` defaults to false and `skip_msg` to an empty string, so registrations
/// that never skip can omit both.
117+
void BM_UnpackUint32(benchmark::State& state, bool aligned, UnpackFunc<uint32_t> unpack,
118+
bool skip = false, std::string skip_msg = "") {
119+
return BM_Unpack<uint32_t>(state, aligned, unpack, skip, std::move(skip_msg));
120+
}
121+
/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
///
/// Non-template wrapper that forwards all arguments to `BM_Unpack<uint64_t>`.
/// `skip` defaults to false and `skip_msg` to an empty string, so registrations
/// that never skip can omit both.
122+
void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t> unpack,
123+
bool skip = false, std::string skip_msg = "") {
124+
return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
125+
}
126+
127+
// Scalar implementations, benchmarked on unaligned input (aligned = false).
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar)
128+
->ArgsProduct(kBitWidthsNumValues32);
129+
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar)
130+
->ArgsProduct(kBitWidthsNumValues64);
131+
132+
// AVX2 implementation: registered only when built with runtime AVX2 support, and
// skipped with a message when CpuInfo reports that AVX2 is not supported.
#if defined(ARROW_HAVE_RUNTIME_AVX2)
133+
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2,
134+
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
135+
"Avx2 not available")
136+
->ArgsProduct(kBitWidthsNumValues32);
137+
#endif
138+
139+
// AVX512 implementation: same scheme as AVX2 above.
// NOTE(review): "arrow/util/cpu_info.h" is included only under the
// ARROW_HAVE_RUNTIME_AVX2 guard at the top of this file, yet CpuInfo is used here
// too -- presumably AVX512 builds always define the AVX2 macro as well; confirm.
#if defined(ARROW_HAVE_RUNTIME_AVX512)
140+
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, unpack32_avx512,
141+
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
142+
"Avx512 not available")
143+
->ArgsProduct(kBitWidthsNumValues32);
144+
#endif
145+
146+
// NEON implementation: registered only when ARROW_HAVE_NEON is defined; it is
// registered without a runtime skip condition.
#if defined(ARROW_HAVE_NEON)
147+
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, unpack32_neon)
148+
->ArgsProduct(kBitWidthsNumValues32);
149+
#endif
150+
151+
// Public 32-bit entry point `unpack32`, on both aligned and unaligned input.
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, unpack32)
152+
->ArgsProduct(kBitWidthsNumValues32);
153+
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, unpack32)
154+
->ArgsProduct(kBitWidthsNumValues32);
155+
156+
// Public 64-bit entry point `unpack64`, on both aligned and unaligned input.
BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, unpack64)
157+
->ArgsProduct(kBitWidthsNumValues64);
158+
BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, unpack64)
159+
->ArgsProduct(kBitWidthsNumValues64);
160+
161+
} // namespace
162+
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_internal.h

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,23 @@
1717

1818
#pragma once
1919

20-
#include "arrow/util/endian.h"
2120
#include "arrow/util/visibility.h"
2221

23-
#include <stdint.h>
22+
#include <cstdint>
2423

25-
namespace arrow {
26-
namespace internal {
24+
namespace arrow::internal {
25+
26+
/// The scalar 32 bit unpacking.
27+
ARROW_EXPORT int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size,
28+
int num_bits);
29+
30+
/// The scalar 64 bit unpacking.
31+
ARROW_EXPORT int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size,
32+
int num_bits);
2733

2834
ARROW_EXPORT
29-
int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits);
35+
int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits);
3036
ARROW_EXPORT
3137
int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits);
3238

33-
} // namespace internal
34-
} // namespace arrow
39+
} // namespace arrow::internal

0 commit comments

Comments
 (0)