Skip to content

Commit f1e7830

Browse files
committed
Use template for internal unpack functions
1 parent a03d398 commit f1e7830

File tree

10 files changed

+124
-142
lines changed

10 files changed

+124
-142
lines changed

cpp/src/arrow/util/bpacking.cc

Lines changed: 11 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -27,40 +27,21 @@ namespace arrow::internal {
2727

2828
namespace {
2929

30-
struct Unpack32DynamicFunction {
31-
using FunctionType = decltype(&unpack32_scalar);
32-
using Implementation = std::pair<DispatchLevel, FunctionType>;
33-
34-
static auto implementations() {
35-
return std::array {
36-
// Current SIMD unpack algorithm works terribly on SSE4.2 due to lack of variable
37-
// rshift and poor xsimd fallback.
38-
Implementation{DispatchLevel::NONE, &unpack32_scalar},
39-
#if defined(ARROW_HAVE_RUNTIME_AVX2)
40-
Implementation{DispatchLevel::AVX2, &unpack32_avx2},
41-
#endif
42-
#if defined(ARROW_HAVE_RUNTIME_AVX512)
43-
Implementation{DispatchLevel::AVX512, &unpack32_avx512},
44-
#endif
45-
};
46-
}
47-
};
48-
49-
struct Unpack64DynamicFunction {
50-
using FunctionType = decltype(&unpack64_scalar);
30+
template <typename Uint>
31+
struct UnpackDynamicFunction {
32+
using FunctionType = decltype(&unpack_scalar<Uint>);
5133
using Implementation = std::pair<DispatchLevel, FunctionType>;
5234

5335
static auto implementations() {
5436
return std::array {
5537
// Current SIMD unpack algorithm works terribly on SSE4.2 due to lack of variable
5638
// rshift and poor xsimd fallback.
57-
Implementation{DispatchLevel::NONE, &unpack64_scalar},
39+
Implementation{DispatchLevel::NONE, &unpack_scalar<Uint>},
5840
#if defined(ARROW_HAVE_RUNTIME_AVX2)
59-
// Note that the AVX2 implementation only slightly outperforms scalar
60-
Implementation{DispatchLevel::AVX2, &unpack64_avx2},
41+
Implementation{DispatchLevel::AVX2, &unpack_avx2<Uint>},
6142
#endif
6243
#if defined(ARROW_HAVE_RUNTIME_AVX512)
63-
Implementation{DispatchLevel::AVX512, &unpack64_avx512},
44+
Implementation{DispatchLevel::AVX512, &unpack_avx512<Uint>},
6445
#endif
6546
};
6647
}
@@ -69,31 +50,18 @@ struct Unpack64DynamicFunction {
6950
} // namespace
7051

7152
template <typename Uint>
72-
ARROW_EXPORT int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
53+
int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
7354
if constexpr (std::is_same_v<Uint, uint16_t>) {
7455
// Current SIMD unpack functions do not outperform the scalar implementation for uint16_t
75-
return unpack16_scalar(in, out, batch_size, num_bits);
76-
}
77-
78-
if constexpr (std::is_same_v<Uint, uint32_t>) {
79-
#if defined(ARROW_HAVE_NEON)
80-
return unpack32_neon(in, out, batch_size, num_bits);
81-
#else
82-
static DynamicDispatch<Unpack32DynamicFunction> dispatch;
83-
return dispatch.func(in, out, batch_size, num_bits);
84-
#endif
85-
}
86-
87-
if constexpr (std::is_same_v<Uint, uint64_t>) {
56+
return unpack_scalar<uint16_t>(in, out, batch_size, num_bits);
57+
} else {
8858
#if defined(ARROW_HAVE_NEON)
89-
return unpack64_neon(in, out, batch_size, num_bits);
59+
return unpack_neon(in, out, batch_size, num_bits);
9060
#else
91-
static DynamicDispatch<Unpack64DynamicFunction> dispatch;
61+
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
9262
return dispatch.func(in, out, batch_size, num_bits);
9363
#endif
9464
}
95-
96-
return 0;
9765
}
9866

9967
template int unpack<uint16_t>(const uint8_t*, uint16_t*, int, int);

cpp/src/arrow/util/bpacking_benchmark.cc

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -129,54 +129,54 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t>
129129
return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
130130
}
131131

132-
BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, unpack16_scalar)
132+
BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar<uint16_t>)
133133
->ArgsProduct(kBitWidthsNumValues16);
134-
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar)
134+
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32_t>)
135135
->ArgsProduct(kBitWidthsNumValues32);
136-
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar)
136+
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalar<uint64_t>)
137137
->ArgsProduct(kBitWidthsNumValues64);
138138

139139
#if defined(ARROW_HAVE_SSE4_2)
140-
BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, unpack16_sse4_2)
140+
BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2<uint16_t>)
141141
->ArgsProduct(kBitWidthsNumValues16);
142-
BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, unpack32_sse4_2)
142+
BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, &unpack_sse4_2<uint32_t>)
143143
->ArgsProduct(kBitWidthsNumValues32);
144-
BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, unpack64_sse4_2)
144+
BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2<uint64_t>)
145145
->ArgsProduct(kBitWidthsNumValues64);
146146
#endif
147147

148148
#if defined(ARROW_HAVE_RUNTIME_AVX2)
149-
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, unpack16_avx2,
149+
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &unpack_avx2<uint16_t>,
150150
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
151151
"Avx2 not available")
152152
->ArgsProduct(kBitWidthsNumValues16);
153-
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2,
153+
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, &unpack_avx2<uint32_t>,
154154
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
155155
"Avx2 not available")
156156
->ArgsProduct(kBitWidthsNumValues32);
157-
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, unpack64_avx2,
157+
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2<uint64_t>,
158158
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
159159
"Avx2 not available")
160160
->ArgsProduct(kBitWidthsNumValues64);
161161
#endif
162162

163163
#if defined(ARROW_HAVE_RUNTIME_AVX512)
164-
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, unpack32_avx512,
164+
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, &unpack_avx512<uint32_t>,
165165
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
166166
"Avx512 not available")
167167
->ArgsProduct(kBitWidthsNumValues32);
168-
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, unpack64_avx512,
168+
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512<uint64_t>,
169169
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
170170
"Avx512 not available")
171171
->ArgsProduct(kBitWidthsNumValues64);
172172
#endif
173173

174174
#if defined(ARROW_HAVE_NEON)
175-
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, unpack16_neon)
175+
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon<uint16_t>)
176176
->ArgsProduct(kBitWidthsNumValues16);
177-
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, unpack32_neon)
177+
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon<uint32_t>)
178178
->ArgsProduct(kBitWidthsNumValues32);
179-
BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, unpack64_neon)
179+
BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon<uint64_t>)
180180
->ArgsProduct(kBitWidthsNumValues64);
181181
#endif
182182

cpp/src/arrow/util/bpacking_internal.h

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,13 @@ namespace arrow::internal {
2626
template <typename Uint>
2727
ARROW_EXPORT int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits);
2828

29-
extern template ARROW_TEMPLATE_EXPORT int unpack<uint16_t>(const uint8_t* in,
30-
uint16_t* out, int batch_size,
31-
int num_bits);
29+
extern template ARROW_TEMPLATE_EXPORT int unpack<uint16_t>(const uint8_t*, uint16_t*, int,
30+
int);
3231

33-
extern template ARROW_TEMPLATE_EXPORT int unpack<uint32_t>(const uint8_t* in,
34-
uint32_t* out, int batch_size,
35-
int num_bits);
32+
extern template ARROW_TEMPLATE_EXPORT int unpack<uint32_t>(const uint8_t*, uint32_t*, int,
33+
int);
3634

37-
extern template ARROW_TEMPLATE_EXPORT int unpack<uint64_t>(const uint8_t* in,
38-
uint64_t* out, int batch_size,
39-
int num_bits);
35+
extern template ARROW_TEMPLATE_EXPORT int unpack<uint64_t>(const uint8_t*, uint64_t*, int,
36+
int);
4037

4138
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_scalar.cc

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,13 @@
2121

2222
namespace arrow::internal {
2323

24-
int unpack16_scalar(const uint8_t* in, uint16_t* out, int batch_size, int num_bits) {
24+
template <typename Uint>
25+
int unpack_scalar(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
2526
return unpack_jump<ScalarUnpackerForWidth>(in, out, batch_size, num_bits);
2627
}
2728

28-
int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
29-
return unpack_jump<ScalarUnpackerForWidth>(in, out, batch_size, num_bits);
30-
}
31-
32-
int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
33-
return unpack_jump<ScalarUnpackerForWidth>(in, out, batch_size, num_bits);
34-
}
29+
template int unpack_scalar<uint16_t>(const uint8_t*, uint16_t*, int, int);
30+
template int unpack_scalar<uint32_t>(const uint8_t*, uint32_t*, int, int);
31+
template int unpack_scalar<uint64_t>(const uint8_t*, uint64_t*, int, int);
3532

3633
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_scalar_internal.h

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,17 @@
2323

2424
namespace arrow::internal {
2525

26-
ARROW_EXPORT int unpack16_scalar(const uint8_t* in, uint16_t* out, int batch_size,
27-
int num_bits);
26+
template <typename Uint>
27+
ARROW_EXPORT int unpack_scalar(const uint8_t* in, Uint* out, int batch_size,
28+
int num_bits);
2829

29-
ARROW_EXPORT int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size,
30-
int num_bits);
30+
extern template ARROW_TEMPLATE_EXPORT int unpack_scalar<uint16_t>(const uint8_t*,
31+
uint16_t*, int, int);
3132

32-
ARROW_EXPORT int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size,
33-
int num_bits);
33+
extern template ARROW_TEMPLATE_EXPORT int unpack_scalar<uint32_t>(const uint8_t*,
34+
uint32_t*, int, int);
35+
36+
extern template ARROW_TEMPLATE_EXPORT int unpack_scalar<uint64_t>(const uint8_t*,
37+
uint64_t*, int, int);
3438

3539
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_simd_avx2.cc

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,13 @@
2121

2222
namespace arrow::internal {
2323

24-
int unpack16_avx2(const uint8_t* in, uint16_t* out, int batch_size, int num_bits) {
24+
template <typename Uint>
25+
int unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
2526
return unpack_jump<Simd256UnpackerForWidth>(in, out, batch_size, num_bits);
2627
}
2728

28-
int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
29-
return unpack_jump<Simd256UnpackerForWidth>(in, out, batch_size, num_bits);
30-
}
31-
32-
int unpack64_avx2(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
33-
return unpack_jump<Simd256UnpackerForWidth>(in, out, batch_size, num_bits);
34-
}
29+
template int unpack_avx2<uint16_t>(const uint8_t*, uint16_t*, int, int);
30+
template int unpack_avx2<uint32_t>(const uint8_t*, uint32_t*, int, int);
31+
template int unpack_avx2<uint64_t>(const uint8_t*, uint64_t*, int, int);
3532

3633
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_simd_avx512.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,12 @@
2121

2222
namespace arrow::internal {
2323

24-
int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
24+
template <typename Uint>
25+
int unpack_avx512(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
2526
return unpack_jump<Simd512UnpackerForWidth>(in, out, batch_size, num_bits);
2627
}
2728

28-
int unpack64_avx512(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
29-
return unpack_jump<Simd512UnpackerForWidth>(in, out, batch_size, num_bits);
30-
}
29+
template int unpack_avx512<uint32_t>(const uint8_t*, uint32_t*, int, int);
30+
template int unpack_avx512<uint64_t>(const uint8_t*, uint64_t*, int, int);
3131

3232
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_simd_internal.h

Lines changed: 43 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -24,44 +24,65 @@
2424
namespace arrow::internal {
2525

2626
#if defined(ARROW_HAVE_NEON)
27-
ARROW_EXPORT int unpack16_neon(const uint8_t* in, uint16_t* out, int batch_size,
28-
int num_bits);
2927

30-
ARROW_EXPORT int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size,
31-
int num_bits);
28+
template <typename Uint>
29+
ARROW_EXPORT int unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits);
30+
31+
extern template ARROW_TEMPLATE_EXPORT int unpack_neon<uint16_t>(const uint8_t*, uint16_t*,
32+
int, int);
33+
34+
extern template ARROW_TEMPLATE_EXPORT int unpack_neon<uint32_t>(const uint8_t*, uint32_t*,
35+
int, int);
36+
37+
extern template ARROW_TEMPLATE_EXPORT int unpack_neon<uint64_t>(const uint8_t*, uint64_t*,
38+
int, int);
3239

33-
ARROW_EXPORT int unpack64_neon(const uint8_t* in, uint64_t* out, int batch_size,
34-
int num_bits);
3540
#endif
3641

3742
#if defined(ARROW_HAVE_SSE4_2)
38-
ARROW_EXPORT int unpack16_sse4_2(const uint8_t* in, uint16_t* out, int batch_size,
39-
int num_bits);
4043

41-
ARROW_EXPORT int unpack32_sse4_2(const uint8_t* in, uint32_t* out, int batch_size,
42-
int num_bits);
44+
template <typename Uint>
45+
ARROW_EXPORT int unpack_sse4_2(const uint8_t* in, Uint* out, int batch_size,
46+
int num_bits);
47+
48+
extern template ARROW_TEMPLATE_EXPORT int unpack_sse4_2<uint16_t>(const uint8_t*,
49+
uint16_t*, int, int);
4350

44-
ARROW_EXPORT int unpack64_sse4_2(const uint8_t* in, uint64_t* out, int batch_size,
45-
int num_bits);
51+
extern template ARROW_TEMPLATE_EXPORT int unpack_sse4_2<uint32_t>(const uint8_t*,
52+
uint32_t*, int, int);
53+
54+
extern template ARROW_TEMPLATE_EXPORT int unpack_sse4_2<uint64_t>(const uint8_t*,
55+
uint64_t*, int, int);
4656
#endif
4757

4858
#if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
49-
ARROW_EXPORT int unpack16_avx2(const uint8_t* in, uint16_t* out, int batch_size,
50-
int num_bits);
5159

52-
ARROW_EXPORT int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size,
53-
int num_bits);
60+
template <typename Uint>
61+
ARROW_EXPORT int unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits);
62+
63+
extern template ARROW_TEMPLATE_EXPORT int unpack_avx2<uint16_t>(const uint8_t*, uint16_t*,
64+
int, int);
65+
66+
extern template ARROW_TEMPLATE_EXPORT int unpack_avx2<uint32_t>(const uint8_t*, uint32_t*,
67+
int, int);
68+
69+
extern template ARROW_TEMPLATE_EXPORT int unpack_avx2<uint64_t>(const uint8_t*, uint64_t*,
70+
int, int);
5471

55-
ARROW_EXPORT int unpack64_avx2(const uint8_t* in, uint64_t* out, int batch_size,
56-
int num_bits);
5772
#endif
5873

5974
#if defined(ARROW_HAVE_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX512)
60-
ARROW_EXPORT int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size,
61-
int num_bits);
6275

63-
ARROW_EXPORT int unpack64_avx512(const uint8_t* in, uint64_t* out, int batch_size,
64-
int num_bits);
76+
template <typename Uint>
77+
ARROW_EXPORT int unpack_avx512(const uint8_t* in, Uint* out, int batch_size,
78+
int num_bits);
79+
80+
extern template ARROW_TEMPLATE_EXPORT int unpack_avx512<uint32_t>(const uint8_t*,
81+
uint32_t*, int, int);
82+
83+
extern template ARROW_TEMPLATE_EXPORT int unpack_avx512<uint64_t>(const uint8_t*,
84+
uint64_t*, int, int);
85+
6586
#endif
6687

6788
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_simd_min.cc

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,31 +22,29 @@
2222
namespace arrow::internal {
2323

2424
#if defined(ARROW_HAVE_NEON)
25-
int unpack16_neon(const uint8_t* in, uint16_t* out, int batch_size, int num_bits) {
26-
return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits);
27-
}
2825

29-
int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
26+
template <typename Uint>
27+
int unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
3028
return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits);
3129
}
3230

33-
int unpack64_neon(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
34-
return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits);
35-
}
31+
template int unpack_neon<uint16_t>(const uint8_t*, uint16_t*, int, int);
32+
template int unpack_neon<uint32_t>(const uint8_t*, uint32_t*, int, int);
33+
template int unpack_neon<uint64_t>(const uint8_t*, uint64_t*, int, int);
34+
3635
#endif
3736

3837
#if defined(ARROW_HAVE_SSE4_2)
39-
int unpack16_sse4_2(const uint8_t* in, uint16_t* out, int batch_size, int num_bits) {
40-
return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits);
41-
}
4238

43-
int unpack32_sse4_2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
39+
template <typename Uint>
40+
int unpack_sse4_2(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
4441
return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits);
4542
}
4643

47-
int unpack64_sse4_2(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
48-
return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits);
49-
}
44+
template int unpack_sse4_2<uint16_t>(const uint8_t*, uint16_t*, int, int);
45+
template int unpack_sse4_2<uint32_t>(const uint8_t*, uint32_t*, int, int);
46+
template int unpack_sse4_2<uint64_t>(const uint8_t*, uint64_t*, int, int);
47+
5048
#endif
5149

5250
} // namespace arrow::internal

0 commit comments

Comments
 (0)